diff --git a/.gitattributes b/.gitattributes index de1efff290be..2b50bc0ac691 100644 --- a/.gitattributes +++ b/.gitattributes @@ -14,6 +14,7 @@ Readme text *.bat text *.py text *.pl text +*.ps1 text *.sh text eol=lf build-and-test text eol=lf configure text eol=lf diff --git a/Source/ActionsLib/EvalActions.cpp b/Source/ActionsLib/EvalActions.cpp index ee8b9b333005..89570c1b88b5 100644 --- a/Source/ActionsLib/EvalActions.cpp +++ b/Source/ActionsLib/EvalActions.cpp @@ -57,6 +57,10 @@ static void DoEvalBase(const ConfigParameters& config, IDataReader& reader) int traceLevel = config(L"traceLevel", "0"); size_t numMBsToShowResult = config(L"numMBsToShowResult", "100"); + size_t maxSamplesInRAM = config(L"maxSamplesInRAM", (size_t)SIZE_MAX); + size_t numSubminiBatches = config(L"numSubminibatches", (size_t)1); + //TODO: switch to a global parallel setting for both training and evaluation. + bool useParallel = config(L"parallelTrain", false); ConfigArray evalNodeNames = config(L"evalNodeNames", ""); vector evalNodeNamesVector; @@ -66,8 +70,8 @@ static void DoEvalBase(const ConfigParameters& config, IDataReader& reader) } auto net = ComputationNetwork::CreateFromFile(deviceId, modelPath); - - SimpleEvaluator eval(net, numMBsToShowResult, traceLevel); + + SimpleEvaluator eval(net, useParallel, numMBsToShowResult, traceLevel, maxSamplesInRAM, numSubminiBatches); eval.Evaluate(&reader, evalNodeNamesVector, mbSize[0], epochSize); } @@ -114,6 +118,10 @@ void DoCrossValidate(const ConfigParameters& config) int traceLevel = config(L"traceLevel", "0"); size_t numMBsToShowResult = config(L"numMBsToShowResult", "100"); + size_t maxSamplesInRAM = config(L"maxSamplesInRAM", (size_t)SIZE_MAX); + size_t numSubminiBatches = config(L"numSubminibatches", (size_t)1); + //TODO: switch to a global parallel setting for both training and evaluation. 
+ bool useParallel = config(L"parallelTrain", false); ConfigArray evalNodeNames = config(L"evalNodeNames", ""); vector evalNodeNamesVector; @@ -146,8 +154,8 @@ void DoCrossValidate(const ConfigParameters& config) cvModels.push_back(cvModelPath); auto net = ComputationNetwork::CreateFromFile(deviceId, cvModelPath); - - SimpleEvaluator eval(net, numMBsToShowResult, traceLevel); + + SimpleEvaluator eval(net, useParallel, numMBsToShowResult, traceLevel, maxSamplesInRAM, numSubminiBatches); fprintf(stderr, "model %ls --> \n", cvModelPath.c_str()); auto evalErrors = eval.Evaluate(&cvDataReader, evalNodeNamesVector, mbSize[0], epochSize); diff --git a/Source/ActionsLib/OtherActions.cpp b/Source/ActionsLib/OtherActions.cpp index a5a3b003b91f..39cb6ac45ccc 100644 --- a/Source/ActionsLib/OtherActions.cpp +++ b/Source/ActionsLib/OtherActions.cpp @@ -476,49 +476,50 @@ template void DoWriteWordAndClassInfo(const ConfigParameters& config); template void DoTopologyPlot(const ConfigParameters& config) { - wstring modelPath = config(L"modelPath"); - wstring outdot = config(L"outputDotFile"); // filename for the dot language output, if not specified, %modelpath%.dot will be used - wstring outRending = config(L"outputFile"); // filename for the rendered topology plot + wstring modelPath = config(L"modelPath"); + wstring outputDotFile = config(L"outputDotFile"); // filename for the dot language output, if not specified, %modelpath%.dot will be used + wstring outputFile = config(L"outputFile"); // filename for the rendered topology plot // this can be empty, in that case no rendering will be done // or if this is set, renderCmd must be set, so CNTK will call re - wstring RenderCmd = config(L"RenderCmd"); // if this option is set, then CNTK will call the render to convert the outdotFile to a graph + wstring renderCmd = config(L"renderCmd"); // if this option is set, then CNTK will call the render to convert the outdotFile to a graph // e.g. 
"d:\Tools\graphviz\bin\dot.exe -Tpng -x -o" // where and are two special placeholders - // ======================================== - // Sec. 1 option check - // ======================================== - if (outdot.empty()) - { - outdot = modelPath + L".dot"; - } + // output dot file defaults to modelpath.dot + if (outputDotFile.empty()) + outputDotFile = modelPath + L".dot"; - wstring rescmd; - if (!outRending.empty()) // we need to render the plot - { - std::wregex inputPlaceHolder(L"(.+)()(.*)"); - std::wregex outputPlaceHolder(L"(.+)()(.*)"); + ComputationNetwork net(CPUDEVICE); + net.Load(modelPath); + + net.PlotNetworkTopology(outputDotFile); + fprintf(stderr, "Created network description in dot format: %ls\n", outputDotFile.c_str()); - rescmd = regex_replace(RenderCmd, inputPlaceHolder, L"$1" + outdot + L"$3"); - rescmd = regex_replace(rescmd, outputPlaceHolder, L"$1" + outRending + L"$3"); + if (!outputFile.empty()) + { + if (renderCmd.empty()) + InvalidArgument("plot: If you specify an outputFile, you also need a renderCmd."); +#if 0 // this part is problematic under early version of gcc (< 4.9) + static const wregex inputPlaceHolder(L"(.+)()(.*)"); + static const wregex outputPlaceHolder(L"(.+)()(.*)"); + + // patch in the pathnames + renderCmd = regex_replace(renderCmd, inputPlaceHolder, L"$1" + outputDotFile + L"$3"); + renderCmd = regex_replace(renderCmd, outputPlaceHolder, L"$1" + outputFile + L"$3"); +#endif + msra::strfun::ReplaceAll(renderCmd, wstring(L""), outputDotFile); + msra::strfun::ReplaceAll(renderCmd, wstring(L""), outputFile); } - ComputationNetwork net(-1); - net.Load(modelPath); - net.PlotNetworkTopology(outdot); - fprintf(stderr, "Output network description in dot language to %S\n", outdot.c_str()); - if (!outRending.empty()) - { - fprintf(stderr, "Executing a third-part tool for rendering dot:\n%S\n", rescmd.c_str()); + fprintf(stderr, "Executing third-party tool for rendering dot:\n%ls\n", renderCmd.c_str()); #ifdef __unix__ - const 
auto rc = system(msra::strfun::utf8(rescmd).c_str()); - rc /*ignoring the result--this gets flagged by gcc if we don't save the return value*/; + auto rc = system(msra::strfun::utf8(renderCmd).c_str()); + rc; // ignoring the result--this gets flagged by gcc if we don't save the return value #else - _wsystem(rescmd.c_str()); + _wsystem(renderCmd.c_str()); #endif - fprintf(stderr, "Done\n"); - } + fprintf(stderr, "Done.\n"); } template void DoTopologyPlot(const ConfigParameters& config); diff --git a/Source/CNTK/BrainScript/ExperimentalNetworkBuilder.cpp b/Source/CNTK/BrainScript/ExperimentalNetworkBuilder.cpp index 6a6dcbfb4b1b..0868af87be5b 100644 --- a/Source/CNTK/BrainScript/ExperimentalNetworkBuilder.cpp +++ b/Source/CNTK/BrainScript/ExperimentalNetworkBuilder.cpp @@ -58,6 +58,7 @@ L"ParameterTensor(dims, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedVa L"TransposeDimensions(input, dim1, dim2, tag='') = new ComputationNode [ operation = 'TransposeDimensions' ; inputs = input /*plus the function args*/ ]\n" L"Transpose(x) = TransposeDimensions(x, 1, 2)\n" L"Times(A, B, outputRank=1, tag='') = new ComputationNode [ operation = 'Times' ; inputs = ( A : B ) /*plus the function args*/ ]\n" + // TODO: Logistic should be generated with the BinaryStandardNode macro below. 
L"Logistic(label, probability, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability) /*plus the function args*/ ]\n" L"WeightedLogistic(label, probability, instanceWeight, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability : instanceWeight) /*plus the function args*/ ]\n" L"ReconcileMBLayout(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileMBLayout' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]\n" diff --git a/Source/CNTK/CNTK.cpp b/Source/CNTK/CNTK.cpp index cca579ce7d6a..c6c96cbce929 100644 --- a/Source/CNTK/CNTK.cpp +++ b/Source/CNTK/CNTK.cpp @@ -405,7 +405,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp wstring startupMessage = msra::strfun::wstrprintf(L"running on %ls at %ls\n", msra::strfun::utf16(GetHostName()).c_str(), msra::strfun::utf16(TimeDateStamp()).c_str()); startupMessage += msra::strfun::wstrprintf(L"command line: %ls", exePath.c_str()); for (const auto& arg : args) - startupMessage += L" " + arg; + startupMessage += L" " + arg; fprintf(stderr, "%ls\n", startupMessage.c_str()); @@ -580,9 +580,7 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is fprintf(stderr, "running on %s at %s\n", GetHostName().c_str(), timestamp.c_str()); fprintf(stderr, "command line: \n"); for (int i = 0; i < argc; i++) - { - fprintf(stderr, "%s ", WCharToString(argv[i]).c_str()); - } + fprintf(stderr, "%*s%ls", i > 0 ? 2 : 0, "", argv[i]); // use 2 spaces for better visual separability // This simply merges all the different config parameters specified (eg, via config files or via command line directly), // and prints it. 
diff --git a/Source/CNTK/ModelEditLanguage.cpp b/Source/CNTK/ModelEditLanguage.cpp index cdc645f38c00..4b48d1ba888d 100644 --- a/Source/CNTK/ModelEditLanguage.cpp +++ b/Source/CNTK/ModelEditLanguage.cpp @@ -596,7 +596,7 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa case melPropBatchNormMode: { bool evalMode = params[2]; - netNdl->cn->SetBatchNormlizationNodesBelowEvalMode(evalMode, node); + netNdl->cn->SetBatchNormalizationNodesBelowEvalMode(evalMode, node); break; } default: diff --git a/Source/CNTK/prebuild.bat b/Source/CNTK/prebuild.bat index e556956f6519..dff7ac3d8e4d 100644 --- a/Source/CNTK/prebuild.bat +++ b/Source/CNTK/prebuild.bat @@ -11,18 +11,21 @@ echo #define _BUILDINFO_H >> buildinfo.h$$ FOR /F %%i IN ('hostname') DO SET HOST=%%i :: assuming hostname always exists -:: not sure whether git in path ? -call git --version 2 > nul -if not %ERRORLEVEL% == 9009 ( - echo #define _GIT_EXIST >> buildinfo.h$$ - FOR /F %%i IN ('git rev-parse --abbrev-ref HEAD') DO SET BRANCH=%%i - FOR /F %%i IN ('git rev-parse HEAD') DO SET COMMIT=%%i - set STATUS= - git diff --quiet --cached - if not errorlevel 1 git diff --quiet - if errorlevel 1 set STATUS= ^(modified^) - echo #define _BUILDBRANCH_ "!BRANCH!" >> buildinfo.h$$ - echo #define _BUILDSHA1_ "!COMMIT!!STATUS!">> buildinfo.h$$ +:: note: we'll only use git which is in path +where -q git +if not errorlevel 1 ( + call git --version > NUL 2>&1 + if not errorlevel 1 ( + echo #define _GIT_EXIST >> buildinfo.h$$ + FOR /F %%i IN ('call git rev-parse --abbrev-ref HEAD') DO SET BRANCH=%%i + FOR /F %%i IN ('call git rev-parse HEAD') DO SET COMMIT=%%i + set STATUS= + call git diff --quiet --cached + if not errorlevel 1 call git diff --quiet + if errorlevel 1 set STATUS= ^(modified^) + echo #define _BUILDBRANCH_ "!BRANCH!" 
>> buildinfo.h$$ + echo #define _BUILDSHA1_ "!COMMIT!!STATUS!">> buildinfo.h$$ + ) ) :: For now, math lib is basically hardwired @@ -75,4 +78,4 @@ echo #endif >> buildinfo.h$$ ::: update file only if it changed (otherwise CNTK.cpp will get rebuilt each time) fc buildinfo.h$$ buildinfo.h > NUL 2>&1 -if ERRORLEVEL 1 move /Y buildinfo.h$$ buildinfo.h +if errorlevel 1 move /Y buildinfo.h$$ buildinfo.h diff --git a/Source/Common/ExceptionWithCallStack.cpp b/Source/Common/ExceptionWithCallStack.cpp index 2f540483bc9e..5b28191a6b34 100644 --- a/Source/Common/ExceptionWithCallStack.cpp +++ b/Source/Common/ExceptionWithCallStack.cpp @@ -18,55 +18,155 @@ namespace Microsoft { namespace MSR { namespace CNTK { using namespace std; +static string MakeFunctionNameStandOut(string name); +static void CollectCallStack(size_t skipLevels, bool makeFunctionNamesStandOut, const function& write); + +/// This function retrieves the call stack as a string +template +string ExceptionWithCallStack::GetCallStack(size_t skipLevels /*= 0*/, bool makeFunctionNamesStandOut /*= false*/) +{ + try + { + string output; + CollectCallStack(skipLevels + 1/*skip this function*/, makeFunctionNamesStandOut, [&output](string stack) + { + output += stack; + }); + return output; + } + catch (...) // since we run as part of error reporting, don't get hung up on our own error + { + return string(); + } +} + +/// This function outputs the call stack to the std err +template +void ExceptionWithCallStack::PrintCallStack(size_t skipLevels /*= 0*/, bool makeFunctionNamesStandOut /*= false*/) +{ + CollectCallStack(skipLevels + 1/*skip this function*/, makeFunctionNamesStandOut, [](string stack) + { + cerr << stack; + }); +} + +// make the unmangled name a bit more readable +// Insert spaces around the main function name for better visual parsability; and double-spaces between function arguments. +// This uses some heuristics for C++ names that may be fragile, but that's OK since this only adds/removes spaces. 
+static string MakeFunctionNameStandOut(string origName) +{ + try // guard against exception, since this is used for exception reporting + { + auto name = origName; + // strip off modifiers for parsing (will be put back at the end) + string modifiers; + auto pos = name.find_last_not_of(" abcdefghijklmnopqrstuvwxyz"); + if (pos != string::npos) + { + modifiers = name.substr(pos + 1); + name = name.substr(0, pos + 1); + } + bool hasArgList = !name.empty() && name.back() == ')'; + size_t angleDepth = 0; + size_t parenDepth = 0; + bool hitEnd = !hasArgList; // hit end of function name already? + bool hitStart = false; + // we parse the function name from the end; escape nested <> and () + // We look for the end and start of the function name itself (without namespace qualifiers), + // and for commas separating function arguments. + for (size_t i = name.size(); i--> 0;) + { + // account for nested <> and () + if (name[i] == '>') + angleDepth++; + else if (name[i] == '<') + angleDepth--; + else if (name[i] == ')') + parenDepth++; + else if (name[i] == '(') + parenDepth--; + // space before '>' + if (name[i] == ' ' && i + 1 < name.size() && name[i + 1] == '>') + name.erase(i, 1); // remove + // commas + if (name[i] == ',') + { + if (i + 1 < name.size() && name[i + 1] == ' ') + name.erase(i + 1, 1); // remove spaces after comma + if (!hitEnd && angleDepth == 0 && parenDepth == 1) + name.insert(i + 1, " "); // except for top-level arguments, we separate them by 2 spaces for better readability + } + // function name + if ((name[i] == '(' || name[i] == '<') && + parenDepth == 0 && angleDepth == 0 && + (i == 0 || name[i - 1] != '>') && + !hitEnd && !hitStart) // we hit the start of the argument list + { + hitEnd = true; + name.insert(i, " "); + } + else if ((name[i] == ' ' || name[i] == ':' || name[i] == '>') && hitEnd && !hitStart && i > 0) // we hit the start of the function name + { + if (name[i] != ' ') + name.insert(i + 1, " "); + name.insert(i + 1, " "); // in total 
insert 2 spaces + hitStart = true; + } + } + return name + modifiers; + } + catch (...) + { + return origName; + } +} + /// This function collects the stack tracke and writes it through the provided write function /// Function for writing the text associated to a the callstack /// Function for writing and "end-of-line" / "newline" /// -template -void ExceptionWithCallStack::CollectCallStack(const function& write, const function& newline) +static void CollectCallStack(size_t skipLevels, bool makeFunctionNamesStandOut, const function& write) { - newline(); - write("[CALL STACK]"); - newline(); + static const int MAX_CALLERS = 62; + static const unsigned short MAX_CALL_STACK_DEPTH = 20; + + write("\n[CALL STACK]\n"); + #ifdef _WIN32 + + // RtlCaptureStackBackTrace() is a kernel API without default binding, we must manually determine its function pointer. typedef USHORT(WINAPI * CaptureStackBackTraceType)(__in ULONG, __in ULONG, __out PVOID*, __out_opt PULONG); - CaptureStackBackTraceType func = (CaptureStackBackTraceType)(GetProcAddress(LoadLibrary(L"kernel32.dll"), "RtlCaptureStackBackTrace")); + CaptureStackBackTraceType RtlCaptureStackBackTrace = (CaptureStackBackTraceType)(GetProcAddress(LoadLibrary(L"kernel32.dll"), "RtlCaptureStackBackTrace")); + if (RtlCaptureStackBackTrace == nullptr) // failed somehow + return write("Failed to generate CALL STACK. GetProcAddress(\"RtlCaptureStackBackTrace\") failed with error " + msra::strfun::utf8(FormatWin32Error(GetLastError())) + "\n"); - if (func == nullptr) - return; + HANDLE process = GetCurrentProcess(); + if (!SymInitialize(process, nullptr, TRUE)) + return write("Failed to generate CALL STACK. 
SymInitialize() failed with error " + msra::strfun::utf8(FormatWin32Error(GetLastError())) + "\n"); + // get the call stack void* callStack[MAX_CALLERS]; unsigned short frames; - SYMBOL_INFO* symbolInfo; - HANDLE process; - - process = GetCurrentProcess(); - if (!SymInitialize(process, nullptr, TRUE)) - { - DWORD error = GetLastError(); - write("Failed to print CALL STACK! SymInitialize error : " + msra::strfun::utf8(FormatWin32Error(error))); - newline(); - return; - } + frames = RtlCaptureStackBackTrace(0, MAX_CALLERS, callStack, nullptr); - frames = (func)(0, MAX_CALLERS, callStack, nullptr); - symbolInfo = (SYMBOL_INFO*)calloc(sizeof(SYMBOL_INFO) + 256 * sizeof(char), 1); + SYMBOL_INFO* symbolInfo = (SYMBOL_INFO*)calloc(sizeof(SYMBOL_INFO) + 256 * sizeof(char), 1); // this is a variable-length structure, can't use vector easily symbolInfo->MaxNameLen = 255; symbolInfo->SizeOfStruct = sizeof(SYMBOL_INFO); frames = min(frames, MAX_CALL_STACK_DEPTH); - unsigned int firstFrame = 4; // 4 bottom functions are CollectCallStack(), GetCallStack(), ThrowFormatted(), and XXXError() - for (unsigned int i = firstFrame; i < frames; i++) + // format and emit + size_t firstFrame = skipLevels + 1; // skip CollectCallStack() + for (size_t i = firstFrame; i < frames; i++) { if (i == firstFrame) - write( " >"); + write(" > "); else - write(" -"); + write(" - "); if (SymFromAddr(process, (DWORD64)(callStack[i]), 0, symbolInfo)) { - write(symbolInfo->Name); - newline(); + write(makeFunctionNamesStandOut ? MakeFunctionNameStandOut(symbolInfo->Name) : symbolInfo->Name); + write("\n"); } else { @@ -74,119 +174,84 @@ void ExceptionWithCallStack::CollectCallStack(const function sourceFileWidth) + // sourceFile = "..." + sourceFile.substr(sourceFile.size() - (sourceFileWidth-3)); + while (*beginAddress == ' ') // eat unnecessary space + beginAddress++; + string pcOffset = beginOffset ? 
string(" + ") + beginOffset : string(); + snprintf(buffer, buf_size, "%-20s%-50s%s\n", beginAddress, fName.c_str(), pcOffset.c_str()); } + else // Couldn't parse the line. Print the whole line as it came. + snprintf(buffer, buf_size, "%s\n", symbolList[i]); write(buffer); } free(symbolList); -#endif -} -/// This function retrieves the call stack as a string -template -std::string ExceptionWithCallStack::GetCallStack() -{ - std::string output; - auto WriteToString = [&output](std::string stack) - { - output += stack; - }; - - auto WriteNewLineToString = [&output] - { - output += "\n"; - }; - - CollectCallStack(WriteToString, WriteNewLineToString); - - return output; -} - -/// This function outputs the call stack to the std err -template -void ExceptionWithCallStack::PrintCallStack() -{ - auto WriteToStdErr = [](std::string stack) - { - std::cerr << stack; - }; - - auto WriteNewLineToStdErr = [] - { - std::cerr << std::endl; - }; - - CollectCallStack(WriteToStdErr, WriteNewLineToStdErr); +#endif } template class ExceptionWithCallStack; template class ExceptionWithCallStack; template class ExceptionWithCallStack; + }}} diff --git a/Source/Common/Include/Basics.h b/Source/Common/Include/Basics.h index 3adcafea9786..a9d8b11a5cd4 100644 --- a/Source/Common/Include/Basics.h +++ b/Source/Common/Include/Basics.h @@ -69,10 +69,10 @@ __declspec_noreturn static inline void ThrowFormatted(const char* format, ...) fprintf(stderr, "\nAbout to throw exception '%s'\n", buffer); #endif //Microsoft::MSR::CNTK::ExceptionWithCallStack::PrintCallStack(); - // Note: The call stack will suppress this function and its call site (XXXError()). + // Note: The call stack will skip 2 levels to suppress this function and its call sites (XXXError()). // If more layers are added here, it would have to be adjusted. // TODO: Change ExceptionWithCallStack to take a parameter how many levels to skip. 
- throw ExceptionWithCallStack(buffer, ExceptionWithCallStack::GetCallStack()); + throw ExceptionWithCallStack(buffer, ExceptionWithCallStack::GetCallStack(/*skipLevels=*/2, /*makeFunctionNamesStandOut=*/true)); }; #pragma warning(pop) diff --git a/Source/Common/Include/ExceptionWithCallStack.h b/Source/Common/Include/ExceptionWithCallStack.h index b36ea83c702a..09c6ce853828 100644 --- a/Source/Common/Include/ExceptionWithCallStack.h +++ b/Source/Common/Include/ExceptionWithCallStack.h @@ -35,10 +35,6 @@ struct /*interface*/ IExceptionWithCallStackBase template class ExceptionWithCallStack : public E, public IExceptionWithCallStackBase { -private: - static const int MAX_CALLERS = 62; - static const unsigned short MAX_CALL_STACK_DEPTH = 20; - public: ExceptionWithCallStack(const std::string& msg, const std::string& callstack) : E(msg), m_callStack(callstack) @@ -46,16 +42,17 @@ class ExceptionWithCallStack : public E, public IExceptionWithCallStackBase virtual const char * CallStack() const override { return m_callStack.c_str(); } - static void PrintCallStack(); - static std::string GetCallStack(); - + static void PrintCallStack(size_t skipLevels = 0, bool makeFunctionNamesStandOut = false); + static std::string GetCallStack(size_t skipLevels = 0, bool makeFunctionNamesStandOut = false); // generate call stack as a string, which should then be passed to the constructor of this --TODO: Why not generate it directly in the constructor? 
+ protected: std::string m_callStack; - -private: - static void CollectCallStack(const function& write, const function& newline); }; -typedef ExceptionWithCallStack DebugUtil; // some code calls PrintCallStack() directly, using this namespace +// some older code uses this namespace +namespace DebugUtil +{ + static inline void PrintCallStack() { ExceptionWithCallStack::PrintCallStack(0, false); } +}; }}} diff --git a/Source/ComputationNetworkLib/ComputationNetwork.cpp b/Source/ComputationNetworkLib/ComputationNetwork.cpp index 6a79f88ea035..4e1edf85b852 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.cpp +++ b/Source/ComputationNetworkLib/ComputationNetwork.cpp @@ -462,6 +462,7 @@ template if (dropoutRate != prevDropoutRate) { fprintf(stderr, "Switching dropout rate to %.8g.\n", dropoutRate); + // TODO: Change this to use an interface that is independent of . list dropoutNodes = net->GetNodesWithType(OperationNameOf(DropoutNode), criterionNode); if (dropoutNodes.size() == 0 && dropoutRate > 0) fprintf(stderr, "WARNING: there is no dropout node.\n"); diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 700b5b6eb3e4..61916eab75ec 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -92,6 +92,8 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb Read(fileName); // perform all further post-processing, caching, etc. CompileNetwork(); + // To ensure that all the BN nodes changed to eval mode unless it's in Training mode. 
+ SetBatchNormalizationNodesBelowEvalMode(true); } // static helper to instantiate a network from a file @@ -328,7 +330,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb void AddFeatureNode(ComputationNodeBasePtr featureNode); void RemoveFeatureNode(ComputationNodeBasePtr featureNode); void SetLearnableNodesBelowLearningRateMultiplier(const float learningRateMultiplier, const ComputationNodeBasePtr& rootNode = nullptr); - void SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode = nullptr); + void SetBatchNormalizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode = nullptr); // ----------------------------------------------------------------------- // node access diff --git a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp index 81d776e1c871..5707878e4dec 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp @@ -323,7 +323,7 @@ void ComputationNetwork::SetLearnableNodesBelowLearningRateMultiplier(const floa } } -void ComputationNetwork::SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode /* = nullptr */) +void ComputationNetwork::SetBatchNormalizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode /* = nullptr */) { vector nodes; if (rootNode == nullptr) diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index d712c183d7c7..ca6491c3ec6c 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -507,18 +507,18 @@ template class SparseInputValue; // ----------------------------------------------------------------------- // LookupTableNode (embedding matrix, bag-of-word representation of the inputs) 
-// implements an embedding, assuming a specific representation of the input data +// Implements an embedding. The input vector can consist of multiple stacked +// This is a tensor product where the matrix width may be an integer fraction of the features. +// If it is, then the matrix will be replicated. +// This is the same as if the input data were a tensor where the same matrix is applied to each column of the tensor. +// TimesNode can do that. // ----------------------------------------------------------------------- template class LookupTableNode : public ComputationNode, public NumInputs<2> { - typedef ComputationNode Base; - UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() - { - return L"LookupTable"; - } + typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { return L"LookupTable"; } public: DeclareConstructorFromConfigWithNumInputs(LookupTableNode); @@ -578,10 +578,10 @@ class LookupTableNode : public ComputationNode, public NumInputs<2> virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& t) override { - // input0 is the weight (each column is an embedding of one word), input 1 contains m_bnrLooked words in each column (sample) - Matrix functionValues = ValueFor(t); - const Matrix& input0 = Input(0)->ValueAsMatrix(); - Matrix input1 = Input(1)->ValueFor(t); + // input0 is the weight (each column is an embedding of one word), input 1 contains m_nbrLooked words in each column (sample) + Matrix functionValues = ValueFor(t); + const Matrix& input0 = Input(0)->ValueAsMatrix(); + Matrix input1 = Input(1)->ValueFor(t); size_t rows1 = input1.GetNumRows(), cols1 = input1.GetNumCols(); size_t cols0 = input0.GetNumCols(); @@ -591,7 +591,7 @@ class LookupTableNode : public ComputationNode, public NumInputs<2> if (cols0 * wordsInEachSample != rows1) LogicError("LookupTableNode: rows of input 1 is not a multiple of cols of input 0. 
This usually happens when the feature dimension is not specified as that in the network definition of look-up-table dimension size."); - auto input1Reshaped = input1.Reshaped(rows1 / wordsInEachSample, cols1 * wordsInEachSample); + auto input1Reshaped = input1.Reshaped(rows1 / wordsInEachSample, cols1 * wordsInEachSample); // BUGBUG: Won't work for sparse. auto functionValuesReshaped = functionValues.Reshaped(input0.GetNumRows(), input1Reshaped.GetNumCols()); functionValuesReshaped.AssignProductOf(input0, false, input1Reshaped, false); @@ -681,4 +681,5 @@ class LookupTableNode : public ComputationNode, public NumInputs<2> template class LookupTableNode; template class LookupTableNode; -} } } + +}}} diff --git a/Source/ComputationNetworkLib/LinearAlgebraNodes.h b/Source/ComputationNetworkLib/LinearAlgebraNodes.h index 6b65a65598ec..cf60fc45eb50 100644 --- a/Source/ComputationNetworkLib/LinearAlgebraNodes.h +++ b/Source/ComputationNetworkLib/LinearAlgebraNodes.h @@ -403,7 +403,8 @@ template class TransposeTimesNode; // ----------------------------------------------------------------------- // ElementTimesNode (factor1, factor2) -// This allows broadcasting, and can thus also scale with a row, a column, or a scalar. +// This allows broadcasting, and can thus also scale with a row, a column, or a scalar, +// as well as multiplying with a diagonal matrix (if represented as a column vector). // ----------------------------------------------------------------------- template @@ -459,6 +460,7 @@ template class ElementTimesNode; // ----------------------------------------------------------------------- // DiagTimesNode (vector representing the diagonal of a square matrix, data) +// TODO: This is redundant with ElementTimes and should be removed (with a compat stub). 
// ----------------------------------------------------------------------- template @@ -586,7 +588,10 @@ template class DiagTimesNode; // ----------------------------------------------------------------------- // SumElementsNode (input) -// sums up all elements in the input into a single scalar +// Sums up all elements in the input across all samples into a single scalar. +// When applied to minibatch data, this will sum across all sequences in the +// minibatch, like a training-criterion node. This is one of the few operations +// that cross the boundary between input sequences. // ----------------------------------------------------------------------- template @@ -633,7 +638,8 @@ template class SumElementsNode; // ----------------------------------------------------------------------- // SumColumnElementsNode (input) -// sums up all elements in each column of the input, reducing each column to a scalar +// Sums up all elements in each sample (column) of the input. Every sample +// will be reduced to a scalar. This is equivalent to multiplying with a row of ones. // TODO: This should be deprecated, in favor of a reduce node. // TODO: Implement this with the tensor library. // ----------------------------------------------------------------------- @@ -818,6 +824,7 @@ template class TransposeDimensionsNode; // CosDistanceNode (left, right) // column-wise cos distance // TODO: Would it be useful to allow one of the two to be a single column? +// TODO: Allow to reduce only over a single dimension, or a subset. // ----------------------------------------------------------------------- template @@ -874,7 +881,7 @@ class CosDistanceNode : public ComputationNode, public NumInputs<2> sliceOutputValue.AssignInnerProductOf(sliceInput0Value, sliceInput1Value, true); sliceOutputValue.ElementMultiplyWith(*m_invNorm0); sliceOutputValue.ElementMultiplyWith(*m_invNorm1); - // TODO: This formulation above allows to use the tensor lib for this, with automatic broadcasting. 
+ // TODO: This formulation above would allow to use the TensorView lib for this, with automatic broadcasting. } virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override @@ -945,7 +952,8 @@ template class CosDistanceNode; // ----------------------------------------------------------------------- // KhatriRaoProductNode (left, right) -// compute an outer product of column vectors (for each sample) +// Compute an outer product of column vectors (for each sample). +// TODO: This is a special kind of tensor product, and calls for a tensor representation. // ----------------------------------------------------------------------- template @@ -1263,4 +1271,5 @@ class CosDistanceWithNegativeSamplesNode : public ComputationNode, pub template class CosDistanceWithNegativeSamplesNode; template class CosDistanceWithNegativeSamplesNode; -} } } + +}}} diff --git a/Source/ComputationNetworkLib/NonlinearityNodes.h b/Source/ComputationNetworkLib/NonlinearityNodes.h index 9aa5beeea5a3..f22b435163cb 100644 --- a/Source/ComputationNetworkLib/NonlinearityNodes.h +++ b/Source/ComputationNetworkLib/NonlinearityNodes.h @@ -90,7 +90,7 @@ class UnaryElementWiseWithOpCodeNodeBase : public ComputationNode, pub // These are all implemented by single-opcode functions and can thus be declared by a macro. 
// ----------------------------------------------------------------------- -#pragma push_macro("DeclareUnaryTensorOp") +#pragma push_macro("DeclareUnaryElementWiseWithOpCodeNode") #define DeclareUnaryElementWiseWithOpCodeNode(Name, Forward, Backward, gradientFromOutput) \ template \ class Name##Node : public UnaryElementWiseWithOpCodeNodeBase \ @@ -110,15 +110,15 @@ class UnaryElementWiseWithOpCodeNodeBase : public ComputationNode, pub } \ } -// Name Forward and Backward opcodes -DeclareUnaryElementWiseWithOpCodeNode(Sigmoid, Sigmoid, ElementwiseProductWithSigmoidDerivativeFromOutput, true); -DeclareUnaryElementWiseWithOpCodeNode(Tanh, Tanh, ElementwiseProductWithTanhDerivativeFromOutput, true); +// Name Forward and Backward opcodes Gradient from output? +DeclareUnaryElementWiseWithOpCodeNode(Sigmoid, Sigmoid, ElementwiseProductWithSigmoidDerivativeFromOutput, true); +DeclareUnaryElementWiseWithOpCodeNode(Tanh, Tanh, ElementwiseProductWithTanhDerivativeFromOutput, true); DeclareUnaryElementWiseWithOpCodeNode(RectifiedLinear, LinearRectifier, ElementwiseProductWithLinearRectifierDerivativeFromOutput, true); -DeclareUnaryElementWiseWithOpCodeNode(Log, Log, ElementwiseProductWithLogDerivativeFromOutput, true); -DeclareUnaryElementWiseWithOpCodeNode(Exp, Exp, ElementwiseProduct, true); -DeclareUnaryElementWiseWithOpCodeNode(Cosine, Cosine, ElementwiseProductWithCosDerivative, false); +DeclareUnaryElementWiseWithOpCodeNode(Log, Log, ElementwiseProductWithLogDerivativeFromOutput, true); +DeclareUnaryElementWiseWithOpCodeNode(Exp, Exp, ElementwiseProduct, true); +DeclareUnaryElementWiseWithOpCodeNode(Cosine, Cosine, ElementwiseProductWithCosDerivative, false); -#pragma pop_macro("DeclareUnaryTensorOp") +#pragma pop_macro("DeclareUnaryElementWiseWithOpCodeNode") // ----------------------------------------------------------------------- // SoftmaxNodeBase (input) -- shared base of Softmax and LogSoftmax diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp 
index b67e82e3fe74..08b96183403f 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -122,7 +122,6 @@ void CPUMatrix::ZeroInit() m_numRows = 0; m_numCols = 0; m_elemSizeAllocated = 0; - m_matrixName = NULL; m_format = matrixFormatDense; m_externalBuffer = false; } @@ -133,14 +132,6 @@ CPUMatrix::CPUMatrix() ZeroInit(); } -//matrixName is used to verify that correct matrix is read. -template -CPUMatrix::CPUMatrix(FILE* f, const char* matrixName) -{ - ZeroInit(); - ReadFromFile(f, matrixName); -} - // helper to allocate an array of ElemType // Use this instead of new[] to get NaN initialization for debugging. template @@ -182,7 +173,6 @@ CPUMatrix::CPUMatrix(const CPUMatrix& deepCopyFrom) ZeroInit(); if (!deepCopyFrom.IsEmpty()) SetValue(deepCopyFrom); - SetMatrixName(deepCopyFrom.m_matrixName); } //assignment operator, deep copy @@ -192,7 +182,6 @@ CPUMatrix& CPUMatrix::operator=(const CPUMatrix& d Clear(); if (!deepCopyFrom.IsEmpty()) SetValue(deepCopyFrom); - SetMatrixName(deepCopyFrom.m_matrixName); return *this; } @@ -205,7 +194,6 @@ CPUMatrix::CPUMatrix(CPUMatrix&& moveFrom) m_numCols = moveFrom.m_numCols; m_elemSizeAllocated = moveFrom.m_elemSizeAllocated; m_pArray = moveFrom.m_pArray; // shallow copy the pointer - m_matrixName = moveFrom.m_matrixName; m_format = moveFrom.m_format; m_externalBuffer = moveFrom.m_externalBuffer; // release the pointer from the source object so that the destructor won't release it twice diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h index 22390005d474..2a8f7dbd3619 100644 --- a/Source/Math/CPUMatrix.h +++ b/Source/Math/CPUMatrix.h @@ -35,10 +35,8 @@ class MATH_API CPUMatrix : public BaseMatrix using B::m_elemSizeAllocated; using B::m_externalBuffer; using B::m_format; - using B::m_matrixName; // without this, base members would require to use thi-> in GCC public: CPUMatrix(); - CPUMatrix(FILE* f, const char* matrixName); // matrixName is used to verify that correct matrix is read. 
CPUMatrix(const size_t numRows, const size_t numCols); CPUMatrix(const size_t numRows, const size_t numCols, ElemType* pArray, const size_t matrixFlags = matrixFlagNormal); CPUMatrix(const CPUMatrix& deepCopyFrom); // copy constructor, deep copy @@ -55,7 +53,6 @@ class MATH_API CPUMatrix : public BaseMatrix using B::GetNumRows; using B::GetNumCols; using B::SetOwnBuffer; - using B::SetMatrixName; size_t BufferSize() const { @@ -398,10 +395,6 @@ class MATH_API CPUMatrix : public BaseMatrix stream >> d_array[i]; stream.GetMarker(fileMarkerEndSection, std::wstring(L"EMAT")); us.SetValue(numRows, numCols, d_array, matrixFlagNormal); - if (us.m_matrixName) - delete[] us.m_matrixName; - us.m_matrixName = new wchar_t[matrixName.length() + 1]; - wmemcpy(us.m_matrixName, matrixName.c_str(), matrixName.length() + 1); delete[] d_array; return stream; @@ -411,7 +404,7 @@ class MATH_API CPUMatrix : public BaseMatrix stream.PutMarker(fileMarkerBeginSection, std::wstring(L"BMAT")); stream << sizeof(ElemType); - std::wstring s = (us.m_matrixName == NULL) ? std::wstring(L"unnamed") : std::wstring(us.m_matrixName); + std::wstring s = std::wstring(L"unnamed"); int format = us.m_format; stream << s << format; diff --git a/Source/Math/CPUSparseMatrix.cpp b/Source/Math/CPUSparseMatrix.cpp index 627efb0f13e0..e9df17d6f00b 100644 --- a/Source/Math/CPUSparseMatrix.cpp +++ b/Source/Math/CPUSparseMatrix.cpp @@ -157,7 +157,6 @@ CPUSparseMatrix::CPUSparseMatrix(const CPUSparseMatrix& deep ZeroInit(); if (!deepCopyFrom.IsEmpty()) SetValue(deepCopyFrom); - SetMatrixName(deepCopyFrom.m_matrixName); } // assignment operator, deep copy @@ -167,7 +166,6 @@ CPUSparseMatrix& CPUSparseMatrix::operator=(const CPUSparseM Clear(); if (!deepCopyFrom.IsEmpty()) SetValue(deepCopyFrom); - SetMatrixName(deepCopyFrom.m_matrixName); return *this; } @@ -234,8 +232,6 @@ void CPUSparseMatrix::ReleaseMemory() // In that case we shouldn't free anything. 
if (!m_externalBuffer) { - delete[] m_matrixName; - if (m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR) { delete[] m_pArray; @@ -384,7 +380,6 @@ CPUSparseMatrix CPUSparseMatrix::ColumnSlice(size_t startCol CPUSparseMatrix slice(m_format); slice.m_numRows = m_numRows; slice.m_numCols = numCols; - // BUGBUG: m_matrixName? // BUGBUG: m_sliceViewOffset? slice.m_externalBuffer = true; slice.m_sliceOf = const_cast*>(this); // BUGBUG: ColumnSlice() returns a reference to a mutable matrix, even if itself is 'const'; should not be. @@ -1275,8 +1270,6 @@ MATH_API File& operator>>(File& stream, CPUSparseMatrix& us) } stream.GetMarker(fileMarkerEndSection, std::wstring(L"EMAT")); - us.SetMatrixName(matrixName.c_str()); - return stream; } @@ -1291,15 +1284,7 @@ MATH_API File& operator<<(File& stream, const CPUSparseMatrix& us) stream.PutMarker(fileMarkerBeginSection, std::wstring(L"BMAT")); stream << sizeof(ElemType); - if (us.GetMatrixName() == nullptr) - { - std::wstring s(L"nnmatrix"); - stream << s; - } - else - { - stream << us.GetMatrixName(); - } + stream << std::wstring(L"nnmatrix"); // Note this is needed for compatability, and could potentially be an empty string size_t nz, numRows, numCols; size_t compressedSize = us.SecondaryIndexCount(); @@ -1342,6 +1327,7 @@ template CPUSparseMatrix::CPUSparseMatrix(CPUSparseMatrix const&); template CPUSparseMatrix::CPUSparseMatrix(CPUSparseMatrix&&); template CPUSparseMatrix& CPUSparseMatrix::operator=(CPUSparseMatrix&& moveFrom); template void CPUSparseMatrix::SetValue(size_t, size_t, char); +template void CPUSparseMatrix::SetValue(CPUSparseMatrix const&); template char* CPUSparseMatrix::BufferPointer() const; template void CPUSparseMatrix::Reset(void); template CPUSparseMatrix::~CPUSparseMatrix(); diff --git a/Source/Math/CPUSparseMatrix.h b/Source/Math/CPUSparseMatrix.h index f160372a3123..5527d0ec632d 100644 --- a/Source/Math/CPUSparseMatrix.h +++ 
b/Source/Math/CPUSparseMatrix.h @@ -27,7 +27,6 @@ class MATH_API CPUSparseMatrix : public BaseMatrix using Base::m_computeDevice; using Base::m_externalBuffer; using Base::m_format; - using Base::m_matrixName; using Base::m_numCols; using Base::m_numRows; using Base::m_nz; @@ -38,7 +37,6 @@ class MATH_API CPUSparseMatrix : public BaseMatrix public: using Base::OwnBuffer; using Base::IsEmpty; - using Base::SetMatrixName; private: void ZeroInit(); @@ -168,7 +166,6 @@ class MATH_API CPUSparseMatrix : public BaseMatrix ElemType SumOfElements() const; // sum of all elements public: - // void Print(const char* /*matrixName*/) const { NOT_IMPLEMENTED; } void Print(const char* matrixName, ptrdiff_t rowStart, ptrdiff_t rowEnd, ptrdiff_t colStart, ptrdiff_t colEnd) const; void Print(const char* matrixName = NULL) const; // print whole matrix. can be expensive diff --git a/Source/Math/CommonMatrix.h b/Source/Math/CommonMatrix.h index 233bb4358be7..17d4d9d64a52 100644 --- a/Source/Math/CommonMatrix.h +++ b/Source/Math/CommonMatrix.h @@ -261,10 +261,6 @@ class BaseMatrix { m_externalBuffer = !own; } - wchar_t* GetMatrixName() const - { - return m_matrixName; - } size_t NzCount() const { return m_nz; @@ -283,38 +279,16 @@ class BaseMatrix LogicError("VerifySize: expected matrix size %lu x %lu, but it is %lu x %lu", rows, cols, GetNumRows(), GetNumCols()); } - void SetMatrixName(const wchar_t* s) - { - Clear(); - if (s != nullptr) - { - size_t n = wcslen(s); - m_matrixName = new wchar_t[n + 1]; - wmemcpy(m_matrixName, s, n + 1); - } - } BaseMatrix() { - m_numRows = m_numCols = m_elemSizeAllocated = 0; - m_pArray = NULL; - m_matrixName = NULL; + ZeroInit(); m_format = matrixFormatDense; - m_externalBuffer = false; - m_nz = 0; m_computeDevice = CPUDEVICE; } - ~BaseMatrix() - { - Clear(); - } protected: - void Clear() - { - delete[] m_matrixName; - m_matrixName = nullptr; - } + void Clear() {} void ZeroInit() { @@ -325,7 +299,6 @@ class BaseMatrix m_externalBuffer = false; m_pArray 
= nullptr; m_nz = 0; - m_matrixName = nullptr; } // copy all metadata (but not content taht pArray points to) @@ -341,7 +314,6 @@ class BaseMatrix m_externalBuffer = other.m_externalBuffer; m_pArray = other.m_pArray; m_nz = other.m_nz; - m_matrixName = other.m_matrixName; } protected: @@ -355,7 +327,6 @@ class BaseMatrix bool m_externalBuffer; // is the buffer used by this matrix, ElemType* m_pArray; size_t m_nz; // Number of non-zero elements for sparse matrices (unused in other formats) - wchar_t* m_matrixName; // TODO: Use std::wstring? }; }}} diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu index f424629858d0..8c31ec4c9a50 100644 --- a/Source/Math/GPUMatrix.cu +++ b/Source/Math/GPUMatrix.cu @@ -418,7 +418,6 @@ void GPUMatrix::ZeroInit(int deviceId) m_numRows = 0; m_numCols = 0; m_elemSizeAllocated = 0; - m_matrixName = NULL; m_format = matrixFormatDense; m_externalBuffer = false; } @@ -429,13 +428,6 @@ GPUMatrix::GPUMatrix(int deviceId) ZeroInit(deviceId); }; -//matrixName is used to verify that correct matrix is read. 
-template -GPUMatrix::GPUMatrix(FILE* f, const char* matrixName, int /*deviceId*/) -{ - ReadFromFile(f, matrixName); -} - template GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId) { @@ -463,7 +455,6 @@ GPUMatrix::GPUMatrix(const GPUMatrix& deepCopyFrom) { ZeroInit(deepCopyFrom.m_computeDevice); SetValue(deepCopyFrom); - SetMatrixName(deepCopyFrom.m_matrixName); } template @@ -473,7 +464,6 @@ GPUMatrix::GPUMatrix(GPUMatrix&& moveFrom) m_numCols = moveFrom.m_numCols; m_computeDevice = moveFrom.m_computeDevice; m_pArray = moveFrom.m_pArray; // shallow copy the pointer - m_matrixName = moveFrom.m_matrixName; m_elemSizeAllocated = moveFrom.m_elemSizeAllocated; m_format = moveFrom.m_format; m_externalBuffer = moveFrom.m_externalBuffer; @@ -489,7 +479,6 @@ GPUMatrix& GPUMatrix::operator=(const GPUMatrix& d if (this != &deepCopyFrom) { SetValue(deepCopyFrom); - SetMatrixName(deepCopyFrom.m_matrixName); } return *this; } @@ -606,7 +595,6 @@ GPUMatrix& GPUMatrix::AssignColumnSlice(const GPUMatrix& GPUMatrix::AssignTransposeOf(const GPUMatrix::SetValue(const size_t numRows, const size_t numCols, i m_numCols = numCols; m_pArray = pArray; m_elemSizeAllocated = GetNumElements(); - m_matrixName = NULL; m_format = matrixFormatDense; m_externalBuffer = true; m_computeDevice = deviceId; @@ -2744,21 +2730,6 @@ void GPUMatrix::Print(const char* matrixName /*=nullptr*/) const Print(matrixName, 0, GetNumRows() - 1, 0, GetNumCols() - 1); } -// file I/O -//matrixName is used to verify that correct matrix is read. -template -void GPUMatrix::ReadFromFile(FILE*, const char* /*matrixName*/) -{ - NOT_IMPLEMENTED; -} - -//matrixName is used to verify that correct matrix is read. 
-template -void GPUMatrix::WriteToFile(FILE*, const char* /*matrixName*/) -{ - NOT_IMPLEMENTED; -} - //helpfer function used for convolution neural network template GPUMatrix& GPUMatrix::AssignPackedConvolutionInput(const GPUMatrix& inputSubBatch, diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h index 22d6601f098e..822db1be5f5e 100644 --- a/Source/Math/GPUMatrix.h +++ b/Source/Math/GPUMatrix.h @@ -103,7 +103,6 @@ class MATH_API GPUMatrix : public BaseMatrix static const int MaxGpus = 8; // support up to 8 GPUs using BaseMatrix::m_computeDevice; using BaseMatrix::m_elemSizeAllocated; - using BaseMatrix::m_matrixName; using BaseMatrix::m_format; using BaseMatrix::m_externalBuffer; using BaseMatrix::m_nz; @@ -113,7 +112,6 @@ class MATH_API GPUMatrix : public BaseMatrix using BaseMatrix::GetArray; using BaseMatrix::GetNumRows; using BaseMatrix::GetNumCols; - using BaseMatrix::SetMatrixName; private: static cublasHandle_t s_cuHandle[MaxGpus]; @@ -139,7 +137,6 @@ class MATH_API GPUMatrix : public BaseMatrix public: explicit GPUMatrix(int deviceId); - GPUMatrix(FILE* f, const char* matrixName, int deviceId); GPUMatrix(const size_t numRows, const size_t numCols, int deviceId); GPUMatrix(const size_t numRows, const size_t numCols, int deviceId, ElemType* pArray, const size_t matrixFlags = matrixFlagNormal); GPUMatrix(const GPUMatrix& deepCopyFrom); @@ -372,9 +369,6 @@ class MATH_API GPUMatrix : public BaseMatrix void Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd) const; void Print(const char* matrixName = NULL) const; // print whole matrix. can be expensive - void ReadFromFile(FILE* f, const char* matrixName); // matrixName is used to verify that correct matrix is read. - void WriteToFile(FILE* f, const char* matrixName); // matrixName is used to verify that correct matrix is read. 
- GPUMatrix& AssignPackedConvolutionInput(const GPUMatrix& inputSubBatch, const size_t inputWidth, const size_t inputHeight, const size_t inputChannels, const size_t outputWidth, const size_t outputHeight, const size_t outputChannels, @@ -491,19 +485,16 @@ class MATH_API GPUMatrix : public BaseMatrix stream >> elsize; if (sizeof(ElemType) != elsize) LogicError("Template argument size doesn't match those in file"); - std::wstring matrixName; + std::wstring matrixNameDummy; // Note this is not used anymore, just a dummy for compatability. size_t numRows, numCols; int format; - stream >> matrixName >> format >> numRows >> numCols; + stream >> matrixNameDummy >> format >> numRows >> numCols; ElemType* d_array = new ElemType[numRows * numCols]; for (size_t i = 0; i < numRows * numCols; ++i) stream >> d_array[i]; stream.GetMarker(fileMarkerEndSection, std::wstring(L"EMAT")); us.SetValue(numRows, numCols, us.GetComputeDeviceId(), d_array, matrixFlagNormal | format); delete[] d_array; - us.m_matrixName = new wchar_t[matrixName.length() + 1]; - wmemcpy(us.m_matrixName, matrixName.c_str(), matrixName.length() + 1); - // us.m_matrixName = matrixName; return stream; } friend File& operator<<(File& stream, const GPUMatrix& us) @@ -511,7 +502,8 @@ class MATH_API GPUMatrix : public BaseMatrix stream.PutMarker(fileMarkerBeginSection, std::wstring(L"BMAT")); stream << sizeof(ElemType); - std::wstring s = (us.m_matrixName == NULL) ? std::wstring(L"unnamed") : std::wstring(us.m_matrixName); + // TODO: This is now ignored on input, so we can should change to an empty string. 
This might break parsing, and must be tested first + std::wstring s = std::wstring(L"unnamed"); int format = us.m_format; stream << s << format; diff --git a/Source/Math/GPUSparseMatrix.cu b/Source/Math/GPUSparseMatrix.cu index 78beeb3e36d4..40c9d63dfce6 100644 --- a/Source/Math/GPUSparseMatrix.cu +++ b/Source/Math/GPUSparseMatrix.cu @@ -145,8 +145,6 @@ template GetNumNZElements()); } - SetMatrixName(deepCopy.m_matrixName); - // TODO: to copy other varibles used only for class based LM } @@ -302,7 +300,6 @@ void GPUSparseMatrix::CopyToDenseMatrix(GPUMatrix& denseMatr } CUSPARSE_CALL(cusparseDestroy(cusparseHandle)); - denseMatrix.SetMatrixName(m_matrixName); } template @@ -515,7 +512,6 @@ void GPUSparseMatrix::SetValue(const GPUMatrix& denseMatrix, (int) m_numRows, nnzPerRowOrCol, reinterpret_cast(BufferPointer()), RowLocation(), ColLocation())); } } - SetMatrixName(denseMatrix.GetMatrixName()); } template @@ -525,7 +521,6 @@ GPUSparseMatrix& GPUSparseMatrix::operator=(const GPUSparseM if (this != &deepCopy) SetValue(deepCopy); - SetMatrixName(deepCopy.m_matrixName); return *this; } @@ -576,9 +571,6 @@ template // In that case we shouldn't free anything. if (OwnBuffer()) { - delete[] m_matrixName; - m_matrixName = nullptr; - delete[](byte*) m_tempHostBuffer; m_tempHostBuffer = nullptr; @@ -2142,7 +2134,6 @@ GPUSparseMatrix GPUSparseMatrix::ColumnSlice(size_t startCol slice.m_format = m_format; slice.m_externalBuffer = true; slice.m_sliceOf = const_cast*>(this); // BUGBUG: ColumnSlice() returns a reference to a mutable matrix, even if itself is 'const'; should not be. 
- slice.m_matrixName = m_matrixName; slice.m_blockSize = m_blockSize; slice.m_rowToId = m_rowToId; slice.m_tempHostBuffer = m_tempHostBuffer; @@ -2190,8 +2181,6 @@ GPUMatrix GPUSparseMatrix::CopyColumnSliceToDense(size_t sta CUSPARSE_CALL(cusparseDestroy(cusparseHandle)); - slice.SetMatrixName(m_matrixName); - return slice; } @@ -2717,7 +2706,6 @@ MATH_API File& operator>>(File& stream, GPUSparseMatrix& us) } stream.GetMarker(fileMarkerEndSection, std::wstring(L"EMAT")); - us.SetMatrixName(matrixName.c_str()); return stream; } @@ -2733,15 +2721,8 @@ MATH_API File& operator<<(File& stream, const GPUSparseMatrix& us) stream.PutMarker(fileMarkerBeginSection, std::wstring(L"BMAT")); stream << sizeof(ElemType); - if (us.GetMatrixName() == nullptr) - { - std::wstring s(L"nnmatrix"); - stream << s; - } - else - { - stream << us.GetMatrixName(); - } + std::wstring s(L"nnmatrix"); + stream << s; size_t nz = us.GetNumNZElements(), numElemAllocated = us.GetNumElemAllocated(), numRows = us.GetNumRows(), numCols = us.GetNumCols(); size_t compressedSize = us.SecondaryIndexCount(); diff --git a/Source/Math/GPUSparseMatrix.h b/Source/Math/GPUSparseMatrix.h index 0b823e0fa14c..3287ad23d3a6 100644 --- a/Source/Math/GPUSparseMatrix.h +++ b/Source/Math/GPUSparseMatrix.h @@ -33,14 +33,12 @@ class MATH_API GPUSparseMatrix : public BaseMatrix using Base::m_format; using Base::m_computeDevice; using Base::m_externalBuffer; - using Base::m_matrixName; using Base::OwnBuffer; using Base::GetFormat; using Base::SetFormat; using Base::GetNumRows; using Base::GetNumCols; using Base::SetComputeDeviceId; - using Base::SetMatrixName; using Base::SetNzCount; using Base::Clear; // without this, base members would require to use thi-> in GCC diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index a73fa4166712..3ff79a7e3347 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -289,42 +289,6 @@ Matrix::Matrix(BaseMatrix* baseMatrix, ElemType* pArray, DEV 
m_baseMatrix->SetArray(pArray); } -//matrixName is used to verify that correct matrix is read. -template -Matrix::Matrix(FILE* f, const char* matrixName, DEVICEID_TYPE deviceId, const MatrixType matrixType) -{ - Init(deviceId); - - if (matrixType == MatrixType::SPARSE) - { - if (m_preferredDeviceId == CPUDEVICE) - { - NOT_IMPLEMENTED; - // m_CPUSparseMatrix = new CPUSparseMatrix(f,matrixName); - SetDataLocation(CPU, SPARSE); - } - else - { - NOT_IMPLEMENTED; - // m_GPUSparseMatrix = new GPUSparseMatrix(f,matrixName, m_preferredDeviceId); - SetDataLocation(GPU, SPARSE); - } - } - else - { - if (m_preferredDeviceId == CPUDEVICE) - { - m_CPUMatrix = new CPUMatrix(f, matrixName); - SetDataLocation(CPU, DENSE); - } - else - { - m_GPUMatrix = new GPUMatrix(f, matrixName, m_preferredDeviceId); - SetDataLocation(GPU, DENSE); - } - } -} - template Matrix::Matrix(const size_t numRows, const size_t numCols, DEVICEID_TYPE deviceId, const MatrixType matrixType, const MatrixFormat matrixFormat) { @@ -3388,39 +3352,6 @@ void Matrix::VectorMin(Matrix& minIndexes, Matrix& #pragma region Other helper Functions -template -wchar_t* Matrix::GetMatrixName() const -{ - return m_baseMatrix->GetMatrixName(); -} - -template -void Matrix::SetMatrixName(const wchar_t* s) -{ - if (m_currentDataLocation == CurrentDataLocation::BOTH) - { - if (GetMatrixType() == MatrixType::DENSE) - { - m_CPUMatrix->SetMatrixName(s); - m_GPUMatrix->SetMatrixName(s); - } - else if (GetMatrixType() == MatrixType::SPARSE) - { - m_CPUSparseMatrix->SetMatrixName(s); - m_GPUSparseMatrix->SetMatrixName(s); - } - } - else - { - DISPATCH_MATRIX_ON_FLAG(this, - nullptr, - m_CPUMatrix->SetMatrixName(s), - m_GPUMatrix->SetMatrixName(s), - m_CPUSparseMatrix->SetMatrixName(s), - m_GPUSparseMatrix->SetMatrixName(s)); - } -} - template int Matrix::GetDeviceId() const { diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index a159dd13a824..2afb374977d1 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -87,7 
+87,6 @@ class MATH_API Matrix : public MatrixBase // Elseif deviceId>=0 then the matrix will be based on GPU with specified deviceId explicit Matrix(DEVICEID_TYPE deviceId); Matrix(BaseMatrix* baseMatrix, ElemType* pArray, DEVICEID_TYPE deviceId); // constructor for setting Matrix from a base matrix (externally managed butter pArray) - Matrix(FILE* f, const char* matrixName, DEVICEID_TYPE deviceId, const MatrixType matrixType = DENSE); // matrixName is used to verify that correct matrix is read. Matrix(const size_t numRows, const size_t numCols, DEVICEID_TYPE deviceId, const MatrixType matrixType = DENSE, const MatrixFormat matrixFormat = matrixFormatDense); Matrix(const size_t numRows, const size_t numCols, ElemType* pArray, DEVICEID_TYPE deviceId, const size_t matrixFlags = matrixFlagNormal, const size_t nnz = 0); Matrix(const Matrix& deepCopyFrom, DEVICEID_TYPE deviceId); @@ -160,8 +159,6 @@ class MATH_API Matrix : public MatrixBase { return GetNumElements() == 0; } - wchar_t* GetMatrixName() const; - void SetMatrixName(const wchar_t* s); bool IsEmpty() const; size_t BufferSize() const; ElemType* BufferPointer() const; diff --git a/Source/Math/NoGPU.cpp b/Source/Math/NoGPU.cpp index 41e793884a81..47a6c2392cf2 100644 --- a/Source/Math/NoGPU.cpp +++ b/Source/Math/NoGPU.cpp @@ -777,12 +777,6 @@ void GPUMatrix::ZeroInit(int deviceId) template GPUMatrix::GPUMatrix(int deviceId){}; -//matrixName is used to verify that correct matrix is read. -template -GPUMatrix::GPUMatrix(FILE* f, const char* matrixName, int deviceId) -{ -} - template GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId){}; @@ -1662,19 +1656,6 @@ void GPUMatrix::Print(const char* matrixName /*=nullptr*/) const { } -// file I/O -//matrixName is used to verify that correct matrix is read. -template -void GPUMatrix::ReadFromFile(FILE* f, const char* matrixName) -{ -} - -//matrixName is used to verify that correct matrix is read. 
-template -void GPUMatrix::WriteToFile(FILE* f, const char* matrixName) -{ -} - //helpfer function used for convolution neural network template GPUMatrix& GPUMatrix::AssignPackedConvolutionInput(const GPUMatrix& inputSubBatch, diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp index d368f624781c..e4654023fdec 100644 --- a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp +++ b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp @@ -122,6 +122,7 @@ void HTKMLFReader::PrepareForTrainingOrTesting(const ConfigRecordType& vector statelistpaths; vector numContextLeft; vector numContextRight; + size_t numExpandToUtt = 0; std::vector featureNames; std::vector labelNames; @@ -139,6 +140,12 @@ void HTKMLFReader::PrepareForTrainingOrTesting(const ConfigRecordType& { const ConfigRecordType& thisFeature = readerConfig(featureNames[i]); m_featDims.push_back(thisFeature(L"dim")); + + bool expandToUtt = thisFeature(L"expandToUtterance", false); // should feature be processed as an ivector? 
+ m_expandToUtt.push_back(expandToUtt); + if (expandToUtt) + numExpandToUtt++; + intargvector contextWindow = thisFeature(L"contextWindow", ConfigRecordType::Array(intargvector(vector{1}))); if (contextWindow.size() == 1) // symmetric { @@ -158,6 +165,10 @@ template { InvalidArgument("contextFrames must have 1 or 2 values specified, found %d", (int) contextWindow.size()); } + + if (expandToUtt && (numContextLeft[i] != 0 || numContextRight[i] != 0)) + RuntimeError("contextWindow expansion not permitted when expandToUtterance=true"); + // update m_featDims to reflect the total input dimension (featDim x contextWindow), not the native feature dimension // that is what the lower level feature readers expect m_featDims[i] = m_featDims[i] * (1 + numContextLeft[i] + numContextRight[i]); @@ -291,6 +302,12 @@ void HTKMLFReader::PrepareForTrainingOrTesting(const ConfigRecordType& if (iFeat != scriptpaths.size() || iLabel != mlfpathsmulti.size()) RuntimeError("# of inputs files vs. # of inputs or # of output files vs # of outputs inconsistent\n"); + if (iFeat == numExpandToUtt) + RuntimeError("At least one feature stream must be frame-based, not utterance-based"); + + if (m_expandToUtt[0]) // first feature stream is ivector type - that will mess up lower level feature reader + RuntimeError("The first feature stream in the file must be frame-based not utterance based. Please reorder the feature blocks of your config appropriately"); + if (readerConfig.Exists(L"randomize")) { wstring randomizeString = readerConfig.CanBeString(L"randomize") ?
readerConfig(L"randomize") : wstring(); @@ -317,6 +334,9 @@ void HTKMLFReader::PrepareForTrainingOrTesting(const ConfigRecordType& if (readMethod == L"blockRandomize" && randomize == randomizeNone) InvalidArgument("'randomize' cannot be 'none' when 'readMethod' is 'blockRandomize'."); + if (readMethod == L"rollingWindow" && numExpandToUtt>0) + RuntimeError("rollingWindow reader does not support expandToUtt. Change to blockRandomize.\n"); + // read all input files (from multiple inputs) // TO DO: check for consistency (same number of files in each script file) numFiles = 0; @@ -487,7 +507,7 @@ void HTKMLFReader::PrepareForTrainingOrTesting(const ConfigRecordType& // now get the frame source. This has better randomization and doesn't create temp files bool minimizeReaderMemoryFootprint = readerConfig(L"minimizeReaderMemoryFootprint", true); - m_frameSource.reset(new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, m_frameMode, minimizeReaderMemoryFootprint)); + m_frameSource.reset(new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, m_frameMode, minimizeReaderMemoryFootprint, m_expandToUtt)); m_frameSource->setverbosity(m_verbosity); } else if (EqualCI(readMethod, L"rollingWindow")) @@ -1482,6 +1502,7 @@ bool HTKMLFReader::GetMinibatchToWrite(StreamMinibatchInputs& matrices m_fileEvalSource->Reset(); // load next file (or set of files) + size_t nfr = 0; foreach_index (i, m_inputFilesMultiIO) { msra::asr::htkfeatreader reader; @@ -1492,9 +1513,19 @@ bool HTKMLFReader::GetMinibatchToWrite(StreamMinibatchInputs& matrices string featkind; unsigned int sampperiod; msra::util::attempt(5, [&]() - { - reader.read(path, featkind, sampperiod, feat); // whole file read as columns of feature vectors - }); + { + reader.read(path, featkind, sampperiod, 
feat); // whole file read as columns of feature vectors + }); + if (i == 0) + nfr = feat.cols(); + else if (feat.cols() == 1 && nfr > 1) + { // This broadcasts a vector to be multiple columns, as needed for i-vector support + msra::dbn::matrix feat_col(feat); + feat.resize(feat.rows(), nfr); + for (size_t i = 0; i < feat.rows(); i++) + for (size_t j = 0; j < feat.cols(); j++) + feat(i, j) = feat_col(i, 0); + } fprintf(stderr, "evaluate: reading %d frames of %ls\n", (int) feat.cols(), ((wstring) path).c_str()); m_fileEvalSource->AddFile(feat, featkind, sampperiod, i); } diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.h b/Source/Readers/HTKMLFReader/HTKMLFReader.h index 19786e8189fa..234e8552f83b 100644 --- a/Source/Readers/HTKMLFReader/HTKMLFReader.h +++ b/Source/Readers/HTKMLFReader/HTKMLFReader.h @@ -96,6 +96,7 @@ class HTKMLFReader : public IDataReader size_t m_inputFileIndex; std::vector m_featDims; std::vector m_labelDims; + std::vector m_expandToUtt; // support for i-vector type of input - single fram should be applied to entire utterance std::vector>> m_labelToTargetMapMultiIO; diff --git a/Source/Readers/HTKMLFReader/htkfeatio.h b/Source/Readers/HTKMLFReader/htkfeatio.h index 070c99b65d70..1b9129c39df0 100644 --- a/Source/Readers/HTKMLFReader/htkfeatio.h +++ b/Source/Readers/HTKMLFReader/htkfeatio.h @@ -697,19 +697,29 @@ class htkfeatreader : protected htkfeatio v.insert(iter, 0.0f); } } - foreach_index (k, v) + foreach_index(k, v) feat(k, t) = v[k]; } } // read an entire utterance into an already allocated matrix // Matrix type needs to have operator(i,j) template - void read(const parsedpath& ppath, const string& kindstr, const unsigned int period, MATRIX& feat) + void read(const parsedpath& ppath, const string& kindstr, const unsigned int period, MATRIX& feat, bool needsExpansion=false) { // open the file and check dimensions size_t numframes = open(ppath); - if (feat.cols() != numframes || feat.rows() != featdim) - LogicError("read: stripe read 
called with wrong dimensions"); + if (needsExpansion) + { + if (numframes != 1) + throw std::logic_error("read: if doing utterance-based expansion of features (e.g. ivectors), utterance must contain 1 frame only"); + if (feat.rows() != featdim) + throw std::logic_error("read: stripe read called with wrong dimensions"); + } + else + { + if (feat.cols() != numframes || feat.rows() != featdim) + LogicError("read: stripe read called with wrong dimensions"); + } if (kindstr != featkind || period != featperiod) LogicError("read: attempting to mixing different feature kinds"); @@ -717,6 +727,16 @@ class htkfeatreader : protected htkfeatio try { read(feat, 0, numframes); + if (needsExpansion) // copy first frame to all the frames in the stripe + { + for (int t = 1; t < feat.cols(); t++) + { + for (int k = 0; k < feat.rows(); k++) + { + feat(k, t) = feat(k, 0); + } + } + } } catch (...) { diff --git a/Source/Readers/HTKMLFReader/utterancesourcemulti.h b/Source/Readers/HTKMLFReader/utterancesourcemulti.h index 5645173bd661..40e585427e2b 100644 --- a/Source/Readers/HTKMLFReader/utterancesourcemulti.h +++ b/Source/Readers/HTKMLFReader/utterancesourcemulti.h @@ -30,6 +30,7 @@ class minibatchutterancesourcemulti : public minibatchsource std::vector sampperiod; // (for reference and to check against model) std::vector featkind; std::vector featdim; + std::vector expandToUtt; // indicator of whether features should be applied to entire utterance, e.g. 
ivectors const bool framemode; // true -> actually return frame-level randomized frames (not possible in lattice mode) std::vector> counts; // [s] occurence count for all states (used for priors) int verbosity; @@ -48,17 +49,21 @@ class minibatchutterancesourcemulti : public minibatchsource size_t classidsbegin; // index into allclassids[] array (first frame) utterancedesc(msra::asr::htkfeatreader::parsedpath &&ppath, size_t classidsbegin) - : parsedpath(std::move(ppath)), classidsbegin(classidsbegin) + : parsedpath(std::move(ppath)), classidsbegin(classidsbegin), framesToExpand(0), needsExpansion(false) { } - + bool needsExpansion; // ivector type of feature + size_t framesToExpand; // expected number of frames (to expand ivectors) wstring logicalpath() const { return parsedpath; /*type cast will return logical path*/ } size_t numframes() const { - return parsedpath.numframes(); + if (needsExpansion) + return framesToExpand; + else + return parsedpath.numframes(); } wstring key() const // key used for looking up lattice (not stored to save space) { @@ -70,6 +75,11 @@ class minibatchutterancesourcemulti : public minibatchsource return removeExtension(logicalpath()); #endif } + void expandtoutterance(size_t requiredFrames) + { + needsExpansion = true; + framesToExpand = requiredFrames; + } }; // Make sure type 'utterancedesc' has a move constructor @@ -158,7 +168,7 @@ class minibatchutterancesourcemulti : public minibatchsource // fprintf (stderr, "."); // read features for this file auto uttframes = getutteranceframes(i); // matrix stripe for this utterance (currently unfilled) - reader.read(utteranceset[i].parsedpath, (const string &) featkind, sampperiod, uttframes); // note: file info here used for checkuing only + reader.read(utteranceset[i].parsedpath, (const string &)featkind, sampperiod, uttframes, utteranceset[i].needsExpansion); // note: file info here used for checkuing only // page in lattice data if (!latticesource.empty()) 
latticesource.getlattices(utteranceset[i].key(), lattices[i], uttframes.cols()); @@ -831,8 +841,8 @@ class minibatchutterancesourcemulti : public minibatchsource // This mode requires utterances with time stamps. minibatchutterancesourcemulti(const std::vector> &infiles, const std::vector>> &labels, std::vector vdim, std::vector udim, std::vector leftcontext, std::vector rightcontext, size_t randomizationrange, - const latticesource &lattices, const map &allwordtranscripts, const bool framemode, bool minimizeMemoryFootprint) - : vdim(vdim), leftcontext(leftcontext), rightcontext(rightcontext), sampperiod(0), featdim(0), randomizationrange(randomizationrange), currentsweep(SIZE_MAX), lattices(lattices), allwordtranscripts(allwordtranscripts), framemode(framemode), chunksinram(0), timegetbatch(0), verbosity(2), m_generatePhoneBoundaries(!lattices.empty()), m_frameRandomizer(randomizedchunks, minimizeMemoryFootprint) + const latticesource &lattices, const map &allwordtranscripts, const bool framemode, bool minimizeMemoryFootprint, std::vector expandToUtt) + : vdim(vdim), leftcontext(leftcontext), rightcontext(rightcontext), sampperiod(0), featdim(0), randomizationrange(randomizationrange), currentsweep(SIZE_MAX), lattices(lattices), allwordtranscripts(allwordtranscripts), framemode(framemode), chunksinram(0), timegetbatch(0), verbosity(2), m_generatePhoneBoundaries(!lattices.empty()), m_frameRandomizer(randomizedchunks, minimizeMemoryFootprint), expandToUtt(expandToUtt) // [v-hansu] change framemode (lattices.empty()) into framemode (false) to run utterance mode without lattice // you also need to change another line, search : [v-hansu] comment out to run utterance mode without lattice { @@ -881,6 +891,8 @@ class minibatchutterancesourcemulti : public minibatchsource numutts = infiles[m].size(); uttisvalid = std::vector(numutts, true); uttduration = std::vector(numutts, 0); + if (expandToUtt[m]) + RuntimeError("minibatchutterancesourcemulti: the first feature stream 
must be frame-based not utterance based"); } else if (infiles[m].size() != numutts) RuntimeError("minibatchutterancesourcemulti: all feature files must have same number of utterances"); @@ -889,8 +901,10 @@ class minibatchutterancesourcemulti : public minibatchsource { utterancedesc utterance(msra::asr::htkfeatreader::parsedpath(infiles[m][i]), 0); // mseltzer - is this foolproof for multiio? is classids always non-empty? const size_t uttframes = utterance.numframes(); // will throw if frame bounds not given --required to be given in this mode + if (expandToUtt[m] && uttframes != 1) + RuntimeError("minibatchutterancesource: utterance-based features must be 1 frame in duration"); // we need at least 2 frames for boundary markers to work - if (uttframes < 2) + else if (!expandToUtt[m] && uttframes < 2) RuntimeError("minibatchutterancesource: utterances < 2 frames not supported"); if (uttframes > frameref::maxframesperutterance) { @@ -905,7 +919,7 @@ class minibatchutterancesourcemulti : public minibatchsource uttduration[i] = uttframes; uttisvalid[i] = true; } - else if (uttduration[i] != uttframes) + else if (uttduration[i] != uttframes && !expandToUtt[m]) { fprintf(stderr, "minibatchutterancesource: skipping %d-th file due to inconsistency in duration in different feature streams (%d vs %d frames)\n", i, (int) uttduration[i], (int) uttframes); uttduration[i] = 0; @@ -954,7 +968,15 @@ class minibatchutterancesourcemulti : public minibatchsource { utterancedesc utterance(msra::asr::htkfeatreader::parsedpath(infiles[m][i]), labels.empty() ? 0 : classidsbegin[i]); // mseltzer - is this foolproof for multiio? is classids always non-empty? 
const size_t uttframes = utterance.numframes(); // will throw if frame bounds not given --required to be given in this mode - assert(uttframes == uttduration[i]); // ensure nothing funky happened + if (expandToUtt[m]) + { + assert(uttframes == 1); + utterance.expandtoutterance(uttduration[i]); + } + else + { + assert(uttframes == uttduration[i]); // ensure nothing funky happened + } // already performed these checks above // we need at least 2 frames for boundary markers to work // if (uttframes < 2) diff --git a/Source/SGDLib/DataReaderHelpers.h b/Source/SGDLib/DataReaderHelpers.h index 73d6d6cec09d..a71345c2a4bb 100644 --- a/Source/SGDLib/DataReaderHelpers.h +++ b/Source/SGDLib/DataReaderHelpers.h @@ -185,6 +185,31 @@ namespace Microsoft { namespace MSR { namespace CNTK { return selected; } + template + static size_t GetNumSubminibatchesNeeded(IDataReader* dataReader, + size_t maxSamplesInRAM, + size_t numSubminibatches, + size_t tunedMBSize) + { + if (numSubminibatches > 1) // user-specified maximum number of samples + return numSubminibatches; + + if (maxSamplesInRAM < SIZE_MAX) + { + // into how many pieces would we need to break the minibatch? + // TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed. + size_t numParallelSequences = dataReader->GetNumParallelSequences(); + size_t estimatedMBSize = tunedMBSize * numParallelSequences; + return (estimatedMBSize + maxSamplesInRAM - 1) / maxSamplesInRAM; + } + + // The return value of this method decides how many subminibatch needed for the training or + // eval process. The current process only starts the subminibatch loop when the calculated + // subminibatch number is larger than 1. So here returning 0 or 1 shares the same behavior. + // But the default value should still be 0 which means no subminibatch needed for this case. 
+ return 0; + } + // =================================================================== // SubminibatchHelpers -- helper for sub-minibatch implementation // TODO: Can this just exist inside SGD.cpp? diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 508553dfccf4..5c7881835229 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -335,6 +335,7 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net, // set dropout rate for this epoch ComputationNetwork::SetDropoutRate(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed); + net->SetBatchNormalizationNodesBelowEvalMode(false, criterionNodes[0]); // learning rate adjustment if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || i < m_learningRatesParam.size()) @@ -437,6 +438,8 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net, timer.Stop(); double epochTime = timer.ElapsedSeconds(); + net->SetBatchNormalizationNodesBelowEvalMode(true, criterionNodes[0]); + if (m_useEvalCriterionControlLR && epochEvalErrors.size() > 0) { lrControlCriterion = epochEvalErrors[0]; @@ -484,40 +487,37 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net, } } - if ((g_mpi == nullptr) || g_mpi->IsMainNode()) + if (validationSetDataReader != trainSetDataReader && validationSetDataReader != nullptr) { - if (validationSetDataReader != trainSetDataReader && validationSetDataReader != nullptr) + SimpleEvaluator evalforvalidation(net, g_mpi != nullptr); + vector cvSetTrainAndEvalNodes; + if (criterionNodes.size() > 0) { - SimpleEvaluator evalforvalidation(net); - vector cvSetTrainAndEvalNodes; - if (criterionNodes.size() > 0) - { - cvSetTrainAndEvalNodes.push_back(criterionNodes[0]->NodeName()); - } - if (evaluationNodes.size() > 0) - { - cvSetTrainAndEvalNodes.push_back(evaluationNodes[0]->NodeName()); - } + cvSetTrainAndEvalNodes.push_back(criterionNodes[0]->NodeName()); + } + if (evaluationNodes.size() > 0) + { + 
cvSetTrainAndEvalNodes.push_back(evaluationNodes[0]->NodeName()); + } // BUGBUG: We should not use the training MB size. The training MB size is constrained by both convergence and memory. Eval is only constrained by memory. - vector vScore = evalforvalidation.Evaluate(validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]); - fprintf(stderr, "Finished Epoch[%2d of %d]: [Validation Set] TrainLossPerSample = %.8g", i + 1, (int) m_maxEpochs, vScore[0]); - if (vScore.size() > 1) + vector vScore = evalforvalidation.Evaluate(validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]); + fprintf(stderr, "Finished Epoch[%2d of %d]: [Validation Set] TrainLossPerSample = %.8g", i + 1, (int) m_maxEpochs, vScore[0]); + if (vScore.size() > 1) + { + fprintf(stderr, "; EvalErrPerSample = %.8g", vScore[1]); + } + fprintf(stderr, "\n"); + + if (m_useCVSetControlLRIfCVExists) + { + if (m_useEvalCriterionControlLR && vScore.size() > 1) { - fprintf(stderr, "; EvalErrPerSample = %.8g", vScore[1]); + lrControlCriterion = vScore[1]; } - fprintf(stderr, "\n"); - - if (m_useCVSetControlLRIfCVExists) + else { - if (m_useEvalCriterionControlLR && vScore.size() > 1) - { - lrControlCriterion = vScore[1]; - } - else - { - lrControlCriterion = vScore[0]; // the first one is the training criterion - } + lrControlCriterion = vScore[0]; // the first one is the training criterion } } } @@ -781,22 +781,7 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, // prepare for sub-minibatching // Sub-minibatching is used if a single minibatch is too large to fit into GPU RAM. DataReaderHelpers::SubminibatchDispatcher smbDispatcher; - size_t numSubminibatchesNeeded = 0; - if (m_maxSamplesInRAM < SIZE_MAX || m_numSubminiBatches > 1) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled - { - if (m_maxSamplesInRAM < SIZE_MAX) - { - // into how many pieces would we need to break the minibatch? 
- // TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed. - size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences(); - size_t estimatedMBSize = tunedMBSize * numParallelSequences; - numSubminibatchesNeeded = (size_t) std::ceil((float) estimatedMBSize / m_maxSamplesInRAM); - } - if (m_numSubminiBatches > 1) - { - numSubminibatchesNeeded = m_numSubminiBatches; - } - } + size_t numSubminibatchesNeeded = DataReaderHelpers::GetNumSubminibatchesNeeded(trainSetDataReader, m_maxSamplesInRAM, m_numSubminiBatches, tunedMBSize); // this is non-trivial, we need a manager object to handle this if (numSubminibatchesNeeded > 1) smbDispatcher.Init(net, learnableNodes, criterionNodes, evaluationNodes); diff --git a/Source/SGDLib/SimpleDistGradAggregator.h b/Source/SGDLib/SimpleDistGradAggregator.h index 1ee7dcdce48a..fedd2be6a522 100644 --- a/Source/SGDLib/SimpleDistGradAggregator.h +++ b/Source/SGDLib/SimpleDistGradAggregator.h @@ -5,6 +5,7 @@ #include #include "GPUDataTransferer.h" #include "TimerUtility.h" +#include "MatrixQuantizerImpl.h" namespace Microsoft { namespace MSR { namespace CNTK { diff --git a/Source/SGDLib/SimpleEvaluator.h b/Source/SGDLib/SimpleEvaluator.h index 9cd7ae054df7..be31c0fae794 100644 --- a/Source/SGDLib/SimpleEvaluator.h +++ b/Source/SGDLib/SimpleEvaluator.h @@ -11,6 +11,9 @@ #include "DataReaderHelpers.h" #include "TrainingNodes.h" // TODO: we should move the functions that depend on these to the .cpp #include "ProgressTracing.h" +#include "DistGradHeader.h" +#include "IDistGradAggregator.h" +#include "SimpleDistGradAggregator.h" #include #include @@ -20,13 +23,24 @@ using namespace std; namespace Microsoft { namespace MSR { namespace CNTK { +template +class IDistGradAggregator; + // TODO: get rid of dependency on ElemType template class SimpleEvaluator { public: - SimpleEvaluator(ComputationNetworkPtr net, const size_t 
numMBsToShowResult = 100, const int traceLevel = 0) - : m_net(net), m_numMBsToShowResult(numMBsToShowResult), m_traceLevel(traceLevel) + SimpleEvaluator(ComputationNetworkPtr net, const bool parallelRun, const size_t numMBsToShowResult = 100, const int traceLevel = 0, const size_t maxSamplesInRAM = SIZE_MAX, + const size_t numSubminiBatches = 1) + : m_net(net), + m_numMBsToShowResult(numMBsToShowResult), + m_traceLevel(traceLevel), + m_maxSamplesInRAM(maxSamplesInRAM), + m_numSubminiBatches(numSubminiBatches), + m_parallelRun(parallelRun), + m_distGradAgg(nullptr), + m_gradHeader(nullptr) { } @@ -93,32 +107,92 @@ class SimpleEvaluator for (int i = 0; i < evalResults.size(); i++) evalResultsLastMBs.push_back((ElemType) 0); + //TODO: we should add support for distributed reading dataReader->StartMinibatchLoop(mbSize, 0, testSize); m_net->StartEvaluateMinibatchLoop(evalNodes); + std::vector*> learnParamsGradients; + DataReaderHelpers::SubminibatchDispatcher smbDispatcher; + size_t numSubminibatchesNeeded = DataReaderHelpers::GetNumSubminibatchesNeeded(dataReader, m_maxSamplesInRAM, m_numSubminiBatches, mbSize); + + // Passing in two empty node lists so the dispatcher can work for the evalNodes. + std::list learnableNodes; + std::vector criterionNodes; + if (numSubminibatchesNeeded > 1) + smbDispatcher.Init(m_net, learnableNodes, criterionNodes, evalNodes); + const size_t numIterationsBeforePrintingProgress = 100; size_t numItersSinceLastPrintOfProgress = 0; - while (DataReaderHelpers::GetMinibatchIntoNetwork(*dataReader, m_net, nullptr, false, false, inputMatrices, actualMBSize)) + while (DataReaderHelpers::GetMinibatchIntoNetwork(*dataReader, m_net, nullptr, false, m_parallelRun, inputMatrices, actualMBSize)) { - ComputationNetwork::BumpEvalTimeStamp(featureNodes); - ComputationNetwork::BumpEvalTimeStamp(labelNodes); + size_t actualNumSubminibatches = numSubminibatchesNeeded <= 1 ? 
1 : smbDispatcher.GetMinibatchIntoCache(*dataReader, *m_net, inputMatrices, numSubminibatchesNeeded); + for (size_t ismb = 0; ismb < actualNumSubminibatches; ismb++) + { + if (actualNumSubminibatches > 1) + { + smbDispatcher.GetSubMinibatchToNet(ismb); // get sub-minibatch from full-size one + } + + ComputationNetwork::BumpEvalTimeStamp(featureNodes); + ComputationNetwork::BumpEvalTimeStamp(labelNodes); + + m_net->ForwardProp(evalNodes); + + // house-keeping for sub-minibatching + if (actualNumSubminibatches > 1) + smbDispatcher.DoneWithCurrentSubMinibatch(ismb); // page state out + } // end sub-minibatch loop + + if (actualNumSubminibatches > 1) + smbDispatcher.DoneWithCurrentMinibatch(); - // for now since we share the same label masking flag we call this on one node only - // Later, when we apply different labels on different nodes - // we need to add code to call this function multiple times, one for each criteria node size_t numSamplesWithLabel = m_net->GetNumSamplesWithLabel(actualMBSize); - for (int i = 0; i < evalNodes.size(); i++) + size_t aggregateNumSamplesWithLabel = numSamplesWithLabel; + if (m_parallelRun) { - m_net->ForwardProp(evalNodes[i]); - evalResults[i] += (double) evalNodes[i]->Get00Element(); // criterionNode should be a scalar + if (m_gradHeader == nullptr) + { + m_gradHeader = DistGradHeader::Create(evalNodes.size()); + m_distGradAgg = make_shared>(g_mpi, false, m_traceLevel); + } + + m_gradHeader->numEvalNode = evalNodes.size(); + m_gradHeader->numSamples = actualMBSize; + m_gradHeader->numSamplesWithLabel = numSamplesWithLabel; + m_gradHeader->criterion = 0.0; + for (size_t i = 0; i < evalNodes.size(); i++) + m_gradHeader->evalErrors[i] = evalNodes[i]->Get00Element(); + + // TODO: We are reusing the aggregation logic inside SimpleDistGradAggregator, which has a heavy dependency + // on the gradient matrix. At some point we should refacotr the aggregator class to be able to only calculating + // eval results and then remove this hack. 
+ if (learnParamsGradients.size() == 0) + { + Matrix* matrix = new Matrix((DEVICEID_TYPE)m_net->GetDeviceId()); + learnParamsGradients.push_back(matrix); + } + + // Using SimpleDistAggregator for eval results only. At some point we should rename the class to be just + // IDistAggregator and SimpleDistAggregator. + m_distGradAgg->AggregateGradients(learnParamsGradients, m_gradHeader, 0); + aggregateNumSamplesWithLabel = m_gradHeader->numSamplesWithLabel; + for (size_t i = 0; i < evalResults.size(); i++) + evalResults[i] += m_gradHeader->evalErrors[i]; + } + else + { + for (int i = 0; i < evalNodes.size(); i++) + { + evalResults[i] += (double)evalNodes[i]->Get00Element(); // criterionNode should be a scalar + } } - totalEpochSamples += numSamplesWithLabel; + totalEpochSamples += aggregateNumSamplesWithLabel; numMBsRun++; if (m_traceLevel > 0) { - numSamplesLastMBs += numSamplesWithLabel; + numSamplesLastMBs += aggregateNumSamplesWithLabel; if (numMBsRun % m_numMBsToShowResult == 0) { @@ -211,6 +285,12 @@ class SimpleEvaluator protected: ComputationNetworkPtr m_net; size_t m_numMBsToShowResult; + size_t m_maxSamplesInRAM; + size_t m_numSubminiBatches; + bool m_parallelRun; + + shared_ptr> m_distGradAgg; + struct DistGradHeader* m_gradHeader; int m_traceLevel; void operator=(const SimpleEvaluator&); // (not assignable) }; diff --git a/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/baseline.cpu.txt b/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/baseline.cpu.txt new file mode 100644 index 000000000000..c3fc02a9c241 --- /dev/null +++ b/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/baseline.cpu.txt @@ -0,0 +1,1305 @@ +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 17:23:46 + Last modified date: Thu Mar 3 05:46:23 2016 + Build type: release + Build target: GPU + With 1bit-SGD: no + Math lib: acml + CUDA_PATH: /usr/local/cuda-7.0 + CUB_PATH: /usr/local/cub-1.4.1 + CUDNN_PATH: 
/usr/local/cudnn-4.0 + Build Branch: HEAD + Build SHA1: dafcfee4846f7c5a7d3b29ace536b8734ff409d1 + Built by philly on Source/CNTK/buildinfo.h0 + Build Path: Source/CNTK/buildinfo.h1 +------------------------------------------------------------------- +Changed current directory to '/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data' +MPIWrapper: initializing MPI +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 17:23:46 + Last modified date: Thu Mar 3 05:46:23 2016 + Build type: release + Build target: GPU + With 1bit-SGD: no + Math lib: acml + CUDA_PATH: /usr/local/cuda-7.0 + CUB_PATH: /usr/local/cub-1.4.1 + CUDNN_PATH: /usr/local/cudnn-4.0 + Build Branch: HEAD + Build SHA1: dafcfee4846f7c5a7d3b29ace536b8734ff409d1 + Built by philly on Source/CNTK/buildinfo.h0 + Build Path: Source/CNTK/buildinfo.h1 +------------------------------------------------------------------- +Changed current directory to '/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data' +MPIWrapper: initializing MPI +-------------------------------------------------------------------------- +[[41784,1],0]: A high-performance Open MPI point-to-point messaging module +was unable to find any relevant network interfaces: + +Module: OpenFabrics (openib) + Host: aa5a66a48ad8 + +Another transport will be used instead, although this may result in +lower performance. 
+-------------------------------------------------------------------------- +ping [requestnodes (before change)]: 2 nodes pinging each other +ping [requestnodes (before change)]: 2 nodes pinging each other +ping [requestnodes (before change)]: all 2 nodes responded +requestnodes [MPIWrapper]: using 2 out of 2 MPI nodes (2 requested); we (0) are in (participating) +ping [requestnodes (after change)]: 2 nodes pinging each other +ping [requestnodes (before change)]: all 2 nodes responded +requestnodes [MPIWrapper]: using 2 out of 2 MPI nodes (2 requested); we (1) are in (participating) +ping [requestnodes (after change)]: 2 nodes pinging each other +ping [requestnodes (after change)]: all 2 nodes responded +mpihelper: we are cog 1 in a gearbox of 2 +ping [mpihelper]: 2 nodes pinging each other +ping [requestnodes (after change)]: all 2 nodes responded +mpihelper: we are cog 0 in a gearbox of 2 +ping [mpihelper]: 2 nodes pinging each other +ping [mpihelper]: all 2 nodes responded +ping [mpihelper]: all 2 nodes responded +Redirecting stderr to file /tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/stderr_speechTrain.logrank0 +Redirecting stderr to file /tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/stderr_speechTrain.logrank1 +[aa5a66a48ad8:24041] 1 more process has sent help message help-mpi-btl-base.txt / btl:no-nics +[aa5a66a48ad8:24041] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: Build info: +MPI Rank 0: +MPI Rank 0: Built time: Mar 3 2016 17:23:46 +MPI Rank 0: Last modified date: Thu Mar 3 05:46:23 2016 +MPI Rank 0: Build type: release +MPI Rank 0: Build target: GPU +MPI Rank 0: With 1bit-SGD: no +MPI Rank 0: Math lib: acml +MPI Rank 0: CUDA_PATH: /usr/local/cuda-7.0 +MPI Rank 0: CUB_PATH: /usr/local/cub-1.4.1 +MPI Rank 0: CUDNN_PATH: /usr/local/cudnn-4.0 +MPI Rank 
0: Build Branch: HEAD +MPI Rank 0: Build SHA1: dafcfee4846f7c5a7d3b29ace536b8734ff409d1 +MPI Rank 0: Built by philly on Source/CNTK/buildinfo.h0 +MPI Rank 0: Build Path: Source/CNTK/buildinfo.h1 +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: running on localhost at 2016/03/03 17:28:40 +MPI Rank 0: command line: +MPI Rank 0: /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/build/gpu/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/cntkcv.cntk currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data RunDir=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation OutputDir=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu DeviceId=-1 numCPUThreads=2 stderr=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/stderr +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: precision = "double" +MPI Rank 0: command = speechTrain +MPI Rank 0: deviceId = $DeviceId$ +MPI Rank 0: parallelTrain = true +MPI Rank 0: speechTrain = [ +MPI Rank 0: action = "train" +MPI Rank 0: modelPath = "$RunDir$/models/cntkSpeech.dnn" +MPI Rank 0: deviceId = $DeviceId$ +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SimpleNetworkBuilder = [ +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 0: evalCriterion = "ErrorPrediction" +MPI Rank 0: layerTypes = "Sigmoid" +MPI Rank 0: initValueScale = 1.0 +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: uniformInit = true +MPI Rank 0: needPrior = true +MPI Rank 0: ] +MPI Rank 0: 
ExperimentalNetworkBuilder = [ // the same as above but with BS. Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = 'CE' +MPI Rank 0: evalCriterion = 'Err' +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: L = Length(layerSizes)-1 // number of model layers +MPI Rank 0: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 0: featNorm = if applyMeanVarNorm +MPI Rank 0: then MeanVarNorm(features) +MPI Rank 0: else features +MPI Rank 0: layers[layer:1..L-1] = if layer > 1 +MPI Rank 0: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 0: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 0: CE = if trainingCriterion == 'CE' +MPI Rank 0: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 0: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 0: Err = if evalCriterion == 'Err' then +MPI Rank 0: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 0: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 0: logPrior = LogPrior(labels) +MPI Rank 0: // TODO: how to add a tag to an infix operation? 
+MPI Rank 0: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 0: ] +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize = 20480 +MPI Rank 0: minibatchSize = 64:256:1024 +MPI Rank 0: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 0: numMBsToShowResult = 10 +MPI Rank 0: momentumPerMB = 0.9:0.656119 +MPI Rank 0: dropoutRate = 0.0 +MPI Rank 0: maxEpochs = 3 +MPI Rank 0: keepCheckPointFiles = true +MPI Rank 0: clippingThresholdPerSample = 1#INF +MPI Rank 0: ParallelTrain = [ +MPI Rank 0: parallelizationMethod = "DataParallelSGD" +MPI Rank 0: distributedMBReading = true +MPI Rank 0: DataParallelSGD = [ +MPI Rank 0: gradientBits = 64 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: AutoAdjust = [ +MPI Rank 0: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 0: loadBestModel = true +MPI Rank 0: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 0: learnRateDecreaseFactor = 0.5 +MPI Rank 0: learnRateIncreaseFactor = 1.382 +MPI Rank 0: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "$DataDir$/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "$DataDir$/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: cvreader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.cv.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "$DataDir$/glob_0000.mlf" +MPI Rank 0: 
labelMappingFile = "$DataDir$/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 0: RunDir=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu +MPI Rank 0: DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 0: ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation +MPI Rank 0: OutputDir=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu +MPI Rank 0: DeviceId=-1 +MPI Rank 0: numCPUThreads=2 +MPI Rank 0: stderr=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: precision = "double" +MPI Rank 0: command = speechTrain +MPI Rank 0: deviceId = -1 +MPI Rank 0: parallelTrain = true +MPI Rank 0: speechTrain = [ +MPI Rank 0: action = "train" +MPI Rank 0: modelPath = "/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/models/cntkSpeech.dnn" +MPI Rank 0: deviceId = -1 +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SimpleNetworkBuilder = [ +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 0: evalCriterion = "ErrorPrediction" +MPI Rank 0: layerTypes = "Sigmoid" +MPI Rank 0: initValueScale = 1.0 +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: uniformInit = true +MPI Rank 0: needPrior = true +MPI Rank 0: ] +MPI Rank 0: ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = 'CE' +MPI Rank 0: evalCriterion = 'Err' +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: L = Length(layerSizes)-1 // number of model layers +MPI Rank 0: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 0: featNorm = if applyMeanVarNorm +MPI Rank 0: then MeanVarNorm(features) +MPI Rank 0: else features +MPI Rank 0: layers[layer:1..L-1] = if layer > 1 +MPI Rank 0: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 0: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 0: CE = if trainingCriterion == 'CE' +MPI Rank 0: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 0: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 0: Err = if evalCriterion == 'Err' then +MPI Rank 0: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 0: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 0: logPrior = LogPrior(labels) +MPI Rank 0: // TODO: how to add a tag to an infix operation? 
+MPI Rank 0: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 0: ] +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize = 20480 +MPI Rank 0: minibatchSize = 64:256:1024 +MPI Rank 0: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 0: numMBsToShowResult = 10 +MPI Rank 0: momentumPerMB = 0.9:0.656119 +MPI Rank 0: dropoutRate = 0.0 +MPI Rank 0: maxEpochs = 3 +MPI Rank 0: keepCheckPointFiles = true +MPI Rank 0: clippingThresholdPerSample = 1#INF +MPI Rank 0: ParallelTrain = [ +MPI Rank 0: parallelizationMethod = "DataParallelSGD" +MPI Rank 0: distributedMBReading = true +MPI Rank 0: DataParallelSGD = [ +MPI Rank 0: gradientBits = 64 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: AutoAdjust = [ +MPI Rank 0: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 0: loadBestModel = true +MPI Rank 0: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 0: learnRateDecreaseFactor = 0.5 +MPI Rank 0: learnRateIncreaseFactor = 1.382 +MPI Rank 0: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: cvreader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" 
+MPI Rank 0: scpFile = "glob_0000.cv.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 0: RunDir=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu +MPI Rank 0: DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 0: ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation +MPI Rank 0: OutputDir=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu +MPI Rank 0: DeviceId=-1 +MPI Rank 0: numCPUThreads=2 +MPI Rank 0: stderr=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: configparameters: cntkcv.cntk:command=speechTrain +MPI Rank 0: configparameters: cntkcv.cntk:ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation +MPI Rank 0: configparameters: cntkcv.cntk:currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 0: configparameters: cntkcv.cntk:DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 0: configparameters: cntkcv.cntk:deviceId=-1 +MPI Rank 0: configparameters: cntkcv.cntk:numCPUThreads=2 +MPI Rank 0: 
configparameters: cntkcv.cntk:OutputDir=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu +MPI Rank 0: configparameters: cntkcv.cntk:parallelTrain=true +MPI Rank 0: configparameters: cntkcv.cntk:precision=double +MPI Rank 0: configparameters: cntkcv.cntk:RunDir=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu +MPI Rank 0: configparameters: cntkcv.cntk:speechTrain=[ +MPI Rank 0: action = "train" +MPI Rank 0: modelPath = "/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/models/cntkSpeech.dnn" +MPI Rank 0: deviceId = -1 +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SimpleNetworkBuilder = [ +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 0: evalCriterion = "ErrorPrediction" +MPI Rank 0: layerTypes = "Sigmoid" +MPI Rank 0: initValueScale = 1.0 +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: uniformInit = true +MPI Rank 0: needPrior = true +MPI Rank 0: ] +MPI Rank 0: ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = 'CE' +MPI Rank 0: evalCriterion = 'Err' +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: L = Length(layerSizes)-1 // number of model layers +MPI Rank 0: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 0: featNorm = if applyMeanVarNorm +MPI Rank 0: then MeanVarNorm(features) +MPI Rank 0: else features +MPI Rank 0: layers[layer:1..L-1] = if layer > 1 +MPI Rank 0: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 0: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 0: CE = if trainingCriterion == 'CE' +MPI Rank 0: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 0: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 0: Err = if evalCriterion == 'Err' then +MPI Rank 0: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 0: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 0: logPrior = LogPrior(labels) +MPI Rank 0: // TODO: how to add a tag to an infix operation? 
+MPI Rank 0: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 0: ] +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize = 20480 +MPI Rank 0: minibatchSize = 64:256:1024 +MPI Rank 0: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 0: numMBsToShowResult = 10 +MPI Rank 0: momentumPerMB = 0.9:0.656119 +MPI Rank 0: dropoutRate = 0.0 +MPI Rank 0: maxEpochs = 3 +MPI Rank 0: keepCheckPointFiles = true +MPI Rank 0: clippingThresholdPerSample = 1#INF +MPI Rank 0: ParallelTrain = [ +MPI Rank 0: parallelizationMethod = "DataParallelSGD" +MPI Rank 0: distributedMBReading = true +MPI Rank 0: DataParallelSGD = [ +MPI Rank 0: gradientBits = 64 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: AutoAdjust = [ +MPI Rank 0: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 0: loadBestModel = true +MPI Rank 0: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 0: learnRateDecreaseFactor = 0.5 +MPI Rank 0: learnRateIncreaseFactor = 1.382 +MPI Rank 0: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: cvreader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" 
+MPI Rank 0: scpFile = "glob_0000.cv.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: cntkcv.cntk:stderr=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/stderr +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: Commands: speechTrain +MPI Rank 0: Precision = "double" +MPI Rank 0: Using 2 CPU threads. +MPI Rank 0: CNTKModelPath: /tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/models/cntkSpeech.dnn +MPI Rank 0: CNTKCommandTrainInfo: speechTrain : 3 +MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 0: +MPI Rank 0: ############################################################################## +MPI Rank 0: # # +MPI Rank 0: # Action "train" # +MPI Rank 0: # # +MPI Rank 0: ############################################################################## +MPI Rank 0: +MPI Rank 0: CNTKCommandTrainBegin: speechTrain +MPI Rank 0: SimpleNetworkBuilder Using CPU +MPI Rank 0: reading script file glob_0000.scp ... 948 entries +MPI Rank 0: total 132 state names in state list /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list +MPI Rank 0: htkmlfreader: reading MLF file /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf ... 
total 948 entries +MPI Rank 0: ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances +MPI Rank 0: label set 0: 129 classes +MPI Rank 0: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames +MPI Rank 0: reading script file glob_0000.cv.scp ... 300 entries +MPI Rank 0: total 132 state names in state list /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list +MPI Rank 0: htkmlfreader: reading MLF file /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf ... total 948 entries +MPI Rank 0: ...........................................................................feature set 0: 83050 frames in 300 out of 300 utterances +MPI Rank 0: label set 0: 129 classes +MPI Rank 0: minibatchutterancesource: 300 utterances grouped into 1 chunks, av. chunk size: 300.0 utterances, 83050.0 frames +MPI Rank 0: +MPI Rank 0: Post-processing network... 
+MPI Rank 0: +MPI Rank 0: 7 roots: +MPI Rank 0: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax +MPI Rank 0: EvalErrorPrediction = ErrorPrediction +MPI Rank 0: InvStdOfFeatures = InvStdDev +MPI Rank 0: MeanOfFeatures = Mean +MPI Rank 0: PosteriorProb = Softmax +MPI Rank 0: Prior = Mean +MPI Rank 0: ScaledLogLikelihood = Minus +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for CrossEntropyWithSoftmax CrossEntropyWithSoftmax operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for EvalErrorPrediction ErrorPrediction operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for InvStdOfFeatures InvStdDev operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for MeanOfFeatures Mean operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for PosteriorProb Softmax operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for Prior Mean operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for ScaledLogLikelihood Minus operation +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating network. 25 nodes to process in pass 1. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132 x *] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132 x 512] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512 x 512] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512 x 363] +MPI Rank 0: Validating --> features = InputValue -> [363 x *] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363 x *]) -> [363] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363 x *]) -> [363] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 x *], MeanOfFeatures[363], InvStdOfFeatures[363]) -> [363 x *] +MPI Rank 0: Validating --> W0*features = Times(W0[512 x 363], MVNormalizedFeatures[363 x *]) -> [512 x *] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512 x 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512 x *], B0[512 x 1]) -> [512 x 1 x *] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512 x 512], H1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512 x 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 x *], B1[512 x 1]) -> [512 x 1 x *] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132 x 512], H2[512 x 1 x *]) -> [132 x 1 x *] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132 x 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132 x 1 x *], B2[132 x 1]) -> [132 x 1 x *] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 0: Validating --> PosteriorProb = Softmax(HLast[132 x 1 x *]) -> [132 x 1 x *] +MPI Rank 0: Validating --> Prior = Mean(labels[132 x *]) -> [132] +MPI Rank 0: 
Validating --> LogOfPrior = Log(Prior[132]) -> [132] +MPI Rank 0: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 x *], LogOfPrior[132]) -> [132 x 1 x *] +MPI Rank 0: +MPI Rank 0: Validating network. 17 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132 x *] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132 x 512] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512 x 512] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512 x 363] +MPI Rank 0: Validating --> features = InputValue -> [363 x *] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363 x *]) -> [363] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363 x *]) -> [363] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 x *], MeanOfFeatures[363], InvStdOfFeatures[363]) -> [363 x *] +MPI Rank 0: Validating --> W0*features = Times(W0[512 x 363], MVNormalizedFeatures[363 x *]) -> [512 x *] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512 x 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512 x *], B0[512 x 1]) -> [512 x 1 x *] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512 x 512], H1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512 x 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 x *], B1[512 x 1]) -> [512 x 1 x *] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132 x 512], H2[512 x 1 x *]) -> [132 x 1 x *] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132 x 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132 x 1 x *], B2[132 x 1]) -> [132 x 1 x *] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 0: Validating --> 
EvalErrorPrediction = ErrorPrediction(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 0: Validating --> PosteriorProb = Softmax(HLast[132 x 1 x *]) -> [132 x 1 x *] +MPI Rank 0: Validating --> Prior = Mean(labels[132 x *]) -> [132] +MPI Rank 0: Validating --> LogOfPrior = Log(Prior[132]) -> [132] +MPI Rank 0: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 x *], LogOfPrior[132]) -> [132 x 1 x *] +MPI Rank 0: +MPI Rank 0: Validating network, final pass. +MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132 x *] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132 x 512] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512 x 512] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512 x 363] +MPI Rank 0: Validating --> features = InputValue -> [363 x *] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363 x *]) -> [363] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363 x *]) -> [363] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 x *], MeanOfFeatures[363], InvStdOfFeatures[363]) -> [363 x *] +MPI Rank 0: Validating --> W0*features = Times(W0[512 x 363], MVNormalizedFeatures[363 x *]) -> [512 x *] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512 x 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512 x *], B0[512 x 1]) -> [512 x 1 x *] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512 x 512], H1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512 x 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 x *], B1[512 x 1]) -> [512 x 1 x *] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132 x 512], H2[512 x 1 x *]) -> [132 x 1 x *] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132 x 1] +MPI Rank 0: Validating 
--> HLast = Plus(W2*H1[132 x 1 x *], B2[132 x 1]) -> [132 x 1 x *] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 0: Validating --> PosteriorProb = Softmax(HLast[132 x 1 x *]) -> [132 x 1 x *] +MPI Rank 0: Validating --> Prior = Mean(labels[132 x *]) -> [132] +MPI Rank 0: Validating --> LogOfPrior = Log(Prior[132]) -> [132] +MPI Rank 0: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 x *], LogOfPrior[132]) -> [132 x 1 x *] +MPI Rank 0: +MPI Rank 0: 12 out of 25 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: Post-processing network complete. +MPI Rank 0: +MPI Rank 0: SGD using CPU. +MPI Rank 0: +MPI Rank 0: Training criterion node(s): +MPI Rank 0: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax +MPI Rank 0: +MPI Rank 0: Evaluation criterion node(s): +MPI Rank 0: EvalErrorPrediction = ErrorPrediction +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: +MPI Rank 0: Precomputing --> 3 PreCompute nodes found. +MPI Rank 0: +MPI Rank 0: NodeName: MeanOfFeatures +MPI Rank 0: NodeName: InvStdOfFeatures +MPI Rank 0: NodeName: Prior +MPI Rank 0: minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 0: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms +MPI Rank 0: +MPI Rank 0: Precomputing --> Completed. 
+MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 momentum as time constant = 607.4 samples +MPI Rank 0: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 2, with 1 datapasses +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 1- 10, 3.12%]: SamplesSeen = 640; TrainLossPerSample = 4.36628272; EvalErr[0]PerSample = 0.90937500; TotalTime = 0.6558s; SamplesPerSecond = 975.9 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 11- 20, 6.25%]: SamplesSeen = 640; TrainLossPerSample = 4.15914991; EvalErr[0]PerSample = 0.89218750; TotalTime = 0.7034s; SamplesPerSecond = 909.9 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 21- 30, 9.38%]: SamplesSeen = 640; TrainLossPerSample = 3.99837967; EvalErr[0]PerSample = 0.86875000; TotalTime = 0.5056s; SamplesPerSecond = 1265.8 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 31- 40, 12.50%]: SamplesSeen = 640; TrainLossPerSample = 3.86616341; EvalErr[0]PerSample = 0.86250000; TotalTime = 0.6970s; SamplesPerSecond = 918.3 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 41- 50, 15.62%]: SamplesSeen = 640; TrainLossPerSample = 3.80082643; EvalErr[0]PerSample = 0.87968750; TotalTime = 0.7546s; SamplesPerSecond = 848.1 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 51- 60, 18.75%]: SamplesSeen = 640; TrainLossPerSample = 3.73336112; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.5956s; SamplesPerSecond = 1074.5 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 61- 70, 21.88%]: SamplesSeen = 640; TrainLossPerSample = 3.57119384; EvalErr[0]PerSample = 0.82031250; TotalTime = 0.7604s; SamplesPerSecond = 841.6 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 71- 80, 25.00%]: SamplesSeen = 640; TrainLossPerSample = 3.44001005; EvalErr[0]PerSample = 0.81562500; TotalTime = 0.7107s; SamplesPerSecond = 900.5 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 81- 90, 
28.12%]: SamplesSeen = 640; TrainLossPerSample = 3.36131109; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.5269s; SamplesPerSecond = 1214.6 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 91- 100, 31.25%]: SamplesSeen = 640; TrainLossPerSample = 3.39817487; EvalErr[0]PerSample = 0.85000000; TotalTime = 0.5401s; SamplesPerSecond = 1185.1 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 101- 110, 34.38%]: SamplesSeen = 640; TrainLossPerSample = 3.25116276; EvalErr[0]PerSample = 0.77031250; TotalTime = 0.7174s; SamplesPerSecond = 892.1 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 111- 120, 37.50%]: SamplesSeen = 640; TrainLossPerSample = 3.35774005; EvalErr[0]PerSample = 0.79843750; TotalTime = 0.7503s; SamplesPerSecond = 852.9 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 121- 130, 40.62%]: SamplesSeen = 640; TrainLossPerSample = 3.19791351; EvalErr[0]PerSample = 0.76406250; TotalTime = 0.5165s; SamplesPerSecond = 1239.0 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 131- 140, 43.75%]: SamplesSeen = 640; TrainLossPerSample = 3.06449990; EvalErr[0]PerSample = 0.71718750; TotalTime = 0.7309s; SamplesPerSecond = 875.6 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 141- 150, 46.88%]: SamplesSeen = 640; TrainLossPerSample = 3.05357361; EvalErr[0]PerSample = 0.74218750; TotalTime = 0.7179s; SamplesPerSecond = 891.5 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 151- 160, 50.00%]: SamplesSeen = 640; TrainLossPerSample = 3.02144079; EvalErr[0]PerSample = 0.74531250; TotalTime = 0.6208s; SamplesPerSecond = 1030.9 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 161- 170, 53.12%]: SamplesSeen = 640; TrainLossPerSample = 2.89890004; EvalErr[0]PerSample = 0.69687500; TotalTime = 0.7758s; SamplesPerSecond = 824.9 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 171- 180, 56.25%]: SamplesSeen = 640; TrainLossPerSample = 2.74598358; EvalErr[0]PerSample = 0.68593750; TotalTime = 0.7144s; SamplesPerSecond = 895.9 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 181- 190, 59.38%]: SamplesSeen = 640; TrainLossPerSample = 2.83604141; EvalErr[0]PerSample = 0.70625000; 
TotalTime = 0.5762s; SamplesPerSecond = 1110.6 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 191- 200, 62.50%]: SamplesSeen = 640; TrainLossPerSample = 2.62522562; EvalErr[0]PerSample = 0.64687500; TotalTime = 0.6546s; SamplesPerSecond = 977.8 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 201- 210, 65.62%]: SamplesSeen = 640; TrainLossPerSample = 2.65507979; EvalErr[0]PerSample = 0.66562500; TotalTime = 0.6420s; SamplesPerSecond = 997.0 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 211- 220, 68.75%]: SamplesSeen = 640; TrainLossPerSample = 2.59593989; EvalErr[0]PerSample = 0.65937500; TotalTime = 0.6957s; SamplesPerSecond = 919.9 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 221- 230, 71.88%]: SamplesSeen = 640; TrainLossPerSample = 2.51177605; EvalErr[0]PerSample = 0.62343750; TotalTime = 0.5174s; SamplesPerSecond = 1237.0 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 231- 240, 75.00%]: SamplesSeen = 640; TrainLossPerSample = 2.42438840; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.7410s; SamplesPerSecond = 863.7 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 241- 250, 78.12%]: SamplesSeen = 640; TrainLossPerSample = 2.40372959; EvalErr[0]PerSample = 0.65156250; TotalTime = 0.6776s; SamplesPerSecond = 944.5 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 251- 260, 81.25%]: SamplesSeen = 640; TrainLossPerSample = 2.48277420; EvalErr[0]PerSample = 0.63906250; TotalTime = 0.5688s; SamplesPerSecond = 1125.2 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 261- 270, 84.38%]: SamplesSeen = 640; TrainLossPerSample = 2.34181483; EvalErr[0]PerSample = 0.61718750; TotalTime = 0.7335s; SamplesPerSecond = 872.5 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 271- 280, 87.50%]: SamplesSeen = 640; TrainLossPerSample = 2.22951559; EvalErr[0]PerSample = 0.57656250; TotalTime = 0.6841s; SamplesPerSecond = 935.5 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 281- 290, 90.62%]: SamplesSeen = 640; TrainLossPerSample = 2.32715885; EvalErr[0]PerSample = 0.62031250; TotalTime = 0.5804s; SamplesPerSecond = 1102.7 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 291- 300, 
93.75%]: SamplesSeen = 640; TrainLossPerSample = 2.21143816; EvalErr[0]PerSample = 0.61406250; TotalTime = 0.5394s; SamplesPerSecond = 1186.6 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 301- 310, 96.88%]: SamplesSeen = 640; TrainLossPerSample = 2.29118500; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.7031s; SamplesPerSecond = 910.2 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 311- 320, 100.00%]: SamplesSeen = 640; TrainLossPerSample = 2.19155470; EvalErr[0]PerSample = 0.56406250; TotalTime = 0.7620s; SamplesPerSecond = 839.9 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0129278; TotalSamplesSeen = 20480; EvalErrPerSample = 0.7277832; AvgLearningRatePerSample = 0.015625; EpochTime=21.0933 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 0: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms +MPI Rank 0: Final Results: Minibatch[1-1298]: SamplesSeen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 2.1824241 Perplexity = 8.8677763 EvalErrorPrediction: ErrorPrediction/Sample = 0.58616496 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Validation Set] TrainLossPerSample = 2.1824241; EvalErrPerSample = 0.58616496 +MPI Rank 0: SGD: Saving checkpoint model '/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/models/cntkSpeech.dnn.1' +MPI Rank 0: +MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 momentum as time constant = 607.5 samples +MPI Rank 0: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 0 of 2, with 1 datapasses +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. 
+MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 1- 10, 12.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.05064112; EvalErr[0]PerSample = 0.55039063; TotalTime = 1.5403s; SamplesPerSecond = 1662.1 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 11- 20, 25.00%]: SamplesSeen = 2560; TrainLossPerSample = 2.02000655; EvalErr[0]PerSample = 0.54492188; TotalTime = 1.3251s; SamplesPerSecond = 1931.9 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 21- 30, 37.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.01868507; EvalErr[0]PerSample = 0.55000000; TotalTime = 1.2665s; SamplesPerSecond = 2021.3 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 31- 40, 50.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.96698601; EvalErr[0]PerSample = 0.53867188; TotalTime = 1.4454s; SamplesPerSecond = 1771.1 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 41- 50, 62.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.93942125; EvalErr[0]PerSample = 0.54023438; TotalTime = 1.1160s; SamplesPerSecond = 2293.8 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 51- 60, 75.00%]: SamplesSeen = 2560; TrainLossPerSample = 2.00412188; EvalErr[0]PerSample = 0.54335937; TotalTime = 1.4520s; SamplesPerSecond = 1763.1 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 61- 70, 87.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.93180079; EvalErr[0]PerSample = 0.52343750; TotalTime = 1.3162s; SamplesPerSecond = 1945.0 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 71- 80, 100.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.94186507; EvalErr[0]PerSample = 0.54257813; TotalTime = 1.3149s; SamplesPerSecond = 1947.0 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.984191; TotalSamplesSeen = 40960; EvalErrPerSample = 0.54169922; AvgLearningRatePerSample = 0.001953125; EpochTime=10.7924 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 0: Final Results: Minibatch[1-325]: SamplesSeen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 1.8974794 Perplexity = 6.6690634 EvalErrorPrediction: ErrorPrediction/Sample = 0.52758579 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Validation Set] TrainLossPerSample = 1.8974794; EvalErrPerSample = 0.52758579 +MPI Rank 0: SGD: Saving checkpoint model '/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/models/cntkSpeech.dnn.2' +MPI Rank 0: +MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000098 effective momentum = 0.656119 momentum as time constant = 2429.9 samples +MPI Rank 0: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 2, with 1 datapasses +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 1- 10, 50.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.90809843; EvalErr[0]PerSample = 0.52558594; TotalTime = 4.0322s; SamplesPerSecond = 2539.6 +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 11- 20, 100.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.90322337; EvalErr[0]PerSample = 0.52568359; TotalTime = 3.9336s; SamplesPerSecond = 2603.2 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.9056609; TotalSamplesSeen = 61440; EvalErrPerSample = 0.52563477; AvgLearningRatePerSample = 9.7656251e-05; EpochTime=7.99789 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 0: Final Results: Minibatch[1-82]: SamplesSeen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 1.8771737 Perplexity = 6.5350089 EvalErrorPrediction: ErrorPrediction/Sample = 0.51938591 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Validation Set] TrainLossPerSample = 1.8771737; EvalErrPerSample = 0.51938591 +MPI Rank 0: SGD: Saving checkpoint model '/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/models/cntkSpeech.dnn' +MPI Rank 0: CNTKCommandTrainEnd: speechTrain +MPI Rank 0: +MPI Rank 0: Action "train" complete. +MPI Rank 0: +MPI Rank 0: COMPLETED +MPI Rank 0: ~MPIWrapper +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: Build info: +MPI Rank 1: +MPI Rank 1: Built time: Mar 3 2016 17:23:46 +MPI Rank 1: Last modified date: Thu Mar 3 05:46:23 2016 +MPI Rank 1: Build type: release +MPI Rank 1: Build target: GPU +MPI Rank 1: With 1bit-SGD: no +MPI Rank 1: Math lib: acml +MPI Rank 1: CUDA_PATH: /usr/local/cuda-7.0 +MPI Rank 1: CUB_PATH: /usr/local/cub-1.4.1 +MPI Rank 1: CUDNN_PATH: /usr/local/cudnn-4.0 +MPI Rank 1: Build Branch: HEAD +MPI Rank 1: Build SHA1: dafcfee4846f7c5a7d3b29ace536b8734ff409d1 +MPI Rank 1: Built by philly on Source/CNTK/buildinfo.h0 +MPI Rank 1: Build Path: Source/CNTK/buildinfo.h1 +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: running on localhost at 2016/03/03 17:28:41 +MPI Rank 1: command line: +MPI Rank 1: /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/build/gpu/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/cntkcv.cntk currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data 
RunDir=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation OutputDir=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu DeviceId=-1 numCPUThreads=2 stderr=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/stderr +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: precision = "double" +MPI Rank 1: command = speechTrain +MPI Rank 1: deviceId = $DeviceId$ +MPI Rank 1: parallelTrain = true +MPI Rank 1: speechTrain = [ +MPI Rank 1: action = "train" +MPI Rank 1: modelPath = "$RunDir$/models/cntkSpeech.dnn" +MPI Rank 1: deviceId = $DeviceId$ +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SimpleNetworkBuilder = [ +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 1: evalCriterion = "ErrorPrediction" +MPI Rank 1: layerTypes = "Sigmoid" +MPI Rank 1: initValueScale = 1.0 +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: uniformInit = true +MPI Rank 1: needPrior = true +MPI Rank 1: ] +MPI Rank 1: ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = 'CE' +MPI Rank 1: evalCriterion = 'Err' +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: L = Length(layerSizes)-1 // number of model layers +MPI Rank 1: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 1: featNorm = if applyMeanVarNorm +MPI Rank 1: then MeanVarNorm(features) +MPI Rank 1: else features +MPI Rank 1: layers[layer:1..L-1] = if layer > 1 +MPI Rank 1: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 1: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 1: CE = if trainingCriterion == 'CE' +MPI Rank 1: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 1: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 1: Err = if evalCriterion == 'Err' then +MPI Rank 1: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 1: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 1: logPrior = LogPrior(labels) +MPI Rank 1: // TODO: how to add a tag to an infix operation? 
+MPI Rank 1: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 1: ] +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize = 20480 +MPI Rank 1: minibatchSize = 64:256:1024 +MPI Rank 1: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 1: numMBsToShowResult = 10 +MPI Rank 1: momentumPerMB = 0.9:0.656119 +MPI Rank 1: dropoutRate = 0.0 +MPI Rank 1: maxEpochs = 3 +MPI Rank 1: keepCheckPointFiles = true +MPI Rank 1: clippingThresholdPerSample = 1#INF +MPI Rank 1: ParallelTrain = [ +MPI Rank 1: parallelizationMethod = "DataParallelSGD" +MPI Rank 1: distributedMBReading = true +MPI Rank 1: DataParallelSGD = [ +MPI Rank 1: gradientBits = 64 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: AutoAdjust = [ +MPI Rank 1: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 1: loadBestModel = true +MPI Rank 1: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 1: learnRateDecreaseFactor = 0.5 +MPI Rank 1: learnRateIncreaseFactor = 1.382 +MPI Rank 1: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "$DataDir$/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "$DataDir$/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: cvreader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.cv.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "$DataDir$/glob_0000.mlf" +MPI Rank 1: 
labelMappingFile = "$DataDir$/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 1: RunDir=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu +MPI Rank 1: DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 1: ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation +MPI Rank 1: OutputDir=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu +MPI Rank 1: DeviceId=-1 +MPI Rank 1: numCPUThreads=2 +MPI Rank 1: stderr=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: precision = "double" +MPI Rank 1: command = speechTrain +MPI Rank 1: deviceId = -1 +MPI Rank 1: parallelTrain = true +MPI Rank 1: speechTrain = [ +MPI Rank 1: action = "train" +MPI Rank 1: modelPath = "/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/models/cntkSpeech.dnn" +MPI Rank 1: deviceId = -1 +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SimpleNetworkBuilder = [ +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 1: evalCriterion = "ErrorPrediction" +MPI Rank 1: layerTypes = "Sigmoid" +MPI Rank 1: initValueScale = 1.0 +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: uniformInit = true +MPI Rank 1: needPrior = true +MPI Rank 1: ] +MPI Rank 1: ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = 'CE' +MPI Rank 1: evalCriterion = 'Err' +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: L = Length(layerSizes)-1 // number of model layers +MPI Rank 1: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 1: featNorm = if applyMeanVarNorm +MPI Rank 1: then MeanVarNorm(features) +MPI Rank 1: else features +MPI Rank 1: layers[layer:1..L-1] = if layer > 1 +MPI Rank 1: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 1: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 1: CE = if trainingCriterion == 'CE' +MPI Rank 1: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 1: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 1: Err = if evalCriterion == 'Err' then +MPI Rank 1: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 1: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 1: logPrior = LogPrior(labels) +MPI Rank 1: // TODO: how to add a tag to an infix operation? 
+MPI Rank 1: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 1: ] +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize = 20480 +MPI Rank 1: minibatchSize = 64:256:1024 +MPI Rank 1: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 1: numMBsToShowResult = 10 +MPI Rank 1: momentumPerMB = 0.9:0.656119 +MPI Rank 1: dropoutRate = 0.0 +MPI Rank 1: maxEpochs = 3 +MPI Rank 1: keepCheckPointFiles = true +MPI Rank 1: clippingThresholdPerSample = 1#INF +MPI Rank 1: ParallelTrain = [ +MPI Rank 1: parallelizationMethod = "DataParallelSGD" +MPI Rank 1: distributedMBReading = true +MPI Rank 1: DataParallelSGD = [ +MPI Rank 1: gradientBits = 64 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: AutoAdjust = [ +MPI Rank 1: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 1: loadBestModel = true +MPI Rank 1: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 1: learnRateDecreaseFactor = 0.5 +MPI Rank 1: learnRateIncreaseFactor = 1.382 +MPI Rank 1: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: cvreader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" 
+MPI Rank 1: scpFile = "glob_0000.cv.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 1: RunDir=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu +MPI Rank 1: DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 1: ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation +MPI Rank 1: OutputDir=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu +MPI Rank 1: DeviceId=-1 +MPI Rank 1: numCPUThreads=2 +MPI Rank 1: stderr=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: configparameters: cntkcv.cntk:command=speechTrain +MPI Rank 1: configparameters: cntkcv.cntk:ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation +MPI Rank 1: configparameters: cntkcv.cntk:currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 1: configparameters: cntkcv.cntk:DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 1: configparameters: cntkcv.cntk:deviceId=-1 +MPI Rank 1: configparameters: cntkcv.cntk:numCPUThreads=2 +MPI Rank 1: 
configparameters: cntkcv.cntk:OutputDir=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu +MPI Rank 1: configparameters: cntkcv.cntk:parallelTrain=true +MPI Rank 1: configparameters: cntkcv.cntk:precision=double +MPI Rank 1: configparameters: cntkcv.cntk:RunDir=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu +MPI Rank 1: configparameters: cntkcv.cntk:speechTrain=[ +MPI Rank 1: action = "train" +MPI Rank 1: modelPath = "/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/models/cntkSpeech.dnn" +MPI Rank 1: deviceId = -1 +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SimpleNetworkBuilder = [ +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 1: evalCriterion = "ErrorPrediction" +MPI Rank 1: layerTypes = "Sigmoid" +MPI Rank 1: initValueScale = 1.0 +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: uniformInit = true +MPI Rank 1: needPrior = true +MPI Rank 1: ] +MPI Rank 1: ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = 'CE' +MPI Rank 1: evalCriterion = 'Err' +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: L = Length(layerSizes)-1 // number of model layers +MPI Rank 1: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 1: featNorm = if applyMeanVarNorm +MPI Rank 1: then MeanVarNorm(features) +MPI Rank 1: else features +MPI Rank 1: layers[layer:1..L-1] = if layer > 1 +MPI Rank 1: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 1: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 1: CE = if trainingCriterion == 'CE' +MPI Rank 1: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 1: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 1: Err = if evalCriterion == 'Err' then +MPI Rank 1: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 1: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 1: logPrior = LogPrior(labels) +MPI Rank 1: // TODO: how to add a tag to an infix operation? 
+MPI Rank 1: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 1: ] +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize = 20480 +MPI Rank 1: minibatchSize = 64:256:1024 +MPI Rank 1: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 1: numMBsToShowResult = 10 +MPI Rank 1: momentumPerMB = 0.9:0.656119 +MPI Rank 1: dropoutRate = 0.0 +MPI Rank 1: maxEpochs = 3 +MPI Rank 1: keepCheckPointFiles = true +MPI Rank 1: clippingThresholdPerSample = 1#INF +MPI Rank 1: ParallelTrain = [ +MPI Rank 1: parallelizationMethod = "DataParallelSGD" +MPI Rank 1: distributedMBReading = true +MPI Rank 1: DataParallelSGD = [ +MPI Rank 1: gradientBits = 64 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: AutoAdjust = [ +MPI Rank 1: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 1: loadBestModel = true +MPI Rank 1: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 1: learnRateDecreaseFactor = 0.5 +MPI Rank 1: learnRateIncreaseFactor = 1.382 +MPI Rank 1: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: cvreader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" 
+MPI Rank 1: scpFile = "glob_0000.cv.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: cntkcv.cntk:stderr=/tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/stderr +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: Commands: speechTrain +MPI Rank 1: Precision = "double" +MPI Rank 1: Using 2 CPU threads. +MPI Rank 1: CNTKModelPath: /tmp/cntk-test-20160303172706.796822/Speech/DNN_ParallelCrossValidation@release_cpu/models/cntkSpeech.dnn +MPI Rank 1: CNTKCommandTrainInfo: speechTrain : 3 +MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 1: +MPI Rank 1: ############################################################################## +MPI Rank 1: # # +MPI Rank 1: # Action "train" # +MPI Rank 1: # # +MPI Rank 1: ############################################################################## +MPI Rank 1: +MPI Rank 1: CNTKCommandTrainBegin: speechTrain +MPI Rank 1: SimpleNetworkBuilder Using CPU +MPI Rank 1: reading script file glob_0000.scp ... 948 entries +MPI Rank 1: total 132 state names in state list /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list +MPI Rank 1: htkmlfreader: reading MLF file /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf ... 
total 948 entries +MPI Rank 1: ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances +MPI Rank 1: label set 0: 129 classes +MPI Rank 1: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames +MPI Rank 1: reading script file glob_0000.cv.scp ... 300 entries +MPI Rank 1: total 132 state names in state list /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list +MPI Rank 1: htkmlfreader: reading MLF file /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf ... total 948 entries +MPI Rank 1: ...........................................................................feature set 0: 83050 frames in 300 out of 300 utterances +MPI Rank 1: label set 0: 129 classes +MPI Rank 1: minibatchutterancesource: 300 utterances grouped into 1 chunks, av. chunk size: 300.0 utterances, 83050.0 frames +MPI Rank 1: +MPI Rank 1: Post-processing network... 
+MPI Rank 1: +MPI Rank 1: 7 roots: +MPI Rank 1: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax +MPI Rank 1: EvalErrorPrediction = ErrorPrediction +MPI Rank 1: InvStdOfFeatures = InvStdDev +MPI Rank 1: MeanOfFeatures = Mean +MPI Rank 1: PosteriorProb = Softmax +MPI Rank 1: Prior = Mean +MPI Rank 1: ScaledLogLikelihood = Minus +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for CrossEntropyWithSoftmax CrossEntropyWithSoftmax operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for EvalErrorPrediction ErrorPrediction operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for InvStdOfFeatures InvStdDev operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for MeanOfFeatures Mean operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for PosteriorProb Softmax operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for Prior Mean operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for ScaledLogLikelihood Minus operation +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating network. 25 nodes to process in pass 1. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132 x *] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132 x 512] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512 x 512] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512 x 363] +MPI Rank 1: Validating --> features = InputValue -> [363 x *] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363 x *]) -> [363] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363 x *]) -> [363] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 x *], MeanOfFeatures[363], InvStdOfFeatures[363]) -> [363 x *] +MPI Rank 1: Validating --> W0*features = Times(W0[512 x 363], MVNormalizedFeatures[363 x *]) -> [512 x *] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512 x 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512 x *], B0[512 x 1]) -> [512 x 1 x *] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512 x 512], H1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512 x 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 x *], B1[512 x 1]) -> [512 x 1 x *] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132 x 512], H2[512 x 1 x *]) -> [132 x 1 x *] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132 x 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132 x 1 x *], B2[132 x 1]) -> [132 x 1 x *] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 1: Validating --> PosteriorProb = Softmax(HLast[132 x 1 x *]) -> [132 x 1 x *] +MPI Rank 1: Validating --> Prior = Mean(labels[132 x *]) -> [132] +MPI Rank 1: 
Validating --> LogOfPrior = Log(Prior[132]) -> [132] +MPI Rank 1: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 x *], LogOfPrior[132]) -> [132 x 1 x *] +MPI Rank 1: +MPI Rank 1: Validating network. 17 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132 x *] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132 x 512] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512 x 512] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512 x 363] +MPI Rank 1: Validating --> features = InputValue -> [363 x *] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363 x *]) -> [363] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363 x *]) -> [363] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 x *], MeanOfFeatures[363], InvStdOfFeatures[363]) -> [363 x *] +MPI Rank 1: Validating --> W0*features = Times(W0[512 x 363], MVNormalizedFeatures[363 x *]) -> [512 x *] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512 x 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512 x *], B0[512 x 1]) -> [512 x 1 x *] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512 x 512], H1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512 x 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 x *], B1[512 x 1]) -> [512 x 1 x *] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132 x 512], H2[512 x 1 x *]) -> [132 x 1 x *] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132 x 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132 x 1 x *], B2[132 x 1]) -> [132 x 1 x *] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 1: Validating --> 
EvalErrorPrediction = ErrorPrediction(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 1: Validating --> PosteriorProb = Softmax(HLast[132 x 1 x *]) -> [132 x 1 x *] +MPI Rank 1: Validating --> Prior = Mean(labels[132 x *]) -> [132] +MPI Rank 1: Validating --> LogOfPrior = Log(Prior[132]) -> [132] +MPI Rank 1: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 x *], LogOfPrior[132]) -> [132 x 1 x *] +MPI Rank 1: +MPI Rank 1: Validating network, final pass. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132 x *] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132 x 512] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512 x 512] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512 x 363] +MPI Rank 1: Validating --> features = InputValue -> [363 x *] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363 x *]) -> [363] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363 x *]) -> [363] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 x *], MeanOfFeatures[363], InvStdOfFeatures[363]) -> [363 x *] +MPI Rank 1: Validating --> W0*features = Times(W0[512 x 363], MVNormalizedFeatures[363 x *]) -> [512 x *] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512 x 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512 x *], B0[512 x 1]) -> [512 x 1 x *] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512 x 512], H1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512 x 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 x *], B1[512 x 1]) -> [512 x 1 x *] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132 x 512], H2[512 x 1 x *]) -> [132 x 1 x *] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132 x 1] +MPI Rank 1: Validating 
--> HLast = Plus(W2*H1[132 x 1 x *], B2[132 x 1]) -> [132 x 1 x *] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 1: Validating --> PosteriorProb = Softmax(HLast[132 x 1 x *]) -> [132 x 1 x *] +MPI Rank 1: Validating --> Prior = Mean(labels[132 x *]) -> [132] +MPI Rank 1: Validating --> LogOfPrior = Log(Prior[132]) -> [132] +MPI Rank 1: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 x *], LogOfPrior[132]) -> [132 x 1 x *] +MPI Rank 1: +MPI Rank 1: 12 out of 25 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: Post-processing network complete. +MPI Rank 1: +MPI Rank 1: SGD using CPU. +MPI Rank 1: +MPI Rank 1: Training criterion node(s): +MPI Rank 1: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax +MPI Rank 1: +MPI Rank 1: Evaluation criterion node(s): +MPI Rank 1: EvalErrorPrediction = ErrorPrediction +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. +MPI Rank 1: +MPI Rank 1: Precomputing --> 3 PreCompute nodes found. +MPI Rank 1: +MPI Rank 1: NodeName: MeanOfFeatures +MPI Rank 1: NodeName: InvStdOfFeatures +MPI Rank 1: NodeName: Prior +MPI Rank 1: minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 1: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms +MPI Rank 1: +MPI Rank 1: Precomputing --> Completed. 
+MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 momentum as time constant = 607.4 samples +MPI Rank 1: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 1 of 2, with 1 datapasses +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 1- 10, 3.12%]: SamplesSeen = 640; TrainLossPerSample = 4.36628272; EvalErr[0]PerSample = 0.90937500; TotalTime = 0.6513s; SamplesPerSecond = 982.6 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 11- 20, 6.25%]: SamplesSeen = 640; TrainLossPerSample = 4.15914991; EvalErr[0]PerSample = 0.89218750; TotalTime = 0.7000s; SamplesPerSecond = 914.3 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 21- 30, 9.38%]: SamplesSeen = 640; TrainLossPerSample = 3.99837967; EvalErr[0]PerSample = 0.86875000; TotalTime = 0.5136s; SamplesPerSecond = 1246.2 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 31- 40, 12.50%]: SamplesSeen = 640; TrainLossPerSample = 3.86616341; EvalErr[0]PerSample = 0.86250000; TotalTime = 0.6912s; SamplesPerSecond = 926.0 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 41- 50, 15.62%]: SamplesSeen = 640; TrainLossPerSample = 3.80082643; EvalErr[0]PerSample = 0.87968750; TotalTime = 0.7486s; SamplesPerSecond = 855.0 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 51- 60, 18.75%]: SamplesSeen = 640; TrainLossPerSample = 3.73336112; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.5991s; SamplesPerSecond = 1068.3 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 61- 70, 21.88%]: SamplesSeen = 640; TrainLossPerSample = 3.57119384; EvalErr[0]PerSample = 0.82031250; TotalTime = 0.7631s; SamplesPerSecond = 838.7 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 71- 80, 25.00%]: SamplesSeen = 640; TrainLossPerSample = 3.44001005; EvalErr[0]PerSample = 0.81562500; TotalTime = 0.7083s; SamplesPerSecond = 903.5 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 81- 90, 
28.12%]: SamplesSeen = 640; TrainLossPerSample = 3.36131109; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.5294s; SamplesPerSecond = 1209.0 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 91- 100, 31.25%]: SamplesSeen = 640; TrainLossPerSample = 3.39817487; EvalErr[0]PerSample = 0.85000000; TotalTime = 0.5381s; SamplesPerSecond = 1189.3 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 101- 110, 34.38%]: SamplesSeen = 640; TrainLossPerSample = 3.25116276; EvalErr[0]PerSample = 0.77031250; TotalTime = 0.7212s; SamplesPerSecond = 887.4 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 111- 120, 37.50%]: SamplesSeen = 640; TrainLossPerSample = 3.35774005; EvalErr[0]PerSample = 0.79843750; TotalTime = 0.7457s; SamplesPerSecond = 858.3 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 121- 130, 40.62%]: SamplesSeen = 640; TrainLossPerSample = 3.19791351; EvalErr[0]PerSample = 0.76406250; TotalTime = 0.5219s; SamplesPerSecond = 1226.4 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 131- 140, 43.75%]: SamplesSeen = 640; TrainLossPerSample = 3.06449990; EvalErr[0]PerSample = 0.71718750; TotalTime = 0.7330s; SamplesPerSecond = 873.1 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 141- 150, 46.88%]: SamplesSeen = 640; TrainLossPerSample = 3.05357361; EvalErr[0]PerSample = 0.74218750; TotalTime = 0.7142s; SamplesPerSecond = 896.1 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 151- 160, 50.00%]: SamplesSeen = 640; TrainLossPerSample = 3.02144079; EvalErr[0]PerSample = 0.74531250; TotalTime = 0.6251s; SamplesPerSecond = 1023.8 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 161- 170, 53.12%]: SamplesSeen = 640; TrainLossPerSample = 2.89890004; EvalErr[0]PerSample = 0.69687500; TotalTime = 0.7733s; SamplesPerSecond = 827.6 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 171- 180, 56.25%]: SamplesSeen = 640; TrainLossPerSample = 2.74598358; EvalErr[0]PerSample = 0.68593750; TotalTime = 0.7083s; SamplesPerSecond = 903.5 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 181- 190, 59.38%]: SamplesSeen = 640; TrainLossPerSample = 2.83604141; EvalErr[0]PerSample = 0.70625000; 
TotalTime = 0.5757s; SamplesPerSecond = 1111.7 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 191- 200, 62.50%]: SamplesSeen = 640; TrainLossPerSample = 2.62522562; EvalErr[0]PerSample = 0.64687500; TotalTime = 0.5510s; SamplesPerSecond = 1161.6 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 201- 210, 65.62%]: SamplesSeen = 640; TrainLossPerSample = 2.65507979; EvalErr[0]PerSample = 0.66562500; TotalTime = 0.8584s; SamplesPerSecond = 745.5 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 211- 220, 68.75%]: SamplesSeen = 640; TrainLossPerSample = 2.59593989; EvalErr[0]PerSample = 0.65937500; TotalTime = 0.5817s; SamplesPerSecond = 1100.2 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 221- 230, 71.88%]: SamplesSeen = 640; TrainLossPerSample = 2.51177605; EvalErr[0]PerSample = 0.62343750; TotalTime = 0.5232s; SamplesPerSecond = 1223.1 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 231- 240, 75.00%]: SamplesSeen = 640; TrainLossPerSample = 2.42438840; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.7371s; SamplesPerSecond = 868.3 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 241- 250, 78.12%]: SamplesSeen = 640; TrainLossPerSample = 2.40372959; EvalErr[0]PerSample = 0.65156250; TotalTime = 0.6779s; SamplesPerSecond = 944.0 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 251- 260, 81.25%]: SamplesSeen = 640; TrainLossPerSample = 2.48277420; EvalErr[0]PerSample = 0.63906250; TotalTime = 0.5721s; SamplesPerSecond = 1118.7 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 261- 270, 84.38%]: SamplesSeen = 640; TrainLossPerSample = 2.34181483; EvalErr[0]PerSample = 0.61718750; TotalTime = 0.7323s; SamplesPerSecond = 874.0 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 271- 280, 87.50%]: SamplesSeen = 640; TrainLossPerSample = 2.22951559; EvalErr[0]PerSample = 0.57656250; TotalTime = 0.6802s; SamplesPerSecond = 940.9 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 281- 290, 90.62%]: SamplesSeen = 640; TrainLossPerSample = 2.32715885; EvalErr[0]PerSample = 0.62031250; TotalTime = 0.5897s; SamplesPerSecond = 1085.3 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 291- 
300, 93.75%]: SamplesSeen = 640; TrainLossPerSample = 2.21143816; EvalErr[0]PerSample = 0.61406250; TotalTime = 0.5291s; SamplesPerSecond = 1209.5 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 301- 310, 96.88%]: SamplesSeen = 640; TrainLossPerSample = 2.29118500; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.7085s; SamplesPerSecond = 903.3 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 311- 320, 100.00%]: SamplesSeen = 640; TrainLossPerSample = 2.19155470; EvalErr[0]PerSample = 0.56406250; TotalTime = 0.7657s; SamplesPerSecond = 835.8 +MPI Rank 1: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0129278; TotalSamplesSeen = 20480; EvalErrPerSample = 0.7277832; AvgLearningRatePerSample = 0.015625; EpochTime=21.0877 +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. +MPI Rank 1: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 1: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms +MPI Rank 1: Final Results: Minibatch[1-1298]: SamplesSeen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 2.1824241 Perplexity = 8.8677763 EvalErrorPrediction: ErrorPrediction/Sample = 0.58616496 +MPI Rank 1: Finished Epoch[ 1 of 3]: [Validation Set] TrainLossPerSample = 2.1824241; EvalErrPerSample = 0.58616496 +MPI Rank 1: +MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 momentum as time constant = 607.5 samples +MPI Rank 1: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 1 of 2, with 1 datapasses +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. 
+MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 1- 10, 12.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.05064112; EvalErr[0]PerSample = 0.55039063; TotalTime = 1.5368s; SamplesPerSecond = 1665.8 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 11- 20, 25.00%]: SamplesSeen = 2560; TrainLossPerSample = 2.02000655; EvalErr[0]PerSample = 0.54492188; TotalTime = 1.3231s; SamplesPerSecond = 1934.8 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 21- 30, 37.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.01868507; EvalErr[0]PerSample = 0.55000000; TotalTime = 1.2784s; SamplesPerSecond = 2002.5 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 31- 40, 50.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.96698601; EvalErr[0]PerSample = 0.53867188; TotalTime = 1.4403s; SamplesPerSecond = 1777.5 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 41- 50, 62.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.93942125; EvalErr[0]PerSample = 0.54023438; TotalTime = 1.1193s; SamplesPerSecond = 2287.1 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 51- 60, 75.00%]: SamplesSeen = 2560; TrainLossPerSample = 2.00412188; EvalErr[0]PerSample = 0.54335937; TotalTime = 1.4540s; SamplesPerSecond = 1760.7 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 61- 70, 87.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.93180079; EvalErr[0]PerSample = 0.52343750; TotalTime = 1.3134s; SamplesPerSecond = 1949.2 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 71- 80, 100.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.94186507; EvalErr[0]PerSample = 0.54257813; TotalTime = 1.3101s; SamplesPerSecond = 1954.1 +MPI Rank 1: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.984191; TotalSamplesSeen = 40960; EvalErrPerSample = 0.54169922; AvgLearningRatePerSample = 0.001953125; EpochTime=10.7924 +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 1: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 1: Final Results: Minibatch[1-325]: SamplesSeen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 1.8974794 Perplexity = 6.6690634 EvalErrorPrediction: ErrorPrediction/Sample = 0.52758579 +MPI Rank 1: Finished Epoch[ 2 of 3]: [Validation Set] TrainLossPerSample = 1.8974794; EvalErrPerSample = 0.52758579 +MPI Rank 1: +MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000098 effective momentum = 0.656119 momentum as time constant = 2429.9 samples +MPI Rank 1: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 1 of 2, with 1 datapasses +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 1- 10, 50.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.90809843; EvalErr[0]PerSample = 0.52558594; TotalTime = 4.0313s; SamplesPerSecond = 2540.1 +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 11- 20, 100.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.90322337; EvalErr[0]PerSample = 0.52568359; TotalTime = 3.9330s; SamplesPerSecond = 2603.6 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.9056609; TotalSamplesSeen = 61440; EvalErrPerSample = 0.52563477; AvgLearningRatePerSample = 9.7656251e-05; EpochTime=7.99776 +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 1: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 1: Final Results: Minibatch[1-82]: SamplesSeen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 1.8771737 Perplexity = 6.5350089 EvalErrorPrediction: ErrorPrediction/Sample = 0.51938591 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Validation Set] TrainLossPerSample = 1.8771737; EvalErrPerSample = 0.51938591 +MPI Rank 1: CNTKCommandTrainEnd: speechTrain +MPI Rank 1: +MPI Rank 1: Action "train" complete. +MPI Rank 1: +MPI Rank 1: COMPLETED +MPI Rank 1: ~MPIWrapper diff --git a/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/baseline.gpu.txt b/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/baseline.gpu.txt new file mode 100644 index 000000000000..38de79728be4 --- /dev/null +++ b/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/baseline.gpu.txt @@ -0,0 +1,1307 @@ +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 17:23:45 + Last modified date: Thu Mar 3 17:23:41 2016 + Build type: debug + Build target: GPU + With 1bit-SGD: no + Math lib: acml + CUDA_PATH: /usr/local/cuda-7.0 + CUB_PATH: /usr/local/cub-1.4.1 + CUDNN_PATH: /usr/local/cudnn-4.0 + Build Branch: HEAD + Build SHA1: dafcfee4846f7c5a7d3b29ace536b8734ff409d1 + Built by philly on Source/CNTK/buildinfo.h0 + Build Path: Source/CNTK/buildinfo.h1 +------------------------------------------------------------------- +Changed current directory to '/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data' +MPIWrapper: initializing MPI +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 17:23:45 + Last modified date: Thu Mar 3 17:23:41 2016 + Build type: debug + Build target: GPU + With 1bit-SGD: no + Math lib: acml + CUDA_PATH: /usr/local/cuda-7.0 + CUB_PATH: /usr/local/cub-1.4.1 + CUDNN_PATH: /usr/local/cudnn-4.0 + 
Build Branch: HEAD + Build SHA1: dafcfee4846f7c5a7d3b29ace536b8734ff409d1 + Built by philly on Source/CNTK/buildinfo.h0 + Build Path: Source/CNTK/buildinfo.h1 +------------------------------------------------------------------- +Changed current directory to '/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data' +MPIWrapper: initializing MPI +-------------------------------------------------------------------------- +[[59648,1],0]: A high-performance Open MPI point-to-point messaging module +was unable to find any relevant network interfaces: + +Module: OpenFabrics (openib) + Host: 01c36695f011 + +Another transport will be used instead, although this may result in +lower performance. +-------------------------------------------------------------------------- +ping [requestnodes (before change)]: 2 nodes pinging each other +ping [requestnodes (before change)]: 2 nodes pinging each other +ping [requestnodes (before change)]: all 2 nodes responded +requestnodes [MPIWrapper]: using 2 out of 2 MPI nodes (2 requested); we (1) are in (participating) +ping [requestnodes (after change)]: 2 nodes pinging each other +ping [requestnodes (after change)]: all 2 nodes responded +mpihelper: we are cog 1 in a gearbox of 2 +ping [mpihelper]: 2 nodes pinging each other +ping [mpihelper]: all 2 nodes responded +ping [requestnodes (before change)]: all 2 nodes responded +requestnodes [MPIWrapper]: using 2 out of 2 MPI nodes (2 requested); we (0) are in (participating) +ping [requestnodes (after change)]: 2 nodes pinging each other +ping [requestnodes (after change)]: all 2 nodes responded +mpihelper: we are cog 0 in a gearbox of 2 +ping [mpihelper]: 2 nodes pinging each other +ping [mpihelper]: all 2 nodes responded +Redirecting stderr to file /tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/stderr_speechTrain.logrank0 +Redirecting stderr to file 
/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/stderr_speechTrain.logrank1 +[01c36695f011:00369] 1 more process has sent help message help-mpi-btl-base.txt / btl:no-nics +[01c36695f011:00369] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: Build info: +MPI Rank 0: +MPI Rank 0: Built time: Mar 3 2016 17:23:45 +MPI Rank 0: Last modified date: Thu Mar 3 17:23:41 2016 +MPI Rank 0: Build type: debug +MPI Rank 0: Build target: GPU +MPI Rank 0: With 1bit-SGD: no +MPI Rank 0: Math lib: acml +MPI Rank 0: CUDA_PATH: /usr/local/cuda-7.0 +MPI Rank 0: CUB_PATH: /usr/local/cub-1.4.1 +MPI Rank 0: CUDNN_PATH: /usr/local/cudnn-4.0 +MPI Rank 0: Build Branch: HEAD +MPI Rank 0: Build SHA1: dafcfee4846f7c5a7d3b29ace536b8734ff409d1 +MPI Rank 0: Built by philly on Source/CNTK/buildinfo.h0 +MPI Rank 0: Build Path: Source/CNTK/buildinfo.h1 +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: running on localhost at 2016/03/03 17:28:01 +MPI Rank 0: command line: +MPI Rank 0: /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/build/gpu/debug/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/cntkcv.cntk currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data RunDir=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation OutputDir=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu DeviceId=0 numCPUThreads=2 stderr=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/stderr +MPI Rank 0: +MPI Rank 
0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: precision = "double" +MPI Rank 0: command = speechTrain +MPI Rank 0: deviceId = $DeviceId$ +MPI Rank 0: parallelTrain = true +MPI Rank 0: speechTrain = [ +MPI Rank 0: action = "train" +MPI Rank 0: modelPath = "$RunDir$/models/cntkSpeech.dnn" +MPI Rank 0: deviceId = $DeviceId$ +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SimpleNetworkBuilder = [ +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 0: evalCriterion = "ErrorPrediction" +MPI Rank 0: layerTypes = "Sigmoid" +MPI Rank 0: initValueScale = 1.0 +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: uniformInit = true +MPI Rank 0: needPrior = true +MPI Rank 0: ] +MPI Rank 0: ExperimentalNetworkBuilder = [ // the same as above but with BS. Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = 'CE' +MPI Rank 0: evalCriterion = 'Err' +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: L = Length(layerSizes)-1 // number of model layers +MPI Rank 0: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 0: featNorm = if applyMeanVarNorm +MPI Rank 0: then MeanVarNorm(features) +MPI Rank 0: else features +MPI Rank 0: layers[layer:1..L-1] = if layer > 1 +MPI Rank 0: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 0: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 0: CE = if trainingCriterion == 'CE' +MPI Rank 0: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 0: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 0: Err = if evalCriterion == 'Err' then +MPI Rank 0: 
ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 0: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 0: logPrior = LogPrior(labels) +MPI Rank 0: // TODO: how to add a tag to an infix operation? +MPI Rank 0: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 0: ] +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize = 20480 +MPI Rank 0: minibatchSize = 64:256:1024 +MPI Rank 0: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 0: numMBsToShowResult = 10 +MPI Rank 0: momentumPerMB = 0.9:0.656119 +MPI Rank 0: dropoutRate = 0.0 +MPI Rank 0: maxEpochs = 3 +MPI Rank 0: keepCheckPointFiles = true +MPI Rank 0: clippingThresholdPerSample = 1#INF +MPI Rank 0: ParallelTrain = [ +MPI Rank 0: parallelizationMethod = "DataParallelSGD" +MPI Rank 0: distributedMBReading = true +MPI Rank 0: DataParallelSGD = [ +MPI Rank 0: gradientBits = 64 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: AutoAdjust = [ +MPI Rank 0: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 0: loadBestModel = true +MPI Rank 0: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 0: learnRateDecreaseFactor = 0.5 +MPI Rank 0: learnRateIncreaseFactor = 1.382 +MPI Rank 0: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "$DataDir$/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "$DataDir$/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: cvreader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: 
features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.cv.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "$DataDir$/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "$DataDir$/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 0: RunDir=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu +MPI Rank 0: DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 0: ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation +MPI Rank 0: OutputDir=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu +MPI Rank 0: DeviceId=0 +MPI Rank 0: numCPUThreads=2 +MPI Rank 0: stderr=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: precision = "double" +MPI Rank 0: command = speechTrain +MPI Rank 0: deviceId = 0 +MPI Rank 0: parallelTrain = true +MPI Rank 0: speechTrain = [ +MPI Rank 0: action = "train" +MPI Rank 0: modelPath = "/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/models/cntkSpeech.dnn" +MPI Rank 0: deviceId = 0 +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SimpleNetworkBuilder = [ +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 0: evalCriterion = "ErrorPrediction" +MPI Rank 0: layerTypes = "Sigmoid" +MPI Rank 0: initValueScale = 1.0 +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: uniformInit = true +MPI Rank 0: needPrior = 
true +MPI Rank 0: ] +MPI Rank 0: ExperimentalNetworkBuilder = [ // the same as above but with BS. Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = 'CE' +MPI Rank 0: evalCriterion = 'Err' +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: L = Length(layerSizes)-1 // number of model layers +MPI Rank 0: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 0: featNorm = if applyMeanVarNorm +MPI Rank 0: then MeanVarNorm(features) +MPI Rank 0: else features +MPI Rank 0: layers[layer:1..L-1] = if layer > 1 +MPI Rank 0: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 0: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 0: CE = if trainingCriterion == 'CE' +MPI Rank 0: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 0: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 0: Err = if evalCriterion == 'Err' then +MPI Rank 0: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 0: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 0: logPrior = LogPrior(labels) +MPI Rank 0: // TODO: how to add a tag to an infix operation? 
+MPI Rank 0: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 0: ] +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize = 20480 +MPI Rank 0: minibatchSize = 64:256:1024 +MPI Rank 0: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 0: numMBsToShowResult = 10 +MPI Rank 0: momentumPerMB = 0.9:0.656119 +MPI Rank 0: dropoutRate = 0.0 +MPI Rank 0: maxEpochs = 3 +MPI Rank 0: keepCheckPointFiles = true +MPI Rank 0: clippingThresholdPerSample = 1#INF +MPI Rank 0: ParallelTrain = [ +MPI Rank 0: parallelizationMethod = "DataParallelSGD" +MPI Rank 0: distributedMBReading = true +MPI Rank 0: DataParallelSGD = [ +MPI Rank 0: gradientBits = 64 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: AutoAdjust = [ +MPI Rank 0: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 0: loadBestModel = true +MPI Rank 0: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 0: learnRateDecreaseFactor = 0.5 +MPI Rank 0: learnRateIncreaseFactor = 1.382 +MPI Rank 0: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: cvreader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" 
+MPI Rank 0: scpFile = "glob_0000.cv.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 0: RunDir=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu +MPI Rank 0: DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 0: ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation +MPI Rank 0: OutputDir=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu +MPI Rank 0: DeviceId=0 +MPI Rank 0: numCPUThreads=2 +MPI Rank 0: stderr=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: configparameters: cntkcv.cntk:command=speechTrain +MPI Rank 0: configparameters: cntkcv.cntk:ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation +MPI Rank 0: configparameters: cntkcv.cntk:currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 0: configparameters: cntkcv.cntk:DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 0: configparameters: cntkcv.cntk:deviceId=0 +MPI Rank 0: configparameters: cntkcv.cntk:numCPUThreads=2 +MPI Rank 0: 
configparameters: cntkcv.cntk:OutputDir=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu +MPI Rank 0: configparameters: cntkcv.cntk:parallelTrain=true +MPI Rank 0: configparameters: cntkcv.cntk:precision=double +MPI Rank 0: configparameters: cntkcv.cntk:RunDir=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu +MPI Rank 0: configparameters: cntkcv.cntk:speechTrain=[ +MPI Rank 0: action = "train" +MPI Rank 0: modelPath = "/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/models/cntkSpeech.dnn" +MPI Rank 0: deviceId = 0 +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SimpleNetworkBuilder = [ +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 0: evalCriterion = "ErrorPrediction" +MPI Rank 0: layerTypes = "Sigmoid" +MPI Rank 0: initValueScale = 1.0 +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: uniformInit = true +MPI Rank 0: needPrior = true +MPI Rank 0: ] +MPI Rank 0: ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = 'CE' +MPI Rank 0: evalCriterion = 'Err' +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: L = Length(layerSizes)-1 // number of model layers +MPI Rank 0: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 0: featNorm = if applyMeanVarNorm +MPI Rank 0: then MeanVarNorm(features) +MPI Rank 0: else features +MPI Rank 0: layers[layer:1..L-1] = if layer > 1 +MPI Rank 0: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 0: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 0: CE = if trainingCriterion == 'CE' +MPI Rank 0: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 0: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 0: Err = if evalCriterion == 'Err' then +MPI Rank 0: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 0: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 0: logPrior = LogPrior(labels) +MPI Rank 0: // TODO: how to add a tag to an infix operation? 
+MPI Rank 0: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 0: ] +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize = 20480 +MPI Rank 0: minibatchSize = 64:256:1024 +MPI Rank 0: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 0: numMBsToShowResult = 10 +MPI Rank 0: momentumPerMB = 0.9:0.656119 +MPI Rank 0: dropoutRate = 0.0 +MPI Rank 0: maxEpochs = 3 +MPI Rank 0: keepCheckPointFiles = true +MPI Rank 0: clippingThresholdPerSample = 1#INF +MPI Rank 0: ParallelTrain = [ +MPI Rank 0: parallelizationMethod = "DataParallelSGD" +MPI Rank 0: distributedMBReading = true +MPI Rank 0: DataParallelSGD = [ +MPI Rank 0: gradientBits = 64 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: AutoAdjust = [ +MPI Rank 0: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 0: loadBestModel = true +MPI Rank 0: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 0: learnRateDecreaseFactor = 0.5 +MPI Rank 0: learnRateIncreaseFactor = 1.382 +MPI Rank 0: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: cvreader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" 
+MPI Rank 0: scpFile = "glob_0000.cv.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: cntkcv.cntk:stderr=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/stderr +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: Commands: speechTrain +MPI Rank 0: Precision = "double" +MPI Rank 0: Using 2 CPU threads. +MPI Rank 0: CNTKModelPath: /tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/models/cntkSpeech.dnn +MPI Rank 0: CNTKCommandTrainInfo: speechTrain : 3 +MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 0: +MPI Rank 0: ############################################################################## +MPI Rank 0: # # +MPI Rank 0: # Action "train" # +MPI Rank 0: # # +MPI Rank 0: ############################################################################## +MPI Rank 0: +MPI Rank 0: CNTKCommandTrainBegin: speechTrain +MPI Rank 0: SimpleNetworkBuilder Using GPU 0 +MPI Rank 0: reading script file glob_0000.scp ... 948 entries +MPI Rank 0: total 132 state names in state list /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list +MPI Rank 0: htkmlfreader: reading MLF file /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf ... 
total 948 entries +MPI Rank 0: ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances +MPI Rank 0: label set 0: 129 classes +MPI Rank 0: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames +MPI Rank 0: reading script file glob_0000.cv.scp ... 300 entries +MPI Rank 0: total 132 state names in state list /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list +MPI Rank 0: htkmlfreader: reading MLF file /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf ... total 948 entries +MPI Rank 0: ...........................................................................feature set 0: 83050 frames in 300 out of 300 utterances +MPI Rank 0: label set 0: 129 classes +MPI Rank 0: minibatchutterancesource: 300 utterances grouped into 1 chunks, av. chunk size: 300.0 utterances, 83050.0 frames +MPI Rank 0: SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==8 +MPI Rank 0: +MPI Rank 0: Post-processing network... 
+MPI Rank 0: +MPI Rank 0: 7 roots: +MPI Rank 0: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax +MPI Rank 0: EvalErrorPrediction = ErrorPrediction +MPI Rank 0: InvStdOfFeatures = InvStdDev +MPI Rank 0: MeanOfFeatures = Mean +MPI Rank 0: PosteriorProb = Softmax +MPI Rank 0: Prior = Mean +MPI Rank 0: ScaledLogLikelihood = Minus +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for CrossEntropyWithSoftmax CrossEntropyWithSoftmax operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for EvalErrorPrediction ErrorPrediction operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for InvStdOfFeatures InvStdDev operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for MeanOfFeatures Mean operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for PosteriorProb Softmax operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for Prior Mean operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for ScaledLogLikelihood Minus operation +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating network. 25 nodes to process in pass 1. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132 {1} x *] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}] +MPI Rank 0: Validating --> features = InputValue -> [363 {1} x *] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *] +MPI Rank 0: Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 0: 
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 0: Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 0: Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}] +MPI Rank 0: Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}] +MPI Rank 0: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *] +MPI Rank 0: +MPI Rank 0: Validating network. 17 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132 {1} x *] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}] +MPI Rank 0: Validating --> features = InputValue -> [363 {1} x *] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *] +MPI Rank 0: Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> 
[512 x 1 {1,512} x *] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 0: Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 0: Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}] +MPI Rank 0: Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}] +MPI Rank 0: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *] +MPI Rank 0: +MPI Rank 0: Validating network, final pass. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132 {1} x *] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}] +MPI Rank 0: Validating --> features = InputValue -> [363 {1} x *] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *] +MPI Rank 0: Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 0: 
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 0: Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 0: Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}] +MPI Rank 0: Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}] +MPI Rank 0: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *] +MPI Rank 0: +MPI Rank 0: 12 out of 25 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: Post-processing network complete. +MPI Rank 0: +MPI Rank 0: SGD using GPU 0. +MPI Rank 0: +MPI Rank 0: Training criterion node(s): +MPI Rank 0: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax +MPI Rank 0: +MPI Rank 0: Evaluation criterion node(s): +MPI Rank 0: EvalErrorPrediction = ErrorPrediction +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: +MPI Rank 0: Precomputing --> 3 PreCompute nodes found. +MPI Rank 0: +MPI Rank 0: NodeName: MeanOfFeatures +MPI Rank 0: NodeName: InvStdOfFeatures +MPI Rank 0: NodeName: Prior +MPI Rank 0: minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 0: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms +MPI Rank 0: +MPI Rank 0: Precomputing --> Completed. +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 momentum as time constant = 607.4 samples +MPI Rank 0: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 2, with 1 datapasses +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. 
+MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 1- 10, 3.12%]: SamplesSeen = 640; TrainLossPerSample = 4.40318406; EvalErr[0]PerSample = 0.90468750; TotalTime = 0.5039s; SamplesPerSecond = 1270.0 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 11- 20, 6.25%]: SamplesSeen = 640; TrainLossPerSample = 4.15980357; EvalErr[0]PerSample = 0.87187500; TotalTime = 0.5919s; SamplesPerSecond = 1081.3 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 21- 30, 9.38%]: SamplesSeen = 640; TrainLossPerSample = 3.98424210; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.5902s; SamplesPerSecond = 1084.4 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 31- 40, 12.50%]: SamplesSeen = 640; TrainLossPerSample = 3.86209050; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.4978s; SamplesPerSecond = 1285.8 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 41- 50, 15.62%]: SamplesSeen = 640; TrainLossPerSample = 3.80597620; EvalErr[0]PerSample = 0.88593750; TotalTime = 0.4762s; SamplesPerSecond = 1343.9 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 51- 60, 18.75%]: SamplesSeen = 640; TrainLossPerSample = 3.73511552; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.5952s; SamplesPerSecond = 1075.4 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 61- 70, 21.88%]: SamplesSeen = 640; TrainLossPerSample = 3.57260725; EvalErr[0]PerSample = 0.81875000; TotalTime = 0.5756s; SamplesPerSecond = 1112.0 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 71- 80, 25.00%]: SamplesSeen = 640; TrainLossPerSample = 3.42293687; EvalErr[0]PerSample = 0.80468750; TotalTime = 0.5939s; SamplesPerSecond = 1077.6 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 81- 90, 28.12%]: SamplesSeen = 640; TrainLossPerSample = 3.34304309; EvalErr[0]PerSample = 0.76718750; TotalTime = 0.4947s; SamplesPerSecond = 1293.7 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 91- 100, 31.25%]: SamplesSeen = 640; TrainLossPerSample = 3.37037793; EvalErr[0]PerSample = 0.84687500; TotalTime = 0.5568s; SamplesPerSecond = 1149.4 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 101- 110, 34.38%]: SamplesSeen = 640; TrainLossPerSample = 3.21606065; 
EvalErr[0]PerSample = 0.76093750; TotalTime = 0.5515s; SamplesPerSecond = 1160.4 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 111- 120, 37.50%]: SamplesSeen = 640; TrainLossPerSample = 3.31610118; EvalErr[0]PerSample = 0.78437500; TotalTime = 0.4800s; SamplesPerSecond = 1333.4 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 121- 130, 40.62%]: SamplesSeen = 640; TrainLossPerSample = 3.14285888; EvalErr[0]PerSample = 0.75000000; TotalTime = 0.5823s; SamplesPerSecond = 1099.1 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 131- 140, 43.75%]: SamplesSeen = 640; TrainLossPerSample = 3.01821991; EvalErr[0]PerSample = 0.70937500; TotalTime = 0.5501s; SamplesPerSecond = 1163.4 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 141- 150, 46.88%]: SamplesSeen = 640; TrainLossPerSample = 3.01218944; EvalErr[0]PerSample = 0.73906250; TotalTime = 0.6128s; SamplesPerSecond = 1044.4 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 151- 160, 50.00%]: SamplesSeen = 640; TrainLossPerSample = 2.98947652; EvalErr[0]PerSample = 0.73593750; TotalTime = 0.5233s; SamplesPerSecond = 1223.1 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 161- 170, 53.12%]: SamplesSeen = 640; TrainLossPerSample = 2.86297716; EvalErr[0]PerSample = 0.70000000; TotalTime = 0.5186s; SamplesPerSecond = 1234.0 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 171- 180, 56.25%]: SamplesSeen = 640; TrainLossPerSample = 2.71901077; EvalErr[0]PerSample = 0.68593750; TotalTime = 0.5450s; SamplesPerSecond = 1174.3 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 181- 190, 59.38%]: SamplesSeen = 640; TrainLossPerSample = 2.80860596; EvalErr[0]PerSample = 0.71250000; TotalTime = 0.6014s; SamplesPerSecond = 1064.2 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 191- 200, 62.50%]: SamplesSeen = 640; TrainLossPerSample = 2.60590434; EvalErr[0]PerSample = 0.64687500; TotalTime = 0.5394s; SamplesPerSecond = 1186.5 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 201- 210, 65.62%]: SamplesSeen = 640; TrainLossPerSample = 2.63920069; EvalErr[0]PerSample = 0.66875000; TotalTime = 0.5246s; SamplesPerSecond = 1219.9 +MPI 
Rank 0: Epoch[ 1 of 3]-Minibatch[ 211- 220, 68.75%]: SamplesSeen = 640; TrainLossPerSample = 2.58372597; EvalErr[0]PerSample = 0.65781250; TotalTime = 0.6020s; SamplesPerSecond = 1063.2 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 221- 230, 71.88%]: SamplesSeen = 640; TrainLossPerSample = 2.50997096; EvalErr[0]PerSample = 0.62031250; TotalTime = 0.5544s; SamplesPerSecond = 1154.3 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 231- 240, 75.00%]: SamplesSeen = 640; TrainLossPerSample = 2.42126950; EvalErr[0]PerSample = 0.62968750; TotalTime = 0.4947s; SamplesPerSecond = 1293.8 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 241- 250, 78.12%]: SamplesSeen = 640; TrainLossPerSample = 2.40125789; EvalErr[0]PerSample = 0.65156250; TotalTime = 0.5809s; SamplesPerSecond = 1101.8 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 251- 260, 81.25%]: SamplesSeen = 640; TrainLossPerSample = 2.47110816; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.5377s; SamplesPerSecond = 1190.3 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 261- 270, 84.38%]: SamplesSeen = 640; TrainLossPerSample = 2.33215267; EvalErr[0]PerSample = 0.60312500; TotalTime = 0.5818s; SamplesPerSecond = 1100.0 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 271- 280, 87.50%]: SamplesSeen = 640; TrainLossPerSample = 2.21936103; EvalErr[0]PerSample = 0.56875000; TotalTime = 0.5136s; SamplesPerSecond = 1246.0 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 281- 290, 90.62%]: SamplesSeen = 640; TrainLossPerSample = 2.31959580; EvalErr[0]PerSample = 0.61093750; TotalTime = 0.5158s; SamplesPerSecond = 1240.7 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 291- 300, 93.75%]: SamplesSeen = 640; TrainLossPerSample = 2.19592881; EvalErr[0]PerSample = 0.61718750; TotalTime = 0.6005s; SamplesPerSecond = 1065.7 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 301- 310, 96.88%]: SamplesSeen = 640; TrainLossPerSample = 2.28411654; EvalErr[0]PerSample = 0.60000000; TotalTime = 0.6034s; SamplesPerSecond = 1060.6 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 311- 320, 100.00%]: SamplesSeen = 640; 
TrainLossPerSample = 2.18307184; EvalErr[0]PerSample = 0.55781250; TotalTime = 0.4843s; SamplesPerSecond = 1321.5 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9972357; TotalSamplesSeen = 20480; EvalErrPerSample = 0.72426758; AvgLearningRatePerSample = 0.015625; EpochTime=17.6027 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 0: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms +MPI Rank 0: Final Results: Minibatch[1-1298]: SamplesSeen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 2.1793731 Perplexity = 8.8407623 EvalErrorPrediction: ErrorPrediction/Sample = 0.58275738 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Validation Set] TrainLossPerSample = 2.1793731; EvalErrPerSample = 0.58275738 +MPI Rank 0: SGD: Saving checkpoint model '/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/models/cntkSpeech.dnn.1' +MPI Rank 0: +MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 momentum as time constant = 607.5 samples +MPI Rank 0: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 0 of 2, with 1 datapasses +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. 
+MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 1- 10, 12.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.04166118; EvalErr[0]PerSample = 0.54531250; TotalTime = 0.6780s; SamplesPerSecond = 3776.1 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 11- 20, 25.00%]: SamplesSeen = 2560; TrainLossPerSample = 2.01439158; EvalErr[0]PerSample = 0.54023438; TotalTime = 0.6189s; SamplesPerSecond = 4136.1 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 21- 30, 37.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.01068322; EvalErr[0]PerSample = 0.55039063; TotalTime = 0.7011s; SamplesPerSecond = 3651.4 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 31- 40, 50.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.95327476; EvalErr[0]PerSample = 0.53515625; TotalTime = 0.6745s; SamplesPerSecond = 3795.1 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 41- 50, 62.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.92813201; EvalErr[0]PerSample = 0.53867188; TotalTime = 0.6841s; SamplesPerSecond = 3742.2 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 51- 60, 75.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.99300635; EvalErr[0]PerSample = 0.53671875; TotalTime = 0.6736s; SamplesPerSecond = 3800.5 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 61- 70, 87.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.92264042; EvalErr[0]PerSample = 0.52304688; TotalTime = 0.7287s; SamplesPerSecond = 3513.0 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 71- 80, 100.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.92843715; EvalErr[0]PerSample = 0.53554687; TotalTime = 0.6584s; SamplesPerSecond = 3888.4 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9740283; TotalSamplesSeen = 40960; EvalErrPerSample = 0.53813477; AvgLearningRatePerSample = 0.001953125; EpochTime=5.45193 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 0: Final Results: Minibatch[1-325]: SamplesSeen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 1.8879998 Perplexity = 6.6061417 EvalErrorPrediction: ErrorPrediction/Sample = 0.52534618 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Validation Set] TrainLossPerSample = 1.8879998; EvalErrPerSample = 0.52534618 +MPI Rank 0: SGD: Saving checkpoint model '/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/models/cntkSpeech.dnn.2' +MPI Rank 0: +MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000098 effective momentum = 0.656119 momentum as time constant = 2429.9 samples +MPI Rank 0: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 2, with 1 datapasses +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 1- 10, 50.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.89842932; EvalErr[0]PerSample = 0.52373047; TotalTime = 1.2198s; SamplesPerSecond = 8394.9 +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 11- 20, 100.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.89204376; EvalErr[0]PerSample = 0.52128906; TotalTime = 1.1216s; SamplesPerSecond = 9129.8 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8952365; TotalSamplesSeen = 61440; EvalErrPerSample = 0.52250977; AvgLearningRatePerSample = 9.7656251e-05; EpochTime=2.41398 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 0: Final Results: Minibatch[1-82]: SamplesSeen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 1.8678214 Perplexity = 6.4741763 EvalErrorPrediction: ErrorPrediction/Sample = 0.51708609 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Validation Set] TrainLossPerSample = 1.8678214; EvalErrPerSample = 0.51708609 +MPI Rank 0: SGD: Saving checkpoint model '/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/models/cntkSpeech.dnn' +MPI Rank 0: CNTKCommandTrainEnd: speechTrain +MPI Rank 0: +MPI Rank 0: Action "train" complete. +MPI Rank 0: +MPI Rank 0: COMPLETED +MPI Rank 0: ~MPIWrapper +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: Build info: +MPI Rank 1: +MPI Rank 1: Built time: Mar 3 2016 17:23:45 +MPI Rank 1: Last modified date: Thu Mar 3 17:23:41 2016 +MPI Rank 1: Build type: debug +MPI Rank 1: Build target: GPU +MPI Rank 1: With 1bit-SGD: no +MPI Rank 1: Math lib: acml +MPI Rank 1: CUDA_PATH: /usr/local/cuda-7.0 +MPI Rank 1: CUB_PATH: /usr/local/cub-1.4.1 +MPI Rank 1: CUDNN_PATH: /usr/local/cudnn-4.0 +MPI Rank 1: Build Branch: HEAD +MPI Rank 1: Build SHA1: dafcfee4846f7c5a7d3b29ace536b8734ff409d1 +MPI Rank 1: Built by philly on Source/CNTK/buildinfo.h0 +MPI Rank 1: Build Path: Source/CNTK/buildinfo.h1 +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: running on localhost at 2016/03/03 17:28:01 +MPI Rank 1: command line: +MPI Rank 1: /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/build/gpu/debug/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/cntkcv.cntk currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data 
RunDir=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation OutputDir=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu DeviceId=0 numCPUThreads=2 stderr=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/stderr +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: precision = "double" +MPI Rank 1: command = speechTrain +MPI Rank 1: deviceId = $DeviceId$ +MPI Rank 1: parallelTrain = true +MPI Rank 1: speechTrain = [ +MPI Rank 1: action = "train" +MPI Rank 1: modelPath = "$RunDir$/models/cntkSpeech.dnn" +MPI Rank 1: deviceId = $DeviceId$ +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SimpleNetworkBuilder = [ +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 1: evalCriterion = "ErrorPrediction" +MPI Rank 1: layerTypes = "Sigmoid" +MPI Rank 1: initValueScale = 1.0 +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: uniformInit = true +MPI Rank 1: needPrior = true +MPI Rank 1: ] +MPI Rank 1: ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = 'CE' +MPI Rank 1: evalCriterion = 'Err' +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: L = Length(layerSizes)-1 // number of model layers +MPI Rank 1: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 1: featNorm = if applyMeanVarNorm +MPI Rank 1: then MeanVarNorm(features) +MPI Rank 1: else features +MPI Rank 1: layers[layer:1..L-1] = if layer > 1 +MPI Rank 1: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 1: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 1: CE = if trainingCriterion == 'CE' +MPI Rank 1: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 1: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 1: Err = if evalCriterion == 'Err' then +MPI Rank 1: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 1: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 1: logPrior = LogPrior(labels) +MPI Rank 1: // TODO: how to add a tag to an infix operation? 
+MPI Rank 1: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 1: ] +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize = 20480 +MPI Rank 1: minibatchSize = 64:256:1024 +MPI Rank 1: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 1: numMBsToShowResult = 10 +MPI Rank 1: momentumPerMB = 0.9:0.656119 +MPI Rank 1: dropoutRate = 0.0 +MPI Rank 1: maxEpochs = 3 +MPI Rank 1: keepCheckPointFiles = true +MPI Rank 1: clippingThresholdPerSample = 1#INF +MPI Rank 1: ParallelTrain = [ +MPI Rank 1: parallelizationMethod = "DataParallelSGD" +MPI Rank 1: distributedMBReading = true +MPI Rank 1: DataParallelSGD = [ +MPI Rank 1: gradientBits = 64 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: AutoAdjust = [ +MPI Rank 1: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 1: loadBestModel = true +MPI Rank 1: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 1: learnRateDecreaseFactor = 0.5 +MPI Rank 1: learnRateIncreaseFactor = 1.382 +MPI Rank 1: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "$DataDir$/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "$DataDir$/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: cvreader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.cv.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "$DataDir$/glob_0000.mlf" +MPI Rank 1: 
labelMappingFile = "$DataDir$/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 1: RunDir=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu +MPI Rank 1: DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 1: ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation +MPI Rank 1: OutputDir=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu +MPI Rank 1: DeviceId=0 +MPI Rank 1: numCPUThreads=2 +MPI Rank 1: stderr=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: precision = "double" +MPI Rank 1: command = speechTrain +MPI Rank 1: deviceId = 0 +MPI Rank 1: parallelTrain = true +MPI Rank 1: speechTrain = [ +MPI Rank 1: action = "train" +MPI Rank 1: modelPath = "/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/models/cntkSpeech.dnn" +MPI Rank 1: deviceId = 0 +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SimpleNetworkBuilder = [ +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 1: evalCriterion = "ErrorPrediction" +MPI Rank 1: layerTypes = "Sigmoid" +MPI Rank 1: initValueScale = 1.0 +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: uniformInit = true +MPI Rank 1: needPrior = true +MPI Rank 1: ] +MPI Rank 1: ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = 'CE' +MPI Rank 1: evalCriterion = 'Err' +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: L = Length(layerSizes)-1 // number of model layers +MPI Rank 1: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 1: featNorm = if applyMeanVarNorm +MPI Rank 1: then MeanVarNorm(features) +MPI Rank 1: else features +MPI Rank 1: layers[layer:1..L-1] = if layer > 1 +MPI Rank 1: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 1: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 1: CE = if trainingCriterion == 'CE' +MPI Rank 1: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 1: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 1: Err = if evalCriterion == 'Err' then +MPI Rank 1: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 1: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 1: logPrior = LogPrior(labels) +MPI Rank 1: // TODO: how to add a tag to an infix operation? 
+MPI Rank 1: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 1: ] +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize = 20480 +MPI Rank 1: minibatchSize = 64:256:1024 +MPI Rank 1: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 1: numMBsToShowResult = 10 +MPI Rank 1: momentumPerMB = 0.9:0.656119 +MPI Rank 1: dropoutRate = 0.0 +MPI Rank 1: maxEpochs = 3 +MPI Rank 1: keepCheckPointFiles = true +MPI Rank 1: clippingThresholdPerSample = 1#INF +MPI Rank 1: ParallelTrain = [ +MPI Rank 1: parallelizationMethod = "DataParallelSGD" +MPI Rank 1: distributedMBReading = true +MPI Rank 1: DataParallelSGD = [ +MPI Rank 1: gradientBits = 64 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: AutoAdjust = [ +MPI Rank 1: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 1: loadBestModel = true +MPI Rank 1: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 1: learnRateDecreaseFactor = 0.5 +MPI Rank 1: learnRateIncreaseFactor = 1.382 +MPI Rank 1: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: cvreader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" 
+MPI Rank 1: scpFile = "glob_0000.cv.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 1: RunDir=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu +MPI Rank 1: DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 1: ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation +MPI Rank 1: OutputDir=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu +MPI Rank 1: DeviceId=0 +MPI Rank 1: numCPUThreads=2 +MPI Rank 1: stderr=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: configparameters: cntkcv.cntk:command=speechTrain +MPI Rank 1: configparameters: cntkcv.cntk:ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation +MPI Rank 1: configparameters: cntkcv.cntk:currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 1: configparameters: cntkcv.cntk:DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data +MPI Rank 1: configparameters: cntkcv.cntk:deviceId=0 +MPI Rank 1: configparameters: cntkcv.cntk:numCPUThreads=2 +MPI Rank 1: 
configparameters: cntkcv.cntk:OutputDir=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu +MPI Rank 1: configparameters: cntkcv.cntk:parallelTrain=true +MPI Rank 1: configparameters: cntkcv.cntk:precision=double +MPI Rank 1: configparameters: cntkcv.cntk:RunDir=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu +MPI Rank 1: configparameters: cntkcv.cntk:speechTrain=[ +MPI Rank 1: action = "train" +MPI Rank 1: modelPath = "/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/models/cntkSpeech.dnn" +MPI Rank 1: deviceId = 0 +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SimpleNetworkBuilder = [ +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 1: evalCriterion = "ErrorPrediction" +MPI Rank 1: layerTypes = "Sigmoid" +MPI Rank 1: initValueScale = 1.0 +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: uniformInit = true +MPI Rank 1: needPrior = true +MPI Rank 1: ] +MPI Rank 1: ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = 'CE' +MPI Rank 1: evalCriterion = 'Err' +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: L = Length(layerSizes)-1 // number of model layers +MPI Rank 1: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 1: featNorm = if applyMeanVarNorm +MPI Rank 1: then MeanVarNorm(features) +MPI Rank 1: else features +MPI Rank 1: layers[layer:1..L-1] = if layer > 1 +MPI Rank 1: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 1: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 1: CE = if trainingCriterion == 'CE' +MPI Rank 1: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 1: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 1: Err = if evalCriterion == 'Err' then +MPI Rank 1: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 1: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 1: logPrior = LogPrior(labels) +MPI Rank 1: // TODO: how to add a tag to an infix operation? 
+MPI Rank 1: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 1: ] +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize = 20480 +MPI Rank 1: minibatchSize = 64:256:1024 +MPI Rank 1: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 1: numMBsToShowResult = 10 +MPI Rank 1: momentumPerMB = 0.9:0.656119 +MPI Rank 1: dropoutRate = 0.0 +MPI Rank 1: maxEpochs = 3 +MPI Rank 1: keepCheckPointFiles = true +MPI Rank 1: clippingThresholdPerSample = 1#INF +MPI Rank 1: ParallelTrain = [ +MPI Rank 1: parallelizationMethod = "DataParallelSGD" +MPI Rank 1: distributedMBReading = true +MPI Rank 1: DataParallelSGD = [ +MPI Rank 1: gradientBits = 64 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: AutoAdjust = [ +MPI Rank 1: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 1: loadBestModel = true +MPI Rank 1: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 1: learnRateDecreaseFactor = 0.5 +MPI Rank 1: learnRateIncreaseFactor = 1.382 +MPI Rank 1: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: cvreader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" 
+MPI Rank 1: scpFile = "glob_0000.cv.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: cntkcv.cntk:stderr=/tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/stderr +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: Commands: speechTrain +MPI Rank 1: Precision = "double" +MPI Rank 1: Using 2 CPU threads. +MPI Rank 1: CNTKModelPath: /tmp/cntk-test-20160303172526.47992/Speech/DNN_ParallelCrossValidation@debug_gpu/models/cntkSpeech.dnn +MPI Rank 1: CNTKCommandTrainInfo: speechTrain : 3 +MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 1: +MPI Rank 1: ############################################################################## +MPI Rank 1: # # +MPI Rank 1: # Action "train" # +MPI Rank 1: # # +MPI Rank 1: ############################################################################## +MPI Rank 1: +MPI Rank 1: CNTKCommandTrainBegin: speechTrain +MPI Rank 1: SimpleNetworkBuilder Using GPU 0 +MPI Rank 1: reading script file glob_0000.scp ... 948 entries +MPI Rank 1: total 132 state names in state list /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list +MPI Rank 1: htkmlfreader: reading MLF file /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf ... 
total 948 entries +MPI Rank 1: ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances +MPI Rank 1: label set 0: 129 classes +MPI Rank 1: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames +MPI Rank 1: reading script file glob_0000.cv.scp ... 300 entries +MPI Rank 1: total 132 state names in state list /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/state.list +MPI Rank 1: htkmlfreader: reading MLF file /home/philly/jenkins/workspace/CNTK-Test-Linux-W2/Tests/EndToEndTests/Speech/Data/glob_0000.mlf ... total 948 entries +MPI Rank 1: ...........................................................................feature set 0: 83050 frames in 300 out of 300 utterances +MPI Rank 1: label set 0: 129 classes +MPI Rank 1: minibatchutterancesource: 300 utterances grouped into 1 chunks, av. chunk size: 300.0 utterances, 83050.0 frames +MPI Rank 1: SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==8 +MPI Rank 1: +MPI Rank 1: Post-processing network... 
+MPI Rank 1: +MPI Rank 1: 7 roots: +MPI Rank 1: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax +MPI Rank 1: EvalErrorPrediction = ErrorPrediction +MPI Rank 1: InvStdOfFeatures = InvStdDev +MPI Rank 1: MeanOfFeatures = Mean +MPI Rank 1: PosteriorProb = Softmax +MPI Rank 1: Prior = Mean +MPI Rank 1: ScaledLogLikelihood = Minus +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for CrossEntropyWithSoftmax CrossEntropyWithSoftmax operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for EvalErrorPrediction ErrorPrediction operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for InvStdOfFeatures InvStdDev operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for MeanOfFeatures Mean operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for PosteriorProb Softmax operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for Prior Mean operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for ScaledLogLikelihood Minus operation +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating network. 25 nodes to process in pass 1. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132 {1} x *] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}] +MPI Rank 1: Validating --> features = InputValue -> [363 {1} x *] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *] +MPI Rank 1: Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 1: 
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 1: Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 1: Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}] +MPI Rank 1: Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}] +MPI Rank 1: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *] +MPI Rank 1: +MPI Rank 1: Validating network. 17 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132 {1} x *] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}] +MPI Rank 1: Validating --> features = InputValue -> [363 {1} x *] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *] +MPI Rank 1: Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> 
[512 x 1 {1,512} x *] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 1: Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 1: Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}] +MPI Rank 1: Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}] +MPI Rank 1: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *] +MPI Rank 1: +MPI Rank 1: Validating network, final pass. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132 {1} x *] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}] +MPI Rank 1: Validating --> features = InputValue -> [363 {1} x *] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *] +MPI Rank 1: Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 1: 
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 1: Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 1: Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}] +MPI Rank 1: Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}] +MPI Rank 1: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *] +MPI Rank 1: +MPI Rank 1: 12 out of 25 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: Post-processing network complete. +MPI Rank 1: +MPI Rank 1: SGD using GPU 0. +MPI Rank 1: +MPI Rank 1: Training criterion node(s): +MPI Rank 1: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax +MPI Rank 1: +MPI Rank 1: Evaluation criterion node(s): +MPI Rank 1: EvalErrorPrediction = ErrorPrediction +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. +MPI Rank 1: +MPI Rank 1: Precomputing --> 3 PreCompute nodes found. +MPI Rank 1: +MPI Rank 1: NodeName: MeanOfFeatures +MPI Rank 1: NodeName: InvStdOfFeatures +MPI Rank 1: NodeName: Prior +MPI Rank 1: minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 1: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms +MPI Rank 1: +MPI Rank 1: Precomputing --> Completed. +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 momentum as time constant = 607.4 samples +MPI Rank 1: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 1 of 2, with 1 datapasses +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. 
+MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 1- 10, 3.12%]: SamplesSeen = 640; TrainLossPerSample = 4.40318406; EvalErr[0]PerSample = 0.90468750; TotalTime = 0.5044s; SamplesPerSecond = 1268.7 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 11- 20, 6.25%]: SamplesSeen = 640; TrainLossPerSample = 4.15980357; EvalErr[0]PerSample = 0.87187500; TotalTime = 0.5876s; SamplesPerSecond = 1089.3 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 21- 30, 9.38%]: SamplesSeen = 640; TrainLossPerSample = 3.98424210; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.5951s; SamplesPerSecond = 1075.5 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 31- 40, 12.50%]: SamplesSeen = 640; TrainLossPerSample = 3.86209050; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.4935s; SamplesPerSecond = 1296.9 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 41- 50, 15.62%]: SamplesSeen = 640; TrainLossPerSample = 3.80597620; EvalErr[0]PerSample = 0.88593750; TotalTime = 0.4594s; SamplesPerSecond = 1393.1 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 51- 60, 18.75%]: SamplesSeen = 640; TrainLossPerSample = 3.73511552; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.5922s; SamplesPerSecond = 1080.7 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 61- 70, 21.88%]: SamplesSeen = 640; TrainLossPerSample = 3.57260725; EvalErr[0]PerSample = 0.81875000; TotalTime = 0.5924s; SamplesPerSecond = 1080.3 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 71- 80, 25.00%]: SamplesSeen = 640; TrainLossPerSample = 3.42293687; EvalErr[0]PerSample = 0.80468750; TotalTime = 0.5982s; SamplesPerSecond = 1069.8 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 81- 90, 28.12%]: SamplesSeen = 640; TrainLossPerSample = 3.34304309; EvalErr[0]PerSample = 0.76718750; TotalTime = 0.4933s; SamplesPerSecond = 1297.3 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 91- 100, 31.25%]: SamplesSeen = 640; TrainLossPerSample = 3.37037793; EvalErr[0]PerSample = 0.84687500; TotalTime = 0.5630s; SamplesPerSecond = 1136.8 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 101- 110, 34.38%]: SamplesSeen = 640; TrainLossPerSample = 3.21606065; 
EvalErr[0]PerSample = 0.76093750; TotalTime = 0.5480s; SamplesPerSecond = 1167.8 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 111- 120, 37.50%]: SamplesSeen = 640; TrainLossPerSample = 3.31610118; EvalErr[0]PerSample = 0.78437500; TotalTime = 0.4741s; SamplesPerSecond = 1349.9 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 121- 130, 40.62%]: SamplesSeen = 640; TrainLossPerSample = 3.14285888; EvalErr[0]PerSample = 0.75000000; TotalTime = 0.5828s; SamplesPerSecond = 1098.1 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 131- 140, 43.75%]: SamplesSeen = 640; TrainLossPerSample = 3.01821991; EvalErr[0]PerSample = 0.70937500; TotalTime = 0.5491s; SamplesPerSecond = 1165.6 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 141- 150, 46.88%]: SamplesSeen = 640; TrainLossPerSample = 3.01218944; EvalErr[0]PerSample = 0.73906250; TotalTime = 0.6073s; SamplesPerSecond = 1053.8 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 151- 160, 50.00%]: SamplesSeen = 640; TrainLossPerSample = 2.98947652; EvalErr[0]PerSample = 0.73593750; TotalTime = 0.5492s; SamplesPerSecond = 1165.4 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 161- 170, 53.12%]: SamplesSeen = 640; TrainLossPerSample = 2.86297716; EvalErr[0]PerSample = 0.70000000; TotalTime = 0.4994s; SamplesPerSecond = 1281.5 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 171- 180, 56.25%]: SamplesSeen = 640; TrainLossPerSample = 2.71901077; EvalErr[0]PerSample = 0.68593750; TotalTime = 0.5388s; SamplesPerSecond = 1187.7 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 181- 190, 59.38%]: SamplesSeen = 640; TrainLossPerSample = 2.80860596; EvalErr[0]PerSample = 0.71250000; TotalTime = 0.6017s; SamplesPerSecond = 1063.7 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 191- 200, 62.50%]: SamplesSeen = 640; TrainLossPerSample = 2.60590434; EvalErr[0]PerSample = 0.64687500; TotalTime = 0.5662s; SamplesPerSecond = 1130.4 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 201- 210, 65.62%]: SamplesSeen = 640; TrainLossPerSample = 2.63920069; EvalErr[0]PerSample = 0.66875000; TotalTime = 0.5087s; SamplesPerSecond = 1258.0 +MPI 
Rank 1: Epoch[ 1 of 3]-Minibatch[ 211- 220, 68.75%]: SamplesSeen = 640; TrainLossPerSample = 2.58372597; EvalErr[0]PerSample = 0.65781250; TotalTime = 0.5937s; SamplesPerSecond = 1078.1 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 221- 230, 71.88%]: SamplesSeen = 640; TrainLossPerSample = 2.50997096; EvalErr[0]PerSample = 0.62031250; TotalTime = 0.5504s; SamplesPerSecond = 1162.9 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 231- 240, 75.00%]: SamplesSeen = 640; TrainLossPerSample = 2.42126950; EvalErr[0]PerSample = 0.62968750; TotalTime = 0.5068s; SamplesPerSecond = 1262.9 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 241- 250, 78.12%]: SamplesSeen = 640; TrainLossPerSample = 2.40125789; EvalErr[0]PerSample = 0.65156250; TotalTime = 0.5871s; SamplesPerSecond = 1090.0 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 251- 260, 81.25%]: SamplesSeen = 640; TrainLossPerSample = 2.47110816; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.5379s; SamplesPerSecond = 1189.7 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 261- 270, 84.38%]: SamplesSeen = 640; TrainLossPerSample = 2.33215267; EvalErr[0]PerSample = 0.60312500; TotalTime = 0.5621s; SamplesPerSecond = 1138.6 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 271- 280, 87.50%]: SamplesSeen = 640; TrainLossPerSample = 2.21936103; EvalErr[0]PerSample = 0.56875000; TotalTime = 0.5416s; SamplesPerSecond = 1181.6 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 281- 290, 90.62%]: SamplesSeen = 640; TrainLossPerSample = 2.31959580; EvalErr[0]PerSample = 0.61093750; TotalTime = 0.5001s; SamplesPerSecond = 1279.8 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 291- 300, 93.75%]: SamplesSeen = 640; TrainLossPerSample = 2.19592881; EvalErr[0]PerSample = 0.61718750; TotalTime = 0.5991s; SamplesPerSecond = 1068.2 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 301- 310, 96.88%]: SamplesSeen = 640; TrainLossPerSample = 2.28411654; EvalErr[0]PerSample = 0.60000000; TotalTime = 0.6028s; SamplesPerSecond = 1061.6 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 311- 320, 100.00%]: SamplesSeen = 640; 
TrainLossPerSample = 2.18307184; EvalErr[0]PerSample = 0.55781250; TotalTime = 0.4905s; SamplesPerSecond = 1304.7 +MPI Rank 1: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9972357; TotalSamplesSeen = 20480; EvalErrPerSample = 0.72426758; AvgLearningRatePerSample = 0.015625; EpochTime=17.6022 +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. +MPI Rank 1: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 1: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms +MPI Rank 1: Final Results: Minibatch[1-1298]: SamplesSeen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 2.1793731 Perplexity = 8.8407623 EvalErrorPrediction: ErrorPrediction/Sample = 0.58275738 +MPI Rank 1: Finished Epoch[ 1 of 3]: [Validation Set] TrainLossPerSample = 2.1793731; EvalErrPerSample = 0.58275738 +MPI Rank 1: +MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 momentum as time constant = 607.5 samples +MPI Rank 1: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 1 of 2, with 1 datapasses +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. 
+MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 1- 10, 12.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.04166118; EvalErr[0]PerSample = 0.54531250; TotalTime = 0.6771s; SamplesPerSecond = 3780.7 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 11- 20, 25.00%]: SamplesSeen = 2560; TrainLossPerSample = 2.01439158; EvalErr[0]PerSample = 0.54023438; TotalTime = 0.6139s; SamplesPerSecond = 4170.1 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 21- 30, 37.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.01068322; EvalErr[0]PerSample = 0.55039063; TotalTime = 0.7061s; SamplesPerSecond = 3625.7 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 31- 40, 50.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.95327476; EvalErr[0]PerSample = 0.53515625; TotalTime = 0.6724s; SamplesPerSecond = 3807.3 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 41- 50, 62.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.92813201; EvalErr[0]PerSample = 0.53867188; TotalTime = 0.6897s; SamplesPerSecond = 3712.0 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 51- 60, 75.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.99300635; EvalErr[0]PerSample = 0.53671875; TotalTime = 0.6607s; SamplesPerSecond = 3874.7 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 61- 70, 87.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.92264042; EvalErr[0]PerSample = 0.52304688; TotalTime = 0.7512s; SamplesPerSecond = 3407.8 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 71- 80, 100.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.92843715; EvalErr[0]PerSample = 0.53554687; TotalTime = 0.6351s; SamplesPerSecond = 4030.6 +MPI Rank 1: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9740283; TotalSamplesSeen = 40960; EvalErrPerSample = 0.53813477; AvgLearningRatePerSample = 0.001953125; EpochTime=5.44531 +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 1: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 1: Final Results: Minibatch[1-325]: SamplesSeen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 1.8879998 Perplexity = 6.6061417 EvalErrorPrediction: ErrorPrediction/Sample = 0.52534618 +MPI Rank 1: Finished Epoch[ 2 of 3]: [Validation Set] TrainLossPerSample = 1.8879998; EvalErrPerSample = 0.52534618 +MPI Rank 1: +MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000098 effective momentum = 0.656119 momentum as time constant = 2429.9 samples +MPI Rank 1: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 1 of 2, with 1 datapasses +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 1- 10, 50.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.89842932; EvalErr[0]PerSample = 0.52373047; TotalTime = 1.2510s; SamplesPerSecond = 8185.7 +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 11- 20, 100.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.89204376; EvalErr[0]PerSample = 0.52128906; TotalTime = 1.1043s; SamplesPerSecond = 9272.9 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8952365; TotalSamplesSeen = 61440; EvalErrPerSample = 0.52250977; AvgLearningRatePerSample = 9.7656251e-05; EpochTime=2.41353 +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 1: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 1: Final Results: Minibatch[1-82]: SamplesSeen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 1.8678214 Perplexity = 6.4741763 EvalErrorPrediction: ErrorPrediction/Sample = 0.51708609 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Validation Set] TrainLossPerSample = 1.8678214; EvalErrPerSample = 0.51708609 +MPI Rank 1: CNTKCommandTrainEnd: speechTrain +MPI Rank 1: +MPI Rank 1: Action "train" complete. +MPI Rank 1: +MPI Rank 1: COMPLETED +MPI Rank 1: ~MPIWrapper \ No newline at end of file diff --git a/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/baseline.windows.cpu.txt b/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/baseline.windows.cpu.txt new file mode 100644 index 000000000000..d7c6d81a9f88 --- /dev/null +++ b/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/baseline.windows.cpu.txt @@ -0,0 +1,1230 @@ +=== Running C:\Program Files\Microsoft MPI\Bin\/mpiexec.exe -n 2 D:\src\cntk\x64\debug\cntk.exe configFile=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation/cntkcv.cntk currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data RunDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation OutputDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu DeviceId=-1 numCPUThreads=20 stderr=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/stderr +MPIWrapper: initializing MPI +MPIWrapper: initializing MPI +ping [requestnodes (before change)]: 2 nodes pinging each other +ping [requestnodes (before change)]: 2 nodes pinging each other +ping [requestnodes (before change)]: all 2 nodes responded +ping [requestnodes (before change)]: all 
2 nodes responded +requestnodes [MPIWrapper]: using 2 out of 2 MPI nodes (2 requested); we (0) are in (participating) +requestnodes [MPIWrapper]: using 2 out of 2 MPI nodes (2 requested); we (1) are in (participating) +ping [requestnodes (after change)]: 2 nodes pinging each other +ping [requestnodes (after change)]: 2 nodes pinging each other +ping [requestnodes (after change)]: all 2 nodes responded +ping [requestnodes (after change)]: all 2 nodes responded +mpihelper: we are cog 0 in a gearbox of 2 +mpihelper: we are cog 1 in a gearbox of 2 +ping [mpihelper]: 2 nodes pinging each other +ping [mpihelper]: 2 nodes pinging each other +ping [mpihelper]: all 2 nodes responded +ping [mpihelper]: all 2 nodes responded +MPI Rank 0: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/stderr_speechTrain.logrank0 +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: Build info: +MPI Rank 0: +MPI Rank 0: Built time: Mar 1 2016 16:21:17 +MPI Rank 0: Last modified date: Fri Feb 26 14:22:38 2016 +MPI Rank 0: Build type: Debug +MPI Rank 0: Build target: GPU +MPI Rank 0: With 1bit-SGD: no +MPI Rank 0: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 0: CUB_PATH: c:\src\cub-1.4.1 +MPI Rank 0: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 0: Build Branch: weixi/pcv +MPI Rank 0: Build SHA1: 6cb3b9d86a12663b8b08404811e7d882815f2326 (modified) +MPI Rank 0: Built by weixi on GCRCN0509 +MPI Rank 0: Build Path: D:\src\cntk\Source\CNTK\ +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: running on GCRCN0509 at 2016/03/02 01:00:19 +MPI Rank 0: command line: +MPI Rank 0: D:\src\cntk\x64\debug\cntk.exe configFile=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation/cntkcv.cntk currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data 
RunDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation OutputDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu DeviceId=-1 numCPUThreads=20 stderr=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/stderr +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: precision = "double" +MPI Rank 0: command = speechTrain +MPI Rank 0: deviceId = $DeviceId$ +MPI Rank 0: parallelTrain = true +MPI Rank 0: speechTrain = [ +MPI Rank 0: action = "train" +MPI Rank 0: modelPath = "$RunDir$/models/cntkSpeech.dnn" +MPI Rank 0: deviceId = $DeviceId$ +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SimpleNetworkBuilder = [ +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 0: evalCriterion = "ErrorPrediction" +MPI Rank 0: layerTypes = "Sigmoid" +MPI Rank 0: initValueScale = 1.0 +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: uniformInit = true +MPI Rank 0: needPrior = true +MPI Rank 0: ] +MPI Rank 0: ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = 'CE' +MPI Rank 0: evalCriterion = 'Err' +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: L = Length(layerSizes)-1 // number of model layers +MPI Rank 0: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 0: featNorm = if applyMeanVarNorm +MPI Rank 0: then MeanVarNorm(features) +MPI Rank 0: else features +MPI Rank 0: layers[layer:1..L-1] = if layer > 1 +MPI Rank 0: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 0: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 0: CE = if trainingCriterion == 'CE' +MPI Rank 0: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 0: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 0: Err = if evalCriterion == 'Err' then +MPI Rank 0: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 0: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 0: logPrior = LogPrior(labels) +MPI Rank 0: // TODO: how to add a tag to an infix operation? 
+MPI Rank 0: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 0: ] +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize = 20480 +MPI Rank 0: minibatchSize = 64:256:1024 +MPI Rank 0: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 0: numMBsToShowResult = 10 +MPI Rank 0: momentumPerMB = 0.9:0.656119 +MPI Rank 0: dropoutRate = 0.0 +MPI Rank 0: maxEpochs = 3 +MPI Rank 0: keepCheckPointFiles = true +MPI Rank 0: clippingThresholdPerSample = 1#INF +MPI Rank 0: ParallelTrain = [ +MPI Rank 0: parallelizationMethod = "DataParallelSGD" +MPI Rank 0: distributedMBReading = true +MPI Rank 0: DataParallelSGD = [ +MPI Rank 0: gradientBits = 64 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: AutoAdjust = [ +MPI Rank 0: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 0: loadBestModel = true +MPI Rank 0: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 0: learnRateDecreaseFactor = 0.5 +MPI Rank 0: learnRateIncreaseFactor = 1.382 +MPI Rank 0: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "$DataDir$/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "$DataDir$/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: cvreader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.cv.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "$DataDir$/glob_0000.mlf" +MPI Rank 0: 
labelMappingFile = "$DataDir$/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu +MPI Rank 0: DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 0: ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation +MPI Rank 0: OutputDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu +MPI Rank 0: DeviceId=-1 +MPI Rank 0: numCPUThreads=20 +MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: precision = "double" +MPI Rank 0: command = speechTrain +MPI Rank 0: deviceId = -1 +MPI Rank 0: parallelTrain = true +MPI Rank 0: speechTrain = [ +MPI Rank 0: action = "train" +MPI Rank 0: modelPath = "C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/models/cntkSpeech.dnn" +MPI Rank 0: deviceId = -1 +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SimpleNetworkBuilder = [ +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 0: evalCriterion = "ErrorPrediction" +MPI Rank 0: layerTypes = "Sigmoid" +MPI Rank 0: initValueScale = 1.0 +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: uniformInit = true +MPI Rank 0: needPrior = true +MPI Rank 0: ] +MPI Rank 0: ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = 'CE' +MPI Rank 0: evalCriterion = 'Err' +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: L = Length(layerSizes)-1 // number of model layers +MPI Rank 0: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 0: featNorm = if applyMeanVarNorm +MPI Rank 0: then MeanVarNorm(features) +MPI Rank 0: else features +MPI Rank 0: layers[layer:1..L-1] = if layer > 1 +MPI Rank 0: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 0: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 0: CE = if trainingCriterion == 'CE' +MPI Rank 0: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 0: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 0: Err = if evalCriterion == 'Err' then +MPI Rank 0: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 0: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 0: logPrior = LogPrior(labels) +MPI Rank 0: // TODO: how to add a tag to an infix operation? 
+MPI Rank 0: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 0: ] +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize = 20480 +MPI Rank 0: minibatchSize = 64:256:1024 +MPI Rank 0: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 0: numMBsToShowResult = 10 +MPI Rank 0: momentumPerMB = 0.9:0.656119 +MPI Rank 0: dropoutRate = 0.0 +MPI Rank 0: maxEpochs = 3 +MPI Rank 0: keepCheckPointFiles = true +MPI Rank 0: clippingThresholdPerSample = 1#INF +MPI Rank 0: ParallelTrain = [ +MPI Rank 0: parallelizationMethod = "DataParallelSGD" +MPI Rank 0: distributedMBReading = true +MPI Rank 0: DataParallelSGD = [ +MPI Rank 0: gradientBits = 64 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: AutoAdjust = [ +MPI Rank 0: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 0: loadBestModel = true +MPI Rank 0: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 0: learnRateDecreaseFactor = 0.5 +MPI Rank 0: learnRateIncreaseFactor = 1.382 +MPI Rank 0: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: cvreader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.cv.scp" +MPI Rank 0: ] +MPI Rank 0: labels = 
[ +MPI Rank 0: mlfFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu +MPI Rank 0: DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 0: ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation +MPI Rank 0: OutputDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu +MPI Rank 0: DeviceId=-1 +MPI Rank 0: numCPUThreads=20 +MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: configparameters: cntkcv.cntk:command=speechTrain +MPI Rank 0: configparameters: cntkcv.cntk:ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation +MPI Rank 0: configparameters: cntkcv.cntk:currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 0: configparameters: cntkcv.cntk:DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 0: configparameters: cntkcv.cntk:deviceId=-1 +MPI Rank 0: configparameters: cntkcv.cntk:numCPUThreads=20 +MPI Rank 0: configparameters: cntkcv.cntk:OutputDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu +MPI Rank 0: configparameters: cntkcv.cntk:parallelTrain=true +MPI Rank 0: configparameters: cntkcv.cntk:precision=double +MPI Rank 0: configparameters: 
cntkcv.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu +MPI Rank 0: configparameters: cntkcv.cntk:speechTrain=[ +MPI Rank 0: action = "train" +MPI Rank 0: modelPath = "C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/models/cntkSpeech.dnn" +MPI Rank 0: deviceId = -1 +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SimpleNetworkBuilder = [ +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 0: evalCriterion = "ErrorPrediction" +MPI Rank 0: layerTypes = "Sigmoid" +MPI Rank 0: initValueScale = 1.0 +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: uniformInit = true +MPI Rank 0: needPrior = true +MPI Rank 0: ] +MPI Rank 0: ExperimentalNetworkBuilder = [ // the same as above but with BS. Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = 'CE' +MPI Rank 0: evalCriterion = 'Err' +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: L = Length(layerSizes)-1 // number of model layers +MPI Rank 0: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 0: featNorm = if applyMeanVarNorm +MPI Rank 0: then MeanVarNorm(features) +MPI Rank 0: else features +MPI Rank 0: layers[layer:1..L-1] = if layer > 1 +MPI Rank 0: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 0: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 0: CE = if trainingCriterion == 'CE' +MPI Rank 0: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 0: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 0: Err = if evalCriterion == 'Err' then +MPI Rank 0: 
ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 0: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 0: logPrior = LogPrior(labels) +MPI Rank 0: // TODO: how to add a tag to an infix operation? +MPI Rank 0: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 0: ] +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize = 20480 +MPI Rank 0: minibatchSize = 64:256:1024 +MPI Rank 0: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 0: numMBsToShowResult = 10 +MPI Rank 0: momentumPerMB = 0.9:0.656119 +MPI Rank 0: dropoutRate = 0.0 +MPI Rank 0: maxEpochs = 3 +MPI Rank 0: keepCheckPointFiles = true +MPI Rank 0: clippingThresholdPerSample = 1#INF +MPI Rank 0: ParallelTrain = [ +MPI Rank 0: parallelizationMethod = "DataParallelSGD" +MPI Rank 0: distributedMBReading = true +MPI Rank 0: DataParallelSGD = [ +MPI Rank 0: gradientBits = 64 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: AutoAdjust = [ +MPI Rank 0: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 0: loadBestModel = true +MPI Rank 0: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 0: learnRateDecreaseFactor = 0.5 +MPI Rank 0: learnRateIncreaseFactor = 1.382 +MPI Rank 0: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: cvreader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI 
Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.cv.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: cntkcv.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/stderr +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: command: speechTrain +MPI Rank 0: precision = double +MPI Rank 0: Using 20 CPU threads +MPI Rank 0: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/models/cntkSpeech.dnn +MPI Rank 0: CNTKCommandTrainInfo: speechTrain : 3 +MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 0: CNTKCommandTrainBegin: speechTrain +MPI Rank 0: SimpleNetworkBuilder Using CPU +MPI Rank 0: reading script file glob_0000.scp ... 948 entries +MPI Rank 0: total 132 state names in state list D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list +MPI Rank 0: htkmlfreader: reading MLF file D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf ... total 948 entries +MPI Rank 0: ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances +MPI Rank 0: label set 0: 129 classes +MPI Rank 0: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames +MPI Rank 0: reading script file glob_0000.cv.scp ... 
300 entries +MPI Rank 0: total 132 state names in state list D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list +MPI Rank 0: htkmlfreader: reading MLF file D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf ... total 948 entries +MPI Rank 0: ...........................................................................feature set 0: 83050 frames in 300 out of 300 utterances +MPI Rank 0: label set 0: 129 classes +MPI Rank 0: minibatchutterancesource: 300 utterances grouped into 1 chunks, av. chunk size: 300.0 utterances, 83050.0 frames +MPI Rank 0: +MPI Rank 0: Post-processing network... +MPI Rank 0: +MPI Rank 0: 7 roots: +MPI Rank 0: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax +MPI Rank 0: EvalErrorPrediction = ErrorPrediction +MPI Rank 0: InvStdOfFeatures = InvStdDev +MPI Rank 0: MeanOfFeatures = Mean +MPI Rank 0: PosteriorProb = Softmax +MPI Rank 0: Prior = Mean +MPI Rank 0: ScaledLogLikelihood = Minus +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for CrossEntropyWithSoftmax CrossEntropyWithSoftmax operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for EvalErrorPrediction ErrorPrediction operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for InvStdOfFeatures InvStdDev operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for MeanOfFeatures Mean operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for PosteriorProb Softmax operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for Prior Mean operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for ScaledLogLikelihood Minus operation +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating network. 25 nodes to process in pass 1. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132 {1} x *] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}] +MPI Rank 0: Validating --> features = InputValue -> [363 {1} x *] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *] +MPI Rank 0: Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 {1} x *] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512 {1} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 {1} x *] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132 {1} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 0: Validating --> EvalErrorPrediction 
= ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 0: Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 0: Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}] +MPI Rank 0: Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}] +MPI Rank 0: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *] +MPI Rank 0: +MPI Rank 0: Validating network. 17 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132 {1} x *] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}] +MPI Rank 0: Validating --> features = InputValue -> [363 {1} x *] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *] +MPI Rank 0: Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 {1} x *] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512 {1} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> H2 
= Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 {1} x *] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132 {1} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 0: Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 0: Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}] +MPI Rank 0: Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}] +MPI Rank 0: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *] +MPI Rank 0: +MPI Rank 0: Validating network, final pass. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132 {1} x *] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}] +MPI Rank 0: Validating --> features = InputValue -> [363 {1} x *] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *] +MPI Rank 0: Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 {1} x *] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512 {1} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 {1} x *] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132 {1} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 0: Validating --> EvalErrorPrediction 
= ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 0: Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 0: Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}] +MPI Rank 0: Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}] +MPI Rank 0: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *] +MPI Rank 0: +MPI Rank 0: 12 out of 25 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: Post-processing network complete. +MPI Rank 0: +MPI Rank 0: SGD using CPU. +MPI Rank 0: +MPI Rank 0: Training criterion node(s): +MPI Rank 0: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax +MPI Rank 0: +MPI Rank 0: Evaluation criterion node(s): +MPI Rank 0: EvalErrorPrediction = ErrorPrediction +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: +MPI Rank 0: Precomputing --> 3 PreCompute nodes found. +MPI Rank 0: +MPI Rank 0: NodeName: MeanOfFeatures +MPI Rank 0: NodeName: InvStdOfFeatures +MPI Rank 0: NodeName: Prior +MPI Rank 0: minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 0: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms +MPI Rank 0: +MPI Rank 0: Precomputing --> Completed. +MPI Rank 0: +MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 momentum as time constant = 607.4 samples +MPI Rank 0: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 2, with 1 datapasses +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. 
+MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 1- 10, 3.13%]: SamplesSeen = 640; TrainLossPerSample = 4.46944908; EvalErr[0]PerSample = 0.90781250; TotalTime = 2.0783s; SamplesPerSecond = 307.9 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 11- 20, 6.25%]: SamplesSeen = 640; TrainLossPerSample = 4.22299987; EvalErr[0]PerSample = 0.90156250; TotalTime = 2.0164s; SamplesPerSecond = 317.4 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 21- 30, 9.38%]: SamplesSeen = 640; TrainLossPerSample = 3.93971343; EvalErr[0]PerSample = 0.84687500; TotalTime = 2.0610s; SamplesPerSecond = 310.5 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 31- 40, 12.50%]: SamplesSeen = 640; TrainLossPerSample = 3.92341692; EvalErr[0]PerSample = 0.90468750; TotalTime = 2.0474s; SamplesPerSecond = 312.6 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 41- 50, 15.63%]: SamplesSeen = 640; TrainLossPerSample = 3.84074483; EvalErr[0]PerSample = 0.91093750; TotalTime = 2.1920s; SamplesPerSecond = 292.0 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 51- 60, 18.75%]: SamplesSeen = 640; TrainLossPerSample = 3.71252184; EvalErr[0]PerSample = 0.88437500; TotalTime = 2.2135s; SamplesPerSecond = 289.1 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 61- 70, 21.88%]: SamplesSeen = 640; TrainLossPerSample = 3.51563464; EvalErr[0]PerSample = 0.82500000; TotalTime = 1.9986s; SamplesPerSecond = 320.2 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 71- 80, 25.00%]: SamplesSeen = 640; TrainLossPerSample = 3.49349060; EvalErr[0]PerSample = 0.81093750; TotalTime = 2.1541s; SamplesPerSecond = 297.1 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 81- 90, 28.13%]: SamplesSeen = 640; TrainLossPerSample = 3.34740070; EvalErr[0]PerSample = 0.76562500; TotalTime = 2.0208s; SamplesPerSecond = 316.7 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 91- 100, 31.25%]: SamplesSeen = 640; TrainLossPerSample = 3.51960918; EvalErr[0]PerSample = 0.79843750; TotalTime = 2.1205s; SamplesPerSecond = 301.8 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 101- 110, 34.38%]: SamplesSeen = 640; TrainLossPerSample = 3.24656049; 
EvalErr[0]PerSample = 0.80312500; TotalTime = 1.9916s; SamplesPerSecond = 321.4 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 111- 120, 37.50%]: SamplesSeen = 640; TrainLossPerSample = 3.33397669; EvalErr[0]PerSample = 0.80000000; TotalTime = 2.0554s; SamplesPerSecond = 311.4 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 121- 130, 40.63%]: SamplesSeen = 640; TrainLossPerSample = 3.17780980; EvalErr[0]PerSample = 0.77031250; TotalTime = 1.9824s; SamplesPerSecond = 322.8 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 131- 140, 43.75%]: SamplesSeen = 640; TrainLossPerSample = 3.09845902; EvalErr[0]PerSample = 0.76875000; TotalTime = 2.0244s; SamplesPerSecond = 316.1 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 141- 150, 46.88%]: SamplesSeen = 640; TrainLossPerSample = 3.06458212; EvalErr[0]PerSample = 0.72968750; TotalTime = 1.9413s; SamplesPerSecond = 329.7 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 151- 160, 50.00%]: SamplesSeen = 640; TrainLossPerSample = 2.91633510; EvalErr[0]PerSample = 0.69531250; TotalTime = 2.1205s; SamplesPerSecond = 301.8 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 161- 170, 53.13%]: SamplesSeen = 640; TrainLossPerSample = 2.90607468; EvalErr[0]PerSample = 0.73281250; TotalTime = 2.0110s; SamplesPerSecond = 318.3 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 171- 180, 56.25%]: SamplesSeen = 640; TrainLossPerSample = 2.74095059; EvalErr[0]PerSample = 0.65937500; TotalTime = 2.1071s; SamplesPerSecond = 303.7 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 181- 190, 59.38%]: SamplesSeen = 640; TrainLossPerSample = 2.67087924; EvalErr[0]PerSample = 0.67343750; TotalTime = 2.1270s; SamplesPerSecond = 300.9 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 191- 200, 62.50%]: SamplesSeen = 640; TrainLossPerSample = 2.67609083; EvalErr[0]PerSample = 0.66406250; TotalTime = 2.0980s; SamplesPerSecond = 305.1 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 201- 210, 65.63%]: SamplesSeen = 640; TrainLossPerSample = 2.54732903; EvalErr[0]PerSample = 0.62968750; TotalTime = 1.9571s; SamplesPerSecond = 327.0 +MPI Rank 0: Epoch[ 
1 of 3]-Minibatch[ 211- 220, 68.75%]: SamplesSeen = 640; TrainLossPerSample = 2.61925710; EvalErr[0]PerSample = 0.67343750; TotalTime = 2.1494s; SamplesPerSecond = 297.8 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 221- 230, 71.88%]: SamplesSeen = 640; TrainLossPerSample = 2.52388480; EvalErr[0]PerSample = 0.65781250; TotalTime = 1.9985s; SamplesPerSecond = 320.2 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 231- 240, 75.00%]: SamplesSeen = 640; TrainLossPerSample = 2.47544601; EvalErr[0]PerSample = 0.63437500; TotalTime = 2.1009s; SamplesPerSecond = 304.6 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 241- 250, 78.13%]: SamplesSeen = 640; TrainLossPerSample = 2.43265158; EvalErr[0]PerSample = 0.61406250; TotalTime = 2.0339s; SamplesPerSecond = 314.7 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 251- 260, 81.25%]: SamplesSeen = 640; TrainLossPerSample = 2.41728740; EvalErr[0]PerSample = 0.63125000; TotalTime = 2.0284s; SamplesPerSecond = 315.5 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 261- 270, 84.38%]: SamplesSeen = 640; TrainLossPerSample = 2.17674793; EvalErr[0]PerSample = 0.57812500; TotalTime = 2.0335s; SamplesPerSecond = 314.7 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 271- 280, 87.50%]: SamplesSeen = 640; TrainLossPerSample = 2.31020940; EvalErr[0]PerSample = 0.64062500; TotalTime = 2.0310s; SamplesPerSecond = 315.1 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 281- 290, 90.63%]: SamplesSeen = 640; TrainLossPerSample = 2.26400612; EvalErr[0]PerSample = 0.61093750; TotalTime = 1.9187s; SamplesPerSecond = 333.6 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 291- 300, 93.75%]: SamplesSeen = 640; TrainLossPerSample = 2.15885172; EvalErr[0]PerSample = 0.58281250; TotalTime = 2.0545s; SamplesPerSecond = 311.5 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 301- 310, 96.88%]: SamplesSeen = 640; TrainLossPerSample = 2.22712855; EvalErr[0]PerSample = 0.59218750; TotalTime = 2.0151s; SamplesPerSecond = 317.6 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 311- 320, 100.00%]: SamplesSeen = 640; TrainLossPerSample = 2.25604782; 
EvalErr[0]PerSample = 0.60625000; TotalTime = 1.9242s; SamplesPerSecond = 332.6 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0070483; EvalErrPerSample = 0.72827148; AvgLearningRatePerSample = 0.015625; EpochTime=65.6342 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 0: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms +MPI Rank 0: Final Results: Minibatch[1-1298]: Samples Seen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 2.1113987 Perplexity = 8.2597865 EvalErrorPrediction: ErrorPrediction/Sample = 0.57013847 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Validation Set] TrainLossPerSample = 2.1113987; EvalErrPerSample = 0.57013847 +MPI Rank 0: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/models/cntkSpeech.dnn.1' +MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 momentum as time constant = 607.5 samples +MPI Rank 0: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 0 of 2, with 1 datapasses +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. 
+MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 1- 10, 12.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.10257724; EvalErr[0]PerSample = 0.56484375; TotalTime = 2.8064s; SamplesPerSecond = 912.2 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 11- 20, 25.00%]: SamplesSeen = 2560; TrainLossPerSample = 2.00548829; EvalErr[0]PerSample = 0.54843750; TotalTime = 2.7994s; SamplesPerSecond = 914.5 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 21- 30, 37.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.00767228; EvalErr[0]PerSample = 0.54960937; TotalTime = 2.8698s; SamplesPerSecond = 892.1 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 31- 40, 50.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.92049656; EvalErr[0]PerSample = 0.53281250; TotalTime = 3.4712s; SamplesPerSecond = 737.5 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 41- 50, 62.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.90178602; EvalErr[0]PerSample = 0.52265625; TotalTime = 3.1235s; SamplesPerSecond = 819.6 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 51- 60, 75.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.91359666; EvalErr[0]PerSample = 0.53984375; TotalTime = 3.0616s; SamplesPerSecond = 836.2 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 61- 70, 87.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.91765335; EvalErr[0]PerSample = 0.53125000; TotalTime = 3.1389s; SamplesPerSecond = 815.6 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 71- 80, 100.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.87683041; EvalErr[0]PerSample = 0.52890625; TotalTime = 3.2514s; SamplesPerSecond = 787.4 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9557626; EvalErrPerSample = 0.53979492; AvgLearningRatePerSample = 0.001953125; EpochTime=24.6225 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 0: Final Results: Minibatch[1-325]: Samples Seen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 1.8746872 Perplexity = 6.51878 EvalErrorPrediction: ErrorPrediction/Sample = 0.52056592 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Validation Set] TrainLossPerSample = 1.8746872; EvalErrPerSample = 0.52056592 +MPI Rank 0: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/models/cntkSpeech.dnn.2' +MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000098 effective momentum = 0.656119 momentum as time constant = 2429.9 samples +MPI Rank 0: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 2, with 1 datapasses +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 1- 10, 50.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.88594165; EvalErr[0]PerSample = 0.52529297; TotalTime = 5.5587s; SamplesPerSecond = 1842.1 +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 11- 20, 100.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.89384783; EvalErr[0]PerSample = 0.51816406; TotalTime = 4.9924s; SamplesPerSecond = 2051.1 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8898947; EvalErrPerSample = 0.52172852; AvgLearningRatePerSample = 9.7656251e-005; EpochTime=10.6367 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 0: Final Results: Minibatch[1-82]: Samples Seen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 1.8521576 Perplexity = 6.3735562 EvalErrorPrediction: ErrorPrediction/Sample = 0.50912703 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Validation Set] TrainLossPerSample = 1.8521576; EvalErrPerSample = 0.50912703 +MPI Rank 0: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/models/cntkSpeech.dnn' +MPI Rank 0: CNTKCommandTrainEnd: speechTrain +MPI Rank 0: COMPLETED +MPI Rank 0: ~MPIWrapper +MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/stderr_speechTrain.logrank1 +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: Build info: +MPI Rank 1: +MPI Rank 1: Built time: Mar 1 2016 16:21:17 +MPI Rank 1: Last modified date: Fri Feb 26 14:22:38 2016 +MPI Rank 1: Build type: Debug +MPI Rank 1: Build target: GPU +MPI Rank 1: With 1bit-SGD: no +MPI Rank 1: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 1: CUB_PATH: c:\src\cub-1.4.1 +MPI Rank 1: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 1: Build Branch: weixi/pcv +MPI Rank 1: Build SHA1: 6cb3b9d86a12663b8b08404811e7d882815f2326 (modified) +MPI Rank 1: Built by weixi on GCRCN0509 +MPI Rank 1: Build Path: D:\src\cntk\Source\CNTK\ +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: running on GCRCN0509 at 2016/03/02 01:00:20 +MPI Rank 1: command line: +MPI Rank 1: D:\src\cntk\x64\debug\cntk.exe configFile=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation/cntkcv.cntk currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data 
RunDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation OutputDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu DeviceId=-1 numCPUThreads=20 stderr=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/stderr +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: precision = "double" +MPI Rank 1: command = speechTrain +MPI Rank 1: deviceId = $DeviceId$ +MPI Rank 1: parallelTrain = true +MPI Rank 1: speechTrain = [ +MPI Rank 1: action = "train" +MPI Rank 1: modelPath = "$RunDir$/models/cntkSpeech.dnn" +MPI Rank 1: deviceId = $DeviceId$ +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SimpleNetworkBuilder = [ +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 1: evalCriterion = "ErrorPrediction" +MPI Rank 1: layerTypes = "Sigmoid" +MPI Rank 1: initValueScale = 1.0 +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: uniformInit = true +MPI Rank 1: needPrior = true +MPI Rank 1: ] +MPI Rank 1: ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = 'CE' +MPI Rank 1: evalCriterion = 'Err' +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: L = Length(layerSizes)-1 // number of model layers +MPI Rank 1: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 1: featNorm = if applyMeanVarNorm +MPI Rank 1: then MeanVarNorm(features) +MPI Rank 1: else features +MPI Rank 1: layers[layer:1..L-1] = if layer > 1 +MPI Rank 1: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 1: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 1: CE = if trainingCriterion == 'CE' +MPI Rank 1: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 1: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 1: Err = if evalCriterion == 'Err' then +MPI Rank 1: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 1: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 1: logPrior = LogPrior(labels) +MPI Rank 1: // TODO: how to add a tag to an infix operation? 
+MPI Rank 1: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 1: ] +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize = 20480 +MPI Rank 1: minibatchSize = 64:256:1024 +MPI Rank 1: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 1: numMBsToShowResult = 10 +MPI Rank 1: momentumPerMB = 0.9:0.656119 +MPI Rank 1: dropoutRate = 0.0 +MPI Rank 1: maxEpochs = 3 +MPI Rank 1: keepCheckPointFiles = true +MPI Rank 1: clippingThresholdPerSample = 1#INF +MPI Rank 1: ParallelTrain = [ +MPI Rank 1: parallelizationMethod = "DataParallelSGD" +MPI Rank 1: distributedMBReading = true +MPI Rank 1: DataParallelSGD = [ +MPI Rank 1: gradientBits = 64 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: AutoAdjust = [ +MPI Rank 1: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 1: loadBestModel = true +MPI Rank 1: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 1: learnRateDecreaseFactor = 0.5 +MPI Rank 1: learnRateIncreaseFactor = 1.382 +MPI Rank 1: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "$DataDir$/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "$DataDir$/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: cvreader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.cv.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "$DataDir$/glob_0000.mlf" +MPI Rank 1: 
labelMappingFile = "$DataDir$/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu +MPI Rank 1: DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 1: ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation +MPI Rank 1: OutputDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu +MPI Rank 1: DeviceId=-1 +MPI Rank 1: numCPUThreads=20 +MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: precision = "double" +MPI Rank 1: command = speechTrain +MPI Rank 1: deviceId = -1 +MPI Rank 1: parallelTrain = true +MPI Rank 1: speechTrain = [ +MPI Rank 1: action = "train" +MPI Rank 1: modelPath = "C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/models/cntkSpeech.dnn" +MPI Rank 1: deviceId = -1 +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SimpleNetworkBuilder = [ +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 1: evalCriterion = "ErrorPrediction" +MPI Rank 1: layerTypes = "Sigmoid" +MPI Rank 1: initValueScale = 1.0 +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: uniformInit = true +MPI Rank 1: needPrior = true +MPI Rank 1: ] +MPI Rank 1: ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = 'CE' +MPI Rank 1: evalCriterion = 'Err' +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: L = Length(layerSizes)-1 // number of model layers +MPI Rank 1: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 1: featNorm = if applyMeanVarNorm +MPI Rank 1: then MeanVarNorm(features) +MPI Rank 1: else features +MPI Rank 1: layers[layer:1..L-1] = if layer > 1 +MPI Rank 1: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 1: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 1: CE = if trainingCriterion == 'CE' +MPI Rank 1: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 1: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 1: Err = if evalCriterion == 'Err' then +MPI Rank 1: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 1: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 1: logPrior = LogPrior(labels) +MPI Rank 1: // TODO: how to add a tag to an infix operation? 
+MPI Rank 1: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 1: ] +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize = 20480 +MPI Rank 1: minibatchSize = 64:256:1024 +MPI Rank 1: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 1: numMBsToShowResult = 10 +MPI Rank 1: momentumPerMB = 0.9:0.656119 +MPI Rank 1: dropoutRate = 0.0 +MPI Rank 1: maxEpochs = 3 +MPI Rank 1: keepCheckPointFiles = true +MPI Rank 1: clippingThresholdPerSample = 1#INF +MPI Rank 1: ParallelTrain = [ +MPI Rank 1: parallelizationMethod = "DataParallelSGD" +MPI Rank 1: distributedMBReading = true +MPI Rank 1: DataParallelSGD = [ +MPI Rank 1: gradientBits = 64 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: AutoAdjust = [ +MPI Rank 1: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 1: loadBestModel = true +MPI Rank 1: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 1: learnRateDecreaseFactor = 0.5 +MPI Rank 1: learnRateIncreaseFactor = 1.382 +MPI Rank 1: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: cvreader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.cv.scp" +MPI Rank 1: ] +MPI Rank 1: labels = 
[ +MPI Rank 1: mlfFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu +MPI Rank 1: DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 1: ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation +MPI Rank 1: OutputDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu +MPI Rank 1: DeviceId=-1 +MPI Rank 1: numCPUThreads=20 +MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: configparameters: cntkcv.cntk:command=speechTrain +MPI Rank 1: configparameters: cntkcv.cntk:ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation +MPI Rank 1: configparameters: cntkcv.cntk:currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 1: configparameters: cntkcv.cntk:DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 1: configparameters: cntkcv.cntk:deviceId=-1 +MPI Rank 1: configparameters: cntkcv.cntk:numCPUThreads=20 +MPI Rank 1: configparameters: cntkcv.cntk:OutputDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu +MPI Rank 1: configparameters: cntkcv.cntk:parallelTrain=true +MPI Rank 1: configparameters: cntkcv.cntk:precision=double +MPI Rank 1: configparameters: 
cntkcv.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu +MPI Rank 1: configparameters: cntkcv.cntk:speechTrain=[ +MPI Rank 1: action = "train" +MPI Rank 1: modelPath = "C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/models/cntkSpeech.dnn" +MPI Rank 1: deviceId = -1 +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SimpleNetworkBuilder = [ +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 1: evalCriterion = "ErrorPrediction" +MPI Rank 1: layerTypes = "Sigmoid" +MPI Rank 1: initValueScale = 1.0 +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: uniformInit = true +MPI Rank 1: needPrior = true +MPI Rank 1: ] +MPI Rank 1: ExperimentalNetworkBuilder = [ // the same as above but with BS. Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = 'CE' +MPI Rank 1: evalCriterion = 'Err' +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: L = Length(layerSizes)-1 // number of model layers +MPI Rank 1: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 1: featNorm = if applyMeanVarNorm +MPI Rank 1: then MeanVarNorm(features) +MPI Rank 1: else features +MPI Rank 1: layers[layer:1..L-1] = if layer > 1 +MPI Rank 1: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 1: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 1: CE = if trainingCriterion == 'CE' +MPI Rank 1: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 1: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 1: Err = if evalCriterion == 'Err' then +MPI Rank 1: 
ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 1: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 1: logPrior = LogPrior(labels) +MPI Rank 1: // TODO: how to add a tag to an infix operation? +MPI Rank 1: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 1: ] +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize = 20480 +MPI Rank 1: minibatchSize = 64:256:1024 +MPI Rank 1: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 1: numMBsToShowResult = 10 +MPI Rank 1: momentumPerMB = 0.9:0.656119 +MPI Rank 1: dropoutRate = 0.0 +MPI Rank 1: maxEpochs = 3 +MPI Rank 1: keepCheckPointFiles = true +MPI Rank 1: clippingThresholdPerSample = 1#INF +MPI Rank 1: ParallelTrain = [ +MPI Rank 1: parallelizationMethod = "DataParallelSGD" +MPI Rank 1: distributedMBReading = true +MPI Rank 1: DataParallelSGD = [ +MPI Rank 1: gradientBits = 64 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: AutoAdjust = [ +MPI Rank 1: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 1: loadBestModel = true +MPI Rank 1: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 1: learnRateDecreaseFactor = 0.5 +MPI Rank 1: learnRateIncreaseFactor = 1.382 +MPI Rank 1: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: cvreader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI 
Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.cv.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: cntkcv.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/stderr +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: command: speechTrain +MPI Rank 1: precision = double +MPI Rank 1: Using 20 CPU threads +MPI Rank 1: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160301170019.423861\Speech\DNN_ParallelCrossValidation@debug_cpu/models/cntkSpeech.dnn +MPI Rank 1: CNTKCommandTrainInfo: speechTrain : 3 +MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 1: CNTKCommandTrainBegin: speechTrain +MPI Rank 1: SimpleNetworkBuilder Using CPU +MPI Rank 1: reading script file glob_0000.scp ... 948 entries +MPI Rank 1: total 132 state names in state list D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list +MPI Rank 1: htkmlfreader: reading MLF file D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf ... total 948 entries +MPI Rank 1: ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances +MPI Rank 1: label set 0: 129 classes +MPI Rank 1: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames +MPI Rank 1: reading script file glob_0000.cv.scp ... 
300 entries +MPI Rank 1: total 132 state names in state list D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list +MPI Rank 1: htkmlfreader: reading MLF file D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf ... total 948 entries +MPI Rank 1: ...........................................................................feature set 0: 83050 frames in 300 out of 300 utterances +MPI Rank 1: label set 0: 129 classes +MPI Rank 1: minibatchutterancesource: 300 utterances grouped into 1 chunks, av. chunk size: 300.0 utterances, 83050.0 frames +MPI Rank 1: +MPI Rank 1: Post-processing network... +MPI Rank 1: +MPI Rank 1: 7 roots: +MPI Rank 1: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax +MPI Rank 1: EvalErrorPrediction = ErrorPrediction +MPI Rank 1: InvStdOfFeatures = InvStdDev +MPI Rank 1: MeanOfFeatures = Mean +MPI Rank 1: PosteriorProb = Softmax +MPI Rank 1: Prior = Mean +MPI Rank 1: ScaledLogLikelihood = Minus +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for CrossEntropyWithSoftmax CrossEntropyWithSoftmax operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for EvalErrorPrediction ErrorPrediction operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for InvStdOfFeatures InvStdDev operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for MeanOfFeatures Mean operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for PosteriorProb Softmax operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for Prior Mean operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for ScaledLogLikelihood Minus operation +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating network. 25 nodes to process in pass 1. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132 {1} x *] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}] +MPI Rank 1: Validating --> features = InputValue -> [363 {1} x *] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *] +MPI Rank 1: Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 {1} x *] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512 {1} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 {1} x *] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132 {1} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 1: Validating --> EvalErrorPrediction 
= ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 1: Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 1: Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}] +MPI Rank 1: Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}] +MPI Rank 1: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *] +MPI Rank 1: +MPI Rank 1: Validating network. 17 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132 {1} x *] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}] +MPI Rank 1: Validating --> features = InputValue -> [363 {1} x *] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *] +MPI Rank 1: Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 {1} x *] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512 {1} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> H2 
= Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 {1} x *] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132 {1} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 1: Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 1: Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}] +MPI Rank 1: Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}] +MPI Rank 1: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *] +MPI Rank 1: +MPI Rank 1: Validating network, final pass. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132 {1} x *] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}] +MPI Rank 1: Validating --> features = InputValue -> [363 {1} x *] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *] +MPI Rank 1: Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 {1} x *] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512 {1} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 {1} x *] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132 {1} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 1: Validating --> EvalErrorPrediction 
= ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}] +MPI Rank 1: Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *] +MPI Rank 1: Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}] +MPI Rank 1: Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}] +MPI Rank 1: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *] +MPI Rank 1: +MPI Rank 1: 12 out of 25 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: Post-processing network complete. +MPI Rank 1: +MPI Rank 1: SGD using CPU. +MPI Rank 1: +MPI Rank 1: Training criterion node(s): +MPI Rank 1: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax +MPI Rank 1: +MPI Rank 1: Evaluation criterion node(s): +MPI Rank 1: EvalErrorPrediction = ErrorPrediction +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. +MPI Rank 1: +MPI Rank 1: Precomputing --> 3 PreCompute nodes found. +MPI Rank 1: +MPI Rank 1: NodeName: MeanOfFeatures +MPI Rank 1: NodeName: InvStdOfFeatures +MPI Rank 1: NodeName: Prior +MPI Rank 1: minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 1: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms +MPI Rank 1: +MPI Rank 1: Precomputing --> Completed. +MPI Rank 1: +MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 momentum as time constant = 607.4 samples +MPI Rank 1: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 1 of 2, with 1 datapasses +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. 
+MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 1- 10, 3.13%]: SamplesSeen = 640; TrainLossPerSample = 4.46944908; EvalErr[0]PerSample = 0.90781250; TotalTime = 2.0820s; SamplesPerSecond = 307.4 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 11- 20, 6.25%]: SamplesSeen = 640; TrainLossPerSample = 4.22299987; EvalErr[0]PerSample = 0.90156250; TotalTime = 2.0180s; SamplesPerSecond = 317.1 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 21- 30, 9.38%]: SamplesSeen = 640; TrainLossPerSample = 3.93971343; EvalErr[0]PerSample = 0.84687500; TotalTime = 2.0134s; SamplesPerSecond = 317.9 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 31- 40, 12.50%]: SamplesSeen = 640; TrainLossPerSample = 3.92341692; EvalErr[0]PerSample = 0.90468750; TotalTime = 2.0932s; SamplesPerSecond = 305.8 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 41- 50, 15.63%]: SamplesSeen = 640; TrainLossPerSample = 3.84074483; EvalErr[0]PerSample = 0.91093750; TotalTime = 2.1920s; SamplesPerSecond = 292.0 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 51- 60, 18.75%]: SamplesSeen = 640; TrainLossPerSample = 3.71252184; EvalErr[0]PerSample = 0.88437500; TotalTime = 2.2148s; SamplesPerSecond = 289.0 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 61- 70, 21.88%]: SamplesSeen = 640; TrainLossPerSample = 3.51563464; EvalErr[0]PerSample = 0.82500000; TotalTime = 1.9991s; SamplesPerSecond = 320.1 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 71- 80, 25.00%]: SamplesSeen = 640; TrainLossPerSample = 3.49349060; EvalErr[0]PerSample = 0.81093750; TotalTime = 2.1040s; SamplesPerSecond = 304.2 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 81- 90, 28.13%]: SamplesSeen = 640; TrainLossPerSample = 3.34740070; EvalErr[0]PerSample = 0.76562500; TotalTime = 2.0671s; SamplesPerSecond = 309.6 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 91- 100, 31.25%]: SamplesSeen = 640; TrainLossPerSample = 3.51960918; EvalErr[0]PerSample = 0.79843750; TotalTime = 2.1207s; SamplesPerSecond = 301.8 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 101- 110, 34.38%]: SamplesSeen = 640; TrainLossPerSample = 3.24656049; 
EvalErr[0]PerSample = 0.80312500; TotalTime = 1.9930s; SamplesPerSecond = 321.1 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 111- 120, 37.50%]: SamplesSeen = 640; TrainLossPerSample = 3.33397669; EvalErr[0]PerSample = 0.80000000; TotalTime = 2.0082s; SamplesPerSecond = 318.7 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 121- 130, 40.63%]: SamplesSeen = 640; TrainLossPerSample = 3.17780980; EvalErr[0]PerSample = 0.77031250; TotalTime = 2.0292s; SamplesPerSecond = 315.4 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 131- 140, 43.75%]: SamplesSeen = 640; TrainLossPerSample = 3.09845902; EvalErr[0]PerSample = 0.76875000; TotalTime = 2.0179s; SamplesPerSecond = 317.2 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 141- 150, 46.88%]: SamplesSeen = 640; TrainLossPerSample = 3.06458212; EvalErr[0]PerSample = 0.72968750; TotalTime = 1.9476s; SamplesPerSecond = 328.6 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 151- 160, 50.00%]: SamplesSeen = 640; TrainLossPerSample = 2.91633510; EvalErr[0]PerSample = 0.69531250; TotalTime = 2.1225s; SamplesPerSecond = 301.5 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 161- 170, 53.13%]: SamplesSeen = 640; TrainLossPerSample = 2.90607468; EvalErr[0]PerSample = 0.73281250; TotalTime = 2.0102s; SamplesPerSecond = 318.4 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 171- 180, 56.25%]: SamplesSeen = 640; TrainLossPerSample = 2.74095059; EvalErr[0]PerSample = 0.65937500; TotalTime = 2.1096s; SamplesPerSecond = 303.4 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 181- 190, 59.38%]: SamplesSeen = 640; TrainLossPerSample = 2.67087924; EvalErr[0]PerSample = 0.67343750; TotalTime = 2.0311s; SamplesPerSecond = 315.1 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 191- 200, 62.50%]: SamplesSeen = 640; TrainLossPerSample = 2.67609083; EvalErr[0]PerSample = 0.66406250; TotalTime = 2.1622s; SamplesPerSecond = 296.0 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 201- 210, 65.63%]: SamplesSeen = 640; TrainLossPerSample = 2.54732903; EvalErr[0]PerSample = 0.62968750; TotalTime = 1.9857s; SamplesPerSecond = 322.3 +MPI Rank 1: Epoch[ 
1 of 3]-Minibatch[ 211- 220, 68.75%]: SamplesSeen = 640; TrainLossPerSample = 2.61925710; EvalErr[0]PerSample = 0.67343750; TotalTime = 2.1501s; SamplesPerSecond = 297.7 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 221- 230, 71.88%]: SamplesSeen = 640; TrainLossPerSample = 2.52388480; EvalErr[0]PerSample = 0.65781250; TotalTime = 1.9986s; SamplesPerSecond = 320.2 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 231- 240, 75.00%]: SamplesSeen = 640; TrainLossPerSample = 2.47544601; EvalErr[0]PerSample = 0.63437500; TotalTime = 2.0993s; SamplesPerSecond = 304.9 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 241- 250, 78.13%]: SamplesSeen = 640; TrainLossPerSample = 2.43265158; EvalErr[0]PerSample = 0.61406250; TotalTime = 2.0346s; SamplesPerSecond = 314.6 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 251- 260, 81.25%]: SamplesSeen = 640; TrainLossPerSample = 2.41728740; EvalErr[0]PerSample = 0.63125000; TotalTime = 2.0284s; SamplesPerSecond = 315.5 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 261- 270, 84.38%]: SamplesSeen = 640; TrainLossPerSample = 2.17674793; EvalErr[0]PerSample = 0.57812500; TotalTime = 2.0353s; SamplesPerSecond = 314.5 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 271- 280, 87.50%]: SamplesSeen = 640; TrainLossPerSample = 2.31020940; EvalErr[0]PerSample = 0.64062500; TotalTime = 2.0306s; SamplesPerSecond = 315.2 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 281- 290, 90.63%]: SamplesSeen = 640; TrainLossPerSample = 2.26400612; EvalErr[0]PerSample = 0.61093750; TotalTime = 1.9175s; SamplesPerSecond = 333.8 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 291- 300, 93.75%]: SamplesSeen = 640; TrainLossPerSample = 2.15885172; EvalErr[0]PerSample = 0.58281250; TotalTime = 2.0554s; SamplesPerSecond = 311.4 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 301- 310, 96.88%]: SamplesSeen = 640; TrainLossPerSample = 2.22712855; EvalErr[0]PerSample = 0.59218750; TotalTime = 2.0182s; SamplesPerSecond = 317.1 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 311- 320, 100.00%]: SamplesSeen = 640; TrainLossPerSample = 2.25604782; 
EvalErr[0]PerSample = 0.60625000; TotalTime = 1.9190s; SamplesPerSecond = 333.5 +MPI Rank 1: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0070483; EvalErrPerSample = 0.72827148; AvgLearningRatePerSample = 0.015625; EpochTime=65.6342 +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. +MPI Rank 1: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 1: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms +MPI Rank 1: Final Results: Minibatch[1-1298]: Samples Seen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 2.1113987 Perplexity = 8.2597865 EvalErrorPrediction: ErrorPrediction/Sample = 0.57013847 +MPI Rank 1: Finished Epoch[ 1 of 3]: [Validation Set] TrainLossPerSample = 2.1113987; EvalErrPerSample = 0.57013847 +MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 momentum as time constant = 607.5 samples +MPI Rank 1: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 1 of 2, with 1 datapasses +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. 
+MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 1- 10, 12.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.10257724; EvalErr[0]PerSample = 0.56484375; TotalTime = 2.8734s; SamplesPerSecond = 890.9 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 11- 20, 25.00%]: SamplesSeen = 2560; TrainLossPerSample = 2.00548829; EvalErr[0]PerSample = 0.54843750; TotalTime = 2.7342s; SamplesPerSecond = 936.3 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 21- 30, 37.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.00767228; EvalErr[0]PerSample = 0.54960937; TotalTime = 2.8828s; SamplesPerSecond = 888.0 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 31- 40, 50.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.92049656; EvalErr[0]PerSample = 0.53281250; TotalTime = 3.5518s; SamplesPerSecond = 720.8 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 41- 50, 62.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.90178602; EvalErr[0]PerSample = 0.52265625; TotalTime = 3.1333s; SamplesPerSecond = 817.0 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 51- 60, 75.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.91359666; EvalErr[0]PerSample = 0.53984375; TotalTime = 3.0720s; SamplesPerSecond = 833.3 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 61- 70, 87.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.91765335; EvalErr[0]PerSample = 0.53125000; TotalTime = 3.1473s; SamplesPerSecond = 813.4 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 71- 80, 100.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.87683041; EvalErr[0]PerSample = 0.52890625; TotalTime = 3.2052s; SamplesPerSecond = 798.7 +MPI Rank 1: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9557626; EvalErrPerSample = 0.53979492; AvgLearningRatePerSample = 0.001953125; EpochTime=24.6225 +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 1: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 1: Final Results: Minibatch[1-325]: Samples Seen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 1.8746872 Perplexity = 6.51878 EvalErrorPrediction: ErrorPrediction/Sample = 0.52056592 +MPI Rank 1: Finished Epoch[ 2 of 3]: [Validation Set] TrainLossPerSample = 1.8746872; EvalErrPerSample = 0.52056592 +MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000098 effective momentum = 0.656119 momentum as time constant = 2429.9 samples +MPI Rank 1: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 1 of 2, with 1 datapasses +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 1- 10, 50.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.88594165; EvalErr[0]PerSample = 0.52529297; TotalTime = 5.6045s; SamplesPerSecond = 1827.1 +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 11- 20, 100.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.89384783; EvalErr[0]PerSample = 0.51816406; TotalTime = 4.9764s; SamplesPerSecond = 2057.7 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8898947; EvalErrPerSample = 0.52172852; AvgLearningRatePerSample = 9.7656251e-005; EpochTime=10.6367 +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 1: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 1: Final Results: Minibatch[1-82]: Samples Seen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 1.8521576 Perplexity = 6.3735562 EvalErrorPrediction: ErrorPrediction/Sample = 0.50912703 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Validation Set] TrainLossPerSample = 1.8521576; EvalErrPerSample = 0.50912703 +MPI Rank 1: CNTKCommandTrainEnd: speechTrain +MPI Rank 1: COMPLETED +MPI Rank 1: ~MPIWrapper diff --git a/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/baseline.windows.gpu.txt b/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/baseline.windows.gpu.txt new file mode 100644 index 000000000000..c46823ec4452 --- /dev/null +++ b/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/baseline.windows.gpu.txt @@ -0,0 +1,1232 @@ +=== Running C:\Program Files\Microsoft MPI\Bin\/mpiexec.exe -n 2 D:\src\cntk\x64\release\cntk.exe configFile=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation/cntkcv.cntk currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data RunDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation OutputDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu DeviceId=0 numCPUThreads=20 stderr=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/stderr +MPIWrapper: initializing MPI +MPIWrapper: initializing MPI +ping [requestnodes (before change)]: 2 nodes pinging each other +ping [requestnodes (before change)]: 2 nodes pinging each other +ping [requestnodes (before change)]: all 2 nodes responded +ping [requestnodes (before change)]: all 2 nodes responded +requestnodes [MPIWrapper]: using 2 out of 2 MPI nodes (2 
requested); we (0) are in (participating) +requestnodes [MPIWrapper]: using 2 out of 2 MPI nodes (2 requested); we (1) are in (participating) +ping [requestnodes (after change)]: 2 nodes pinging each other +ping [requestnodes (after change)]: 2 nodes pinging each other +ping [requestnodes (after change)]: all 2 nodes responded +ping [requestnodes (after change)]: all 2 nodes responded +mpihelper: we are cog 0 in a gearbox of 2 +mpihelper: we are cog 1 in a gearbox of 2 +ping [mpihelper]: 2 nodes pinging each other +ping [mpihelper]: 2 nodes pinging each other +ping [mpihelper]: all 2 nodes responded +ping [mpihelper]: all 2 nodes responded +MPI Rank 0: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/stderr_speechTrain.logrank0 +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: Build info: +MPI Rank 0: +MPI Rank 0: Built time: Mar 1 2016 17:03:02 +MPI Rank 0: Last modified date: Fri Feb 26 14:22:38 2016 +MPI Rank 0: Build type: Release +MPI Rank 0: Build target: GPU +MPI Rank 0: With 1bit-SGD: no +MPI Rank 0: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 0: CUB_PATH: c:\src\cub-1.4.1 +MPI Rank 0: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 0: Build Branch: weixi/pcv +MPI Rank 0: Build SHA1: 6cb3b9d86a12663b8b08404811e7d882815f2326 (modified) +MPI Rank 0: Built by weixi on GCRCN0509 +MPI Rank 0: Build Path: D:\src\cntk\Source\CNTK\ +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: running on GCRCN0509 at 2016/03/02 01:24:13 +MPI Rank 0: command line: +MPI Rank 0: D:\src\cntk\x64\release\cntk.exe configFile=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation/cntkcv.cntk currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data RunDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu 
DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation OutputDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu DeviceId=0 numCPUThreads=20 stderr=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/stderr +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: precision = "double" +MPI Rank 0: command = speechTrain +MPI Rank 0: deviceId = $DeviceId$ +MPI Rank 0: parallelTrain = true +MPI Rank 0: speechTrain = [ +MPI Rank 0: action = "train" +MPI Rank 0: modelPath = "$RunDir$/models/cntkSpeech.dnn" +MPI Rank 0: deviceId = $DeviceId$ +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SimpleNetworkBuilder = [ +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 0: evalCriterion = "ErrorPrediction" +MPI Rank 0: layerTypes = "Sigmoid" +MPI Rank 0: initValueScale = 1.0 +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: uniformInit = true +MPI Rank 0: needPrior = true +MPI Rank 0: ] +MPI Rank 0: ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = 'CE' +MPI Rank 0: evalCriterion = 'Err' +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: L = Length(layerSizes)-1 // number of model layers +MPI Rank 0: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 0: featNorm = if applyMeanVarNorm +MPI Rank 0: then MeanVarNorm(features) +MPI Rank 0: else features +MPI Rank 0: layers[layer:1..L-1] = if layer > 1 +MPI Rank 0: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 0: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 0: CE = if trainingCriterion == 'CE' +MPI Rank 0: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 0: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 0: Err = if evalCriterion == 'Err' then +MPI Rank 0: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 0: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 0: logPrior = LogPrior(labels) +MPI Rank 0: // TODO: how to add a tag to an infix operation? 
+MPI Rank 0: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 0: ] +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize = 20480 +MPI Rank 0: minibatchSize = 64:256:1024 +MPI Rank 0: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 0: numMBsToShowResult = 10 +MPI Rank 0: momentumPerMB = 0.9:0.656119 +MPI Rank 0: dropoutRate = 0.0 +MPI Rank 0: maxEpochs = 3 +MPI Rank 0: keepCheckPointFiles = true +MPI Rank 0: clippingThresholdPerSample = 1#INF +MPI Rank 0: ParallelTrain = [ +MPI Rank 0: parallelizationMethod = "DataParallelSGD" +MPI Rank 0: distributedMBReading = true +MPI Rank 0: DataParallelSGD = [ +MPI Rank 0: gradientBits = 64 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: AutoAdjust = [ +MPI Rank 0: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 0: loadBestModel = true +MPI Rank 0: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 0: learnRateDecreaseFactor = 0.5 +MPI Rank 0: learnRateIncreaseFactor = 1.382 +MPI Rank 0: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "$DataDir$/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "$DataDir$/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: cvreader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.cv.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "$DataDir$/glob_0000.mlf" +MPI Rank 0: 
labelMappingFile = "$DataDir$/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu +MPI Rank 0: DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 0: ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation +MPI Rank 0: OutputDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu +MPI Rank 0: DeviceId=0 +MPI Rank 0: numCPUThreads=20 +MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: precision = "double" +MPI Rank 0: command = speechTrain +MPI Rank 0: deviceId = 0 +MPI Rank 0: parallelTrain = true +MPI Rank 0: speechTrain = [ +MPI Rank 0: action = "train" +MPI Rank 0: modelPath = "C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/models/cntkSpeech.dnn" +MPI Rank 0: deviceId = 0 +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SimpleNetworkBuilder = [ +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 0: evalCriterion = "ErrorPrediction" +MPI Rank 0: layerTypes = "Sigmoid" +MPI Rank 0: initValueScale = 1.0 +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: uniformInit = true +MPI Rank 0: needPrior = true +MPI Rank 0: ] +MPI Rank 0: ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = 'CE' +MPI Rank 0: evalCriterion = 'Err' +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: L = Length(layerSizes)-1 // number of model layers +MPI Rank 0: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 0: featNorm = if applyMeanVarNorm +MPI Rank 0: then MeanVarNorm(features) +MPI Rank 0: else features +MPI Rank 0: layers[layer:1..L-1] = if layer > 1 +MPI Rank 0: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 0: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 0: CE = if trainingCriterion == 'CE' +MPI Rank 0: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 0: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 0: Err = if evalCriterion == 'Err' then +MPI Rank 0: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 0: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 0: logPrior = LogPrior(labels) +MPI Rank 0: // TODO: how to add a tag to an infix operation? 
+MPI Rank 0: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 0: ] +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize = 20480 +MPI Rank 0: minibatchSize = 64:256:1024 +MPI Rank 0: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 0: numMBsToShowResult = 10 +MPI Rank 0: momentumPerMB = 0.9:0.656119 +MPI Rank 0: dropoutRate = 0.0 +MPI Rank 0: maxEpochs = 3 +MPI Rank 0: keepCheckPointFiles = true +MPI Rank 0: clippingThresholdPerSample = 1#INF +MPI Rank 0: ParallelTrain = [ +MPI Rank 0: parallelizationMethod = "DataParallelSGD" +MPI Rank 0: distributedMBReading = true +MPI Rank 0: DataParallelSGD = [ +MPI Rank 0: gradientBits = 64 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: AutoAdjust = [ +MPI Rank 0: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 0: loadBestModel = true +MPI Rank 0: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 0: learnRateDecreaseFactor = 0.5 +MPI Rank 0: learnRateIncreaseFactor = 1.382 +MPI Rank 0: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: cvreader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.cv.scp" +MPI Rank 0: ] +MPI Rank 0: labels = 
[ +MPI Rank 0: mlfFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu +MPI Rank 0: DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 0: ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation +MPI Rank 0: OutputDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu +MPI Rank 0: DeviceId=0 +MPI Rank 0: numCPUThreads=20 +MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: configparameters: cntkcv.cntk:command=speechTrain +MPI Rank 0: configparameters: cntkcv.cntk:ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation +MPI Rank 0: configparameters: cntkcv.cntk:currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 0: configparameters: cntkcv.cntk:DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 0: configparameters: cntkcv.cntk:deviceId=0 +MPI Rank 0: configparameters: cntkcv.cntk:numCPUThreads=20 +MPI Rank 0: configparameters: cntkcv.cntk:OutputDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu +MPI Rank 0: configparameters: cntkcv.cntk:parallelTrain=true +MPI Rank 0: configparameters: cntkcv.cntk:precision=double +MPI Rank 0: configparameters: 
cntkcv.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu +MPI Rank 0: configparameters: cntkcv.cntk:speechTrain=[ +MPI Rank 0: action = "train" +MPI Rank 0: modelPath = "C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/models/cntkSpeech.dnn" +MPI Rank 0: deviceId = 0 +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SimpleNetworkBuilder = [ +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 0: evalCriterion = "ErrorPrediction" +MPI Rank 0: layerTypes = "Sigmoid" +MPI Rank 0: initValueScale = 1.0 +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: uniformInit = true +MPI Rank 0: needPrior = true +MPI Rank 0: ] +MPI Rank 0: ExperimentalNetworkBuilder = [ // the same as above but with BS. Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 0: layerSizes = 363:512:512:132 +MPI Rank 0: trainingCriterion = 'CE' +MPI Rank 0: evalCriterion = 'Err' +MPI Rank 0: applyMeanVarNorm = true +MPI Rank 0: L = Length(layerSizes)-1 // number of model layers +MPI Rank 0: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 0: featNorm = if applyMeanVarNorm +MPI Rank 0: then MeanVarNorm(features) +MPI Rank 0: else features +MPI Rank 0: layers[layer:1..L-1] = if layer > 1 +MPI Rank 0: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 0: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 0: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 0: CE = if trainingCriterion == 'CE' +MPI Rank 0: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 0: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 0: Err = if evalCriterion == 'Err' then +MPI Rank 0: 
ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 0: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 0: logPrior = LogPrior(labels) +MPI Rank 0: // TODO: how to add a tag to an infix operation? +MPI Rank 0: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 0: ] +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize = 20480 +MPI Rank 0: minibatchSize = 64:256:1024 +MPI Rank 0: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 0: numMBsToShowResult = 10 +MPI Rank 0: momentumPerMB = 0.9:0.656119 +MPI Rank 0: dropoutRate = 0.0 +MPI Rank 0: maxEpochs = 3 +MPI Rank 0: keepCheckPointFiles = true +MPI Rank 0: clippingThresholdPerSample = 1#INF +MPI Rank 0: ParallelTrain = [ +MPI Rank 0: parallelizationMethod = "DataParallelSGD" +MPI Rank 0: distributedMBReading = true +MPI Rank 0: DataParallelSGD = [ +MPI Rank 0: gradientBits = 64 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: AutoAdjust = [ +MPI Rank 0: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 0: loadBestModel = true +MPI Rank 0: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 0: learnRateDecreaseFactor = 0.5 +MPI Rank 0: learnRateIncreaseFactor = 1.382 +MPI Rank 0: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: cvreader = [ +MPI Rank 0: readerType = "HTKMLFReader" +MPI Rank 0: readMethod = "blockRandomize" +MPI Rank 0: miniBatchMode = "partial" +MPI 
Rank 0: randomize = "auto" +MPI Rank 0: verbosity = 0 +MPI Rank 0: features = [ +MPI Rank 0: dim = 363 +MPI Rank 0: type = "real" +MPI Rank 0: scpFile = "glob_0000.cv.scp" +MPI Rank 0: ] +MPI Rank 0: labels = [ +MPI Rank 0: mlfFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf" +MPI Rank 0: labelMappingFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list" +MPI Rank 0: labelDim = 132 +MPI Rank 0: labelType = "category" +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: cntkcv.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/stderr +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: command: speechTrain +MPI Rank 0: precision = double +MPI Rank 0: Using 20 CPU threads +MPI Rank 0: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/models/cntkSpeech.dnn +MPI Rank 0: CNTKCommandTrainInfo: speechTrain : 3 +MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 0: CNTKCommandTrainBegin: speechTrain +MPI Rank 0: SimpleNetworkBuilder Using GPU 0 +MPI Rank 0: reading script file glob_0000.scp ... 948 entries +MPI Rank 0: total 132 state names in state list D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list +MPI Rank 0: htkmlfreader: reading MLF file D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf ... total 948 entries +MPI Rank 0: ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances +MPI Rank 0: label set 0: 129 classes +MPI Rank 0: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames +MPI Rank 0: reading script file glob_0000.cv.scp ... 
300 entries +MPI Rank 0: total 132 state names in state list D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list +MPI Rank 0: htkmlfreader: reading MLF file D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf ... total 948 entries +MPI Rank 0: ...........................................................................feature set 0: 83050 frames in 300 out of 300 utterances +MPI Rank 0: label set 0: 129 classes +MPI Rank 0: minibatchutterancesource: 300 utterances grouped into 1 chunks, av. chunk size: 300.0 utterances, 83050.0 frames +MPI Rank 0: Microsoft::MSR::CNTK::GPUMatrix::SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==8 +MPI Rank 0: +MPI Rank 0: Post-processing network... +MPI Rank 0: +MPI Rank 0: 7 roots: +MPI Rank 0: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax +MPI Rank 0: EvalErrorPrediction = ErrorPrediction +MPI Rank 0: InvStdOfFeatures = InvStdDev +MPI Rank 0: MeanOfFeatures = Mean +MPI Rank 0: PosteriorProb = Softmax +MPI Rank 0: Prior = Mean +MPI Rank 0: ScaledLogLikelihood = Minus +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for CrossEntropyWithSoftmax CrossEntropyWithSoftmax operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for EvalErrorPrediction ErrorPrediction operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for InvStdOfFeatures InvStdDev operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for MeanOfFeatures Mean operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for PosteriorProb Softmax operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for Prior Mean operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for ScaledLogLikelihood Minus operation +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating network. 25 nodes to process in pass 1. 
+MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132 x *] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132 x 512] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512 x 512] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512 x 363] +MPI Rank 0: Validating --> features = InputValue -> [363 x *] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363 x *]) -> [363] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363 x *]) -> [363] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 x *], MeanOfFeatures[363], InvStdOfFeatures[363]) -> [363 x *] +MPI Rank 0: Validating --> W0*features = Times(W0[512 x 363], MVNormalizedFeatures[363 x *]) -> [512 x *] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512 x 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512 x *], B0[512 x 1]) -> [512 x 1 x *] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512 x 512], H1[512 x 1 x *]) -> [512 x *] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512 x 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512 x *], B1[512 x 1]) -> [512 x 1 x *] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132 x 512], H2[512 x 1 x *]) -> [132 x *] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132 x 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132 x *], B2[132 x 1]) -> [132 x 1 x *] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 0: Validating --> PosteriorProb = Softmax(HLast[132 x 1 x *]) -> [132 x 1 x *] +MPI Rank 0: Validating --> Prior = Mean(labels[132 x *]) -> [132] +MPI Rank 0: Validating --> 
LogOfPrior = Log(Prior[132]) -> [132] +MPI Rank 0: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 x *], LogOfPrior[132]) -> [132 x 1 x *] +MPI Rank 0: +MPI Rank 0: Validating network. 17 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132 x *] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132 x 512] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512 x 512] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512 x 363] +MPI Rank 0: Validating --> features = InputValue -> [363 x *] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363 x *]) -> [363] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363 x *]) -> [363] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 x *], MeanOfFeatures[363], InvStdOfFeatures[363]) -> [363 x *] +MPI Rank 0: Validating --> W0*features = Times(W0[512 x 363], MVNormalizedFeatures[363 x *]) -> [512 x *] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512 x 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512 x *], B0[512 x 1]) -> [512 x 1 x *] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512 x 512], H1[512 x 1 x *]) -> [512 x *] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512 x 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512 x *], B1[512 x 1]) -> [512 x 1 x *] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132 x 512], H2[512 x 1 x *]) -> [132 x *] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132 x 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132 x *], B2[132 x 1]) -> [132 x 1 x *] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 0: Validating --> EvalErrorPrediction = 
ErrorPrediction(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 0: Validating --> PosteriorProb = Softmax(HLast[132 x 1 x *]) -> [132 x 1 x *] +MPI Rank 0: Validating --> Prior = Mean(labels[132 x *]) -> [132] +MPI Rank 0: Validating --> LogOfPrior = Log(Prior[132]) -> [132] +MPI Rank 0: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 x *], LogOfPrior[132]) -> [132 x 1 x *] +MPI Rank 0: +MPI Rank 0: Validating network, final pass. +MPI Rank 0: +MPI Rank 0: Validating --> labels = InputValue -> [132 x *] +MPI Rank 0: Validating --> W2 = LearnableParameter -> [132 x 512] +MPI Rank 0: Validating --> W1 = LearnableParameter -> [512 x 512] +MPI Rank 0: Validating --> W0 = LearnableParameter -> [512 x 363] +MPI Rank 0: Validating --> features = InputValue -> [363 x *] +MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363 x *]) -> [363] +MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363 x *]) -> [363] +MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 x *], MeanOfFeatures[363], InvStdOfFeatures[363]) -> [363 x *] +MPI Rank 0: Validating --> W0*features = Times(W0[512 x 363], MVNormalizedFeatures[363 x *]) -> [512 x *] +MPI Rank 0: Validating --> B0 = LearnableParameter -> [512 x 1] +MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512 x *], B0[512 x 1]) -> [512 x 1 x *] +MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 0: Validating --> W1*H1 = Times(W1[512 x 512], H1[512 x 1 x *]) -> [512 x *] +MPI Rank 0: Validating --> B1 = LearnableParameter -> [512 x 1] +MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512 x *], B1[512 x 1]) -> [512 x 1 x *] +MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 0: Validating --> W2*H1 = Times(W2[132 x 512], H2[512 x 1 x *]) -> [132 x *] +MPI Rank 0: Validating --> B2 = LearnableParameter -> [132 x 1] +MPI Rank 0: Validating --> HLast = Plus(W2*H1[132 x *], 
B2[132 x 1]) -> [132 x 1 x *] +MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 0: Validating --> PosteriorProb = Softmax(HLast[132 x 1 x *]) -> [132 x 1 x *] +MPI Rank 0: Validating --> Prior = Mean(labels[132 x *]) -> [132] +MPI Rank 0: Validating --> LogOfPrior = Log(Prior[132]) -> [132] +MPI Rank 0: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 x *], LogOfPrior[132]) -> [132 x 1 x *] +MPI Rank 0: +MPI Rank 0: 12 out of 25 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: Post-processing network complete. +MPI Rank 0: +MPI Rank 0: SGD using GPU 0. +MPI Rank 0: +MPI Rank 0: Training criterion node(s): +MPI Rank 0: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax +MPI Rank 0: +MPI Rank 0: Evaluation criterion node(s): +MPI Rank 0: EvalErrorPrediction = ErrorPrediction +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: +MPI Rank 0: Precomputing --> 3 PreCompute nodes found. +MPI Rank 0: +MPI Rank 0: NodeName: MeanOfFeatures +MPI Rank 0: NodeName: InvStdOfFeatures +MPI Rank 0: NodeName: Prior +MPI Rank 0: minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 0: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms +MPI Rank 0: +MPI Rank 0: Precomputing --> Completed. 
+MPI Rank 0: +MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 momentum as time constant = 607.4 samples +MPI Rank 0: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 2, with 1 datapasses +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 1- 10, 3.13%]: SamplesSeen = 640; TrainLossPerSample = 4.52102408; EvalErr[0]PerSample = 0.92656250; TotalTime = 0.0775s; SamplesPerSecond = 8260.2 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 11- 20, 6.25%]: SamplesSeen = 640; TrainLossPerSample = 4.21764659; EvalErr[0]PerSample = 0.90156250; TotalTime = 0.0723s; SamplesPerSecond = 8850.8 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 21- 30, 9.38%]: SamplesSeen = 640; TrainLossPerSample = 3.92251861; EvalErr[0]PerSample = 0.85000000; TotalTime = 0.0731s; SamplesPerSecond = 8757.3 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 31- 40, 12.50%]: SamplesSeen = 640; TrainLossPerSample = 3.91289446; EvalErr[0]PerSample = 0.88750000; TotalTime = 0.0734s; SamplesPerSecond = 8717.9 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 41- 50, 15.63%]: SamplesSeen = 640; TrainLossPerSample = 3.84057836; EvalErr[0]PerSample = 0.91093750; TotalTime = 0.0723s; SamplesPerSecond = 8847.1 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 51- 60, 18.75%]: SamplesSeen = 640; TrainLossPerSample = 3.71077800; EvalErr[0]PerSample = 0.88437500; TotalTime = 0.0737s; SamplesPerSecond = 8684.4 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 61- 70, 21.88%]: SamplesSeen = 640; TrainLossPerSample = 3.50986627; EvalErr[0]PerSample = 0.81718750; TotalTime = 0.0696s; SamplesPerSecond = 9190.1 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 71- 80, 25.00%]: SamplesSeen = 640; TrainLossPerSample = 3.47993705; EvalErr[0]PerSample = 0.81250000; TotalTime = 0.0695s; SamplesPerSecond = 9215.3 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 81- 90, 
28.13%]: SamplesSeen = 640; TrainLossPerSample = 3.33550558; EvalErr[0]PerSample = 0.76718750; TotalTime = 0.0701s; SamplesPerSecond = 9133.2 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 91- 100, 31.25%]: SamplesSeen = 640; TrainLossPerSample = 3.49726054; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.0698s; SamplesPerSecond = 9172.2 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 101- 110, 34.38%]: SamplesSeen = 640; TrainLossPerSample = 3.21905375; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.0730s; SamplesPerSecond = 8767.2 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 111- 120, 37.50%]: SamplesSeen = 640; TrainLossPerSample = 3.31461145; EvalErr[0]PerSample = 0.79062500; TotalTime = 0.0725s; SamplesPerSecond = 8828.0 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 121- 130, 40.63%]: SamplesSeen = 640; TrainLossPerSample = 3.15950802; EvalErr[0]PerSample = 0.77968750; TotalTime = 0.0736s; SamplesPerSecond = 8692.7 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 131- 140, 43.75%]: SamplesSeen = 640; TrainLossPerSample = 3.07762131; EvalErr[0]PerSample = 0.77187500; TotalTime = 0.0730s; SamplesPerSecond = 8769.6 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 141- 150, 46.88%]: SamplesSeen = 640; TrainLossPerSample = 3.05637351; EvalErr[0]PerSample = 0.72187500; TotalTime = 0.0723s; SamplesPerSecond = 8852.9 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 151- 160, 50.00%]: SamplesSeen = 640; TrainLossPerSample = 2.91153531; EvalErr[0]PerSample = 0.69062500; TotalTime = 0.0728s; SamplesPerSecond = 8786.9 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 161- 170, 53.13%]: SamplesSeen = 640; TrainLossPerSample = 2.89745725; EvalErr[0]PerSample = 0.73281250; TotalTime = 0.0722s; SamplesPerSecond = 8861.6 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 171- 180, 56.25%]: SamplesSeen = 640; TrainLossPerSample = 2.72829961; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.0732s; SamplesPerSecond = 8743.5 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 181- 190, 59.38%]: SamplesSeen = 640; TrainLossPerSample = 2.65806444; EvalErr[0]PerSample = 
0.68593750; TotalTime = 0.0727s; SamplesPerSecond = 8801.2 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 191- 200, 62.50%]: SamplesSeen = 640; TrainLossPerSample = 2.66604147; EvalErr[0]PerSample = 0.66093750; TotalTime = 0.0724s; SamplesPerSecond = 8835.8 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 201- 210, 65.63%]: SamplesSeen = 640; TrainLossPerSample = 2.53915697; EvalErr[0]PerSample = 0.63125000; TotalTime = 0.0725s; SamplesPerSecond = 8825.5 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 211- 220, 68.75%]: SamplesSeen = 640; TrainLossPerSample = 2.61937093; EvalErr[0]PerSample = 0.67343750; TotalTime = 0.0725s; SamplesPerSecond = 8832.5 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 221- 230, 71.88%]: SamplesSeen = 640; TrainLossPerSample = 2.51539473; EvalErr[0]PerSample = 0.65937500; TotalTime = 0.0725s; SamplesPerSecond = 8822.2 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 231- 240, 75.00%]: SamplesSeen = 640; TrainLossPerSample = 2.47301309; EvalErr[0]PerSample = 0.64218750; TotalTime = 0.0723s; SamplesPerSecond = 8846.5 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 241- 250, 78.13%]: SamplesSeen = 640; TrainLossPerSample = 2.42748799; EvalErr[0]PerSample = 0.61250000; TotalTime = 0.0725s; SamplesPerSecond = 8829.2 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 251- 260, 81.25%]: SamplesSeen = 640; TrainLossPerSample = 2.42204482; EvalErr[0]PerSample = 0.62500000; TotalTime = 0.0721s; SamplesPerSecond = 8880.5 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 261- 270, 84.38%]: SamplesSeen = 640; TrainLossPerSample = 2.17342812; EvalErr[0]PerSample = 0.56718750; TotalTime = 0.0721s; SamplesPerSecond = 8875.1 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 271- 280, 87.50%]: SamplesSeen = 640; TrainLossPerSample = 2.31290374; EvalErr[0]PerSample = 0.62968750; TotalTime = 0.0721s; SamplesPerSecond = 8874.0 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 281- 290, 90.63%]: SamplesSeen = 640; TrainLossPerSample = 2.26008782; EvalErr[0]PerSample = 0.60312500; TotalTime = 0.0730s; SamplesPerSecond = 8770.8 +MPI Rank 0: Epoch[ 1 of 
3]-Minibatch[ 291- 300, 93.75%]: SamplesSeen = 640; TrainLossPerSample = 2.15763314; EvalErr[0]PerSample = 0.57968750; TotalTime = 0.0723s; SamplesPerSecond = 8854.5 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 301- 310, 96.88%]: SamplesSeen = 640; TrainLossPerSample = 2.23496000; EvalErr[0]PerSample = 0.59531250; TotalTime = 0.0721s; SamplesPerSecond = 8875.6 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 311- 320, 100.00%]: SamplesSeen = 640; TrainLossPerSample = 2.25712791; EvalErr[0]PerSample = 0.61406250; TotalTime = 0.0723s; SamplesPerSecond = 8848.2 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.000912; EvalErrPerSample = 0.72744141; AvgLearningRatePerSample = 0.015625; EpochTime=2.33396 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 0: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms +MPI Rank 0: Final Results: Minibatch[1-1298]: Samples Seen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 2.1123249 Perplexity = 8.2674399 EvalErrorPrediction: ErrorPrediction/Sample = 0.56936785 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Validation Set] TrainLossPerSample = 2.1123249; EvalErrPerSample = 0.56936785 +MPI Rank 0: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/models/cntkSpeech.dnn.1' +MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 momentum as time constant = 607.5 samples +MPI Rank 0: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 0 of 2, with 1 datapasses +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. 
+MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 1- 10, 12.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.09679725; EvalErr[0]PerSample = 0.56328125; TotalTime = 0.0851s; SamplesPerSecond = 30086.9 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 11- 20, 25.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.99204361; EvalErr[0]PerSample = 0.54648438; TotalTime = 0.0803s; SamplesPerSecond = 31900.3 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 21- 30, 37.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.99681229; EvalErr[0]PerSample = 0.54882813; TotalTime = 0.0812s; SamplesPerSecond = 31536.0 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 31- 40, 50.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.90894475; EvalErr[0]PerSample = 0.52929688; TotalTime = 0.0809s; SamplesPerSecond = 31646.4 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 41- 50, 62.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.89584567; EvalErr[0]PerSample = 0.52500000; TotalTime = 0.0813s; SamplesPerSecond = 31473.6 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 51- 60, 75.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.90450680; EvalErr[0]PerSample = 0.54062500; TotalTime = 0.0820s; SamplesPerSecond = 31224.8 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 61- 70, 87.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.91141823; EvalErr[0]PerSample = 0.53203125; TotalTime = 0.0811s; SamplesPerSecond = 31553.5 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 71- 80, 100.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.87590201; EvalErr[0]PerSample = 0.52460938; TotalTime = 0.0827s; SamplesPerSecond = 30964.6 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9477838; EvalErrPerSample = 0.53876953; AvgLearningRatePerSample = 0.001953125; EpochTime=0.66369 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 0: Final Results: Minibatch[1-325]: Samples Seen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 1.8690699 Perplexity = 6.4822646 EvalErrorPrediction: ErrorPrediction/Sample = 0.52003612 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Validation Set] TrainLossPerSample = 1.8690699; EvalErrPerSample = 0.52003612 +MPI Rank 0: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/models/cntkSpeech.dnn.2' +MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000098 effective momentum = 0.656119 momentum as time constant = 2429.9 samples +MPI Rank 0: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 2, with 1 datapasses +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 1- 10, 50.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.87890619; EvalErr[0]PerSample = 0.52246094; TotalTime = 0.1452s; SamplesPerSecond = 70538.0 +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 11- 20, 100.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.88442887; EvalErr[0]PerSample = 0.51699219; TotalTime = 0.1342s; SamplesPerSecond = 76305.7 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8816675; EvalErrPerSample = 0.51972656; AvgLearningRatePerSample = 9.7656251e-005; EpochTime=0.289387 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 0: Final Results: Minibatch[1-82]: Samples Seen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 1.8458415 Perplexity = 6.3334268 EvalErrorPrediction: ErrorPrediction/Sample = 0.50965683 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Validation Set] TrainLossPerSample = 1.8458415; EvalErrPerSample = 0.50965683 +MPI Rank 0: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/models/cntkSpeech.dnn' +MPI Rank 0: CNTKCommandTrainEnd: speechTrain +MPI Rank 0: COMPLETED +MPI Rank 0: ~MPIWrapper +MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/stderr_speechTrain.logrank1 +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: Build info: +MPI Rank 1: +MPI Rank 1: Built time: Mar 1 2016 17:03:02 +MPI Rank 1: Last modified date: Fri Feb 26 14:22:38 2016 +MPI Rank 1: Build type: Release +MPI Rank 1: Build target: GPU +MPI Rank 1: With 1bit-SGD: no +MPI Rank 1: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 1: CUB_PATH: c:\src\cub-1.4.1 +MPI Rank 1: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 1: Build Branch: weixi/pcv +MPI Rank 1: Build SHA1: 6cb3b9d86a12663b8b08404811e7d882815f2326 (modified) +MPI Rank 1: Built by weixi on GCRCN0509 +MPI Rank 1: Build Path: D:\src\cntk\Source\CNTK\ +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: running on GCRCN0509 at 2016/03/02 01:24:13 +MPI Rank 1: command line: +MPI Rank 1: D:\src\cntk\x64\release\cntk.exe configFile=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation/cntkcv.cntk currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data 
RunDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation OutputDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu DeviceId=0 numCPUThreads=20 stderr=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/stderr +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: precision = "double" +MPI Rank 1: command = speechTrain +MPI Rank 1: deviceId = $DeviceId$ +MPI Rank 1: parallelTrain = true +MPI Rank 1: speechTrain = [ +MPI Rank 1: action = "train" +MPI Rank 1: modelPath = "$RunDir$/models/cntkSpeech.dnn" +MPI Rank 1: deviceId = $DeviceId$ +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SimpleNetworkBuilder = [ +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 1: evalCriterion = "ErrorPrediction" +MPI Rank 1: layerTypes = "Sigmoid" +MPI Rank 1: initValueScale = 1.0 +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: uniformInit = true +MPI Rank 1: needPrior = true +MPI Rank 1: ] +MPI Rank 1: ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = 'CE' +MPI Rank 1: evalCriterion = 'Err' +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: L = Length(layerSizes)-1 // number of model layers +MPI Rank 1: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 1: featNorm = if applyMeanVarNorm +MPI Rank 1: then MeanVarNorm(features) +MPI Rank 1: else features +MPI Rank 1: layers[layer:1..L-1] = if layer > 1 +MPI Rank 1: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 1: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 1: CE = if trainingCriterion == 'CE' +MPI Rank 1: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 1: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 1: Err = if evalCriterion == 'Err' then +MPI Rank 1: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 1: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 1: logPrior = LogPrior(labels) +MPI Rank 1: // TODO: how to add a tag to an infix operation? 
+MPI Rank 1: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 1: ] +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize = 20480 +MPI Rank 1: minibatchSize = 64:256:1024 +MPI Rank 1: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 1: numMBsToShowResult = 10 +MPI Rank 1: momentumPerMB = 0.9:0.656119 +MPI Rank 1: dropoutRate = 0.0 +MPI Rank 1: maxEpochs = 3 +MPI Rank 1: keepCheckPointFiles = true +MPI Rank 1: clippingThresholdPerSample = 1#INF +MPI Rank 1: ParallelTrain = [ +MPI Rank 1: parallelizationMethod = "DataParallelSGD" +MPI Rank 1: distributedMBReading = true +MPI Rank 1: DataParallelSGD = [ +MPI Rank 1: gradientBits = 64 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: AutoAdjust = [ +MPI Rank 1: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 1: loadBestModel = true +MPI Rank 1: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 1: learnRateDecreaseFactor = 0.5 +MPI Rank 1: learnRateIncreaseFactor = 1.382 +MPI Rank 1: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "$DataDir$/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "$DataDir$/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: cvreader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.cv.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "$DataDir$/glob_0000.mlf" +MPI Rank 1: 
labelMappingFile = "$DataDir$/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu +MPI Rank 1: DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 1: ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation +MPI Rank 1: OutputDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu +MPI Rank 1: DeviceId=0 +MPI Rank 1: numCPUThreads=20 +MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: precision = "double" +MPI Rank 1: command = speechTrain +MPI Rank 1: deviceId = 0 +MPI Rank 1: parallelTrain = true +MPI Rank 1: speechTrain = [ +MPI Rank 1: action = "train" +MPI Rank 1: modelPath = "C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/models/cntkSpeech.dnn" +MPI Rank 1: deviceId = 0 +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SimpleNetworkBuilder = [ +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 1: evalCriterion = "ErrorPrediction" +MPI Rank 1: layerTypes = "Sigmoid" +MPI Rank 1: initValueScale = 1.0 +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: uniformInit = true +MPI Rank 1: needPrior = true +MPI Rank 1: ] +MPI Rank 1: ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = 'CE' +MPI Rank 1: evalCriterion = 'Err' +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: L = Length(layerSizes)-1 // number of model layers +MPI Rank 1: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 1: featNorm = if applyMeanVarNorm +MPI Rank 1: then MeanVarNorm(features) +MPI Rank 1: else features +MPI Rank 1: layers[layer:1..L-1] = if layer > 1 +MPI Rank 1: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 1: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 1: CE = if trainingCriterion == 'CE' +MPI Rank 1: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 1: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 1: Err = if evalCriterion == 'Err' then +MPI Rank 1: ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 1: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 1: logPrior = LogPrior(labels) +MPI Rank 1: // TODO: how to add a tag to an infix operation? 
+MPI Rank 1: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 1: ] +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize = 20480 +MPI Rank 1: minibatchSize = 64:256:1024 +MPI Rank 1: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 1: numMBsToShowResult = 10 +MPI Rank 1: momentumPerMB = 0.9:0.656119 +MPI Rank 1: dropoutRate = 0.0 +MPI Rank 1: maxEpochs = 3 +MPI Rank 1: keepCheckPointFiles = true +MPI Rank 1: clippingThresholdPerSample = 1#INF +MPI Rank 1: ParallelTrain = [ +MPI Rank 1: parallelizationMethod = "DataParallelSGD" +MPI Rank 1: distributedMBReading = true +MPI Rank 1: DataParallelSGD = [ +MPI Rank 1: gradientBits = 64 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: AutoAdjust = [ +MPI Rank 1: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 1: loadBestModel = true +MPI Rank 1: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 1: learnRateDecreaseFactor = 0.5 +MPI Rank 1: learnRateIncreaseFactor = 1.382 +MPI Rank 1: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: cvreader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.cv.scp" +MPI Rank 1: ] +MPI Rank 1: labels = 
[ +MPI Rank 1: mlfFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu +MPI Rank 1: DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 1: ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation +MPI Rank 1: OutputDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu +MPI Rank 1: DeviceId=0 +MPI Rank 1: numCPUThreads=20 +MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: configparameters: cntkcv.cntk:command=speechTrain +MPI Rank 1: configparameters: cntkcv.cntk:ConfigDir=D:\src\cntk\Tests\EndToEndTests\Speech\DNN\ParallelCrossValidation +MPI Rank 1: configparameters: cntkcv.cntk:currentDirectory=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 1: configparameters: cntkcv.cntk:DataDir=D:\src\cntk\Tests\EndToEndTests\Speech\Data +MPI Rank 1: configparameters: cntkcv.cntk:deviceId=0 +MPI Rank 1: configparameters: cntkcv.cntk:numCPUThreads=20 +MPI Rank 1: configparameters: cntkcv.cntk:OutputDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu +MPI Rank 1: configparameters: cntkcv.cntk:parallelTrain=true +MPI Rank 1: configparameters: cntkcv.cntk:precision=double +MPI Rank 1: configparameters: 
cntkcv.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu +MPI Rank 1: configparameters: cntkcv.cntk:speechTrain=[ +MPI Rank 1: action = "train" +MPI Rank 1: modelPath = "C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/models/cntkSpeech.dnn" +MPI Rank 1: deviceId = 0 +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SimpleNetworkBuilder = [ +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = "CrossEntropyWithSoftmax" +MPI Rank 1: evalCriterion = "ErrorPrediction" +MPI Rank 1: layerTypes = "Sigmoid" +MPI Rank 1: initValueScale = 1.0 +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: uniformInit = true +MPI Rank 1: needPrior = true +MPI Rank 1: ] +MPI Rank 1: ExperimentalNetworkBuilder = [ // the same as above but with BS. Not active; activate by commenting out the SimpleNetworkBuilder entry above +MPI Rank 1: layerSizes = 363:512:512:132 +MPI Rank 1: trainingCriterion = 'CE' +MPI Rank 1: evalCriterion = 'Err' +MPI Rank 1: applyMeanVarNorm = true +MPI Rank 1: L = Length(layerSizes)-1 // number of model layers +MPI Rank 1: features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') +MPI Rank 1: featNorm = if applyMeanVarNorm +MPI Rank 1: then MeanVarNorm(features) +MPI Rank 1: else features +MPI Rank 1: layers[layer:1..L-1] = if layer > 1 +MPI Rank 1: then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) +MPI Rank 1: outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) +MPI Rank 1: outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) +MPI Rank 1: CE = if trainingCriterion == 'CE' +MPI Rank 1: then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') +MPI Rank 1: else Fail('unknown trainingCriterion ' + trainingCriterion) +MPI Rank 1: Err = if evalCriterion == 'Err' then +MPI Rank 1: 
ErrorPrediction(labels, outZ, tag='eval') +MPI Rank 1: else Fail('unknown evalCriterion ' + evalCriterion) +MPI Rank 1: logPrior = LogPrior(labels) +MPI Rank 1: // TODO: how to add a tag to an infix operation? +MPI Rank 1: ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') +MPI Rank 1: ] +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize = 20480 +MPI Rank 1: minibatchSize = 64:256:1024 +MPI Rank 1: learningRatesPerMB = 1.0:0.5:0.1 +MPI Rank 1: numMBsToShowResult = 10 +MPI Rank 1: momentumPerMB = 0.9:0.656119 +MPI Rank 1: dropoutRate = 0.0 +MPI Rank 1: maxEpochs = 3 +MPI Rank 1: keepCheckPointFiles = true +MPI Rank 1: clippingThresholdPerSample = 1#INF +MPI Rank 1: ParallelTrain = [ +MPI Rank 1: parallelizationMethod = "DataParallelSGD" +MPI Rank 1: distributedMBReading = true +MPI Rank 1: DataParallelSGD = [ +MPI Rank 1: gradientBits = 64 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: AutoAdjust = [ +MPI Rank 1: reduceLearnRateIfImproveLessThan = 0 +MPI Rank 1: loadBestModel = true +MPI Rank 1: increaseLearnRateIfImproveMoreThan = 1000000000 +MPI Rank 1: learnRateDecreaseFactor = 0.5 +MPI Rank 1: learnRateIncreaseFactor = 1.382 +MPI Rank 1: autoAdjustLR = "adjustAfterEpoch" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: cvreader = [ +MPI Rank 1: readerType = "HTKMLFReader" +MPI Rank 1: readMethod = "blockRandomize" +MPI Rank 1: miniBatchMode = "partial" +MPI 
Rank 1: randomize = "auto" +MPI Rank 1: verbosity = 0 +MPI Rank 1: features = [ +MPI Rank 1: dim = 363 +MPI Rank 1: type = "real" +MPI Rank 1: scpFile = "glob_0000.cv.scp" +MPI Rank 1: ] +MPI Rank 1: labels = [ +MPI Rank 1: mlfFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf" +MPI Rank 1: labelMappingFile = "D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list" +MPI Rank 1: labelDim = 132 +MPI Rank 1: labelType = "category" +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: cntkcv.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/stderr +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: command: speechTrain +MPI Rank 1: precision = double +MPI Rank 1: Using 20 CPU threads +MPI Rank 1: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160301172412.673018\Speech\DNN_ParallelCrossValidation@release_gpu/models/cntkSpeech.dnn +MPI Rank 1: CNTKCommandTrainInfo: speechTrain : 3 +MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 1: CNTKCommandTrainBegin: speechTrain +MPI Rank 1: SimpleNetworkBuilder Using GPU 0 +MPI Rank 1: reading script file glob_0000.scp ... 948 entries +MPI Rank 1: total 132 state names in state list D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list +MPI Rank 1: htkmlfreader: reading MLF file D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf ... total 948 entries +MPI Rank 1: ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances +MPI Rank 1: label set 0: 129 classes +MPI Rank 1: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames +MPI Rank 1: reading script file glob_0000.cv.scp ... 
300 entries +MPI Rank 1: total 132 state names in state list D:\src\cntk\Tests\EndToEndTests\Speech\Data/state.list +MPI Rank 1: htkmlfreader: reading MLF file D:\src\cntk\Tests\EndToEndTests\Speech\Data/glob_0000.mlf ... total 948 entries +MPI Rank 1: ...........................................................................feature set 0: 83050 frames in 300 out of 300 utterances +MPI Rank 1: label set 0: 129 classes +MPI Rank 1: minibatchutterancesource: 300 utterances grouped into 1 chunks, av. chunk size: 300.0 utterances, 83050.0 frames +MPI Rank 1: Microsoft::MSR::CNTK::GPUMatrix::SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==8 +MPI Rank 1: +MPI Rank 1: Post-processing network... +MPI Rank 1: +MPI Rank 1: 7 roots: +MPI Rank 1: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax +MPI Rank 1: EvalErrorPrediction = ErrorPrediction +MPI Rank 1: InvStdOfFeatures = InvStdDev +MPI Rank 1: MeanOfFeatures = Mean +MPI Rank 1: PosteriorProb = Softmax +MPI Rank 1: Prior = Mean +MPI Rank 1: ScaledLogLikelihood = Minus +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for CrossEntropyWithSoftmax CrossEntropyWithSoftmax operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for EvalErrorPrediction ErrorPrediction operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for InvStdOfFeatures InvStdDev operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for MeanOfFeatures Mean operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for PosteriorProb Softmax operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for Prior Mean operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for ScaledLogLikelihood Minus operation +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating network. 25 nodes to process in pass 1. 
+MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132 x *] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132 x 512] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512 x 512] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512 x 363] +MPI Rank 1: Validating --> features = InputValue -> [363 x *] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363 x *]) -> [363] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363 x *]) -> [363] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 x *], MeanOfFeatures[363], InvStdOfFeatures[363]) -> [363 x *] +MPI Rank 1: Validating --> W0*features = Times(W0[512 x 363], MVNormalizedFeatures[363 x *]) -> [512 x *] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512 x 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512 x *], B0[512 x 1]) -> [512 x 1 x *] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512 x 512], H1[512 x 1 x *]) -> [512 x *] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512 x 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512 x *], B1[512 x 1]) -> [512 x 1 x *] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132 x 512], H2[512 x 1 x *]) -> [132 x *] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132 x 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132 x *], B2[132 x 1]) -> [132 x 1 x *] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 1: Validating --> PosteriorProb = Softmax(HLast[132 x 1 x *]) -> [132 x 1 x *] +MPI Rank 1: Validating --> Prior = Mean(labels[132 x *]) -> [132] +MPI Rank 1: Validating --> 
LogOfPrior = Log(Prior[132]) -> [132] +MPI Rank 1: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 x *], LogOfPrior[132]) -> [132 x 1 x *] +MPI Rank 1: +MPI Rank 1: Validating network. 17 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132 x *] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132 x 512] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512 x 512] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512 x 363] +MPI Rank 1: Validating --> features = InputValue -> [363 x *] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363 x *]) -> [363] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363 x *]) -> [363] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 x *], MeanOfFeatures[363], InvStdOfFeatures[363]) -> [363 x *] +MPI Rank 1: Validating --> W0*features = Times(W0[512 x 363], MVNormalizedFeatures[363 x *]) -> [512 x *] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512 x 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512 x *], B0[512 x 1]) -> [512 x 1 x *] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512 x 512], H1[512 x 1 x *]) -> [512 x *] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512 x 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512 x *], B1[512 x 1]) -> [512 x 1 x *] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132 x 512], H2[512 x 1 x *]) -> [132 x *] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132 x 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132 x *], B2[132 x 1]) -> [132 x 1 x *] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 1: Validating --> EvalErrorPrediction = 
ErrorPrediction(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 1: Validating --> PosteriorProb = Softmax(HLast[132 x 1 x *]) -> [132 x 1 x *] +MPI Rank 1: Validating --> Prior = Mean(labels[132 x *]) -> [132] +MPI Rank 1: Validating --> LogOfPrior = Log(Prior[132]) -> [132] +MPI Rank 1: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 x *], LogOfPrior[132]) -> [132 x 1 x *] +MPI Rank 1: +MPI Rank 1: Validating network, final pass. +MPI Rank 1: +MPI Rank 1: Validating --> labels = InputValue -> [132 x *] +MPI Rank 1: Validating --> W2 = LearnableParameter -> [132 x 512] +MPI Rank 1: Validating --> W1 = LearnableParameter -> [512 x 512] +MPI Rank 1: Validating --> W0 = LearnableParameter -> [512 x 363] +MPI Rank 1: Validating --> features = InputValue -> [363 x *] +MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363 x *]) -> [363] +MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363 x *]) -> [363] +MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 x *], MeanOfFeatures[363], InvStdOfFeatures[363]) -> [363 x *] +MPI Rank 1: Validating --> W0*features = Times(W0[512 x 363], MVNormalizedFeatures[363 x *]) -> [512 x *] +MPI Rank 1: Validating --> B0 = LearnableParameter -> [512 x 1] +MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512 x *], B0[512 x 1]) -> [512 x 1 x *] +MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 1: Validating --> W1*H1 = Times(W1[512 x 512], H1[512 x 1 x *]) -> [512 x *] +MPI Rank 1: Validating --> B1 = LearnableParameter -> [512 x 1] +MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512 x *], B1[512 x 1]) -> [512 x 1 x *] +MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 x *]) -> [512 x 1 x *] +MPI Rank 1: Validating --> W2*H1 = Times(W2[132 x 512], H2[512 x 1 x *]) -> [132 x *] +MPI Rank 1: Validating --> B2 = LearnableParameter -> [132 x 1] +MPI Rank 1: Validating --> HLast = Plus(W2*H1[132 x *], 
B2[132 x 1]) -> [132 x 1 x *] +MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 x *], HLast[132 x 1 x *]) -> [1] +MPI Rank 1: Validating --> PosteriorProb = Softmax(HLast[132 x 1 x *]) -> [132 x 1 x *] +MPI Rank 1: Validating --> Prior = Mean(labels[132 x *]) -> [132] +MPI Rank 1: Validating --> LogOfPrior = Log(Prior[132]) -> [132] +MPI Rank 1: Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 x *], LogOfPrior[132]) -> [132 x 1 x *] +MPI Rank 1: +MPI Rank 1: 12 out of 25 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: Post-processing network complete. +MPI Rank 1: +MPI Rank 1: SGD using GPU 0. +MPI Rank 1: +MPI Rank 1: Training criterion node(s): +MPI Rank 1: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax +MPI Rank 1: +MPI Rank 1: Evaluation criterion node(s): +MPI Rank 1: EvalErrorPrediction = ErrorPrediction +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. +MPI Rank 1: +MPI Rank 1: Precomputing --> 3 PreCompute nodes found. +MPI Rank 1: +MPI Rank 1: NodeName: MeanOfFeatures +MPI Rank 1: NodeName: InvStdOfFeatures +MPI Rank 1: NodeName: Prior +MPI Rank 1: minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 1: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms +MPI Rank 1: +MPI Rank 1: Precomputing --> Completed. 
+MPI Rank 1: +MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 momentum as time constant = 607.4 samples +MPI Rank 1: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 1 of 2, with 1 datapasses +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 1- 10, 3.13%]: SamplesSeen = 640; TrainLossPerSample = 4.52102408; EvalErr[0]PerSample = 0.92656250; TotalTime = 0.0776s; SamplesPerSecond = 8247.3 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 11- 20, 6.25%]: SamplesSeen = 640; TrainLossPerSample = 4.21764659; EvalErr[0]PerSample = 0.90156250; TotalTime = 0.0723s; SamplesPerSecond = 8850.0 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 21- 30, 9.38%]: SamplesSeen = 640; TrainLossPerSample = 3.92251861; EvalErr[0]PerSample = 0.85000000; TotalTime = 0.0731s; SamplesPerSecond = 8756.1 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 31- 40, 12.50%]: SamplesSeen = 640; TrainLossPerSample = 3.91289446; EvalErr[0]PerSample = 0.88750000; TotalTime = 0.0733s; SamplesPerSecond = 8737.0 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 41- 50, 15.63%]: SamplesSeen = 640; TrainLossPerSample = 3.84057836; EvalErr[0]PerSample = 0.91093750; TotalTime = 0.0724s; SamplesPerSecond = 8843.7 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 51- 60, 18.75%]: SamplesSeen = 640; TrainLossPerSample = 3.71077800; EvalErr[0]PerSample = 0.88437500; TotalTime = 0.0737s; SamplesPerSecond = 8683.5 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 61- 70, 21.88%]: SamplesSeen = 640; TrainLossPerSample = 3.50986627; EvalErr[0]PerSample = 0.81718750; TotalTime = 0.0697s; SamplesPerSecond = 9188.5 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 71- 80, 25.00%]: SamplesSeen = 640; TrainLossPerSample = 3.47993705; EvalErr[0]PerSample = 0.81250000; TotalTime = 0.0694s; SamplesPerSecond = 9218.3 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 81- 90, 
28.13%]: SamplesSeen = 640; TrainLossPerSample = 3.33550558; EvalErr[0]PerSample = 0.76718750; TotalTime = 0.0702s; SamplesPerSecond = 9111.4 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 91- 100, 31.25%]: SamplesSeen = 640; TrainLossPerSample = 3.49726054; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.0696s; SamplesPerSecond = 9191.6 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 101- 110, 34.38%]: SamplesSeen = 640; TrainLossPerSample = 3.21905375; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.0732s; SamplesPerSecond = 8747.8 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 111- 120, 37.50%]: SamplesSeen = 640; TrainLossPerSample = 3.31461145; EvalErr[0]PerSample = 0.79062500; TotalTime = 0.0725s; SamplesPerSecond = 8824.4 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 121- 130, 40.63%]: SamplesSeen = 640; TrainLossPerSample = 3.15950802; EvalErr[0]PerSample = 0.77968750; TotalTime = 0.0736s; SamplesPerSecond = 8690.8 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 131- 140, 43.75%]: SamplesSeen = 640; TrainLossPerSample = 3.07762131; EvalErr[0]PerSample = 0.77187500; TotalTime = 0.0730s; SamplesPerSecond = 8766.9 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 141- 150, 46.88%]: SamplesSeen = 640; TrainLossPerSample = 3.05637351; EvalErr[0]PerSample = 0.72187500; TotalTime = 0.0721s; SamplesPerSecond = 8871.3 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 151- 160, 50.00%]: SamplesSeen = 640; TrainLossPerSample = 2.91153531; EvalErr[0]PerSample = 0.69062500; TotalTime = 0.0729s; SamplesPerSecond = 8776.1 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 161- 170, 53.13%]: SamplesSeen = 640; TrainLossPerSample = 2.89745725; EvalErr[0]PerSample = 0.73281250; TotalTime = 0.0721s; SamplesPerSecond = 8871.0 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 171- 180, 56.25%]: SamplesSeen = 640; TrainLossPerSample = 2.72829961; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.0732s; SamplesPerSecond = 8740.9 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 181- 190, 59.38%]: SamplesSeen = 640; TrainLossPerSample = 2.65806444; EvalErr[0]PerSample = 
0.68593750; TotalTime = 0.0729s; SamplesPerSecond = 8777.1 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 191- 200, 62.50%]: SamplesSeen = 640; TrainLossPerSample = 2.66604147; EvalErr[0]PerSample = 0.66093750; TotalTime = 0.0725s; SamplesPerSecond = 8833.3 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 201- 210, 65.63%]: SamplesSeen = 640; TrainLossPerSample = 2.53915697; EvalErr[0]PerSample = 0.63125000; TotalTime = 0.0724s; SamplesPerSecond = 8844.7 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 211- 220, 68.75%]: SamplesSeen = 640; TrainLossPerSample = 2.61937093; EvalErr[0]PerSample = 0.67343750; TotalTime = 0.0727s; SamplesPerSecond = 8808.1 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 221- 230, 71.88%]: SamplesSeen = 640; TrainLossPerSample = 2.51539473; EvalErr[0]PerSample = 0.65937500; TotalTime = 0.0726s; SamplesPerSecond = 8820.5 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 231- 240, 75.00%]: SamplesSeen = 640; TrainLossPerSample = 2.47301309; EvalErr[0]PerSample = 0.64218750; TotalTime = 0.0724s; SamplesPerSecond = 8845.0 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 241- 250, 78.13%]: SamplesSeen = 640; TrainLossPerSample = 2.42748799; EvalErr[0]PerSample = 0.61250000; TotalTime = 0.0725s; SamplesPerSecond = 8829.2 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 251- 260, 81.25%]: SamplesSeen = 640; TrainLossPerSample = 2.42204482; EvalErr[0]PerSample = 0.62500000; TotalTime = 0.0721s; SamplesPerSecond = 8879.4 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 261- 270, 84.38%]: SamplesSeen = 640; TrainLossPerSample = 2.17342812; EvalErr[0]PerSample = 0.56718750; TotalTime = 0.0721s; SamplesPerSecond = 8875.5 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 271- 280, 87.50%]: SamplesSeen = 640; TrainLossPerSample = 2.31290374; EvalErr[0]PerSample = 0.62968750; TotalTime = 0.0720s; SamplesPerSecond = 8894.9 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 281- 290, 90.63%]: SamplesSeen = 640; TrainLossPerSample = 2.26008782; EvalErr[0]PerSample = 0.60312500; TotalTime = 0.0731s; SamplesPerSecond = 8749.4 +MPI Rank 1: Epoch[ 1 of 
3]-Minibatch[ 291- 300, 93.75%]: SamplesSeen = 640; TrainLossPerSample = 2.15763314; EvalErr[0]PerSample = 0.57968750; TotalTime = 0.0723s; SamplesPerSecond = 8853.1 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 301- 310, 96.88%]: SamplesSeen = 640; TrainLossPerSample = 2.23496000; EvalErr[0]PerSample = 0.59531250; TotalTime = 0.0721s; SamplesPerSecond = 8874.5 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 311- 320, 100.00%]: SamplesSeen = 640; TrainLossPerSample = 2.25712791; EvalErr[0]PerSample = 0.61406250; TotalTime = 0.0723s; SamplesPerSecond = 8847.5 +MPI Rank 1: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.000912; EvalErrPerSample = 0.72744141; AvgLearningRatePerSample = 0.015625; EpochTime=2.33402 +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. +MPI Rank 1: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 1: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms +MPI Rank 1: Final Results: Minibatch[1-1298]: Samples Seen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 2.1123249 Perplexity = 8.2674399 EvalErrorPrediction: ErrorPrediction/Sample = 0.56936785 +MPI Rank 1: Finished Epoch[ 1 of 3]: [Validation Set] TrainLossPerSample = 2.1123249; EvalErrPerSample = 0.56936785 +MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 momentum as time constant = 607.5 samples +MPI Rank 1: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 1 of 2, with 1 datapasses +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. 
+MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 1- 10, 12.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.09679725; EvalErr[0]PerSample = 0.56328125; TotalTime = 0.0851s; SamplesPerSecond = 30067.4 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 11- 20, 25.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.99204361; EvalErr[0]PerSample = 0.54648438; TotalTime = 0.0804s; SamplesPerSecond = 31827.3 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 21- 30, 37.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.99681229; EvalErr[0]PerSample = 0.54882813; TotalTime = 0.0811s; SamplesPerSecond = 31565.6 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 31- 40, 50.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.90894475; EvalErr[0]PerSample = 0.52929688; TotalTime = 0.0810s; SamplesPerSecond = 31613.5 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 41- 50, 62.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.89584567; EvalErr[0]PerSample = 0.52500000; TotalTime = 0.0812s; SamplesPerSecond = 31537.6 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 51- 60, 75.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.90450680; EvalErr[0]PerSample = 0.54062500; TotalTime = 0.0820s; SamplesPerSecond = 31226.4 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 61- 70, 87.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.91141823; EvalErr[0]PerSample = 0.53203125; TotalTime = 0.0811s; SamplesPerSecond = 31552.4 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 71- 80, 100.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.87590201; EvalErr[0]PerSample = 0.52460938; TotalTime = 0.0827s; SamplesPerSecond = 30962.7 +MPI Rank 1: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9477838; EvalErrPerSample = 0.53876953; AvgLearningRatePerSample = 0.001953125; EpochTime=0.663724 +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 1: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 1: Final Results: Minibatch[1-325]: Samples Seen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 1.8690699 Perplexity = 6.4822646 EvalErrorPrediction: ErrorPrediction/Sample = 0.52003612 +MPI Rank 1: Finished Epoch[ 2 of 3]: [Validation Set] TrainLossPerSample = 1.8690699; EvalErrPerSample = 0.52003612 +MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000098 effective momentum = 0.656119 momentum as time constant = 2429.9 samples +MPI Rank 1: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 1 of 2, with 1 datapasses +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 2, NumGradientBits = 64), distributed reading is ENABLED. +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 1- 10, 50.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.87890619; EvalErr[0]PerSample = 0.52246094; TotalTime = 0.1466s; SamplesPerSecond = 69830.9 +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 11- 20, 100.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.88442887; EvalErr[0]PerSample = 0.51699219; TotalTime = 0.1342s; SamplesPerSecond = 76320.5 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8816675; EvalErrPerSample = 0.51972656; AvgLearningRatePerSample = 9.7656251e-005; EpochTime=0.289375 +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 1: minibatchiterator: epoch 0: frames [0..83050] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses +MPI Rank 1: Final Results: Minibatch[1-82]: Samples Seen = 83050 CrossEntropyWithSoftmax: CrossEntropyWithSoftmax/Sample = 1.8458415 Perplexity = 6.3334268 EvalErrorPrediction: ErrorPrediction/Sample = 0.50965683 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Validation Set] TrainLossPerSample = 1.8458415; EvalErrPerSample = 0.50965683 +MPI Rank 1: CNTKCommandTrainEnd: speechTrain +MPI Rank 1: COMPLETED +MPI Rank 1: ~MPIWrapper diff --git a/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/cntkcv.cntk b/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/cntkcv.cntk new file mode 100644 index 000000000000..85bafee4fcdf --- /dev/null +++ b/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/cntkcv.cntk @@ -0,0 +1,120 @@ +precision = "double" +command = speechTrain +deviceId = $DeviceId$ + +parallelTrain = true + +speechTrain = [ + action = "train" + modelPath = "$RunDir$/models/cntkSpeech.dnn" + deviceId = $DeviceId$ + traceLevel = 1 + + SimpleNetworkBuilder = [ + layerSizes = 363:512:512:132 + trainingCriterion = "CrossEntropyWithSoftmax" + evalCriterion = "ErrorPrediction" + layerTypes = "Sigmoid" + initValueScale = 1.0 + applyMeanVarNorm = true + uniformInit = true + needPrior = true + ] + + ExperimentalNetworkBuilder = [ // the same as above but with BS. 
Not active; activate by commenting out the SimpleNetworkBuilder entry above + layerSizes = 363:512:512:132 + trainingCriterion = 'CE' + evalCriterion = 'Err' + + applyMeanVarNorm = true + + L = Length(layerSizes)-1 // number of model layers + features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label') + featNorm = if applyMeanVarNorm + then MeanVarNorm(features) + else features + layers[layer:1..L-1] = if layer > 1 + then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1]) + else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1]) + outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1]) + outZ = outLayer.z // + PastValue(layerSizes[L], 1, outLayer.z) + CE = if trainingCriterion == 'CE' + then CrossEntropyWithSoftmax(labels, outZ, tag='criterion') + else Fail('unknown trainingCriterion ' + trainingCriterion) + Err = if evalCriterion == 'Err' then + ErrorPrediction(labels, outZ, tag='eval') + else Fail('unknown evalCriterion ' + evalCriterion) + logPrior = LogPrior(labels) + // TODO: how to add a tag to an infix operation? 
+ ScaledLogLikelihood = Minus (outZ, logPrior, tag='output') + ] + + SGD = [ + epochSize = 20480 + minibatchSize = 64:256:1024 + learningRatesPerMB = 1.0:0.5:0.1 + numMBsToShowResult = 10 + momentumPerMB = 0.9:0.656119 + dropoutRate = 0.0 + maxEpochs = 3 + keepCheckPointFiles = true + clippingThresholdPerSample = 1#INF + + ParallelTrain = [ + parallelizationMethod = "DataParallelSGD" + distributedMBReading = true + DataParallelSGD = [ + gradientBits = 64 + ] + ] + + AutoAdjust = [ + reduceLearnRateIfImproveLessThan = 0 + loadBestModel = true + increaseLearnRateIfImproveMoreThan = 1000000000 + learnRateDecreaseFactor = 0.5 + learnRateIncreaseFactor = 1.382 + autoAdjustLR = "adjustAfterEpoch" + ] + ] + reader = [ + readerType = "HTKMLFReader" + readMethod = "blockRandomize" + miniBatchMode = "partial" + randomize = "auto" + verbosity = 0 + + features = [ + dim = 363 + type = "real" + scpFile = "glob_0000.scp" + ] + + labels = [ + mlfFile = "$DataDir$/glob_0000.mlf" + labelMappingFile = "$DataDir$/state.list" + labelDim = 132 + labelType = "category" + ] + ] + cvreader = [ + readerType = "HTKMLFReader" + readMethod = "blockRandomize" + miniBatchMode = "partial" + randomize = "auto" + verbosity = 0 + + features = [ + dim = 363 + type = "real" + scpFile = "glob_0000.cv.scp" + ] + + labels = [ + mlfFile = "$DataDir$/glob_0000.mlf" + labelMappingFile = "$DataDir$/state.list" + labelDim = 132 + labelType = "category" + ] + ] +] diff --git a/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/run-test b/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/run-test new file mode 100755 index 000000000000..7cf51f442e93 --- /dev/null +++ b/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/run-test @@ -0,0 +1,15 @@ +#!/bin/bash + +. $TEST_ROOT_DIR/run-test-common + +ConfigDir=$TEST_DIR +LogFileName=stderr +Instances=2 +NumCPUThreads=$(threadsPerInstance $Instances) + +# cntkmpirun +cntkmpirun "-n $Instances" cntkcv.cntk "numCPUThreads=$NumCPUThreads" +ExitCode=$? 
+sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank0 +sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank1 +exit $ExitCode diff --git a/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/testcases.yml b/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/testcases.yml new file mode 100644 index 000000000000..30fe5ffb1bfc --- /dev/null +++ b/Tests/EndToEndTests/Speech/DNN/ParallelCrossValidation/testcases.yml @@ -0,0 +1,49 @@ +dataDir: ../../Data +tags: + # running on every BVT job in 'P' (Speech) leg in Debug-GPU and Release-CPU configurations: + - bvt-p ((build_sku == 'gpu') or (build_sku == '1bitsgd')) and ((flavor=='debug') ^ (device=='cpu')) + # running unconditionally on every Nightly job in 'P' leg + - nightly-p ((build_sku == 'gpu') or (build_sku == '1bitsgd')) + +testCases: + Must train epochs in exactly same order and parameters for each MPI Rank: + patterns: + - ^MPI Rank {{integer}} + - Starting Epoch {{integer}} + - learning rate per sample = {{float}} + - momentum = {{float}} + + Epochs must be finished with expected results for each MPI Rank for training: + patterns: + - ^MPI Rank {{integer}} + - Finished Epoch[{{integer}} of {{integer}}] + - Training Set + - TrainLossPerSample = {{float,tolerance=0%}} + - EvalErrPerSample = {{float,tolerance=0%}} + - AvgLearningRatePerSample = {{float,tolerance=0.001%}} + + Epochs must be finished with expected results for each MPI Rank for CV: + patterns: + - ^MPI Rank {{integer}} + - Finished Epoch[{{integer}} of {{integer}}] + - Validation Set + - TrainLossPerSample = {{float,tolerance=0%}} + - EvalErrPerSample = {{float,tolerance=0%}} + + Per-minibatch training results must match for each MPI Rank: + patterns: + - ^MPI Rank {{integer}} + - Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}} + - SamplesSeen = {{integer}} + - TrainLossPerSample = {{float,tolerance=0%}} + - EvalErr[0]PerSample = {{float,tolerance=0%}} + + DataParallelSGD training 
parameters must match for each MPI Rank: + patterns: + - ^MPI Rank {{integer}} + - Starting minibatch loop + - DataParallelSGD training + - MyRank = {{integer}} + - NumNodes = 2 + - NumGradientBits = 64 + - distributed reading is ENABLED diff --git a/Tests/EndToEndTests/Speech/Data/glob_0000.cv.scp b/Tests/EndToEndTests/Speech/Data/glob_0000.cv.scp new file mode 100644 index 000000000000..359e7451f95b --- /dev/null +++ b/Tests/EndToEndTests/Speech/Data/glob_0000.cv.scp @@ -0,0 +1,300 @@ +An4/71/71/cen5-fjam-b.mfc=Features/000000000.chunk[0,367] +An4/213/213/cen4-fsaf2-b.mfc=Features/000000000.chunk[368,805] +An4/513/513/cen7-mgah-b.mfc=Features/000000000.chunk[806,1173] +An4/614/614/cen7-mkdb-b.mfc=Features/000000000.chunk[1174,1421] +An4/507/507/cen1-mgah-b.mfc=Features/000000000.chunk[1422,1669] +An4/693/693/cen8-mmkw-b.mfc=Features/000000000.chunk[1670,2027] +An4/918/918/cen4-mtos-b.mfc=Features/000000000.chunk[2028,2335] +An4/477/477/an257-mewl-b.mfc=Features/000000000.chunk[2336,2943] +An4/454/454/an70-meht-b.mfc=Features/000000000.chunk[2944,3021] +An4/254/254/cen6-ftmj-b.mfc=Features/000000000.chunk[3022,3249] +An4/946/946/cen6-mwhw-b.mfc=Features/000000000.chunk[3250,3467] +An4/122/122/cen4-fkdo-b.mfc=Features/000000000.chunk[3468,3735] +An4/181/181/an183-fnsv-b.mfc=Features/000000000.chunk[3736,4093] +An4/93/93/cen1-fjmd-b.mfc=Features/000000000.chunk[4094,4251] +An4/128/128/an62-flmm2-b.mfc=Features/000000000.chunk[4252,4409] +An4/688/688/cen3-mmkw-b.mfc=Features/000000000.chunk[4410,4617] +An4/872/872/an332-msrb-b.mfc=Features/000000000.chunk[4618,4985] +An4/624/624/cen5-mkem-b.mfc=Features/000000000.chunk[4986,5383] +An4/146/146/cen2-flrp-b.mfc=Features/000000000.chunk[5384,5541] +An4/198/198/cen2-fplp-b.mfc=Features/000000000.chunk[5542,5969] +An4/239/239/cen4-ftal-b.mfc=Features/000000000.chunk[5970,6187] +An4/49/49/an291-ffmm-b.mfc=Features/000000000.chunk[6188,6335] +An4/306/306/cen7-mbmg-b.mfc=Features/000000000.chunk[6336,6733] 
+An4/252/252/cen4-ftmj-b.mfc=Features/000000000.chunk[6734,7171] +An4/800/800/an359-mscg2-b.mfc=Features/000000000.chunk[7172,7509] +An4/771/771/an236-mrjc2-b.mfc=Features/000000000.chunk[7510,7597] +An4/880/880/cen5-msrb-b.mfc=Features/000000000.chunk[7598,7955] +An4/795/795/cen7-mrmg-b.mfc=Features/000000000.chunk[7956,8293] +An4/821/821/cen7-msct-b.mfc=Features/000000000.chunk[8294,8611] +An4/255/255/cen7-ftmj-b.mfc=Features/000000000.chunk[8612,8949] +An4/580/580/an58-mjhp-b.mfc=Features/000000000.chunk[8950,9267] +An4/70/70/cen4-fjam-b.mfc=Features/000000000.chunk[9268,9595] +An4/528/528/an171-mjda-b.mfc=Features/000000000.chunk[9596,9963] +An4/901/901/an35-mtje-b.mfc=Features/000000000.chunk[9964,10371] +An4/776/776/cen1-mrjc2-b.mfc=Features/000000000.chunk[10372,10779] +An4/908/908/cen7-mtje-b.mfc=Features/000000000.chunk[10780,11257] +An4/603/603/an316-mkdb-b.mfc=Features/000000000.chunk[11258,11565] +An4/544/544/an20-mjdr-b.mfc=Features/000000000.chunk[11566,11853] +An4/243/243/cen8-ftal-b.mfc=Features/000000000.chunk[11854,12071] +An4/891/891/cen3-mtcv-b.mfc=Features/000000000.chunk[12072,12269] +An4/245/245/an212-ftmj-b.mfc=Features/000000000.chunk[12270,12647] +An4/156/156/an119-fmjc-b.mfc=Features/000000000.chunk[12648,13055] +An4/446/446/cen5-meab-b.mfc=Features/000000000.chunk[13056,13483] +An4/801/801/an360-mscg2-b.mfc=Features/000000000.chunk[13484,13601] +An4/538/538/cen6-mjda-b.mfc=Features/000000000.chunk[13602,13799] +An4/282/282/an1-mblw-b.mfc=Features/000000000.chunk[13800,13947] +An4/589/589/cen7-mjhp-b.mfc=Features/000000000.chunk[13948,14275] +An4/710/710/an389-mmtm-b.mfc=Features/000000000.chunk[14276,14603] +An4/638/638/cen6-mmaf-b.mfc=Features/000000000.chunk[14604,14811] +An4/874/874/an334-msrb-b.mfc=Features/000000000.chunk[14812,15029] +An4/40/40/an40-fejs-b.mfc=Features/000000000.chunk[15030,15337] +An4/176/176/cen6-fmjd-b.mfc=Features/000000000.chunk[15338,15545] +An4/732/732/cen8-mnfe-b.mfc=Features/000000000.chunk[15546,15773] 
+An4/575/575/cen6-mjgk-b.mfc=Features/000000000.chunk[15774,16191] +An4/234/234/an329-ftal-b.mfc=Features/000000000.chunk[16192,16429] +An4/497/497/cen4-mfaa-b.mfc=Features/000000000.chunk[16430,16687] +An4/619/619/an189-mkem-b.mfc=Features/000000000.chunk[16688,16785] +An4/303/303/cen4-mbmg-b.mfc=Features/000000000.chunk[16786,17093] +An4/502/502/an196-mgah-b.mfc=Features/000000000.chunk[17094,17291] +An4/436/436/cen8-mdxs-b.mfc=Features/000000000.chunk[17292,17619] +An4/889/889/cen1-mtcv-b.mfc=Features/000000000.chunk[17620,18227] +An4/697/697/an384-mmsh-b.mfc=Features/000000000.chunk[18228,18475] +An4/413/413/an108-mdxn-b.mfc=Features/000000000.chunk[18476,18643] +An4/165/165/cen8-fmjc-b.mfc=Features/000000000.chunk[18644,18901] +An4/186/186/cen3-fnsv-b.mfc=Features/000000000.chunk[18902,19149] +An4/274/274/cen1-mblb-b.mfc=Features/000000000.chunk[19150,19417] +An4/309/309/an202-mcel-b.mfc=Features/000000000.chunk[19418,19525] +An4/725/725/cen1-mnfe-b.mfc=Features/000000000.chunk[19526,19783] +An4/699/699/cen1-mmsh-b.mfc=Features/000000000.chunk[19784,20051] +An4/833/833/cen6-msjm-b.mfc=Features/000000000.chunk[20052,20299] +An4/857/857/cen4-mskh-b.mfc=Features/000000000.chunk[20300,20687] +An4/734/734/an82-mnjl-b.mfc=Features/000000000.chunk[20688,21025] +An4/340/340/cen3-mcfl-b.mfc=Features/000000000.chunk[21026,21263] +An4/36/36/an36-fejs-b.mfc=Features/000000000.chunk[21264,21641] +An4/690/690/cen5-mmkw-b.mfc=Features/000000000.chunk[21642,22069] +An4/545/545/cen1-mjdr-b.mfc=Features/000000000.chunk[22070,22347] +An4/115/115/an132-fkdo-b.mfc=Features/000000000.chunk[22348,22505] +An4/48/48/cen8-fejs-b.mfc=Features/000000000.chunk[22506,22723] +An4/518/518/an249-mjbh-b.mfc=Features/000000000.chunk[22724,22811] +An4/89/89/an6-fjmd-b.mfc=Features/000000000.chunk[22812,22889] +An4/668/668/an337-mmdg-b.mfc=Features/000000000.chunk[22890,23007] +An4/622/622/cen2-mkem-b.mfc=Features/000000000.chunk[23008,23175] 
+An4/8/8/cen5-fash-b.mfc=Features/000000000.chunk[23176,23623] +An4/601/601/cen7-mjjs2-b.mfc=Features/000000000.chunk[23624,24051] +An4/480/480/an260-mewl-b.mfc=Features/000000000.chunk[24052,24409] +An4/182/182/an184-fnsv-b.mfc=Features/000000000.chunk[24410,24497] +An4/179/179/an181-fnsv-b.mfc=Features/000000000.chunk[24498,24825] +An4/92/92/an9-fjmd-b.mfc=Features/000000000.chunk[24826,25003] +An4/164/164/cen7-fmjc-b.mfc=Features/000000000.chunk[25004,25251] +An4/16/16/cen2-fbbh-b.mfc=Features/000000000.chunk[25252,25549] +An4/657/657/an49-mmap-b.mfc=Features/000000000.chunk[25550,25867] +An4/723/723/an349-mnfe-b.mfc=Features/000000000.chunk[25868,26325] +An4/700/700/cen2-mmsh-b.mfc=Features/000000000.chunk[26326,26453] +An4/675/675/cen4-mmdg-b.mfc=Features/000000000.chunk[26454,26861] +An4/386/386/an112-mdcs2-b.mfc=Features/000000000.chunk[26862,27129] +An4/152/152/cen8-flrp-b.mfc=Features/000000000.chunk[27130,27347] +An4/740/740/cen3-mnjl-b.mfc=Features/000000000.chunk[27348,27465] +An4/370/370/cen7-mcsc-b.mfc=Features/000000000.chunk[27466,27783] +An4/683/683/an364-mmkw-b.mfc=Features/000000000.chunk[27784,27861] +An4/440/440/an139-meab-b.mfc=Features/000000000.chunk[27862,28089] +An4/789/789/cen1-mrmg-b.mfc=Features/000000000.chunk[28090,28427] +An4/611/611/cen4-mkdb-b.mfc=Features/000000000.chunk[28428,28685] +An4/10/10/an86-fbbh-b.mfc=Features/000000000.chunk[28686,29013] +An4/343/343/cen6-mcfl-b.mfc=Features/000000000.chunk[29014,29251] +An4/438/438/an137-meab-b.mfc=Features/000000000.chunk[29252,29669] +An4/456/456/cen2-meht-b.mfc=Features/000000000.chunk[29670,29817] +An4/489/489/an161-mfaa-b.mfc=Features/000000000.chunk[29818,30075] +An4/53/53/an295-ffmm-b.mfc=Features/000000000.chunk[30076,30363] +An4/702/702/cen4-mmsh-b.mfc=Features/000000000.chunk[30364,30681] +An4/777/777/cen2-mrjc2-b.mfc=Features/000000000.chunk[30682,30999] +An4/873/873/an333-msrb-b.mfc=Features/000000000.chunk[31000,31097] 
+An4/768/768/cen6-mrcb-b.mfc=Features/000000000.chunk[31098,31275] +An4/552/552/cen8-mjdr-b.mfc=Features/000000000.chunk[31276,31503] +An4/631/631/an54-mmaf-b.mfc=Features/000000000.chunk[31504,31611] +An4/476/476/an256-mewl-b.mfc=Features/000000000.chunk[31612,31689] +An4/151/151/cen7-flrp-b.mfc=Features/000000000.chunk[31690,31937] +An4/920/920/cen6-mtos-b.mfc=Features/000000000.chunk[31938,32145] +An4/358/358/cen8-mcrt-b.mfc=Features/000000000.chunk[32146,32463] +An4/177/177/cen7-fmjd-b.mfc=Features/000000000.chunk[32464,32761] +An4/635/635/cen3-mmaf-b.mfc=Features/000000000.chunk[32762,32929] +An4/719/719/cen8-mmtm-b.mfc=Features/000000000.chunk[32930,33207] +An4/750/750/cen1-mrab-b.mfc=Features/000000000.chunk[33208,33395] +An4/755/755/cen6-mrab-b.mfc=Features/000000000.chunk[33396,33573] +An4/721/721/an347-mnfe-b.mfc=Features/000000000.chunk[33574,33661] +An4/380/380/cen4-mdcs-b.mfc=Features/000000000.chunk[33662,33909] +An4/625/625/cen6-mkem-b.mfc=Features/000000000.chunk[33910,34117] +An4/106/106/cen1-fkai-b.mfc=Features/000000000.chunk[34118,34295] +An4/658/658/an50-mmap-b.mfc=Features/000000000.chunk[34296,34513] +An4/402/402/an210-mdmc-b.mfc=Features/000000000.chunk[34514,35021] +An4/192/192/an91-fplp-b.mfc=Features/000000000.chunk[35022,35469] +An4/416/416/cen1-mdxn-b.mfc=Features/000000000.chunk[35470,35757] +An4/161/161/cen4-fmjc-b.mfc=Features/000000000.chunk[35758,35965] +An4/797/797/an356-mscg2-b.mfc=Features/000000000.chunk[35966,36183] +An4/433/433/cen5-mdxs-b.mfc=Features/000000000.chunk[36184,36691] +An4/57/57/cen4-ffmm-b.mfc=Features/000000000.chunk[36692,37119] +An4/157/157/an120-fmjc-b.mfc=Features/000000000.chunk[37120,37347] +An4/272/272/an374-mblb-b.mfc=Features/000000000.chunk[37348,37575] +An4/549/549/cen5-mjdr-b.mfc=Features/000000000.chunk[37576,37903] +An4/41/41/cen1-fejs-b.mfc=Features/000000000.chunk[37904,38341] +An4/290/290/cen4-mblw-b.mfc=Features/000000000.chunk[38342,38549] 
+An4/701/701/cen3-mmsh-b.mfc=Features/000000000.chunk[38550,38677] +An4/398/398/an206-mdmc-b.mfc=Features/000000000.chunk[38678,39005] +An4/640/640/cen8-mmaf-b.mfc=Features/000000000.chunk[39006,39323] +An4/904/904/cen3-mtje-b.mfc=Features/000000000.chunk[39324,39541] +An4/686/686/cen1-mmkw-b.mfc=Features/000000000.chunk[39542,40039] +An4/97/97/cen5-fjmd-b.mfc=Features/000000000.chunk[40040,40397] +An4/259/259/an223-fwxs-b.mfc=Features/000000000.chunk[40398,40495] +An4/729/729/cen5-mnfe-b.mfc=Features/000000000.chunk[40496,41033] +An4/709/709/an388-mmtm-b.mfc=Features/000000000.chunk[41034,41131] +An4/692/692/cen7-mmkw-b.mfc=Features/000000000.chunk[41132,41759] +An4/2/2/an253-fash-b.mfc=Features/000000000.chunk[41760,41827] +An4/39/39/an39-fejs-b.mfc=Features/000000000.chunk[41828,42095] +An4/488/488/cen8-mewl-b.mfc=Features/000000000.chunk[42096,42423] +An4/411/411/an106-mdxn-b.mfc=Features/000000000.chunk[42424,42601] +An4/905/905/cen4-mtje-b.mfc=Features/000000000.chunk[42602,43069] +An4/783/783/cen8-mrjc2-b.mfc=Features/000000000.chunk[43070,43417] +An4/205/205/an296-fsaf2-b.mfc=Features/000000000.chunk[43418,43705] +An4/788/788/an285-mrmg-b.mfc=Features/000000000.chunk[43706,44053] +An4/173/173/cen3-fmjd-b.mfc=Features/000000000.chunk[44054,44251] +An4/389/389/an115-mdcs2-b.mfc=Features/000000000.chunk[44252,44579] +An4/412/412/an107-mdxn-b.mfc=Features/000000000.chunk[44580,44867] +An4/69/69/cen3-fjam-b.mfc=Features/000000000.chunk[44868,45045] +An4/84/84/cen5-fjdn-b.mfc=Features/000000000.chunk[45046,45273] +An4/826/826/an229-msjm-b.mfc=Features/000000000.chunk[45274,45361] +An4/722/722/an348-mnfe-b.mfc=Features/000000000.chunk[45362,45589] +An4/490/490/an162-mfaa-b.mfc=Features/000000000.chunk[45590,45897] +An4/335/335/an263-mcfl-b.mfc=Features/000000000.chunk[45898,46275] +An4/854/854/cen1-mskh-b.mfc=Features/000000000.chunk[46276,46503] +An4/334/334/an262-mcfl-b.mfc=Features/000000000.chunk[46504,46851] 
+An4/403/403/cen1-mdmc-b.mfc=Features/000000000.chunk[46852,47079] +An4/46/46/cen6-fejs-b.mfc=Features/000000000.chunk[47080,47277] +An4/154/154/an117-fmjc-b.mfc=Features/000000000.chunk[47278,47595] +An4/565/565/cen8-mjes-b.mfc=Features/000000000.chunk[47596,47843] +An4/251/251/cen3-ftmj-b.mfc=Features/000000000.chunk[47844,48071] +An4/139/139/an21-flrp-b.mfc=Features/000000000.chunk[48072,48479] +An4/6/6/cen2-fash-b.mfc=Features/000000000.chunk[48480,48607] +An4/76/76/an122-fjdn-b.mfc=Features/000000000.chunk[48608,48765] +An4/817/817/cen3-msct-b.mfc=Features/000000000.chunk[48766,48913] +An4/328/328/cen4-mcen-b.mfc=Features/000000000.chunk[48914,49161] +An4/293/293/cen7-mblw-b.mfc=Features/000000000.chunk[49162,49409] +An4/214/214/cen5-fsaf2-b.mfc=Features/000000000.chunk[49410,49797] +An4/91/91/an8-fjmd-b.mfc=Features/000000000.chunk[49798,49975] +An4/820/820/cen6-msct-b.mfc=Features/000000000.chunk[49976,50213] +An4/300/300/cen1-mbmg-b.mfc=Features/000000000.chunk[50214,50491] +An4/18/18/cen4-fbbh-b.mfc=Features/000000000.chunk[50492,50829] +An4/526/526/cen7-mjbh-b.mfc=Features/000000000.chunk[50830,51067] +An4/408/408/cen6-mdmc-b.mfc=Features/000000000.chunk[51068,51285] +An4/169/169/an194-fmjd-b.mfc=Features/000000000.chunk[51286,51553] +An4/939/939/an154-mwhw-b.mfc=Features/000000000.chunk[51554,51841] +An4/931/931/cen4-mtxj-b.mfc=Features/000000000.chunk[51842,52299] +An4/758/758/an101-mrcb-b.mfc=Features/000000000.chunk[52300,52647] +An4/781/781/cen6-mrjc2-b.mfc=Features/000000000.chunk[52648,52875] +An4/321/321/an127-mcen-b.mfc=Features/000000000.chunk[52876,52973] +An4/199/199/cen3-fplp-b.mfc=Features/000000000.chunk[52974,53271] +An4/494/494/cen1-mfaa-b.mfc=Features/000000000.chunk[53272,53469] +An4/560/560/cen3-mjes-b.mfc=Features/000000000.chunk[53470,53547] +An4/713/713/cen2-mmtm-b.mfc=Features/000000000.chunk[53548,53855] +An4/938/938/an153-mwhw-b.mfc=Features/000000000.chunk[53856,54143] 
+An4/163/163/cen6-fmjc-b.mfc=Features/000000000.chunk[54144,54321] +An4/338/338/cen1-mcfl-b.mfc=Features/000000000.chunk[54322,54569] +An4/775/775/an240-mrjc2-b.mfc=Features/000000000.chunk[54570,54777] +An4/264/264/cen3-fwxs-b.mfc=Features/000000000.chunk[54778,54925] +An4/224/224/cen2-fsrb-b.mfc=Features/000000000.chunk[54926,55233] +An4/166/166/an191-fmjd-b.mfc=Features/000000000.chunk[55234,55321] +An4/80/80/cen1-fjdn-b.mfc=Features/000000000.chunk[55322,55469] +An4/426/426/an28-mdxs-b.mfc=Features/000000000.chunk[55470,55577] +An4/737/737/an85-mnjl-b.mfc=Features/000000000.chunk[55578,55965] +An4/919/919/cen5-mtos-b.mfc=Features/000000000.chunk[55966,56363] +An4/102/102/an312-fkai-b.mfc=Features/000000000.chunk[56364,56751] +An4/743/743/cen7-mnjl-b.mfc=Features/000000000.chunk[56752,57129] +An4/948/948/cen8-mwhw-b.mfc=Features/000000000.chunk[57130,57347] +An4/17/17/cen3-fbbh-b.mfc=Features/000000000.chunk[57348,57575] +An4/11/11/an87-fbbh-b.mfc=Features/000000000.chunk[57576,57743] +An4/344/344/cen7-mcfl-b.mfc=Features/000000000.chunk[57744,58111] +An4/359/359/an231-mcsc-b.mfc=Features/000000000.chunk[58112,58329] +An4/203/203/cen7-fplp-b.mfc=Features/000000000.chunk[58330,58877] +An4/704/704/cen6-mmsh-b.mfc=Features/000000000.chunk[58878,59035] +An4/331/331/cen7-mcen-b.mfc=Features/000000000.chunk[59036,59323] +An4/736/736/an84-mnjl-b.mfc=Features/000000000.chunk[59324,59511] +An4/121/121/cen3-fkdo-b.mfc=Features/000000000.chunk[59512,59769] +An4/574/574/cen5-mjgk-b.mfc=Features/000000000.chunk[59770,59977] +An4/143/143/an24-flrp-b.mfc=Features/000000000.chunk[59978,60065] +An4/209/209/an300-fsaf2-b.mfc=Features/000000000.chunk[60066,60473] +An4/367/367/cen4-mcsc-b.mfc=Features/000000000.chunk[60474,60731] +An4/38/38/an38-fejs-b.mfc=Features/000000000.chunk[60732,60809] +An4/390/390/cen1-mdcs2-b.mfc=Features/000000000.chunk[60810,61057] +An4/756/756/cen7-mrab-b.mfc=Features/000000000.chunk[61058,61275] 
+An4/555/555/an158-mjes-b.mfc=Features/000000000.chunk[61276,61613] +An4/680/680/an361-mmkw-b.mfc=Features/000000000.chunk[61614,62041] +An4/578/578/an56-mjhp-b.mfc=Features/000000000.chunk[62042,62419] +An4/655/655/an47-mmap-b.mfc=Features/000000000.chunk[62420,62667] +An4/646/646/cen1-mmal-b.mfc=Features/000000000.chunk[62668,63035] +An4/720/720/an346-mnfe-b.mfc=Features/000000000.chunk[63036,63453] +An4/608/608/cen1-mkdb-b.mfc=Features/000000000.chunk[63454,63721] +An4/441/441/an140-meab-b.mfc=Features/000000000.chunk[63722,64299] +An4/356/356/cen6-mcrt-b.mfc=Features/000000000.chunk[64300,64547] +An4/926/926/an379-mtxj-b.mfc=Features/000000000.chunk[64548,64625] +An4/541/541/an16-mjdr-b.mfc=Features/000000000.chunk[64626,64893] +An4/195/195/an94-fplp-b.mfc=Features/000000000.chunk[64894,65441] +An4/591/591/an176-mjjs2-b.mfc=Features/000000000.chunk[65442,65789] +An4/9/9/cen7-fash-b.mfc=Features/000000000.chunk[65790,66037] +An4/484/484/cen4-mewl-b.mfc=Features/000000000.chunk[66038,66525] +An4/537/537/cen5-mjda-b.mfc=Features/000000000.chunk[66526,66933] +An4/242/242/cen7-ftal-b.mfc=Features/000000000.chunk[66934,67171] +An4/848/848/cen8-msjr-b.mfc=Features/000000000.chunk[67172,67409] +An4/220/220/an168-fsrb-b.mfc=Features/000000000.chunk[67410,67757] +An4/906/906/cen5-mtje-b.mfc=Features/000000000.chunk[67758,68185] +An4/444/444/cen3-meab-b.mfc=Features/000000000.chunk[68186,68373] +An4/88/88/an10-fjmd-b.mfc=Features/000000000.chunk[68374,68531] +An4/561/561/cen4-mjes-b.mfc=Features/000000000.chunk[68532,68919] +An4/728/728/cen4-mnfe-b.mfc=Features/000000000.chunk[68920,69347] +An4/784/784/an281-mrmg-b.mfc=Features/000000000.chunk[69348,69485] +An4/55/55/cen2-ffmm-b.mfc=Features/000000000.chunk[69486,69983] +An4/593/593/an178-mjjs2-b.mfc=Features/000000000.chunk[69984,70061] +An4/327/327/cen3-mcen-b.mfc=Features/000000000.chunk[70062,70309] +An4/4/4/an255-fash-b.mfc=Features/000000000.chunk[70310,70567] 
+An4/922/922/cen8-mtos-b.mfc=Features/000000000.chunk[70568,70775] +An4/229/229/cen7-fsrb-b.mfc=Features/000000000.chunk[70776,71253] +An4/297/297/an268-mbmg-b.mfc=Features/000000000.chunk[71254,71651] +An4/215/215/cen6-fsaf2-b.mfc=Features/000000000.chunk[71652,71839] +An4/567/567/an217-mjgk-b.mfc=Features/000000000.chunk[71840,71987] +An4/96/96/cen4-fjmd-b.mfc=Features/000000000.chunk[71988,72335] +An4/846/846/cen6-msjr-b.mfc=Features/000000000.chunk[72336,72543] +An4/850/850/an96-mskh-b.mfc=Features/000000000.chunk[72544,72621] +An4/492/492/an164-mfaa-b.mfc=Features/000000000.chunk[72622,72859] +An4/661/661/cen3-mmap-b.mfc=Features/000000000.chunk[72860,72987] +An4/200/200/cen4-fplp-b.mfc=Features/000000000.chunk[72988,73485] +An4/82/82/cen3-fjdn-b.mfc=Features/000000000.chunk[73486,73583] +An4/936/936/an151-mwhw-b.mfc=Features/000000000.chunk[73584,73891] +An4/60/60/cen7-ffmm-b.mfc=Features/000000000.chunk[73892,74379] +An4/183/183/an185-fnsv-b.mfc=Features/000000000.chunk[74380,74477] +An4/667/667/an336-mmdg-b.mfc=Features/000000000.chunk[74478,74785] +An4/576/576/cen7-mjgk-b.mfc=Features/000000000.chunk[74786,74993] +An4/212/212/cen3-fsaf2-b.mfc=Features/000000000.chunk[74994,75101] +An4/779/779/cen4-mrjc2-b.mfc=Features/000000000.chunk[75102,75449] +An4/418/418/cen3-mdxn-b.mfc=Features/000000000.chunk[75450,75637] +An4/636/636/cen4-mmaf-b.mfc=Features/000000000.chunk[75638,75935] +An4/257/257/an221-fwxs-b.mfc=Features/000000000.chunk[75936,76253] +An4/59/59/cen6-ffmm-b.mfc=Features/000000000.chunk[76254,76481] +An4/899/899/an33-mtje-b.mfc=Features/000000000.chunk[76482,76879] +An4/886/886/an303-mtcv-b.mfc=Features/000000000.chunk[76880,77307] +An4/932/932/cen5-mtxj-b.mfc=Features/000000000.chunk[77308,77735] +An4/336/336/an264-mcfl-b.mfc=Features/000000000.chunk[77736,77813] +An4/877/877/cen2-msrb-b.mfc=Features/000000000.chunk[77814,78051] +An4/629/629/an52-mmaf-b.mfc=Features/000000000.chunk[78052,78199] 
+An4/767/767/cen5-mrcb-b.mfc=Features/000000000.chunk[78200,78547] +An4/374/374/an243-mdcs-b.mfc=Features/000000000.chunk[78548,78635] +An4/437/437/an136-meab-b.mfc=Features/000000000.chunk[78636,79063] +An4/202/202/cen6-fplp-b.mfc=Features/000000000.chunk[79064,79451] +An4/29/29/cen2-fclc-b.mfc=Features/000000000.chunk[79452,79699] +An4/669/669/an338-mmdg-b.mfc=Features/000000000.chunk[79700,80017] +An4/216/216/cen7-fsaf2-b.mfc=Features/000000000.chunk[80018,80395] +An4/227/227/cen5-fsrb-b.mfc=Features/000000000.chunk[80396,80903] +An4/864/864/an278-msmn-b.mfc=Features/000000000.chunk[80904,81311] +An4/794/794/cen6-mrmg-b.mfc=Features/000000000.chunk[81312,81549] +An4/865/865/an279-msmn-b.mfc=Features/000000000.chunk[81550,81837] +An4/111/111/cen6-fkai-b.mfc=Features/000000000.chunk[81838,82015] +An4/774/774/an239-mrjc2-b.mfc=Features/000000000.chunk[82016,82293] +An4/831/831/cen4-msjm-b.mfc=Features/000000000.chunk[82294,82481] +An4/793/793/cen5-mrmg-b.mfc=Features/000000000.chunk[82482,83049] diff --git a/Tests/EndToEndTests/Text/SparseDSSM/baseline.cpu.txt b/Tests/EndToEndTests/Text/SparseDSSM/baseline.cpu.txt new file mode 100755 index 000000000000..ba8e90eb8797 --- /dev/null +++ b/Tests/EndToEndTests/Text/SparseDSSM/baseline.cpu.txt @@ -0,0 +1,3176 @@ +=== Running mpiexec -n 4 /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=-1 numCPUThreads=2 stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 16:31:40 + Last modified date: Thu Mar 3 16:25:02 2016 + Build type: debug + Math 
lib: acml + Build Branch: thhoens/tests + Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +------------------------------------------------------------------- +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 16:31:40 + Last modified date: Thu Mar 3 16:25:02 2016 + Build type: debug + Math lib: acml + Build Branch: thhoens/tests + Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +------------------------------------------------------------------- +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 16:31:40 + Last modified date: Thu Mar 3 16:25:02 2016 + Build type: debug + Math lib: acml + Build Branch: thhoens/tests + Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +------------------------------------------------------------------- +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 16:31:40 + Last modified date: Thu Mar 3 16:25:02 2016 + Build type: debug + Math lib: acml + Build Branch: thhoens/tests + Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +------------------------------------------------------------------- +MPIWrapper: initializing MPI +MPIWrapper: initializing MPI +MPIWrapper: initializing MPI +MPIWrapper: initializing MPI +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (1) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (3) are in (participating) +ping 
[requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (2) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (0) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 1 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 3 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 2 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 0 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +Redirecting stderr to file /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr_train.logrank0 +Redirecting stderr to file /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr_train.logrank1 +Redirecting stderr to file /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr_train.logrank2 +Redirecting stderr to file /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr_train.logrank3 +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: Build info: +MPI Rank 0: +MPI Rank 0: Built time: Mar 3 2016 16:31:40 +MPI Rank 0: Last modified date: Thu Mar 3 16:25:02 2016 +MPI Rank 0: Build type: debug +MPI Rank 0: Math lib: 
acml +MPI Rank 0: Build Branch: thhoens/tests +MPI Rank 0: Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: running on localhost at 2016/03/03 16:40:15 +MPI Rank 0: command line: +MPI Rank 0: /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=-1 numCPUThreads=2 stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: modelPath=$RunDir$/models/dssm.net +MPI Rank 0: MBSize=4096 +MPI Rank 0: LRate=0.0001 +MPI Rank 0: DeviceId=-1 +MPI Rank 0: parallelTrain=true +MPI Rank 0: command = train +MPI Rank 0: precision = float +MPI Rank 0: traceGPUMemoryAllocations=0 +MPI Rank 0: train = [ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=$DeviceId$ +MPI Rank 0: minibatchSize = $MBSize$ +MPI Rank 0: modelPath = $modelPath$ +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = $LRate$ +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: NDLNetworkBuilder = [ +MPI Rank 
0: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = $DataDir$/train.all.bin +MPI Rank 0: ] +MPI Rank 0: cvReader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = $DataDir$/train.all.bin +MPI Rank 0: ] +MPI Rank 0: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 0: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 0: DeviceId=-1 +MPI Rank 0: numCPUThreads=2 +MPI Rank 0: stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: modelPath=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: MBSize=4096 +MPI Rank 0: LRate=0.0001 +MPI Rank 0: DeviceId=-1 +MPI Rank 0: parallelTrain=true +MPI Rank 0: command = train +MPI Rank 0: precision = float +MPI Rank 0: traceGPUMemoryAllocations=0 +MPI Rank 0: train = [ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=-1 +MPI Rank 0: minibatchSize = 4096 +MPI Rank 0: modelPath = /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = 0.0001 +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD 
+MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: NDLNetworkBuilder = [ +MPI Rank 0: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: cvReader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 0: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 0: DeviceId=-1 +MPI Rank 0: numCPUThreads=2 +MPI Rank 0: stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: configparameters: dssm.cntk:command=train +MPI Rank 0: configparameters: dssm.cntk:ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 0: configparameters: dssm.cntk:currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:cvReader=[ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 
0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:DeviceId=-1 +MPI Rank 0: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 0: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 0: configparameters: dssm.cntk:modelPath=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 0: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:numCPUThreads=2 +MPI Rank 0: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 0: configparameters: dssm.cntk:precision=float +MPI Rank 0: configparameters: dssm.cntk:reader=[ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 0: configparameters: dssm.cntk:stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 0: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 0: configparameters: dssm.cntk:train=[ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=-1 +MPI Rank 0: minibatchSize = 4096 +MPI Rank 0: modelPath = /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = 0.0001 +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI 
Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: command: train +MPI Rank 0: precision = float +MPI Rank 0: Using 2 CPU threads +MPI Rank 0: CNTKModelPath: /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: CNTKCommandTrainInfo: train : 3 +MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 0: CNTKCommandTrainBegin: train +MPI Rank 0: NDLBuilder Using CPU +MPI Rank 0: +MPI Rank 0: Post-processing network... +MPI Rank 0: +MPI Rank 0: 2 roots: +MPI Rank 0: SIM = CosDistanceWithNegativeSamples +MPI Rank 0: CE = CrossEntropyWithSoftmax +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node SIM. 17 nodes to process in pass 1. 
+MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: Validating for node SIM, final verification. 
+MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: Validating 
for node CE, final verification. +MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: Post-processing network complete. +MPI Rank 0: +MPI Rank 0: SGD using CPU. +MPI Rank 0: +MPI Rank 0: Training criterion node(s): +MPI Rank 0: CE = CrossEntropyWithSoftmax +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: No PreCompute nodes found, skipping PreCompute step +MPI Rank 0: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 3.44191360; TotalTime = 8.7743s; SamplesPerSecond = 1167.0 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.68707848; TotalTime = 7.2472s; SamplesPerSecond = 1413.0 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9191892; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=20.1461 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: Final Results: Minibatch[1-25]: Samples Seen = 102399 CE: CrossEntropyWithSoftmax/Sample = 2.138047 Perplexity = 8.4828541 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Validation Set] TrainLossPerSample = 2.138047 +MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, distributed reading is ENABLED. 
+MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.99931793; TotalTime = 7.2632s; SamplesPerSecond = 1409.9 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.89505386; TotalTime = 7.2380s; SamplesPerSecond = 1414.8 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9437951; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=18.5777 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: Final Results: Minibatch[1-25]: Samples Seen = 102399 CE: CrossEntropyWithSoftmax/Sample = 1.81559 Perplexity = 6.1447002 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Validation Set] TrainLossPerSample = 1.81559 +MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.75707302; TotalTime = 7.2922s; SamplesPerSecond = 1404.2 +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.74772396; TotalTime = 7.2778s; SamplesPerSecond = 1407.0 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.7563989; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=18.6685 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: Final Results: Minibatch[1-25]: Samples Seen = 102399 CE: CrossEntropyWithSoftmax/Sample = 1.6991001 Perplexity = 5.4690235 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Validation Set] TrainLossPerSample = 1.6991001 +MPI Rank 0: CNTKCommandTrainEnd: train +MPI Rank 0: COMPLETED +MPI Rank 0: ~MPIWrapper +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: Build info: +MPI Rank 1: +MPI Rank 1: Built time: Mar 3 2016 16:31:40 +MPI Rank 1: Last modified date: Thu Mar 3 16:25:02 2016 +MPI Rank 1: Build type: debug +MPI Rank 1: Math lib: acml +MPI Rank 1: Build Branch: thhoens/tests +MPI Rank 1: Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: running on localhost at 2016/03/03 16:40:16 +MPI Rank 1: command line: +MPI Rank 1: /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=-1 numCPUThreads=2 stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: modelPath=$RunDir$/models/dssm.net +MPI Rank 1: MBSize=4096 +MPI Rank 1: LRate=0.0001 +MPI Rank 1: DeviceId=-1 +MPI Rank 1: parallelTrain=true +MPI Rank 1: command = train +MPI Rank 1: precision = float +MPI Rank 1: traceGPUMemoryAllocations=0 +MPI Rank 1: train = [ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=$DeviceId$ +MPI Rank 1: minibatchSize = $MBSize$ +MPI Rank 1: modelPath = $modelPath$ +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: 
epochSize=102399 +MPI Rank 1: learningRatesPerSample = $LRate$ +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: NDLNetworkBuilder = [ +MPI Rank 1: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = $DataDir$/train.all.bin +MPI Rank 1: ] +MPI Rank 1: cvReader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = $DataDir$/train.all.bin +MPI Rank 1: ] +MPI Rank 1: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 1: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 1: DeviceId=-1 +MPI Rank 1: numCPUThreads=2 +MPI Rank 1: stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: modelPath=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: MBSize=4096 +MPI Rank 1: LRate=0.0001 +MPI Rank 1: DeviceId=-1 +MPI Rank 1: parallelTrain=true +MPI Rank 1: command = train +MPI Rank 1: precision = float +MPI Rank 
1: traceGPUMemoryAllocations=0 +MPI Rank 1: train = [ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=-1 +MPI Rank 1: minibatchSize = 4096 +MPI Rank 1: modelPath = /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = 0.0001 +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: NDLNetworkBuilder = [ +MPI Rank 1: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: cvReader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 1: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 1: DeviceId=-1 +MPI Rank 1: numCPUThreads=2 +MPI Rank 1: stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 1: 
+MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: configparameters: dssm.cntk:command=train +MPI Rank 1: configparameters: dssm.cntk:ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 1: configparameters: dssm.cntk:currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:cvReader=[ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:DeviceId=-1 +MPI Rank 1: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 1: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 1: configparameters: dssm.cntk:modelPath=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 1: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:numCPUThreads=2 +MPI Rank 1: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 1: configparameters: dssm.cntk:precision=float +MPI Rank 1: configparameters: dssm.cntk:reader=[ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 1: configparameters: 
dssm.cntk:stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 1: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 1: configparameters: dssm.cntk:train=[ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=-1 +MPI Rank 1: minibatchSize = 4096 +MPI Rank 1: modelPath = /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = 0.0001 +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: command: train +MPI Rank 1: precision = float +MPI Rank 1: Using 2 CPU threads +MPI Rank 1: CNTKModelPath: /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: CNTKCommandTrainInfo: train : 3 +MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 1: CNTKCommandTrainBegin: train +MPI Rank 1: NDLBuilder Using CPU +MPI Rank 1: +MPI Rank 1: Post-processing network... +MPI Rank 1: +MPI Rank 1: 2 roots: +MPI Rank 1: SIM = CosDistanceWithNegativeSamples +MPI Rank 1: CE = CrossEntropyWithSoftmax +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node SIM. 
17 nodes to process in pass 1. +MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: Validating for node SIM, final verification. 
+MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: Validating 
for node CE, final verification. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: Post-processing network complete. +MPI Rank 1: +MPI Rank 1: SGD using CPU. +MPI Rank 1: +MPI Rank 1: Training criterion node(s): +MPI Rank 1: CE = CrossEntropyWithSoftmax +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. +MPI Rank 1: No PreCompute nodes found, skipping PreCompute step +MPI Rank 1: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 3.47101059; TotalTime = 8.7918s; SamplesPerSecond = 1164.7 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.69970703; TotalTime = 7.2472s; SamplesPerSecond = 1413.0 +MPI Rank 1: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9191892; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=20.1461 +MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, distributed reading is ENABLED. 
+MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.02775860; TotalTime = 7.2606s; SamplesPerSecond = 1410.4 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.90990696; TotalTime = 7.2380s; SamplesPerSecond = 1414.8 +MPI Rank 1: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9437951; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=18.5777 +MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.77812557; TotalTime = 7.2909s; SamplesPerSecond = 1404.5 +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.74922504; TotalTime = 7.2778s; SamplesPerSecond = 1407.0 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.7563989; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=18.6685 +MPI Rank 1: CNTKCommandTrainEnd: train +MPI Rank 1: COMPLETED +MPI Rank 1: ~MPIWrapper +MPI Rank 2: ------------------------------------------------------------------- +MPI Rank 2: Build info: +MPI Rank 2: +MPI Rank 2: Built time: Mar 3 2016 16:31:40 +MPI Rank 2: Last modified date: Thu Mar 3 16:25:02 2016 +MPI Rank 2: Build type: debug +MPI Rank 2: Math lib: acml +MPI Rank 2: Build Branch: thhoens/tests +MPI Rank 2: Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +MPI Rank 2: ------------------------------------------------------------------- +MPI Rank 2: running on localhost at 2016/03/03 16:40:16 +MPI Rank 2: command line: +MPI Rank 2: /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM 
RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=-1 numCPUThreads=2 stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: modelPath=$RunDir$/models/dssm.net +MPI Rank 2: MBSize=4096 +MPI Rank 2: LRate=0.0001 +MPI Rank 2: DeviceId=-1 +MPI Rank 2: parallelTrain=true +MPI Rank 2: command = train +MPI Rank 2: precision = float +MPI Rank 2: traceGPUMemoryAllocations=0 +MPI Rank 2: train = [ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=$DeviceId$ +MPI Rank 2: minibatchSize = $MBSize$ +MPI Rank 2: modelPath = $modelPath$ +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = $LRate$ +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: NDLNetworkBuilder = [ +MPI Rank 2: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 2: ] +MPI Rank 2: reader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = $DataDir$/train.all.bin +MPI Rank 2: ] +MPI Rank 2: cvReader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = $DataDir$/train.all.bin +MPI Rank 2: ] +MPI Rank 2: 
currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 2: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 2: DeviceId=-1 +MPI Rank 2: numCPUThreads=2 +MPI Rank 2: stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: modelPath=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: MBSize=4096 +MPI Rank 2: LRate=0.0001 +MPI Rank 2: DeviceId=-1 +MPI Rank 2: parallelTrain=true +MPI Rank 2: command = train +MPI Rank 2: precision = float +MPI Rank 2: traceGPUMemoryAllocations=0 +MPI Rank 2: train = [ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=-1 +MPI Rank 2: minibatchSize = 4096 +MPI Rank 2: modelPath = /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = 0.0001 +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: NDLNetworkBuilder = [ +MPI Rank 2: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 2: ] +MPI Rank 2: 
reader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: cvReader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 2: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 2: DeviceId=-1 +MPI Rank 2: numCPUThreads=2 +MPI Rank 2: stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: configparameters: dssm.cntk:command=train +MPI Rank 2: configparameters: dssm.cntk:ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 2: configparameters: dssm.cntk:currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:cvReader=[ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:DeviceId=-1 +MPI Rank 2: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 2: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 2: 
configparameters: dssm.cntk:modelPath=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 2: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:numCPUThreads=2 +MPI Rank 2: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 2: configparameters: dssm.cntk:precision=float +MPI Rank 2: configparameters: dssm.cntk:reader=[ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 2: configparameters: dssm.cntk:stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 2: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 2: configparameters: dssm.cntk:train=[ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=-1 +MPI Rank 2: minibatchSize = 4096 +MPI Rank 2: modelPath = /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = 0.0001 +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< PROCESSED 
CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: command: train +MPI Rank 2: precision = float +MPI Rank 2: Using 2 CPU threads +MPI Rank 2: CNTKModelPath: /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: CNTKCommandTrainInfo: train : 3 +MPI Rank 2: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 2: CNTKCommandTrainBegin: train +MPI Rank 2: NDLBuilder Using CPU +MPI Rank 2: +MPI Rank 2: Post-processing network... +MPI Rank 2: +MPI Rank 2: 2 roots: +MPI Rank 2: SIM = CosDistanceWithNegativeSamples +MPI Rank 2: CE = CrossEntropyWithSoftmax +MPI Rank 2: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 2: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node SIM. 17 nodes to process in pass 1. +MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI 
Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: Validating for node SIM. 9 nodes to process in pass 2. +MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = 
CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: Validating for node SIM, final verification. +MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: Validating 
for node CE, final verification. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: Post-processing network complete. +MPI Rank 2: +MPI Rank 2: SGD using CPU. +MPI Rank 2: +MPI Rank 2: Training criterion node(s): +MPI Rank 2: CE = CrossEntropyWithSoftmax +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Allocating matrices for forward and/or backward propagation. +MPI Rank 2: No PreCompute nodes found, skipping PreCompute step +MPI Rank 2: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 3.44945602; TotalTime = 8.7916s; SamplesPerSecond = 1164.7 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.72469749; TotalTime = 7.2471s; SamplesPerSecond = 1413.0 +MPI Rank 2: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9191892; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=20.1461 +MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, distributed reading is ENABLED. 
+MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.05766125; TotalTime = 7.2632s; SamplesPerSecond = 1409.9 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.92501621; TotalTime = 7.2380s; SamplesPerSecond = 1414.8 +MPI Rank 2: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9437951; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=18.5777 +MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.82333488; TotalTime = 7.2919s; SamplesPerSecond = 1404.3 +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.76783981; TotalTime = 7.2778s; SamplesPerSecond = 1407.0 +MPI Rank 2: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.7563989; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=18.6685 +MPI Rank 2: CNTKCommandTrainEnd: train +MPI Rank 2: COMPLETED +MPI Rank 2: ~MPIWrapper +MPI Rank 3: ------------------------------------------------------------------- +MPI Rank 3: Build info: +MPI Rank 3: +MPI Rank 3: Built time: Mar 3 2016 16:31:40 +MPI Rank 3: Last modified date: Thu Mar 3 16:25:02 2016 +MPI Rank 3: Build type: debug +MPI Rank 3: Math lib: acml +MPI Rank 3: Build Branch: thhoens/tests +MPI Rank 3: Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +MPI Rank 3: ------------------------------------------------------------------- +MPI Rank 3: running on localhost at 2016/03/03 16:40:17 +MPI Rank 3: command line: +MPI Rank 3: /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM 
RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=-1 numCPUThreads=2 stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: modelPath=$RunDir$/models/dssm.net +MPI Rank 3: MBSize=4096 +MPI Rank 3: LRate=0.0001 +MPI Rank 3: DeviceId=-1 +MPI Rank 3: parallelTrain=true +MPI Rank 3: command = train +MPI Rank 3: precision = float +MPI Rank 3: traceGPUMemoryAllocations=0 +MPI Rank 3: train = [ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=$DeviceId$ +MPI Rank 3: minibatchSize = $MBSize$ +MPI Rank 3: modelPath = $modelPath$ +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = $LRate$ +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: NDLNetworkBuilder = [ +MPI Rank 3: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 3: ] +MPI Rank 3: reader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = $DataDir$/train.all.bin +MPI Rank 3: ] +MPI Rank 3: cvReader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = $DataDir$/train.all.bin +MPI Rank 3: ] +MPI Rank 3: 
currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 3: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 3: DeviceId=-1 +MPI Rank 3: numCPUThreads=2 +MPI Rank 3: stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: modelPath=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: MBSize=4096 +MPI Rank 3: LRate=0.0001 +MPI Rank 3: DeviceId=-1 +MPI Rank 3: parallelTrain=true +MPI Rank 3: command = train +MPI Rank 3: precision = float +MPI Rank 3: traceGPUMemoryAllocations=0 +MPI Rank 3: train = [ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=-1 +MPI Rank 3: minibatchSize = 4096 +MPI Rank 3: modelPath = /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = 0.0001 +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: NDLNetworkBuilder = [ +MPI Rank 3: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 3: ] +MPI Rank 3: 
reader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: cvReader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 3: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 3: DeviceId=-1 +MPI Rank 3: numCPUThreads=2 +MPI Rank 3: stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: configparameters: dssm.cntk:command=train +MPI Rank 3: configparameters: dssm.cntk:ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 3: configparameters: dssm.cntk:currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:cvReader=[ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:DeviceId=-1 +MPI Rank 3: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 3: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 3: 
configparameters: dssm.cntk:modelPath=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 3: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:numCPUThreads=2 +MPI Rank 3: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 3: configparameters: dssm.cntk:precision=float +MPI Rank 3: configparameters: dssm.cntk:reader=[ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 3: configparameters: dssm.cntk:stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 3: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 3: configparameters: dssm.cntk:train=[ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=-1 +MPI Rank 3: minibatchSize = 4096 +MPI Rank 3: modelPath = /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = 0.0001 +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< PROCESSED 
CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: command: train +MPI Rank 3: precision = float +MPI Rank 3: Using 2 CPU threads +MPI Rank 3: CNTKModelPath: /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: CNTKCommandTrainInfo: train : 3 +MPI Rank 3: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 3: CNTKCommandTrainBegin: train +MPI Rank 3: NDLBuilder Using CPU +MPI Rank 3: +MPI Rank 3: Post-processing network... +MPI Rank 3: +MPI Rank 3: 2 roots: +MPI Rank 3: SIM = CosDistanceWithNegativeSamples +MPI Rank 3: CE = CrossEntropyWithSoftmax +MPI Rank 3: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 3: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node SIM. 17 nodes to process in pass 1. +MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI 
Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: Validating for node SIM. 9 nodes to process in pass 2. +MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = 
CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: Validating for node SIM, final verification. +MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: Validating 
for node CE, final verification. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: Post-processing network complete. +MPI Rank 3: +MPI Rank 3: SGD using CPU. +MPI Rank 3: +MPI Rank 3: Training criterion node(s): +MPI Rank 3: CE = CrossEntropyWithSoftmax +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Allocating matrices for forward and/or backward propagation. +MPI Rank 3: No PreCompute nodes found, skipping PreCompute step +MPI Rank 3: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 3: Starting Epoch 1: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 3: Epoch[ 1 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 3.44398956; TotalTime = 8.7827s; SamplesPerSecond = 1165.9 +MPI Rank 3: Epoch[ 1 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.71102791; TotalTime = 7.2472s; SamplesPerSecond = 1413.0 +MPI Rank 3: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9191892; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=20.1461 +MPI Rank 3: Starting Epoch 2: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, distributed reading is ENABLED. 
+MPI Rank 3: Epoch[ 2 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.00262775; TotalTime = 7.2746s; SamplesPerSecond = 1407.6 +MPI Rank 3: Epoch[ 2 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.90484028; TotalTime = 7.2380s; SamplesPerSecond = 1414.8 +MPI Rank 3: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9437951; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=18.5777 +MPI Rank 3: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 3: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.75186119; TotalTime = 7.3020s; SamplesPerSecond = 1402.4 +MPI Rank 3: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.73868065; TotalTime = 7.2778s; SamplesPerSecond = 1407.0 +MPI Rank 3: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.7563989; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=18.6685 +MPI Rank 3: CNTKCommandTrainEnd: train +MPI Rank 3: COMPLETED +MPI Rank 3: ~MPIWrapper +=== Deleting last epoch data +==== Re-running from checkpoint +=== Running mpiexec -n 4 /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=-1 numCPUThreads=2 stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 16:31:40 + Last modified date: Thu Mar 3 16:25:02 2016 + Build type: 
debug + Math lib: acml + Build Branch: thhoens/tests + Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +------------------------------------------------------------------- +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 16:31:40 + Last modified date: Thu Mar 3 16:25:02 2016 + Build type: debug + Math lib: acml + Build Branch: thhoens/tests + Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +------------------------------------------------------------------- +MPIWrapper: initializing MPI +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 16:31:40 + Last modified date: Thu Mar 3 16:25:02 2016 + Build type: debug + Math lib: acml + Build Branch: thhoens/tests + Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +------------------------------------------------------------------- +MPIWrapper: initializing MPI +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 16:31:40 + Last modified date: Thu Mar 3 16:25:02 2016 + Build type: debug + Math lib: acml + Build Branch: thhoens/tests + Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +------------------------------------------------------------------- +MPIWrapper: initializing MPI +MPIWrapper: initializing MPI +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (2) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 2 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping 
[requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (1) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 1 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (0) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 0 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (3) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 3 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +Redirecting stderr to file /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr_train.logrank0 +Redirecting stderr to file /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr_train.logrank1 +Redirecting stderr to file /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr_train.logrank2 +Redirecting stderr to file /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr_train.logrank3 +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: Build info: +MPI Rank 0: +MPI Rank 0: Built time: Mar 3 2016 16:31:40 +MPI Rank 0: Last modified date: Thu Mar 3 16:25:02 2016 +MPI Rank 0: Build type: debug +MPI Rank 0: 
Math lib: acml +MPI Rank 0: Build Branch: thhoens/tests +MPI Rank 0: Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: running on localhost at 2016/03/03 16:41:56 +MPI Rank 0: command line: +MPI Rank 0: /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=-1 numCPUThreads=2 stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: modelPath=$RunDir$/models/dssm.net +MPI Rank 0: MBSize=4096 +MPI Rank 0: LRate=0.0001 +MPI Rank 0: DeviceId=-1 +MPI Rank 0: parallelTrain=true +MPI Rank 0: command = train +MPI Rank 0: precision = float +MPI Rank 0: traceGPUMemoryAllocations=0 +MPI Rank 0: train = [ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=$DeviceId$ +MPI Rank 0: minibatchSize = $MBSize$ +MPI Rank 0: modelPath = $modelPath$ +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = $LRate$ +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: NDLNetworkBuilder = [ 
+MPI Rank 0: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = $DataDir$/train.all.bin +MPI Rank 0: ] +MPI Rank 0: cvReader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = $DataDir$/train.all.bin +MPI Rank 0: ] +MPI Rank 0: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 0: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 0: DeviceId=-1 +MPI Rank 0: numCPUThreads=2 +MPI Rank 0: stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: modelPath=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: MBSize=4096 +MPI Rank 0: LRate=0.0001 +MPI Rank 0: DeviceId=-1 +MPI Rank 0: parallelTrain=true +MPI Rank 0: command = train +MPI Rank 0: precision = float +MPI Rank 0: traceGPUMemoryAllocations=0 +MPI Rank 0: train = [ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=-1 +MPI Rank 0: minibatchSize = 4096 +MPI Rank 0: modelPath = /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = 0.0001 +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: 
parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: NDLNetworkBuilder = [ +MPI Rank 0: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: cvReader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 0: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 0: DeviceId=-1 +MPI Rank 0: numCPUThreads=2 +MPI Rank 0: stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: configparameters: dssm.cntk:command=train +MPI Rank 0: configparameters: dssm.cntk:ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 0: configparameters: dssm.cntk:currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:cvReader=[ +MPI Rank 0: 
readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:DeviceId=-1 +MPI Rank 0: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 0: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 0: configparameters: dssm.cntk:modelPath=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 0: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:numCPUThreads=2 +MPI Rank 0: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 0: configparameters: dssm.cntk:precision=float +MPI Rank 0: configparameters: dssm.cntk:reader=[ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 0: configparameters: dssm.cntk:stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 0: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 0: configparameters: dssm.cntk:train=[ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=-1 +MPI Rank 0: minibatchSize = 4096 +MPI Rank 0: modelPath = /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = 0.0001 +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: 
maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: command: train +MPI Rank 0: precision = float +MPI Rank 0: Using 2 CPU threads +MPI Rank 0: CNTKModelPath: /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: CNTKCommandTrainInfo: train : 3 +MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 0: CNTKCommandTrainBegin: train +MPI Rank 0: NDLBuilder Using CPU +MPI Rank 0: Starting from checkpoint. Load Network From File /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net.2. +MPI Rank 0: +MPI Rank 0: Post-processing network... +MPI Rank 0: +MPI Rank 0: 2 roots: +MPI Rank 0: CE = CrossEntropyWithSoftmax +MPI Rank 0: SIM = CosDistanceWithNegativeSamples +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: Validating 
for node CE, final verification. +MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node SIM. 17 nodes to process in pass 1. +MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: Validating for node SIM, final verification. 
+MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: Post-processing network complete. +MPI Rank 0: +MPI Rank 0: SGD using CPU. +MPI Rank 0: +MPI Rank 0: Training criterion node(s): +MPI Rank 0: CE = CrossEntropyWithSoftmax +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: No PreCompute nodes found, skipping PreCompute step +MPI Rank 0: Warning: checkpoint file is missing. learning parameters will be initialized from 0 +MPI Rank 0: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.77671967; TotalTime = 7.7887s; SamplesPerSecond = 1314.7 +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.79965210; TotalTime = 6.4066s; SamplesPerSecond = 1598.4 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8025137; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=18.3486 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: Final Results: Minibatch[1-25]: Samples Seen = 102399 CE: CrossEntropyWithSoftmax/Sample = 1.7629802 Perplexity = 5.8297857 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Validation Set] TrainLossPerSample = 1.7629802 +MPI Rank 0: CNTKCommandTrainEnd: train +MPI Rank 0: COMPLETED +MPI Rank 0: ~MPIWrapper +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: Build info: +MPI Rank 1: +MPI Rank 1: Built time: Mar 3 2016 16:31:40 +MPI Rank 1: Last modified date: Thu Mar 3 16:25:02 2016 +MPI Rank 1: Build type: debug +MPI Rank 1: Math lib: acml +MPI Rank 1: Build Branch: thhoens/tests +MPI Rank 1: Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: running on localhost at 2016/03/03 16:41:56 +MPI Rank 1: command line: +MPI Rank 1: /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=-1 numCPUThreads=2 stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: modelPath=$RunDir$/models/dssm.net +MPI Rank 1: MBSize=4096 +MPI Rank 1: LRate=0.0001 +MPI Rank 1: DeviceId=-1 +MPI Rank 1: parallelTrain=true +MPI Rank 1: command = train +MPI Rank 1: precision = float +MPI Rank 1: traceGPUMemoryAllocations=0 +MPI Rank 1: train = [ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=$DeviceId$ +MPI Rank 1: minibatchSize = $MBSize$ +MPI Rank 1: modelPath = $modelPath$ +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: 
epochSize=102399 +MPI Rank 1: learningRatesPerSample = $LRate$ +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: NDLNetworkBuilder = [ +MPI Rank 1: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = $DataDir$/train.all.bin +MPI Rank 1: ] +MPI Rank 1: cvReader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = $DataDir$/train.all.bin +MPI Rank 1: ] +MPI Rank 1: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 1: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 1: DeviceId=-1 +MPI Rank 1: numCPUThreads=2 +MPI Rank 1: stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: modelPath=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: MBSize=4096 +MPI Rank 1: LRate=0.0001 +MPI Rank 1: DeviceId=-1 +MPI Rank 1: parallelTrain=true +MPI Rank 1: command = train +MPI Rank 1: precision = float +MPI Rank 
1: traceGPUMemoryAllocations=0 +MPI Rank 1: train = [ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=-1 +MPI Rank 1: minibatchSize = 4096 +MPI Rank 1: modelPath = /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = 0.0001 +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: NDLNetworkBuilder = [ +MPI Rank 1: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: cvReader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 1: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 1: DeviceId=-1 +MPI Rank 1: numCPUThreads=2 +MPI Rank 1: stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 1: 
+MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: configparameters: dssm.cntk:command=train +MPI Rank 1: configparameters: dssm.cntk:ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 1: configparameters: dssm.cntk:currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:cvReader=[ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:DeviceId=-1 +MPI Rank 1: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 1: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 1: configparameters: dssm.cntk:modelPath=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 1: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:numCPUThreads=2 +MPI Rank 1: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 1: configparameters: dssm.cntk:precision=float +MPI Rank 1: configparameters: dssm.cntk:reader=[ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 1: configparameters: 
dssm.cntk:stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 1: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 1: configparameters: dssm.cntk:train=[ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=-1 +MPI Rank 1: minibatchSize = 4096 +MPI Rank 1: modelPath = /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = 0.0001 +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: command: train +MPI Rank 1: precision = float +MPI Rank 1: Using 2 CPU threads +MPI Rank 1: CNTKModelPath: /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: CNTKCommandTrainInfo: train : 3 +MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 1: CNTKCommandTrainBegin: train +MPI Rank 1: NDLBuilder Using CPU +MPI Rank 1: Starting from checkpoint. Load Network From File /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net.2. +MPI Rank 1: +MPI Rank 1: Post-processing network... 
+MPI Rank 1: +MPI Rank 1: 2 roots: +MPI Rank 1: CE = CrossEntropyWithSoftmax +MPI Rank 1: SIM = CosDistanceWithNegativeSamples +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node CE. 21 nodes to process in pass 1. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM 
= CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: Validating for node CE. 11 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: 
Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: Validating for node CE, final verification. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 
1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: 8 out of 21 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node SIM. 17 nodes to process in pass 1. +MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating 
--> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: Validating for node SIM. 9 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: Validating for node SIM, final verification. 
+MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: Post-processing network complete. +MPI Rank 1: +MPI Rank 1: SGD using CPU. +MPI Rank 1: +MPI Rank 1: Training criterion node(s): +MPI Rank 1: CE = CrossEntropyWithSoftmax +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 1: No PreCompute nodes found, skipping PreCompute step +MPI Rank 1: Warning: checkpoint file is missing. learning parameters will be initialized from 0 +MPI Rank 1: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.79816246; TotalTime = 7.7892s; SamplesPerSecond = 1314.6 +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.80574760; TotalTime = 6.4066s; SamplesPerSecond = 1598.4 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8025137; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=18.3486 +MPI Rank 1: CNTKCommandTrainEnd: train +MPI Rank 1: COMPLETED +MPI Rank 1: ~MPIWrapper +MPI Rank 2: ------------------------------------------------------------------- +MPI Rank 2: Build info: +MPI Rank 2: +MPI Rank 2: Built time: Mar 3 2016 16:31:40 +MPI Rank 2: Last modified date: Thu Mar 3 16:25:02 2016 +MPI Rank 2: Build type: debug +MPI Rank 2: Math lib: acml +MPI Rank 2: Build Branch: thhoens/tests +MPI Rank 2: Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +MPI Rank 2: ------------------------------------------------------------------- +MPI Rank 2: running on localhost at 2016/03/03 16:41:57 +MPI Rank 2: command line: +MPI Rank 2: /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=-1 numCPUThreads=2 
stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: modelPath=$RunDir$/models/dssm.net +MPI Rank 2: MBSize=4096 +MPI Rank 2: LRate=0.0001 +MPI Rank 2: DeviceId=-1 +MPI Rank 2: parallelTrain=true +MPI Rank 2: command = train +MPI Rank 2: precision = float +MPI Rank 2: traceGPUMemoryAllocations=0 +MPI Rank 2: train = [ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=$DeviceId$ +MPI Rank 2: minibatchSize = $MBSize$ +MPI Rank 2: modelPath = $modelPath$ +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = $LRate$ +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: NDLNetworkBuilder = [ +MPI Rank 2: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 2: ] +MPI Rank 2: reader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = $DataDir$/train.all.bin +MPI Rank 2: ] +MPI Rank 2: cvReader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = $DataDir$/train.all.bin +MPI Rank 2: ] +MPI Rank 2: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 2: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: 
ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 2: DeviceId=-1 +MPI Rank 2: numCPUThreads=2 +MPI Rank 2: stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: modelPath=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: MBSize=4096 +MPI Rank 2: LRate=0.0001 +MPI Rank 2: DeviceId=-1 +MPI Rank 2: parallelTrain=true +MPI Rank 2: command = train +MPI Rank 2: precision = float +MPI Rank 2: traceGPUMemoryAllocations=0 +MPI Rank 2: train = [ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=-1 +MPI Rank 2: minibatchSize = 4096 +MPI Rank 2: modelPath = /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = 0.0001 +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: NDLNetworkBuilder = [ +MPI Rank 2: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 2: ] +MPI Rank 2: reader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: cvReader 
= [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 2: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 2: DeviceId=-1 +MPI Rank 2: numCPUThreads=2 +MPI Rank 2: stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: configparameters: dssm.cntk:command=train +MPI Rank 2: configparameters: dssm.cntk:ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 2: configparameters: dssm.cntk:currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:cvReader=[ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:DeviceId=-1 +MPI Rank 2: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 2: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 2: configparameters: dssm.cntk:modelPath=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 2: networkDescription = 
/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:numCPUThreads=2 +MPI Rank 2: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 2: configparameters: dssm.cntk:precision=float +MPI Rank 2: configparameters: dssm.cntk:reader=[ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 2: configparameters: dssm.cntk:stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 2: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 2: configparameters: dssm.cntk:train=[ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=-1 +MPI Rank 2: minibatchSize = 4096 +MPI Rank 2: modelPath = /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = 0.0001 +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: command: train +MPI Rank 2: precision = float +MPI Rank 2: Using 2 CPU threads +MPI Rank 2: CNTKModelPath: 
/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: CNTKCommandTrainInfo: train : 3 +MPI Rank 2: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 2: CNTKCommandTrainBegin: train +MPI Rank 2: NDLBuilder Using CPU +MPI Rank 2: Starting from checkpoint. Load Network From File /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net.2. +MPI Rank 2: +MPI Rank 2: Post-processing network... +MPI Rank 2: +MPI Rank 2: 2 roots: +MPI Rank 2: CE = CrossEntropyWithSoftmax +MPI Rank 2: SIM = CosDistanceWithNegativeSamples +MPI Rank 2: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 2: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node CE. 21 nodes to process in pass 1. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 
[288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: Validating for node CE. 11 nodes to process in pass 2. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, 
MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: Validating for node CE, final verification. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], 
Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: 8 out of 21 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node SIM. 17 nodes to process in pass 1. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: Validating for node SIM, final verification. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: Post-processing network complete. +MPI Rank 2: +MPI Rank 2: SGD using CPU. +MPI Rank 2: +MPI Rank 2: Training criterion node(s): +MPI Rank 2: CE = CrossEntropyWithSoftmax +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 2: No PreCompute nodes found, skipping PreCompute step +MPI Rank 2: Warning: checkpoint file is missing. learning parameters will be initialized from 0 +MPI Rank 2: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.84275970; TotalTime = 7.7884s; SamplesPerSecond = 1314.8 +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.82326355; TotalTime = 6.4066s; SamplesPerSecond = 1598.4 +MPI Rank 2: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8025137; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=18.3486 +MPI Rank 2: CNTKCommandTrainEnd: train +MPI Rank 2: COMPLETED +MPI Rank 2: ~MPIWrapper +MPI Rank 3: ------------------------------------------------------------------- +MPI Rank 3: Build info: +MPI Rank 3: +MPI Rank 3: Built time: Mar 3 2016 16:31:40 +MPI Rank 3: Last modified date: Thu Mar 3 16:25:02 2016 +MPI Rank 3: Build type: debug +MPI Rank 3: Math lib: acml +MPI Rank 3: Build Branch: thhoens/tests +MPI Rank 3: Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +MPI Rank 3: ------------------------------------------------------------------- +MPI Rank 3: running on localhost at 2016/03/03 16:41:57 +MPI Rank 3: command line: +MPI Rank 3: /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=-1 numCPUThreads=2 
stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: modelPath=$RunDir$/models/dssm.net +MPI Rank 3: MBSize=4096 +MPI Rank 3: LRate=0.0001 +MPI Rank 3: DeviceId=-1 +MPI Rank 3: parallelTrain=true +MPI Rank 3: command = train +MPI Rank 3: precision = float +MPI Rank 3: traceGPUMemoryAllocations=0 +MPI Rank 3: train = [ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=$DeviceId$ +MPI Rank 3: minibatchSize = $MBSize$ +MPI Rank 3: modelPath = $modelPath$ +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = $LRate$ +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: NDLNetworkBuilder = [ +MPI Rank 3: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 3: ] +MPI Rank 3: reader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = $DataDir$/train.all.bin +MPI Rank 3: ] +MPI Rank 3: cvReader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = $DataDir$/train.all.bin +MPI Rank 3: ] +MPI Rank 3: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 3: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: 
ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 3: DeviceId=-1 +MPI Rank 3: numCPUThreads=2 +MPI Rank 3: stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: modelPath=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: MBSize=4096 +MPI Rank 3: LRate=0.0001 +MPI Rank 3: DeviceId=-1 +MPI Rank 3: parallelTrain=true +MPI Rank 3: command = train +MPI Rank 3: precision = float +MPI Rank 3: traceGPUMemoryAllocations=0 +MPI Rank 3: train = [ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=-1 +MPI Rank 3: minibatchSize = 4096 +MPI Rank 3: modelPath = /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = 0.0001 +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: NDLNetworkBuilder = [ +MPI Rank 3: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 3: ] +MPI Rank 3: reader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: cvReader 
= [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 3: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 3: DeviceId=-1 +MPI Rank 3: numCPUThreads=2 +MPI Rank 3: stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: configparameters: dssm.cntk:command=train +MPI Rank 3: configparameters: dssm.cntk:ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 3: configparameters: dssm.cntk:currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:cvReader=[ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:DeviceId=-1 +MPI Rank 3: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 3: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 3: configparameters: dssm.cntk:modelPath=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 3: networkDescription = 
/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:numCPUThreads=2 +MPI Rank 3: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 3: configparameters: dssm.cntk:precision=float +MPI Rank 3: configparameters: dssm.cntk:reader=[ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:RunDir=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu +MPI Rank 3: configparameters: dssm.cntk:stderr=/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/stderr +MPI Rank 3: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 3: configparameters: dssm.cntk:train=[ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=-1 +MPI Rank 3: minibatchSize = 4096 +MPI Rank 3: modelPath = /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = 0.0001 +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: command: train +MPI Rank 3: precision = float +MPI Rank 3: Using 2 CPU threads +MPI Rank 3: CNTKModelPath: 
/tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: CNTKCommandTrainInfo: train : 3 +MPI Rank 3: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 3: CNTKCommandTrainBegin: train +MPI Rank 3: NDLBuilder Using CPU +MPI Rank 3: Starting from checkpoint. Load Network From File /tmp/cntk-test-20160303164015.884445/Text_SparseDSSM@release_cpu/models/dssm.net.2. +MPI Rank 3: +MPI Rank 3: Post-processing network... +MPI Rank 3: +MPI Rank 3: 2 roots: +MPI Rank 3: CE = CrossEntropyWithSoftmax +MPI Rank 3: SIM = CosDistanceWithNegativeSamples +MPI Rank 3: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 3: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node CE. 21 nodes to process in pass 1. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 
[288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: Validating for node CE. 11 nodes to process in pass 2. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, 
MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: Validating for node CE, final verification. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], 
Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: 8 out of 21 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node SIM. 17 nodes to process in pass 1. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: Validating for node SIM, final verification. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: Post-processing network complete. +MPI Rank 3: +MPI Rank 3: SGD using CPU. +MPI Rank 3: +MPI Rank 3: Training criterion node(s): +MPI Rank 3: CE = CrossEntropyWithSoftmax +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 3: No PreCompute nodes found, skipping PreCompute step +MPI Rank 3: Warning: checkpoint file is missing. learning parameters will be initialized from 0 +MPI Rank 3: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 3: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 3: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.77171040; TotalTime = 7.7985s; SamplesPerSecond = 1313.1 +MPI Rank 3: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.79679737; TotalTime = 6.4066s; SamplesPerSecond = 1598.4 +MPI Rank 3: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8025137; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=18.3486 +MPI Rank 3: CNTKCommandTrainEnd: train +MPI Rank 3: COMPLETED +MPI Rank 3: ~MPIWrapper diff --git a/Tests/EndToEndTests/Text/SparseDSSM/baseline.gpu.txt b/Tests/EndToEndTests/Text/SparseDSSM/baseline.gpu.txt new file mode 100755 index 000000000000..b8d382b91410 --- /dev/null +++ b/Tests/EndToEndTests/Text/SparseDSSM/baseline.gpu.txt @@ -0,0 +1,3180 @@ +=== Running mpiexec -n 4 /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=0 numCPUThreads=2 stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 16:31:40 + Last modified date: Thu Mar 3 16:25:02 2016 + Build type: debug + Math lib: acml 
+ Build Branch: thhoens/tests + Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +------------------------------------------------------------------- +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 16:31:40 + Last modified date: Thu Mar 3 16:25:02 2016 + Build type: debug + Math lib: acml + Build Branch: thhoens/tests + Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +------------------------------------------------------------------- +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 16:31:40 + Last modified date: Thu Mar 3 16:25:02 2016 + Build type: debug + Math lib: acml + Build Branch: thhoens/tests + Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +------------------------------------------------------------------- +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 16:31:40 + Last modified date: Thu Mar 3 16:25:02 2016 + Build type: debug + Math lib: acml + Build Branch: thhoens/tests + Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +------------------------------------------------------------------- +MPIWrapper: initializing MPI +MPIWrapper: initializing MPI +MPIWrapper: initializing MPI +MPIWrapper: initializing MPI +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (1) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (2) are in (participating) +ping 
[requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (3) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 3 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (0) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 0 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 2 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 1 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +Redirecting stderr to file /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr_train.logrank0 +Redirecting stderr to file /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr_train.logrank1 +Redirecting stderr to file /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr_train.logrank2 +Redirecting stderr to file /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr_train.logrank3 +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: Build info: +MPI Rank 0: +MPI Rank 0: Built time: Mar 3 2016 16:31:40 +MPI Rank 0: Last modified date: Thu Mar 3 16:25:02 2016 +MPI Rank 0: Build type: debug +MPI Rank 0: Math lib: 
acml +MPI Rank 0: Build Branch: thhoens/tests +MPI Rank 0: Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: running on localhost at 2016/03/03 16:37:11 +MPI Rank 0: command line: +MPI Rank 0: /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=0 numCPUThreads=2 stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: modelPath=$RunDir$/models/dssm.net +MPI Rank 0: MBSize=4096 +MPI Rank 0: LRate=0.0001 +MPI Rank 0: DeviceId=-1 +MPI Rank 0: parallelTrain=true +MPI Rank 0: command = train +MPI Rank 0: precision = float +MPI Rank 0: traceGPUMemoryAllocations=0 +MPI Rank 0: train = [ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=$DeviceId$ +MPI Rank 0: minibatchSize = $MBSize$ +MPI Rank 0: modelPath = $modelPath$ +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = $LRate$ +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: NDLNetworkBuilder = [ +MPI Rank 
0: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = $DataDir$/train.all.bin +MPI Rank 0: ] +MPI Rank 0: cvReader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = $DataDir$/train.all.bin +MPI Rank 0: ] +MPI Rank 0: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 0: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 0: DeviceId=0 +MPI Rank 0: numCPUThreads=2 +MPI Rank 0: stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: modelPath=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: MBSize=4096 +MPI Rank 0: LRate=0.0001 +MPI Rank 0: DeviceId=-1 +MPI Rank 0: parallelTrain=true +MPI Rank 0: command = train +MPI Rank 0: precision = float +MPI Rank 0: traceGPUMemoryAllocations=0 +MPI Rank 0: train = [ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=0 +MPI Rank 0: minibatchSize = 4096 +MPI Rank 0: modelPath = /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = 0.0001 +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI 
Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: NDLNetworkBuilder = [ +MPI Rank 0: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: cvReader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 0: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 0: DeviceId=0 +MPI Rank 0: numCPUThreads=2 +MPI Rank 0: stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: configparameters: dssm.cntk:command=train +MPI Rank 0: configparameters: dssm.cntk:ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 0: configparameters: dssm.cntk:currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:cvReader=[ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: 
miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:DeviceId=0 +MPI Rank 0: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 0: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 0: configparameters: dssm.cntk:modelPath=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 0: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:numCPUThreads=2 +MPI Rank 0: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 0: configparameters: dssm.cntk:precision=float +MPI Rank 0: configparameters: dssm.cntk:reader=[ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 0: configparameters: dssm.cntk:stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 0: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 0: configparameters: dssm.cntk:train=[ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=0 +MPI Rank 0: minibatchSize = 4096 +MPI Rank 0: modelPath = /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = 0.0001 +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 
0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: command: train +MPI Rank 0: precision = float +MPI Rank 0: Using 2 CPU threads +MPI Rank 0: CNTKModelPath: /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: CNTKCommandTrainInfo: train : 3 +MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 0: CNTKCommandTrainBegin: train +MPI Rank 0: NDLBuilder Using GPU 0 +MPI Rank 0: SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4 +MPI Rank 0: +MPI Rank 0: Post-processing network... +MPI Rank 0: +MPI Rank 0: 2 roots: +MPI Rank 0: SIM = CosDistanceWithNegativeSamples +MPI Rank 0: CE = CrossEntropyWithSoftmax +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node SIM. 17 nodes to process in pass 1. 
+MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: Validating for node SIM, final verification. 
+MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: Validating 
for node CE, final verification. +MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: Post-processing network complete. +MPI Rank 0: +MPI Rank 0: SGD using GPU 0. +MPI Rank 0: +MPI Rank 0: Training criterion node(s): +MPI Rank 0: CE = CrossEntropyWithSoftmax +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: No PreCompute nodes found, skipping PreCompute step +MPI Rank 0: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 4.34696770; TotalTime = 5.1160s; SamplesPerSecond = 2001.6 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 3.34277306; TotalTime = 4.5393s; SamplesPerSecond = 2255.9 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.6160169; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=12.2966 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: Final Results: Minibatch[1-25]: Samples Seen = 102399 CE: CrossEntropyWithSoftmax/Sample = 2.5001548 Perplexity = 12.18438 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Validation Set] TrainLossPerSample = 2.5001548 +MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, distributed reading is ENABLED. 
+MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.30270958; TotalTime = 4.5382s; SamplesPerSecond = 2256.4 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.09883766; TotalTime = 16.8837s; SamplesPerSecond = 606.5 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 2.1757753; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=24.0634 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: Final Results: Minibatch[1-25]: Samples Seen = 102399 CE: CrossEntropyWithSoftmax/Sample = 1.9714525 Perplexity = 7.1810993 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Validation Set] TrainLossPerSample = 1.9714525 +MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.89778175; TotalTime = 4.5211s; SamplesPerSecond = 2264.9 +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.86335983; TotalTime = 4.5357s; SamplesPerSecond = 2257.7 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8856394; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=11.692 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: Final Results: Minibatch[1-25]: Samples Seen = 102399 CE: CrossEntropyWithSoftmax/Sample = 1.8091086 Perplexity = 6.1050027 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Validation Set] TrainLossPerSample = 1.8091086 +MPI Rank 0: CNTKCommandTrainEnd: train +MPI Rank 0: COMPLETED +MPI Rank 0: ~MPIWrapper +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: Build info: +MPI Rank 1: +MPI Rank 1: Built time: Mar 3 2016 16:31:40 +MPI Rank 1: Last modified date: Thu Mar 3 16:25:02 2016 +MPI Rank 1: Build type: debug +MPI Rank 1: Math lib: acml +MPI Rank 1: Build Branch: thhoens/tests +MPI Rank 1: Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: running on localhost at 2016/03/03 16:37:12 +MPI Rank 1: command line: +MPI Rank 1: /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=0 numCPUThreads=2 stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: modelPath=$RunDir$/models/dssm.net +MPI Rank 1: MBSize=4096 +MPI Rank 1: LRate=0.0001 +MPI Rank 1: DeviceId=-1 +MPI Rank 1: parallelTrain=true +MPI Rank 1: command = train +MPI Rank 1: precision = float +MPI Rank 1: traceGPUMemoryAllocations=0 +MPI Rank 1: train = [ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=$DeviceId$ +MPI Rank 1: minibatchSize = $MBSize$ +MPI Rank 1: modelPath = $modelPath$ +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: 
epochSize=102399 +MPI Rank 1: learningRatesPerSample = $LRate$ +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: NDLNetworkBuilder = [ +MPI Rank 1: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = $DataDir$/train.all.bin +MPI Rank 1: ] +MPI Rank 1: cvReader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = $DataDir$/train.all.bin +MPI Rank 1: ] +MPI Rank 1: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 1: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 1: DeviceId=0 +MPI Rank 1: numCPUThreads=2 +MPI Rank 1: stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: modelPath=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: MBSize=4096 +MPI Rank 1: LRate=0.0001 +MPI Rank 1: DeviceId=-1 +MPI Rank 1: parallelTrain=true +MPI Rank 1: command = train +MPI Rank 1: precision = float +MPI Rank 
1: traceGPUMemoryAllocations=0 +MPI Rank 1: train = [ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=0 +MPI Rank 1: minibatchSize = 4096 +MPI Rank 1: modelPath = /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = 0.0001 +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: NDLNetworkBuilder = [ +MPI Rank 1: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: cvReader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 1: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 1: DeviceId=0 +MPI Rank 1: numCPUThreads=2 +MPI Rank 1: stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 1: +MPI 
Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: configparameters: dssm.cntk:command=train +MPI Rank 1: configparameters: dssm.cntk:ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 1: configparameters: dssm.cntk:currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:cvReader=[ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:DeviceId=0 +MPI Rank 1: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 1: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 1: configparameters: dssm.cntk:modelPath=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 1: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:numCPUThreads=2 +MPI Rank 1: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 1: configparameters: dssm.cntk:precision=float +MPI Rank 1: configparameters: dssm.cntk:reader=[ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 1: configparameters: 
dssm.cntk:stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 1: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 1: configparameters: dssm.cntk:train=[ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=0 +MPI Rank 1: minibatchSize = 4096 +MPI Rank 1: modelPath = /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = 0.0001 +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: command: train +MPI Rank 1: precision = float +MPI Rank 1: Using 2 CPU threads +MPI Rank 1: CNTKModelPath: /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: CNTKCommandTrainInfo: train : 3 +MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 1: CNTKCommandTrainBegin: train +MPI Rank 1: NDLBuilder Using GPU 0 +MPI Rank 1: SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4 +MPI Rank 1: +MPI Rank 1: Post-processing network... 
+MPI Rank 1: +MPI Rank 1: 2 roots: +MPI Rank 1: SIM = CosDistanceWithNegativeSamples +MPI Rank 1: CE = CrossEntropyWithSoftmax +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node SIM. 17 nodes to process in pass 1. +MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: 
+MPI Rank 1: Validating for node SIM. 9 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: Validating for node SIM, final verification. 
+MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: Validating 
for node CE, final verification. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: Post-processing network complete. +MPI Rank 1: +MPI Rank 1: SGD using GPU 0. +MPI Rank 1: +MPI Rank 1: Training criterion node(s): +MPI Rank 1: CE = CrossEntropyWithSoftmax +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. +MPI Rank 1: No PreCompute nodes found, skipping PreCompute step +MPI Rank 1: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 4.32159615; TotalTime = 5.1161s; SamplesPerSecond = 2001.5 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 3.33525505; TotalTime = 4.5393s; SamplesPerSecond = 2255.9 +MPI Rank 1: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.6160169; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=12.2966 +MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, distributed reading is ENABLED. 
+MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.32732925; TotalTime = 4.5394s; SamplesPerSecond = 2255.8 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.11035995; TotalTime = 16.8838s; SamplesPerSecond = 606.5 +MPI Rank 1: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 2.1757753; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=24.0634 +MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.92909813; TotalTime = 4.5223s; SamplesPerSecond = 2264.3 +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.86598778; TotalTime = 4.5356s; SamplesPerSecond = 2257.7 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8856394; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=11.692 +MPI Rank 1: CNTKCommandTrainEnd: train +MPI Rank 1: COMPLETED +MPI Rank 1: ~MPIWrapper +MPI Rank 2: ------------------------------------------------------------------- +MPI Rank 2: Build info: +MPI Rank 2: +MPI Rank 2: Built time: Mar 3 2016 16:31:40 +MPI Rank 2: Last modified date: Thu Mar 3 16:25:02 2016 +MPI Rank 2: Build type: debug +MPI Rank 2: Math lib: acml +MPI Rank 2: Build Branch: thhoens/tests +MPI Rank 2: Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +MPI Rank 2: ------------------------------------------------------------------- +MPI Rank 2: running on localhost at 2016/03/03 16:37:12 +MPI Rank 2: command line: +MPI Rank 2: /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM 
RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=0 numCPUThreads=2 stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: modelPath=$RunDir$/models/dssm.net +MPI Rank 2: MBSize=4096 +MPI Rank 2: LRate=0.0001 +MPI Rank 2: DeviceId=-1 +MPI Rank 2: parallelTrain=true +MPI Rank 2: command = train +MPI Rank 2: precision = float +MPI Rank 2: traceGPUMemoryAllocations=0 +MPI Rank 2: train = [ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=$DeviceId$ +MPI Rank 2: minibatchSize = $MBSize$ +MPI Rank 2: modelPath = $modelPath$ +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = $LRate$ +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: NDLNetworkBuilder = [ +MPI Rank 2: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 2: ] +MPI Rank 2: reader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = $DataDir$/train.all.bin +MPI Rank 2: ] +MPI Rank 2: cvReader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = $DataDir$/train.all.bin +MPI Rank 2: ] +MPI Rank 2: 
currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 2: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 2: DeviceId=0 +MPI Rank 2: numCPUThreads=2 +MPI Rank 2: stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: modelPath=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: MBSize=4096 +MPI Rank 2: LRate=0.0001 +MPI Rank 2: DeviceId=-1 +MPI Rank 2: parallelTrain=true +MPI Rank 2: command = train +MPI Rank 2: precision = float +MPI Rank 2: traceGPUMemoryAllocations=0 +MPI Rank 2: train = [ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=0 +MPI Rank 2: minibatchSize = 4096 +MPI Rank 2: modelPath = /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = 0.0001 +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: NDLNetworkBuilder = [ +MPI Rank 2: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 2: ] +MPI Rank 2: 
reader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: cvReader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 2: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 2: DeviceId=0 +MPI Rank 2: numCPUThreads=2 +MPI Rank 2: stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: configparameters: dssm.cntk:command=train +MPI Rank 2: configparameters: dssm.cntk:ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 2: configparameters: dssm.cntk:currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:cvReader=[ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:DeviceId=0 +MPI Rank 2: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 2: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 2: configparameters: 
dssm.cntk:modelPath=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 2: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:numCPUThreads=2 +MPI Rank 2: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 2: configparameters: dssm.cntk:precision=float +MPI Rank 2: configparameters: dssm.cntk:reader=[ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 2: configparameters: dssm.cntk:stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 2: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 2: configparameters: dssm.cntk:train=[ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=0 +MPI Rank 2: minibatchSize = 4096 +MPI Rank 2: modelPath = /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = 0.0001 +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL 
VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: command: train +MPI Rank 2: precision = float +MPI Rank 2: Using 2 CPU threads +MPI Rank 2: CNTKModelPath: /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: CNTKCommandTrainInfo: train : 3 +MPI Rank 2: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 2: CNTKCommandTrainBegin: train +MPI Rank 2: NDLBuilder Using GPU 0 +MPI Rank 2: SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4 +MPI Rank 2: +MPI Rank 2: Post-processing network... +MPI Rank 2: +MPI Rank 2: 2 roots: +MPI Rank 2: SIM = CosDistanceWithNegativeSamples +MPI Rank 2: CE = CrossEntropyWithSoftmax +MPI Rank 2: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 2: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node SIM. 17 nodes to process in pass 1. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: Validating for node SIM, final verification. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: Validating 
for node CE, final verification. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: Post-processing network complete. +MPI Rank 2: +MPI Rank 2: SGD using GPU 0. +MPI Rank 2: +MPI Rank 2: Training criterion node(s): +MPI Rank 2: CE = CrossEntropyWithSoftmax +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Allocating matrices for forward and/or backward propagation. +MPI Rank 2: No PreCompute nodes found, skipping PreCompute step +MPI Rank 2: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 4.32837563; TotalTime = 5.1162s; SamplesPerSecond = 2001.5 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 3.35655479; TotalTime = 4.5393s; SamplesPerSecond = 2255.8 +MPI Rank 2: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.6160169; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=12.2966 +MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, distributed reading is ENABLED. 
+MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.32893581; TotalTime = 4.5352s; SamplesPerSecond = 2257.9 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.11646938; TotalTime = 16.8838s; SamplesPerSecond = 606.5 +MPI Rank 2: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 2.1757753; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=24.0634 +MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.95308418; TotalTime = 4.5216s; SamplesPerSecond = 2264.7 +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.87902641; TotalTime = 4.5357s; SamplesPerSecond = 2257.7 +MPI Rank 2: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8856394; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=11.692 +MPI Rank 2: CNTKCommandTrainEnd: train +MPI Rank 2: COMPLETED +MPI Rank 2: ~MPIWrapper +MPI Rank 3: ------------------------------------------------------------------- +MPI Rank 3: Build info: +MPI Rank 3: +MPI Rank 3: Built time: Mar 3 2016 16:31:40 +MPI Rank 3: Last modified date: Thu Mar 3 16:25:02 2016 +MPI Rank 3: Build type: debug +MPI Rank 3: Math lib: acml +MPI Rank 3: Build Branch: thhoens/tests +MPI Rank 3: Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +MPI Rank 3: ------------------------------------------------------------------- +MPI Rank 3: running on localhost at 2016/03/03 16:37:13 +MPI Rank 3: command line: +MPI Rank 3: /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM 
RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=0 numCPUThreads=2 stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: modelPath=$RunDir$/models/dssm.net +MPI Rank 3: MBSize=4096 +MPI Rank 3: LRate=0.0001 +MPI Rank 3: DeviceId=-1 +MPI Rank 3: parallelTrain=true +MPI Rank 3: command = train +MPI Rank 3: precision = float +MPI Rank 3: traceGPUMemoryAllocations=0 +MPI Rank 3: train = [ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=$DeviceId$ +MPI Rank 3: minibatchSize = $MBSize$ +MPI Rank 3: modelPath = $modelPath$ +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = $LRate$ +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: NDLNetworkBuilder = [ +MPI Rank 3: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 3: ] +MPI Rank 3: reader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = $DataDir$/train.all.bin +MPI Rank 3: ] +MPI Rank 3: cvReader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = $DataDir$/train.all.bin +MPI Rank 3: ] +MPI Rank 3: 
currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 3: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 3: DeviceId=0 +MPI Rank 3: numCPUThreads=2 +MPI Rank 3: stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: modelPath=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: MBSize=4096 +MPI Rank 3: LRate=0.0001 +MPI Rank 3: DeviceId=-1 +MPI Rank 3: parallelTrain=true +MPI Rank 3: command = train +MPI Rank 3: precision = float +MPI Rank 3: traceGPUMemoryAllocations=0 +MPI Rank 3: train = [ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=0 +MPI Rank 3: minibatchSize = 4096 +MPI Rank 3: modelPath = /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = 0.0001 +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: NDLNetworkBuilder = [ +MPI Rank 3: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 3: ] +MPI Rank 3: 
reader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: cvReader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 3: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 3: DeviceId=0 +MPI Rank 3: numCPUThreads=2 +MPI Rank 3: stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: configparameters: dssm.cntk:command=train +MPI Rank 3: configparameters: dssm.cntk:ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 3: configparameters: dssm.cntk:currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:cvReader=[ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:DeviceId=0 +MPI Rank 3: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 3: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 3: configparameters: 
dssm.cntk:modelPath=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 3: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:numCPUThreads=2 +MPI Rank 3: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 3: configparameters: dssm.cntk:precision=float +MPI Rank 3: configparameters: dssm.cntk:reader=[ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 3: configparameters: dssm.cntk:stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 3: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 3: configparameters: dssm.cntk:train=[ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=0 +MPI Rank 3: minibatchSize = 4096 +MPI Rank 3: modelPath = /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = 0.0001 +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL 
VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: command: train +MPI Rank 3: precision = float +MPI Rank 3: Using 2 CPU threads +MPI Rank 3: CNTKModelPath: /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: CNTKCommandTrainInfo: train : 3 +MPI Rank 3: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 3: CNTKCommandTrainBegin: train +MPI Rank 3: NDLBuilder Using GPU 0 +MPI Rank 3: SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4 +MPI Rank 3: +MPI Rank 3: Post-processing network... +MPI Rank 3: +MPI Rank 3: 2 roots: +MPI Rank 3: SIM = CosDistanceWithNegativeSamples +MPI Rank 3: CE = CrossEntropyWithSoftmax +MPI Rank 3: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 3: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node SIM. 17 nodes to process in pass 1. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: Validating for node SIM, final verification. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: Validating 
for node CE, final verification. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: Post-processing network complete. +MPI Rank 3: +MPI Rank 3: SGD using GPU 0. +MPI Rank 3: +MPI Rank 3: Training criterion node(s): +MPI Rank 3: CE = CrossEntropyWithSoftmax +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Allocating matrices for forward and/or backward propagation. +MPI Rank 3: No PreCompute nodes found, skipping PreCompute step +MPI Rank 3: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 3: Starting Epoch 1: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 3: Epoch[ 1 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 4.32287788; TotalTime = 5.1259s; SamplesPerSecond = 1997.7 +MPI Rank 3: Epoch[ 1 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 3.35470390; TotalTime = 4.5393s; SamplesPerSecond = 2255.9 +MPI Rank 3: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.6160169; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=12.2966 +MPI Rank 3: Starting Epoch 2: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, distributed reading is ENABLED. 
+MPI Rank 3: Epoch[ 2 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.29653873; TotalTime = 4.5477s; SamplesPerSecond = 2251.7 +MPI Rank 3: Epoch[ 2 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.11679478; TotalTime = 16.8838s; SamplesPerSecond = 606.5 +MPI Rank 3: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 2.1757753; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=24.0634 +MPI Rank 3: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 3: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.90347176; TotalTime = 4.5312s; SamplesPerSecond = 2259.9 +MPI Rank 3: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.88304176; TotalTime = 4.5357s; SamplesPerSecond = 2257.6 +MPI Rank 3: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8856394; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=11.692 +MPI Rank 3: CNTKCommandTrainEnd: train +MPI Rank 3: COMPLETED +MPI Rank 3: ~MPIWrapper +=== Deleting last epoch data +==== Re-running from checkpoint +=== Running mpiexec -n 4 /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=0 numCPUThreads=2 stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 16:31:40 + Last modified date: Thu Mar 3 16:25:02 2016 + Build type: 
debug + Math lib: acml + Build Branch: thhoens/tests + Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +------------------------------------------------------------------- +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 16:31:40 + Last modified date: Thu Mar 3 16:25:02 2016 + Build type: debug + Math lib: acml + Build Branch: thhoens/tests + Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +------------------------------------------------------------------- +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 16:31:40 + Last modified date: Thu Mar 3 16:25:02 2016 + Build type: debug + Math lib: acml + Build Branch: thhoens/tests + Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +------------------------------------------------------------------- +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 16:31:40 + Last modified date: Thu Mar 3 16:25:02 2016 + Build type: debug + Math lib: acml + Build Branch: thhoens/tests + Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +------------------------------------------------------------------- +MPIWrapper: initializing MPI +MPIWrapper: initializing MPI +MPIWrapper: initializing MPI +MPIWrapper: initializing MPI +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (1) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (3) are in 
(participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (0) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (2) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 0 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 1 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +mpihelper: we are cog 3 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 2 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +Redirecting stderr to file /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr_train.logrank0 +Redirecting stderr to file /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr_train.logrank1 +Redirecting stderr to file /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr_train.logrank2 +Redirecting stderr to file /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr_train.logrank3 +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: Build info: +MPI Rank 0: +MPI Rank 0: Built time: Mar 3 2016 16:31:40 +MPI Rank 0: Last modified date: Thu Mar 3 16:25:02 2016 +MPI Rank 0: Build type: debug 
+MPI Rank 0: Math lib: acml +MPI Rank 0: Build Branch: thhoens/tests +MPI Rank 0: Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: running on localhost at 2016/03/03 16:38:10 +MPI Rank 0: command line: +MPI Rank 0: /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=0 numCPUThreads=2 stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: modelPath=$RunDir$/models/dssm.net +MPI Rank 0: MBSize=4096 +MPI Rank 0: LRate=0.0001 +MPI Rank 0: DeviceId=-1 +MPI Rank 0: parallelTrain=true +MPI Rank 0: command = train +MPI Rank 0: precision = float +MPI Rank 0: traceGPUMemoryAllocations=0 +MPI Rank 0: train = [ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=$DeviceId$ +MPI Rank 0: minibatchSize = $MBSize$ +MPI Rank 0: modelPath = $modelPath$ +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = $LRate$ +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: 
NDLNetworkBuilder = [ +MPI Rank 0: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = $DataDir$/train.all.bin +MPI Rank 0: ] +MPI Rank 0: cvReader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = $DataDir$/train.all.bin +MPI Rank 0: ] +MPI Rank 0: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 0: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 0: DeviceId=0 +MPI Rank 0: numCPUThreads=2 +MPI Rank 0: stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: modelPath=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: MBSize=4096 +MPI Rank 0: LRate=0.0001 +MPI Rank 0: DeviceId=-1 +MPI Rank 0: parallelTrain=true +MPI Rank 0: command = train +MPI Rank 0: precision = float +MPI Rank 0: traceGPUMemoryAllocations=0 +MPI Rank 0: train = [ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=0 +MPI Rank 0: minibatchSize = 4096 +MPI Rank 0: modelPath = /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = 0.0001 +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: 
parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: NDLNetworkBuilder = [ +MPI Rank 0: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: cvReader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 0: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 0: DeviceId=0 +MPI Rank 0: numCPUThreads=2 +MPI Rank 0: stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: configparameters: dssm.cntk:command=train +MPI Rank 0: configparameters: dssm.cntk:ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 0: configparameters: dssm.cntk:currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:cvReader=[ +MPI Rank 0: 
readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:DeviceId=0 +MPI Rank 0: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 0: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 0: configparameters: dssm.cntk:modelPath=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 0: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:numCPUThreads=2 +MPI Rank 0: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 0: configparameters: dssm.cntk:precision=float +MPI Rank 0: configparameters: dssm.cntk:reader=[ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 0: configparameters: dssm.cntk:stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 0: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 0: configparameters: dssm.cntk:train=[ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=0 +MPI Rank 0: minibatchSize = 4096 +MPI Rank 0: modelPath = /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = 0.0001 +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: 
maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: command: train +MPI Rank 0: precision = float +MPI Rank 0: Using 2 CPU threads +MPI Rank 0: CNTKModelPath: /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: CNTKCommandTrainInfo: train : 3 +MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 0: CNTKCommandTrainBegin: train +MPI Rank 0: NDLBuilder Using GPU 0 +MPI Rank 0: Starting from checkpoint. Load Network From File /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net.2. +MPI Rank 0: +MPI Rank 0: Post-processing network... +MPI Rank 0: +MPI Rank 0: 2 roots: +MPI Rank 0: CE = CrossEntropyWithSoftmax +MPI Rank 0: SIM = CosDistanceWithNegativeSamples +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: Validating 
for node CE, final verification. +MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node SIM. 17 nodes to process in pass 1. +MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: Validating for node SIM, final verification. 
+MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: Post-processing network complete. +MPI Rank 0: +MPI Rank 0: SGD using GPU 0. +MPI Rank 0: +MPI Rank 0: Training criterion node(s): +MPI Rank 0: CE = CrossEntropyWithSoftmax +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: No PreCompute nodes found, skipping PreCompute step +MPI Rank 0: Warning: checkpoint file is missing. learning parameters will be initialized from 0 +MPI Rank 0: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.92903252; TotalTime = 5.0533s; SamplesPerSecond = 2026.4 +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.94077110; TotalTime = 4.5332s; SamplesPerSecond = 2258.9 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.9504802; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=12.2322 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: Final Results: Minibatch[1-25]: Samples Seen = 102399 CE: CrossEntropyWithSoftmax/Sample = 1.8982234 Perplexity = 6.6740266 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Validation Set] TrainLossPerSample = 1.8982234 +MPI Rank 0: CNTKCommandTrainEnd: train +MPI Rank 0: COMPLETED +MPI Rank 0: ~MPIWrapper +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: Build info: +MPI Rank 1: +MPI Rank 1: Built time: Mar 3 2016 16:31:40 +MPI Rank 1: Last modified date: Thu Mar 3 16:25:02 2016 +MPI Rank 1: Build type: debug +MPI Rank 1: Math lib: acml +MPI Rank 1: Build Branch: thhoens/tests +MPI Rank 1: Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: running on localhost at 2016/03/03 16:38:10 +MPI Rank 1: command line: +MPI Rank 1: /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=0 numCPUThreads=2 stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: modelPath=$RunDir$/models/dssm.net +MPI Rank 1: MBSize=4096 +MPI Rank 1: LRate=0.0001 +MPI Rank 1: DeviceId=-1 +MPI Rank 1: parallelTrain=true +MPI Rank 1: command = train +MPI Rank 1: precision = float +MPI Rank 1: traceGPUMemoryAllocations=0 +MPI Rank 1: train = [ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=$DeviceId$ +MPI Rank 1: minibatchSize = $MBSize$ +MPI Rank 1: modelPath = $modelPath$ +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: 
epochSize=102399 +MPI Rank 1: learningRatesPerSample = $LRate$ +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: NDLNetworkBuilder = [ +MPI Rank 1: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = $DataDir$/train.all.bin +MPI Rank 1: ] +MPI Rank 1: cvReader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = $DataDir$/train.all.bin +MPI Rank 1: ] +MPI Rank 1: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 1: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 1: DeviceId=0 +MPI Rank 1: numCPUThreads=2 +MPI Rank 1: stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: modelPath=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: MBSize=4096 +MPI Rank 1: LRate=0.0001 +MPI Rank 1: DeviceId=-1 +MPI Rank 1: parallelTrain=true +MPI Rank 1: command = train +MPI Rank 1: precision = float +MPI Rank 
1: traceGPUMemoryAllocations=0 +MPI Rank 1: train = [ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=0 +MPI Rank 1: minibatchSize = 4096 +MPI Rank 1: modelPath = /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = 0.0001 +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: NDLNetworkBuilder = [ +MPI Rank 1: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: cvReader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 1: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 1: DeviceId=0 +MPI Rank 1: numCPUThreads=2 +MPI Rank 1: stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 1: +MPI 
Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: configparameters: dssm.cntk:command=train +MPI Rank 1: configparameters: dssm.cntk:ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 1: configparameters: dssm.cntk:currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:cvReader=[ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:DeviceId=0 +MPI Rank 1: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 1: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 1: configparameters: dssm.cntk:modelPath=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 1: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:numCPUThreads=2 +MPI Rank 1: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 1: configparameters: dssm.cntk:precision=float +MPI Rank 1: configparameters: dssm.cntk:reader=[ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 1: configparameters: 
dssm.cntk:stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 1: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 1: configparameters: dssm.cntk:train=[ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=0 +MPI Rank 1: minibatchSize = 4096 +MPI Rank 1: modelPath = /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = 0.0001 +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: command: train +MPI Rank 1: precision = float +MPI Rank 1: Using 2 CPU threads +MPI Rank 1: CNTKModelPath: /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: CNTKCommandTrainInfo: train : 3 +MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 1: CNTKCommandTrainBegin: train +MPI Rank 1: NDLBuilder Using GPU 0 +MPI Rank 1: Starting from checkpoint. Load Network From File /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net.2. +MPI Rank 1: +MPI Rank 1: Post-processing network... 
+MPI Rank 1: +MPI Rank 1: 2 roots: +MPI Rank 1: CE = CrossEntropyWithSoftmax +MPI Rank 1: SIM = CosDistanceWithNegativeSamples +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node CE. 21 nodes to process in pass 1. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM 
= CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: Validating for node CE. 11 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: 
Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: Validating for node CE, final verification. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 
1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: 8 out of 21 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node SIM. 17 nodes to process in pass 1. +MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating 
--> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: Validating for node SIM. 9 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: Validating for node SIM, final verification. 
+MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: Post-processing network complete. +MPI Rank 1: +MPI Rank 1: SGD using GPU 0. +MPI Rank 1: +MPI Rank 1: Training criterion node(s): +MPI Rank 1: CE = CrossEntropyWithSoftmax +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 1: No PreCompute nodes found, skipping PreCompute step +MPI Rank 1: Warning: checkpoint file is missing. learning parameters will be initialized from 0 +MPI Rank 1: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.95954418; TotalTime = 5.0533s; SamplesPerSecond = 2026.4 +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.94722424; TotalTime = 4.5333s; SamplesPerSecond = 2258.9 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.9504802; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=12.2322 +MPI Rank 1: CNTKCommandTrainEnd: train +MPI Rank 1: COMPLETED +MPI Rank 1: ~MPIWrapper +MPI Rank 2: ------------------------------------------------------------------- +MPI Rank 2: Build info: +MPI Rank 2: +MPI Rank 2: Built time: Mar 3 2016 16:31:40 +MPI Rank 2: Last modified date: Thu Mar 3 16:25:02 2016 +MPI Rank 2: Build type: debug +MPI Rank 2: Math lib: acml +MPI Rank 2: Build Branch: thhoens/tests +MPI Rank 2: Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +MPI Rank 2: ------------------------------------------------------------------- +MPI Rank 2: running on localhost at 2016/03/03 16:38:11 +MPI Rank 2: command line: +MPI Rank 2: /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=0 numCPUThreads=2 
stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: modelPath=$RunDir$/models/dssm.net +MPI Rank 2: MBSize=4096 +MPI Rank 2: LRate=0.0001 +MPI Rank 2: DeviceId=-1 +MPI Rank 2: parallelTrain=true +MPI Rank 2: command = train +MPI Rank 2: precision = float +MPI Rank 2: traceGPUMemoryAllocations=0 +MPI Rank 2: train = [ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=$DeviceId$ +MPI Rank 2: minibatchSize = $MBSize$ +MPI Rank 2: modelPath = $modelPath$ +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = $LRate$ +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: NDLNetworkBuilder = [ +MPI Rank 2: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 2: ] +MPI Rank 2: reader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = $DataDir$/train.all.bin +MPI Rank 2: ] +MPI Rank 2: cvReader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = $DataDir$/train.all.bin +MPI Rank 2: ] +MPI Rank 2: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 2: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: 
ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 2: DeviceId=0 +MPI Rank 2: numCPUThreads=2 +MPI Rank 2: stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: modelPath=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: MBSize=4096 +MPI Rank 2: LRate=0.0001 +MPI Rank 2: DeviceId=-1 +MPI Rank 2: parallelTrain=true +MPI Rank 2: command = train +MPI Rank 2: precision = float +MPI Rank 2: traceGPUMemoryAllocations=0 +MPI Rank 2: train = [ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=0 +MPI Rank 2: minibatchSize = 4096 +MPI Rank 2: modelPath = /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = 0.0001 +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: NDLNetworkBuilder = [ +MPI Rank 2: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 2: ] +MPI Rank 2: reader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: cvReader = 
[ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 2: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 2: DeviceId=0 +MPI Rank 2: numCPUThreads=2 +MPI Rank 2: stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: configparameters: dssm.cntk:command=train +MPI Rank 2: configparameters: dssm.cntk:ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 2: configparameters: dssm.cntk:currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:cvReader=[ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:DeviceId=0 +MPI Rank 2: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 2: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 2: configparameters: dssm.cntk:modelPath=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 2: networkDescription = 
/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:numCPUThreads=2 +MPI Rank 2: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 2: configparameters: dssm.cntk:precision=float +MPI Rank 2: configparameters: dssm.cntk:reader=[ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 2: configparameters: dssm.cntk:stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 2: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 2: configparameters: dssm.cntk:train=[ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=0 +MPI Rank 2: minibatchSize = 4096 +MPI Rank 2: modelPath = /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = 0.0001 +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: command: train +MPI Rank 2: precision = float +MPI Rank 2: Using 2 CPU threads +MPI Rank 2: CNTKModelPath: 
/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: CNTKCommandTrainInfo: train : 3 +MPI Rank 2: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 2: CNTKCommandTrainBegin: train +MPI Rank 2: NDLBuilder Using GPU 0 +MPI Rank 2: Starting from checkpoint. Load Network From File /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net.2. +MPI Rank 2: +MPI Rank 2: Post-processing network... +MPI Rank 2: +MPI Rank 2: 2 roots: +MPI Rank 2: CE = CrossEntropyWithSoftmax +MPI Rank 2: SIM = CosDistanceWithNegativeSamples +MPI Rank 2: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 2: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node CE. 21 nodes to process in pass 1. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 
[288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: Validating for node CE. 11 nodes to process in pass 2. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, 
MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: Validating for node CE, final verification. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], 
Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: 8 out of 21 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node SIM. 17 nodes to process in pass 1. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: Validating for node SIM, final verification. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: Post-processing network complete. +MPI Rank 2: +MPI Rank 2: SGD using GPU 0. +MPI Rank 2: +MPI Rank 2: Training criterion node(s): +MPI Rank 2: CE = CrossEntropyWithSoftmax +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 2: No PreCompute nodes found, skipping PreCompute step +MPI Rank 2: Warning: checkpoint file is missing. learning parameters will be initialized from 0 +MPI Rank 2: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.98175964; TotalTime = 5.0537s; SamplesPerSecond = 2026.2 +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.95635300; TotalTime = 4.5332s; SamplesPerSecond = 2258.9 +MPI Rank 2: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.9504802; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=12.2322 +MPI Rank 2: CNTKCommandTrainEnd: train +MPI Rank 2: COMPLETED +MPI Rank 2: ~MPIWrapper +MPI Rank 3: ------------------------------------------------------------------- +MPI Rank 3: Build info: +MPI Rank 3: +MPI Rank 3: Built time: Mar 3 2016 16:31:40 +MPI Rank 3: Last modified date: Thu Mar 3 16:25:02 2016 +MPI Rank 3: Build type: debug +MPI Rank 3: Math lib: acml +MPI Rank 3: Build Branch: thhoens/tests +MPI Rank 3: Build SHA1: 4848f0e1b49ff50d6a1e3a0a84a1d559d5ebe76c +MPI Rank 3: ------------------------------------------------------------------- +MPI Rank 3: running on localhost at 2016/03/03 16:38:11 +MPI Rank 3: command line: +MPI Rank 3: /home/thhoens/cntk/build/gpu/release/bin/cntk configFile=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.cntk currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ DeviceId=0 numCPUThreads=2 
stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: modelPath=$RunDir$/models/dssm.net +MPI Rank 3: MBSize=4096 +MPI Rank 3: LRate=0.0001 +MPI Rank 3: DeviceId=-1 +MPI Rank 3: parallelTrain=true +MPI Rank 3: command = train +MPI Rank 3: precision = float +MPI Rank 3: traceGPUMemoryAllocations=0 +MPI Rank 3: train = [ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=$DeviceId$ +MPI Rank 3: minibatchSize = $MBSize$ +MPI Rank 3: modelPath = $modelPath$ +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = $LRate$ +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: NDLNetworkBuilder = [ +MPI Rank 3: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 3: ] +MPI Rank 3: reader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = $DataDir$/train.all.bin +MPI Rank 3: ] +MPI Rank 3: cvReader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = $DataDir$/train.all.bin +MPI Rank 3: ] +MPI Rank 3: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 3: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: 
ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 3: DeviceId=0 +MPI Rank 3: numCPUThreads=2 +MPI Rank 3: stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: modelPath=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: MBSize=4096 +MPI Rank 3: LRate=0.0001 +MPI Rank 3: DeviceId=-1 +MPI Rank 3: parallelTrain=true +MPI Rank 3: command = train +MPI Rank 3: precision = float +MPI Rank 3: traceGPUMemoryAllocations=0 +MPI Rank 3: train = [ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=0 +MPI Rank 3: minibatchSize = 4096 +MPI Rank 3: modelPath = /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = 0.0001 +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: NDLNetworkBuilder = [ +MPI Rank 3: networkDescription = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 3: ] +MPI Rank 3: reader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: cvReader = 
[ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 3: DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 3: DeviceId=0 +MPI Rank 3: numCPUThreads=2 +MPI Rank 3: stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: configparameters: dssm.cntk:command=train +MPI Rank 3: configparameters: dssm.cntk:ConfigDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/ +MPI Rank 3: configparameters: dssm.cntk:currentDirectory=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:cvReader=[ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:DataDir=/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:DeviceId=0 +MPI Rank 3: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 3: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 3: configparameters: dssm.cntk:modelPath=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 3: networkDescription = 
/home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM//dssm.ndl +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:numCPUThreads=2 +MPI Rank 3: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 3: configparameters: dssm.cntk:precision=float +MPI Rank 3: configparameters: dssm.cntk:reader=[ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = /home/thhoens/cntk/Tests/EndToEndTests/Text/SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:RunDir=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu +MPI Rank 3: configparameters: dssm.cntk:stderr=/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/stderr +MPI Rank 3: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 3: configparameters: dssm.cntk:train=[ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=0 +MPI Rank 3: minibatchSize = 4096 +MPI Rank 3: modelPath = /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = 0.0001 +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: command: train +MPI Rank 3: precision = float +MPI Rank 3: Using 2 CPU threads +MPI Rank 3: CNTKModelPath: 
/tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: CNTKCommandTrainInfo: train : 3 +MPI Rank 3: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 3: CNTKCommandTrainBegin: train +MPI Rank 3: NDLBuilder Using GPU 0 +MPI Rank 3: Starting from checkpoint. Load Network From File /tmp/cntk-test-20160303163710.473326/Text_SparseDSSM@release_gpu/models/dssm.net.2. +MPI Rank 3: +MPI Rank 3: Post-processing network... +MPI Rank 3: +MPI Rank 3: 2 roots: +MPI Rank 3: CE = CrossEntropyWithSoftmax +MPI Rank 3: SIM = CosDistanceWithNegativeSamples +MPI Rank 3: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 3: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node CE. 21 nodes to process in pass 1. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 
[288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: Validating for node CE. 11 nodes to process in pass 2. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, 
MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: Validating for node CE, final verification. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], 
Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: 8 out of 21 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node SIM. 17 nodes to process in pass 1. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: Validating for node SIM, final verification. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: Post-processing network complete. +MPI Rank 3: +MPI Rank 3: SGD using GPU 0. +MPI Rank 3: +MPI Rank 3: Training criterion node(s): +MPI Rank 3: CE = CrossEntropyWithSoftmax +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 3: No PreCompute nodes found, skipping PreCompute step +MPI Rank 3: Warning: checkpoint file is missing. learning parameters will be initialized from 0 +MPI Rank 3: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 3: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 3: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.93374424; TotalTime = 5.0634s; SamplesPerSecond = 2022.4 +MPI Rank 3: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.95965519; TotalTime = 4.5333s; SamplesPerSecond = 2258.9 +MPI Rank 3: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.9504802; AvgLearningRatePerSample = 9.9999997e-05; EpochTime=12.2322 +MPI Rank 3: CNTKCommandTrainEnd: train +MPI Rank 3: COMPLETED +MPI Rank 3: ~MPIWrapper diff --git a/Tests/EndToEndTests/Text/SparseDSSM/baseline.windows.cpu.txt b/Tests/EndToEndTests/Text/SparseDSSM/baseline.windows.cpu.txt new file mode 100755 index 000000000000..c0d0efb12c10 --- /dev/null +++ b/Tests/EndToEndTests/Text/SparseDSSM/baseline.windows.cpu.txt @@ -0,0 +1,3224 @@ +=== Running C:\Program Files\Microsoft MPI\Bin\/mpiexec.exe -n 4 D:\thhoens\CNTK\x64\release\cntk.exe configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=-1 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 14:41:54 + Last modified 
date: Thu Mar 3 14:28:26 2016 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 + CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda + Build Branch: HEAD + Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 + Built by thhoens on SAADSRNRDEV040 + Build Path: D:\thhoens\CNTK\Source\CNTK\ +------------------------------------------------------------------- +MPIWrapper: initializing MPI +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 14:41:54 + Last modified date: Thu Mar 3 14:28:26 2016 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 + CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda + Build Branch: HEAD + Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 + Built by thhoens on SAADSRNRDEV040 + Build Path: D:\thhoens\CNTK\Source\CNTK\ +------------------------------------------------------------------- +MPIWrapper: initializing MPI +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 14:41:54 + Last modified date: Thu Mar 3 14:28:26 2016 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 + CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda + Build Branch: HEAD + Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 + Built by thhoens on SAADSRNRDEV040 + Build Path: D:\thhoens\CNTK\Source\CNTK\ +------------------------------------------------------------------- +MPIWrapper: initializing MPI +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 14:41:54 + Last modified date: Thu Mar 3 14:28:26 2016 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 + CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda + Build Branch: HEAD + Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 + Built by thhoens on 
SAADSRNRDEV040 + Build Path: D:\thhoens\CNTK\Source\CNTK\ +------------------------------------------------------------------- +MPIWrapper: initializing MPI +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +ping [requestnodes (before change)]: all 4 nodes responded +ping [requestnodes (before change)]: all 4 nodes responded +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (2) are in (participating) +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (0) are in (participating) +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (1) are in (participating) +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (3) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 2 in a gearbox of 4 +mpihelper: we are cog 0 in a gearbox of 4 +mpihelper: we are cog 1 in a gearbox of 4 +mpihelper: we are cog 3 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +ping 
[mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +MPI Rank 0: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr_train.logrank0 +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: Build info: +MPI Rank 0: +MPI Rank 0: Built time: Mar 3 2016 14:41:54 +MPI Rank 0: Last modified date: Thu Mar 3 14:28:26 2016 +MPI Rank 0: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 0: CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 +MPI Rank 0: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 0: Build Branch: HEAD +MPI Rank 0: Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 +MPI Rank 0: Built by thhoens on SAADSRNRDEV040 +MPI Rank 0: Build Path: D:\thhoens\CNTK\Source\CNTK\ +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: running on SAADSRNRDEV040 at 2016/03/04 00:08:44 +MPI Rank 0: command line: +MPI Rank 0: D:\thhoens\CNTK\x64\release\cntk.exe configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=-1 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: modelPath=$RunDir$/models/dssm.net +MPI Rank 0: MBSize=4096 +MPI Rank 0: LRate=0.0001 +MPI Rank 0: DeviceId=-1 +MPI Rank 0: parallelTrain=true +MPI Rank 0: command = train +MPI Rank 0: precision = float +MPI Rank 0: traceGPUMemoryAllocations=0 +MPI Rank 0: train = [ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=$DeviceId$ +MPI Rank 0: 
minibatchSize = $MBSize$ +MPI Rank 0: modelPath = $modelPath$ +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = $LRate$ +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: NDLNetworkBuilder = [ +MPI Rank 0: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = $DataDir$/train.all.bin +MPI Rank 0: ] +MPI Rank 0: cvReader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = $DataDir$/train.all.bin +MPI Rank 0: ] +MPI Rank 0: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 0: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: DeviceId=-1 +MPI Rank 0: numCPUThreads=10 +MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: modelPath=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: MBSize=4096 +MPI 
Rank 0: LRate=0.0001 +MPI Rank 0: DeviceId=-1 +MPI Rank 0: parallelTrain=true +MPI Rank 0: command = train +MPI Rank 0: precision = float +MPI Rank 0: traceGPUMemoryAllocations=0 +MPI Rank 0: train = [ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=-1 +MPI Rank 0: minibatchSize = 4096 +MPI Rank 0: modelPath = C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = 0.0001 +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: NDLNetworkBuilder = [ +MPI Rank 0: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: cvReader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 0: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: 
DeviceId=-1 +MPI Rank 0: numCPUThreads=10 +MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: configparameters: dssm.cntk:command=train +MPI Rank 0: configparameters: dssm.cntk:ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:cvReader=[ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:DeviceId=-1 +MPI Rank 0: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 0: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 0: configparameters: dssm.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 0: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:numCPUThreads=10 +MPI Rank 0: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 0: configparameters: dssm.cntk:precision=float +MPI Rank 0: configparameters: dssm.cntk:reader=[ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: 
dssm.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 0: configparameters: dssm.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 0: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 0: configparameters: dssm.cntk:train=[ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=-1 +MPI Rank 0: minibatchSize = 4096 +MPI Rank 0: modelPath = C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = 0.0001 +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: command: train +MPI Rank 0: precision = float +MPI Rank 0: Using 10 CPU threads +MPI Rank 0: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: CNTKCommandTrainInfo: train : 3 +MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 0: CNTKCommandTrainBegin: train +MPI Rank 0: NDLBuilder Using CPU +MPI Rank 0: +MPI Rank 0: Post-processing network... 
+MPI Rank 0: +MPI Rank 0: 2 roots: +MPI Rank 0: SIM = CosDistanceWithNegativeSamples +MPI Rank 0: CE = CrossEntropyWithSoftmax +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node SIM. 17 nodes to process in pass 1. +MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: 
+MPI Rank 0: Validating for node SIM. 9 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: Validating for node SIM, final verification. 
+MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: Validating 
for node CE, final verification. +MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: Post-processing network complete. +MPI Rank 0: +MPI Rank 0: SGD using CPU. +MPI Rank 0: +MPI Rank 0: Training criterion node(s): +MPI Rank 0: CE = CrossEntropyWithSoftmax +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: No PreCompute nodes found, skipping PreCompute step +MPI Rank 0: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 4.41944122; TotalTime = 17.5796s; SamplesPerSecond = 582.5 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 3.38406754; TotalTime = 16.5473s; SamplesPerSecond = 618.8 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.6788689; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=43.1807 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: Final Results: Minibatch[1-25]: Samples Seen = 102399 CE: CrossEntropyWithSoftmax/Sample = 2.5110212 Perplexity = 12.317502 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Validation Set] TrainLossPerSample = 2.5110212 +MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, distributed reading is ENABLED. 
+MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.29922523; TotalTime = 15.5020s; SamplesPerSecond = 660.6 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.08742409; TotalTime = 15.8390s; SamplesPerSecond = 646.5 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 2.1805767; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=40.7397 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: Final Results: Minibatch[1-25]: Samples Seen = 102399 CE: CrossEntropyWithSoftmax/Sample = 1.9751821 Perplexity = 7.2079319 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Validation Set] TrainLossPerSample = 1.9751821 +MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.90042439; TotalTime = 15.5077s; SamplesPerSecond = 660.3 +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.85719700; TotalTime = 16.0274s; SamplesPerSecond = 638.9 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8883394; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=40.2866 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: Final Results: Minibatch[1-25]: Samples Seen = 102399 CE: CrossEntropyWithSoftmax/Sample = 1.8119593 Perplexity = 6.1224311 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Validation Set] TrainLossPerSample = 1.8119593 +MPI Rank 0: CNTKCommandTrainEnd: train +MPI Rank 0: COMPLETED +MPI Rank 0: ~MPIWrapper +MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr_train.logrank1 +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: Build info: +MPI Rank 1: +MPI Rank 1: Built time: Mar 3 2016 14:41:54 +MPI Rank 1: Last modified date: Thu Mar 3 14:28:26 2016 +MPI Rank 1: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 1: CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 +MPI Rank 1: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 1: Build Branch: HEAD +MPI Rank 1: Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 +MPI Rank 1: Built by thhoens on SAADSRNRDEV040 +MPI Rank 1: Build Path: D:\thhoens\CNTK\Source\CNTK\ +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: running on SAADSRNRDEV040 at 2016/03/04 00:08:44 +MPI Rank 1: command line: +MPI Rank 1: D:\thhoens\CNTK\x64\release\cntk.exe configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=-1 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: modelPath=$RunDir$/models/dssm.net +MPI Rank 1: MBSize=4096 +MPI Rank 1: LRate=0.0001 +MPI Rank 1: DeviceId=-1 +MPI Rank 1: parallelTrain=true 
+MPI Rank 1: command = train +MPI Rank 1: precision = float +MPI Rank 1: traceGPUMemoryAllocations=0 +MPI Rank 1: train = [ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=$DeviceId$ +MPI Rank 1: minibatchSize = $MBSize$ +MPI Rank 1: modelPath = $modelPath$ +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = $LRate$ +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: NDLNetworkBuilder = [ +MPI Rank 1: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = $DataDir$/train.all.bin +MPI Rank 1: ] +MPI Rank 1: cvReader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = $DataDir$/train.all.bin +MPI Rank 1: ] +MPI Rank 1: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 1: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: DeviceId=-1 +MPI Rank 1: numCPUThreads=10 +MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI 
Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: modelPath=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: MBSize=4096 +MPI Rank 1: LRate=0.0001 +MPI Rank 1: DeviceId=-1 +MPI Rank 1: parallelTrain=true +MPI Rank 1: command = train +MPI Rank 1: precision = float +MPI Rank 1: traceGPUMemoryAllocations=0 +MPI Rank 1: train = [ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=-1 +MPI Rank 1: minibatchSize = 4096 +MPI Rank 1: modelPath = C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = 0.0001 +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: NDLNetworkBuilder = [ +MPI Rank 1: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: cvReader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: 
RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 1: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: DeviceId=-1 +MPI Rank 1: numCPUThreads=10 +MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: configparameters: dssm.cntk:command=train +MPI Rank 1: configparameters: dssm.cntk:ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:cvReader=[ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:DeviceId=-1 +MPI Rank 1: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 1: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 1: configparameters: dssm.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 1: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:numCPUThreads=10 +MPI Rank 1: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 1: configparameters: dssm.cntk:precision=float +MPI Rank 1: configparameters: dssm.cntk:reader=[ +MPI Rank 1: readerType = 
LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 1: configparameters: dssm.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 1: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 1: configparameters: dssm.cntk:train=[ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=-1 +MPI Rank 1: minibatchSize = 4096 +MPI Rank 1: modelPath = C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = 0.0001 +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: command: train +MPI Rank 1: precision = float +MPI Rank 1: Using 10 CPU threads +MPI Rank 1: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: CNTKCommandTrainInfo: train : 3 +MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 1: CNTKCommandTrainBegin: train +MPI Rank 1: NDLBuilder Using CPU +MPI Rank 1: +MPI Rank 1: Post-processing network... 
+MPI Rank 1: +MPI Rank 1: 2 roots: +MPI Rank 1: SIM = CosDistanceWithNegativeSamples +MPI Rank 1: CE = CrossEntropyWithSoftmax +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node SIM. 17 nodes to process in pass 1. +MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: 
+MPI Rank 1: Validating for node SIM. 9 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: Validating for node SIM, final verification. 
+MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: Validating 
for node CE, final verification. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: Post-processing network complete. +MPI Rank 1: +MPI Rank 1: SGD using CPU. +MPI Rank 1: +MPI Rank 1: Training criterion node(s): +MPI Rank 1: CE = CrossEntropyWithSoftmax +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. +MPI Rank 1: No PreCompute nodes found, skipping PreCompute step +MPI Rank 1: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 4.42829742; TotalTime = 17.7166s; SamplesPerSecond = 578.0 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 3.38771057; TotalTime = 16.2755s; SamplesPerSecond = 629.2 +MPI Rank 1: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.6788689; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=43.1757 +MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, distributed reading is ENABLED. 
+MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.34065857; TotalTime = 16.1174s; SamplesPerSecond = 635.3 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.11009483; TotalTime = 15.8116s; SamplesPerSecond = 647.6 +MPI Rank 1: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 2.1805767; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=40.7355 +MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.93978729; TotalTime = 15.5855s; SamplesPerSecond = 657.0 +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.86772957; TotalTime = 16.2062s; SamplesPerSecond = 631.9 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8883394; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=40.2826 +MPI Rank 1: CNTKCommandTrainEnd: train +MPI Rank 1: COMPLETED +MPI Rank 1: ~MPIWrapper +MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr_train.logrank2 +MPI Rank 2: ------------------------------------------------------------------- +MPI Rank 2: Build info: +MPI Rank 2: +MPI Rank 2: Built time: Mar 3 2016 14:41:54 +MPI Rank 2: Last modified date: Thu Mar 3 14:28:26 2016 +MPI Rank 2: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 2: CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 +MPI Rank 2: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 2: Build Branch: HEAD +MPI Rank 2: Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 +MPI Rank 2: Built by thhoens on SAADSRNRDEV040 +MPI Rank 2: Build Path: D:\thhoens\CNTK\Source\CNTK\ +MPI Rank 2: 
------------------------------------------------------------------- +MPI Rank 2: running on SAADSRNRDEV040 at 2016/03/04 00:08:45 +MPI Rank 2: command line: +MPI Rank 2: D:\thhoens\CNTK\x64\release\cntk.exe configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=-1 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: modelPath=$RunDir$/models/dssm.net +MPI Rank 2: MBSize=4096 +MPI Rank 2: LRate=0.0001 +MPI Rank 2: DeviceId=-1 +MPI Rank 2: parallelTrain=true +MPI Rank 2: command = train +MPI Rank 2: precision = float +MPI Rank 2: traceGPUMemoryAllocations=0 +MPI Rank 2: train = [ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=$DeviceId$ +MPI Rank 2: minibatchSize = $MBSize$ +MPI Rank 2: modelPath = $modelPath$ +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = $LRate$ +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: NDLNetworkBuilder = [ +MPI Rank 2: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 2: ] +MPI Rank 2: reader = [ +MPI Rank 2: readerType = 
LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = $DataDir$/train.all.bin +MPI Rank 2: ] +MPI Rank 2: cvReader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = $DataDir$/train.all.bin +MPI Rank 2: ] +MPI Rank 2: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 2: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: DeviceId=-1 +MPI Rank 2: numCPUThreads=10 +MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: modelPath=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: MBSize=4096 +MPI Rank 2: LRate=0.0001 +MPI Rank 2: DeviceId=-1 +MPI Rank 2: parallelTrain=true +MPI Rank 2: command = train +MPI Rank 2: precision = float +MPI Rank 2: traceGPUMemoryAllocations=0 +MPI Rank 2: train = [ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=-1 +MPI Rank 2: minibatchSize = 4096 +MPI Rank 2: modelPath = C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = 0.0001 +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI 
Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: NDLNetworkBuilder = [ +MPI Rank 2: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 2: ] +MPI Rank 2: reader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: cvReader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 2: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: DeviceId=-1 +MPI Rank 2: numCPUThreads=10 +MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: configparameters: dssm.cntk:command=train +MPI Rank 2: configparameters: dssm.cntk:ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:cvReader=[ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = 
D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:DeviceId=-1 +MPI Rank 2: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 2: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 2: configparameters: dssm.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 2: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:numCPUThreads=10 +MPI Rank 2: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 2: configparameters: dssm.cntk:precision=float +MPI Rank 2: configparameters: dssm.cntk:reader=[ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 2: configparameters: dssm.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 2: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 2: configparameters: dssm.cntk:train=[ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=-1 +MPI Rank 2: minibatchSize = 4096 +MPI Rank 2: modelPath = C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = 0.0001 +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI 
Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: command: train +MPI Rank 2: precision = float +MPI Rank 2: Using 10 CPU threads +MPI Rank 2: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: CNTKCommandTrainInfo: train : 3 +MPI Rank 2: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 2: CNTKCommandTrainBegin: train +MPI Rank 2: NDLBuilder Using CPU +MPI Rank 2: +MPI Rank 2: Post-processing network... +MPI Rank 2: +MPI Rank 2: 2 roots: +MPI Rank 2: SIM = CosDistanceWithNegativeSamples +MPI Rank 2: CE = CrossEntropyWithSoftmax +MPI Rank 2: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 2: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node SIM. 17 nodes to process in pass 1. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: Validating for node SIM, final verification. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: Validating 
for node CE, final verification. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: Post-processing network complete. +MPI Rank 2: +MPI Rank 2: SGD using CPU. +MPI Rank 2: +MPI Rank 2: Training criterion node(s): +MPI Rank 2: CE = CrossEntropyWithSoftmax +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Allocating matrices for forward and/or backward propagation. +MPI Rank 2: No PreCompute nodes found, skipping PreCompute step +MPI Rank 2: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 4.44101410; TotalTime = 17.5831s; SamplesPerSecond = 582.4 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 3.39723740; TotalTime = 16.6279s; SamplesPerSecond = 615.8 +MPI Rank 2: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.6788689; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=43.1806 +MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, distributed reading is ENABLED. 
+MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.34435482; TotalTime = 16.0019s; SamplesPerSecond = 639.9 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.13005276; TotalTime = 15.8394s; SamplesPerSecond = 646.5 +MPI Rank 2: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 2.1805767; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=40.7397 +MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.95727196; TotalTime = 16.2714s; SamplesPerSecond = 629.3 +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.88702583; TotalTime = 15.9308s; SamplesPerSecond = 642.8 +MPI Rank 2: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8883394; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=40.2866 +MPI Rank 2: CNTKCommandTrainEnd: train +MPI Rank 2: COMPLETED +MPI Rank 2: ~MPIWrapper +MPI Rank 3: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr_train.logrank3 +MPI Rank 3: ------------------------------------------------------------------- +MPI Rank 3: Build info: +MPI Rank 3: +MPI Rank 3: Built time: Mar 3 2016 14:41:54 +MPI Rank 3: Last modified date: Thu Mar 3 14:28:26 2016 +MPI Rank 3: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 3: CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 +MPI Rank 3: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 3: Build Branch: HEAD +MPI Rank 3: Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 +MPI Rank 3: Built by thhoens on SAADSRNRDEV040 +MPI Rank 3: Build Path: D:\thhoens\CNTK\Source\CNTK\ +MPI Rank 3: 
------------------------------------------------------------------- +MPI Rank 3: running on SAADSRNRDEV040 at 2016/03/04 00:08:45 +MPI Rank 3: command line: +MPI Rank 3: D:\thhoens\CNTK\x64\release\cntk.exe configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=-1 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: modelPath=$RunDir$/models/dssm.net +MPI Rank 3: MBSize=4096 +MPI Rank 3: LRate=0.0001 +MPI Rank 3: DeviceId=-1 +MPI Rank 3: parallelTrain=true +MPI Rank 3: command = train +MPI Rank 3: precision = float +MPI Rank 3: traceGPUMemoryAllocations=0 +MPI Rank 3: train = [ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=$DeviceId$ +MPI Rank 3: minibatchSize = $MBSize$ +MPI Rank 3: modelPath = $modelPath$ +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = $LRate$ +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: NDLNetworkBuilder = [ +MPI Rank 3: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 3: ] +MPI Rank 3: reader = [ +MPI Rank 3: readerType = 
LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = $DataDir$/train.all.bin +MPI Rank 3: ] +MPI Rank 3: cvReader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = $DataDir$/train.all.bin +MPI Rank 3: ] +MPI Rank 3: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 3: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: DeviceId=-1 +MPI Rank 3: numCPUThreads=10 +MPI Rank 3: stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: modelPath=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: MBSize=4096 +MPI Rank 3: LRate=0.0001 +MPI Rank 3: DeviceId=-1 +MPI Rank 3: parallelTrain=true +MPI Rank 3: command = train +MPI Rank 3: precision = float +MPI Rank 3: traceGPUMemoryAllocations=0 +MPI Rank 3: train = [ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=-1 +MPI Rank 3: minibatchSize = 4096 +MPI Rank 3: modelPath = C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = 0.0001 +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI 
Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: NDLNetworkBuilder = [ +MPI Rank 3: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 3: ] +MPI Rank 3: reader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: cvReader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 3: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: DeviceId=-1 +MPI Rank 3: numCPUThreads=10 +MPI Rank 3: stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: configparameters: dssm.cntk:command=train +MPI Rank 3: configparameters: dssm.cntk:ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:cvReader=[ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = 
D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:DeviceId=-1 +MPI Rank 3: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 3: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 3: configparameters: dssm.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 3: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:numCPUThreads=10 +MPI Rank 3: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 3: configparameters: dssm.cntk:precision=float +MPI Rank 3: configparameters: dssm.cntk:reader=[ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 3: configparameters: dssm.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 3: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 3: configparameters: dssm.cntk:train=[ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=-1 +MPI Rank 3: minibatchSize = 4096 +MPI Rank 3: modelPath = C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = 0.0001 +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI 
Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: command: train +MPI Rank 3: precision = float +MPI Rank 3: Using 10 CPU threads +MPI Rank 3: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: CNTKCommandTrainInfo: train : 3 +MPI Rank 3: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 3: CNTKCommandTrainBegin: train +MPI Rank 3: NDLBuilder Using CPU +MPI Rank 3: +MPI Rank 3: Post-processing network... +MPI Rank 3: +MPI Rank 3: 2 roots: +MPI Rank 3: CE = CrossEntropyWithSoftmax +MPI Rank 3: SIM = CosDistanceWithNegativeSamples +MPI Rank 3: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 3: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: Validating 
for node CE, final verification. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node SIM. 17 nodes to process in pass 1. +MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: Validating for node SIM, final verification. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: Post-processing network complete. +MPI Rank 3: +MPI Rank 3: SGD using CPU. +MPI Rank 3: +MPI Rank 3: Training criterion node(s): +MPI Rank 3: CE = CrossEntropyWithSoftmax +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 3: No PreCompute nodes found, skipping PreCompute step +MPI Rank 3: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 3: Starting Epoch 1: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 3: Epoch[ 1 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 4.43899651; TotalTime = 17.6870s; SamplesPerSecond = 579.0 +MPI Rank 3: Epoch[ 1 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 3.38633919; TotalTime = 16.3230s; SamplesPerSecond = 627.3 +MPI Rank 3: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.6788689; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=43.1757 +MPI Rank 3: Starting Epoch 2: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 3: Epoch[ 2 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.29999485; TotalTime = 15.5869s; SamplesPerSecond = 657.0 +MPI Rank 3: Epoch[ 2 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.11324863; TotalTime = 15.8117s; SamplesPerSecond = 647.6 +MPI Rank 3: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 2.1805767; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=40.735 +MPI Rank 3: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, distributed reading is ENABLED. 
+MPI Rank 3: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.90144348; TotalTime = 15.5983s; SamplesPerSecond = 656.5 +MPI Rank 3: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.87299423; TotalTime = 16.2062s; SamplesPerSecond = 631.9 +MPI Rank 3: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8883394; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=40.2823 +MPI Rank 3: CNTKCommandTrainEnd: train +MPI Rank 3: COMPLETED +MPI Rank 3: ~MPIWrapper +=== Deleting last epoch data +==== Re-running from checkpoint +=== Running C:\Program Files\Microsoft MPI\Bin\/mpiexec.exe -n 4 D:\thhoens\CNTK\x64\release\cntk.exe configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=-1 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 14:41:54 + Last modified date: Thu Mar 3 14:28:26 2016 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 + CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda + Build Branch: HEAD + Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 + Built by thhoens on SAADSRNRDEV040 + Build Path: D:\thhoens\CNTK\Source\CNTK\ +------------------------------------------------------------------- +MPIWrapper: initializing MPI +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 14:41:54 + Last modified date: Thu Mar 3 14:28:26 2016 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + CUB_PATH: 
D:\thhoens\cub-1.4.1\cub-1.4.1 + CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda + Build Branch: HEAD + Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 + Built by thhoens on SAADSRNRDEV040 + Build Path: D:\thhoens\CNTK\Source\CNTK\ +------------------------------------------------------------------- +MPIWrapper: initializing MPI +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 14:41:54 + Last modified date: Thu Mar 3 14:28:26 2016 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 + CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda + Build Branch: HEAD + Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 + Built by thhoens on SAADSRNRDEV040 + Build Path: D:\thhoens\CNTK\Source\CNTK\ +------------------------------------------------------------------- +MPIWrapper: initializing MPI +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 14:41:54 + Last modified date: Thu Mar 3 14:28:26 2016 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 + CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda + Build Branch: HEAD + Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 + Built by thhoens on SAADSRNRDEV040 + Build Path: D:\thhoens\CNTK\Source\CNTK\ +------------------------------------------------------------------- +MPIWrapper: initializing MPI +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +ping [requestnodes (before change)]: all 4 nodes responded +ping [requestnodes (before change)]: all 4 nodes responded +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 
4 out of 4 MPI nodes (4 requested); we (3) are in (participating) +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (1) are in (participating) +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (2) are in (participating) +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (0) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 3 in a gearbox of 4 +mpihelper: we are cog 1 in a gearbox of 4 +mpihelper: we are cog 2 in a gearbox of 4 +mpihelper: we are cog 0 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +MPI Rank 0: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr_train.logrank0 +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: Build info: +MPI Rank 0: +MPI Rank 0: Built time: Mar 3 2016 14:41:54 +MPI Rank 0: Last modified date: Thu Mar 3 14:28:26 2016 +MPI Rank 0: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 0: CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 +MPI Rank 0: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 0: Build Branch: HEAD +MPI Rank 0: Build SHA1: 
31a164602c629d10741761443e6e46b2ab787ad5 +MPI Rank 0: Built by thhoens on SAADSRNRDEV040 +MPI Rank 0: Build Path: D:\thhoens\CNTK\Source\CNTK\ +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: running on SAADSRNRDEV040 at 2016/03/04 00:11:22 +MPI Rank 0: command line: +MPI Rank 0: D:\thhoens\CNTK\x64\release\cntk.exe configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=-1 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: modelPath=$RunDir$/models/dssm.net +MPI Rank 0: MBSize=4096 +MPI Rank 0: LRate=0.0001 +MPI Rank 0: DeviceId=-1 +MPI Rank 0: parallelTrain=true +MPI Rank 0: command = train +MPI Rank 0: precision = float +MPI Rank 0: traceGPUMemoryAllocations=0 +MPI Rank 0: train = [ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=$DeviceId$ +MPI Rank 0: minibatchSize = $MBSize$ +MPI Rank 0: modelPath = $modelPath$ +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = $LRate$ +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 
0: NDLNetworkBuilder = [ +MPI Rank 0: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = $DataDir$/train.all.bin +MPI Rank 0: ] +MPI Rank 0: cvReader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = $DataDir$/train.all.bin +MPI Rank 0: ] +MPI Rank 0: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 0: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: DeviceId=-1 +MPI Rank 0: numCPUThreads=10 +MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: modelPath=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: MBSize=4096 +MPI Rank 0: LRate=0.0001 +MPI Rank 0: DeviceId=-1 +MPI Rank 0: parallelTrain=true +MPI Rank 0: command = train +MPI Rank 0: precision = float +MPI Rank 0: traceGPUMemoryAllocations=0 +MPI Rank 0: train = [ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=-1 +MPI Rank 0: minibatchSize = 4096 +MPI Rank 0: modelPath = C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = 0.0001 +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: 
parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: NDLNetworkBuilder = [ +MPI Rank 0: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: cvReader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 0: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: DeviceId=-1 +MPI Rank 0: numCPUThreads=10 +MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: configparameters: dssm.cntk:command=train +MPI Rank 0: configparameters: dssm.cntk:ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: configparameters: 
dssm.cntk:cvReader=[ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:DeviceId=-1 +MPI Rank 0: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 0: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 0: configparameters: dssm.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 0: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:numCPUThreads=10 +MPI Rank 0: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 0: configparameters: dssm.cntk:precision=float +MPI Rank 0: configparameters: dssm.cntk:reader=[ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 0: configparameters: dssm.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 0: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 0: configparameters: dssm.cntk:train=[ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=-1 +MPI Rank 0: minibatchSize = 4096 +MPI Rank 0: modelPath = C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: 
learningRatesPerSample = 0.0001 +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: command: train +MPI Rank 0: precision = float +MPI Rank 0: Using 10 CPU threads +MPI Rank 0: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 0: CNTKCommandTrainInfo: train : 3 +MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 0: CNTKCommandTrainBegin: train +MPI Rank 0: NDLBuilder Using CPU +MPI Rank 0: Starting from checkpoint. Load Network From File C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net.2. +MPI Rank 0: +MPI Rank 0: Post-processing network... +MPI Rank 0: +MPI Rank 0: 2 roots: +MPI Rank 0: SIM = CosDistanceWithNegativeSamples +MPI Rank 0: CE = CrossEntropyWithSoftmax +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node SIM. 17 nodes to process in pass 1. 
+MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: Validating for node SIM, final verification. 
+MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: Validating 
for node CE, final verification. +MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: Post-processing network complete. +MPI Rank 0: +MPI Rank 0: SGD using CPU. +MPI Rank 0: +MPI Rank 0: Training criterion node(s): +MPI Rank 0: CE = CrossEntropyWithSoftmax +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: No PreCompute nodes found, skipping PreCompute step +MPI Rank 0: Warning: checkpoint file is missing. learning parameters will be initialized from 0 +MPI Rank 0: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.93025131; TotalTime = 19.6371s; SamplesPerSecond = 521.5 +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.93263016; TotalTime = 17.2225s; SamplesPerSecond = 594.6 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.9528804; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=46.4048 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: Final Results: Minibatch[1-25]: Samples Seen = 102399 CE: CrossEntropyWithSoftmax/Sample = 1.8993035 Perplexity = 6.6812396 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Validation Set] TrainLossPerSample = 1.8993035 +MPI Rank 0: CNTKCommandTrainEnd: train +MPI Rank 0: COMPLETED +MPI Rank 0: ~MPIWrapper +MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr_train.logrank1 +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: Build info: +MPI Rank 1: +MPI Rank 1: Built time: Mar 3 2016 14:41:54 +MPI Rank 1: Last modified date: Thu Mar 3 14:28:26 2016 +MPI Rank 1: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 1: CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 +MPI Rank 1: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 1: Build Branch: HEAD +MPI Rank 1: Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 +MPI Rank 1: Built by thhoens on SAADSRNRDEV040 +MPI Rank 1: Build Path: D:\thhoens\CNTK\Source\CNTK\ +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: running on SAADSRNRDEV040 at 2016/03/04 00:11:22 +MPI Rank 1: command line: +MPI Rank 1: D:\thhoens\CNTK\x64\release\cntk.exe configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=-1 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: modelPath=$RunDir$/models/dssm.net +MPI Rank 1: MBSize=4096 +MPI Rank 1: LRate=0.0001 +MPI Rank 1: DeviceId=-1 +MPI Rank 1: parallelTrain=true 
+MPI Rank 1: command = train +MPI Rank 1: precision = float +MPI Rank 1: traceGPUMemoryAllocations=0 +MPI Rank 1: train = [ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=$DeviceId$ +MPI Rank 1: minibatchSize = $MBSize$ +MPI Rank 1: modelPath = $modelPath$ +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = $LRate$ +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: NDLNetworkBuilder = [ +MPI Rank 1: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = $DataDir$/train.all.bin +MPI Rank 1: ] +MPI Rank 1: cvReader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = $DataDir$/train.all.bin +MPI Rank 1: ] +MPI Rank 1: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 1: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: DeviceId=-1 +MPI Rank 1: numCPUThreads=10 +MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI 
Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: modelPath=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: MBSize=4096 +MPI Rank 1: LRate=0.0001 +MPI Rank 1: DeviceId=-1 +MPI Rank 1: parallelTrain=true +MPI Rank 1: command = train +MPI Rank 1: precision = float +MPI Rank 1: traceGPUMemoryAllocations=0 +MPI Rank 1: train = [ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=-1 +MPI Rank 1: minibatchSize = 4096 +MPI Rank 1: modelPath = C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = 0.0001 +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: NDLNetworkBuilder = [ +MPI Rank 1: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: cvReader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: 
RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 1: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: DeviceId=-1 +MPI Rank 1: numCPUThreads=10 +MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: configparameters: dssm.cntk:command=train +MPI Rank 1: configparameters: dssm.cntk:ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:cvReader=[ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:DeviceId=-1 +MPI Rank 1: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 1: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 1: configparameters: dssm.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 1: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:numCPUThreads=10 +MPI Rank 1: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 1: configparameters: dssm.cntk:precision=float +MPI Rank 1: configparameters: dssm.cntk:reader=[ +MPI Rank 1: readerType = 
LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 1: configparameters: dssm.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 1: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 1: configparameters: dssm.cntk:train=[ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=-1 +MPI Rank 1: minibatchSize = 4096 +MPI Rank 1: modelPath = C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = 0.0001 +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: command: train +MPI Rank 1: precision = float +MPI Rank 1: Using 10 CPU threads +MPI Rank 1: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 1: CNTKCommandTrainInfo: train : 3 +MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 1: CNTKCommandTrainBegin: train +MPI Rank 1: NDLBuilder Using CPU +MPI Rank 1: Starting from checkpoint. 
Load Network From File C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net.2. +MPI Rank 1: +MPI Rank 1: Post-processing network... +MPI Rank 1: +MPI Rank 1: 2 roots: +MPI Rank 1: CE = CrossEntropyWithSoftmax +MPI Rank 1: SIM = CosDistanceWithNegativeSamples +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node CE. 21 nodes to process in pass 1. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], 
MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: Validating for node CE. 11 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) 
-> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: Validating for node CE, final verification. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 
0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: 8 out of 21 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node SIM. 17 nodes to process in pass 1. +MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], 
MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: Validating for node SIM. 9 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: Validating for node SIM, final 
verification. +MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: Post-processing network complete. +MPI Rank 1: +MPI Rank 1: SGD using CPU. +MPI Rank 1: +MPI Rank 1: Training criterion node(s): +MPI Rank 1: CE = CrossEntropyWithSoftmax +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 1: No PreCompute nodes found, skipping PreCompute step +MPI Rank 1: Warning: checkpoint file is missing. learning parameters will be initialized from 0 +MPI Rank 1: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.97015953; TotalTime = 19.1537s; SamplesPerSecond = 534.6 +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.94770432; TotalTime = 17.5215s; SamplesPerSecond = 584.4 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.9528804; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=46.4912 +MPI Rank 1: CNTKCommandTrainEnd: train +MPI Rank 1: COMPLETED +MPI Rank 1: ~MPIWrapper +MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr_train.logrank2 +MPI Rank 2: ------------------------------------------------------------------- +MPI Rank 2: Build info: +MPI Rank 2: +MPI Rank 2: Built time: Mar 3 2016 14:41:54 +MPI Rank 2: Last modified date: Thu Mar 3 14:28:26 2016 +MPI Rank 2: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 2: CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 +MPI Rank 2: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 2: Build Branch: HEAD +MPI Rank 2: Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 +MPI Rank 2: Built by thhoens on SAADSRNRDEV040 +MPI Rank 2: Build Path: D:\thhoens\CNTK\Source\CNTK\ +MPI Rank 2: ------------------------------------------------------------------- +MPI Rank 2: running on SAADSRNRDEV040 at 2016/03/04 00:11:23 +MPI Rank 2: command line: +MPI Rank 2: D:\thhoens\CNTK\x64\release\cntk.exe 
configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=-1 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: modelPath=$RunDir$/models/dssm.net +MPI Rank 2: MBSize=4096 +MPI Rank 2: LRate=0.0001 +MPI Rank 2: DeviceId=-1 +MPI Rank 2: parallelTrain=true +MPI Rank 2: command = train +MPI Rank 2: precision = float +MPI Rank 2: traceGPUMemoryAllocations=0 +MPI Rank 2: train = [ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=$DeviceId$ +MPI Rank 2: minibatchSize = $MBSize$ +MPI Rank 2: modelPath = $modelPath$ +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = $LRate$ +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: NDLNetworkBuilder = [ +MPI Rank 2: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 2: ] +MPI Rank 2: reader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = $DataDir$/train.all.bin +MPI Rank 2: ] +MPI Rank 2: cvReader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI 
Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = $DataDir$/train.all.bin +MPI Rank 2: ] +MPI Rank 2: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 2: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: DeviceId=-1 +MPI Rank 2: numCPUThreads=10 +MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: modelPath=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: MBSize=4096 +MPI Rank 2: LRate=0.0001 +MPI Rank 2: DeviceId=-1 +MPI Rank 2: parallelTrain=true +MPI Rank 2: command = train +MPI Rank 2: precision = float +MPI Rank 2: traceGPUMemoryAllocations=0 +MPI Rank 2: train = [ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=-1 +MPI Rank 2: minibatchSize = 4096 +MPI Rank 2: modelPath = C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = 0.0001 +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 
2: ] +MPI Rank 2: NDLNetworkBuilder = [ +MPI Rank 2: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 2: ] +MPI Rank 2: reader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: cvReader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 2: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: DeviceId=-1 +MPI Rank 2: numCPUThreads=10 +MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: configparameters: dssm.cntk:command=train +MPI Rank 2: configparameters: dssm.cntk:ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:cvReader=[ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: configparameters: 
dssm.cntk:DeviceId=-1 +MPI Rank 2: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 2: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 2: configparameters: dssm.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 2: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:numCPUThreads=10 +MPI Rank 2: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 2: configparameters: dssm.cntk:precision=float +MPI Rank 2: configparameters: dssm.cntk:reader=[ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 2: configparameters: dssm.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 2: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 2: configparameters: dssm.cntk:train=[ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=-1 +MPI Rank 2: minibatchSize = 4096 +MPI Rank 2: modelPath = C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = 0.0001 +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none 
+MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: command: train +MPI Rank 2: precision = float +MPI Rank 2: Using 10 CPU threads +MPI Rank 2: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 2: CNTKCommandTrainInfo: train : 3 +MPI Rank 2: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 2: CNTKCommandTrainBegin: train +MPI Rank 2: NDLBuilder Using CPU +MPI Rank 2: Starting from checkpoint. Load Network From File C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net.2. +MPI Rank 2: +MPI Rank 2: Post-processing network... +MPI Rank 2: +MPI Rank 2: 2 roots: +MPI Rank 2: SIM = CosDistanceWithNegativeSamples +MPI Rank 2: CE = CrossEntropyWithSoftmax +MPI Rank 2: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 2: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node SIM. 17 nodes to process in pass 1. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: Validating for node SIM, final verification. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: Validating 
for node CE, final verification. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: Post-processing network complete. +MPI Rank 2: +MPI Rank 2: SGD using CPU. +MPI Rank 2: +MPI Rank 2: Training criterion node(s): +MPI Rank 2: CE = CrossEntropyWithSoftmax +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Allocating matrices for forward and/or backward propagation. +MPI Rank 2: No PreCompute nodes found, skipping PreCompute step +MPI Rank 2: Warning: checkpoint file is missing. learning parameters will be initialized from 0 +MPI Rank 2: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.98702526; TotalTime = 19.7224s; SamplesPerSecond = 519.2 +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.96577854; TotalTime = 17.2169s; SamplesPerSecond = 594.8 +MPI Rank 2: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.9528804; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=46.4944 +MPI Rank 2: CNTKCommandTrainEnd: train +MPI Rank 2: COMPLETED +MPI Rank 2: ~MPIWrapper +MPI Rank 3: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr_train.logrank3 +MPI Rank 3: ------------------------------------------------------------------- +MPI Rank 3: Build info: +MPI Rank 3: +MPI Rank 3: Built time: Mar 3 2016 14:41:54 +MPI Rank 3: Last modified date: Thu Mar 3 14:28:26 2016 +MPI Rank 3: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 3: CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 +MPI Rank 3: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 3: Build Branch: HEAD +MPI Rank 3: Build SHA1: 
31a164602c629d10741761443e6e46b2ab787ad5 +MPI Rank 3: Built by thhoens on SAADSRNRDEV040 +MPI Rank 3: Build Path: D:\thhoens\CNTK\Source\CNTK\ +MPI Rank 3: ------------------------------------------------------------------- +MPI Rank 3: running on SAADSRNRDEV040 at 2016/03/04 00:11:23 +MPI Rank 3: command line: +MPI Rank 3: D:\thhoens\CNTK\x64\release\cntk.exe configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=-1 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: modelPath=$RunDir$/models/dssm.net +MPI Rank 3: MBSize=4096 +MPI Rank 3: LRate=0.0001 +MPI Rank 3: DeviceId=-1 +MPI Rank 3: parallelTrain=true +MPI Rank 3: command = train +MPI Rank 3: precision = float +MPI Rank 3: traceGPUMemoryAllocations=0 +MPI Rank 3: train = [ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=$DeviceId$ +MPI Rank 3: minibatchSize = $MBSize$ +MPI Rank 3: modelPath = $modelPath$ +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = $LRate$ +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 
3: NDLNetworkBuilder = [ +MPI Rank 3: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 3: ] +MPI Rank 3: reader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = $DataDir$/train.all.bin +MPI Rank 3: ] +MPI Rank 3: cvReader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = $DataDir$/train.all.bin +MPI Rank 3: ] +MPI Rank 3: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 3: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: DeviceId=-1 +MPI Rank 3: numCPUThreads=10 +MPI Rank 3: stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: modelPath=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: MBSize=4096 +MPI Rank 3: LRate=0.0001 +MPI Rank 3: DeviceId=-1 +MPI Rank 3: parallelTrain=true +MPI Rank 3: command = train +MPI Rank 3: precision = float +MPI Rank 3: traceGPUMemoryAllocations=0 +MPI Rank 3: train = [ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=-1 +MPI Rank 3: minibatchSize = 4096 +MPI Rank 3: modelPath = C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = 0.0001 +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: 
parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: NDLNetworkBuilder = [ +MPI Rank 3: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 3: ] +MPI Rank 3: reader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: cvReader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 3: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: DeviceId=-1 +MPI Rank 3: numCPUThreads=10 +MPI Rank 3: stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: configparameters: dssm.cntk:command=train +MPI Rank 3: configparameters: dssm.cntk:ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: configparameters: 
dssm.cntk:cvReader=[ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:DeviceId=-1 +MPI Rank 3: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 3: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 3: configparameters: dssm.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 3: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:numCPUThreads=10 +MPI Rank 3: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 3: configparameters: dssm.cntk:precision=float +MPI Rank 3: configparameters: dssm.cntk:reader=[ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu +MPI Rank 3: configparameters: dssm.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/stderr +MPI Rank 3: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 3: configparameters: dssm.cntk:train=[ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=-1 +MPI Rank 3: minibatchSize = 4096 +MPI Rank 3: modelPath = C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: 
learningRatesPerSample = 0.0001 +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: command: train +MPI Rank 3: precision = float +MPI Rank 3: Using 10 CPU threads +MPI Rank 3: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net +MPI Rank 3: CNTKCommandTrainInfo: train : 3 +MPI Rank 3: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 3: CNTKCommandTrainBegin: train +MPI Rank 3: NDLBuilder Using CPU +MPI Rank 3: Starting from checkpoint. Load Network From File C:\cygwin64\tmp\cntk-test-20160303160843.247203\Text_SparseDSSM@release_cpu/models/dssm.net.2. +MPI Rank 3: +MPI Rank 3: Post-processing network... +MPI Rank 3: +MPI Rank 3: 2 roots: +MPI Rank 3: CE = CrossEntropyWithSoftmax +MPI Rank 3: SIM = CosDistanceWithNegativeSamples +MPI Rank 3: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 3: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: Validating 
for node CE, final verification. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node SIM. 17 nodes to process in pass 1. +MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: Validating for node SIM, final verification. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: Post-processing network complete. +MPI Rank 3: +MPI Rank 3: SGD using CPU. +MPI Rank 3: +MPI Rank 3: Training criterion node(s): +MPI Rank 3: CE = CrossEntropyWithSoftmax +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 3: No PreCompute nodes found, skipping PreCompute step +MPI Rank 3: Warning: checkpoint file is missing. learning parameters will be initialized from 0 +MPI Rank 3: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 3: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 3: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.93149776; TotalTime = 19.2109s; SamplesPerSecond = 533.0 +MPI Rank 3: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.95135975; TotalTime = 17.5215s; SamplesPerSecond = 584.4 +MPI Rank 3: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.9528804; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=46.4913 +MPI Rank 3: CNTKCommandTrainEnd: train +MPI Rank 3: COMPLETED +MPI Rank 3: ~MPIWrapper diff --git a/Tests/EndToEndTests/Text/SparseDSSM/baseline.windows.gpu.txt b/Tests/EndToEndTests/Text/SparseDSSM/baseline.windows.gpu.txt new file mode 100755 index 000000000000..cc2c98ca8484 --- /dev/null +++ b/Tests/EndToEndTests/Text/SparseDSSM/baseline.windows.gpu.txt @@ -0,0 +1,3228 @@ +=== Running C:\Program Files\Microsoft MPI\Bin\/mpiexec.exe -n 4 D:\thhoens\CNTK\x64\release\cntk.exe configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=0 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 14:41:54 + Last modified 
date: Thu Mar 3 14:28:26 2016 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 + CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda + Build Branch: HEAD + Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 + Built by thhoens on SAADSRNRDEV040 + Build Path: D:\thhoens\CNTK\Source\CNTK\ +------------------------------------------------------------------- +MPIWrapper: initializing MPI +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 14:41:54 + Last modified date: Thu Mar 3 14:28:26 2016 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 + CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda + Build Branch: HEAD + Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 + Built by thhoens on SAADSRNRDEV040 + Build Path: D:\thhoens\CNTK\Source\CNTK\ +------------------------------------------------------------------- +MPIWrapper: initializing MPI +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 14:41:54 + Last modified date: Thu Mar 3 14:28:26 2016 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 + CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda + Build Branch: HEAD + Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 + Built by thhoens on SAADSRNRDEV040 + Build Path: D:\thhoens\CNTK\Source\CNTK\ +------------------------------------------------------------------- +MPIWrapper: initializing MPI +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 14:41:54 + Last modified date: Thu Mar 3 14:28:26 2016 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 + CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda + Build Branch: HEAD + Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 + Built by thhoens on 
SAADSRNRDEV040 + Build Path: D:\thhoens\CNTK\Source\CNTK\ +------------------------------------------------------------------- +MPIWrapper: initializing MPI +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +ping [requestnodes (before change)]: all 4 nodes responded +ping [requestnodes (before change)]: all 4 nodes responded +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (3) are in (participating) +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (1) are in (participating) +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (0) are in (participating) +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (2) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 3 in a gearbox of 4 +mpihelper: we are cog 1 in a gearbox of 4 +mpihelper: we are cog 0 in a gearbox of 4 +mpihelper: we are cog 2 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +ping 
[mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +MPI Rank 0: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr_train.logrank0 +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: Build info: +MPI Rank 0: +MPI Rank 0: Built time: Mar 3 2016 14:41:54 +MPI Rank 0: Last modified date: Thu Mar 3 14:28:26 2016 +MPI Rank 0: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 0: CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 +MPI Rank 0: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 0: Build Branch: HEAD +MPI Rank 0: Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 +MPI Rank 0: Built by thhoens on SAADSRNRDEV040 +MPI Rank 0: Build Path: D:\thhoens\CNTK\Source\CNTK\ +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: running on SAADSRNRDEV040 at 2016/03/03 23:47:11 +MPI Rank 0: command line: +MPI Rank 0: D:\thhoens\CNTK\x64\release\cntk.exe configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=0 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: modelPath=$RunDir$/models/dssm.net +MPI Rank 0: MBSize=4096 +MPI Rank 0: LRate=0.0001 +MPI Rank 0: DeviceId=-1 +MPI Rank 0: parallelTrain=true +MPI Rank 0: command = train +MPI Rank 0: precision = float +MPI Rank 0: traceGPUMemoryAllocations=0 +MPI Rank 0: train = [ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=$DeviceId$ +MPI Rank 0: 
minibatchSize = $MBSize$ +MPI Rank 0: modelPath = $modelPath$ +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = $LRate$ +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: NDLNetworkBuilder = [ +MPI Rank 0: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = $DataDir$/train.all.bin +MPI Rank 0: ] +MPI Rank 0: cvReader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = $DataDir$/train.all.bin +MPI Rank 0: ] +MPI Rank 0: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 0: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: DeviceId=0 +MPI Rank 0: numCPUThreads=10 +MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: modelPath=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: MBSize=4096 +MPI 
Rank 0: LRate=0.0001 +MPI Rank 0: DeviceId=-1 +MPI Rank 0: parallelTrain=true +MPI Rank 0: command = train +MPI Rank 0: precision = float +MPI Rank 0: traceGPUMemoryAllocations=0 +MPI Rank 0: train = [ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=0 +MPI Rank 0: minibatchSize = 4096 +MPI Rank 0: modelPath = C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = 0.0001 +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: NDLNetworkBuilder = [ +MPI Rank 0: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: cvReader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 0: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: 
DeviceId=0 +MPI Rank 0: numCPUThreads=10 +MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: configparameters: dssm.cntk:command=train +MPI Rank 0: configparameters: dssm.cntk:ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:cvReader=[ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:DeviceId=0 +MPI Rank 0: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 0: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 0: configparameters: dssm.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 0: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:numCPUThreads=10 +MPI Rank 0: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 0: configparameters: dssm.cntk:precision=float +MPI Rank 0: configparameters: dssm.cntk:reader=[ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: 
dssm.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 0: configparameters: dssm.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 0: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 0: configparameters: dssm.cntk:train=[ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=0 +MPI Rank 0: minibatchSize = 4096 +MPI Rank 0: modelPath = C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = 0.0001 +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: command: train +MPI Rank 0: precision = float +MPI Rank 0: Using 10 CPU threads +MPI Rank 0: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: CNTKCommandTrainInfo: train : 3 +MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 0: CNTKCommandTrainBegin: train +MPI Rank 0: NDLBuilder Using GPU 0 +MPI Rank 0: Microsoft::MSR::CNTK::GPUMatrix::SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4 +MPI Rank 0: +MPI Rank 0: Post-processing network... 
+MPI Rank 0: +MPI Rank 0: 2 roots: +MPI Rank 0: SIM = CosDistanceWithNegativeSamples +MPI Rank 0: CE = CrossEntropyWithSoftmax +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node SIM. 17 nodes to process in pass 1. +MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: 
+MPI Rank 0: Validating for node SIM. 9 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: Validating for node SIM, final verification. 
+MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: Validating 
for node CE, final verification. +MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: Post-processing network complete. +MPI Rank 0: +MPI Rank 0: SGD using GPU 0. +MPI Rank 0: +MPI Rank 0: Training criterion node(s): +MPI Rank 0: CE = CrossEntropyWithSoftmax +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: No PreCompute nodes found, skipping PreCompute step +MPI Rank 0: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 4.34696808; TotalTime = 9.4644s; SamplesPerSecond = 1081.9 +MPI Rank 0: Epoch[ 1 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 3.34277420; TotalTime = 7.3300s; SamplesPerSecond = 1397.0 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.6160171; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=21.7436 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: Final Results: Minibatch[1-25]: Samples Seen = 102399 CE: CrossEntropyWithSoftmax/Sample = 2.5001547 Perplexity = 12.184379 +MPI Rank 0: Finished Epoch[ 1 of 3]: [Validation Set] TrainLossPerSample = 2.5001547 +MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, distributed reading is ENABLED. 
+MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.30270958; TotalTime = 8.1645s; SamplesPerSecond = 1254.2 +MPI Rank 0: Epoch[ 2 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.09883804; TotalTime = 7.8825s; SamplesPerSecond = 1299.1 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 2.1757753; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=20.4866 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. +MPI Rank 0: Final Results: Minibatch[1-25]: Samples Seen = 102399 CE: CrossEntropyWithSoftmax/Sample = 1.9714525 Perplexity = 7.1810993 +MPI Rank 0: Finished Epoch[ 2 of 3]: [Validation Set] TrainLossPerSample = 1.9714525 +MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.89778175; TotalTime = 8.3942s; SamplesPerSecond = 1219.9 +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.86335983; TotalTime = 8.0798s; SamplesPerSecond = 1267.4 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8856394; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=20.8703 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: Final Results: Minibatch[1-25]: Samples Seen = 102399 CE: CrossEntropyWithSoftmax/Sample = 1.8091086 Perplexity = 6.1050028 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Validation Set] TrainLossPerSample = 1.8091086 +MPI Rank 0: CNTKCommandTrainEnd: train +MPI Rank 0: COMPLETED +MPI Rank 0: ~MPIWrapper +MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr_train.logrank1 +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: Build info: +MPI Rank 1: +MPI Rank 1: Built time: Mar 3 2016 14:41:54 +MPI Rank 1: Last modified date: Thu Mar 3 14:28:26 2016 +MPI Rank 1: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 1: CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 +MPI Rank 1: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 1: Build Branch: HEAD +MPI Rank 1: Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 +MPI Rank 1: Built by thhoens on SAADSRNRDEV040 +MPI Rank 1: Build Path: D:\thhoens\CNTK\Source\CNTK\ +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: running on SAADSRNRDEV040 at 2016/03/03 23:47:11 +MPI Rank 1: command line: +MPI Rank 1: D:\thhoens\CNTK\x64\release\cntk.exe configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=0 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: modelPath=$RunDir$/models/dssm.net +MPI Rank 1: MBSize=4096 +MPI Rank 1: LRate=0.0001 +MPI Rank 1: DeviceId=-1 +MPI Rank 1: parallelTrain=true 
+MPI Rank 1: command = train +MPI Rank 1: precision = float +MPI Rank 1: traceGPUMemoryAllocations=0 +MPI Rank 1: train = [ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=$DeviceId$ +MPI Rank 1: minibatchSize = $MBSize$ +MPI Rank 1: modelPath = $modelPath$ +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = $LRate$ +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: NDLNetworkBuilder = [ +MPI Rank 1: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = $DataDir$/train.all.bin +MPI Rank 1: ] +MPI Rank 1: cvReader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = $DataDir$/train.all.bin +MPI Rank 1: ] +MPI Rank 1: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 1: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: DeviceId=0 +MPI Rank 1: numCPUThreads=10 +MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI 
Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: modelPath=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: MBSize=4096 +MPI Rank 1: LRate=0.0001 +MPI Rank 1: DeviceId=-1 +MPI Rank 1: parallelTrain=true +MPI Rank 1: command = train +MPI Rank 1: precision = float +MPI Rank 1: traceGPUMemoryAllocations=0 +MPI Rank 1: train = [ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=0 +MPI Rank 1: minibatchSize = 4096 +MPI Rank 1: modelPath = C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = 0.0001 +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: NDLNetworkBuilder = [ +MPI Rank 1: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: cvReader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: 
RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 1: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: DeviceId=0 +MPI Rank 1: numCPUThreads=10 +MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: configparameters: dssm.cntk:command=train +MPI Rank 1: configparameters: dssm.cntk:ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:cvReader=[ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:DeviceId=0 +MPI Rank 1: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 1: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 1: configparameters: dssm.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 1: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:numCPUThreads=10 +MPI Rank 1: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 1: configparameters: dssm.cntk:precision=float +MPI Rank 1: configparameters: dssm.cntk:reader=[ +MPI Rank 1: readerType = 
LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 1: configparameters: dssm.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 1: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 1: configparameters: dssm.cntk:train=[ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=0 +MPI Rank 1: minibatchSize = 4096 +MPI Rank 1: modelPath = C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = 0.0001 +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: command: train +MPI Rank 1: precision = float +MPI Rank 1: Using 10 CPU threads +MPI Rank 1: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: CNTKCommandTrainInfo: train : 3 +MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 1: CNTKCommandTrainBegin: train +MPI Rank 1: NDLBuilder Using GPU 0 +MPI Rank 1: Microsoft::MSR::CNTK::GPUMatrix::SetUniformRandomValue 
(GPU): creating curand object with seed 1, sizeof(ElemType)==4 +MPI Rank 1: +MPI Rank 1: Post-processing network... +MPI Rank 1: +MPI Rank 1: 2 roots: +MPI Rank 1: SIM = CosDistanceWithNegativeSamples +MPI Rank 1: CE = CrossEntropyWithSoftmax +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node SIM. 17 nodes to process in pass 1. +MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = 
CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: Validating for node SIM. 9 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: Validating for node SIM, final verification. 
+MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: Validating 
for node CE, final verification. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: Post-processing network complete. +MPI Rank 1: +MPI Rank 1: SGD using GPU 0. +MPI Rank 1: +MPI Rank 1: Training criterion node(s): +MPI Rank 1: CE = CrossEntropyWithSoftmax +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. +MPI Rank 1: No PreCompute nodes found, skipping PreCompute step +MPI Rank 1: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 4.32159615; TotalTime = 9.4668s; SamplesPerSecond = 1081.7 +MPI Rank 1: Epoch[ 1 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 3.33525505; TotalTime = 7.3300s; SamplesPerSecond = 1397.0 +MPI Rank 1: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.6160171; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=21.7436 +MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, distributed reading is ENABLED. 
+MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.32732925; TotalTime = 8.1731s; SamplesPerSecond = 1252.9 +MPI Rank 1: Epoch[ 2 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.11035995; TotalTime = 7.8825s; SamplesPerSecond = 1299.1 +MPI Rank 1: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 2.1757753; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=20.4866 +MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.92909813; TotalTime = 8.4392s; SamplesPerSecond = 1213.4 +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.86598778; TotalTime = 8.0798s; SamplesPerSecond = 1267.4 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8856394; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=20.8703 +MPI Rank 1: CNTKCommandTrainEnd: train +MPI Rank 1: COMPLETED +MPI Rank 1: ~MPIWrapper +MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr_train.logrank2 +MPI Rank 2: ------------------------------------------------------------------- +MPI Rank 2: Build info: +MPI Rank 2: +MPI Rank 2: Built time: Mar 3 2016 14:41:54 +MPI Rank 2: Last modified date: Thu Mar 3 14:28:26 2016 +MPI Rank 2: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 2: CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 +MPI Rank 2: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 2: Build Branch: HEAD +MPI Rank 2: Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 +MPI Rank 2: Built by thhoens on SAADSRNRDEV040 +MPI Rank 2: Build Path: D:\thhoens\CNTK\Source\CNTK\ +MPI Rank 2: 
------------------------------------------------------------------- +MPI Rank 2: running on SAADSRNRDEV040 at 2016/03/03 23:47:12 +MPI Rank 2: command line: +MPI Rank 2: D:\thhoens\CNTK\x64\release\cntk.exe configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=0 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: modelPath=$RunDir$/models/dssm.net +MPI Rank 2: MBSize=4096 +MPI Rank 2: LRate=0.0001 +MPI Rank 2: DeviceId=-1 +MPI Rank 2: parallelTrain=true +MPI Rank 2: command = train +MPI Rank 2: precision = float +MPI Rank 2: traceGPUMemoryAllocations=0 +MPI Rank 2: train = [ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=$DeviceId$ +MPI Rank 2: minibatchSize = $MBSize$ +MPI Rank 2: modelPath = $modelPath$ +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = $LRate$ +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: NDLNetworkBuilder = [ +MPI Rank 2: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 2: ] +MPI Rank 2: reader = [ +MPI Rank 2: readerType = 
LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = $DataDir$/train.all.bin +MPI Rank 2: ] +MPI Rank 2: cvReader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = $DataDir$/train.all.bin +MPI Rank 2: ] +MPI Rank 2: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 2: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: DeviceId=0 +MPI Rank 2: numCPUThreads=10 +MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: modelPath=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: MBSize=4096 +MPI Rank 2: LRate=0.0001 +MPI Rank 2: DeviceId=-1 +MPI Rank 2: parallelTrain=true +MPI Rank 2: command = train +MPI Rank 2: precision = float +MPI Rank 2: traceGPUMemoryAllocations=0 +MPI Rank 2: train = [ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=0 +MPI Rank 2: minibatchSize = 4096 +MPI Rank 2: modelPath = C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = 0.0001 +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI 
Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: NDLNetworkBuilder = [ +MPI Rank 2: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 2: ] +MPI Rank 2: reader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: cvReader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 2: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: DeviceId=0 +MPI Rank 2: numCPUThreads=10 +MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: configparameters: dssm.cntk:command=train +MPI Rank 2: configparameters: dssm.cntk:ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:cvReader=[ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = 
D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:DeviceId=0 +MPI Rank 2: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 2: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 2: configparameters: dssm.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 2: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:numCPUThreads=10 +MPI Rank 2: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 2: configparameters: dssm.cntk:precision=float +MPI Rank 2: configparameters: dssm.cntk:reader=[ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 2: configparameters: dssm.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 2: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 2: configparameters: dssm.cntk:train=[ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=0 +MPI Rank 2: minibatchSize = 4096 +MPI Rank 2: modelPath = C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = 0.0001 +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI 
Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: command: train +MPI Rank 2: precision = float +MPI Rank 2: Using 10 CPU threads +MPI Rank 2: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: CNTKCommandTrainInfo: train : 3 +MPI Rank 2: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 2: CNTKCommandTrainBegin: train +MPI Rank 2: NDLBuilder Using GPU 0 +MPI Rank 2: Microsoft::MSR::CNTK::GPUMatrix::SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4 +MPI Rank 2: +MPI Rank 2: Post-processing network... +MPI Rank 2: +MPI Rank 2: 2 roots: +MPI Rank 2: SIM = CosDistanceWithNegativeSamples +MPI Rank 2: CE = CrossEntropyWithSoftmax +MPI Rank 2: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 2: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node SIM. 17 nodes to process in pass 1. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: Validating for node SIM, final verification. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: Validating 
for node CE, final verification. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: Post-processing network complete. +MPI Rank 2: +MPI Rank 2: SGD using GPU 0. +MPI Rank 2: +MPI Rank 2: Training criterion node(s): +MPI Rank 2: CE = CrossEntropyWithSoftmax +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Allocating matrices for forward and/or backward propagation. +MPI Rank 2: No PreCompute nodes found, skipping PreCompute step +MPI Rank 2: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 4.32837563; TotalTime = 9.4674s; SamplesPerSecond = 1081.6 +MPI Rank 2: Epoch[ 1 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 3.35655479; TotalTime = 7.3300s; SamplesPerSecond = 1397.0 +MPI Rank 2: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.6160171; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=21.7436 +MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, distributed reading is ENABLED. 
+MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.32893581; TotalTime = 8.1756s; SamplesPerSecond = 1252.5 +MPI Rank 2: Epoch[ 2 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.11646900; TotalTime = 7.8825s; SamplesPerSecond = 1299.1 +MPI Rank 2: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 2.1757753; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=20.4866 +MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.95308418; TotalTime = 8.4366s; SamplesPerSecond = 1213.8 +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.87902641; TotalTime = 8.0798s; SamplesPerSecond = 1267.4 +MPI Rank 2: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8856394; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=20.8703 +MPI Rank 2: CNTKCommandTrainEnd: train +MPI Rank 2: COMPLETED +MPI Rank 2: ~MPIWrapper +MPI Rank 3: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr_train.logrank3 +MPI Rank 3: ------------------------------------------------------------------- +MPI Rank 3: Build info: +MPI Rank 3: +MPI Rank 3: Built time: Mar 3 2016 14:41:54 +MPI Rank 3: Last modified date: Thu Mar 3 14:28:26 2016 +MPI Rank 3: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 3: CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 +MPI Rank 3: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 3: Build Branch: HEAD +MPI Rank 3: Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 +MPI Rank 3: Built by thhoens on SAADSRNRDEV040 +MPI Rank 3: Build Path: D:\thhoens\CNTK\Source\CNTK\ +MPI Rank 3: 
------------------------------------------------------------------- +MPI Rank 3: running on SAADSRNRDEV040 at 2016/03/03 23:47:12 +MPI Rank 3: command line: +MPI Rank 3: D:\thhoens\CNTK\x64\release\cntk.exe configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=0 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: modelPath=$RunDir$/models/dssm.net +MPI Rank 3: MBSize=4096 +MPI Rank 3: LRate=0.0001 +MPI Rank 3: DeviceId=-1 +MPI Rank 3: parallelTrain=true +MPI Rank 3: command = train +MPI Rank 3: precision = float +MPI Rank 3: traceGPUMemoryAllocations=0 +MPI Rank 3: train = [ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=$DeviceId$ +MPI Rank 3: minibatchSize = $MBSize$ +MPI Rank 3: modelPath = $modelPath$ +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = $LRate$ +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: NDLNetworkBuilder = [ +MPI Rank 3: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 3: ] +MPI Rank 3: reader = [ +MPI Rank 3: readerType = 
LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = $DataDir$/train.all.bin +MPI Rank 3: ] +MPI Rank 3: cvReader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = $DataDir$/train.all.bin +MPI Rank 3: ] +MPI Rank 3: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 3: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: DeviceId=0 +MPI Rank 3: numCPUThreads=10 +MPI Rank 3: stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: modelPath=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: MBSize=4096 +MPI Rank 3: LRate=0.0001 +MPI Rank 3: DeviceId=-1 +MPI Rank 3: parallelTrain=true +MPI Rank 3: command = train +MPI Rank 3: precision = float +MPI Rank 3: traceGPUMemoryAllocations=0 +MPI Rank 3: train = [ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=0 +MPI Rank 3: minibatchSize = 4096 +MPI Rank 3: modelPath = C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = 0.0001 +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI 
Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: NDLNetworkBuilder = [ +MPI Rank 3: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 3: ] +MPI Rank 3: reader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: cvReader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 3: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: DeviceId=0 +MPI Rank 3: numCPUThreads=10 +MPI Rank 3: stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: configparameters: dssm.cntk:command=train +MPI Rank 3: configparameters: dssm.cntk:ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:cvReader=[ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = 
D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:DeviceId=0 +MPI Rank 3: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 3: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 3: configparameters: dssm.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 3: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:numCPUThreads=10 +MPI Rank 3: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 3: configparameters: dssm.cntk:precision=float +MPI Rank 3: configparameters: dssm.cntk:reader=[ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 3: configparameters: dssm.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 3: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 3: configparameters: dssm.cntk:train=[ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=0 +MPI Rank 3: minibatchSize = 4096 +MPI Rank 3: modelPath = C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = 0.0001 +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI 
Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: command: train +MPI Rank 3: precision = float +MPI Rank 3: Using 10 CPU threads +MPI Rank 3: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: CNTKCommandTrainInfo: train : 3 +MPI Rank 3: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 3: CNTKCommandTrainBegin: train +MPI Rank 3: NDLBuilder Using GPU 0 +MPI Rank 3: Microsoft::MSR::CNTK::GPUMatrix::SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4 +MPI Rank 3: +MPI Rank 3: Post-processing network... +MPI Rank 3: +MPI Rank 3: 2 roots: +MPI Rank 3: SIM = CosDistanceWithNegativeSamples +MPI Rank 3: CE = CrossEntropyWithSoftmax +MPI Rank 3: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 3: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node SIM. 17 nodes to process in pass 1. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: Validating for node SIM, final verification. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: Validating 
for node CE, final verification. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: Post-processing network complete. +MPI Rank 3: +MPI Rank 3: SGD using GPU 0. +MPI Rank 3: +MPI Rank 3: Training criterion node(s): +MPI Rank 3: CE = CrossEntropyWithSoftmax +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Allocating matrices for forward and/or backward propagation. +MPI Rank 3: No PreCompute nodes found, skipping PreCompute step +MPI Rank 3: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 3: Starting Epoch 1: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 3: Epoch[ 1 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 4.32287788; TotalTime = 9.4867s; SamplesPerSecond = 1079.4 +MPI Rank 3: Epoch[ 1 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 3.35470390; TotalTime = 7.3300s; SamplesPerSecond = 1397.0 +MPI Rank 3: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.6160171; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=21.7436 +MPI Rank 3: Starting Epoch 2: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, distributed reading is ENABLED. 
+MPI Rank 3: Epoch[ 2 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.29653873; TotalTime = 8.1958s; SamplesPerSecond = 1249.4 +MPI Rank 3: Epoch[ 2 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 2.11679478; TotalTime = 7.8817s; SamplesPerSecond = 1299.2 +MPI Rank 3: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 2.1757753; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=20.4866 +MPI Rank 3: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 3: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.90347176; TotalTime = 8.4605s; SamplesPerSecond = 1210.3 +MPI Rank 3: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.88304138; TotalTime = 8.0823s; SamplesPerSecond = 1267.0 +MPI Rank 3: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8856394; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=20.8703 +MPI Rank 3: CNTKCommandTrainEnd: train +MPI Rank 3: COMPLETED +MPI Rank 3: ~MPIWrapper +=== Deleting last epoch data +==== Re-running from checkpoint +=== Running C:\Program Files\Microsoft MPI\Bin\/mpiexec.exe -n 4 D:\thhoens\CNTK\x64\release\cntk.exe configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=0 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 14:41:54 + Last modified 
date: Thu Mar 3 14:28:26 2016 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 + CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda + Build Branch: HEAD + Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 + Built by thhoens on SAADSRNRDEV040 + Build Path: D:\thhoens\CNTK\Source\CNTK\ +------------------------------------------------------------------- +MPIWrapper: initializing MPI +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 14:41:54 + Last modified date: Thu Mar 3 14:28:26 2016 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 + CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda + Build Branch: HEAD + Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 + Built by thhoens on SAADSRNRDEV040 + Build Path: D:\thhoens\CNTK\Source\CNTK\ +------------------------------------------------------------------- +MPIWrapper: initializing MPI +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 14:41:54 + Last modified date: Thu Mar 3 14:28:26 2016 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 + CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda + Build Branch: HEAD + Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 + Built by thhoens on SAADSRNRDEV040 + Build Path: D:\thhoens\CNTK\Source\CNTK\ +------------------------------------------------------------------- +MPIWrapper: initializing MPI +------------------------------------------------------------------- +Build info: + + Built time: Mar 3 2016 14:41:54 + Last modified date: Thu Mar 3 14:28:26 2016 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 + CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda + Build Branch: HEAD + Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 + Built by thhoens on 
SAADSRNRDEV040 + Build Path: D:\thhoens\CNTK\Source\CNTK\ +------------------------------------------------------------------- +MPIWrapper: initializing MPI +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: 4 nodes pinging each other +ping [requestnodes (before change)]: all 4 nodes responded +ping [requestnodes (before change)]: all 4 nodes responded +ping [requestnodes (before change)]: all 4 nodes responded +ping [requestnodes (before change)]: all 4 nodes responded +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (2) are in (participating) +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (1) are in (participating) +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (0) are in (participating) +requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (3) are in (participating) +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: 4 nodes pinging each other +ping [requestnodes (after change)]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +ping [requestnodes (after change)]: all 4 nodes responded +mpihelper: we are cog 2 in a gearbox of 4 +mpihelper: we are cog 1 in a gearbox of 4 +mpihelper: we are cog 0 in a gearbox of 4 +mpihelper: we are cog 3 in a gearbox of 4 +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: 4 nodes pinging each other +ping [mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +ping 
[mpihelper]: all 4 nodes responded +ping [mpihelper]: all 4 nodes responded +MPI Rank 0: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr_train.logrank0 +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: Build info: +MPI Rank 0: +MPI Rank 0: Built time: Mar 3 2016 14:41:54 +MPI Rank 0: Last modified date: Thu Mar 3 14:28:26 2016 +MPI Rank 0: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 0: CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 +MPI Rank 0: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 0: Build Branch: HEAD +MPI Rank 0: Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 +MPI Rank 0: Built by thhoens on SAADSRNRDEV040 +MPI Rank 0: Build Path: D:\thhoens\CNTK\Source\CNTK\ +MPI Rank 0: ------------------------------------------------------------------- +MPI Rank 0: running on SAADSRNRDEV040 at 2016/03/03 23:48:38 +MPI Rank 0: command line: +MPI Rank 0: D:\thhoens\CNTK\x64\release\cntk.exe configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=0 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: modelPath=$RunDir$/models/dssm.net +MPI Rank 0: MBSize=4096 +MPI Rank 0: LRate=0.0001 +MPI Rank 0: DeviceId=-1 +MPI Rank 0: parallelTrain=true +MPI Rank 0: command = train +MPI Rank 0: precision = float +MPI Rank 0: traceGPUMemoryAllocations=0 +MPI Rank 0: train = [ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=$DeviceId$ +MPI Rank 0: 
minibatchSize = $MBSize$ +MPI Rank 0: modelPath = $modelPath$ +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = $LRate$ +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: NDLNetworkBuilder = [ +MPI Rank 0: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = $DataDir$/train.all.bin +MPI Rank 0: ] +MPI Rank 0: cvReader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = $DataDir$/train.all.bin +MPI Rank 0: ] +MPI Rank 0: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 0: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: DeviceId=0 +MPI Rank 0: numCPUThreads=10 +MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: modelPath=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: MBSize=4096 +MPI 
Rank 0: LRate=0.0001 +MPI Rank 0: DeviceId=-1 +MPI Rank 0: parallelTrain=true +MPI Rank 0: command = train +MPI Rank 0: precision = float +MPI Rank 0: traceGPUMemoryAllocations=0 +MPI Rank 0: train = [ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=0 +MPI Rank 0: minibatchSize = 4096 +MPI Rank 0: modelPath = C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = 0.0001 +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: NDLNetworkBuilder = [ +MPI Rank 0: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 0: ] +MPI Rank 0: reader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: cvReader = [ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 0: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: 
DeviceId=0 +MPI Rank 0: numCPUThreads=10 +MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: +MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 0: configparameters: dssm.cntk:command=train +MPI Rank 0: configparameters: dssm.cntk:ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:cvReader=[ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 0: configparameters: dssm.cntk:DeviceId=0 +MPI Rank 0: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 0: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 0: configparameters: dssm.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 0: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: dssm.cntk:numCPUThreads=10 +MPI Rank 0: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 0: configparameters: dssm.cntk:precision=float +MPI Rank 0: configparameters: dssm.cntk:reader=[ +MPI Rank 0: readerType = LibSVMBinaryReader +MPI Rank 0: miniBatchMode = Partial +MPI Rank 0: randomize = 0 +MPI Rank 0: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: configparameters: 
dssm.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 0: configparameters: dssm.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 0: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 0: configparameters: dssm.cntk:train=[ +MPI Rank 0: action = train +MPI Rank 0: numMBsToShowResult=10 +MPI Rank 0: deviceId=0 +MPI Rank 0: minibatchSize = 4096 +MPI Rank 0: modelPath = C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: traceLevel = 1 +MPI Rank 0: SGD = [ +MPI Rank 0: epochSize=102399 +MPI Rank 0: learningRatesPerSample = 0.0001 +MPI Rank 0: momentumPerMB = 0.9 +MPI Rank 0: maxEpochs=3 +MPI Rank 0: ParallelTrain=[ +MPI Rank 0: parallelizationStartEpoch=1 +MPI Rank 0: parallelizationMethod=ModelAveragingSGD +MPI Rank 0: distributedMBReading=true +MPI Rank 0: ModelAveragingSGD=[ +MPI Rank 0: SyncFrequencyInFrames=1024 +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: gradUpdateType=none +MPI Rank 0: gradientClippingWithTruncation=true +MPI Rank 0: clippingThresholdPerSample=1#INF +MPI Rank 0: ] +MPI Rank 0: ] +MPI Rank 0: +MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 0: command: train +MPI Rank 0: precision = float +MPI Rank 0: Using 10 CPU threads +MPI Rank 0: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 0: CNTKCommandTrainInfo: train : 3 +MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 0: CNTKCommandTrainBegin: train +MPI Rank 0: NDLBuilder Using GPU 0 +MPI Rank 0: Starting from checkpoint. Load Network From File C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net.2. +MPI Rank 0: +MPI Rank 0: Post-processing network... 
+MPI Rank 0: +MPI Rank 0: 2 roots: +MPI Rank 0: CE = CrossEntropyWithSoftmax +MPI Rank 0: SIM = CosDistanceWithNegativeSamples +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 0: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node CE. 21 nodes to process in pass 1. +MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM 
= CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: Validating for node CE. 11 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: 
Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: Validating for node CE, final verification. +MPI Rank 0: +MPI Rank 0: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 0: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 
0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 0: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 0: +MPI Rank 0: 8 out of 21 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Validating for node SIM. 17 nodes to process in pass 1. +MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating 
--> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: Validating for node SIM. 9 nodes to process in pass 2. +MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: Validating for node SIM, final verification. 
+MPI Rank 0: +MPI Rank 0: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 0: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 0: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 0: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 0: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 0: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 0: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 0: +MPI Rank 0: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 0: +MPI Rank 0: Post-processing network complete. +MPI Rank 0: +MPI Rank 0: SGD using GPU 0. +MPI Rank 0: +MPI Rank 0: Training criterion node(s): +MPI Rank 0: CE = CrossEntropyWithSoftmax +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: No PreCompute nodes found, skipping PreCompute step +MPI Rank 0: Warning: checkpoint file is missing. learning parameters will be initialized from 0 +MPI Rank 0: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 0: +MPI Rank 0: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.92903252; TotalTime = 9.3380s; SamplesPerSecond = 1096.6 +MPI Rank 0: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.94077148; TotalTime = 8.1200s; SamplesPerSecond = 1261.1 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.9504802; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=22.0648 +MPI Rank 0: +MPI Rank 0: +MPI Rank 0: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 0: Final Results: Minibatch[1-25]: Samples Seen = 102399 CE: CrossEntropyWithSoftmax/Sample = 1.8982234 Perplexity = 6.6740265 +MPI Rank 0: Finished Epoch[ 3 of 3]: [Validation Set] TrainLossPerSample = 1.8982234 +MPI Rank 0: CNTKCommandTrainEnd: train +MPI Rank 0: COMPLETED +MPI Rank 0: ~MPIWrapper +MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr_train.logrank1 +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: Build info: +MPI Rank 1: +MPI Rank 1: Built time: Mar 3 2016 14:41:54 +MPI Rank 1: Last modified date: Thu Mar 3 14:28:26 2016 +MPI Rank 1: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 1: CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 +MPI Rank 1: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 1: Build Branch: HEAD +MPI Rank 1: Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 +MPI Rank 1: Built by thhoens on SAADSRNRDEV040 +MPI Rank 1: Build Path: D:\thhoens\CNTK\Source\CNTK\ +MPI Rank 1: ------------------------------------------------------------------- +MPI Rank 1: running on SAADSRNRDEV040 at 2016/03/03 23:48:38 +MPI Rank 1: command line: +MPI Rank 1: D:\thhoens\CNTK\x64\release\cntk.exe configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=0 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: modelPath=$RunDir$/models/dssm.net +MPI Rank 1: MBSize=4096 +MPI Rank 1: LRate=0.0001 +MPI Rank 1: DeviceId=-1 +MPI Rank 1: parallelTrain=true 
+MPI Rank 1: command = train +MPI Rank 1: precision = float +MPI Rank 1: traceGPUMemoryAllocations=0 +MPI Rank 1: train = [ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=$DeviceId$ +MPI Rank 1: minibatchSize = $MBSize$ +MPI Rank 1: modelPath = $modelPath$ +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = $LRate$ +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: NDLNetworkBuilder = [ +MPI Rank 1: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = $DataDir$/train.all.bin +MPI Rank 1: ] +MPI Rank 1: cvReader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = $DataDir$/train.all.bin +MPI Rank 1: ] +MPI Rank 1: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 1: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: DeviceId=0 +MPI Rank 1: numCPUThreads=10 +MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI 
Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: modelPath=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: MBSize=4096 +MPI Rank 1: LRate=0.0001 +MPI Rank 1: DeviceId=-1 +MPI Rank 1: parallelTrain=true +MPI Rank 1: command = train +MPI Rank 1: precision = float +MPI Rank 1: traceGPUMemoryAllocations=0 +MPI Rank 1: train = [ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=0 +MPI Rank 1: minibatchSize = 4096 +MPI Rank 1: modelPath = C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = 0.0001 +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: NDLNetworkBuilder = [ +MPI Rank 1: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 1: ] +MPI Rank 1: reader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: cvReader = [ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: 
RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 1: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: DeviceId=0 +MPI Rank 1: numCPUThreads=10 +MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: +MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 1: configparameters: dssm.cntk:command=train +MPI Rank 1: configparameters: dssm.cntk:ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:cvReader=[ +MPI Rank 1: readerType = LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 1: configparameters: dssm.cntk:DeviceId=0 +MPI Rank 1: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 1: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 1: configparameters: dssm.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 1: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:numCPUThreads=10 +MPI Rank 1: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 1: configparameters: dssm.cntk:precision=float +MPI Rank 1: configparameters: dssm.cntk:reader=[ +MPI Rank 1: readerType = 
LibSVMBinaryReader +MPI Rank 1: miniBatchMode = Partial +MPI Rank 1: randomize = 0 +MPI Rank 1: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: configparameters: dssm.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 1: configparameters: dssm.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 1: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 1: configparameters: dssm.cntk:train=[ +MPI Rank 1: action = train +MPI Rank 1: numMBsToShowResult=10 +MPI Rank 1: deviceId=0 +MPI Rank 1: minibatchSize = 4096 +MPI Rank 1: modelPath = C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: traceLevel = 1 +MPI Rank 1: SGD = [ +MPI Rank 1: epochSize=102399 +MPI Rank 1: learningRatesPerSample = 0.0001 +MPI Rank 1: momentumPerMB = 0.9 +MPI Rank 1: maxEpochs=3 +MPI Rank 1: ParallelTrain=[ +MPI Rank 1: parallelizationStartEpoch=1 +MPI Rank 1: parallelizationMethod=ModelAveragingSGD +MPI Rank 1: distributedMBReading=true +MPI Rank 1: ModelAveragingSGD=[ +MPI Rank 1: SyncFrequencyInFrames=1024 +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: gradUpdateType=none +MPI Rank 1: gradientClippingWithTruncation=true +MPI Rank 1: clippingThresholdPerSample=1#INF +MPI Rank 1: ] +MPI Rank 1: ] +MPI Rank 1: +MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 1: command: train +MPI Rank 1: precision = float +MPI Rank 1: Using 10 CPU threads +MPI Rank 1: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 1: CNTKCommandTrainInfo: train : 3 +MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 1: CNTKCommandTrainBegin: train +MPI Rank 1: NDLBuilder Using GPU 0 +MPI Rank 1: Starting from checkpoint. 
Load Network From File C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net.2. +MPI Rank 1: +MPI Rank 1: Post-processing network... +MPI Rank 1: +MPI Rank 1: 2 roots: +MPI Rank 1: CE = CrossEntropyWithSoftmax +MPI Rank 1: SIM = CosDistanceWithNegativeSamples +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 1: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node CE. 21 nodes to process in pass 1. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], 
MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: Validating for node CE. 11 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) 
-> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: Validating for node CE, final verification. +MPI Rank 1: +MPI Rank 1: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 1: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 
0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 1: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 1: +MPI Rank 1: 8 out of 21 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Validating for node SIM. 17 nodes to process in pass 1. +MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], 
MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: Validating for node SIM. 9 nodes to process in pass 2. +MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: Validating for node SIM, final 
verification. +MPI Rank 1: +MPI Rank 1: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 1: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 1: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 1: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 1: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 1: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 1: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 1: +MPI Rank 1: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 1: +MPI Rank 1: Post-processing network complete. +MPI Rank 1: +MPI Rank 1: SGD using GPU 0. +MPI Rank 1: +MPI Rank 1: Training criterion node(s): +MPI Rank 1: CE = CrossEntropyWithSoftmax +MPI Rank 1: +MPI Rank 1: +MPI Rank 1: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 1: No PreCompute nodes found, skipping PreCompute step +MPI Rank 1: Warning: checkpoint file is missing. learning parameters will be initialized from 0 +MPI Rank 1: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 1: +MPI Rank 1: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.95954418; TotalTime = 9.3550s; SamplesPerSecond = 1094.6 +MPI Rank 1: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.94722424; TotalTime = 8.1200s; SamplesPerSecond = 1261.1 +MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.9504802; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=22.065 +MPI Rank 1: CNTKCommandTrainEnd: train +MPI Rank 1: COMPLETED +MPI Rank 1: ~MPIWrapper +MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr_train.logrank2 +MPI Rank 2: ------------------------------------------------------------------- +MPI Rank 2: Build info: +MPI Rank 2: +MPI Rank 2: Built time: Mar 3 2016 14:41:54 +MPI Rank 2: Last modified date: Thu Mar 3 14:28:26 2016 +MPI Rank 2: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 2: CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 +MPI Rank 2: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 2: Build Branch: HEAD +MPI Rank 2: Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 +MPI Rank 2: Built by thhoens on SAADSRNRDEV040 +MPI Rank 2: Build Path: D:\thhoens\CNTK\Source\CNTK\ +MPI Rank 2: ------------------------------------------------------------------- +MPI Rank 2: running on SAADSRNRDEV040 at 2016/03/03 23:48:39 +MPI Rank 2: command line: +MPI Rank 2: D:\thhoens\CNTK\x64\release\cntk.exe 
configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=0 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: modelPath=$RunDir$/models/dssm.net +MPI Rank 2: MBSize=4096 +MPI Rank 2: LRate=0.0001 +MPI Rank 2: DeviceId=-1 +MPI Rank 2: parallelTrain=true +MPI Rank 2: command = train +MPI Rank 2: precision = float +MPI Rank 2: traceGPUMemoryAllocations=0 +MPI Rank 2: train = [ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=$DeviceId$ +MPI Rank 2: minibatchSize = $MBSize$ +MPI Rank 2: modelPath = $modelPath$ +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = $LRate$ +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: NDLNetworkBuilder = [ +MPI Rank 2: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 2: ] +MPI Rank 2: reader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = $DataDir$/train.all.bin +MPI Rank 2: ] +MPI Rank 2: cvReader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI 
Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = $DataDir$/train.all.bin +MPI Rank 2: ] +MPI Rank 2: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 2: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: DeviceId=0 +MPI Rank 2: numCPUThreads=10 +MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: modelPath=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: MBSize=4096 +MPI Rank 2: LRate=0.0001 +MPI Rank 2: DeviceId=-1 +MPI Rank 2: parallelTrain=true +MPI Rank 2: command = train +MPI Rank 2: precision = float +MPI Rank 2: traceGPUMemoryAllocations=0 +MPI Rank 2: train = [ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=0 +MPI Rank 2: minibatchSize = 4096 +MPI Rank 2: modelPath = C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = 0.0001 +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: 
] +MPI Rank 2: NDLNetworkBuilder = [ +MPI Rank 2: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 2: ] +MPI Rank 2: reader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: cvReader = [ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 2: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: DeviceId=0 +MPI Rank 2: numCPUThreads=10 +MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: +MPI Rank 2: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 2: configparameters: dssm.cntk:command=train +MPI Rank 2: configparameters: dssm.cntk:ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: configparameters: dssm.cntk:cvReader=[ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 2: configparameters: 
dssm.cntk:DeviceId=0 +MPI Rank 2: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 2: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 2: configparameters: dssm.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 2: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:numCPUThreads=10 +MPI Rank 2: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 2: configparameters: dssm.cntk:precision=float +MPI Rank 2: configparameters: dssm.cntk:reader=[ +MPI Rank 2: readerType = LibSVMBinaryReader +MPI Rank 2: miniBatchMode = Partial +MPI Rank 2: randomize = 0 +MPI Rank 2: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: configparameters: dssm.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 2: configparameters: dssm.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 2: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 2: configparameters: dssm.cntk:train=[ +MPI Rank 2: action = train +MPI Rank 2: numMBsToShowResult=10 +MPI Rank 2: deviceId=0 +MPI Rank 2: minibatchSize = 4096 +MPI Rank 2: modelPath = C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: traceLevel = 1 +MPI Rank 2: SGD = [ +MPI Rank 2: epochSize=102399 +MPI Rank 2: learningRatesPerSample = 0.0001 +MPI Rank 2: momentumPerMB = 0.9 +MPI Rank 2: maxEpochs=3 +MPI Rank 2: ParallelTrain=[ +MPI Rank 2: parallelizationStartEpoch=1 +MPI Rank 2: parallelizationMethod=ModelAveragingSGD +MPI Rank 2: distributedMBReading=true +MPI Rank 2: ModelAveragingSGD=[ +MPI Rank 2: SyncFrequencyInFrames=1024 +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: gradUpdateType=none +MPI 
Rank 2: gradientClippingWithTruncation=true +MPI Rank 2: clippingThresholdPerSample=1#INF +MPI Rank 2: ] +MPI Rank 2: ] +MPI Rank 2: +MPI Rank 2: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 2: command: train +MPI Rank 2: precision = float +MPI Rank 2: Using 10 CPU threads +MPI Rank 2: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 2: CNTKCommandTrainInfo: train : 3 +MPI Rank 2: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 2: CNTKCommandTrainBegin: train +MPI Rank 2: NDLBuilder Using GPU 0 +MPI Rank 2: Starting from checkpoint. Load Network From File C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net.2. +MPI Rank 2: +MPI Rank 2: Post-processing network... +MPI Rank 2: +MPI Rank 2: 2 roots: +MPI Rank 2: CE = CrossEntropyWithSoftmax +MPI Rank 2: SIM = CosDistanceWithNegativeSamples +MPI Rank 2: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 2: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: Validating 
for node CE, final verification. +MPI Rank 2: +MPI Rank 2: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 2: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 2: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 2: +MPI Rank 2: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Validating for node SIM. 17 nodes to process in pass 1. +MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: Validating for node SIM, final verification. 
+MPI Rank 2: +MPI Rank 2: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 2: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 2: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 2: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 2: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 2: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 2: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 2: +MPI Rank 2: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 2: +MPI Rank 2: Post-processing network complete. +MPI Rank 2: +MPI Rank 2: SGD using GPU 0. +MPI Rank 2: +MPI Rank 2: Training criterion node(s): +MPI Rank 2: CE = CrossEntropyWithSoftmax +MPI Rank 2: +MPI Rank 2: +MPI Rank 2: Allocating matrices for forward and/or backward propagation. 
+MPI Rank 2: No PreCompute nodes found, skipping PreCompute step +MPI Rank 2: Warning: checkpoint file is missing. learning parameters will be initialized from 0 +MPI Rank 2: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 2: +MPI Rank 2: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.98175926; TotalTime = 8.9122s; SamplesPerSecond = 1149.0 +MPI Rank 2: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.95635300; TotalTime = 8.1200s; SamplesPerSecond = 1261.1 +MPI Rank 2: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.9504802; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=22.0648 +MPI Rank 2: CNTKCommandTrainEnd: train +MPI Rank 2: COMPLETED +MPI Rank 2: ~MPIWrapper +MPI Rank 3: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr_train.logrank3 +MPI Rank 3: ------------------------------------------------------------------- +MPI Rank 3: Build info: +MPI Rank 3: +MPI Rank 3: Built time: Mar 3 2016 14:41:54 +MPI Rank 3: Last modified date: Thu Mar 3 14:28:26 2016 +MPI Rank 3: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 +MPI Rank 3: CUB_PATH: D:\thhoens\cub-1.4.1\cub-1.4.1 +MPI Rank 3: CUDNN_PATH: C:\NVIDIA\cudnn-4.0\cuda +MPI Rank 3: Build Branch: HEAD +MPI Rank 3: Build SHA1: 31a164602c629d10741761443e6e46b2ab787ad5 +MPI Rank 3: Built by thhoens on SAADSRNRDEV040 +MPI Rank 3: Build Path: D:\thhoens\CNTK\Source\CNTK\ +MPI Rank 3: ------------------------------------------------------------------- +MPI Rank 3: running on SAADSRNRDEV040 at 2016/03/03 23:48:39 +MPI Rank 3: command line: +MPI Rank 3: D:\thhoens\CNTK\x64\release\cntk.exe 
configFile=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.cntk currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM DeviceId=0 numCPUThreads=10 stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: modelPath=$RunDir$/models/dssm.net +MPI Rank 3: MBSize=4096 +MPI Rank 3: LRate=0.0001 +MPI Rank 3: DeviceId=-1 +MPI Rank 3: parallelTrain=true +MPI Rank 3: command = train +MPI Rank 3: precision = float +MPI Rank 3: traceGPUMemoryAllocations=0 +MPI Rank 3: train = [ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=$DeviceId$ +MPI Rank 3: minibatchSize = $MBSize$ +MPI Rank 3: modelPath = $modelPath$ +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = $LRate$ +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: NDLNetworkBuilder = [ +MPI Rank 3: networkDescription = $ConfigDir$/dssm.ndl +MPI Rank 3: ] +MPI Rank 3: reader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = $DataDir$/train.all.bin +MPI Rank 3: ] +MPI Rank 3: cvReader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI 
Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = $DataDir$/train.all.bin +MPI Rank 3: ] +MPI Rank 3: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 3: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: DeviceId=0 +MPI Rank 3: numCPUThreads=10 +MPI Rank 3: stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: modelPath=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: MBSize=4096 +MPI Rank 3: LRate=0.0001 +MPI Rank 3: DeviceId=-1 +MPI Rank 3: parallelTrain=true +MPI Rank 3: command = train +MPI Rank 3: precision = float +MPI Rank 3: traceGPUMemoryAllocations=0 +MPI Rank 3: train = [ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=0 +MPI Rank 3: minibatchSize = 4096 +MPI Rank 3: modelPath = C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = 0.0001 +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: 
] +MPI Rank 3: NDLNetworkBuilder = [ +MPI Rank 3: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 3: ] +MPI Rank 3: reader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: cvReader = [ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 3: DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: DeviceId=0 +MPI Rank 3: numCPUThreads=10 +MPI Rank 3: stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: +MPI Rank 3: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +MPI Rank 3: configparameters: dssm.cntk:command=train +MPI Rank 3: configparameters: dssm.cntk:ConfigDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:currentDirectory=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: configparameters: dssm.cntk:cvReader=[ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:DataDir=D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM +MPI Rank 3: configparameters: 
dssm.cntk:DeviceId=0 +MPI Rank 3: configparameters: dssm.cntk:LRate=0.0001 +MPI Rank 3: configparameters: dssm.cntk:MBSize=4096 +MPI Rank 3: configparameters: dssm.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: configparameters: dssm.cntk:NDLNetworkBuilder=[ +MPI Rank 3: networkDescription = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/dssm.ndl +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:numCPUThreads=10 +MPI Rank 3: configparameters: dssm.cntk:parallelTrain=true +MPI Rank 3: configparameters: dssm.cntk:precision=float +MPI Rank 3: configparameters: dssm.cntk:reader=[ +MPI Rank 3: readerType = LibSVMBinaryReader +MPI Rank 3: miniBatchMode = Partial +MPI Rank 3: randomize = 0 +MPI Rank 3: file = D:\thhoens\CNTK\Tests\EndToEndTests\Text\SparseDSSM/train.all.bin +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: configparameters: dssm.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu +MPI Rank 3: configparameters: dssm.cntk:stderr=C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/stderr +MPI Rank 3: configparameters: dssm.cntk:traceGPUMemoryAllocations=0 +MPI Rank 3: configparameters: dssm.cntk:train=[ +MPI Rank 3: action = train +MPI Rank 3: numMBsToShowResult=10 +MPI Rank 3: deviceId=0 +MPI Rank 3: minibatchSize = 4096 +MPI Rank 3: modelPath = C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: traceLevel = 1 +MPI Rank 3: SGD = [ +MPI Rank 3: epochSize=102399 +MPI Rank 3: learningRatesPerSample = 0.0001 +MPI Rank 3: momentumPerMB = 0.9 +MPI Rank 3: maxEpochs=3 +MPI Rank 3: ParallelTrain=[ +MPI Rank 3: parallelizationStartEpoch=1 +MPI Rank 3: parallelizationMethod=ModelAveragingSGD +MPI Rank 3: distributedMBReading=true +MPI Rank 3: ModelAveragingSGD=[ +MPI Rank 3: SyncFrequencyInFrames=1024 +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: gradUpdateType=none +MPI 
Rank 3: gradientClippingWithTruncation=true +MPI Rank 3: clippingThresholdPerSample=1#INF +MPI Rank 3: ] +MPI Rank 3: ] +MPI Rank 3: +MPI Rank 3: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +MPI Rank 3: command: train +MPI Rank 3: precision = float +MPI Rank 3: Using 10 CPU threads +MPI Rank 3: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net +MPI Rank 3: CNTKCommandTrainInfo: train : 3 +MPI Rank 3: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 +MPI Rank 3: CNTKCommandTrainBegin: train +MPI Rank 3: NDLBuilder Using GPU 0 +MPI Rank 3: Starting from checkpoint. Load Network From File C:\cygwin64\tmp\cntk-test-20160303154710.317558\Text_SparseDSSM@release_gpu/models/dssm.net.2. +MPI Rank 3: +MPI Rank 3: Post-processing network... +MPI Rank 3: +MPI Rank 3: 2 roots: +MPI Rank 3: SIM = CosDistanceWithNegativeSamples +MPI Rank 3: CE = CrossEntropyWithSoftmax +MPI Rank 3: FormNestedNetwork: WARNING: Was called twice for SIM CosDistanceWithNegativeSamples operation +MPI Rank 3: FormNestedNetwork: WARNING: Was called twice for CE CrossEntropyWithSoftmax operation +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node SIM. 17 nodes to process in pass 1. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: Validating for node SIM. 9 nodes to process in pass 2. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: Validating for node SIM, final verification. 
+MPI Rank 3: +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: +MPI Rank 3: 6 out of 17 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Validating for node CE. 21 nodes to process in pass 1. 
+MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: Validating for node CE. 
11 nodes to process in pass 2. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: Validating 
for node CE, final verification. +MPI Rank 3: +MPI Rank 3: Validating --> DSSMLabel = InputValue -> [51 [51 x 1], MBSize 0] +MPI Rank 3: Validating --> G = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> WQ1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WQ0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Query = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q = Times(WQ0[288, 49292], Query[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ0_Q_Tanh = Tanh(WQ0_Q[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q = Times(WQ1[64, 288], WQ0_Q_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WQ1_Q_Tanh = Tanh(WQ1_Q[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1 = LearnableParameter -> [64 [64], 288] +MPI Rank 3: Validating --> WD0 = LearnableParameter -> [288 [288], 49292] +MPI Rank 3: Validating --> Keyword = SparseInputValue -> [49292 [49292], MBSize 0] +MPI Rank 3: Validating --> WD0_D = Times(WD0[288, 49292], Keyword[49292, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD0_D_Tanh = Tanh(WD0_D[288, MBSize 0]) -> [288 [288], MBSize 0] +MPI Rank 3: Validating --> WD1_D = Times(WD1[64, 288], WD0_D_Tanh[288, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> WD1_D_Tanh = Tanh(WD1_D[64, MBSize 0]) -> [64 [64], MBSize 0] +MPI Rank 3: Validating --> S = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> N = LearnableParameter -> [1 [1], 1] +MPI Rank 3: Validating --> SIM = CosDistanceWithNegativeSamples(WQ1_Q_Tanh[64, MBSize 0], WD1_D_Tanh[64, MBSize 0], S[1, 1], N[1, 1]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> SIM_Scale = ElementTimes(G[1, 1], SIM[51, MBSize 0]) -> [51 [51], MBSize 0] +MPI Rank 3: Validating --> CE = CrossEntropyWithSoftmax(DSSMLabel[51 [51 x 1], MBSize 0], SIM_Scale[51, MBSize 0]) -> [1 [1], 1] +MPI Rank 3: +MPI Rank 3: 8 out of 
21 nodes do not share the minibatch layout with the input data. +MPI Rank 3: +MPI Rank 3: Post-processing network complete. +MPI Rank 3: +MPI Rank 3: SGD using GPU 0. +MPI Rank 3: +MPI Rank 3: Training criterion node(s): +MPI Rank 3: CE = CrossEntropyWithSoftmax +MPI Rank 3: +MPI Rank 3: +MPI Rank 3: Allocating matrices for forward and/or backward propagation. +MPI Rank 3: No PreCompute nodes found, skipping PreCompute step +MPI Rank 3: Warning: checkpoint file is missing. learning parameters will be initialized from 0 +MPI Rank 3: Set Max Temp Mem Size For Convolution Nodes to 0 samples. +MPI Rank 3: Starting Epoch 3: learning rate per sample = 0.000100 effective momentum = 0.900000 momentum as time constant = 38876.0 samples +MPI Rank 3: +MPI Rank 3: Starting minibatch loop, distributed reading is ENABLED. +MPI Rank 3: Epoch[ 3 of 3]-Minibatch[ 1- 10, 40.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.93374424; TotalTime = 9.3941s; SamplesPerSecond = 1090.0 +MPI Rank 3: Epoch[ 3 of 3]-Minibatch[ 11- 20, 80.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.95965519; TotalTime = 8.1200s; SamplesPerSecond = 1261.1 +MPI Rank 3: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.9504802; AvgLearningRatePerSample = 9.9999997e-005; EpochTime=22.065 +MPI Rank 3: CNTKCommandTrainEnd: train +MPI Rank 3: COMPLETED +MPI Rank 3: ~MPIWrapper diff --git a/Tests/EndToEndTests/Text/SparseDSSM/dssm.cntk b/Tests/EndToEndTests/Text/SparseDSSM/dssm.cntk new file mode 100755 index 000000000000..ad5967e840e8 --- /dev/null +++ b/Tests/EndToEndTests/Text/SparseDSSM/dssm.cntk @@ -0,0 +1,61 @@ +modelPath=$RunDir$/models/dssm.net + +MBSize=4096 +LRate=0.0001 + +# deviceId=-1 for CPU, >=0 for GPU devices +DeviceId=-1 + +parallelTrain=true + +command = train +precision = float +traceGPUMemoryAllocations=0 +train = [ + action = train + numMBsToShowResult=10 + deviceId=$DeviceId$ + minibatchSize = $MBSize$ + modelPath = $modelPath$ + traceLevel = 1 + + SGD = [ + epochSize=102399 + 
learningRatesPerSample = $LRate$ + momentumPerMB = 0.9 + maxEpochs=3 + ParallelTrain=[ + parallelizationStartEpoch=1 + parallelizationMethod=ModelAveragingSGD + distributedMBReading=true + ModelAveragingSGD=[ + SyncFrequencyInFrames=1024 + ] + ] + + gradUpdateType=none + gradientClippingWithTruncation=true + clippingThresholdPerSample=1#INF + ] + +] + +NDLNetworkBuilder = [ + networkDescription = $ConfigDir$/dssm.ndl +] + +reader = [ + # reader to use + readerType = LibSVMBinaryReader + miniBatchMode = Partial + randomize = 0 + file = $DataDir$/train.all.bin +] + +cvReader = [ + # reader to use + readerType = LibSVMBinaryReader + miniBatchMode = Partial + randomize = 0 + file = $DataDir$/train.all.bin +] diff --git a/Tests/EndToEndTests/Text/SparseDSSM/dssm.ndl b/Tests/EndToEndTests/Text/SparseDSSM/dssm.ndl new file mode 100755 index 000000000000..a1c885b9f62a --- /dev/null +++ b/Tests/EndToEndTests/Text/SparseDSSM/dssm.ndl @@ -0,0 +1,50 @@ +# The following script defines the same network structure as that in the +# baseline dssm See details in +# \\msrr-deep-02\hxd\DSSM\WWW14_Set\config_DSSM.ax.txt +# Feature dimension is data dependent. Each time we changes the dataset, SDim_Q and SDim_D must be set accordingly. +# For the train.4M dataset, they are set to be 49288 and 49010. +# Theoretically, we could set these values to the maximum feature size. We chose current way to make it consistent with DSSM. 
+SDim_Q=49292 +SDim_D=49292 +HDim=288 +LDim=64 +NEG=50 +RC=51 +SHIFT=1 +GAMMA=10 + +# Model learnable parameters +WQ0=Parameter(HDim, SDim_Q) +WQ1=Parameter(LDim, HDim) +# Model learnable parameters +WD0=Parameter(HDim, SDim_D) +WD1=Parameter(LDim, HDim) +# Below is for Q +Query=SparseInput(SDim_Q) +WQ0_Q=Times(WQ0, Query) +WQ0_Q_Tanh=Tanh(WQ0_Q) +WQ1_Q=Times(WQ1, WQ0_Q_Tanh) +WQ1_Q_Tanh=Tanh(WQ1_Q) +# Below is for D +Keyword=SparseInput(SDim_D) +WD0_D=Times(WD0, Keyword) +WD0_D_Tanh=Tanh(WD0_D) +WD1_D=Times(WD1, WD0_D_Tanh) +WD1_D_Tanh=Tanh(WD1_D) +# Below is for comparing Q and D +S=Constant(SHIFT) +N=Constant(NEG) +G=Constant(GAMMA) +# BUGBUG: DSSMLabel is a special input that LibSVMBinaryReader knows about +# It fills this in automagically. +DSSMLabel=Input(RC, 1) + +SIM=CosDistanceWithNegativeSamples(WQ1_Q_Tanh, WD1_D_Tanh, S, N) +SIM_Scale=Scale(G,SIM) +CE=CrossEntropyWithSoftmax(DSSMLabel, SIM_Scale) + +FeatureNodes=(Query,Keyword) +LabelNodes=(DSSMLabel) +CriteriaNodes=(CE) +EvalNodes=(CE) +OutputNodes=(SIM) diff --git a/Tests/EndToEndTests/Text/SparseDSSM/run-test b/Tests/EndToEndTests/Text/SparseDSSM/run-test new file mode 100755 index 000000000000..1644e7a27550 --- /dev/null +++ b/Tests/EndToEndTests/Text/SparseDSSM/run-test @@ -0,0 +1,50 @@ +#!/bin/bash + +. $TEST_ROOT_DIR/run-test-common + +# This test uses a large dataset which is not part of the CNTK repository itself +# We use the dataset from an external location specified using an environment variable +if [[ "$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY" == "" || ! -d "$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY" ]]; then + echo 'This test uses external data that is not part of the CNTK repository. 
Environment variable CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY must be set to point to the external test data location' + exit 1 +fi + +if [ "$OS" == "Windows_NT" ]; then + DataSourceDir=`cygpath -au $CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY`/Text/SparseDSSM +else + DataSourceDir=$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY/Text/SparseDSSM +fi + +# Copy the test data to the test run directory +DataDir=$TEST_RUN_DIR/TestData +mkdir $DataDir +cp -R $DataSourceDir/* $DataDir || exit $? + + +ConfigDir=$TEST_DIR/ +LogFileName=stderr +Instances=4 +NumCPUThreads=$(threadsPerInstance $Instances) + +# cntkrun +cntkmpirun "-n $Instances" dssm.cntk "numCPUThreads=$NumCPUThreads" +ExitCode=$? +sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_train.logrank0 +sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_train.logrank1 +sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/"$LogFileName"_train.logrank2 +sed 's/^/MPI Rank 3: /' $TEST_RUN_DIR/"$LogFileName"_train.logrank3 +if [ "$ExitCode" != "0" ]; then + exit $ExitCode +fi +echo === Deleting last epoch data +rm $TEST_RUN_DIR/models/*.net || exit $? +echo ==== Re-running from checkpoint +DeleteExistingModels=0 +# cntkrun +cntkmpirun "-n $Instances" dssm.cntk "numCPUThreads=$NumCPUThreads" +ExitCode=$? 
+sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_train.logrank0 +sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_train.logrank1 +sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/"$LogFileName"_train.logrank2 +sed 's/^/MPI Rank 3: /' $TEST_RUN_DIR/"$LogFileName"_train.logrank3 +exit $ExitCode diff --git a/Tests/EndToEndTests/Text/SparseDSSM/testcases.yml.disabled b/Tests/EndToEndTests/Text/SparseDSSM/testcases.yml.disabled new file mode 100755 index 000000000000..dac208722980 --- /dev/null +++ b/Tests/EndToEndTests/Text/SparseDSSM/testcases.yml.disabled @@ -0,0 +1,37 @@ +dataDir: ./ +tags: + # running on every BVT job in 'S' (Speech) leg in Debug-GPU and Release-CPU configurations: + - bvt-s (build_sku == 'gpu') and ((flavor=='debug') ^ (device=='cpu')) + # running unconditionally on every Nightly job in 'S' leg + - nightly-s (build_sku == 'gpu') + +testCases: + Must train epochs in exactly same order and parameters: + patterns: + - ^MPI Rank {{integer}} + - Starting Epoch {{integer}} + - learning rate per sample = {{float}} + - momentum = {{float}} + + Training must finish as expected: + patterns: + - ^MPI Rank {{integer}} + - Finished Epoch[{{integer}} of {{integer}}] + - Training Set + - TrainLossPerSample = {{float,tolerance=.1%}} + - AvgLearningRatePerSample = {{float,tolerance=0.001%}} + + Cross Validation must finish as expected: + patterns: + - ^MPI Rank {{integer}} + - Finished Epoch[{{integer}} of {{integer}}] + - Validation Set + - TrainLossPerSample = {{float,tolerance=.1%}} + + Per-minibatch training results must match: + patterns: + - ^MPI Rank {{integer}} + - Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}} + - SamplesSeen = {{integer}} + - TrainLossPerSample = {{float,tolerance=.1%}} +