Merge with new changes including RowStackNode

henauy · Jun 16, 2015 · f332421 · f332421
2 parents 01468f3 + 99af413
commit f332421
Show file tree

Hide file tree

Showing 32 changed files with 810 additions and 134 deletions.
diff --git a/.gitignore b/.gitignore
@@ -15,6 +15,7 @@ x64/
 build/
 [Bb]in/
 [Oo]bj/
+.run-*
 
 # Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets
 !packages/*/build/

diff --git a/DataReader/BinaryReader/BinaryWriter.cpp b/DataReader/BinaryReader/BinaryWriter.cpp
@@ -47,8 +47,8 @@ BinaryWriter<ElemType>::~BinaryWriter()
 //  miniBatchMode=Partial
 //  randomize=None
 //  wfile=c:\speech\mnist\mnist_test.bin
-//  #wsize - inital size of the file in MB
-//  # if calculated size would be bigger, that is used instead
+//  #wsize - initial size of the file in MB; defaults to 256
+//  # has to be large enough for your dataset. The file will shrink to the actual size when closed.
 //  #wsize=256
 //  #wrecords - number of records we should allocate space for in the file
 //  # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file

diff --git a/DataReader/HTKMLFReader/HTKMLFReader.cpp b/DataReader/HTKMLFReader/HTKMLFReader.cpp
@@ -980,8 +980,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                         {
                             // dereference matrix that corresponds to key (input/output name) and 
                             // populate based on whether its a feature or a label
-                            //Matrix<ElemType>& data =
-                                                        *matrices[iter->first]; // can be features or labels
+                            //Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels
 
                             if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
                             {
@@ -1058,8 +1057,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                         {
                             // dereference matrix that corresponds to key (input/output name) and 
                             // populate based on whether its a feature or a label
-                            //Matrix<ElemType>& data =
-                                                        *matrices[iter->first]; // can be features or labels
+                            //Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels
 
                             if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
                             {
@@ -1134,8 +1132,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                         {
                             // dereference matrix that corresponds to key (input/output name) and 
                             // populate based on whether its a feature or a label
-                            //Matrix<ElemType>& data =
-                                                        *matrices[iter->first]; // can be features or labels
+                            //Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels
 
                             if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
                             {

diff --git a/DataReader/HTKMLFReader/basetypes.h b/DataReader/HTKMLFReader/basetypes.h
@@ -142,6 +142,15 @@ extern void _CHECKED_ASSERT_error(const char * file, int line, const char * exp)
 #endif
 #endif
 
+/**
+These macros are used for sentence segmentation information.
+*/
+#define SENTENCE_BEGIN 0 
+#define SENTENCE_MIDDLE 1
+#define NO_LABELS -1
+#define EXISTS_SENTENCE_BEGIN_OR_NO_LABELS 0
+#define NO_EXISTS_SENTENCE_BEGIN_OR_NO_LABELS 1
+
 // ----------------------------------------------------------------------------
 // basic data types
 // ----------------------------------------------------------------------------

diff --git a/DataReader/HTKMLFReader/utterancesourcemulti.h b/DataReader/HTKMLFReader/utterancesourcemulti.h
@@ -382,47 +382,58 @@ class minibatchutterancesourcemulti : public minibatchsource
                 // TODO: we can store labels more efficiently now since we don't do frame-wise random access anymore.
 
                 // OK, utterance has all we need --remember it
-                utteranceset.push_back (std::move (utterance));
 
                 if (m==0)
                 {
-                    _totalframes += uttframes;
-                    framesaccum.push_back(uttframes); //track number of frames in each utterance - first feature is the reference
                     if (!labels.empty() && !lacksmlf)
                     //if (!labels.empty() && labelsiter != labels[0].end())
                     {
-                        foreach_index (j, labels)
+                        // first verify that all the label files have the proper duration
+                        bool durationmatch = true;
+                        foreach_index(j, labels)
                         {
                             const auto & labseq = labels[j].find(key)->second;
                             // check if durations match; skip if not
-                            size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes);
+                            size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size() - 1].firstframe + labseq[labseq.size() - 1].numframes);
                             if (labframes != uttframes)
                             {
-                                fprintf (stderr, " [duration mismatch (%d in label vs. %d in feat file), skipping %S]", labframes, uttframes, key.c_str());
+                                fprintf(stderr, " [duration mismatch (%d in label vs. %d in feat file), skipping %S]", labframes, uttframes, key.c_str());
                                 nomlf++;
-                                continue;   // skip this utterance at all
+                                durationmatch = false;
+                                break; // stop checking further label sets; this utterance is skipped entirely
                             }
-                            // expand classid sequence into flat array
-                            foreach_index (i, labseq)
+                        }
+                        if (durationmatch){
+                            utteranceset.push_back(std::move(utterance));
+                            _totalframes += uttframes;
+                            framesaccum.push_back(uttframes); //track number of frames in each utterance - first feature is the reference
+                            // then parse each mlf if the durations are consistent
+                            foreach_index(j, labels)
                             {
-                                const auto & e = labseq[i];
-                                if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0))
-                                    throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: labels not in consecutive order MLF in label set: %S", key.c_str()));
-                                if (e.classid >= udim[j])
-                                    throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: class id %d exceeds model output dimension %d in file %S", e.classid, udim, key.c_str()));
-                                if (e.classid != (CLASSIDTYPE) e.classid)
-                                    throw std::runtime_error ("CLASSIDTYPE has too few bits");
-                                for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++)
-                                    classids[j]->push_back ((CLASSIDTYPE) e.classid);
-                                numclasses[j] = max (numclasses[j], 1u + e.classid);
-                                counts[j].resize (numclasses[j], 0);
-                                counts[j][e.classid] += e.numframes;
-                            }
-                            classids[j]->push_back ((CLASSIDTYPE) -1);  // append a boundary marker marker for checking
+                                const auto & labseq = labels[j].find(key)->second;
+                                // expand classid sequence into flat array
+                                foreach_index(i, labseq)
+                                {
+                                    const auto & e = labseq[i];
+                                    if ((i > 0 && labseq[i - 1].firstframe + labseq[i - 1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0))
+                                        throw std::runtime_error(msra::strfun::strprintf("minibatchutterancesource: labels not in consecutive order MLF in label set: %S", key.c_str()));
+                                    if (e.classid >= udim[j])
+                                        throw std::runtime_error(msra::strfun::strprintf("minibatchutterancesource: class id %d exceeds model output dimension %d in file %S", e.classid, udim, key.c_str()));
+                                    if (e.classid != (CLASSIDTYPE)e.classid)
+                                        throw std::runtime_error("CLASSIDTYPE has too few bits");
+                                    for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++)
+                                        classids[j]->push_back((CLASSIDTYPE)e.classid);
+                                    numclasses[j] = max(numclasses[j], 1u + e.classid);
+                                    counts[j].resize(numclasses[j], 0);
+                                    counts[j][e.classid] += e.numframes;
+                                }
+
+                                classids[j]->push_back((CLASSIDTYPE)-1);  // append a boundary marker for checking
 
-                            if (!labels[j].empty() && classids[j]->size() != _totalframes + utteranceset.size())
-                                throw std::logic_error (msra::strfun::strprintf ("minibatchutterancesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
-                            assert (labels[j].empty() || classids[j]->size() == _totalframes + utteranceset.size());
+                                if (!labels[j].empty() && classids[j]->size() != _totalframes + utteranceset.size())
+                                    throw std::logic_error(msra::strfun::strprintf("minibatchutterancesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
+                                assert(labels[j].empty() || classids[j]->size() == _totalframes + utteranceset.size());
+                            }
                         }
                     }
                     else{
@@ -451,7 +462,7 @@ class minibatchutterancesourcemulti : public minibatchsource
             }
             if (nomlf + nolat > 0)
             {
-                fprintf (stderr, "minibatchutterancesource: out of %d files, %d files not found in label set and %d have no lattice\n", infiles.size(), nomlf, nolat);
+                fprintf (stderr, "minibatchutterancesource: out of %d files, %d files not found in label set and %d have no lattice\n", infiles[0].size(), nomlf, nolat);
                 if (nomlf + nolat > infiles[m].size() / 2)
                     throw std::runtime_error ("minibatchutterancesource: too many files not found in label set--assuming broken configuration\n");
             }

diff --git a/DataReader/Kaldi2Reader/HTKMLFReader.cpp b/DataReader/Kaldi2Reader/HTKMLFReader.cpp
@@ -24,6 +24,7 @@
 #define DATAREADER_EXPORTS  // creating the exports here
 #include "DataReader.h"
 #include "HTKMLFReader.h"
+#include "commandArgUtil.h"
 #ifdef LEAKDETECT
 #include <vld.h> // for memory leak detection
 #endif

diff --git a/DataReader/Kaldi2Reader/HTKMLFWriter.cpp b/DataReader/Kaldi2Reader/HTKMLFWriter.cpp
@@ -28,6 +28,7 @@
 #include "DataWriter.h"
 #include "commandArgUtil.h"
 #include "HTKMLFWriter.h"
+#include "commandArgUtil.h"
 #ifdef LEAKDETECT
 #include <vld.h> // for memory leak detection
 #endif

diff --git a/DataReader/LMSequenceReader/SequenceReader.cpp b/DataReader/LMSequenceReader/SequenceReader.cpp
@@ -2048,6 +2048,10 @@ void BatchSequenceReader<ElemType>::GetLabelOutput(std::map<std::wstring,
     {
         RuntimeError("GetLabelOutput::should use CPU for labels ");
     }
+    if (curDevId != CPUDEVICE)
+    {
+        labels->TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false);
+    }
 }
 
 template<class ElemType>

diff --git a/DataReader/UCIFastReader/UCIParser.cpp b/DataReader/UCIFastReader/UCIParser.cpp
@@ -11,6 +11,11 @@
 #include <stdexcept>
 #include <stdint.h>
 
+#if WIN32
+#define ftell64 _ftelli64
+#else
+#define ftell64 ftell
+#endif
 
 // SetState for a particular value
 template <typename NumType, typename LabelType>
@@ -362,10 +367,10 @@ void UCIParser<NumType, LabelType>::ParseInit(LPCWSTR fileName, size_t startFeat
 
     errno_t err = _wfopen_s( &m_pFile, fileName, L"rb" );
     if (err)
-        std::runtime_error("UCIParser::ParseInit - error opening file"); 
+        throw std::runtime_error("UCIParser::ParseInit - error opening file"); 
     int rc = _fseeki64(m_pFile, 0, SEEK_END);
     if (rc)
-        std::runtime_error("UCIParser::ParseInit - error seeking in file");
+        throw std::runtime_error("UCIParser::ParseInit - error seeking in file");
 
     m_fileSize = GetFilePosition();
     m_fileBuffer = new BYTE[m_bufferSize];
@@ -377,9 +382,9 @@ void UCIParser<NumType, LabelType>::ParseInit(LPCWSTR fileName, size_t startFeat
 template <typename NumType, typename LabelType>
 int64_t UCIParser<NumType, LabelType>::GetFilePosition()
 {
-    int64_t position = _ftelli64(m_pFile);
+    int64_t position = ftell64(m_pFile);
     if (position == -1L)
-        std::runtime_error("UCIParser::GetFilePosition - error retrieving file position in file");
+        throw std::runtime_error("UCIParser::GetFilePosition - error retrieving file position in file");
     return position;
 }
 
@@ -392,7 +397,7 @@ void UCIParser<NumType, LabelType>::SetFilePosition(int64_t position)
 {
     int rc = _fseeki64(m_pFile, position, SEEK_SET);
     if (rc)
-        std::runtime_error("UCIParser::SetFilePosition - error seeking in file");
+        throw std::runtime_error("UCIParser::SetFilePosition - error seeking in file");
 
     // setup state machine to start at this position
     PrepareStartPosition(position);
@@ -445,7 +450,7 @@ size_t UCIParser<NumType, LabelType>::UpdateBuffer()
     size_t bytesToRead = min(m_bufferSize, m_fileSize-m_bufferStart)-saveBytes;
     size_t bytesRead = fread(m_fileBuffer+saveBytes, 1, bytesToRead, m_pFile);
     if (bytesRead == 0 && ferror(m_pFile))
-        std::runtime_error("UCIParser::UpdateBuffer - error reading file");
+        throw std::runtime_error("UCIParser::UpdateBuffer - error reading file");
     return bytesRead;
 }
 

diff --git a/DataReader/UCIFastReader/UCIParser.h b/DataReader/UCIFastReader/UCIParser.h
@@ -90,8 +90,8 @@ class UCIParser
     int m_elementsConvertedThisLine;
 
     // global stats
-    int m_totalNumbersConverted;
-    int m_totalLabelsConverted;
+    int64_t m_totalNumbersConverted;
+    int64_t m_totalLabelsConverted;
 
     // file positions/buffer
     FILE * m_pFile;

diff --git a/Demos/Simple/Simple.config b/Demos/Simple/Simple.config
@@ -1,8 +1,9 @@
-# command=Simple_Demo_Output
+RootDir=..
 command=Simple_Demo:Simple_Demo_Output
 
 # deviceId=-1 for CPU, >=0 for GPU devices
 DeviceNumber=-1
+
 #stderr=Demo
 
 precision=float
@@ -13,7 +14,6 @@ deviceId=$DeviceNumber$
 outputNodeNames=ScaledLogLikelihood
 traceLevel=1
 
-
 #######################################
 #  TRAINING CONFIG (Simple, Fixed LR) #
 #######################################
@@ -52,22 +52,22 @@ Simple_Demo=[
     reader=[
       # reader to use
       readerType=UCIFastReader
-      file=../Demos/Simple/SimpleDataTrain.txt
+      file=$RootDir$/Demos/Simple/SimpleDataTrain.txt
 
       miniBatchMode=Partial
       randomize=Auto
       verbosity=1   
 
       features=[
-	  dim=2      # two-dimensional input data
+          dim=2      # two-dimensional input data
           start=0    # Start with first element on line
       ]
 
       labels=[
-	start=2      # Skip two elements
+        start=2      # Skip two elements
         dim=1        # One label dimension
         labelDim=2   # Two labels possible
-        labelMappingFile=../Demos/Simple/SimpleMapping.txt
+        labelMappingFile=$RootDir$/Demos/Simple/SimpleMapping.txt
       ]
     ]
 ]
@@ -84,16 +84,16 @@ Simple_Demo_Output=[
     reader=[
       # reader to use
       readerType=UCIFastReader
-      file=../Demos/Simple/SimpleDataTest.txt
+      file=$RootDir$/Demos/Simple/SimpleDataTest.txt
       features=[
           dim=2
-	  start=0
+          start=0
       ]
       labels=[
-	start=2
+      start=2
         dim=1
         labelDim=2
-        labelMappingFile=../Demos/Simple/SimpleMapping.txt
+        labelMappingFile=$RootDir$/Demos/Simple/SimpleMapping.txt
       ]
     ]
     outputPath=SimpleOutput    # Dump output as text
-Original file line number
+Diff line change
@@ Expand Up / @@ -15,6 +15,7 @@ x64/ @@
     build/
     [Bb]in/
     [Oo]bj/
+    .run-*
     # Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets
     !packages/*/build/
@@ Expand Down @@