Skip to content

Commit

Permalink
Merge with new changes including RowStackNode
Browse files Browse the repository at this point in the history
  • Loading branch information
kaisheng committed Jun 16, 2015
2 parents 01468f3 + 99af413 commit f332421
Show file tree
Hide file tree
Showing 32 changed files with 810 additions and 134 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ x64/
build/
[Bb]in/
[Oo]bj/
.run-*

# Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets
!packages/*/build/
Expand Down
4 changes: 2 additions & 2 deletions DataReader/BinaryReader/BinaryWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ BinaryWriter<ElemType>::~BinaryWriter()
// miniBatchMode=Partial
// randomize=None
// wfile=c:\speech\mnist\mnist_test.bin
// #wsize - inital size of the file in MB
// # if calculated size would be bigger, that is used instead
// #wsize - initial size of the file in MB; defaults to 256
// # has to be large enough for your dataset. The file will shrink to the actual size when closed.
// #wsize=256
// #wrecords - number of records we should allocate space for in the file
// # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file
Expand Down
9 changes: 3 additions & 6 deletions DataReader/HTKMLFReader/HTKMLFReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -980,8 +980,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
// dereference matrix that corresponds to key (input/output name) and
// populate based on whether its a feature or a label
//Matrix<ElemType>& data =
*matrices[iter->first]; // can be features or labels
//Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels

if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{
Expand Down Expand Up @@ -1058,8 +1057,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
// dereference matrix that corresponds to key (input/output name) and
// populate based on whether its a feature or a label
//Matrix<ElemType>& data =
*matrices[iter->first]; // can be features or labels
//Matrix<ElemType>& data =*matrices[iter->first]; // can be features or labels

if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{
Expand Down Expand Up @@ -1134,8 +1132,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
// dereference matrix that corresponds to key (input/output name) and
// populate based on whether its a feature or a label
//Matrix<ElemType>& data =
*matrices[iter->first]; // can be features or labels
//Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels

if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{
Expand Down
9 changes: 9 additions & 0 deletions DataReader/HTKMLFReader/basetypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,15 @@ extern void _CHECKED_ASSERT_error(const char * file, int line, const char * exp)
#endif
#endif

/**
These macros are used for sentence segmentation information.

NOTE(review): the names suggest two groups -- per-frame flags
(SENTENCE_BEGIN, SENTENCE_MIDDLE, NO_LABELS) and a summary flag
(EXISTS_SENTENCE_BEGIN_OR_NO_LABELS / NO_EXISTS_SENTENCE_BEGIN_OR_NO_LABELS);
confirm the exact meaning against the code that reads them.
They stay plain integer macros so every existing use keeps compiling unchanged.
*/
#define SENTENCE_BEGIN 0
#define SENTENCE_MIDDLE 1
#define NO_LABELS (-1) // parenthesized so the negative literal always expands as a single operand
#define EXISTS_SENTENCE_BEGIN_OR_NO_LABELS 0
#define NO_EXISTS_SENTENCE_BEGIN_OR_NO_LABELS 1

// ----------------------------------------------------------------------------
// basic data types
// ----------------------------------------------------------------------------
Expand Down
65 changes: 38 additions & 27 deletions DataReader/HTKMLFReader/utterancesourcemulti.h
Original file line number Diff line number Diff line change
Expand Up @@ -382,47 +382,58 @@ class minibatchutterancesourcemulti : public minibatchsource
// TODO: we can store labels more efficiently now since we don't do frame-wise random access anymore.

// OK, utterance has all we need --remember it
utteranceset.push_back (std::move (utterance));

if (m==0)
{
_totalframes += uttframes;
framesaccum.push_back(uttframes); //track number of frames in each utterance - first feature is the reference
if (!labels.empty() && !lacksmlf)
//if (!labels.empty() && labelsiter != labels[0].end())
{
foreach_index (j, labels)
// first verify that all the label files have the proper duration
bool durationmatch = true;
foreach_index(j, labels)
{
const auto & labseq = labels[j].find(key)->second;
// check if durations match; skip if not
size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes);
size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size() - 1].firstframe + labseq[labseq.size() - 1].numframes);
if (labframes != uttframes)
{
fprintf (stderr, " [duration mismatch (%d in label vs. %d in feat file), skipping %S]", labframes, uttframes, key.c_str());
fprintf(stderr, " [duration mismatch (%d in label vs. %d in feat file), skipping %S]", labframes, uttframes, key.c_str());
nomlf++;
continue; // skip this utterance at all
durationmatch = false;
break; // continue; // skip this utterance at all
}
// expand classid sequence into flat array
foreach_index (i, labseq)
}
if (durationmatch){
utteranceset.push_back(std::move(utterance));
_totalframes += uttframes;
framesaccum.push_back(uttframes); //track number of frames in each utterance - first feature is the reference
// then parse each mlf if the durations are consistent
foreach_index(j, labels)
{
const auto & e = labseq[i];
if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0))
throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: labels not in consecutive order MLF in label set: %S", key.c_str()));
if (e.classid >= udim[j])
throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: class id %d exceeds model output dimension %d in file %S", e.classid, udim, key.c_str()));
if (e.classid != (CLASSIDTYPE) e.classid)
throw std::runtime_error ("CLASSIDTYPE has too few bits");
for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++)
classids[j]->push_back ((CLASSIDTYPE) e.classid);
numclasses[j] = max (numclasses[j], 1u + e.classid);
counts[j].resize (numclasses[j], 0);
counts[j][e.classid] += e.numframes;
}
classids[j]->push_back ((CLASSIDTYPE) -1); // append a boundary marker marker for checking
const auto & labseq = labels[j].find(key)->second;
// expand classid sequence into flat array
foreach_index(i, labseq)
{
const auto & e = labseq[i];
if ((i > 0 && labseq[i - 1].firstframe + labseq[i - 1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0))
throw std::runtime_error(msra::strfun::strprintf("minibatchutterancesource: labels not in consecutive order MLF in label set: %S", key.c_str()));
if (e.classid >= udim[j])
throw std::runtime_error(msra::strfun::strprintf("minibatchutterancesource: class id %d exceeds model output dimension %d in file %S", e.classid, udim, key.c_str()));
if (e.classid != (CLASSIDTYPE)e.classid)
throw std::runtime_error("CLASSIDTYPE has too few bits");
for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++)
classids[j]->push_back((CLASSIDTYPE)e.classid);
numclasses[j] = max(numclasses[j], 1u + e.classid);
counts[j].resize(numclasses[j], 0);
counts[j][e.classid] += e.numframes;
}

classids[j]->push_back((CLASSIDTYPE)-1); // append a boundary marker marker for checking

if (!labels[j].empty() && classids[j]->size() != _totalframes + utteranceset.size())
throw std::logic_error (msra::strfun::strprintf ("minibatchutterancesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
assert (labels[j].empty() || classids[j]->size() == _totalframes + utteranceset.size());
if (!labels[j].empty() && classids[j]->size() != _totalframes + utteranceset.size())
throw std::logic_error(msra::strfun::strprintf("minibatchutterancesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
assert(labels[j].empty() || classids[j]->size() == _totalframes + utteranceset.size());
}
}
}
else{
Expand Down Expand Up @@ -451,7 +462,7 @@ class minibatchutterancesourcemulti : public minibatchsource
}
if (nomlf + nolat > 0)
{
fprintf (stderr, "minibatchutterancesource: out of %d files, %d files not found in label set and %d have no lattice\n", infiles.size(), nomlf, nolat);
fprintf (stderr, "minibatchutterancesource: out of %d files, %d files not found in label set and %d have no lattice\n", infiles[0].size(), nomlf, nolat);
if (nomlf + nolat > infiles[m].size() / 2)
throw std::runtime_error ("minibatchutterancesource: too many files not found in label set--assuming broken configuration\n");
}
Expand Down
1 change: 1 addition & 0 deletions DataReader/Kaldi2Reader/HTKMLFReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#define DATAREADER_EXPORTS // creating the exports here
#include "DataReader.h"
#include "HTKMLFReader.h"
#include "commandArgUtil.h"
#ifdef LEAKDETECT
#include <vld.h> // for memory leak detection
#endif
Expand Down
1 change: 1 addition & 0 deletions DataReader/Kaldi2Reader/HTKMLFWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "DataWriter.h"
#include "commandArgUtil.h"
#include "HTKMLFWriter.h"
#include "commandArgUtil.h"
#ifdef LEAKDETECT
#include <vld.h> // for memory leak detection
#endif
Expand Down
4 changes: 4 additions & 0 deletions DataReader/LMSequenceReader/SequenceReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2048,6 +2048,10 @@ void BatchSequenceReader<ElemType>::GetLabelOutput(std::map<std::wstring,
{
RuntimeError("GetLabelOutput::should use CPU for labels ");
}
if (curDevId != CPUDEVICE)
{
labels->TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false);
}
}

template<class ElemType>
Expand Down
17 changes: 11 additions & 6 deletions DataReader/UCIFastReader/UCIParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
#include <stdexcept>
#include <stdint.h>

// ftell64(FILE*) - report the current file position as a 64-bit-capable offset.
// Windows: _ftelli64 returns __int64.
// Elsewhere: ftello returns off_t. Plain ftell returns long, which is only
// 32 bits wide on some platforms and cannot represent positions past 2 GiB.
// _WIN32 is predefined by Windows compilers, so the check does not depend on
// the build system defining WIN32.
#if defined(_WIN32)
#define ftell64 _ftelli64
#else
#define ftell64 ftello
#endif

// SetState for a particular value
template <typename NumType, typename LabelType>
Expand Down Expand Up @@ -362,10 +367,10 @@ void UCIParser<NumType, LabelType>::ParseInit(LPCWSTR fileName, size_t startFeat

errno_t err = _wfopen_s( &m_pFile, fileName, L"rb" );
if (err)
std::runtime_error("UCIParser::ParseInit - error opening file");
throw std::runtime_error("UCIParser::ParseInit - error opening file");
int rc = _fseeki64(m_pFile, 0, SEEK_END);
if (rc)
std::runtime_error("UCIParser::ParseInit - error seeking in file");
throw std::runtime_error("UCIParser::ParseInit - error seeking in file");

m_fileSize = GetFilePosition();
m_fileBuffer = new BYTE[m_bufferSize];
Expand All @@ -377,9 +382,9 @@ void UCIParser<NumType, LabelType>::ParseInit(LPCWSTR fileName, size_t startFeat
template <typename NumType, typename LabelType>
int64_t UCIParser<NumType, LabelType>::GetFilePosition()
{
int64_t position = _ftelli64(m_pFile);
int64_t position = ftell64(m_pFile);
if (position == -1L)
std::runtime_error("UCIParser::GetFilePosition - error retrieving file position in file");
throw std::runtime_error("UCIParser::GetFilePosition - error retrieving file position in file");
return position;
}

Expand All @@ -392,7 +397,7 @@ void UCIParser<NumType, LabelType>::SetFilePosition(int64_t position)
{
int rc = _fseeki64(m_pFile, position, SEEK_SET);
if (rc)
std::runtime_error("UCIParser::SetFilePosition - error seeking in file");
throw std::runtime_error("UCIParser::SetFilePosition - error seeking in file");

// setup state machine to start at this position
PrepareStartPosition(position);
Expand Down Expand Up @@ -445,7 +450,7 @@ size_t UCIParser<NumType, LabelType>::UpdateBuffer()
size_t bytesToRead = min(m_bufferSize, m_fileSize-m_bufferStart)-saveBytes;
size_t bytesRead = fread(m_fileBuffer+saveBytes, 1, bytesToRead, m_pFile);
if (bytesRead == 0 && ferror(m_pFile))
std::runtime_error("UCIParser::UpdateBuffer - error reading file");
throw std::runtime_error("UCIParser::UpdateBuffer - error reading file");
return bytesRead;
}

Expand Down
4 changes: 2 additions & 2 deletions DataReader/UCIFastReader/UCIParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ class UCIParser
int m_elementsConvertedThisLine;

// global stats
int m_totalNumbersConverted;
int m_totalLabelsConverted;
int64_t m_totalNumbersConverted;
int64_t m_totalLabelsConverted;

// file positions/buffer
FILE * m_pFile;
Expand Down
20 changes: 10 additions & 10 deletions Demos/Simple/Simple.config
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# command=Simple_Demo_Output
RootDir=..
command=Simple_Demo:Simple_Demo_Output

# deviceId=-1 for CPU, >=0 for GPU devices
DeviceNumber=-1

#stderr=Demo

precision=float
Expand All @@ -13,7 +14,6 @@ deviceId=$DeviceNumber$
outputNodeNames=ScaledLogLikelihood
traceLevel=1


#######################################
# TRAINING CONFIG (Simple, Fixed LR) #
#######################################
Expand Down Expand Up @@ -52,22 +52,22 @@ Simple_Demo=[
reader=[
# reader to use
readerType=UCIFastReader
file=../Demos/Simple/SimpleDataTrain.txt
file=$RootDir$/Demos/Simple/SimpleDataTrain.txt

miniBatchMode=Partial
randomize=Auto
verbosity=1

features=[
dim=2 # two-dimensional input data
dim=2 # two-dimensional input data
start=0 # Start with first element on line
]

labels=[
start=2 # Skip two elements
start=2 # Skip two elements
dim=1 # One label dimension
labelDim=2 # Two labels possible
labelMappingFile=../Demos/Simple/SimpleMapping.txt
labelMappingFile=$RootDir$/Demos/Simple/SimpleMapping.txt
]
]
]
Expand All @@ -84,16 +84,16 @@ Simple_Demo_Output=[
reader=[
# reader to use
readerType=UCIFastReader
file=../Demos/Simple/SimpleDataTest.txt
file=$RootDir$/Demos/Simple/SimpleDataTest.txt
features=[
dim=2
start=0
start=0
]
labels=[
start=2
start=2
dim=1
labelDim=2
labelMappingFile=../Demos/Simple/SimpleMapping.txt
labelMappingFile=$RootDir$/Demos/Simple/SimpleMapping.txt
]
]
outputPath=SimpleOutput # Dump output as text
Expand Down
Loading

0 comments on commit f332421

Please sign in to comment.