Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/mitslsmaster' into dongyu/memshare
Browse files Browse the repository at this point in the history
Conflicts:
	Math/Math/NoGPU.cpp
  • Loading branch information
yzhang87 committed Oct 1, 2015
2 parents b4e21db + 04af449 commit 3ea3c56
Show file tree
Hide file tree
Showing 13 changed files with 92 additions and 135 deletions.
3 changes: 3 additions & 0 deletions DataReader/KaldiReader/DataReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
//

#include "stdafx.h"
#ifdef _WIN32
#include <objbase.h>
#endif
#include "basetypes.h"

#include "htkfeatio.h" // for reading HTK features
Expand Down
89 changes: 24 additions & 65 deletions DataReader/KaldiReader/HTKMLFReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
//

#include "stdafx.h"
#ifdef _WIN32
#include <objbase.h>
#endif
#include "basetypes.h"

#include "htkfeatio.h" // for reading HTK features
Expand Down Expand Up @@ -794,22 +797,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
dim = m_featureNameToDimMap[iter->first];
const msra::dbn::matrixstripe feat = m_mbiter->frames(id);
const size_t actualmbsize = feat.cols(); // it may still return less if at end of sweep TODO: this check probably only needs to happen once
if (first)
if (first) // initialize MBLayout
{
m_sentenceBegin.Resize((size_t)1, (size_t)feat.cols());
m_minibatchPackingFlag.resize(feat.cols());
m_sentenceBegin.SetValue((ElemType) SEQUENCE_MIDDLE);
m_sentenceBegin.SetValue(0, 0, (ElemType) SEQUENCE_START);
m_sentenceBegin.SetValue(0, (size_t)feat.cols()-1, (ElemType) SEQUENCE_END);

std::fill(m_minibatchPackingFlag.begin(), m_minibatchPackingFlag.end(), MinibatchPackingFlag::None);
m_minibatchPackingFlag[0] = MinibatchPackingFlag::SequenceStart;
m_minibatchPackingFlag[(size_t)feat.cols()-1] = MinibatchPackingFlag::SequenceEnd;
// entire minibatch is one utterance
m_pMBLayout->Resize(1, actualmbsize);
m_pMBLayout->Reset(0, 0, MinibatchPackingFlags::SequenceStart); // TODO: can't we use Set()?
m_pMBLayout->Reset(0, actualmbsize - 1, MinibatchPackingFlags::SequenceEnd);
first = false;
}



assert (actualmbsize == m_mbiter->currentmbframes());
skip = (!m_partialMinibatch && m_mbiter->requestedframes() != actualmbsize && m_frameSource->totalframes() > actualmbsize);

Expand Down Expand Up @@ -951,25 +947,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
size_t numOfFea = m_featuresBufferMultiIO.size();
size_t numOfLabel = m_labelsBufferMultiIO.size();
/**
mtSentenceBegin : a matrix with [Ns x T]
the first row is 0/1 bit for whether the corresponding frame has sentence beginning/no_label for any of streams
0 : no such case
1 : case exists
*/
m_sentenceBegin.Resize(m_numberOfuttsPerMinibatch, m_mbSize);
m_minibatchPackingFlag.resize(m_mbSize);

//mtSentenceBegin.SetValue((ElemType) SEQUENCE_MIDDLE);
for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i++)
{
for (size_t j = 0; j < m_mbSize; j++)
{
m_sentenceBegin.SetValue(i,j,(ElemType) SEQUENCE_MIDDLE);
}
}
std::fill(m_minibatchPackingFlag.begin(), m_minibatchPackingFlag.end(), MinibatchPackingFlag::None);

m_pMBLayout->Resize(m_numberOfuttsPerMinibatch, m_mbSize);

vector<size_t> actualmbsize;
actualmbsize.assign(m_numberOfuttsPerMinibatch,0);
Expand All @@ -984,17 +963,13 @@ the first row is 0/1 bit for wether corresponding frame has sentence beginining/
m_sentenceEnd[i] = false;
m_switchFrame[i] = m_mbSize+1;
if (m_processedFrame[i] == 1)
{
m_sentenceBegin.SetValue(i, 0, (ElemType)SEQUENCE_END);
m_minibatchPackingFlag[0] = MinibatchPackingFlag::SequenceEnd;
}
m_pMBLayout->Reset(i, 0, MinibatchPackingFlags::SequenceEnd); // TODO: shouldn't both Start and End be set? TODO: can we just use Set()?
}
else
{
m_switchFrame[i] = 0;
m_sentenceEnd[i] = true;
m_sentenceBegin.SetValue(i, 0, (ElemType)SEQUENCE_START);
m_minibatchPackingFlag[0] = MinibatchPackingFlag::SequenceStart;
m_switchFrame[i] = 0;
m_pMBLayout->Reset(i, 0, MinibatchPackingFlags::SequenceStart);
}
actualmbsize[i] = m_mbSize;
endFr = startFr + actualmbsize[i];
Expand Down Expand Up @@ -1147,16 +1122,9 @@ the first row is 0/1 bit for wether corresponding frame has sentence beginining/
m_switchFrame[i] = actualmbsize[i];

if (actualmbsize[i] < m_mbSize)
{
m_sentenceBegin.SetValue(i, actualmbsize[i], (ElemType)SEQUENCE_START);
m_minibatchPackingFlag[actualmbsize[i]] = m_minibatchPackingFlag[actualmbsize[i]] | MinibatchPackingFlag::SequenceStart;
}
m_pMBLayout->Set(i, actualmbsize[i], MinibatchPackingFlags::SequenceStart); // NOTE: this ORs, while original code overwrote in matrix but ORed into vector
if (actualmbsize[i] == m_mbSize)
{
m_sentenceBegin.SetValue(i, actualmbsize[i]-1, (ElemType)SEQUENCE_END);
m_minibatchPackingFlag[actualmbsize[i]-1] = m_minibatchPackingFlag[actualmbsize[i]] | MinibatchPackingFlag::SequenceEnd;
}

m_pMBLayout->Set(i, actualmbsize[i] - 1, MinibatchPackingFlags::SequenceEnd); // NOTE: this ORs, while original code overwrote in matrix but ORed into vector
startFr = m_switchFrame[i];
endFr = m_mbSize;
bool reNewSucc = ReNewBufferForMultiIO(i);
Expand Down Expand Up @@ -1303,22 +1271,12 @@ the first row is 0/1 bit for wether corresponding frame has sentence beginining/
const msra::dbn::matrix feat = m_fileEvalSource->ChunkOfFrames(id);
if (first)
{
m_sentenceBegin.Resize((size_t)1, (size_t)feat.cols());
m_minibatchPackingFlag.resize((size_t)feat.cols());

m_sentenceBegin.SetValue((ElemType)SEQUENCE_MIDDLE);
m_sentenceBegin.SetValue(0, 0, (ElemType)SEQUENCE_START);
m_sentenceBegin.SetValue(0, (size_t)feat.cols()-1, (ElemType) SEQUENCE_END);

std::fill(m_minibatchPackingFlag.begin(), m_minibatchPackingFlag.end(), MinibatchPackingFlag::None);
m_minibatchPackingFlag[0] = MinibatchPackingFlag::SequenceStart;
m_minibatchPackingFlag[(size_t)feat.cols()-1] = MinibatchPackingFlag::SequenceEnd;

m_pMBLayout->Resize(1, feat.cols());
m_pMBLayout->Reset(0, 0, MinibatchPackingFlags::SequenceStart); // TODO: can't we use Set()?
m_pMBLayout->Reset(0, feat.cols() - 1, MinibatchPackingFlags::SequenceEnd);
first = false;
}



// copy the features over to our array type
//
assert(feat.rows()==dim); dim; // check feature dimension matches what's expected
Expand Down Expand Up @@ -1632,12 +1590,13 @@ the first row is 0/1 bit for wether corresponding frame has sentence beginining/
}

// Hands the reader's current sentence-segmentation state to the caller.
//   sentenceBegin        - [out] receives the contents of m_sentenceBegin via Matrix::SetValue
//   minibatchPackingFlag - [out] overwritten with a copy of m_minibatchPackingFlag
template<class ElemType>
void HTKMLFReader<ElemType>::SetSentenceSegBatch(Matrix<ElemType> &sentenceBegin, vector<MinibatchPackingFlag>& minibatchPackingFlag)
{
sentenceBegin.SetValue(m_sentenceBegin);
minibatchPackingFlag = m_minibatchPackingFlag;
}

// Publishes the reader's minibatch layout into the caller-supplied layout object.
//   pMBLayout - [in,out] layout to fill in; it is modified in place
// In frame mode there are no sequence flags, so the target is set to "all none";
// otherwise the target receives a copy of the reader's own m_pMBLayout.
void HTKMLFReader<ElemType>::CopyMBLayoutTo(MBLayoutPtr pMBLayout)
{
    if (m_framemode)
    {
        pMBLayout->SetAllNone(); // no flags in frame mode
        return;
    }

    pMBLayout->CopyFrom(m_pMBLayout);
}

// GetFileConfigNames - determine the names of the features and labels sections in the config file
// features - [in,out] a vector of feature name strings
Expand Down
34 changes: 11 additions & 23 deletions DataReader/KaldiReader/HTKMLFReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class HTKMLFReader : public IDataReader<ElemType>
vector<bool> m_sentenceEnd;
bool m_readAhead;
bool m_truncated;
bool m_framemode;
vector<size_t> m_processedFrame;
size_t m_numberOfuttsPerMinibatch;
size_t m_actualnumberOfuttsPerMinibatch;
Expand Down Expand Up @@ -80,6 +81,9 @@ class HTKMLFReader : public IDataReader<ElemType>

bool ReNewBufferForMultiIO(size_t i);

size_t GetNumParallelSequences() { return m_numberOfuttsPerMinibatch; }
void SetNumParallelSequences(const size_t) { };

size_t NumberSlicesInEachRecurrentIter() { return m_numberOfuttsPerMinibatch ;}
void SetNbrSlicesEachRecurrentIter(const size_t) { };

Expand All @@ -96,35 +100,17 @@ class HTKMLFReader : public IDataReader<ElemType>


public:
/// a matrix of n_stream x n_length
/// n_stream is the number of streams
/// n_length is the maximum length of each stream
/// for example, two sentences used in parallel in one minibatch would be
/// [2 x 5] if the max length of one of the sentences is 5
/// the elements of the matrix are 0, 1, or -1, defined as SEQUENCE_START, SEQUENCE_MIDDLE, NO_INPUT in cbasetype.h
/// 0 1 1 0 1
/// 1 0 1 0 0
/// for two parallel data streams. The first has two sentences, with 0 indicating the beginning of a sentence
/// the second data stream has two sentences, with 0 indicating the beginning of sentences
/// you may use 1 even if a sentence begins at that position; in this case, the trainer will carry over hidden states to the following
/// frame.
Matrix<ElemType> m_sentenceBegin;

/// a matrix of 1 x n_length
/// 1 denotes the case that there exists a sentence begin or no_labels case in this frame
/// 0 denotes such case is not in this frame


vector<MinibatchPackingFlag> m_minibatchPackingFlag;
MBLayoutPtr m_pMBLayout;

/// by default it is false
/// if true, reader will set to SEQUENCE_MIDDLE for time positions that originally correspond to SEQUENCE_START
/// set to true so that a current minibatch can use state activities from the previous minibatch.
/// default will have truncated BPTT, which only does BPTT inside a minibatch

bool mIgnoreSentenceBeginTag;
HTKMLFReader() : m_sentenceBegin(CPUDEVICE) {
}
HTKMLFReader() : m_pMBLayout(make_shared<MBLayout>())
{
}

virtual void Init(const ConfigParameters& config);
virtual void Destroy() {delete this;}
Expand All @@ -136,10 +122,12 @@ class HTKMLFReader : public IDataReader<ElemType>
virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0);

virtual bool DataEnd(EndDataType endDataType);
void CopyMBLayoutTo(MBLayoutPtr);
void SetSentenceEndInBatch(vector<size_t> &/*sentenceEnd*/);
void SetSentenceEnd(int /*actualMbSize*/){};
void SetSentenceSegBatch(Matrix<ElemType> &sentenceBegin, vector<MinibatchPackingFlag>& sentenceExistsBeginOrNoLabels);
void SetRandomSeed(int) { NOT_IMPLEMENTED };

bool RequireSentenceSeg() { return !m_framemode; };
};

}}}
49 changes: 25 additions & 24 deletions DataReader/KaldiReader/basetypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,15 @@ OACR_WARNING_DISABLE(POTENTIAL_ARGUMENT_TYPE_MISMATCH, "Not level1 or level2_sec
#endif

#include "Platform.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // include here because we redefine some names later
#include <errno.h>
#include <cstdio>
#include <cstdlib>
#include <cstring> // include here because we redefine some names later
#include <cerrno>
#include <string>
#include <vector>
#include <math.h> // for HUGE_VAL // potential double isnan definition
#include <assert.h>
#include <stdarg.h>
#include <cmath> // for HUGE_VAL // potential double isnan definition
#include <cassert>
#include <cstdarg>
#include <map>
#include <stdexcept>
#include <locale> // std::wstring_convert
Expand Down Expand Up @@ -1192,15 +1192,17 @@ class RegisterModule
#define ISCLOSE(a, b, threshold) (abs(a - b) < threshold)?true:false

/**
These macros are used for sentence segmentation information.
These macros are used for sentence segmentation information.
TODO: get rid of this, no need
*/
#define SEQUENCE_START ((int) MinibatchPackingFlag::SequenceStart)
#define SEQUENCE_MIDDLE ((int) MinibatchPackingFlag::None)
#define SEQUENCE_END ((int) MinibatchPackingFlag::SequenceEnd)
#define NO_INPUT ((int) MinibatchPackingFlag::NoInput)
#define NO_LABEL ((int) MinibatchPackingFlag::NoLabel)

enum class MinibatchPackingFlag : unsigned char
//#define ((int) MinibatchPackingFlags::SequenceStart) ((int) MinibatchPackingFlags::SequenceStart)
//#define ((int) MinibatchPackingFlags::None) ((int) MinibatchPackingFlags::None)
//#define ((int) MinibatchPackingFlags::SequenceEnd) ((int) MinibatchPackingFlags::SequenceEnd)
//#define ((int) MinibatchPackingFlags::NoInput) ((int) MinibatchPackingFlags::NoInput)
//#define ((int) MinibatchPackingFlags::NoFeature) ((int) MinibatchPackingFlags::NoFeature)
//#define ((int) MinibatchPackingFlags::NoLabel) ((int) MinibatchPackingFlags::NoLabel)

enum class MinibatchPackingFlags : unsigned char
{
None = 0,
SequenceStart = 1 << 0, //binary 0001
Expand All @@ -1209,26 +1211,25 @@ enum class MinibatchPackingFlag : unsigned char
NoLabel = 1 << 3, //binary 1000

NoInput = NoFeature | NoLabel, //when we refactorize reader, NoInput will no longer needed
SequenceStartOrNoInput = SequenceStart | NoInput,
SequenceEndOrNoInput = SequenceEnd | NoInput,
SequenceStartOrEndOrNoInput = SequenceStart | SequenceEnd | NoInput,
SequenceStartOrNoFeature = SequenceStart | NoFeature,
SequenceEndOrNoFeature = SequenceEnd | NoFeature,
SequenceStartOrEndOrNoFeature = SequenceStart | SequenceEnd | NoFeature,
};

inline MinibatchPackingFlag operator| (MinibatchPackingFlag a, MinibatchPackingFlag b)
inline MinibatchPackingFlags operator| (MinibatchPackingFlags a, MinibatchPackingFlags b)
{
return static_cast<MinibatchPackingFlag>(static_cast<unsigned char>(a) | static_cast<unsigned char>(b));
return static_cast<MinibatchPackingFlags>(static_cast<unsigned char>(a) | static_cast<unsigned char>(b));
}

inline MinibatchPackingFlag& operator|= (MinibatchPackingFlag& a, MinibatchPackingFlag b)
inline MinibatchPackingFlags& operator|= (MinibatchPackingFlags& a, MinibatchPackingFlags b)
{
a = a | b;
return a;
}


inline bool operator& (MinibatchPackingFlag a, MinibatchPackingFlag b)
inline bool operator& (MinibatchPackingFlags a, MinibatchPackingFlags b)
{
return (static_cast<unsigned char>(a) & static_cast<unsigned char>(b)) != 0;
return (static_cast<unsigned char>(a)& static_cast<unsigned char>(b)) != 0;
}

template<class F>
Expand Down
10 changes: 5 additions & 5 deletions DataReader/KaldiReader/fileutil.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#define _FILEUTIL_

#include "Platform.h"
#include <stdio.h>
#include <cstdio>
#ifdef __unix__
#include <sys/types.h>
#include <sys/stat.h>
Expand All @@ -18,10 +18,10 @@
#include <map>
#include <functional>
#include <cctype>
#include <errno.h>
#include <stdint.h>
#include <assert.h>
#include <string.h> // for strerror()
#include <cerrno>
#include <cstdint>
#include <cassert>
#include <cstring> // for strerror()

using namespace std;

Expand Down
6 changes: 3 additions & 3 deletions DataReader/KaldiReader/htkfeatio.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
#include <regex>
#include <set>
#include <unordered_map>
#include <stdint.h>
#include <limits.h>
#include <wchar.h>
#include <cstdint>
#include <climits>
#include <cwchar>
#include <fstream>
#include "util/common-utils.h"
#include "base/kaldi-common.h"
Expand Down
7 changes: 4 additions & 3 deletions DataReader/KaldiReader/latticearchive.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
#include "latticestorage.h"
#include "simple_checked_arrays.h"
#include "fileutil.h"
#include <stdint.h>
#include <inttypes.h>
#include <cstdint>
#include <cinttypes>
#include <vector>
#include <string>
#include <algorithm> // for find()
Expand Down Expand Up @@ -1132,7 +1132,8 @@ class archive
throw std::runtime_error ("open: invalid TOC line (empty archive pathname): " + std::string (line));
char c;
uint64_t offset;
if (sscanf(q, "[%" PRId64 "]%c", &offset, &c) != 1)
const char *scanformat = "[%" PRId64 "]%c"; // Work-around for GNU/Linux bug in sscanf in older libc (2.15) versions
if (sscanf(q, scanformat, &offset, &c) != 1)
throw std::runtime_error ("open: invalid TOC line (bad [] expression): " + std::string (line));
if (!toc.insert (make_pair (key, latticeref (offset, archiveindex))).second)
throw std::runtime_error ("open: TOC entry leads to duplicate key: " + std::string (line));
Expand Down
2 changes: 1 addition & 1 deletion DataReader/KaldiReader/latticestorage.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#pragma once
#include <string> // for the error message in checkoverflow() only
#include <stdexcept>
#include <stdint.h>
#include <cstdint>

#undef INITIAL_STRANGE // [v-hansu] intialize structs to strange values
#define PARALLEL_SIL // [v-hansu] process sil on CUDA, used in other files, please search this
Expand Down
2 changes: 1 addition & 1 deletion DataReader/KaldiReader/minibatchsourcehelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#pragma once

#include "basetypes.h"
#include <stdio.h>
#include <cstdio>
#include <vector>
#include <algorithm>

Expand Down
Loading

0 comments on commit 3ea3c56

Please sign in to comment.