Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/Microsoft/CNTK into fseid…
Browse files Browse the repository at this point in the history
…e/pack
  • Loading branch information
U-FAREAST\fseide committed Apr 4, 2016
2 parents a5d8fc5 + cc1d54e commit a6bc80f
Show file tree
Hide file tree
Showing 15 changed files with 3,247 additions and 855,057 deletions.
66 changes: 50 additions & 16 deletions Source/Readers/ExperimentalHTKMLFReader/HTKDataDeserializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,14 @@ using namespace std;
HTKDataDeserializer::HTKDataDeserializer(
CorpusDescriptorPtr corpus,
const ConfigParameters& feature,
const wstring& featureName)
const wstring& featureName,
bool primary)
: m_ioFeatureDimension(0),
m_samplePeriod(0),
m_verbosity(0),
m_corpus(corpus),
m_totalNumberOfFrames(0)
m_totalNumberOfFrames(0),
m_primary(primary)
{
// The frame mode is currently specified once per configuration,
// not in the configuration of a particular deserializer, but on a higher level in the configuration.
Expand Down Expand Up @@ -82,7 +84,23 @@ void HTKDataDeserializer::InitializeChunkDescriptions(ConfigHelper& config)
continue;
}

size_t id = stringRegistry.AddValue(description.GetKey());
wstring key = description.GetKey();
size_t id = 0;
if (m_primary)
{
// TODO: Definition of the corpus should be moved to the CorpusDescriptor
// TODO: All keys should be added there. Currently, we add them in the driving deserializer.
id = stringRegistry.AddValue(key);
}
else
{
if (!stringRegistry.TryGet(key, id))
{
// Utterance is unknown, skipping it.
continue;
}
}

description.SetId(id);
utterances.push_back(description);
m_totalNumberOfFrames += numberOfFrames;
Expand Down Expand Up @@ -121,15 +139,15 @@ void HTKDataDeserializer::InitializeChunkDescriptions(ConfigHelper& config)
// append utterance to last chunk
HTKChunkDescription& currentChunk = m_chunks.back();
utterances[i].AssignToChunk(chunkId, currentChunk.GetNumberOfUtterances(), startFrameInsideChunk);
if (!m_primary)
{
// Have to store key <-> utterance mapping for non primary deserializers.
m_keyToChunkLocation[utterances[i].GetId()] = make_pair(utterances[i].GetChunkId(), utterances[i].GetIndexInsideChunk());
}
startFrameInsideChunk += utterances[i].GetNumberOfFrames();
currentChunk.Add(move(utterances[i]));
}

// Creating a table of weak pointers to chunks,
// so that if randomizer asks the same chunk twice
// we do not need to recreated the chunk if we already uploaded in memory.
m_weakChunks.resize(m_chunks.size());

fprintf(stderr,
"HTKDataDeserializer::HTKDataDeserializer: %d utterances grouped into %d chunks, av. chunk size: %.1f utterances, %.1f frames\n",
(int)utterances.size(),
Expand Down Expand Up @@ -289,14 +307,7 @@ class HTKDataDeserializer::HTKChunk : public Chunk
// Gets a data chunk with the specified chunk id.
ChunkPtr HTKDataDeserializer::GetChunk(size_t chunkId)
{
if (!m_weakChunks[chunkId].expired())
{
return m_weakChunks[chunkId].lock();
}

auto chunk = make_shared<HTKChunk>(this, chunkId);
m_weakChunks[chunkId] = chunk;
return chunk;
return make_shared<HTKChunk>(this, chunkId);
};

// A matrix that stores all samples of a sequence without padding (differently from ssematrix).
Expand Down Expand Up @@ -415,4 +426,27 @@ void HTKDataDeserializer::GetSequenceById(size_t chunkId, size_t id, vector<Sequ
r.push_back(result);
}

static SequenceDescription s_InvalidSequence{0, 0, 0, false};

// Gets sequence description by its key.
void HTKDataDeserializer::GetSequenceDescriptionByKey(const KeyType& key, SequenceDescription& d)
{
assert(!m_primary);
auto iter = m_keyToChunkLocation.find(key.m_major);
if (iter == m_keyToChunkLocation.end())
{
// Unknown sequence. Return invalid.
d = s_InvalidSequence;
}
else
{
const auto& chunk = m_chunks[iter->second.first];
const auto& sequence = chunk.GetUtterance(iter->second.second);
d.m_chunkId = sequence->GetChunkId();
d.m_id = m_frameMode ? sequence->GetStartFrameIndexInsideChunk() + key.m_minor : sequence->GetIndexInsideChunk();
d.m_isValid = true;
d.m_numberOfSamples = m_frameMode ? 1 : sequence->GetNumberOfFrames();
}
}

}}}
17 changes: 11 additions & 6 deletions Source/Readers/ExperimentalHTKMLFReader/HTKDataDeserializer.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
class HTKDataDeserializer : public DataDeserializerBase
{
public:
HTKDataDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& featureConfig, const std::wstring& featureName);
HTKDataDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& featureConfig, const std::wstring& featureName, bool primary);

// Get information about chunks.
virtual ChunkDescriptions GetChunkDescriptions() override;
Expand All @@ -30,6 +30,9 @@ class HTKDataDeserializer : public DataDeserializerBase
// Retrieves data for a chunk.
virtual ChunkPtr GetChunk(size_t chunkId) override;

// Gets sequence description by its key.
virtual void GetSequenceDescriptionByKey(const KeyType&, SequenceDescription&) override;

private:
class HTKChunk;
DISABLE_COPY_AND_MOVE(HTKDataDeserializer);
Expand All @@ -51,11 +54,6 @@ class HTKDataDeserializer : public DataDeserializerBase
// Chunk descriptions.
std::vector<HTKChunkDescription> m_chunks;

// Weak pointers on existing chunks.
// If randomizer asks the same chunk twice we do not need to recreate
// the chunk if we already uploaded it in memory.
std::vector<std::weak_ptr<Chunk>> m_weakChunks;

// Augmentation window.
std::pair<size_t, size_t> m_augmentationWindow;

Expand All @@ -69,6 +67,13 @@ class HTKDataDeserializer : public DataDeserializerBase
// Flag that indicates whether a single speech frame should be exposed as a sequence.
bool m_frameMode;

// Indicates whether this deserializer is the "primary" one, i.e. the one that drives chunking.
bool m_primary;

// Used to correlate a sequence key with the sequence inside the chunk when the deserializer is not running in primary mode.
// Key -> <chunkid, offset inside chunk>
std::map<size_t, std::pair<size_t, size_t>> m_keyToChunkLocation;

// Auxiliary data for checking against the data in the feature file.
unsigned int m_samplePeriod;
size_t m_ioFeatureDimension;
Expand Down
13 changes: 9 additions & 4 deletions Source/Readers/ExperimentalHTKMLFReader/HTKMLFReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,20 +37,22 @@ std::vector<IDataDeserializerPtr> CreateDeserializers(const ConfigParameters& re
std::vector<IDataDeserializerPtr> featureDeserializers;
std::vector<IDataDeserializerPtr> labelDeserializers;

bool primary = true;
// The first deserializer is the driving one, it defines chunking.
// TODO: Should we make this an explicit configuration parameter?
for (const auto& featureName : featureNames)
{
auto deserializer = std::make_shared<HTKDataDeserializer>(corpus, readerConfig(featureName), featureName);
auto deserializer = std::make_shared<HTKDataDeserializer>(corpus, readerConfig(featureName), featureName, primary);
primary = false;
featureDeserializers.push_back(deserializer);
}
assert(featureDeserializers.size() == 1);

for (const auto& labelName : labelNames)
{
auto deserializer = std::make_shared<MLFDataDeserializer>(corpus, readerConfig(labelName), labelName);

labelDeserializers.push_back(deserializer);
}
assert(labelDeserializers.size() == 1);

std::vector<IDataDeserializerPtr> deserializers;
deserializers.insert(deserializers.end(), featureDeserializers.begin(), featureDeserializers.end());
Expand Down Expand Up @@ -98,7 +100,10 @@ HTKMLFReader::HTKMLFReader(MemoryProviderPtr provider,
ConfigHelper config(readerConfig);
size_t window = config.GetRandomizationWindow();
auto deserializers = CreateDeserializers(readerConfig);
assert(deserializers.size() == 2);
if (deserializers.empty())
{
LogicError("Please specify at least a single input stream.");
}

auto bundler = std::make_shared<Bundler>(readerConfig, deserializers[0], deserializers, false);
int verbosity = readerConfig(L"verbosity", 2);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,11 @@ MLFDataDeserializer::MLFDataDeserializer(CorpusDescriptorPtr corpus, const Confi
{
// Currently the string registry contains only utterances described in scp.
// So here we skip all others.
if (!stringRegistry.Contains(l.first))
size_t id = 0;
if (!stringRegistry.TryGet(l.first, id))
continue;

description.m_key.m_major = stringRegistry[l.first];
description.m_key.m_major = id;

const auto& utterance = l.second;
description.m_sequenceStart = m_classIds.size();
Expand Down Expand Up @@ -222,8 +223,8 @@ ChunkPtr MLFDataDeserializer::GetChunk(size_t chunkId)
{
UNUSED(chunkId);
assert(chunkId == 0);
return std::make_shared<MLFChunk>(this);
}
return make_shared<MLFChunk>(this);
};

// Sparse labels for an utterance.
template <class ElemType>
Expand Down
25 changes: 22 additions & 3 deletions Source/Readers/ReaderLib/Bundler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#define _CRT_SECURE_NO_WARNINGS

#include "Bundler.h"
#include <set>

namespace Microsoft { namespace MSR { namespace CNTK {

Expand Down Expand Up @@ -150,16 +151,19 @@ void Bundler::GetSequencesForChunk(size_t chunkId, std::vector<SequenceDescripti
std::swap(sequences, result);
}

// Represents a chunk that has pointers to the underlying deserialzer chunks.
// Represents a chunk that has pointers to the underlying deserializer chunks.
class Bundler::BundlingChunk : public Chunk
{
size_t m_numberOfInputs;
Bundler* m_parent;
size_t m_chunkId;

// A mapping between exposed sequence id and inner chunk for each deserialzier.
// A mapping between exposed sequence id and inner chunk for each deserializer.
// Index i of the vector maps to the chunk of inner sequence (i / m_numberOfInputs) of
// deserializer (i % m_numberOfInputs).
std::vector<ChunkPtr> m_innerChunks;
// A mapping between exposed sequence id and inner sequence id for each deserializer.
// Indices as above.
std::vector<size_t> m_sequenceToSequence;

DISABLE_COPY_AND_MOVE(BundlingChunk);
Expand Down Expand Up @@ -197,6 +201,8 @@ class Bundler::BundlingChunk : public Chunk
SequenceDescription s;
for (size_t deserializerIndex = 1; deserializerIndex < m_parent->m_deserializers.size(); ++deserializerIndex)
{
std::map<size_t, ChunkPtr> secondaryChunks;

for (size_t sequenceIndex = 0; sequenceIndex < sequences.size(); ++sequenceIndex)
{
if (chunk->m_invalid.find(sequenceIndex) != chunk->m_invalid.end())
Expand All @@ -207,7 +213,20 @@ class Bundler::BundlingChunk : public Chunk
size_t currentIndex = sequenceIndex * m_numberOfInputs + deserializerIndex;
deserializers[deserializerIndex]->GetSequenceDescriptionByKey(sequences[sequenceIndex].m_key, s);
m_sequenceToSequence[currentIndex] = s.m_id;
m_innerChunks[currentIndex] = deserializers[deserializerIndex]->GetChunk(s.m_chunkId);

ChunkPtr secondaryChunk;
auto it = secondaryChunks.find(s.m_chunkId);
if (it == secondaryChunks.end())
{
secondaryChunk = deserializers[deserializerIndex]->GetChunk(s.m_chunkId);
secondaryChunks.insert(make_pair(s.m_chunkId, secondaryChunk));
}
else
{
secondaryChunk = it->second;
}

m_innerChunks[currentIndex] = secondaryChunk;
}
}
}
Expand Down
1 change: 0 additions & 1 deletion Source/Readers/ReaderLib/Bundler.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#include "DataDeserializer.h"
#include "DataDeserializerBase.h"
#include "Config.h"
#include <set>

namespace Microsoft { namespace MSR { namespace CNTK {

Expand Down
15 changes: 15 additions & 0 deletions Source/Readers/ReaderLib/StringToIdMap.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,21 @@ class TStringToIdMap
return m_indexedValues.size() - 1;
}

// Tries to get a value by id.
bool TryGet(const TString& value, size_t& id)
{
const auto& it = m_values.find(value);
if (it == m_values.end())
{
return false;
}
else
{
id = it->second;
return true;
}
}

// Get integer id for the string value.
size_t operator[](const TString& value) const
{
Expand Down
Loading

0 comments on commit a6bc80f

Please sign in to comment.