Adding support for sequence keys to the CNTK text format reader
eldakms committed May 20, 2016
1 parent 07165dd commit 618e98a
Showing 12 changed files with 196 additions and 48 deletions.
51 changes: 49 additions & 2 deletions Source/Readers/CNTKTextFormatReader/Descriptors.h
@@ -56,6 +56,53 @@ namespace Microsoft { namespace MSR { namespace CNTK {

// A collection of chunk descriptors, each containing
// a collection of sequence descriptors for the corresponding
// chunk of the input data.
typedef std::vector<ChunkDescriptor> Index;
// chunk of the input data.
struct Index
{
std::vector<ChunkDescriptor> m_chunks;
std::map<size_t, std::pair<size_t, size_t>> m_keyToSequenceInChunk;
size_t m_maxChunkSize;

explicit Index(size_t chunkSize) : m_maxChunkSize(chunkSize)
{}

// Adds sequence (metadata) to the index. Additionally, it
// assigns an appropriate chunk id to the sequence descriptor,
// ensures that chunks do not exceed the maximum allowed size
// (except when a sequence size is greater than the maximum chunk size)
void AddSequence(SequenceDescriptor& sd)
{
assert(!m_chunks.empty());
ChunkDescriptor* chunk = &m_chunks.back();
if (chunk->m_byteSize > 0 && (chunk->m_byteSize + sd.m_byteSize) > m_maxChunkSize)
{
m_chunks.push_back({});
chunk = &m_chunks.back();
chunk->m_id = m_chunks.size() - 1;
}

auto location = std::make_pair<size_t, size_t>(m_chunks.size() - 1, chunk->m_sequences.size());
m_keyToSequenceInChunk.insert(std::make_pair(sd.m_key.m_sequence, location));
chunk->m_byteSize += sd.m_byteSize;
chunk->m_numberOfSequences++;
chunk->m_numberOfSamples += sd.m_numberOfSamples;
sd.m_chunkId = chunk->m_id;
chunk->m_sequences.push_back(sd);
}

void Reserve(size_t sizeInBytes)
{
if (m_maxChunkSize > 0)
{
m_chunks.reserve((sizeInBytes + m_maxChunkSize - 1) / m_maxChunkSize);
}

m_chunks.push_back({});
}

bool IsEmpty() const
{
return m_chunks.empty();
}
};
}}}
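
To make the chunking policy of the new Index concrete, here is a minimal, self-contained sketch that mirrors the logic added above. SequenceDescriptor and ChunkDescriptor are simplified stand-ins (the real descriptors in Descriptors.h also carry file offsets, validity flags and a nested key type), and the sizes used in main() are illustrative only.

#include <cassert>
#include <cstddef>
#include <iostream>
#include <map>
#include <utility>
#include <vector>

// Simplified stand-ins for the reader's descriptor types (illustration only).
struct SequenceDescriptor
{
    size_t m_key;             // sequence key (the real descriptor nests this in a KeyType)
    size_t m_byteSize;        // size of the sequence on disk
    size_t m_numberOfSamples;
    size_t m_chunkId;         // chunk this sequence gets assigned to
};

struct ChunkDescriptor
{
    size_t m_id = 0;
    size_t m_byteSize = 0;
    size_t m_numberOfSequences = 0;
    size_t m_numberOfSamples = 0;
    std::vector<SequenceDescriptor> m_sequences;
};

// Mirrors the new Index: sequences are appended to the last chunk until adding
// one would exceed m_maxChunkSize, at which point a new chunk is started
// (a single oversized sequence still gets a chunk of its own).
struct Index
{
    std::vector<ChunkDescriptor> m_chunks;
    std::map<size_t, std::pair<size_t, size_t>> m_keyToSequenceInChunk;
    size_t m_maxChunkSize;

    explicit Index(size_t chunkSize) : m_maxChunkSize(chunkSize) {}

    void Reserve(size_t sizeInBytes)
    {
        if (m_maxChunkSize > 0)
            m_chunks.reserve((sizeInBytes + m_maxChunkSize - 1) / m_maxChunkSize);
        m_chunks.push_back({});
    }

    void AddSequence(SequenceDescriptor& sd)
    {
        assert(!m_chunks.empty());
        ChunkDescriptor* chunk = &m_chunks.back();
        if (chunk->m_byteSize > 0 && (chunk->m_byteSize + sd.m_byteSize) > m_maxChunkSize)
        {
            m_chunks.push_back({});
            chunk = &m_chunks.back();
            chunk->m_id = m_chunks.size() - 1;
        }

        // Remember where the key landed: <chunk index, position within the chunk>.
        m_keyToSequenceInChunk.emplace(sd.m_key, std::make_pair(m_chunks.size() - 1, chunk->m_sequences.size()));
        chunk->m_byteSize += sd.m_byteSize;
        chunk->m_numberOfSequences++;
        chunk->m_numberOfSamples += sd.m_numberOfSamples;
        sd.m_chunkId = chunk->m_id;
        chunk->m_sequences.push_back(sd);
    }
};

int main()
{
    Index index(100);   // cap chunks at 100 bytes
    index.Reserve(200); // pretend the input file is 200 bytes

    for (size_t key = 0; key < 5; ++key)
    {
        SequenceDescriptor sd{key, 40, 10, 0}; // 40 bytes, 10 samples each
        index.AddSequence(sd);
    }

    // 5 x 40-byte sequences with a 100-byte cap land in chunks of 2, 2 and 1 sequences.
    std::cout << index.m_chunks.size() << " chunks\n";
    auto location = index.m_keyToSequenceInChunk.at(3);
    std::cout << "key 3 -> chunk " << location.first << ", position " << location.second << "\n";
    return 0;
}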
36 changes: 7 additions & 29 deletions Source/Readers/CNTKTextFormatReader/Indexer.cpp
@@ -23,7 +23,8 @@ Indexer::Indexer(FILE* file, bool skipSequenceIds, size_t chunkSize) :
m_pos(nullptr),
m_done(false),
m_hasSequenceIds(!skipSequenceIds),
m_maxChunkSize(chunkSize)
m_maxChunkSize(chunkSize),
m_index(chunkSize)
{
if (m_file == nullptr)
{
@@ -53,23 +54,6 @@ void Indexer::RefillBuffer()
}
}

void Indexer::AddSequence(SequenceDescriptor& sd)
{
assert(!m_chunks.empty());
ChunkDescriptor* chunk = &m_chunks.back();
if (chunk->m_byteSize > 0 && (chunk->m_byteSize + sd.m_byteSize) > m_maxChunkSize)
{
m_chunks.push_back({});
chunk = &m_chunks.back();
chunk->m_id = m_chunks.size() - 1;
}
chunk->m_byteSize += sd.m_byteSize;
chunk->m_numberOfSequences++;
chunk->m_numberOfSamples += sd.m_numberOfSamples;
sd.m_chunkId = chunk->m_id;
chunk->m_sequences.push_back(sd);
}

void Indexer::BuildFromLines()
{
assert(m_pos == m_bufferStart);
@@ -88,7 +72,7 @@ void Indexer::BuildFromLines()
sd.m_fileOffsetBytes = offset;
offset = GetFileOffset() + 1;
sd.m_byteSize = offset - sd.m_fileOffsetBytes;
AddSequence(sd);
m_index.AddSequence(sd);
++m_pos;
++lines;
}
@@ -108,25 +92,19 @@
sd.m_isValid = true;
sd.m_fileOffsetBytes = offset;
sd.m_byteSize = m_fileOffsetEnd - sd.m_fileOffsetBytes;
AddSequence(sd);
m_index.AddSequence(sd);
}

}

void Indexer::Build(CorpusDescriptorPtr corpus)
{
if (!m_chunks.empty())
if (!m_index.IsEmpty())
{
return;
}

if (m_maxChunkSize > 0)
{
auto fileSize = filesize(m_file);
m_chunks.reserve((fileSize + m_maxChunkSize - 1) / m_maxChunkSize);
}

m_chunks.push_back({});
m_index.Reserve(filesize(m_file));

RefillBuffer(); // read the first block of data
if (m_done)
@@ -196,7 +174,7 @@ void Indexer::AddSequenceIfIncluded(CorpusDescriptorPtr corpus, SequenceDescript
{
sd.m_key.m_sequence = stringRegistry[key];
sd.m_key.m_sample = 0;
AddSequence(sd);
m_index.AddSequence(sd);
}
}

12 changes: 5 additions & 7 deletions Source/Readers/CNTKTextFormatReader/Indexer.h
@@ -27,7 +27,7 @@ class Indexer
void Build(CorpusDescriptorPtr corpus);

// Returns input data index (chunk and sequence metadata)
const Index& GetIndex() const { return m_chunks; }
const Index& GetIndex() const { return m_index; }

// True, when input does not have the sequence id column
// or when sequence id column was ignored during indexing
@@ -52,13 +52,11 @@

const size_t m_maxChunkSize; // maximum permitted chunk size;

std::vector<ChunkDescriptor> m_chunks; // a collection of chunk descriptors
// a collection of chunk descriptors and sequence keys.
Index m_index;

// Adds sequence (metadata) to the index. Additionally, it
// assigns an appropriate chunk id to the sequence descriptor,
// ensures that chunks do not exceed the maximum allowed size
// (except when a sequence size is greater than the maximum chunk size)
void AddSequence(SequenceDescriptor& sd);
// Mapping of keys into <chunk, sequence>
std::map<size_t, std::pair<size_t, size_t>> m_keyToSequenceInChunk;

// Same function as above but with check that the sequence is included in the corpus descriptor.
void AddSequenceIfIncluded(CorpusDescriptorPtr corpus, SequenceDescriptor& sd);
24 changes: 20 additions & 4 deletions Source/Readers/CNTKTextFormatReader/TextParser.cpp
@@ -193,8 +193,8 @@ ChunkDescriptions TextParser<ElemType>::GetChunkDescriptions()
const auto& index = m_indexer->GetIndex();

ChunkDescriptions result;
result.reserve(index.size());
for (auto const& chunk : index)
result.reserve(index.m_chunks.size());
for (auto const& chunk : index.m_chunks)
{
result.push_back(shared_ptr<ChunkDescription>(
new ChunkDescription {
@@ -211,7 +211,7 @@ template <class ElemType>
void TextParser<ElemType>::GetSequencesForChunk(size_t chunkId, std::vector<SequenceDescription>& result)
{
const auto& index = m_indexer->GetIndex();
const auto& chunk = index[chunkId];
const auto& chunk = index.m_chunks[chunkId];
result.reserve(chunk.m_sequences.size());

for (auto const& s : chunk.m_sequences)
@@ -276,7 +276,7 @@ void TextParser<ElemType>::TextDataChunk::GetSequence(size_t sequenceId, std::ve
template <class ElemType>
ChunkPtr TextParser<ElemType>::GetChunk(size_t chunkId)
{
const auto& chunkDescriptor = m_indexer->GetIndex()[chunkId];
const auto& chunkDescriptor = m_indexer->GetIndex().m_chunks[chunkId];
auto textChunk = make_shared<TextDataChunk>(chunkDescriptor, this);

attempt(m_numRetries, [this, &textChunk, &chunkDescriptor]()
@@ -1201,6 +1201,22 @@ std::wstring TextParser<ElemType>::GetFileInfo()
return info.str();
}

static SequenceDescription s_InvalidSequence{0, 0, 0, false, {0, 0}};

template <class ElemType>
void TextParser<ElemType>::GetSequenceDescriptionByKey(const KeyType& key, SequenceDescription& result)
{
const auto& keys = m_indexer->GetIndex().m_keyToSequenceInChunk;
auto sequenceLocation = keys.find(key.m_sequence);
if (sequenceLocation == keys.end())
{
result = s_InvalidSequence;
return;
}

result = m_indexer->GetIndex().m_chunks[sequenceLocation->second.first].m_sequences[sequenceLocation->second.second];
}

template class TextParser<float>;
template class TextParser<double>;
}}}
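
The new GetSequenceDescriptionByKey lets a caller locate a sequence by its key, presumably so that sequences from different inputs can be matched up, as exercised by the composite-reader test added below; an unknown key yields the invalid sentinel rather than an exception. The following self-contained sketch reproduces that lookup pattern with simplified stand-in types — FindByKey, the sample chunks, and the keys are hypothetical and only illustrate the <chunk, position> map the patch consults.

#include <cstddef>
#include <iostream>
#include <map>
#include <utility>
#include <vector>

// Simplified stand-in for SequenceDescription (the real one also carries
// chunk id, number of samples, etc.).
struct SequenceDescription
{
    size_t m_key;
    bool m_isValid;
};

// Sentinel returned for unknown keys, mirroring s_InvalidSequence above.
static const SequenceDescription s_invalid{0, false};

// chunks[i] holds the sequences of chunk i; keyToLocation maps a key to
// <chunk index, position within chunk>, the same shape of lookup that
// GetSequenceDescriptionByKey performs against the index.
SequenceDescription FindByKey(
    const std::vector<std::vector<SequenceDescription>>& chunks,
    const std::map<size_t, std::pair<size_t, size_t>>& keyToLocation,
    size_t key)
{
    auto it = keyToLocation.find(key);
    if (it == keyToLocation.end())
        return s_invalid; // caller checks m_isValid instead of catching an exception
    return chunks[it->second.first][it->second.second];
}

int main()
{
    std::vector<std::vector<SequenceDescription>> chunks = {
        { {7, true}, {11, true} }, // chunk 0
        { {42, true} }             // chunk 1
    };
    std::map<size_t, std::pair<size_t, size_t>> keyToLocation = {
        {7, {0, 0}}, {11, {0, 1}}, {42, {1, 0}}
    };

    std::cout << "key 42 found: " << FindByKey(chunks, keyToLocation, 42).m_isValid << "\n"; // 1
    std::cout << "key 99 found: " << FindByKey(chunks, keyToLocation, 99).m_isValid << "\n"; // 0
    return 0;
}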
2 changes: 2 additions & 0 deletions Source/Readers/CNTKTextFormatReader/TextParser.h
@@ -36,6 +36,8 @@ class TextParser : public DataDeserializerBase {
// Get information about particular chunk.
void GetSequencesForChunk(size_t chunkId, std::vector<SequenceDescription>& result) override;

void GetSequenceDescriptionByKey(const KeyType&, SequenceDescription&) override;

private:
// Builds an index of the input data.
void Initialize(CorpusDescriptorPtr corpus);
2 changes: 1 addition & 1 deletion Source/Readers/CompositeDataReader/CompositeDataReader.cpp
@@ -32,7 +32,7 @@ CompositeDataReader::CompositeDataReader(const ConfigParameters& config, MemoryP
m_provider(provider)
{
// Identifying packing mode.
bool frameMode = config(L"frameMode", true);
bool frameMode = config(L"frameMode", false);
bool truncated = config(L"truncated", false);
if (frameMode && truncated)
{
40 changes: 40 additions & 0 deletions Tests/UnitTests/ReaderTests/CNTKTextFormatReaderTests.cpp
@@ -720,6 +720,46 @@ BOOST_AUTO_TEST_CASE(CNTKTextFormatReader_200x200x2_seq2seq)
CheckFilesEquivalent(controlFile, outputFile);
};

// 50 sequences with up to 20 samples each (508 samples in total)
BOOST_AUTO_TEST_CASE(CompositeCNTKTextFormatReader_5x5_and_5x10_jagged_sequences_dense)
{
// This simply writes. Test control is the same as the output file.
HelperRunReaderTest<double>(
testDataPath() + "/Config/CNTKTextFormatReader/dense.cntk",
testDataPath() + "/Control/CNTKTextFormatReader/5x10_and_5x5_jagged_output.txt",
testDataPath() + "/Control/CNTKTextFormatReader/5x10_and_5x5_jagged_output.txt",
"5x10_and_5x5_jagged",
"reader",
40, // epoch size
10, // mb size
3, // num epochs
2,
0,
0,
1,
false,
false,
false);

HelperRunReaderTest<double>(
testDataPath() + "/Config/CNTKTextFormatReader/dense.cntk",
testDataPath() + "/Control/CNTKTextFormatReader/5x10_and_5x5_jagged_output.txt",
testDataPath() + "/Control/CNTKTextFormatReader/5x10_and_5x5_jagged_output2.txt",
"5x10_and_5x5_jagged_composite",
"reader",
40, // epoch size
10, // mb size
3, // num epochs
2,
0,
0,
1,
false,
false,
false);
};


BOOST_AUTO_TEST_SUITE_END()

} } } }
@@ -292,4 +292,62 @@ Simple = [
]
]
]
]
]

5x10_and_5x5_jagged = [
precision = "double"
reader = [
randomize = true
deserializers = [
[
type="CNTKTextFormatDeserializer"
module="CNTKTextFormatReader"
file="5x10_and_5x5_jagged.txt"
input=[
features1 = [
alias = "F0"
dim = 10
format = "dense"
]
features2 = [
alias = "F1"
dim = 5
format = "dense"
]
]
]
]
]
]

5x10_and_5x5_jagged_composite = [
precision = "double"
reader = [
randomize = true
deserializers = [
[
type="CNTKTextFormatDeserializer"
module="CNTKTextFormatReader"
file="5x10_jagged.txt"
input=[
features1 = [
alias = "F0"
dim = 10
format = "dense"
]
]
]:[
type="CNTKTextFormatDeserializer"
module="CNTKTextFormatReader"
file="5x5_jagged.txt"
input=[
features2 = [
alias = "F1"
dim = 5
format = "dense"
]
]
]
]
]
]
@@ -22,4 +22,4 @@
3|F0 -0.477001 -0.00520275 -1.65596 -1.20112 -0.000103034 1.99208e-005 -0.284429 8411.77 -1.43145 -0.000683784
3|F0 5.66203 0.995731 -0.421539 0.928061 -10717.7 -1.00966 -0.792171 1.40365 -2.74474e-007 131.182
3|F0 1.09727e-014 -1.1983 -6.4148e-007 19.4354 -2.14661e-005 -0.0109343 -0.110837 0.935603 -0.638487 -1.82358
4|F0 -4.73443 3.35042 -3.55349 -2.34185 -3.23589e+009 0.00388636 0.177933 0.622771 0.0548931 -11.997
4|F0 -4.73443 3.35042 -3.55349 -2.34185 -3.23589e+009 0.00388636 0.177933 0.622771 0.0548931 -11.997
3 changes: 0 additions & 3 deletions Tests/UnitTests/ReaderTests/HTKLMFReaderTests.cpp
@@ -577,10 +577,7 @@ BOOST_AUTO_TEST_CASE(ExperimentalHTKMLFReaderSimpleDataLoop3)
1);
};



BOOST_AUTO_TEST_SUITE_END()

}

}}}
3 changes: 3 additions & 0 deletions Tests/UnitTests/ReaderTests/ReaderTests.vcxproj
@@ -188,6 +188,9 @@
<Text Include="Data\CNTKTextFormatReader\1x1_dense.txt" />
<Text Include="Data\CNTKTextFormatReader\1x1_sparse.txt" />
<Text Include="Data\CNTKTextFormatReader\200x200x2_seq2seq_dense.txt" />
<Text Include="Data\CNTKTextFormatReader\5x10_and_5x5_jagged.txt" />
<Text Include="Data\CNTKTextFormatReader\5x10_jagged.txt" />
<Text Include="Data\CNTKTextFormatReader\5x5_jagged.txt" />
<Text Include="Data\CNTKTextFormatReader\contains_blank_lines.txt" />
<Text Include="Data\CNTKTextFormatReader\duplicate_inputs.txt" />
<Text Include="Data\CNTKTextFormatReader\empty_samples.txt" />
9 changes: 9 additions & 0 deletions Tests/UnitTests/ReaderTests/ReaderTests.vcxproj.filters
@@ -306,6 +306,15 @@
<Text Include="Control\CNTKTextFormatReader\200x200x2_seq2seq_dense_sorted.txt">
<Filter>Control\CNTKTextFormatReader</Filter>
</Text>
<Text Include="Data\CNTKTextFormatReader\5x5_jagged.txt">
<Filter>Data\CNTKTextFormatReader</Filter>
</Text>
<Text Include="Data\CNTKTextFormatReader\5x10_and_5x5_jagged.txt">
<Filter>Data\CNTKTextFormatReader</Filter>
</Text>
<Text Include="Data\CNTKTextFormatReader\5x10_jagged.txt">
<Filter>Data\CNTKTextFormatReader</Filter>
</Text>
</ItemGroup>
<ItemGroup>
<Image Include="Data\images\black.jpg">
