Skip to content

Commit

Permalink
Fixing some bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
eldakms committed May 18, 2016
1 parent 04959ac commit 13f1769
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 43 deletions.
22 changes: 21 additions & 1 deletion Source/Readers/CNTKTextFormatReader/Exports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
#include "ReaderShim.h"
#include "CNTKTextFormatReader.h"
#include "HeapMemoryProvider.h"
#include "CudaMemoryProvider.h"

namespace Microsoft { namespace MSR { namespace CNTK {

Expand All @@ -32,4 +31,25 @@ extern "C" DATAREADER_API void GetReaderD(IDataReader** preader)
*preader = new ReaderShim<double>(factory);
}

// TODO: Not safe from the ABI perspective (C++ types are passed across an extern "C" boundary).
// The interface is expected to be reworked to make it ABI-safe.
//
// Factory entry point for the deserializers exported by the CNTK text format reader module.
//   deserializer       - out parameter; receives a newly allocated TextParser on success.
//   type               - requested deserializer type; only L"CNTKTextFormatDeserializer" is recognized.
//   deserializerConfig - configuration section; its "precision" value selects the element type.
//   corpus             - corpus descriptor forwarded to the created parser.
// Returns true if a deserializer was created, false (leaving *deserializer untouched) for an unknown type.
extern "C" DATAREADER_API bool CreateDeserializer(IDataDeserializer** deserializer, const std::wstring& type, const ConfigParameters& deserializerConfig, CorpusDescriptorPtr corpus, bool)
{
    // The precision is looked up before the type is examined, with "float" passed as the fallback argument.
    string precision = deserializerConfig.Find("precision", "float");

    // Unknown type: nothing to create.
    if (type != L"CNTKTextFormatDeserializer")
    {
        return false;
    }

    const bool singlePrecision = (precision == "float");
    if (singlePrecision)
    {
        *deserializer = new TextParser<float>(corpus, TextConfigHelper(deserializerConfig));
    }
    else
    {
        // Any other value is currently assumed to mean double.
        // TODO: should change when more types are supported.
        *deserializer = new TextParser<double>(corpus, TextConfigHelper(deserializerConfig));
    }

    // Deserializer created.
    return true;
}


}}}
13 changes: 10 additions & 3 deletions Source/Readers/CNTKTextFormatReader/Indexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ void Indexer::Build(CorpusDescriptorPtr corpus)
sd.m_fileOffsetBytes = offset;
sd.m_isValid = true;

auto& stringRegistry = corpus->GetStringRegistry();
while (!m_done)
{
SkipLine(); // ignore whatever is left on this line.
Expand All @@ -174,10 +175,10 @@ void Indexer::Build(CorpusDescriptorPtr corpus)
{
// found a new sequence, which starts at the [offset] bytes into the file
sd.m_byteSize = offset - sd.m_fileOffsetBytes;
auto key = msra::strfun::utf16(std::to_string(id));
auto key = msra::strfun::utf16(std::to_string(sd.m_id));
if (corpus->IsIncluded(key))
{
sd.m_key.m_sequence = corpus->GetStringRegistry()[key];
sd.m_key.m_sequence = stringRegistry[key];
sd.m_key.m_sample = 0;
AddSequence(sd);
}
Expand All @@ -191,7 +192,13 @@ void Indexer::Build(CorpusDescriptorPtr corpus)

// calculate the byte size for the last sequence
sd.m_byteSize = m_fileOffsetEnd - sd.m_fileOffsetBytes;
AddSequence(sd);
auto key = msra::strfun::utf16(std::to_string(sd.m_id));
if (corpus->IsIncluded(key))
{
sd.m_key.m_sequence = stringRegistry[key];
sd.m_key.m_sample = 0;
AddSequence(sd);
}
}


Expand Down
2 changes: 1 addition & 1 deletion Source/Readers/CompositeDataReader/CompositeDataReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ IDataDeserializerPtr CompositeDataReader::CreateDeserializer(const ConfigParamet
void CompositeDataReader::CreateTransforms(const ConfigParameters& deserializerConfig)
{
std::string defaultModule = deserializerConfig("module");
argvector<ConfigParameters> inputs = deserializerConfig("inputs");
argvector<ConfigParameters> inputs = deserializerConfig("input");
for (size_t i = 0; i < inputs.size(); ++i)
{
// Trying to find transformers in a stream section of the config.
Expand Down
4 changes: 2 additions & 2 deletions Source/Readers/ImageReader/ImageDataDeserializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ class ImageDataDeserializer::ImageChunk : public Chunk, public std::enable_share
// TODO: Provide only sequences specified in the corpus descriptor.
ImageDataDeserializer::ImageDataDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& config)
{
ConfigParameters inputs = config("inputs");
ConfigParameters inputs = config("input");
std::vector<std::string> featureNames = GetSectionsWithParameter("ImageDataDeserializer", inputs, "transforms");
std::vector<std::string> labelNames = GetSectionsWithParameter("ImageDataDeserializer", inputs, "labelDim");

Expand Down Expand Up @@ -290,8 +290,8 @@ void ImageDataDeserializer::CreateSequenceDescriptions(CorpusDescriptorPtr corpu
description.m_key.m_sequence = corpusStringRegistry[wsequenceKey];
description.m_key.m_sample = 0;

m_imageSequences.push_back(description);
m_keyToSequence[description.m_key.m_sequence] = m_imageSequences.size();
m_imageSequences.push_back(description);
RegisterByteReader(description.m_id, description.m_path, knownReaders);
}
}
Expand Down
23 changes: 14 additions & 9 deletions Source/Readers/ReaderLib/Bundler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,20 +133,26 @@ void Bundler::GetSequencesForChunk(size_t chunkId, std::vector<SequenceDescripti
// Can return because all sequences are clean.
if (chunk->m_invalid.empty())
{
// Reindexing.
for (int i = 0; i < sequences.size(); ++i)
{
sequences[i].m_id = i;
}
return;
}

// Do cleansing.
std::vector<SequenceDescription> result;
result.reserve(sequences.size());
for (size_t sequenceIndex = 0; sequenceIndex < sequences.size(); ++sequenceIndex)
for (size_t sequenceIndex = 0, index = 0; sequenceIndex < sequences.size(); ++sequenceIndex)
{
if (chunk->m_invalid.find(sequenceIndex) != chunk->m_invalid.end())
{
continue;
}

result.push_back(sequences[sequenceIndex]);
result.back().m_id = index++;
}
std::swap(sequences, result);
}
Expand All @@ -159,8 +165,8 @@ class Bundler::BundlingChunk : public Chunk
size_t m_chunkId;

// A mapping between exposed sequence id and inner chunk for each deserializer.
// Index i of the vector maps to the chunk of inner sequence (i / m_numberOfInputs) of
// deserializer (i % m_numberOfInputs).
// Index i of the vector maps to the chunk of inner sequence (i / number of deserializers) of
// deserializer (i % number of deserializers).
std::vector<ChunkPtr> m_innerChunks;
// A mapping between exposed sequence id and inner sequence id for each deserializer.
// Indices as above.
Expand All @@ -176,23 +182,22 @@ class Bundler::BundlingChunk : public Chunk
ChunkDescriptionPtr original = chunk->m_original;

auto& deserializers = m_parent->m_deserializers;
assert(numberOfInputs == deserializers.size());
std::vector<SequenceDescription> sequences;
sequences.reserve(original->m_numberOfSequences);

// Creating chunk mapping.
m_parent->m_driver->GetSequencesForChunk(original->m_id, sequences);
ChunkPtr drivingChunk = m_parent->m_driver->GetChunk(original->m_id);
m_sequenceToSequence.resize(m_numberOfInputs * sequences.size());
m_innerChunks.resize(m_numberOfInputs * sequences.size());
m_sequenceToSequence.resize(deserializers.size() * sequences.size());
m_innerChunks.resize(deserializers.size() * sequences.size());
for (size_t sequenceIndex = 0; sequenceIndex < sequences.size(); ++sequenceIndex)
{
if (chunk->m_invalid.find(sequenceIndex) != chunk->m_invalid.end())
{
continue;
}

size_t currentIndex = sequenceIndex * m_numberOfInputs;
size_t currentIndex = sequenceIndex * deserializers.size();
m_sequenceToSequence[currentIndex] = sequences[sequenceIndex].m_id;
m_innerChunks[currentIndex] = drivingChunk;
}
Expand All @@ -210,7 +215,7 @@ class Bundler::BundlingChunk : public Chunk
continue;
}

size_t currentIndex = sequenceIndex * m_numberOfInputs + deserializerIndex;
size_t currentIndex = sequenceIndex * deserializers.size() + deserializerIndex;
deserializers[deserializerIndex]->GetSequenceDescriptionByKey(sequences[sequenceIndex].m_key, s);
m_sequenceToSequence[currentIndex] = s.m_id;

Expand All @@ -235,7 +240,7 @@ class Bundler::BundlingChunk : public Chunk
virtual void GetSequence(size_t sequenceId, std::vector<SequenceDataPtr>& result) override
{
result.reserve(m_numberOfInputs);
size_t currentIndex = sequenceId * m_numberOfInputs;
size_t currentIndex = sequenceId * m_parent->m_deserializers.size();
for (int i = 0; i < m_parent->m_deserializers.size(); ++i)
{
size_t originalSequenceId = m_sequenceToSequence[currentIndex + i];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,24 @@ reader = [
verbosity=0
randomize=true

# Currently for image reader a single sequence is a chunk
# so setting randomization window to 1.
randomizationWindow=1
useLegacy=false

# A list of deserializers to use.
deserializers=[
[
[
type="CNTKTextFormatDeserializer"
module="CNTKTextFormatReader"
file = "$RootDir$/ImageAndTextReaderSimple_labels.txt"

input=[
labels=[
dim=4
format="sparse"
]
]
]:[
type="ImageDataDeserializer"
module="ImageReader"
file = "$RootDir$/ImageAndReaderSimple_map.txt"
file = "$RootDir$/ImageAndTextReaderSimple_map.txt"

# Description of input streams
inputs=[
input=[
features=[
transforms=[
[
Expand All @@ -53,22 +57,12 @@ reader = [
]
]
]

# Currently the image deserializer always has labels, we read but ignore them.
ignored=[
labelDim=4
]
]
]:[
type="CNTKTextFormatDeserializer"
module="CNTKTextFormatReader"
file = "$RootDir$/ImageAndReaderSimple_labels.txt"

# Description of input streams
input=[
labels=[
dim=4
format="sparse"
]
]
]
]
]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
1 |labels 0
2 |labels 1
3 |labels 2
4 |labels 3
1 |labels 0:1
2 |labels 1:1
3 |labels 2:1
4 |labels 3:1
2 changes: 1 addition & 1 deletion Tests/UnitTests/ReaderTests/ReaderTests.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@
</PropertyGroup>
<Target Name="CopyUnitTestDependencies" AfterTargets="Build">
<ItemGroup>
<UnitTestDependencies Include="$(OutDir)..\Math.dll;$(OutDir)..\ucifastreader.dll;$(OutDir)..\cntktextformatreader.dll;$(OutDir)..\htkmlfreader.dll;$(OutDir)..\experimentalhtkmlfreader.dll;$(OutDir)..\libacml_mp_dll.dll;$(OutDir)..\libifcoremd.dll;$(OutDir)..\libifportmd.dll;$(OutDir)..\libiomp*.dll;$(OutDir)..\libmmd.dll;$(OutDir)..\svml_dispmd.dll;$(ImageReaderDependencies);" />
<UnitTestDependencies Include="$(OutDir)..\Math.dll;$(OutDir)..\ucifastreader.dll;$(OutDir)..\cntktextformatreader.dll;$(OutDir)..\htkmlfreader.dll;$(OutDir)..\experimentalhtkmlfreader.dll;$(OutDir)..\compositedatareader.dll;$(OutDir)..\libacml_mp_dll.dll;$(OutDir)..\libifcoremd.dll;$(OutDir)..\libifportmd.dll;$(OutDir)..\libiomp*.dll;$(OutDir)..\libmmd.dll;$(OutDir)..\svml_dispmd.dll;$(ImageReaderDependencies);" />
</ItemGroup>
<Copy SourceFiles="@(UnitTestDependencies)" DestinationFolder="$(OutDir)" SkipUnchangedFiles="true">
<Output TaskParameter="DestinationFiles" ItemName="NewFileWrites" />
Expand Down

0 comments on commit 13f1769

Please sign in to comment.