From b8eb51d4a3cceb1698da34f52e46ca152667c25b Mon Sep 17 00:00:00 2001 From: yzhang87 Date: Mon, 14 Dec 2015 11:00:20 -0500 Subject: [PATCH] Fix kaldi Reader. 1) Using the new MBLayout interface. 2) Modify the configParameter to configRecordType to consistent with HTKMLFReader. 3) Clean the warning message. Next: refactor to make it consistent with HTKMLFReader. --- DataReader/Kaldi2Reader/HTKMLFReader.cpp | 229 ++- DataReader/Kaldi2Reader/HTKMLFReader.h | 10 +- DataReader/Kaldi2Reader/HTKMLFWriter.cpp | 11 +- .../KaldiSequenceTrainingDerivative.cpp | 10 +- DataReader/Kaldi2Reader/basetypes.h | 1518 ++++----------- DataReader/Kaldi2Reader/fileutil.cpp | 1726 ----------------- DataReader/Kaldi2Reader/fileutil.h | 620 ------ DataReader/Kaldi2Reader/htkfeatio.h | 1 + Makefile | 8 +- 9 files changed, 487 insertions(+), 3646 deletions(-) delete mode 100644 DataReader/Kaldi2Reader/fileutil.cpp delete mode 100644 DataReader/Kaldi2Reader/fileutil.h diff --git a/DataReader/Kaldi2Reader/HTKMLFReader.cpp b/DataReader/Kaldi2Reader/HTKMLFReader.cpp index b07c88a724a4..6b4cdfe371d0 100644 --- a/DataReader/Kaldi2Reader/HTKMLFReader.cpp +++ b/DataReader/Kaldi2Reader/HTKMLFReader.cpp @@ -7,6 +7,7 @@ // #include "stdafx.h" +#include "Basics.h" #include "basetypes.h" #include "htkfeatio.h" // for reading HTK features @@ -65,7 +66,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // If is false, throw away any utterance that is longer // than the specified . - m_maxUtteranceLength = readerConfig(L"maxUtteranceLength", "10000"); + m_maxUtteranceLength = readerConfig(L"maxUtteranceLength", 10000); // m_truncated: // If true, truncate utterances to fit the minibatch size. Otherwise @@ -73,8 +74,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // m_numberOfuttsPerMinibatch: // If larger than one, then each minibatch contains multiple // utterances. - m_truncated = readerConfig(L"Truncated", "false"); - m_numberOfuttsPerMinibatch = readerConfig(L"nbruttsineachrecurrentiter", "1"); + m_truncated = readerConfig(L"Truncated", false); + m_numberOfuttsPerMinibatch = readerConfig(L"nbruttsineachrecurrentiter", 1); if (m_numberOfuttsPerMinibatch < 1) { LogicError("nbrUttsInEachRecurrentIter cannot be less than 1.\n"); @@ -93,7 +94,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (readerConfig.Exists(L"seqTrainCriterion")) { m_doSeqTrain = true; - m_seqTrainCriterion = wstring(readerConfig(L"seqTrainCriterion")); + m_seqTrainCriterion = (const wstring&) readerConfig(L"seqTrainCriterion",L""); if ((m_seqTrainCriterion != L"mpfe") && (m_seqTrainCriterion != L"smbr")) { @@ -102,7 +103,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } // Checks if framemode is false in sequence training. - m_framemode = readerConfig(L"frameMode", "true"); + m_framemode = readerConfig(L"frameMode", true); if (m_framemode && m_doSeqTrain) { LogicError("frameMode has to be false in sequence training.\n"); @@ -124,8 +125,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { } // Checks if we are in "write" mode or "train/test" mode. - string command(readerConfig(L"action",L"")); - if (command == "write") + wstring command(readerConfig(L"action",L"")); + if (command == L"write") { m_trainOrTest = false; PrepareForWriting(readerConfig); @@ -138,8 +139,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { } - template - void HTKMLFReader::PrepareForSequenceTraining(const ConfigParameters& readerConfig) + template template + void HTKMLFReader::PrepareForSequenceTraining(const ConfigRecordType & readerConfig) { // Parameters that we are looking for. wstring denlatRspecifier, aliRspecifier, transModelFilename, silencePhoneStr; @@ -159,51 +160,53 @@ namespace Microsoft { namespace MSR { namespace CNTK { } // Processes "denlats" section. - ConfigParameters denlatConfig = readerConfig(L"denlats"); - if (!denlatConfig.Exists("rx")) + const ConfigRecordType & denlatConfig = readerConfig(L"denlats"); + if (!denlatConfig.Exists(L"rx")) { LogicError("Rspecifier is not provided for denominator lattices.\n"); } - if (!denlatConfig.Exists("kaldiModel")) + if (!denlatConfig.Exists(L"kaldiModel")) { LogicError("Rspecifier is not provided for Kaldi model.\n"); } - denlatRspecifier = wstring(denlatConfig("rx")); - transModelFilename = wstring(denlatConfig("kaldiModel")); - silencePhoneStr = wstring(denlatConfig("silPhoneList", "")); - oldAcousticScale = denlatConfig("oldAcousticScale", "0.0"); - acousticScale = denlatConfig("acousticScale", "0.2"); - lmScale = denlatConfig("lmScale", "1.0"); - oneSilenceClass = denlatConfig("oneSilenceClass", "true"); + denlatRspecifier = (const wstring&) denlatConfig(L"rx"); + transModelFilename = (const wstring &)(denlatConfig(L"kaldiModel")); + silencePhoneStr = (const wstring&)(denlatConfig(L"silPhoneList", L"")); + oldAcousticScale = denlatConfig(L"oldAcousticScale", 0.0); + acousticScale = denlatConfig(L"acousticScale", 0.2); + lmScale = denlatConfig(L"lmScale", 1.0); + oneSilenceClass = denlatConfig(L"oneSilenceClass", true); // Processes "alignments" section. - ConfigParameters aliConfig = readerConfig(L"alignments"); - if (!aliConfig.Exists("rx")) + const ConfigRecordType & aliConfig = readerConfig(L"alignments"); + if (!aliConfig.Exists(L"rx")) { LogicError("Rspecifier is not provided for alignments.\n"); } - aliRspecifier = wstring(aliConfig("rx")); + aliRspecifier = (const wstring&) (aliConfig(L"rx")); // Scans the configurations to get "readerDeriv" type input and // "readerObj" type input. Both are feature nodes, we feed derivatives // to training criterion node through "readerDeriv" and feed objective // through "readerObj". bool hasDrive = false, hasObj = false; - for (auto iter = readerConfig.begin(); iter != readerConfig.end(); ++iter) + //for (auto iter = readerConfig.begin(); iter != readerConfig.end(); ++iter) + for (const auto & id : readerConfig.GetMemberIds()) { - ConfigParameters temp = iter->second; + const ConfigRecordType & temp = readerConfig(id); if (temp.ExistsCurrent(L"type")) { - if (temp("type") == "readerDeriv" - || temp("type") == "seqTrainDeriv" /*for back compatibility */) + wstring type = temp(L"type"); + if (!_wcsicmp(type.c_str(), L"readerDeriv") + || !_wcsicmp(type.c_str(), L"seqTrainDeriv") /*for back compatibility */) { - m_nameToTypeMap[msra::strfun::utf16(iter->first)] = InputOutputTypes::readerDeriv; + m_nameToTypeMap[id] = InputOutputTypes::readerDeriv; hasDrive = true; } - else if (temp("type") == "readerObj" - || temp("type") == "seqTrainObj" /*for back compatibility */) + else if (!_wcsicmp(type.c_str(), L"readerObj") + || !_wcsicmp(type.c_str(),L"seqTrainObj") /*for back compatibility */) { - m_nameToTypeMap[msra::strfun::utf16(iter->first)] = InputOutputTypes::readerObj; + m_nameToTypeMap[id] = InputOutputTypes::readerObj; hasObj = true; } } @@ -241,8 +244,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Note that we treat and as features, but they // will be computed in the reader, rather then reading from disks. Those // will then be fed to training criterion node for training purposes. - template - void HTKMLFReader::PrepareForTrainingOrTesting(const ConfigParameters& readerConfig) + template template + void HTKMLFReader::PrepareForTrainingOrTesting(const ConfigRecordType& readerConfig) { // Loads files for sequence training. if (m_doSeqTrain) @@ -278,15 +281,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { vector & scriptpaths = m_trainingOrTestingFeatureSections; foreach_index(i, featureNames) { - ConfigParameters thisFeature = readerConfig(featureNames[i]); - - // Figures out the context. - ConfigArray contextWindow = thisFeature("contextWindow", "1"); + const ConfigRecordType & thisFeature = readerConfig(featureNames[i]); + m_featDims.push_back(thisFeature(L"dim")); + intargvector contextWindow = thisFeature(L"contextWindow", ConfigRecordType::Array(intargvector(vector{ 1 }))); if (contextWindow.size() == 1) // symmetric { size_t windowFrames = contextWindow[0]; if (windowFrames % 2 == 0 ) - RuntimeError("augmentationextent: neighbor expansion of input features to %d not symmetrical", windowFrames); + RuntimeError("augmentationextent: neighbor expansion of input features to %d not symmetrical", (int)windowFrames); size_t context = windowFrames / 2; // extend each side by this numContextLeft.push_back(context); numContextRight.push_back(context); @@ -299,28 +301,26 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else { - RuntimeError("contextFrames must have 1 or 2 values specified, found %d", contextWindow.size()); + RuntimeError("contextFrames must have 1 or 2 values specified, found %d", (int)contextWindow.size()); } // Figures the actual feature dimension, with context. - m_featDims.push_back(thisFeature("dim")); m_featDims[i] = m_featDims[i] * (1 + numContextLeft[i] + numContextRight[i]); // Figures out the category. - string type = thisFeature("type", "Real"); - if (type == "Real") + wstring type = thisFeature(L"type", L"real"); + if (!_wcsicmp(type.c_str(), L"real")) { m_nameToTypeMap[featureNames[i]] = InputOutputTypes::real; } else { - RuntimeError("feature type must be Real"); + InvalidArgument("feature type must be 'real'"); } - m_featureNameToIdMap[featureNames[i]] = iFeat; assert(iFeat == m_featureIdToNameMap.size()); m_featureIdToNameMap.push_back(featureNames[i]); - scriptpaths.push_back(new msra::asr::FeatureSection(thisFeature("scpFile"), thisFeature("rx"), thisFeature("featureTransform", ""))); + scriptpaths.push_back(new msra::asr::FeatureSection(thisFeature(L"scpFile"), thisFeature(L"rx"), thisFeature(L"featureTransform", L""))); m_featureNameToDimMap[featureNames[i]] = m_featDims[i]; m_featuresBufferMultiIO.push_back(NULL); @@ -336,36 +336,36 @@ namespace Microsoft { namespace MSR { namespace CNTK { vector> mlfpathsmulti; foreach_index(i, labelNames) { - ConfigParameters thisLabel = readerConfig(labelNames[i]); + const ConfigRecordType& thisLabel = readerConfig(labelNames[i]); // Figures out label dimension. - if (thisLabel.Exists("labelDim")) - m_labelDims.push_back(thisLabel("labelDim")); - else if (thisLabel.Exists("dim")) - m_labelDims.push_back(thisLabel("dim")); + if (thisLabel.Exists(L"labelDim")) + m_labelDims.push_back(thisLabel(L"labelDim")); + else if (thisLabel.Exists(L"dim")) + m_labelDims.push_back(thisLabel(L"dim")); else - RuntimeError("labels must specify dim or labelDim"); + InvalidArgument("labels must specify dim or labelDim"); // Figures out the category. - string type; - if (thisLabel.Exists("labelType")) - type = thisLabel("labelType"); // let's deprecate this eventually and just use "type"... + wstring type; + if (thisLabel.Exists(L"labelType")) + type = (const wstring &)thisLabel(L"labelType"); // let's deprecate this eventually and just use "type"... else - type = thisLabel("type","Category"); // outputs should default to category - if (type == "Category") + type = (const wstring &)thisLabel(L"type",L"category"); // outputs should default to category + if (!_wcsicmp(type.c_str(), L"category")) m_nameToTypeMap[labelNames[i]] = InputOutputTypes::category; else - RuntimeError("label type must be Category"); + InvalidArgument("label type must be Category"); // Loads label mapping. - statelistpaths.push_back(thisLabel("labelMappingFile",L"")); + statelistpaths.push_back(thisLabel(L"labelMappingFile",L"")); m_labelNameToIdMap[labelNames[i]] = iLabel; assert(iLabel == m_labelIdToNameMap.size()); m_labelIdToNameMap.push_back(labelNames[i]); m_labelNameToDimMap[labelNames[i]] = m_labelDims[i]; mlfpaths.clear(); - mlfpaths.push_back(thisLabel("mlfFile")); + mlfpaths.push_back(thisLabel(L"mlfFile")); mlfpathsmulti.push_back(mlfpaths); m_labelsBufferMultiIO.push_back(NULL); @@ -374,14 +374,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { iLabel++; // Figures out label to target mapping. - wstring labelToTargetMappingFile(thisLabel("labelToTargetMappingFile",L"")); + wstring labelToTargetMappingFile(thisLabel(L"labelToTargetMappingFile",L"")); if (labelToTargetMappingFile != L"") { std::vector> labelToTargetMap; m_convertLabelsToTargetsMultiIO.push_back(true); - if (thisLabel.Exists("targetDim")) + if (thisLabel.Exists(L"targetDim")) { - m_labelNameToDimMap[labelNames[i]] = m_labelDims[i] = thisLabel("targetDim"); + m_labelNameToDimMap[labelNames[i]] = m_labelDims[i] = thisLabel(L"targetDim"); } else RuntimeError("output must specify targetDim if labelToTargetMappingFile specified!"); @@ -489,11 +489,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { { LogicError("rollingWindow is not supported in sequence training.\n"); } - std::string pageFilePath; + std::wstring pageFilePath; std::vector pagePaths; if (readerConfig.Exists(L"pageFilePath")) { - pageFilePath = readerConfig(L"pageFilePath"); + pageFilePath = (const wstring&) readerConfig(L"pageFilePath"); // replace any '/' with '\' for compat with default path std::replace(pageFilePath.begin(), pageFilePath.end(), '/','\\'); @@ -505,9 +505,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { #endif #ifdef __unix__ struct stat statbuf; - if (stat(pageFilePath.c_str(), &statbuf)==-1) + if (stat(wtocharpath(pageFilePath).c_str(), &statbuf)==-1) { - throw std::runtime_error ("pageFilePath does not exist"); + RuntimeError ("pageFilePath does not exist"); } #endif @@ -520,7 +520,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { #endif #ifdef __unix__ pageFilePath.reserve(PATH_MAX); - pageFilePath = "/tmp/temp.CNTK.XXXXXX"; + pageFilePath = L"/tmp/temp.CNTK.XXXXXX"; #endif } @@ -555,7 +555,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { //m_frameSourceMultiIO = new msra::dbn::minibatchframesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, randomize, pagepath, mayhavenoframe, addEnergy); //m_frameSourceMultiIO->setverbosity(verbosity); - int verbosity = readerConfig(L"verbosity","2"); + int verbosity = readerConfig(L"verbosity",2); m_frameSource = new msra::dbn::minibatchframesourcemulti(scriptpaths, infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, pagePaths, mayhavenoframe, addEnergy); m_frameSource->setverbosity(verbosity); } @@ -570,8 +570,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // categories for different input/output: // features: InputOutputTypes::real // labels: InputOutputTypes::category - template - void HTKMLFReader::PrepareForWriting(const ConfigParameters& readerConfig) + template template + void HTKMLFReader::PrepareForWriting(const ConfigRecordType & readerConfig) { // Gets a list of features and labels. Note that we assume feature // section names have prefix "features" and label section names have @@ -596,7 +596,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { size_t windowFrames = contextWindow[0]; if (windowFrames % 2 == 0) - RuntimeError("augmentationextent: neighbor expansion of input features to %d not symmetrical", windowFrames); + RuntimeError("augmentationextent: neighbor expansion of input features to %d not symmetrical", (int)windowFrames); size_t context = windowFrames / 2; // extend each side by this numContextLeft.push_back(context); numContextRight.push_back(context); @@ -609,7 +609,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else { - RuntimeError("contextFrames must have 1 or 2 values specified, found %d", contextWindow.size()); + RuntimeError("contextFrames must have 1 or 2 values specified, found %d", (int)contextWindow.size()); } // Figures out the feature dimension, with context. @@ -1106,14 +1106,21 @@ namespace Microsoft { namespace MSR { namespace CNTK { // We initialize the sentence boundary information before we process // the utterances. - m_pMBLayout->Init(m_numberOfuttsPerMinibatch, m_currentMBSize, !m_framemode); - for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i++) + if (m_framemode) + { + assert(m_numberOfuttsPerMinibatch==1); + m_pMBLayout->InitAsFrameMode(m_currentMBSize); + } else + { + m_pMBLayout->Init(m_numberOfuttsPerMinibatch, m_currentMBSize); + } + /*for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i++) { for (size_t j = 0; j < m_currentMBSize; j++) { m_pMBLayout->SetWithoutOr(i, j, MinibatchPackingFlags::None); } - } + }*/ // Iterates over utterances. m_numberOfuttsPerMinibatch = 1 is a // special case. @@ -1121,6 +1128,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { { size_t startFrame = m_processedFrame[i]; size_t endFrame = 0; + // Sets the utterance boundary. + if (!m_framemode) + { + if (m_toProcess[i] > startFrame) + { + m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, i, -(ptrdiff_t)startFrame, m_toProcess[i]-startFrame); + } + } + //m_pMBLayout->Set(i, 0, MinibatchPackingFlags::SequenceStart); + if ((startFrame + m_currentMBSize) < m_toProcess[i]) { @@ -1129,12 +1146,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { assert(m_framemode == false); assert(m_truncated == true); - // Sets the utterance boundary. - if (startFrame == 0) - { - m_pMBLayout->Set(i, 0, MinibatchPackingFlags::SequenceStart); - } - endFrame = startFrame + m_currentMBSize; bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, m_currentMBSize); if (m_doMinibatchBuffering && populateSucc) @@ -1155,7 +1166,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // utterance boundary. // Sets the utterance boundary. - if (m_framemode == false) + /*if (m_framemode == false) { if (startFrame == 0) { @@ -1164,7 +1175,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // We have to set the utterance end. m_pMBLayout->Set(i, m_pMBLayout->GetNumTimeSteps() - 1, MinibatchPackingFlags::SequenceEnd); - } + }*/ // Now puts the utterance into the minibatch, and loads the // next one. @@ -1192,9 +1203,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Checks if we have reached the end of the minibatch. if (startFrame == m_toProcess[i]) { + m_pMBLayout->AddGap(i, 0, m_currentMBSize); for (size_t k = 0; k < m_currentMBSize; k++) { - m_pMBLayout->Set(i, k, MinibatchPackingFlags::NoInput); + //m_pMBLayout->Set(i, k, MinibatchPackingFlags::NoInput); // Populates with real features, the // following implementation is not efficient... @@ -1215,7 +1227,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Second, we set utterance boundary for the partial // minibatch, and then load it. - if (m_framemode == false) + /* if (m_framemode == false) { if (startFrame == 0) { @@ -1225,7 +1237,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // We have to set the utterance end. assert(m_toProcess[i] - startFrame - 1 < m_pMBLayout->GetNumTimeSteps()); m_pMBLayout->Set(i, m_toProcess[i] - startFrame - 1, MinibatchPackingFlags::SequenceEnd); - } + }*/ endFrame = m_toProcess[i]; size_t currentMBFilled = endFrame - startFrame; bool populateSucc = PopulateUtteranceInMinibatch(matrices, i, startFrame, endFrame, m_currentMBSize); @@ -1243,8 +1255,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { { // Sets the utterance boundary. assert(currentMBFilled + m_toProcess[i] <= m_pMBLayout->GetNumTimeSteps()); - m_pMBLayout->Set(i, currentMBFilled, MinibatchPackingFlags::SequenceStart); - m_pMBLayout->Set(i, currentMBFilled + m_toProcess[i] - 1, MinibatchPackingFlags::SequenceEnd); + m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, i, currentMBFilled, currentMBFilled + m_toProcess[i]); + //m_pMBLayout->Set(i, currentMBFilled, MinibatchPackingFlags::SequenceStart); + //m_pMBLayout->Set(i, currentMBFilled + m_toProcess[i] - 1, MinibatchPackingFlags::SequenceEnd); populateSucc = PopulateUtteranceInMinibatch(matrices, i, 0, m_toProcess[i], m_currentMBSize, currentMBFilled); if (m_doMinibatchBuffering && populateSucc) { @@ -1270,14 +1283,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_processedFrame[i] += m_currentMBSize - currentMBFilled; if (currentMBFilled < m_currentMBSize) { - m_pMBLayout->Set(i, currentMBFilled, MinibatchPackingFlags::SequenceStart); + m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, i, currentMBFilled, currentMBFilled+m_toProcess[i]); + //m_pMBLayout->Set(i, currentMBFilled, MinibatchPackingFlags::SequenceStart); } } else { + m_pMBLayout->AddGap(i, currentMBFilled, m_currentMBSize); for (size_t k = currentMBFilled; k < m_currentMBSize; k++) { - m_pMBLayout->Set(i, k, MinibatchPackingFlags::NoInput); + //m_pMBLayout->Set(i, k, MinibatchPackingFlags::NoInput); // Populates with real features, the // following implementation is not efficient... @@ -1343,8 +1358,19 @@ namespace Microsoft { namespace MSR { namespace CNTK { currentMBSize : (originalMBSize - startIndex); // Sets MBLayout. - currentMinibatch.pMBLayout->CopyFromRange(m_pMBLayout, startIndex, numFrames); - + //currentMinibatch.pMBLayout->CopyFromRange(m_pMBLayout, startIndex, numFrames); + currentMinibatch.pMBLayout->Init(m_pMBLayout->GetNumParallelSequences(), numFrames); + const auto & sequences = m_pMBLayout->GetAllSequences(); + for (const auto & seq : sequences) + { + if (seq.tEnd > startIndex && seq.tBegin < (ptrdiff_t)(startIndex + numFrames)) + { + auto shiftedSeq = seq; + shiftedSeq.tBegin -= startIndex; + shiftedSeq.tEnd -= startIndex; + currentMinibatch.pMBLayout->AddSequence(shiftedSeq); + } + } // Sets the minibatch size for the current minibatch. currentMinibatch.currentMBSize = numFrames; @@ -1644,9 +1670,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { const msra::dbn::matrix feat = m_fileEvalSource->ChunkOfFrames(id); if (first) { - m_pMBLayout->Init(1, feat.cols(), true); - m_pMBLayout->Set(0, 0, MinibatchPackingFlags::SequenceStart); - m_pMBLayout->SetWithoutOr(0, feat.cols() - 1, MinibatchPackingFlags::SequenceEnd); + m_pMBLayout->Init(1,feat.cols()); + m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, 0, 0, feat.cols()); + //m_pMBLayout->SetWithoutOr(0, feat.cols() - 1, MinibatchPackingFlags::SequenceEnd); first = false; } @@ -2089,21 +2115,22 @@ namespace Microsoft { namespace MSR { namespace CNTK { // For Kaldi2Reader, we now make the following assumptions // 1. feature sections will always have a sub-field "scpFile" // 2. label sections will always have a sub-field "mlfFile" - template - void HTKMLFReader::GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector& features, std::vector& labels) + template template + void HTKMLFReader::GetDataNamesFromConfig(const ConfigRecordType& readerConfig, std::vector& features, std::vector& labels) { - for (auto iter = readerConfig.begin(); iter != readerConfig.end(); ++iter) + for (auto & id : readerConfig.GetMemberIds()) { - auto pair = *iter; - ConfigParameters temp = iter->second; + if (!readerConfig.CanBeConfigRecord(id)) + continue; + const ConfigRecordType & temp = readerConfig(id); // see if we have a config parameters that contains a "file" element, it's a sub key, use it if (temp.ExistsCurrent(L"scpFile")) { - features.push_back(msra::strfun::utf16(iter->first)); + features.push_back(id); } else if (temp.ExistsCurrent(L"mlfFile")) { - labels.push_back(msra::strfun::utf16(iter->first)); + labels.push_back(id); } } } diff --git a/DataReader/Kaldi2Reader/HTKMLFReader.h b/DataReader/Kaldi2Reader/HTKMLFReader.h index a8aeb2420613..3cc7b02d5552 100644 --- a/DataReader/Kaldi2Reader/HTKMLFReader.h +++ b/DataReader/Kaldi2Reader/HTKMLFReader.h @@ -106,10 +106,10 @@ class HTKMLFReader : public IDataReader std::vector>>m_labelToTargetMapMultiIO; - void PrepareForTrainingOrTesting(const ConfigParameters& config); - void PrepareForWriting(const ConfigParameters& config); - void PrepareForSequenceTraining(const ConfigParameters& config); - + template void PrepareForTrainingOrTesting(const ConfigRecordType & config); + template void PrepareForWriting(const ConfigRecordType & config); + template void PrepareForSequenceTraining(const ConfigRecordType & config); + bool GetMinibatchToTrainOrTest(std::map*>& matrices); bool GetOneMinibatchToTrainOrTestDataBuffer(const std::map*>& matrices); bool GetMinibatchToWrite(std::map*>& matrices); @@ -136,7 +136,7 @@ class HTKMLFReader : public IDataReader size_t NumberSlicesInEachRecurrentIter() { return m_numberOfuttsPerMinibatch ;} void SetNbrSlicesEachRecurrentIter(const size_t) { }; - void GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector& features, std::vector& labels); + template void GetDataNamesFromConfig(const ConfigRecordType & readerConfig, std::vector& features, std::vector& labels); size_t ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector>& labelToTargetMap); diff --git a/DataReader/Kaldi2Reader/HTKMLFWriter.cpp b/DataReader/Kaldi2Reader/HTKMLFWriter.cpp index acc7969fadf8..a24c9b6b9a95 100644 --- a/DataReader/Kaldi2Reader/HTKMLFWriter.cpp +++ b/DataReader/Kaldi2Reader/HTKMLFWriter.cpp @@ -28,7 +28,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { //DATAWRITER_API IDataWriter* DataWriterFactory(void) template - void HTKMLFWriter::Init(const ConfigParameters& writerConfig) + template + void HTKMLFWriter::InitFromConfig(const ConfigRecordType& writerConfig) { m_tempArray = nullptr; m_tempArraySize = 0; @@ -40,11 +41,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t firstfilesonly = SIZE_MAX; // set to a lower value for testing - m_verbosity = writerConfig(L"verbosity", "2"); - m_overflowValue = writerConfig(L"overflowValue", "50"); - m_maxNumOverflowWarning = writerConfig(L"maxNumOverflowWarning", "10"); + m_verbosity = writerConfig(L"verbosity", 2); + m_overflowValue = writerConfig(L"overflowValue", 50); + m_maxNumOverflowWarning = writerConfig(L"maxNumOverflowWarning", 10); - ConfigArray outputNames = writerConfig(L"outputNodeNames",""); + vector outputNames = writerConfig(L"outputNodeNames", ConfigRecordType::Array(stringargvector())); if (outputNames.size()<1) RuntimeError("writer needs at least one outputNodeName specified in config"); int counter = 0; diff --git a/DataReader/Kaldi2Reader/KaldiSequenceTrainingDerivative.cpp b/DataReader/Kaldi2Reader/KaldiSequenceTrainingDerivative.cpp index ac771bcbfaec..2f1dc9d82d52 100644 --- a/DataReader/Kaldi2Reader/KaldiSequenceTrainingDerivative.cpp +++ b/DataReader/Kaldi2Reader/KaldiSequenceTrainingDerivative.cpp @@ -67,8 +67,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { RuntimeError("Number of labels in logLikelihood does not match that" " in the Kaldi model for utterance %S: %d v.s. %d\n", - uttID.c_str(), logLikelihood.GetNumRows(), - m_transModel.NumPdfs()); + uttID.c_str(), (int)logLikelihood.GetNumRows(), + (int)m_transModel.NumPdfs()); } // Reads alignment. @@ -82,7 +82,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { RuntimeError("Number of frames in logLikelihood does not match that" " in the alignment for utterance %S: %d v.s. %d\n", - uttID.c_str(), logLikelihood.GetNumCols(), ali.size()); + uttID.c_str(), (int)logLikelihood.GetNumCols(), (int)ali.size()); } // Reads denominator lattice. @@ -184,8 +184,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (maxTime != logLikelihood.GetNumCols()) { RuntimeError("Number of frames in the logLikelihood does not match" - " that in the denominator lattice for utterance %S\n", - uttID.c_str(), logLikelihood.GetNumRows(), maxTime); + " that in the denominator lattice for utterance %S: %d vs. %d\n", + uttID.c_str(), (int)logLikelihood.GetNumRows(), (int)maxTime); } std::vector> timeStateMap( diff --git a/DataReader/Kaldi2Reader/basetypes.h b/DataReader/Kaldi2Reader/basetypes.h index abbef5d0ec5e..d3565d73598f 100644 --- a/DataReader/Kaldi2Reader/basetypes.h +++ b/DataReader/Kaldi2Reader/basetypes.h @@ -1,1180 +1,338 @@ -// -// basetypes.h - basic types that C++ lacks -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -#pragma once -#ifndef _BASETYPES_ -#define _BASETYPES_ - -#ifndef UNDER_CE // fixed-buffer overloads not available for wince -#ifdef _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES // fixed-buffer overloads for strcpy() etc. -#undef _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES -#endif -#define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1 -#endif - -#pragma warning (push) -#pragma warning (disable: 4793) // caused by varargs - -// disable certain parts of basetypes for wince compilation -#ifdef UNDER_CE -#define BASETYPES_NO_UNSAFECRTOVERLOAD // disable unsafe CRT overloads (safe functions don't exist in wince) -#define BASETYPES_NO_STRPRINTF // dependent functions here are not defined for wince -#endif - -#ifndef OACR // dummies when we are not compiling under Office -#define OACR_WARNING_SUPPRESS(x, y) -#define OACR_WARNING_DISABLE(x, y) -#define OACR_WARNING_PUSH -#define OACR_WARNING_POP -#endif -#ifndef OACR_ASSUME // this seems to be a different one -#define OACR_ASSUME(x) -#endif - -// following oacr warnings are not level1 or level2-security -// in currect stage we want to ignore those warnings -// if necessay this can be fixed at later stage - -// not a bug -OACR_WARNING_DISABLE(EXC_NOT_CAUGHT_BY_REFERENCE, "Not indicating a bug or security threat."); -OACR_WARNING_DISABLE(LOCALDECLHIDESLOCAL, "Not indicating a bug or security threat."); - -// not reviewed -OACR_WARNING_DISABLE(MISSING_OVERRIDE, "Not level1 or level2_security."); -OACR_WARNING_DISABLE(EMPTY_DTOR, "Not level1 or level2_security."); -OACR_WARNING_DISABLE(DEREF_NULL_PTR, "Not level1 or level2_security."); -OACR_WARNING_DISABLE(INVALID_PARAM_VALUE_1, "Not level1 or level2_security."); -OACR_WARNING_DISABLE(VIRTUAL_CALL_IN_CTOR, "Not level1 or level2_security."); -OACR_WARNING_DISABLE(POTENTIAL_ARGUMENT_TYPE_MISMATCH, "Not level1 or level2_security."); - -// determine WIN32 api calling convention -// it seems this is normally stdcall?? but when compiling as /clr:pure or /clr:Safe -// this is not supported, so in this case, we need to use the 'default' calling convention -// TODO: can we reuse the #define of WINAPI?? -#ifdef _M_CEE_SAFE -#define WINAPI_CC __clrcall -#elif _M_CEE -#define WINAPI_CC __clrcall -#else -#define WINAPI_CC __stdcall -#endif - -// fix some warnings in STL -#if !defined(_DEBUG) || defined(_CHECKED) || defined(_MANAGED) -#pragma warning(disable : 4702) // unreachable code -#endif - -#include "Platform.h" -#include -#include -#include // include here because we redefine some names later -#include -#include -#include -#include // for HUGE_VAL // potential double isnan definition -#include -#include -#include -#include -#include // std::wstring_convert -#include -#include // for transform() -#ifdef _MSC_VER -#include // std::codecvt_utf8 -#endif -#ifdef _WIN32 -#include // for CRITICAL_SECTION and Unicode conversion functions --TODO: is there a portable alternative? -#include - -#endif -#if __unix__ -#include -#include -#include -#include -#include -#include -#include -#include - -typedef unsigned char byte; -#endif - -using namespace std; - -// CRT error handling seems to not be included in wince headers -// so we define our own imports -#ifdef UNDER_CE - -// TODO: is this true - is GetLastError == errno?? - also this adds a dependency on windows.h -#define errno GetLastError() - -// strerror(x) - x here is normally errno - TODO: make this return errno as a string -#define strerror(x) "strerror error but can't report error number sorry!" -#endif - -// disable warnings for which fixing would make code less readable -#pragma warning(disable : 4290) // throw() declaration ignored -#pragma warning(disable : 4244) // conversion from typeA to typeB, possible loss of data - -// ---------------------------------------------------------------------------- -// (w)cstring -- helper class like std::string but with auto-cast to char* -// ---------------------------------------------------------------------------- - -namespace msra { namespace strfun { - // a class that can return a std::string with auto-convert into a const char* - template struct basic_cstring : public std::basic_string - { - template basic_cstring (S p) : std::basic_string (p) { } - operator const C * () const { return this->c_str(); } - }; - typedef basic_cstring cstring; - typedef basic_cstring wcstring; -}} -static inline wchar_t*GetWC(const char *c) -{ - const size_t cSize = strlen(c)+1; - wchar_t* wc = new wchar_t[cSize]; - mbstowcs (wc, c, cSize); - - return wc; -} -struct MatchPathSeparator -{ - bool operator()( char ch ) const - { - return ch == '\\' || ch == '/'; - } -}; -static inline std::string basename( std::string const& pathname) -{ - return std::string (std::find_if(pathname.rbegin(), pathname.rend(),MatchPathSeparator()).base(), pathname.end()); -} - -static inline std::string removeExtension (std::string const& filename) -{ - //std::string::const_reverse_iterator pivot = std::find(filename.rbegin(), filename.rend(), '.'); - //return pivot == filename.rend() ? filename: std::string(filename.begin(), pivot.base()-1); - int lastindex = filename.find_first_of("."); - return filename.substr(0,lastindex); -} -static inline std::wstring basename( std::wstring const& pathname) -{ - return std::wstring (std::find_if(pathname.rbegin(), pathname.rend(),MatchPathSeparator()).base(), pathname.end()); -} - -static inline std::wstring removeExtension (std::wstring const& filename) -{ - //std::wstring::const_reverse_iterator pivot = std::find(filename.rbegin(), filename.rend(), '.'); - //return pivot == filename.rend() ? filename: std::wstring(filename.begin(), pivot.base()-1); - int lastindex = filename.find_first_of(L"."); - return filename.substr(0,lastindex); - -} - -// ---------------------------------------------------------------------------- -// some mappings for non-Windows builds -// ---------------------------------------------------------------------------- - -#ifndef _MSC_VER // add some functions that are VS-only -// --- basic file functions -// convert a wchar_t path to what gets passed to CRT functions that take narrow characters -// This is needed for the Linux CRT which does not accept wide-char strings for pathnames anywhere. -// Always use this function for mapping the paths. -static inline msra::strfun::cstring charpath (const std::wstring & p) -{ -#ifdef _WIN32 - return std::wstring_convert>().to_bytes(p); -#else // old version, delete once we know it works - size_t len = p.length(); - std::vector buf(2 * len + 1, 0); // max: 1 wchar => 2 mb chars - ::wcstombs(buf.data(), p.c_str(), 2 * len + 1); - return msra::strfun::cstring (&buf[0]); -#endif -} -static inline FILE* _wfopen (const wchar_t * path, const wchar_t * mode) { return fopen(charpath(path), charpath(mode)); } -static inline int _wunlink (const wchar_t * p) { return unlink (charpath (p)); } -static inline int _wmkdir (const wchar_t * p) { return mkdir (charpath (p), 0777/*correct?*/); } -// --- basic string functions -static inline wchar_t* wcstok_s (wchar_t* s, const wchar_t* delim, wchar_t** ptr) { return ::wcstok(s, delim, ptr); } -static inline int _stricmp (const char * a, const char * b) { return ::strcasecmp (a, b); } -static inline int _strnicmp (const char * a, const char * b, size_t n) { return ::strncasecmp (a, b, n); } -static inline int _wcsicmp (const wchar_t * a, const wchar_t * b) { return ::wcscasecmp (a, b); } -static inline int _wcsnicmp (const wchar_t * a, const wchar_t * b, size_t n) { return ::wcsncasecmp (a, b, n); } -static inline int64_t _strtoi64 (const char * s, char ** ep, int r) { return strtoll (s, ep, r); } // TODO: check if correct -static inline uint64_t _strtoui64 (const char * s, char ** ep, int r) { return strtoull (s, ep, r); } // TODO: correct for size_t? -// -- other -//static inline void memcpy_s(void * dst, size_t dstsize, const void * src, size_t maxcount) { assert (maxcount <= dstsize); memcpy (dst, src, maxcount); } -static inline void Sleep (size_t ms) { std::this_thread::sleep_for (std::chrono::milliseconds (ms)); } -#define _countof(_Array) (sizeof(_Array) / sizeof(_Array[0])) -#endif - -// ---------------------------------------------------------------------------- -// basic macros --TODO: do we need those? delete what we dont' need -// ---------------------------------------------------------------------------- - -//#define SAFE_DELETE(p) { if(p) { delete (p); (p)=NULL; } } -//#define SAFE_RELEASE(p) { if(p) { (p)->Release(); (p)=NULL; } } // nasty! use CComPtr<> - -// ---------------------------------------------------------------------------- -// basic data types -// ---------------------------------------------------------------------------- - -namespace msra { namespace basetypes { - - -// class fixed_vector - non-resizable vector - -template class fixed_vector -{ - _T * p; // pointer array - size_t n; // number of elements - void check (int index) const { index/*avoid compiler warning*/;assert (index >= 0 && (size_t) index < n); } - void check (size_t index) const { assert (index < n); } - // ... TODO: when I make this public, LinearTransform.h acts totally up but I cannot see where it comes from. - //fixed_vector (const fixed_vector & other) : n (0), p (NULL) { *this = other; } -public: - fixed_vector() : n (0), p (NULL) { } - void resize (int size) { clear(); if (size > 0) { p = new _T[size]; n = size; } } - void resize (size_t size) { clear(); if (size > 0) { p = new _T[size]; n = size; } } - fixed_vector (int size) : n (size), p (size > 0 ? new _T[size] : NULL) { } - fixed_vector (size_t size) : n ((int) size), p (size > 0 ? new _T[size] : NULL) { } - ~fixed_vector() { delete[] p; } - inline int size() const { return (int) n; } - inline int capacity() const { return (int) n; } - inline bool empty() const { return n == 0; } - void clear() { delete[] p; p = NULL; n = 0; } - _T * begin() { return p; } - const _T * begin() const { return p; } - _T * end() { return p + n; } // note: n == 0 so result is NULL - inline _T & operator[] (int index) { check (index); return p[index]; } // writing - inline const _T & operator[] (int index) const { check (index); return p[index]; } // reading - inline _T & operator[] (size_t index) { check (index); return p[index]; } // writing - inline const _T & operator[] (size_t index) const { check (index); return p[index]; } // reading - inline int indexof (const _T & elem) const { assert (&elem >= p && &elem < p + n); return &elem - p; } - inline void swap (fixed_vector & other) throw() { std::swap (other.p, p); std::swap (other.n, n); } - template fixed_vector & operator= (const VECTOR & other) - { - int other_n = (int) other.size(); - fixed_vector tmp (other_n); - for (int k = 0; k < other_n; k++) tmp[k] = other[k]; - swap (tmp); - return *this; - } - fixed_vector & operator= (const fixed_vector & other) - { - int other_n = (int) other.size(); - fixed_vector tmp (other_n); - for (int k = 0; k < other_n; k++) tmp[k] = other[k]; - swap (tmp); - return *this; - } - template fixed_vector (const VECTOR & other) : n (0), p (NULL) { *this = other; } -}; -template inline void swap (fixed_vector<_T> & L, fixed_vector<_T> & R) throw() { L.swap (R); } - -#pragma warning(pop) // pop off waring: expression has no effect - -// class matrix - simple fixed-size 2-dimensional array, access elements as m(i,j) -// stored as concatenation of rows - -template class matrix : fixed_vector -{ - size_t numcols; - size_t locate (size_t i, size_t j) const { assert (i < rows() && j < cols()); return i * cols() + j; } -public: - typedef T elemtype; - matrix() : numcols (0) {} - matrix (size_t n, size_t m) { resize (n, m); } - void resize (size_t n, size_t m) { numcols = m; fixed_vector::resize (n * m); } - size_t cols() const { return numcols; } - size_t rows() const { return empty() ? 0 : size() / cols(); } - size_t size() const { return fixed_vector::size(); } // use this for reading and writing... not nice! - bool empty() const { return fixed_vector::empty(); } - T & operator() (size_t i, size_t j) { return (*this)[locate(i,j)]; } - const T & operator() (size_t i, size_t j) const { return (*this)[locate(i,j)]; } - void swap (matrix & other) throw() { std::swap (numcols, other.numcols); fixed_vector::swap (other); } -}; -template inline void swap (matrix<_T> & L, matrix<_T> & R) throw() { L.swap (R); } - -// TODO: get rid of these -typedef std::string STRING; -typedef std::wstring WSTRING; - -// derive from this for noncopyable classes (will get you private unimplemented copy constructors) -// ... TODO: change all of basetypes classes/structs to use this -class noncopyable -{ - noncopyable & operator= (const noncopyable &); - noncopyable (const noncopyable &); -public: - noncopyable(){} -}; - -// class CCritSec and CAutoLock -- simple critical section handling -#ifndef _WIN32 // TODO: Currently only working under Windows; BROKEN otherwise, to be fixed -typedef int CRITICAL_SECTION; -static inline void InitializeCriticalSection(CRITICAL_SECTION *) {} -static inline void DeleteCriticalSection(CRITICAL_SECTION *) {} -static inline void EnterCriticalSection(CRITICAL_SECTION *) {} -static inline void LeaveCriticalSection(CRITICAL_SECTION *) {} -#endif -class CCritSec -{ - CCritSec (const CCritSec &); CCritSec & operator= (const CCritSec &); - CRITICAL_SECTION m_CritSec; -public: - CCritSec() { InitializeCriticalSection(&m_CritSec); }; - ~CCritSec() { DeleteCriticalSection(&m_CritSec); }; - void Lock() { EnterCriticalSection(&m_CritSec); }; - void Unlock() { LeaveCriticalSection(&m_CritSec); }; -}; - - -// locks a critical section, and unlocks it automatically -// when the lock goes out of scope -class CAutoLock -{ - CAutoLock(const CAutoLock &refAutoLock); CAutoLock &operator=(const CAutoLock &refAutoLock); - CCritSec & m_rLock; -public: - CAutoLock(CCritSec & rLock) : m_rLock (rLock) { m_rLock.Lock(); }; - ~CAutoLock() { m_rLock.Unlock(); }; -}; - -#if 0 -// an efficient way to write COM code -// usage examples: -// COM_function() || throw_hr ("message"); -// while ((s->Read (p, n, &m) || throw_hr ("Read failure")) == S_OK) { ... } -// is that cool or what? -struct bad_hr : public std::runtime_error -{ - HRESULT hr; - bad_hr (HRESULT p_hr, const char * msg) : hr (p_hr), std::runtime_error (msg) { } - // (only for use in || expression --deprecated:) - bad_hr() : std::runtime_error(NULL) { } - bad_hr(const char * msg) : std::runtime_error(msg) { } -}; -struct throw_hr -{ - const char * msg; - inline throw_hr (const char * msg = NULL) : msg (msg) {} -}; -inline static HRESULT operator|| (HRESULT hr, const throw_hr & e) -{ - if (SUCCEEDED (hr)) return hr; - throw bad_hr (hr, e.msg); -} -// (old deprecated version kept for compat:) -inline static bool operator|| (HRESULT hr, const bad_hr & e) { if (SUCCEEDED (hr)) return true; throw bad_hr (hr, e.what()); } - -// back-mapping of exceptions to HRESULT codes -// usage pattern: HRESULT COM_function (...) { try { exception-based function body } catch_hr_return; } -#define catch_hr_return \ - catch (const bad_alloc &) { return E_OUTOFMEMORY; } \ - catch (const bad_hr & e) { return e.hr; } \ - catch (const invalid_argument &) { return E_INVALIDARG; } \ - catch (const runtime_error &) { return E_FAIL; } \ - catch (const logic_error &) { return E_UNEXPECTED; } \ - catch (const exception &) { return E_FAIL; } \ - return S_OK; - -// CoInitializeEx() wrapper to ensure CoUnintialize() -//struct auto_co_initialize : noncopyable -//{ -// auto_co_initialize() { ::CoInitializeEx (NULL, COINIT_MULTITHREADED) || bad_hr ("auto_co_initialize: CoInitializeEx failure"); } -// ~auto_co_initialize() { ::CoUninitialize(); } -//}; - -// auto pointer for ::CoTaskMemFree -template class auto_co_ptr : noncopyable -{ - T * p; -public: - auto_co_ptr() : p (NULL) { } - auto_co_ptr (T * p) : p (p) { } -// ~auto_co_ptr() { ::CoTaskMemFree (p); } - operator T * () const { return p; } - T * operator->() const { return p; } - T** operator& () { assert (p == NULL); return &p; } // must be empty when taking address -}; - -// represents a thread-local-storage variable -// Note: __declspec(thread) is broken on pre-Vista for delay loaded DLLs -// [http://www.nynaeve.net/?p=187] -// so instead, we need to wrap up the Win32 TLS functions ourselves. -// Note: tls instances must be allocated as static to work correctly, e.g.: -// static tls myVal(); -// myVal = (void *) 25; -// printf ("value is %d",(void *) myVal); - -class tls -{ -private: - int tlsSlot; -public: - -#ifdef UNDER_CE - // this is from standard windows headers - seems to be missing in WINCE - #define TLS_OUT_OF_INDEXES ((DWORD)0xFFFFFFFF) -#endif - tls() { tlsSlot = TlsAlloc(); if (tlsSlot == TLS_OUT_OF_INDEXES) throw std::runtime_error("tls: TlsAlloc failed, out of tls slots"); } - operator void * () { return TlsGetValue (tlsSlot); } - void *operator = (void *val) { if (!TlsSetValue (tlsSlot,val)) throw std::runtime_error ("tls: TlsSetValue failed"); return val; } -}; -#endif - -};}; // namespace - -#if 0 //ndef BASETYPES_NO_UNSAFECRTOVERLOAD // if on, no unsafe CRT overload functions - -// ---------------------------------------------------------------------------- -// overloads for "unsafe" CRT functions used in our code base -// ---------------------------------------------------------------------------- - -// strlen/wcslen overloads for fixed-buffer size - -// Note: Careful while fixing bug related to these templates. -// In all attempted experiments, in seems all 6 definitions are required -// below to get the correct behaviour. Be very very careful -// not to delete something without testing that case 5&6 have "size" deduced. -// 1. char * -// 2. char * const -// 3. const char * -// 4. const char * const -// 5. char (&) [size] -// 6. const char (&) [size] -// the following includes all headers that use strlen() and fail because of the mapping below -// to find those, change #define strlen strlen_ to something invalid e.g. strlen::strlen_ -#if _MSC_VER >= 1600 // VS 2010 --TODO: fix this by correct include order instead -#include // defines strlen() as an intrinsic in VS 2010 -#include // uses strlen() -#include // uses strlen() -#endif -#define strlen strlen_ -#ifndef LINUX -template inline __declspec(deprecated("Dummy general template, cannot be used directly")) -#else -template inline -#endif // LINUX -size_t strlen_(_T &s) { return strnlen_s(static_cast(s), SIZE_MAX); } // never be called but needed to keep compiler happy -template inline size_t strlen_(const _T &s) { return strnlen_s(static_cast(s), SIZE_MAX); } -template<> inline size_t strlen_(char * &s) { return strnlen_s(s, SIZE_MAX); } -template<> inline size_t strlen_(const char * &s) { return strnlen_s(s, SIZE_MAX); } -template inline size_t strlen_(const char (&s)[n]) { return (strnlen_s(s, n)); } -template inline size_t strlen_(char (&s)[n]) { return (strnlen_s(s, n)); } -#define wcslen wcslen_ -template inline __declspec(deprecated("Dummy general template, cannot be used directly")) -size_t wcslen_(_T &s) { return wcsnlen_s(static_cast(s), SIZE_MAX); } // never be called but needed to keep compiler happy -template inline size_t __cdecl wcslen_(const _T &s) { return wcsnlen_s(static_cast(s), SIZE_MAX); } -template<> inline size_t wcslen_(wchar_t * &s) { return wcsnlen_s(s, SIZE_MAX); } -template<> inline size_t wcslen_(const wchar_t * &s) { return wcsnlen_s(s, SIZE_MAX); } -template inline size_t wcslen_(const wchar_t (&s)[n]) { return (wcsnlen_s(s, n)); } -template inline size_t wcslen_(wchar_t (&s)[n]) { return (wcsnlen_s(s, n)); } - -// xscanf wrappers -- one overload for each actual use case in our code base -static inline int sscanf (const char * buf, const char * format, int * i1) { return sscanf_s (buf, format, i1); } -static inline int sscanf (const char * buf, const char * format, int * i1, int * i2) { return sscanf_s (buf, format, i1, i2); } -static inline int sscanf (const char * buf, const char * format, int * i1, int * i2, int * i3) { return sscanf_s (buf, format, i1, i2, i3); } -static inline int sscanf (const char * buf, const char * format, double * f1) { return sscanf_s (buf, format, f1); } -static inline int swscanf (const wchar_t * buf, const wchar_t * format, int * i1) { return swscanf_s (buf, format, i1); } -static inline int fscanf (FILE * file, const char * format, float * f1) { return fscanf_s (file, format, f1); } - -// ...TODO: should we pass 'count' instead of SIZE_MAX? (need to review use cases) -#define _vsnprintf _vsnprintf_ -static inline int _vsnprintf_(char *buffer, size_t count, const char *format, va_list argptr) -{ return _vsnprintf_s (buffer, SIZE_MAX, count, format, argptr); } -#define _vsnwprintf _vsnwprintf_ -static inline int _vsnwprintf_(wchar_t *buffer, size_t count, const wchar_t *format, va_list argptr) -{ return _vsnwprintf_s (buffer, SIZE_MAX, count, format, argptr); } - -// wcsfcpy -- same as standard wcsncpy, use padded fixed-size buffer really needed -static inline void wcsfcpy (wchar_t * dest, const wchar_t * source, size_t count) -{ - while (count && (*dest++ = *source++) != 0) count--; // copy - if (count) while (--count) *dest++ = 0; // pad with zeroes -} - -// cacpy -- fixed-size character array (same as original strncpy (dst, src, sizeof (dst))) -// NOTE: THIS FUNCTION HAS NEVER BEEN TESTED. REMOVE THIS COMMENT ONCE IT HAS. -template static inline void cacpy (T (&dst)[n], const T * src) -{ for (int i = 0; i < n; i++) { dst[i] = *src; if (*src) src++; } } -// { return strncpy (dst, src, n); } // using original C std lib function - -// mappings for "unsafe" functions that are not really unsafe -#define strtok strtok_ // map to "safe" function (adds no value) -static inline /*const*/ char * strtok_(char * s, const char * delim) -{ - static msra::basetypes::tls tls_context; // see note for tls class def - char *context = (char *) (void *) tls_context; - char *ret = strtok_s (s, delim, &context); - tls_context = context; - return ret; -} - -#define wcstok wcstok_ // map to "safe" function (adds no value) -static inline /*const*/ wchar_t * wcstok_(wchar_t * s, const wchar_t * delim) -{ - static msra::basetypes::tls tls_context; // see note for tls class def - wchar_t *context = (wchar_t *) (void *) tls_context; - wchar_t *ret = wcstok_s (s, delim, &context); - tls_context = context; - return ret; -} - -#define fopen fopen_ // map to _fsopen() (adds no value) -static inline FILE * fopen_(const char * p, const char * m) { return _fsopen (p, m, _SH_DENYWR); } -#define _wfopen _wfopen_ // map to _wfsopen() (adds no value) -static inline FILE * _wfopen_(const wchar_t * p, const wchar_t * m) { return _wfsopen (p, m, _SH_DENYWR); } - -#define strerror(e) strerror_((e)) // map to "safe" function (adds no value) -static inline const char *strerror_(int e) -{ // keep a cache so we can return a pointer (to mimic the old interface) - static msra::basetypes::CCritSec cs; static std::map msgs; - msra::basetypes::CAutoLock lock (cs); - if (msgs.find(e) == msgs.end()) { char msg[1024]; strerror_s (msg, e); msgs[e] = msg; } - return msgs[e].c_str(); -} -#endif -#ifdef __unix__ -extern int fileno(FILE*); // somehow got deprecated in C++11 -#endif - -// ---------------------------------------------------------------------------- -// frequently missing string functions -// ---------------------------------------------------------------------------- - -namespace msra { namespace strfun { - -#ifndef BASETYPES_NO_STRPRINTF - -/* -#ifdef __UNIX__ -static FILE *dummyf = fopen("tmp", "wb"); -#endif -*/ -// [w]strprintf() -- like sprintf() but resulting in a C++ string -template struct _strprintf : public std::basic_string<_T> -{ // works for both wchar_t* and char* - _strprintf (const _T * format, ...) - { - va_list args; - va_start (args, format); // varargs stuff - size_t n = _cprintf (format, args); // num chars excl. '\0' - va_end(args); - va_start(args, format); - const int FIXBUF_SIZE = 128; // incl. '\0' - if (n < FIXBUF_SIZE) - { - _T fixbuf[FIXBUF_SIZE]; - this->assign (_sprintf (&fixbuf[0], sizeof (fixbuf)/sizeof (*fixbuf), format, args), n); - } - else // too long: use dynamically allocated variable-size buffer - { - std::vector<_T> varbuf (n + 1); // incl. '\0' - this->assign (_sprintf (&varbuf[0], varbuf.size(), format, args), n); - } - } -private: - // helpers - inline size_t _cprintf (const wchar_t * format, va_list args) - { -#ifdef __WINDOWS__ - return vswprintf (nullptr, 0, format, args); -#elif defined(__UNIX__) - FILE *dummyf = fopen("/dev/null", "w"); - if (dummyf == NULL) - perror("The following error occurred in basetypes.h:cprintf"); - int n = vfwprintf (dummyf, format, args); - if (n < 0) - perror("The following error occurred in basetypes.h:cprintf"); - fclose(dummyf); - return n; -#endif - } - inline size_t _cprintf (const char * format, va_list args) - { -#ifdef __WINDOWS__ - return vsprintf (nullptr, format, args); -#elif defined(__UNIX__) - FILE *dummyf = fopen("/dev/null", "wb"); - if (dummyf == NULL) - perror("The following error occurred in basetypes.h:cprintf"); - int n = vfprintf (dummyf, format, args); - if (n < 0) - perror("The following error occurred in basetypes.h:cprintf"); - fclose(dummyf); - return n; -#endif - } - inline const wchar_t * _sprintf (wchar_t * buf, size_t bufsiz, const wchar_t * format, va_list args) { vswprintf (buf, bufsiz, format, args); return buf; } - inline const char * _sprintf ( char * buf, size_t /*bufsiz*/, const char * format, va_list args) { vsprintf (buf, format, args); return buf; } -}; -typedef strfun::_strprintf strprintf; // char version -typedef strfun::_strprintf wstrprintf; // wchar_t version - -#endif - -// string-encoding conversion functions -// Note: generally, 8-bit strings in this codebase are UTF-8. -// One exception are functions that take 8-bit pathnames. Those will be interpreted by the OS as MBS. Best use wstring pathnames for all file accesses. - -#pragma warning(push) -#pragma warning(disable : 4996) // Reviewed by Yusheng Li, March 14, 2006. depr. fn (wcstombs, mbstowcs) -static inline std::string wcstombs(const std::wstring & p) // output: MBCS -{ - size_t len = p.length(); - msra::basetypes::fixed_vector buf(2 * len + 1); // max: 1 wchar => 2 mb chars - std::fill(buf.begin(), buf.end(), 0); - ::wcstombs(&buf[0], p.c_str(), 2 * len + 1); - return std::string(&buf[0]); -} -static inline std::wstring mbstowcs(const std::string & p) // input: MBCS -{ - size_t len = p.length(); - msra::basetypes::fixed_vector buf(len + 1); // max: >1 mb chars => 1 wchar - std::fill(buf.begin(), buf.end(), (wchar_t)0); - OACR_WARNING_SUPPRESS(UNSAFE_STRING_FUNCTION, "Reviewed OK. size checked. [rogeryu 2006/03/21]"); - ::mbstowcs(&buf[0], p.c_str(), len + 1); - return std::wstring(&buf[0]); -} -#pragma warning(pop) - -#ifdef _WIN32 -static inline cstring utf8 (const std::wstring & p) { return std::wstring_convert>().to_bytes(p); } // utf-16 to -8 -static inline wcstring utf16 (const std::string & p) { return std::wstring_convert>().from_bytes(p); } // utf-8 to -16 -#else // BUGBUG: we cannot compile the above on Cygwin GCC, so for now fake it using the mbs functions, which will only work for 7-bit ASCII strings -static inline std::string utf8 (const std::wstring & p) { return msra::strfun::wcstombs (p.c_str()); } // output: UTF-8... not really -static inline std::wstring utf16 (const std::string & p) { return msra::strfun::mbstowcs(p.c_str()); } // input: UTF-8... not really -#endif -static inline cstring utf8 (const std::string & p) { return p; } // no conversion (useful in templated functions) -static inline wcstring utf16 (const std::wstring & p) { return p; } - -// convert a string to lowercase --TODO: currently only correct for 7-bit ASCII -template -static inline void tolower_ascii (std::basic_string & s) { std::transform(s.begin(), s.end(), s.begin(), [] (CHAR c) { return (c >= 0 && c < 128) ? ::tolower(c) : c; }); } - -// split and join -- tokenize a string like strtok() would, join() strings together -template static inline std::vector> split (const std::basic_string<_T> & s, const _T * delim) -{ - std::vector> res; - for (size_t st = s.find_first_not_of (delim); st != std::basic_string<_T>::npos; ) - { - size_t en = s.find_first_of (delim, st +1); - if (en == std::basic_string<_T>::npos) en = s.length(); - res.push_back (s.substr (st, en-st)); - st = s.find_first_not_of (delim, en +1); // may exceed - } - return res; -} - -template static inline std::basic_string<_T> join (const std::vector> & a, const _T * delim) -{ - std::basic_string<_T> res; - for (int i = 0; i < (int) a.size(); i++) - { - if (i > 0) res.append (delim); - res.append (a[i]); - } - return res; -} - -// parsing strings to numbers -static inline int toint (const wchar_t * s) -{ - return (int)wcstol(s, 0, 10); - //return _wtoi (s); // ... TODO: test this -} -static inline int toint (const char * s) -{ - return atoi (s); // ... TODO: check it -} -static inline int toint (const std::wstring & s) { return toint (s.c_str()); } - -static inline double todouble (const char * s) -{ - char * ep; // will be set to point to first character that failed parsing - double value = strtod (s, &ep); - if (*s == 0 || *ep != 0) - throw std::runtime_error ("todouble: invalid input string"); - return value; -} - -// TODO: merge this with todouble(const char*) above -static inline double todouble (const std::string & s) -{ - s.size(); // just used to remove the unreferenced warning - - double value = 0.0; - - // stod supposedly exists in VS2010, but some folks have compilation errors - // If this causes errors again, change the #if into the respective one for VS 2010. -#if _MSC_VER > 1400 // VS 2010+ - size_t * idx = 0; - value = std::stod (s, idx); - if (idx) throw std::runtime_error ("todouble: invalid input string"); -#else - char *ep = 0; // will be updated by strtod to point to first character that failed parsing - value = strtod (s.c_str(), &ep); - - // strtod documentation says ep points to first unconverted character OR - // return value will be +/- HUGE_VAL for overflow/underflow - if (ep != s.c_str() + s.length() || value == HUGE_VAL || value == -HUGE_VAL) - throw std::runtime_error ("todouble: invalid input string"); -#endif - - return value; -} - -static inline double todouble (const std::wstring & s) -{ - wchar_t * endptr; - double value = wcstod (s.c_str(), &endptr); - if (*endptr) throw std::runtime_error ("todouble: invalid input string"); - return value; -} - -// ---------------------------------------------------------------------------- -// tokenizer -- utility for white-space tokenizing strings in a character buffer -// This simple class just breaks a string, but does not own the string buffer. -// ---------------------------------------------------------------------------- - -class tokenizer : public std::vector -{ - const char * delim; -public: - tokenizer (const char * delim, size_t cap) : delim (delim) { reserve (cap); } - // Usage: tokenizer tokens (delim, capacity); tokens = buf; tokens.size(), tokens[i] - void operator= (char * buf) - { - resize (0); - - // strtok_s not available on all platforms - so backoff to strtok on those -#if __STDC_WANT_SECURE_LIB__ - char * context; // for strtok_s() - for (char * p = strtok_s (buf, delim, &context); p; p = strtok_s (NULL, delim, &context)) - push_back (p); -#else - for (char * p = strtok (buf, delim); p; p = strtok (NULL, delim)) - push_back (p); -#endif - } -}; - -};}; // namespace - -// ---------------------------------------------------------------------------- -// wrappers for some basic types (files, handles, timer) -// ---------------------------------------------------------------------------- - -namespace msra { namespace basetypes { - -// FILE* with auto-close; use auto_file_ptr instead of FILE*. -// Warning: do not pass an auto_file_ptr to a function that calls fclose(), -// except for fclose() itself. -class auto_file_ptr -{ - FILE * f; - FILE * operator= (auto_file_ptr &); // can't ref-count: no assignment - auto_file_ptr (auto_file_ptr &); - // implicit close (destructor, assignment): we ignore error - void close() throw() { if (f) try { if (f != stdin && f != stdout && f != stderr) ::fclose (f); } catch (...) { } f = NULL; } - void openfailed (const std::string & path) { throw std::runtime_error ("auto_file_ptr: error opening file '" + path + "': " + strerror (errno)); } -protected: - friend int fclose (auto_file_ptr&); // explicit close (note: may fail) - int fclose() { int rc = ::fclose (f); if (rc == 0) f = NULL; return rc; } -public: - auto_file_ptr() : f (NULL) { } - ~auto_file_ptr() { close(); } - auto_file_ptr (const char * path, const char * mode) { f = fopen (path, mode); if (f == NULL) openfailed (path); } - auto_file_ptr (const wchar_t * wpath, const char * mode) { f = _wfopen (wpath, msra::strfun::utf16 (mode).c_str()); if (f == NULL) openfailed (msra::strfun::utf8 (wpath)); } - FILE * operator= (FILE * other) { close(); f = other; return f; } - auto_file_ptr (FILE * other) : f (other) { } - operator FILE * () const { return f; } - FILE * operator->() const { return f; } - void swap (auto_file_ptr & other) throw() { std::swap (f, other.f); } -}; -inline int fclose (auto_file_ptr & af) { return af.fclose(); } - -#ifdef _MSC_VER -// auto-closing container for Win32 handles. -// Pass close function if not CloseHandle(), e.g. -// auto_handle h (FindFirstFile(...), FindClose); -// ... TODO: the close function should really be a template parameter -template class auto_handle_t -{ - _H h; - BOOL (WINAPI_CC * close) (HANDLE); // close function - auto_handle_t operator= (const auto_handle_t &); - auto_handle_t (const auto_handle_t &); -public: - auto_handle_t (_H p_h, BOOL (WINAPI_CC * p_close) (HANDLE) = ::CloseHandle) : h (p_h), close (p_close) {} - ~auto_handle_t() { if (h != INVALID_HANDLE_VALUE) close (h); } - operator _H () const { return h; } -}; -typedef auto_handle_t auto_handle; -#endif - -// like auto_ptr but calls freeFunc_p (type free_func_t) instead of delete to clean up -// minor difference - wrapped object is T, not T *, so to wrap a -// T *, use auto_clean -// TODO: can this be used for simplifying those other classes? -template class auto_clean -{ - T it; - typedef FR (*free_func_t)(T); - free_func_t freeFunc; // the function used to free the pointer - void free() - { - //printf ("start clean\n"); - if (it) freeFunc(it); it = 0; - } - auto_clean operator= (const auto_clean &); // hide to prevent copy - auto_clean (const auto_clean &); // hide to prevent copy -public: - auto_clean (T it_p, free_func_t freeFunc_p) : it (it_p), freeFunc (freeFunc_p) {} - ~auto_clean() { free(); } - operator T () { return it; } - operator const T () const { return it; } - T detach () { T tmp = it; it = 0; return tmp; } // release ownership of object -}; - -#if 1 -// simple timer -// auto_timer timer; run(); double seconds = timer; // now can abandon the objecta -#ifdef __unix__ -typedef timeval LARGE_INTEGER; -#endif -class auto_timer -{ - LARGE_INTEGER freq, start; - auto_timer (const auto_timer &); void operator= (const auto_timer &); -public: - auto_timer() - { -#ifdef _WIN32 - if (!QueryPerformanceFrequency (&freq)) // count ticks per second - throw std::runtime_error ("auto_timer: QueryPerformanceFrequency failure"); - QueryPerformanceCounter (&start); -#endif -#ifdef __unix__ - gettimeofday (&start, NULL); -#endif - - } - operator double() const // each read gives time elapsed since start, in seconds - { - LARGE_INTEGER end; -#ifdef _WIN32 - QueryPerformanceCounter (&end); - return (end.QuadPart - start.QuadPart) / (double) freq.QuadPart; -#endif -#ifdef __unix__ - gettimeofday (&end,NULL); - return (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec)/(1000*1000); -#endif - } - void show (const std::string & msg) const - { - double elapsed = *this; - fprintf (stderr, "%s: %.6f ms\n", msg.c_str(), elapsed * 1000.0/*to ms*/); - } -}; -#endif - -};}; - -namespace msra { namespace files { - -// ---------------------------------------------------------------------------- -// textreader -- simple reader for text files --we need this all the time! -// Currently reads 8-bit files, but can return as wstring, in which case -// they are interpreted as UTF-8 (without BOM). -// Note: Not suitable for pipes or typed input due to readahead (fixable if needed). -// ---------------------------------------------------------------------------- - -class textreader -{ - msra::basetypes::auto_file_ptr f; - std::vector buf; // read buffer (will only grow, never shrink) - int ch; // next character (we need to read ahead by one...) - char getch() { char prevch = (char) ch; ch = fgetc (f); return prevch; } -public: - textreader (const std::wstring & path) : f (path.c_str(), "rb") { buf.reserve (10000); ch = fgetc (f); } - operator bool() const { return ch != EOF; } // true if still a line to read - std::string getline() // get and consume the next line - { - if (ch == EOF) throw std::logic_error ("textreader: attempted to read beyond EOF"); - assert (buf.empty()); - // get all line's characters --we recognize UNIX (LF), DOS (CRLF), and Mac (CR) convention - while (ch != EOF && ch != '\n' && ch != '\r') buf.push_back (getch()); - if (ch != EOF && getch() == '\r' && ch == '\n') getch(); // consume EOLN char - std::string line (buf.begin(), buf.end()); - buf.clear(); - return line; - } - std::wstring wgetline() { return msra::strfun::utf16 (getline()); } -}; - -};}; - -// ---------------------------------------------------------------------------- -// functional-programming style helper macros (...do this with templates?) -// ---------------------------------------------------------------------------- - -#define foreach_index(_i,_dat) for (int _i = 0; _i < (int) (_dat).size(); _i++) -#define map_array(_x,_expr,_y) { _y.resize (_x.size()); foreach_index(_i,_x) _y[_i]=_expr(_x[_i]); } -#define reduce_array(_x,_expr,_y) { foreach_index(_i,_x) _y = (_i==0) ? _x[_i] : _expr(_y,_x[_i]); } -//template -//static void fill_array(_A & a, _F v) { ::fill (a.begin(), a.end(), v); } - -// ---------------------------------------------------------------------------- -// frequently missing utility functions -// ---------------------------------------------------------------------------- - -namespace msra { namespace util { - -// to (slightly) simplify processing of command-line arguments. -// command_line args (argc, argv); -// while (args.has (1) && args[0][0] == '-') { option = args.shift(); process (option); } -// for (const wchar_t * arg = args.shift(); arg; arg = args.shift()) { process (arg); } -class command_line -{ - int num; - const wchar_t ** args; -public: - command_line (int argc, wchar_t * argv[]) : num (argc), args ((const wchar_t **) argv) { shift(); } - inline int size() const { return num; } - inline bool has (int left) { return size() >= left; } - const wchar_t * shift() { if (size() == 0) return NULL; num--; return *args++; } - const wchar_t * operator[] (int i) const { return (i < 0 || i >= size()) ? NULL : args[i]; } -}; - -// byte-reverse a variable --reverse all bytes (intended for integral types and float) -template static inline void bytereverse (T & v) throw() -{ // note: this is more efficient than it looks because sizeof (v[0]) is a constant - char * p = (char *) &v; - const size_t elemsize = sizeof (v); - for (int k = 0; k < elemsize / 2; k++) // swap individual bytes - swap (p[k], p[elemsize-1 - k]); -} - -// byte-swap an entire array -template static inline void byteswap (V & v) throw() -{ - foreach_index (i, v) - bytereverse (v[i]); -} - -//#if 0 -// execute a block with retry -// Block must be restartable. -// Use this when writing small files to those unreliable Windows servers. -// TODO: This will fail to compile under VS 2008--we need an #ifdef around this -template static void attempt (int retries, const FUNCTION & body) -{ - for (int attempt = 1; ; attempt++) - { - try - { - body(); - if (attempt > 1) fprintf (stderr, "attempt: success after %d retries\n", attempt); - break; - } - catch (const std::exception & e) - { - if (attempt >= retries) - throw; // failed N times --give up and rethrow the error - fprintf (stderr, "attempt: %s, retrying %d-th time out of %d...\n", e.what(), attempt+1, retries); - ::Sleep (1000); // wait a little, then try again - } - } -} -//#endif - -};}; // namespace - -template static inline void ZeroStruct (S & s) { memset (&s, 0, sizeof (s)); } - -// ---------------------------------------------------------------------------- -// machine dependent -// ---------------------------------------------------------------------------- - -#define MACHINE_IS_BIG_ENDIAN (false) - -using namespace msra::basetypes; // for compatibility - -#pragma warning (pop) - -// RuntimeError - throw a std::runtime_error with a formatted error string -#ifdef _MSC_VER -__declspec(noreturn) -#endif -static inline void RuntimeError(const char * format, ...) -{ - va_list args; - char buffer[1024]; - - va_start(args, format); - vsprintf(buffer, format, args); - throw std::runtime_error(buffer); -}; - -// LogicError - throw a std::logic_error with a formatted error string -#ifdef _MSC_VER -__declspec(noreturn) -#endif -static inline void LogicError(const char * format, ...) -{ - va_list args; - char buffer[1024]; - - va_start(args, format); - vsprintf(buffer, format, args); - throw std::logic_error(buffer); -}; - -// ---------------------------------------------------------------------------- -// dynamic loading of modules -// ---------------------------------------------------------------------------- - -#ifdef _WIN32 -class Plugin -{ - HMODULE m_hModule; // module handle for the writer DLL - std::wstring m_dllName; // name of the writer DLL -public: - Plugin() { m_hModule = NULL; } - template // accepts char (UTF-8) and wide string - FARPROC Load(const STRING & plugin, const std::string & proc) - { - m_dllName = msra::strfun::utf16(plugin); - m_dllName += L".dll"; - m_hModule = LoadLibrary(m_dllName.c_str()); - if (m_hModule == NULL) - RuntimeError("Plugin not found: %s", msra::strfun::utf8(m_dllName).c_str()); - - // create a variable of each type just to call the proper templated version - return GetProcAddress(m_hModule, proc.c_str()); - } - ~Plugin(){} - // removed because this causes the exception messages to be lost (exception vftables are unloaded when DLL is unloaded) - // ~Plugin() { if (m_hModule) FreeLibrary(m_hModule); } -}; -#else -class Plugin -{ -private: - void *handle; -public: - Plugin() - { - handle = NULL; - } - - template // accepts char (UTF-8) and wide string - void * Load(const STRING & plugin, const std::string & proc) - { - string soName = msra::strfun::utf8(plugin); - soName = soName + ".so"; - void *handle = dlopen(soName.c_str(), RTLD_LAZY); - if (handle == NULL) - RuntimeError("Plugin not found: %s", soName.c_str()); - return dlsym(handle, proc.c_str()); - } - - ~Plugin() { - if (handle != NULL) - dlclose(handle); - } -}; -#endif - -#if 0 // construction site -// ---------------------------------------------------------------------------- -// class RegisterModule -// TODO: move this elsewhere -// ---------------------------------------------------------------------------- -#include -template -class RegisterModule -{ - static std::map> & GetFactoryMethodsHash() - { - static std::map> FactoryMethods; // shared object - return FactoryMethods; - } -public: - RegisterModule(const std::wstring & ModuleName, std::function FactoryMethod) - { - auto & FactoryMethods = GetFactoryMethodsHash(); - FactoryMethods[ModuleName] = FactoryMethod; - // TODO: check for dups, using map::insert() - } - static MODULETYPE* Create(const std::wstring & ModuleName) - { - auto & FactoryMethods = GetFactoryMethodsHash(); - auto Entry = FactoryMethods.find(ModuleName); - if (Entry != FactoryMethods.end()) - return Entry->second(); - else - return nullptr; - } -}; -#endif -#define EPSILON 1e-5 -#define ISCLOSE(a, b, threshold) (abs(a - b) < threshold)?true:false - -/** -These macros are used for sentence segmentation information. -*/ -#define SEQUENCE_START ((int) MinibatchPackingFlags::SequenceStart) -#define SEQUENCE_MIDDLE ((int) MinibatchPackingFlags::None) -#define SEQUENCE_END ((int) MinibatchPackingFlags::SequenceEnd) -#define NO_INPUT ((int) MinibatchPackingFlags::NoInput) -#define NO_FEATURE ((int) MinibatchPackingFlags::NoFeature) -#define NO_LABEL ((int) MinibatchPackingFlags::NoLabel) - -enum class MinibatchPackingFlags : unsigned char -{ - None = 0, - SequenceStart = 1 << 0, //binary 0001 - SequenceEnd = 1 << 1, //binary 0010 - NoFeature = 1 << 2, //binary 0100 - NoLabel = 1 << 3, //binary 1000 - - NoInput = NoFeature | NoLabel, //when we refactorize reader, NoInput will no longer needed - SequenceStartOrNoFeature = SequenceStart | NoFeature, - SequenceEndOrNoFeature = SequenceEnd | NoFeature, - SequenceStartOrEndOrNoFeature = SequenceStart | SequenceEnd | NoFeature, -}; - - -inline MinibatchPackingFlags operator| (MinibatchPackingFlags a, MinibatchPackingFlags b) -{ - return static_cast(static_cast(a) | static_cast(b)); -} - -inline MinibatchPackingFlags& operator|= (MinibatchPackingFlags& a, MinibatchPackingFlags b) -{ - a = a | b; - return a; -} - - -inline bool operator& (MinibatchPackingFlags a, MinibatchPackingFlags b) -{ - return (static_cast(a) & static_cast(b)) != 0; -} - -template -static inline bool comparator(const pair& l, const pair& r) -{ - return l.second > r.second; -} - - -#endif // _BASETYPES_ +// +// basetypes.h - basic types that C++ lacks +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +#pragma once +#ifndef _BASETYPES_ +#define _BASETYPES_ + +#include "Platform.h" +#include "Basics.h" +#include +#include +#include // include here because we redefine some names later +#include +#include +#include +#include // for HUGE_VAL +#include +#include +#include +#include +#include // std::wstring_convert +#include +#include // for transform() +#include +#include +#include +#include +#include +#include +#ifdef _WIN32 +#include // for CRITICAL_SECTION and Unicode conversion functions --TODO: is there a portable alternative? +#endif +#if __unix__ +#include +#include +#include +#include +#include +typedef unsigned char byte; +#endif + +using namespace std; // Ugh! + +static inline wchar_t*GetWC(const char *c) +{ + const size_t cSize = strlen(c)+1; + wchar_t* wc = new wchar_t[cSize]; +#ifdef _WIN32 + size_t retVal; + mbstowcs_s(&retVal, wc, cSize, c, cSize); +#else + mbstowcs(wc, c, cSize); +#endif // _WIN32 + + return wc; +} +struct MatchPathSeparator +{ + bool operator()( char ch ) const + { + return ch == '\\' || ch == '/'; + } +}; +static inline std::string basename( std::string const& pathname) +{ + return std::string (std::find_if(pathname.rbegin(), pathname.rend(),MatchPathSeparator()).base(), pathname.end()); +} + +static inline std::string removeExtension (std::string const& filename) +{ + //std::string::const_reverse_iterator pivot = std::find(filename.rbegin(), filename.rend(), '.'); + //return pivot == filename.rend() ? filename: std::string(filename.begin(), pivot.base()-1); + int lastindex = filename.find_first_of("."); + return filename.substr(0,lastindex); +} +static inline std::wstring basename( std::wstring const& pathname) +{ + return std::wstring (std::find_if(pathname.rbegin(), pathname.rend(),MatchPathSeparator()).base(), pathname.end()); +} + +static inline std::wstring removeExtension (std::wstring const& filename) +{ + //std::wstring::const_reverse_iterator pivot = std::find(filename.rbegin(), filename.rend(), '.'); + //return pivot == filename.rend() ? filename: std::wstring(filename.begin(), pivot.base()-1); + int lastindex = filename.find_first_of(L"."); + return filename.substr(0,lastindex); + +} + + +// ---------------------------------------------------------------------------- +// basic data types +// ---------------------------------------------------------------------------- + +namespace msra { namespace basetypes { + +#ifdef __unix__ + typedef timeval LARGE_INTEGER; +#endif + class auto_timer + { + LARGE_INTEGER freq, start; + auto_timer(const auto_timer &); void operator= (const auto_timer &); + public: + auto_timer() + { +#ifdef _WIN32 + if (!QueryPerformanceFrequency(&freq)) // count ticks per second + RuntimeError("auto_timer: QueryPerformanceFrequency failure"); + QueryPerformanceCounter(&start); +#endif +#ifdef __unix__ + gettimeofday (&start, NULL); +#endif + } + operator double() const // each read gives time elapsed since start, in seconds + { + LARGE_INTEGER end; +#ifdef _WIN32 + QueryPerformanceCounter(&end); + return (end.QuadPart - start.QuadPart) / (double)freq.QuadPart; +#endif +#ifdef __unix__ + gettimeofday (&end,NULL); + return (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec)/(1000*1000); +#endif + } + void show(const std::string & msg) const + { + double elapsed = *this; + fprintf(stderr, "%s: %.6f ms\n", msg.c_str(), elapsed * 1000.0/*to ms*/); + } + }; + +#pragma warning(push) +#pragma warning(disable : 4555) // expression has no affect, used so retail won't be empty + +// class fixed_vector - non-resizable vector --TODO: just use std::vector + +template class fixed_vector +{ + _T * p; // pointer array + size_t n; // number of elements + void check (int index) const + { + assert (index >= 0 && (size_t) index < n); +#ifdef NDEBUG + UNUSED(index); +#endif + } + void check (size_t index) const + { + assert (index < n); +#ifdef NDEBUG + UNUSED(index); +#endif + } + // ... TODO: when I make this public, LinearTransform.h acts totally up but I cannot see where it comes from. + //fixed_vector (const fixed_vector & other) : n (0), p (NULL) { *this = other; } +public: + fixed_vector() : n (0), p (NULL) { } + void resize (int size) { clear(); if (size > 0) { p = new _T[size]; n = size; } } + void resize (size_t size) { clear(); if (size > 0) { p = new _T[size]; n = size; } } + fixed_vector (int size) : n (size), p (size > 0 ? new _T[size] : NULL) { } + fixed_vector (size_t size) : n ((int) size), p (size > 0 ? new _T[size] : NULL) { } + ~fixed_vector() { delete[] p; } + int size() const { return (int) n; } + inline int capacity() const { return (int) n; } + bool empty() const { return n == 0; } + void clear() { delete[] p; p = NULL; n = 0; } + _T * begin() { return p; } + const _T * begin() const { return p; } + _T * end() { return p + n; } // note: n == 0 so result is NULL + inline _T & operator[] (int index) { check (index); return p[index]; } // writing + inline const _T & operator[] (int index) const { check (index); return p[index]; } // reading + inline _T & operator[] (size_t index) { check (index); return p[index]; } // writing + inline const _T & operator[] (size_t index) const { check (index); return p[index]; } // reading + inline int indexof (const _T & elem) const { assert (&elem >= p && &elem < p + n); return &elem - p; } + void swap (fixed_vector & other) throw() { std::swap (other.p, p); std::swap (other.n, n); } + template fixed_vector & operator= (const VECTOR & other) + { + int other_n = (int) other.size(); + fixed_vector tmp (other_n); + for (int k = 0; k < other_n; k++) tmp[k] = other[k]; + swap (tmp); + return *this; + } + fixed_vector & operator= (const fixed_vector & other) + { + int other_n = (int) other.size(); + fixed_vector tmp (other_n); + for (int k = 0; k < other_n; k++) tmp[k] = other[k]; + swap (tmp); + return *this; + } + template fixed_vector (const VECTOR & other) : n (0), p (NULL) { *this = other; } +}; +template inline void swap (fixed_vector<_T> & L, fixed_vector<_T> & R) throw() { L.swap (R); } + +#pragma warning(pop) // pop off waring: expression has no effect + +// class matrix - simple fixed-size 2-dimensional array, access elements as m(i,j) +// stored as concatenation of rows + +#if 1 +template class matrix : fixed_vector +{ + size_t numcols; + size_t locate(size_t i, size_t j) const { assert(i < rows() && j < cols()); return i * cols() + j; } +public: + typedef T elemtype; + matrix() : numcols(0) {} + matrix(size_t n, size_t m) { resize(n, m); } + void resize(size_t n, size_t m) { numcols = m; fixed_vector::resize(n * m); } + size_t cols() const { return numcols; } + size_t rows() const { return empty() ? 0 : size() / cols(); } + size_t size() const { return fixed_vector::size(); } // use this for reading and writing... not nice! + bool empty() const { return fixed_vector::empty(); } + T & operator() (size_t i, size_t j) { return (*this)[locate(i, j)]; } + const T & operator() (size_t i, size_t j) const { return (*this)[locate(i, j)]; } + void swap(matrix & other) throw() { std::swap(numcols, other.numcols); fixed_vector::swap(other); } +}; +template inline void swap(matrix<_T> & L, matrix<_T> & R) throw() { L.swap(R); } + +// TODO: get rid of these +typedef std::string STRING; +typedef std::wstring WSTRING; +typedef std::basic_string TSTRING; // wide/narrow character string +#endif + +// derive from this for noncopyable classes (will get you private unimplemented copy constructors) +// ... TODO: change all of basetypes classes/structs to use this +class noncopyable +{ + noncopyable & operator= (const noncopyable &); + noncopyable (const noncopyable &); +public: + noncopyable(){} +}; + +class CCritSec +{ + CCritSec (const CCritSec &) = delete; + CCritSec & operator= (const CCritSec &) = delete; + std::mutex m_CritSec; +public: + CCritSec() {}; + ~CCritSec() {}; + void Lock() { m_CritSec.lock(); }; + void Unlock() { m_CritSec.unlock(); }; +}; + + +// locks a critical section, and unlocks it automatically +// when the lock goes out of scope +class CAutoLock +{ + CAutoLock(const CAutoLock &refAutoLock); CAutoLock &operator=(const CAutoLock &refAutoLock); + CCritSec & m_rLock; +public: + CAutoLock(CCritSec & rLock) : m_rLock (rLock) { m_rLock.Lock(); }; + ~CAutoLock() { m_rLock.Unlock(); }; +}; + +};}; // namespace + +// ---------------------------------------------------------------------------- +// frequently missing utility functions +// ---------------------------------------------------------------------------- + +namespace msra { namespace util { + +// byte-reverse a variable --reverse all bytes (intended for integral types and float) +template static inline void bytereverse (T & v) throw() +{ // note: this is more efficient than it looks because sizeof (v[0]) is a constant + char * p = (char *) &v; + const size_t elemsize = sizeof (v); + for (int k = 0; k < elemsize / 2; k++) // swap individual bytes + swap (p[k], p[elemsize-1 - k]); +} + +// byte-swap an entire array +template static inline void byteswap (V & v) throw() +{ + foreach_index (i, v) + bytereverse (v[i]); +} + +// execute a block with retry +// Block must be restartable. +// Use this when writing small files to those unreliable Windows servers. +// TODO: This will fail to compile under VS 2008--we need an #ifdef around this +template static void attempt (int retries, const FUNCTION & body) +{ + for (int attempt = 1; ; attempt++) + { + try + { + body(); + if (attempt > 1) fprintf (stderr, "attempt: success after %d retries\n", attempt); + break; + } + catch (const std::exception & e) + { + if (attempt >= retries) + throw; // failed N times --give up and rethrow the error + fprintf (stderr, "attempt: %s, retrying %d-th time out of %d...\n", e.what(), attempt+1, retries); + ::Sleep (1000); // wait a little, then try again + } + } +} + +};}; // namespace + +template static inline void ZeroStruct (S & s) { memset (&s, 0, sizeof (s)); } + +// ---------------------------------------------------------------------------- +// machine dependent +// ---------------------------------------------------------------------------- + +using namespace msra::basetypes; // for compatibility + +//#pragma warning (pop) + +// why is this in basetypes.h? +#if 0 +template +static inline bool comparator(const pair& l, const pair& r) +{ + return l.second > r.second; +} +#endif + +#endif // _BASETYPES_ diff --git a/DataReader/Kaldi2Reader/fileutil.cpp b/DataReader/Kaldi2Reader/fileutil.cpp deleted file mode 100644 index 2daeda5437ad..000000000000 --- a/DataReader/Kaldi2Reader/fileutil.cpp +++ /dev/null @@ -1,1726 +0,0 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// - - -#include "stdafx.h" - -#ifndef UNDER_CE // fixed-buffer overloads not available for wince -#ifdef _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES // fixed-buffer overloads for strcpy() etc. -#undef _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES -#endif -#define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1 -#endif - -#include "basetypes.h" -#include "fileutil.h" -#include -#include -#include -#ifndef __unix__ -#include "windows.h" // for FILETIME -#endif -#include // for std::find - -#ifndef UNDER_CE // some headers don't exist under winCE - the appropriate definitions seem to be in stdlib.h -#include // for _O_BINARY/TEXT - not needed for wince -#ifndef __unix__ -#include // for _setmode() -#endif -#endif - -#include - -using namespace std; - -// ---------------------------------------------------------------------------- -// fopenOrDie(): like fopen() but terminate with err msg in case of error. -// A pathname of "-" returns stdout or stdin, depending on mode, and it will -// change the binary mode if 'b' or 't' are given. If you use this, make sure -// not to fclose() such a handle. -// ---------------------------------------------------------------------------- - -static const wchar_t * strchr (const wchar_t * s, wchar_t v) { return wcschr (s, v); } - -// pathname is "-" -- open stdin or stdout. Changes bin mode if 'b' or 't' given. -template FILE * fopenStdHandle (const _T * mode) -{ - FILE * f = strchr (mode, 'r') ? stdin : stdout; -#ifndef __unix__ // don't need binary/text distinction on unix - if (strchr(mode, 'b') || strchr(mode, 't')) // change binary mode - { - // switch to binary mode if not yet (in case it is stdin) - int rc = _setmode (_fileno (f), strchr (mode, 'b') ? _O_BINARY : _O_TEXT); - if (rc == -1) - RuntimeError ("error switching stream to binary mode: %s", strerror (errno)); - } -#endif - return f; -} - -FILE * fopenOrDie (const STRING & pathname, const char * mode) -{ - FILE * f = (pathname[0] == '-') ? fopenStdHandle (mode) : fopen (pathname.c_str(), mode); - if (f == NULL) - { - RuntimeError("error opening file '%s': %s", pathname.c_str(), strerror(errno)); - } - if (strchr (mode, 'S')) - { // if optimized for sequential access then use large buffer - setvbuf (f, NULL, _IOFBF, 10000000); // OK if it fails - } - return f; -} - -FILE * fopenOrDie (const WSTRING & pathname, const wchar_t * mode) -{ - FILE * f = (pathname[0] == '-') ? fopenStdHandle (mode) : _wfopen (pathname.c_str(), mode); - if (f == NULL) - { - RuntimeError ("error opening file '%S': %s", pathname.c_str(), strerror (errno)); - } - if (strchr (mode, 'S')) - { // if optimized for sequential access then use large buffer - setvbuf (f, NULL, _IOFBF, 10000000); // OK if it fails - } - return f; -} - -// ---------------------------------------------------------------------------- -// set mode to binary or text (pass 'b' or 't') -// ---------------------------------------------------------------------------- - -#ifndef __unix__ // don't need binary/text distinction on unix -void fsetmode(FILE * f, char type) -{ - if (type != 'b' && type != 't') - { - RuntimeError ("fsetmode: invalid type '%c'"); - } -#ifdef UNDER_CE // winCE and win32 have different return types for _fileno - FILE *fd = _fileno (f); // note: no error check possible -#else - int fd = _fileno (f); // note: no error check possible -#endif - int mode = type == 'b' ? _O_BINARY : _O_TEXT; - int rc = _setmode (fd, mode); - if (rc == -1) - { - RuntimeError ("error changing file mode: %s", strerror (errno)); - } -} -#endif - -// ---------------------------------------------------------------------------- -// freadOrDie(): like fread() but terminate with err msg in case of error -// ---------------------------------------------------------------------------- - -void freadOrDie (void * ptr, size_t size, size_t count, FILE * f) -{ - // \\XXX\C$ reads are limited, with some randomness (e.g. 48 MB), on Windows 7 32 bit, so we break this into chunks of some MB. Meh. - while (count > 0) - { - size_t chunkn = min (count, 15*1024*1024); // BUGBUG: I surely meant this limit to be bytes, not units of 'size'... - size_t n = fread (ptr, size, chunkn, f); - if (n != chunkn) - RuntimeError ("error reading from file: %s", strerror (errno)); - count -= n; - ptr = n * size + (char*) ptr; - } -} - -void freadOrDie (void * ptr, size_t size, size_t count, const HANDLE f) -{ - // \\XXX\C$ reads are limited, with some randomness (e.g. 48 MB), on Windows 7 32 bit, so we break this into chunks of some MB. Meh. - while (count > 0) - { - size_t chunkn = min (count * size, 15*1024*1024); - DWORD n ; - ReadFile(f, ptr, (DWORD) chunkn, &n, NULL); - if (n != chunkn) - RuntimeError ("error number for reading from file: %s", GetLastError()); - count -= (size_t) (n / size); - ptr = n + (char*) ptr; - } -} - -// ---------------------------------------------------------------------------- -// fwriteOrDie(): like fwrite() but terminate with err msg in case of error; -// Windows C std lib fwrite() has problems writing >100 MB at a time (fails -// with Invalid Argument error), so we break it into chunks (yak!!) -// ---------------------------------------------------------------------------- - -void fwriteOrDie (const void * ptr, size_t size, size_t count, FILE * f) -{ - const char * p1 = (const char *) ptr; - size_t totalBytes = size * count; - while (totalBytes > 0) - { - size_t wantWrite = totalBytes; -#define LIMIT (16*1024*1024) // limit to 16 MB at a time - if (wantWrite > LIMIT) - { - wantWrite = LIMIT; - } - size_t n = fwrite ((const void *) p1, 1, wantWrite, f); - if (n != wantWrite) - { - RuntimeError ("error writing to file (ptr=0x%08lx, size=%d," - " count=%d, writing %d bytes after %d): %s", - ptr, size, count, (int) wantWrite, - (int) (size * count - totalBytes), - strerror (errno)); - } - totalBytes -= wantWrite; - p1 += wantWrite; - } -} - -void fwriteOrDie (const void * ptr, size_t size, size_t count, const HANDLE f) -{ - const char * p1 = (const char *) ptr; - DWORD totalBytes = (DWORD) (size * count); - while (totalBytes > 0) - { - DWORD wantWrite = totalBytes; -#define LIMIT (16*1024*1024) // limit to 16 MB at a time - if (wantWrite > LIMIT) - { - wantWrite = LIMIT; - } - DWORD byteWritten = 0 ; - if (WriteFile(f, (const void *) p1, wantWrite, &byteWritten, NULL) == false) - { - RuntimeError ("error writing to file (ptr=0x%08lx, size=%d," - " count=%d, writing %d bytes after %d): %s", - ptr, size, count, (int) wantWrite, - (int) (size * count - totalBytes), - strerror (errno)); - } - totalBytes -= wantWrite; - p1 += wantWrite; - } -} - - -// ---------------------------------------------------------------------------- -// fprintfOrDie(): like fprintf() but terminate with err msg in case of error -// ---------------------------------------------------------------------------- - -#pragma warning(push) -#pragma warning(disable : 4793) // 'vararg' : causes native code generation -void fprintfOrDie (FILE * f, const char * fmt, ...) -{ - va_list arg_ptr; - va_start (arg_ptr, fmt); - int rc = vfprintf (f, fmt, arg_ptr); - if (rc < 0) - { - RuntimeError ("error writing to file: %s", strerror (errno)); - } -} -#pragma warning(pop) - -// ---------------------------------------------------------------------------- -// fflushOrDie(): like fflush() but terminate with err msg in case of error -// ---------------------------------------------------------------------------- - -void fflushOrDie (FILE * f) -{ - int rc = fflush (f); - if (rc != 0) - { - RuntimeError ("error flushing to file: %s", strerror (errno)); - } -} - -// ---------------------------------------------------------------------------- -// filesize(): determine size of the file in bytes (with open file) -// BUGBUG: how about files > 4 GB? -// ---------------------------------------------------------------------------- -size_t filesize (FILE * f) -{ - long curPos = ftell (f); - if (curPos == -1L) - { - RuntimeError ("error determining file position: %s", strerror (errno)); - } - int rc = fseek (f, 0, SEEK_END); - if (rc != 0) - { - RuntimeError ("error seeking to end of file: %s", strerror (errno)); - } - long len = ftell (f); - if (len == -1L) - { - RuntimeError ("error determining file position: %s", strerror (errno)); - } - rc = fseek (f, curPos, SEEK_SET); - if (rc != 0) - { - RuntimeError ("error resetting file position: %s", strerror (errno)); - } - return (size_t) len; -} - -// filesize(): determine size of the file in bytes (with pathname) -size_t filesize (const wchar_t * pathname) -{ - FILE * f = fopenOrDie (pathname, L"rb"); - try - { - size_t len = filesize (f); - fclose (f); - return (size_t) len; - } - catch (...) - { - fclose (f); - throw; - } -} - -#ifndef UNDER_CE // no 64-bit under winCE - -// filesize64(): determine size of the file in bytes (with pathname) -int64_t filesize64 (const wchar_t * pathname) -{ - __stat64 fileinfo; - if (_wstat64 (pathname,&fileinfo) == -1) - return 0; - else - return fileinfo.st_size; -} -#endif - -// ---------------------------------------------------------------------------- -// fseekOrDie(),ftellOrDie(), fget/setpos(): seek functions with error handling -// ---------------------------------------------------------------------------- - -long fseekOrDie (FILE * f, long offset, int mode) -{ - long curPos = ftell (f); - if (curPos == -1L) - { - RuntimeError ("error seeking: %s", strerror (errno)); - } - int rc = fseek (f, offset, mode); - if (rc != 0) - { - RuntimeError ("error seeking: %s", strerror (errno)); - } - return curPos; -} - -uint64_t fgetpos (FILE * f) -{ - fpos_t post; - int rc = ::fgetpos (f, &post); - if (rc != 0) - RuntimeError ("error getting file position: %s", strerror (errno)); - return post; -} - -void fsetpos (FILE * f, uint64_t reqpos) -{ - // ::fsetpos() flushes the read buffer. This conflicts with a situation where - // we generally read linearly but skip a few bytes or KB occasionally, as is - // the case in speech recognition tools. This requires a number of optimizations. - - uint64_t curpos = fgetpos (f); - uint64_t cureob = curpos + f->_cnt; // UGH: we mess with an internal structure here - while (reqpos >= curpos && reqpos < cureob) - { - // if we made it then do not call fsetpos() - if (reqpos == fgetpos (f)) - return; - - // if we seek within the existing buffer, then just move to the position by dummy reads - char buf[65536]; - size_t n = min ((size_t) reqpos - (size_t) curpos, _countof (buf)); - fread (buf, sizeof (buf[0]), n, f); // (this may fail, but really shouldn't) - curpos += n; - - // since we mess with f->_cnt, if something unexpected happened to the buffer then back off - if (curpos != fgetpos (f) || curpos + f->_cnt != cureob) - break; // oops - } - - // actually perform the seek - fpos_t post = reqpos; - int rc = ::fsetpos (f, &post); - if (rc != 0) - RuntimeError ("error setting file position: %s", strerror (errno)); -} - -// ---------------------------------------------------------------------------- -// unlinkOrDie(): unlink() with error handling -// ---------------------------------------------------------------------------- - -void unlinkOrDie (const std::string & pathname) -{ - if (_unlink (pathname.c_str()) != 0 && errno != ENOENT) // if file is missing that's what we want - RuntimeError ("error deleting file '%s': %s", pathname.c_str(), strerror (errno)); -} -void unlinkOrDie (const std::wstring & pathname) -{ - if (_wunlink (pathname.c_str()) != 0 && errno != ENOENT) // if file is missing that's what we want - RuntimeError ("error deleting file '%S': %s", pathname.c_str(), strerror (errno)); -} - -// ---------------------------------------------------------------------------- -// renameOrDie(): rename() with error handling -// ---------------------------------------------------------------------------- - -#ifndef UNDER_CE // CE only supports Unicode APIs -void renameOrDie (const std::string & from, const std::string & to) -{ - if (!MoveFileA (from.c_str(),to.c_str())) - RuntimeError ("error renaming: %s", GetLastError()); -} -#endif - -void renameOrDie (const std::wstring & from, const std::wstring & to) -{ - if (!MoveFileW (from.c_str(),to.c_str())) - RuntimeError ("error renaming: %s", GetLastError()); -} - -// ---------------------------------------------------------------------------- -// fexists(): test if a file exists -// ---------------------------------------------------------------------------- - -bool fexists (const wchar_t * pathname) -{ - WIN32_FIND_DATAW findFileData; - HANDLE hFind = FindFirstFileW (pathname, &findFileData); - if (hFind != INVALID_HANDLE_VALUE) - { - FindClose (hFind); - return true; - } - else - { - return false; - } -} - -#ifndef UNDER_CE // CE only supports Unicode APIs -bool fexists (const char * pathname) -{ - WIN32_FIND_DATAA findFileData; - HANDLE hFind = FindFirstFileA (pathname, &findFileData); - if (hFind != INVALID_HANDLE_VALUE) - { - FindClose (hFind); - return true; - } - else - { - return false; - } -} -#endif - -// ---------------------------------------------------------------------------- -// funicode(): test if a file uses unicode by reading its BOM -// ---------------------------------------------------------------------------- - -bool funicode (FILE * f) -{ - unsigned short testCode; - if (fread (&testCode, sizeof(short), 1, f) == 1 && - (int)testCode == 0xFEFF) - return true; - fseek (f,0,SEEK_SET); - //rewind (f); - return false; -} - -// ---------------------------------------------------------------------------- -// fgetline(): like fgets() but terminate with err msg in case of error; -// removes the newline character at the end (like gets()); -// Returns 'buf' (always). buf guaranteed to be 0-terminated. -// ---------------------------------------------------------------------------- - -static inline wchar_t * fgets (wchar_t * buf, int n, FILE * f) { return fgetws (buf, n, f); } -static inline string _utf8 (const string & s) { return s; } -static inline string _utf8 (const wstring & s) { return msra::strfun::utf8 (s); } -static inline size_t strnlen (wchar_t * s, size_t n) { return wcsnlen (s, n); } - -#ifdef UNDER_CE // strlen for char * not defined in winCE -static inline size_t strnlen (const char *s, size_t n) { return std::find (s,s+n,'\0') - s; } -#endif - -template -CHAR * fgetline (FILE * f, CHAR * buf, int size) -{ - - uint64_t filepos = fgetpos (f); // (for error message only) - CHAR * p = fgets (buf, size, f); - if (p == NULL) // EOF reached: next time feof() = true - { - if (ferror (f)) - RuntimeError ("error reading line: %s", strerror (errno)); - buf[0] = 0; - return buf; - } - size_t n = strnlen (p, size); - - // check for buffer overflow - - if (n >= (size_t) size -1) - { - basic_string example (p, n < 100 ? n : 100); - RuntimeError ("input line too long at file offset %I64d (max. %d characters allowed) [%s ...]", - filepos, size -1, _utf8 (example).c_str()); - } - - // remove newline at end - - if (n > 0 && p[n-1] == '\n') // UNIX and Windows style - { - n--; - p[n] = 0; - if (n > 0 && p[n-1] == '\r') // Windows style - { - n--; - p[n] = 0; - } - } - else if (n > 0 && p[n-1] == '\r') // Mac style - { - n--; - p[n] = 0; - } - - return buf; -} - -#if 0 -const wchar_t * fgetline (FILE * f, wchar_t * buf, int size) -{ - wchar_t * p = fgetws (buf, size, f); - if (p == NULL) // EOF reached: next time feof() = true - { - if (ferror (f)) - RuntimeError ("error reading line: %s", strerror (errno)); - buf[0] = 0; - return buf; - } - size_t n = wcsnlen (p, size); // SECURITY NOTE: string use has been reviewed - - // check for buffer overflow - - if (n >= (size_t) size -1) - { - wstring example (buf, min (n, 100)); - RuntimeError ("input line too long at file offset %U64d (max. %d characters allowed) [%S ...]", - fgetpos (f), size -1, example.c_str()); - } - - // remove newline at end - - if (n > 0 && p[n-1] == L'\n') // UNIX and Windows style - { - n--; - p[n] = 0; - if (n > 0 && p[n-1] == L'\r') // Windows style - { - n--; - p[n] = 0; - } - } - else if (n > 0 && p[n-1] == L'\r') // Mac style - { - n--; - p[n] = 0; - } - - return buf; -} -#endif - -// STL string version -std::string fgetline (FILE * f) -{ - fixed_vector buf (1000000); - return fgetline (f, &buf[0], (int) buf.size()); -} - -// STL string version -std::wstring fgetlinew (FILE * f) -{ - fixed_vector buf (1000000); - return fgetline (f, &buf[0], (int) buf.size()); -} - -// STL string version avoiding most memory allocations -void fgetline (FILE * f, std::string & s, std::vector & buf) -{ - buf.resize (1000000); // enough? // KIT: increased to 1M to be safe - const char * p = fgetline (f, &buf[0], (int) buf.size()); - s.assign (p); -} - -void fgetline (FILE * f, std::wstring & s, std::vector & buf) -{ - buf.resize (1000000); // enough? // KIT: increased to 1M to be safe - const wchar_t * p = fgetline (f, &buf[0], (int) buf.size()); - s.assign (p); -} - -// char buffer version -void fgetline (FILE * f, std::vector & buf) -{ - const int BUF_SIZE = 1000000; // enough? // KIT: increased to 1M to be safe - buf.resize (BUF_SIZE); - fgetline (f, &buf[0], (int) buf.size()); - buf.resize (strnlen (&buf[0], BUF_SIZE) +1); // SECURITY NOTE: string use has been reviewed -} - -void fgetline (FILE * f, std::vector & buf) -{ - const int BUF_SIZE = 1000000; // enough? // KIT: increased to 1M to be safe - buf.resize (BUF_SIZE); - fgetline (f, &buf[0], (int) buf.size()); - buf.resize (wcsnlen (&buf[0], BUF_SIZE) +1); // SECURITY NOTE: string use has been reviewed -} - -// read a 0-terminated string -const char * fgetstring (FILE * f, __out_z_cap(size) char * buf, int size) -{ - int i; - for (i = 0; ; i++) - { - int c = fgetc (f); - if (c == EOF) - RuntimeError ("error reading string or missing 0: %s", strerror (errno)); - if (c == 0) break; - if (i >= size -1) - { - RuntimeError ("input line too long (max. %d characters allowed)", size -1); - } - buf[i] = (char) c; - } - assert (i < size); - buf[i] = 0; - return buf; -} - -const char * fgetstring (const HANDLE f, __out_z_cap(size) char * buf, int size) -{ - int i; - for (i = 0; ; i++) - { - char c; - freadOrDie((void*) &c, sizeof(char), 1, f); - if (c == (char) 0) break; - if (i >= size -1) - { - RuntimeError ("input line too long (max. %d characters allowed)", size -1); - } - buf[i] = (char) c; - } - assert (i < size); - buf[i] = 0; - return buf; -} - -// read a 0-terminated wstring -wstring fgetwstring (FILE * f) -{ - wstring res; - for (;;) - { - int c = fgetwc (f); - if (c == EOF) - RuntimeError ("error reading string or missing 0: %s", strerror (errno)); - if (c == 0) break; - res.push_back ((wchar_t) c); - } - return res; -} - -void fskipspace (FILE * f) -{ - for (;;) - { - int c = fgetc (f); - if (c == EOF) // hit the end - { - if (ferror (f)) - RuntimeError ("error reading from file: %s", strerror (errno)); - break; - } - if (!isspace (c)) // end of space: undo getting that character - { - int rc = ungetc (c, f); - if (rc != c) - RuntimeError ("error in ungetc(): %s", strerror (errno)); - break; - } - } -} - -// fskipNewLine(): skip all white space until end of line incl. the newline -void fskipNewline (FILE * f) -{ - char c; - - // skip white space - - do - { - freadOrDie (&c, sizeof (c), 1, f); - } while (c == ' ' || c == '\t'); - - if (c == '\r') // Windows-style CR-LF - { - freadOrDie (&c, sizeof (c), 1, f); - } - - if (c != '\n') - { - RuntimeError ("unexpected garbage at end of line"); - } -} - -// read a space-terminated token -// ...TODO: eat trailing space like fscanf() doessurrounding space) -const char * fgettoken (FILE * f, __out_z_cap(size) char * buf, int size) -{ - fskipspace (f); // skip leading space - int c = -1; - int i; - for (i = 0; ; i++) - { - c = fgetc (f); - if (c == EOF) break; - if (isspace (c)) break; - if (i >= size -1) - RuntimeError ("input token too long (max. %d characters allowed)", size -1); - buf[i] = (char) c; - } - // ... TODO: while (isspace (c)) c = fgetc (f); // skip trailing space - if (c != EOF) - { - int rc = ungetc (c, f); - if (rc != c) - RuntimeError ("error in ungetc(): %s", strerror (errno)); - } - assert (i < size); - buf[i] = 0; - return buf; -} - -STRING fgettoken (FILE * f) -{ - char buf[80]; - return fgettoken (f, buf, sizeof(buf)/sizeof(*buf)); -} - -// ---------------------------------------------------------------------------- -// fputstring(): write a 0-terminated string -// ---------------------------------------------------------------------------- - -void fputstring (FILE * f, const char * str) -{ - fwriteOrDie ((void *) str, sizeof (*str), strnlen (str, SIZE_MAX)+1, f); // SECURITY NOTE: string use has been reviewed -} - -void fputstring (const HANDLE f, const char * str) -{ - fwriteOrDie ((void *) str, sizeof (*str), strnlen (str, SIZE_MAX)+1, f); // SECURITY NOTE: string use has been reviewed -} - -void fputstring (FILE * f, const std::string & str) -{ - fputstring (f, str.c_str()); -} - -void fputstring (FILE * f, const wchar_t * str) -{ - fwriteOrDie ((void *) str, sizeof (*str), wcsnlen (str, SIZE_MAX)+1, f); // SECURITY NOTE: string use has been reviewed -} - -void fputstring (FILE * f, const std::wstring & str) -{ - fputstring (f, str.c_str()); -} - - -// ---------------------------------------------------------------------------- -// fgetTag(): read a 4-byte tag & return as a string -// ---------------------------------------------------------------------------- - -std::string fgetTag (FILE * f) -{ - char tag[5]; - freadOrDie (&tag[0], sizeof (tag[0]), 4, f); - tag[4] = 0; - return std::string (tag); -} - -std::string fgetTag (const HANDLE f) -{ - char tag[5]; - freadOrDie (&tag[0], sizeof (tag[0]), 4, f); - tag[4] = 0; - return std::string (tag); -} - -// ---------------------------------------------------------------------------- -// fcheckTag(): read a 4-byte tag & verify it; terminate if wrong tag -// ---------------------------------------------------------------------------- - -void fcheckTag (FILE * f, const char * expectedTag) -{ - fcompareTag (fgetTag (f), expectedTag); -} - - -void fcheckTag (const HANDLE f, const char * expectedTag) -{ - fcompareTag (fgetTag (f), expectedTag); -} - -void fcheckTag_ascii (FILE * f, const STRING & expectedTag) -{ - char buf[20]; // long enough for a tag - fskipspace (f); - fgettoken (f, buf, sizeof(buf)/sizeof(*buf)); - if (expectedTag != buf) - { - RuntimeError ("invalid tag '%s' found; expected '%s'", buf, expectedTag.c_str()); - } -} - -// ---------------------------------------------------------------------------- -// fcompareTag(): compare two tags; terminate if wrong tag -// ---------------------------------------------------------------------------- - -void fcompareTag (const STRING & readTag, const STRING & expectedTag) -{ - if (readTag != expectedTag) - { - RuntimeError ("invalid tag '%s' found; expected '%s'", - readTag.c_str(), expectedTag.c_str()); - } -} - -// ---------------------------------------------------------------------------- -// fputTag(): write a 4-byte tag -// ---------------------------------------------------------------------------- - -void fputTag (FILE * f, const char * tag) -{ - const int TAG_LEN = 4; - assert (strnlen (tag, TAG_LEN + 1) == TAG_LEN); - fwriteOrDie ((void *) tag, sizeof (*tag), strnlen (tag, TAG_LEN), f); -} - -void fputTag(const HANDLE f, const char * tag) -{ - const int TAG_LEN = 4; - assert (strnlen (tag, TAG_LEN + 1) == TAG_LEN); - fwriteOrDie ((void *) tag, sizeof (*tag), strnlen (tag, TAG_LEN), f); -} - -// ---------------------------------------------------------------------------- -// fskipstring(): skip a 0-terminated string, such as a pad string -// ---------------------------------------------------------------------------- - -void fskipstring (FILE * f) -{ - char c; - do - { - freadOrDie (&c, sizeof (c), 1, f); - } - while (c); -} - -// ---------------------------------------------------------------------------- -// fpad(): write a 0-terminated string to pad file to a n-byte boundary -// (note: file must be opened in binmode to work properly on DOS/Windows!!!) -// ---------------------------------------------------------------------------- -void fpad (FILE * f, int n) -{ - // get current writing position - int pos = ftell (f); - if (pos == -1) - { - RuntimeError ("error in ftell(): %s", strerror (errno)); - } - // determine how many bytes are needed (at least 1 for the 0-terminator) - // and create a dummy string of that length incl. terminator - int len = n - (pos % n); - const char dummyString[] = "MSR-Asia: JL+FS"; - size_t offset = sizeof(dummyString)/sizeof(dummyString[0]) - len; - assert (offset >= 0); - fputstring (f, dummyString + offset); -} -// ---------------------------------------------------------------------------- -// fgetbyte(): read a byte value -// ---------------------------------------------------------------------------- - -char fgetbyte (FILE * f) -{ - char v; - freadOrDie (&v, sizeof (v), 1, f); - return v; -} - -// ---------------------------------------------------------------------------- -// fgetshort(): read a short value -// ---------------------------------------------------------------------------- - -short fgetshort (FILE * f) -{ - short v; - freadOrDie (&v, sizeof (v), 1, f); - return v; -} - -short fgetshort_bigendian (FILE * f) -{ - unsigned char b[2]; - freadOrDie (&b, sizeof (b), 1, f); - return (short) ((b[0] << 8) + b[1]); -} - -// ---------------------------------------------------------------------------- -// fgetint24(): read a 3-byte (24-bit) int value -// ---------------------------------------------------------------------------- - -int fgetint24 (FILE * f) -{ - int v; - assert (sizeof (v) == 4); - freadOrDie (&v, sizeof (v) -1, 1, f); // only read 3 lower-order bytes - v <<= 8; // shift up (upper 8 bits uninit'ed) - v >>= 8; // shift down 8 bits with sign-extend - return v; -} - -// ---------------------------------------------------------------------------- -// fgetint(): read an int value -// ---------------------------------------------------------------------------- - -int fgetint (FILE * f) -{ - int v; - freadOrDie (&v, sizeof (v), 1, f); - return v; -} - -int fgetint (const HANDLE f) -{ - int v; - freadOrDie (&v, sizeof (v), 1, f); - return v; -} - -int fgetint_bigendian (FILE * f) -{ - unsigned char b[4]; - freadOrDie (&b, sizeof (b), 1, f); - return (int) (((((b[0] << 8) + b[1]) << 8) + b[2]) << 8) + b[3]; -} - -int fgetint_ascii (FILE * f) -{ - fskipspace (f); - int res = 0; - char c; - freadOrDie (&c, sizeof (c), 1, f); - while (isdigit ((unsigned char)c)) - { - res = (10 * res) + (c - '0'); - freadOrDie (&c, sizeof (c), 1, f); - } - int rc = ungetc (c, f); - if (rc != c) - { - RuntimeError ("error in ungetc(): %s", strerror (errno)); - } - return res; -} - -// ---------------------------------------------------------------------------- -// fgetfloat(): read a float value -// ---------------------------------------------------------------------------- - -float fgetfloat (FILE * f) -{ - float v; - freadOrDie (&v, sizeof (v), 1, f); - return v; -} - -float fgetfloat_bigendian (FILE * f) -{ - int bitpattern = fgetint_bigendian (f); - return *((float*) &bitpattern); -} - -float fgetfloat_ascii (FILE * f) -{ - float val; - fskipspace (f); - int rc = fscanf (f, "%f", &val); // security hint: safe overloads - if (rc == 0) - RuntimeError ("error reading float value from file (invalid format): %s"); - else if (rc == EOF) - RuntimeError ("error reading from file: %s", strerror (errno)); - assert (rc == 1); - return val; -} - -// ---------------------------------------------------------------------------- -// fgetdouble(): read a double value -// ---------------------------------------------------------------------------- - -double fgetdouble (FILE * f) -{ - double v; - freadOrDie (&v, sizeof (v), 1, f); - return v; -} - -// ---------------------------------------------------------------------------- -// fgetwav(): read an entire .wav file -// ---------------------------------------------------------------------------- - -void WAVEHEADER::prepareRest (int sampleCount) -{ - FmtLength = 16; - - wFormatTag = 1; - nAvgBytesPerSec = nSamplesPerSec * nBlockAlign; - - riffchar[0] = 'R'; - riffchar[1] = 'I'; - riffchar[2] = 'F'; - riffchar[3] = 'F'; - if (sampleCount != -1) - { - DataLength = sampleCount * nBlockAlign; - RiffLength = 36 + DataLength; - } - else - { - DataLength = 0xffffffff; - RiffLength = 0xffffffff; - } - - wavechar[0] = 'W'; - wavechar[1] = 'A'; - wavechar[2] = 'V'; - wavechar[3] = 'E'; - wavechar[4] = 'f'; - wavechar[5] = 'm'; - wavechar[6] = 't'; - wavechar[7] = ' '; - - datachar[0] = 'd'; - datachar[1] = 'a'; - datachar[2] = 't'; - datachar[3] = 'a'; -} - -void WAVEHEADER::prepare (unsigned int Fs, int Bits, int Channels, int SampleCount) -{ - nChannels = (short) Channels; - nSamplesPerSec = Fs; - nBlockAlign = (short) (Channels * (Bits/8)); - nAvgBytesPerSec = Fs * nBlockAlign; - wBitsPerSample = (short) Bits; - - prepareRest (SampleCount); -} - -void WAVEHEADER::prepare (const WAVEFORMATEX & wfx, int sampleCount /* -1 for unknown */) -{ - nChannels = wfx.nChannels; - nSamplesPerSec = wfx.nSamplesPerSec; - nBlockAlign = wfx.nBlockAlign; - wBitsPerSample = wfx.wBitsPerSample; - - prepareRest (sampleCount); -} - -void WAVEHEADER::write (FILE * f) -{ - fputTag (f, "RIFF"); - fputint (f, RiffLength); - fputTag (f, "WAVE"); - fputTag (f, "fmt "); - fputint (f, FmtLength); - fputshort (f, wFormatTag); - fputshort (f, nChannels); - fputint (f, nSamplesPerSec); - fputint (f, nAvgBytesPerSec); - fputshort (f, nBlockAlign); - fputshort (f, wBitsPerSample); - assert (FmtLength == 16); - assert (wFormatTag == 1); - fputTag (f, "data"); - fputint (f, DataLength); - fflushOrDie (f); -} - -/*static*/ void WAVEHEADER::update (FILE * f) -{ - long curPos = ftell (f); - if (curPos == -1L) - { - RuntimeError ("error determining file position: %s", strerror (errno)); - } - unsigned int len = (unsigned int) filesize (f); - unsigned int RiffLength = len - 8; - unsigned int DataLength = RiffLength - 36; - fseekOrDie (f, 4, SEEK_SET); - fputint (f, RiffLength); - fseekOrDie (f, 40, SEEK_SET); - fputint (f, DataLength); - fseekOrDie (f, curPos, SEEK_SET); -} - -#if 0 -unsigned int WAVEHEADER::read (FILE * f, signed short & wRealFormatTag, int & bytesPerSample) -{ - // read header - fcheckTag (f, "RIFF"); - /*unsigned int riffLen = */ fgetint (f); - fcheckTag (f, "WAVE"); - fcheckTag (f, "fmt "); - unsigned int fmtLen = fgetint (f); - wRealFormatTag = fgetshort (f); - if (wRealFormatTag == -2) // MARecorder.exe [Ivan Tashev] puts a -2 for - { // 8-channel recordings (meaning unknown). - wRealFormatTag = 1; // Workaround: pretend it is 1 (seems safe) - } - (wRealFormatTag == 1 || wRealFormatTag == 7) - || RuntimeError ("WAVEHEADER::read: wFormatTag=%d not supported for now", wRealFormatTag); - unsigned short wChannels = fgetshort (f); - unsigned long dwSamplesPerSec = fgetint (f); - unsigned int sampleRate = dwSamplesPerSec; - /*unsigned long dwAvgBytesPerSec = */ fgetint (f); - unsigned short wBlockAlign = fgetshort (f); - unsigned short wBitsPerSample = fgetshort (f); - (wBitsPerSample <= 16) || RuntimeError ("WAVEHEADER::read: invalid wBitsPerSample %d", wBitsPerSample); - bytesPerSample = wBitsPerSample / 8; - (wBlockAlign == wChannels * bytesPerSample) - || RuntimeError ("WAVEHEADER::read: wBlockAlign != wChannels*bytesPerSample not supported"); - while (fmtLen > 16) // unused extra garbage in header - { - fgetbyte (f); - fmtLen--; - } - if (wRealFormatTag == 7) - { - (bytesPerSample == 1) || RuntimeError ("WAVEHEADER::read: invalid wBitsPerSample %d for mulaw", wBitsPerSample); - fcheckTag (f, "fact"); - unsigned int factLen = fgetint (f); - while (factLen > 0) - { - fgetbyte (f); - factLen--; - } - } - fcheckTag (f, "data"); - unsigned int dataLen = fgetint (f); - unsigned int numSamples = dataLen / wBlockAlign; - - // prepare a nice wave header without junk (44 bytes, 16-bit PCM) - prepare (sampleRate, wBitsPerSample, wChannels, numSamples); - - return numSamples; -} - -static short toolULawToLinear(unsigned char p_ucULawByte) -{ - static short anExpLut[8] = { 0, 132, 396, 924, 1980, 4092, 8316, 16764 }; - short nSign, nExponent, nMantissa, nSample; - - p_ucULawByte=~p_ucULawByte; - nSign=(p_ucULawByte & 0x80); - nExponent=(p_ucULawByte >> 4) & 0x07; - nMantissa=p_ucULawByte & 0x0F; - nSample=anExpLut[nExponent]+(nMantissa<<(nExponent+3)); - if(nSign != 0) - nSample = -nSample; - - return nSample; -} - -// fgetwavraw(): only read data of .wav file. For multi-channel data, samples -// are kept interleaved. -static void fgetwavraw(FILE * f, std::vector & wav, const WAVEHEADER & wavhd) -{ - int bytesPerSample = wavhd.wBitsPerSample / 8; // (sample size on one channel) - wav.resize (wavhd.DataLength / bytesPerSample); - if (wavhd.wFormatTag == 7) // mulaw - { - (wavhd.nChannels == 1) || RuntimeError ("fgetwav: wChannels=%d not supported for mulaw", wavhd.nChannels); - std::vector data; - int numSamples = wavhd.DataLength/wavhd.nBlockAlign; - data.resize (numSamples); - freadOrDie (&data[0], sizeof (data[0]), numSamples, f); - for (int i = 0; i < numSamples; i++) - { - wav[i] = toolULawToLinear (data[i]); - } - } - else if (bytesPerSample == 2) - { // note: we may be reading an interleaved multi-channel signal. - freadOrDie (&wav[0], sizeof (wav[0]), wav.size(), f); - } - // ... TODO: support 8 bit linear PCM samples (implement when needed; samples scaled to 'short') - else - { - RuntimeError ("bytesPerSample != 2 is not supported except mulaw format!\n"); - } -} - -// ---------------------------------------------------------------------------- -// fgetwav(): read an entire .wav file. Stereo is mapped to mono. -// ---------------------------------------------------------------------------- - -void fgetwav (FILE * f, std::vector & wav, int & sampleRate) -{ - WAVEHEADER wavhd; // will be filled in for 16-bit PCM!! - signed short wFormatTag; // real format tag as found in data - int bytesPerSample; // bytes per sample as found in data - - unsigned int numSamples = wavhd.read (f, wFormatTag, bytesPerSample); - sampleRate = (int) wavhd.nSamplesPerSec; - - if (wavhd.nChannels == 1) - { - fgetwavraw (f, wav, wavhd); - } - else if (wavhd.nChannels == 2) - { - //read raw data - std::vector buf; - buf.resize(numSamples * 2); - fgetwavraw(f, buf, wavhd); - - //map to mono - wav.resize (numSamples); - const short * p = &buf[0]; - for (int i = 0; i < (int) numSamples; i++) - { - int l = *p++; - int r = *p++; - int mono = ((l + r) + 1) >> 1; - wav[i] = (short) mono; - } - } - else - { - RuntimeError ("bytesPerSample/wChannels != 2 needs to be implemented"); - } -} - -void fgetwav (const wstring & fn, std::vector & wav, int & sampleRate) -{ - auto_file_ptr f = fopenOrDie (fn, L"rbS"); - fgetwav (f, wav, sampleRate); -} - -// ---------------------------------------------------------------------------- -// ... TODO: -// - rename this function!! -// - also change to read header itself and return sample rate and channels -// fgetraw(): read data of multi-channel .wav file, and separate data of multiple channels. -// For example, data[i][j]: i is channel index, 0 means the first -// channel. j is sample index. -// ---------------------------------------------------------------------------- - -void fgetraw (FILE *f, std::vector< std::vector > & data, const WAVEHEADER & wavhd) -{ - std::vector wavraw; - fgetwavraw (f, wavraw, wavhd); - data.resize (wavhd.nChannels); - int numSamples = wavhd.DataLength/wavhd.nBlockAlign; - assert (numSamples == (int) wavraw.size() / wavhd.nChannels); - - for (int i = 0; i < wavhd.nChannels; i++) - { - data[i].resize (numSamples); - - for (int j = 0; j < numSamples; j++) - { - data[i][j] = wavraw[wavhd.nChannels*j + i]; - } - } -} - -// ---------------------------------------------------------------------------- -// fgetwfx(), fputwfx(): direct access to simple WAV headers -// ---------------------------------------------------------------------------- - -// read header and skip to first data byte; return #samples -unsigned int fgetwfx (FILE * f, WAVEFORMATEX & wfx) -{ - // read header - fcheckTag (f, "RIFF"); - /*unsigned int riffLen = */ fgetint (f); - fcheckTag (f, "WAVE"); - fcheckTag (f, "fmt "); - wfx.cbSize = sizeof (wfx); - int fmtLen = fgetint (f); - wfx.wFormatTag = fgetshort (f); - if (wfx.wFormatTag == -2) // MARecorder.exe [Ivan Tashev] puts a -2 for - { // 8-channel recordings (meaning unknown). - wfx.wFormatTag = 1; // Workaround: pretend it is 1 (seems safe) - } - (wfx.wFormatTag == 1 || wfx.wFormatTag == 3 || wfx.wFormatTag == 7) - || RuntimeError ("WAVEHEADER::read: wFormatTag=%d not supported for now", wfx.wFormatTag); - wfx.nChannels = fgetshort (f); - wfx.nSamplesPerSec = fgetint (f); - wfx.nAvgBytesPerSec = fgetint (f); - wfx.nBlockAlign = fgetshort (f); - wfx.wBitsPerSample = fgetshort (f); - // unused extra garbage in header - for ( ; fmtLen > 16; fmtLen--) fgetbyte (f); - fcheckTag (f, "data"); - unsigned int dataLen = fgetint (f); - unsigned int numSamples = dataLen / wfx.nBlockAlign; - return numSamples; -} - -void fputwfx (FILE *f, const WAVEFORMATEX & wfx, unsigned int numSamples) -{ - unsigned int DataLength = numSamples * wfx.nBlockAlign; - (DataLength / wfx.nBlockAlign == numSamples) - || RuntimeError ("fputwfx: data size exceeds WAV header 32-bit range"); - unsigned int RiffLength = 36 + DataLength; - unsigned int FmtLength = 16; - // file header - assert (wfx.cbSize == 0 || wfx.cbSize == FmtLength + 2); - fputTag (f, "RIFF"); - fputint (f, RiffLength); - fputTag (f, "WAVE"); - // 'fmt ' chunk (to hold wfx) - fputTag (f, "fmt "); - fputint (f, FmtLength); - fputshort (f, wfx.wFormatTag); - fputshort (f, wfx.nChannels); - fputint (f, wfx.nSamplesPerSec); - fputint (f, wfx.nAvgBytesPerSec); - fputshort (f, wfx.nBlockAlign); - fputshort (f, wfx.wBitsPerSample); - // data chunk - fputTag (f, "data"); - fputint (f, DataLength); - fflushOrDie (f); -} - -// ---------------------------------------------------------------------------- -// fputwav(): write an entire .wav file (16 bit PCM) -// ---------------------------------------------------------------------------- - -void fputwav (FILE * f, const vector & wav, int sampleRate, int nChannels) -{ - f;wav;sampleRate;nChannels; - // construct WAVEFORMATEX - WAVEFORMATEX wfx; - wfx.cbSize = 16 + 2; //fmt data + extra data - wfx.nAvgBytesPerSec = (DWORD)(sampleRate * nChannels * 2); //short: 2 bytes per sample - wfx.nBlockAlign = (WORD)nChannels * 2; //short: 2bytes per sample - wfx.nChannels = (WORD)nChannels; - wfx.nSamplesPerSec = sampleRate; - wfx.wBitsPerSample = 16; - wfx.wFormatTag = WAVE_FORMAT_PCM; - //putwfx - fputwfx (f, wfx, (unsigned int) wav.size()); - // wrtie the data - fwriteOrDie (&wav[0], sizeof(wav[0]), wav.size(), f); -} - -void fputwav (const wstring & fn, const vector & wav, int sampleRate, int nChannels) -{ - auto_file_ptr f = fopenOrDie (fn, L"wbS"); - fputwav (f, wav, sampleRate, nChannels); - fflushOrDie (f); // after this, fclose() (in destructor of f) cannot fail -} -#endif - -// ---------------------------------------------------------------------------- -// fputbyte(): write a byte value -// ---------------------------------------------------------------------------- - -void fputbyte (FILE * f, char v) -{ - fwriteOrDie (&v, sizeof (v), 1, f); -} - -// ---------------------------------------------------------------------------- -// fputshort(): write a short value -// ---------------------------------------------------------------------------- - -void fputshort (FILE * f, short v) -{ - fwriteOrDie (&v, sizeof (v), 1, f); -} - -// ---------------------------------------------------------------------------- -// fputint24(): write a 3-byte (24-bit) int value -// ---------------------------------------------------------------------------- - -void fputint24 (FILE * f, int v) -{ - assert (sizeof (v) == 4); - fwriteOrDie (&v, sizeof (v) -1, 1, f); // write low-order 3 bytes -} - -// ---------------------------------------------------------------------------- -// fputint(): write an int value -// ---------------------------------------------------------------------------- - -void fputint (FILE * f, int v) -{ - fwriteOrDie (&v, sizeof (v), 1, f); -} - -void fputint (const HANDLE f, int v) -{ - fwriteOrDie (&v, sizeof (v), 1, f); -} - -// ---------------------------------------------------------------------------- -// fputfloat(): write a float value -// ---------------------------------------------------------------------------- - -void fputfloat (FILE * f, float v) -{ - fwriteOrDie (&v, sizeof (v), 1, f); -} - -// ---------------------------------------------------------------------------- -// fputdouble(): write a double value -// ---------------------------------------------------------------------------- - -void fputdouble (FILE * f, double v) -{ - fwriteOrDie (&v, sizeof (v), 1, f); -} - -// ---------------------------------------------------------------------------- -// fputfile(): write a binary block or a string as a file -// ---------------------------------------------------------------------------- - -void fputfile (const WSTRING & pathname, const std::vector & buffer) -{ - FILE * f = fopenOrDie (pathname, L"wb"); - try - { - if (buffer.size() > 0) - { // ^^ otherwise buffer[0] is an illegal expression - fwriteOrDie (&buffer[0], sizeof (buffer[0]), buffer.size(), f); - } - fcloseOrDie (f); - } - catch (...) - { - fclose (f); - throw; - } -} - -void fputfile (const WSTRING & pathname, const std::wstring & string) -{ - FILE * f = fopenOrDie (pathname, L"wb"); - try - { - if (string.length() > 0) - { // ^^ otherwise buffer[0] is an illegal expression - fwriteOrDie (string.c_str(), sizeof (string[0]), string.length(), f); - } - fcloseOrDie (f); - } - catch (...) - { - fclose (f); - throw; - } -} - -void fputfile (const WSTRING & pathname, const std::string & string) -{ - FILE * f = fopenOrDie (pathname, L"wb"); - try - { - if (string.length() > 0) - { // ^^ otherwise buffer[0] is an illegal expression - fwriteOrDie (string.c_str(), sizeof (string[0]), string.length(), f); - } - fcloseOrDie (f); - } - catch (...) - { - fclose (f); - throw; - } -} - -// ---------------------------------------------------------------------------- -// fgetfile(): load a file as a binary block -// ---------------------------------------------------------------------------- - -void fgetfile (const WSTRING & pathname, std::vector & buffer) -{ - FILE * f = fopenOrDie (pathname, L"rb"); - size_t len = filesize (f); - buffer.resize (len); - if (buffer.size() > 0) - { // ^^ otherwise buffer[0] is an illegal expression - freadOrDie (&buffer[0], sizeof (buffer[0]), buffer.size(), f); - } - fclose (f); -} - -void fgetfile (FILE * f, std::vector & buffer) -{ // this version reads until eof - buffer.resize (0); - buffer.reserve (1000000); // avoid too many reallocations - std::vector inbuf; - inbuf.resize (65536); // read in chunks of this size - while (!feof (f)) // read until eof - { - size_t n = fread (&inbuf[0], sizeof (inbuf[0]), inbuf.size(), f); - if (ferror (f)) - { - RuntimeError ("fgetfile: error reading from file: %s", strerror (errno)); - } - buffer.insert (buffer.end(), inbuf.begin(), inbuf.begin() + n); - } - buffer.reserve (buffer.size()); -} - -// load it into RAM in one huge chunk -static size_t fgetfilechars (const std::wstring & path, vector & buffer) -{ - auto_file_ptr f = fopenOrDie (path, L"rb"); - size_t len = filesize (f); - buffer.reserve (len +1); - freadOrDie (buffer, len, f); - buffer.push_back (0); // this makes it a proper C string - return len; -} - -template static void strtoklines (char * s, LINES & lines) -{ - char * context; - for (char * p = strtok_s (s, "\r\n", &context); p; p = strtok_s (NULL, "\r\n", &context)) - lines.push_back (p); -} - -void msra::files::fgetfilelines (const std::wstring & path, vector & buffer, std::vector & lines) -{ - // load it into RAM in one huge chunk - const size_t len = fgetfilechars (path, buffer); - - // parse into lines - lines.resize (0); - lines.reserve (len / 20); - strtoklines (&buffer[0], lines); -} - -// same as above but returning const char* (avoiding the memory allocation) -vector msra::files::fgetfilelines (const wstring & path, vector & buffer) -{ - // load it into RAM in one huge chunk - const size_t len = fgetfilechars (path, buffer); - - // parse into lines - vector lines; - lines.reserve (len / 20); - strtoklines (&buffer[0], lines); - return lines; -} - -// ---------------------------------------------------------------------------- -// getfiletime(), setfiletime(): access modification time -// ---------------------------------------------------------------------------- - -bool getfiletime (const wstring & path, FILETIME & time) -{ // return file modification time, false if cannot be determined - WIN32_FIND_DATAW findFileData; - auto_handle hFind (FindFirstFileW (path.c_str(), &findFileData), ::FindClose); - if (hFind != INVALID_HANDLE_VALUE) - { - time = findFileData.ftLastWriteTime; - return true; - } - else - { - return false; - } -} - -void setfiletime (const wstring & path, const FILETIME & time) -{ // update the file modification time of an existing file - auto_handle h (CreateFileW (path.c_str(), FILE_WRITE_ATTRIBUTES, - FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, - OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL)); - if (h == INVALID_HANDLE_VALUE) - { - RuntimeError ("setfiletime: error opening file: %d", GetLastError()); - } - BOOL rc = SetFileTime (h, NULL, NULL, &time); - if (!rc) - { - RuntimeError ("setfiletime: error setting file time information: %d", GetLastError()); - } -} - -// ---------------------------------------------------------------------------- -// expand_wildcards -- wildcard expansion of a path, including directories. -// ---------------------------------------------------------------------------- - -// Win32-style variant of this function (in case we want to use it some day) -// Returns 0 in case of failure. May throw in case of bad_alloc. -static BOOL ExpandWildcards (wstring path, vector & paths) -{ - // convert root to DOS filename convention - for (size_t k = 0; k < path.length(); k++) if (path[k] == '/') path[k] = '\\'; - - // remove terminating backslash - size_t last = path.length() -1; - if (last >= 0 && path[last] == '\\') path.erase (last); - - // convert root to long filename convention - //if (path.find (L"\\\\?\\") != 0) - // path = L"\\\\?\\" + root; - - // split off everything after first wildcard - size_t wpos = path.find_first_of (L"*?"); - if (wpos == 2 && path[0] == '\\' && path[1] == '\\') - wpos = path.find_first_of (L"*?", 4); // 4=skip "\\?\" - if (wpos == wstring::npos) - { // no wildcard: just return it - paths.push_back (path); - return TRUE; - } - - // split off everything afterwards if any - wstring rest; // remaining path after this directory - size_t spos = path.find_first_of (L"\\", wpos +1); - if (spos != wstring::npos) - { - rest = path.substr (spos +1); - path.erase (spos); - } - - // crawl folder - WIN32_FIND_DATAW ffdata; - auto_handle hFind (::FindFirstFileW (path.c_str(), &ffdata), ::FindClose); - if (hFind == INVALID_HANDLE_VALUE) - { - DWORD err = ::GetLastError(); - if (rest.empty() && err == 2) return TRUE; // no matching file: empty - return FALSE; // another error - } - size_t pos = path.find_last_of (L"\\"); - if (pos == wstring::npos) throw std::logic_error ("unexpected missing \\ in path"); - wstring parent = path.substr (0, pos); - do - { - // skip this and parent directory - bool isDir = ((ffdata.dwFileAttributes & (FILE_ATTRIBUTE_DIRECTORY | FILE_ATTRIBUTE_REPARSE_POINT)) != 0); - if (isDir && ffdata.cFileName[0] == '.') continue; - - wstring filename = parent + L"\\" + ffdata.cFileName; - if (rest.empty()) - { - paths.push_back (filename); - } - else if (isDir) // multi-wildcards: further expand - { - BOOL rc = ExpandWildcards (filename + L"\\" + rest, paths); - rc; // error here means no match, e.g. Access Denied to one subfolder - } - } while (::FindNextFileW(hFind, &ffdata) != 0); - return TRUE; -} - -void expand_wildcards (const wstring & path, vector & paths) -{ - BOOL rc = ExpandWildcards (path, paths); - if (!rc) - RuntimeError ("error in expanding wild cards '%S': %S", path.c_str(), FormatWin32Error (::GetLastError()).c_str()); -} - -// ---------------------------------------------------------------------------- -// make_intermediate_dirs() -- make all intermediate dirs on a path -// ---------------------------------------------------------------------------- - -static void mkdir (const wstring & path) -{ - int rc = _wmkdir (path.c_str()); - if (rc >= 0 || errno == EEXIST) - return; // no error or already existing --ok - if (errno == EACCES) - { - // bug in _wmkdir(): returns access_denied if folder exists but read-only --check existence - DWORD att = ::GetFileAttributesW (path.c_str()); - if (att != INVALID_FILE_ATTRIBUTES || (att & FILE_ATTRIBUTE_DIRECTORY) != 0) - return; // ok - } - RuntimeError ("make_intermediate_dirs: error creating intermediate directory %S", path.c_str()); -} - -// make subdir of a file including parents -void msra::files::make_intermediate_dirs (const wstring & filepath) -{ - vector buf; - buf.resize (filepath.length() +1, 0); - wcscpy_s (&buf[0], buf.size(), filepath.c_str()); - wstring subpath; - int skip = 0; - // if share (\\) then the first two levels (machine, share name) cannot be made - if ((buf[0] == '/' && buf[1] == '/') || (buf[0] == '\\' && buf[1] == '\\')) - { - subpath = L"/"; - skip = 2; // skip two levels (machine, share) - } - // make all constituents except the filename (to make a dir, include a trailing slash) - for (const wchar_t * p = wcstok (&buf[0], L"/\\"); p; p = wcstok (NULL, L"/\\")) - { - if (subpath != L"" && subpath != L"/" && subpath != L"\\" && skip == 0) - { - mkdir (subpath); - } - else if (skip > 0) skip--; // skip this level - // rebuild the final path - if (subpath != L"") subpath += L"/"; - subpath += p; - } -} - -// ---------------------------------------------------------------------------- -// fuptodate() -- test whether an output file is at least as new as an input file -// ---------------------------------------------------------------------------- - -// test if file 'target' is not older than 'input' --used for make mode -// 'input' must exist if 'inputrequired'; otherweise if 'target' exists, it is considered up to date -// 'target' may or may not exist -bool msra::files::fuptodate (const wstring & target, const wstring & input, bool inputrequired) -{ - FILETIME targettime; - if (!getfiletime (target, targettime)) return false; // target missing: need to update - FILETIME inputtime; - if (!getfiletime (input, inputtime)) return !inputrequired; // input missing: if required, pretend to be out of date as to force caller to fail - ULARGE_INTEGER targett, inputt; - memcpy (&targett, &targettime, sizeof (targett)); - memcpy (&inputt, &inputtime, sizeof (inputt)); - return !(targett.QuadPart < inputt.QuadPart); // up to date if target not older than input -} diff --git a/DataReader/Kaldi2Reader/fileutil.h b/DataReader/Kaldi2Reader/fileutil.h deleted file mode 100644 index 9b36d9684443..000000000000 --- a/DataReader/Kaldi2Reader/fileutil.h +++ /dev/null @@ -1,620 +0,0 @@ -// -// fileutil.h - file I/O with error checking -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -#pragma once -#ifndef _FILEUTIL_ -#define _FILEUTIL_ - -#include "Platform.h" -#include -#ifdef __unix__ -#include -#include -#endif -#include // for std::find -#include -#include -#include -#include -#include -#include -#include -#include // for strerror() - -using namespace std; - -#define SAFE_CLOSE(f) (((f) == NULL) || (fcloseOrDie ((f)), (f) = NULL)) - -// ---------------------------------------------------------------------------- -// fopenOrDie(): like fopen() but terminate with err msg in case of error. -// A pathname of "-" returns stdout or stdin, depending on mode, and it will -// change the binary mode if 'b' or 't' are given. If you use this, make sure -// not to fclose() such a handle. -// ---------------------------------------------------------------------------- - -FILE * fopenOrDie (const string & pathname, const char * mode); -FILE * fopenOrDie (const wstring & pathname, const wchar_t * mode); - -#ifndef __unix__ -// ---------------------------------------------------------------------------- -// fsetmode(): set mode to binary or text -// ---------------------------------------------------------------------------- - -void fsetmode (FILE * f, char type); -#endif - -// ---------------------------------------------------------------------------- -// freadOrDie(): like fread() but terminate with err msg in case of error -// ---------------------------------------------------------------------------- - -void freadOrDie (void * ptr, size_t size, size_t count, FILE * f); - -template -void freadOrDie (_T & data, int num, FILE * f) // template for vector<> -{ data.resize (num); if (data.size() > 0) freadOrDie (&data[0], sizeof (data[0]), data.size(), f); } -template -void freadOrDie (_T & data, size_t num, FILE * f) // template for vector<> -{ data.resize (num); if (data.size() > 0) freadOrDie (&data[0], sizeof (data[0]), data.size(), f); } - - -// ---------------------------------------------------------------------------- -// fwriteOrDie(): like fwrite() but terminate with err msg in case of error -// ---------------------------------------------------------------------------- - -void fwriteOrDie (const void * ptr, size_t size, size_t count, FILE * f); - -template -void fwriteOrDie (const _T & data, FILE * f) // template for vector<> -{ if (data.size() > 0) fwriteOrDie (&data[0], sizeof (data[0]), data.size(), f); } - - -// ---------------------------------------------------------------------------- -// fprintfOrDie(): like fprintf() but terminate with err msg in case of error -// ---------------------------------------------------------------------------- - -void fprintfOrDie (FILE * f, const char *format, ...); - -// ---------------------------------------------------------------------------- -// fcloseOrDie(): like fclose() but terminate with err msg in case of error -// not yet implemented, but we should -// ---------------------------------------------------------------------------- - -#define fcloseOrDie fclose - -// ---------------------------------------------------------------------------- -// fflushOrDie(): like fflush() but terminate with err msg in case of error -// ---------------------------------------------------------------------------- - -void fflushOrDie (FILE * f); - -// ---------------------------------------------------------------------------- -// filesize(): determine size of the file in bytes -// ---------------------------------------------------------------------------- - -size_t filesize (const wchar_t * pathname); -size_t filesize (FILE * f); -int64_t filesize64 (const wchar_t * pathname); - -// ---------------------------------------------------------------------------- -// fseekOrDie(),ftellOrDie(), fget/setpos(): seek functions with error handling -// ---------------------------------------------------------------------------- - -// 32-bit offsets only -long fseekOrDie (FILE * f, long offset, int mode = SEEK_SET); -#define ftellOrDie ftell - -// ---------------------------------------------------------------------------- -// fget/setpos(): seek functions with error handling -// ---------------------------------------------------------------------------- - -uint64_t fgetpos (FILE * f); -void fsetpos (FILE * f, uint64_t pos); - -// ---------------------------------------------------------------------------- -// unlinkOrDie(): unlink() with error handling -// ---------------------------------------------------------------------------- - -void unlinkOrDie (const std::string & pathname); -void unlinkOrDie (const std::wstring & pathname); - -// ---------------------------------------------------------------------------- -// renameOrDie(): rename() with error handling -// ---------------------------------------------------------------------------- - -void renameOrDie (const std::string & from, const std::string & to); -void renameOrDie (const std::wstring & from, const std::wstring & to); - -// ---------------------------------------------------------------------------- -// fexists(): test if a file exists -// ---------------------------------------------------------------------------- - -bool fexists (const char * pathname); -bool fexists (const wchar_t * pathname); -inline bool fexists (const std::string & pathname) { return fexists (pathname.c_str()); } -inline bool fexists (const std::wstring & pathname) { return fexists (pathname.c_str()); } - -// ---------------------------------------------------------------------------- -// funicode(): test if a file uses unicode -// ---------------------------------------------------------------------------- - -bool funicode (FILE * f); - -// ---------------------------------------------------------------------------- -// fskipspace(): skip space characters -// ---------------------------------------------------------------------------- - -bool fskipspace (FILE * F); -bool fskipwspace (FILE * F); - -// ---------------------------------------------------------------------------- -// fgetline(): like fgets() but terminate with err msg in case of error; -// removes the newline character at the end (like gets()), returned buffer is -// always 0-terminated; has second version that returns an STL string instead -// fgetstring(): read a 0-terminated string (terminate if error) -// fgetword(): read a space-terminated token (terminate if error) -// fskipNewLine(): skip all white space until end of line incl. the newline -// ---------------------------------------------------------------------------- - -// ---------------------------------------------------------------------------- -// fputstring(): write a 0-terminated string (terminate if error) -// ---------------------------------------------------------------------------- - -void fputstring (FILE * f, const char *); -void fputstring (const HANDLE f, const char * str); -void fputstring (FILE * f, const std::string &); -void fputstring (FILE * f, const wchar_t *); -void fputstring (FILE * f, const std::wstring &); - -template CHAR * fgetline (FILE * f, CHAR * buf, int size); -template CHAR * fgetline (FILE * f, CHAR (& buf)[n]) { return fgetline (f, buf, n); } -string fgetline (FILE * f); -wstring fgetlinew (FILE * f); -void fgetline (FILE * f, std::string & s, std::vector & buf); -void fgetline (FILE * f, std::wstring & s, std::vector & buf); -void fgetline (FILE * f, std::vector & buf); -void fgetline (FILE * f, std::vector & buf); - -const char * fgetstring (FILE * f, char * buf, int size); -template const char * fgetstring (FILE * f, char (& buf)[n]) { return fgetstring (f, buf, n); } -const char * fgetstring (const HANDLE f, char * buf, int size); -template const char * fgetstring (const HANDLE f, char (& buf)[n]) { return fgetstring (f, buf, n); } - -const wchar_t * fgetstring (FILE * f, wchar_t * buf, int size); -wstring fgetwstring (FILE * f); -string fgetstring (FILE * f); - -const char * fgettoken (FILE * f, char * buf, int size); -template const char * fgettoken (FILE * f, char (& buf)[n]) { return fgettoken (f, buf, n); } -string fgettoken (FILE * f); -const wchar_t * fgettoken (FILE * f, wchar_t * buf, int size); -wstring fgetwtoken (FILE * f); - -int fskipNewline (FILE * f, bool skip = true); -int fskipwNewline (FILE * f, bool skip = true); - -// ---------------------------------------------------------------------------- -// fputstring(): write a 0-terminated string (terminate if error) -// ---------------------------------------------------------------------------- - -void fputstring (FILE * f, const char *); -void fputstring (FILE * f, const std::string &); -void fputstring (FILE * f, const wchar_t *); -void fputstring (FILE * f, const std::wstring &); - -// ---------------------------------------------------------------------------- -// fgetTag(): read a 4-byte tag & return as a string -// ---------------------------------------------------------------------------- - -string fgetTag (FILE * f); - -// ---------------------------------------------------------------------------- -// fcheckTag(): read a 4-byte tag & verify it; terminate if wrong tag -// ---------------------------------------------------------------------------- - -void fcheckTag (FILE * f, const char * expectedTag); -void fcheckTag_ascii (FILE * f, const string & expectedTag); - -// ---------------------------------------------------------------------------- -// fcompareTag(): compare two tags; terminate if wrong tag -// ---------------------------------------------------------------------------- - -void fcompareTag (const string & readTag, const string & expectedTag); - -// ---------------------------------------------------------------------------- -// fputTag(): write a 4-byte tag -// ---------------------------------------------------------------------------- - -void fputTag (FILE * f, const char * tag); - -// ---------------------------------------------------------------------------- -// fskipstring(): skip a 0-terminated string, such as a pad string -// ---------------------------------------------------------------------------- - -void fskipstring (FILE * f); - -// ---------------------------------------------------------------------------- -// fpad(): write a 0-terminated string to pad file to a n-byte boundary -// ---------------------------------------------------------------------------- - -void fpad (FILE * f, int n); - -// ---------------------------------------------------------------------------- -// fgetbyte(): read a byte value -// ---------------------------------------------------------------------------- - -char fgetbyte (FILE * f); - -// ---------------------------------------------------------------------------- -// fgetshort(): read a short value -// ---------------------------------------------------------------------------- - -short fgetshort (FILE * f); -short fgetshort_bigendian (FILE * f); - -// ---------------------------------------------------------------------------- -// fgetint24(): read a 3-byte (24-bit) int value -// ---------------------------------------------------------------------------- - -int fgetint24 (FILE * f); - -// ---------------------------------------------------------------------------- -// fgetint(): read an int value -// ---------------------------------------------------------------------------- - -int fgetint (FILE * f); -int fgetint_bigendian (FILE * f); -int fgetint_ascii (FILE * f); - -// ---------------------------------------------------------------------------- -// fgetlong(): read an long value -// ---------------------------------------------------------------------------- -long fgetlong (FILE * f); - -// ---------------------------------------------------------------------------- -// fgetfloat(): read a float value -// ---------------------------------------------------------------------------- - -float fgetfloat (FILE * f); -float fgetfloat_bigendian (FILE * f); -float fgetfloat_ascii (FILE * f); - -// ---------------------------------------------------------------------------- -// fgetdouble(): read a double value -// ---------------------------------------------------------------------------- - -double fgetdouble (FILE * f); - -// ---------------------------------------------------------------------------- -// fputbyte(): write a byte value -// ---------------------------------------------------------------------------- - -void fputbyte (FILE * f, char val); - -// ---------------------------------------------------------------------------- -// fputshort(): write a short value -// ---------------------------------------------------------------------------- - -void fputshort (FILE * f, short val); - -// ---------------------------------------------------------------------------- -// fputint24(): write a 3-byte (24-bit) int value -// ---------------------------------------------------------------------------- - -void fputint24 (FILE * f, int v); - -// ---------------------------------------------------------------------------- -// fputint(): write an int value -// ---------------------------------------------------------------------------- - -void fputint (FILE * f, int val); - -// ---------------------------------------------------------------------------- -// fputlong(): write an long value -// ---------------------------------------------------------------------------- - -void fputlong (FILE * f, long val); - -// ---------------------------------------------------------------------------- -// fputfloat(): write a float value -// ---------------------------------------------------------------------------- - -void fputfloat (FILE * f, float val); - -// ---------------------------------------------------------------------------- -// fputdouble(): write a double value -// ---------------------------------------------------------------------------- - -void fputdouble (FILE * f, double val); - - -// template versions of put/get functions for binary files -template -void fput(FILE * f, T v) -{ - fwriteOrDie (&v, sizeof (v), 1, f); -} - - -// template versions of put/get functions for binary files -template -void fget(FILE * f, T& v) -{ - freadOrDie ((void *)&v, sizeof (v), 1, f); -} - - -// GetFormatString - get the format string for a particular type -template -const wchar_t* GetFormatString(T /*t*/) -{ - // if this _ASSERT goes off it means that you are using a type that doesn't have - // a read and/or write routine. - // If the type is a user defined class, you need to create some global functions that handles file in/out. - // for example: - //File& operator>>(File& stream, MyClass& test); - //File& operator<<(File& stream, MyClass& test); - // - // in your class you will probably want to add these functions as friends so you can access any private members - // friend File& operator>>(File& stream, MyClass& test); - // friend File& operator<<(File& stream, MyClass& test); - // - // if you are using wchar_t* or char* types, these use other methods because they require buffers to be passed - // either use std::string and std::wstring, or use the WriteString() and ReadString() methods - assert(false); // need a specialization - return NULL; -} - -// GetFormatString - specalizations to get the format string for a particular type -template <> const wchar_t* GetFormatString(char); -template <> const wchar_t* GetFormatString(wchar_t); -template <> const wchar_t* GetFormatString(short); -template <> const wchar_t* GetFormatString(int); -template <> const wchar_t* GetFormatString(long); -template <> const wchar_t* GetFormatString(unsigned short); -template <> const wchar_t* GetFormatString(unsigned int); -template <> const wchar_t* GetFormatString(unsigned long); -template <> const wchar_t* GetFormatString(float); -template <> const wchar_t* GetFormatString(double); -template <> const wchar_t* GetFormatString(size_t); -template <> const wchar_t* GetFormatString(long long); -template <> const wchar_t* GetFormatString(const char*); -template <> const wchar_t* GetFormatString(const wchar_t*); - -// GetScanFormatString - get the format string for a particular type -template -const wchar_t* GetScanFormatString(T t) -{ - assert(false); // need a specialization - return NULL; -} - -// GetScanFormatString - specalizations to get the format string for a particular type -template <> const wchar_t* GetScanFormatString(char); -template <> const wchar_t* GetScanFormatString(wchar_t); -template <> const wchar_t* GetScanFormatString(short); -template <> const wchar_t* GetScanFormatString(int); -template <> const wchar_t* GetScanFormatString(long); -template <> const wchar_t* GetScanFormatString(unsigned short); -template <> const wchar_t* GetScanFormatString(unsigned int); -template <> const wchar_t* GetScanFormatString(unsigned long); -template <> const wchar_t* GetScanFormatString(float); -template <> const wchar_t* GetScanFormatString(double); -template <> const wchar_t* GetScanFormatString(size_t); -template <> const wchar_t* GetScanFormatString(long long); - - -// ---------------------------------------------------------------------------- -// fgetText(): get a value from a text file -// ---------------------------------------------------------------------------- -template -void fgetText(FILE * f, T& v) -{ - int rc = ftrygetText(f, v); - if (rc == 0) - throw std::runtime_error("error reading value from file (invalid format)"); - else if (rc == EOF) - throw std::runtime_error(std::string("error reading from file: ") + strerror(errno)); - assert(rc == 1); -} - -// version to try and get a string, and not throw exceptions if contents don't match -template -int ftrygetText(FILE * f, T& v) -{ - const wchar_t* formatString = GetScanFormatString(v); - int rc = fwscanf (f, formatString, &v); - assert(rc == 1 || rc == 0); - return rc; -} - -template <> int ftrygetText(FILE * f, bool& v); -// ---------------------------------------------------------------------------- -// fgetText() specializations for fwscanf_s differences: get a value from a text file -// ---------------------------------------------------------------------------- -void fgetText(FILE * f, char& v); -void fgetText(FILE * f, wchar_t& v); - - -// ---------------------------------------------------------------------------- -// fputText(): write a value out as text -// ---------------------------------------------------------------------------- -template -void fputText(FILE * f, T v) -{ - const wchar_t* formatString = GetFormatString(v); - int rc = fwprintf(f, formatString, v); - if (rc == 0) - throw std::runtime_error("error writing value to file, no values written"); - else if (rc < 0) - throw std::runtime_error(std::string("error writing to file: ") + strerror(errno)); -} - -// ---------------------------------------------------------------------------- -// fputText(): write a bool out as character -// ---------------------------------------------------------------------------- -template <> void fputText(FILE * f, bool v); - -// ---------------------------------------------------------------------------- -// fputfile(): write a binary block or a string as a file -// ---------------------------------------------------------------------------- - -void fputfile (const wstring & pathname, const std::vector & buffer); -void fputfile (const wstring & pathname, const std::wstring & string); -void fputfile (const wstring & pathname, const std::string & string); - -// ---------------------------------------------------------------------------- -// fgetfile(): load a file as a binary block -// ---------------------------------------------------------------------------- - -void fgetfile (const wstring & pathname, std::vector & buffer); -void fgetfile (FILE * f, std::vector & buffer); -namespace msra { namespace files { - void fgetfilelines (const std::wstring & pathname, vector & readbuffer, std::vector & lines); - static inline std::vector fgetfilelines (const std::wstring & pathname) { vector buffer; std::vector lines; fgetfilelines (pathname, buffer, lines); return lines; } - vector fgetfilelines (const wstring & pathname, vector & readbuffer); -};}; - -// ---------------------------------------------------------------------------- -// expand_wildcards() -- expand a path with wildcards (also intermediate ones) -// ---------------------------------------------------------------------------- - -void expand_wildcards (const wstring & path, vector & paths); - -// ---------------------------------------------------------------------------- -// make_intermediate_dirs() -- make all intermediate dirs on a path -// ---------------------------------------------------------------------------- - -namespace msra { namespace files { - void make_intermediate_dirs (const wstring & filepath); -};}; - -// ---------------------------------------------------------------------------- -// fuptodate() -- test whether an output file is at least as new as an input file -// ---------------------------------------------------------------------------- - -namespace msra { namespace files { - bool fuptodate (const wstring & target, const wstring & input, bool inputrequired = true); -};}; - -#if 0 -// ---------------------------------------------------------------------------- -// simple support for WAV file I/O -// ---------------------------------------------------------------------------- - -// define the header if we haven't seen it yet -#ifndef _WAVEFORMATEX_ -#define _WAVEFORMATEX_ - -/* - * extended waveform format structure used for all non-PCM formats. this - * structure is common to all non-PCM formats. - */ -typedef unsigned short WORD; // in case not defined yet (i.e. linux) -typedef struct tWAVEFORMATEX -{ - WORD wFormatTag; /* format type */ - WORD nChannels; /* number of channels (i.e. mono, stereo...) */ - DWORD nSamplesPerSec; /* sample rate */ - DWORD nAvgBytesPerSec; /* for buffer estimation */ - WORD nBlockAlign; /* block size of data */ - WORD wBitsPerSample; /* number of bits per sample of mono data */ - WORD cbSize; /* the count in bytes of the size of */ - /* extra information (after cbSize) */ -} WAVEFORMATEX, *PWAVEFORMATEX; - -#endif /* _WAVEFORMATEX_ */ - -typedef struct wavehder{ - char riffchar[4]; - unsigned int RiffLength; - char wavechar[8]; - unsigned int FmtLength; - signed short wFormatTag; - signed short nChannels; - unsigned int nSamplesPerSec; - unsigned int nAvgBytesPerSec; - signed short nBlockAlign; - signed short wBitsPerSample; - char datachar[4]; - unsigned int DataLength; -private: - void prepareRest (int SampleCount); -public: - void prepare (unsigned int Fs, int Bits, int Channels, int SampleCount); - void prepare (const WAVEFORMATEX & wfx, int SampleCount); - unsigned int read (FILE * f, signed short & wRealFormatTag, int & bytesPerSample); - void write (FILE * f); - static void update (FILE * f); -} WAVEHEADER; - -// ---------------------------------------------------------------------------- -// fgetwfx(), fputwfx(): I/O of wave file headers only -// ---------------------------------------------------------------------------- -unsigned int fgetwfx (FILE *f, WAVEFORMATEX & wfx); -void fputwfx (FILE *f, const WAVEFORMATEX & wfx, unsigned int numSamples); - -// ---------------------------------------------------------------------------- -// fgetraw(): read data of .wav file, and separate data of multiple channels. -// For example, data[i][j]: i is channel index, 0 means the first -// channel. j is sample index. -// ---------------------------------------------------------------------------- -void fgetraw (FILE *f,std::vector< std::vector > & data,const WAVEHEADER & wavhd); -#endif - -// ---------------------------------------------------------------------------- -// temp functions -- clean these up -// ---------------------------------------------------------------------------- - -// split a pathname into directory and filename -static inline void splitpath (const wstring & path, wstring & dir, wstring & file) -{ - size_t pos = path.find_last_of (L"\\:/"); // DOS drives, UNIX, Windows - if (pos == path.npos) // no directory found - { - dir.clear(); - file = path; - } - else - { - dir = path.substr (0, pos); - file = path.substr (pos +1); - } -} - -// test if a pathname is a relative path -// A relative path is one that can be appended to a directory. -// Drive-relative paths, such as D:file, are considered non-relative. -static inline bool relpath (const wchar_t * path) -{ // this is a wild collection of pathname conventions in Windows - if (path[0] == '/' || path[0] == '\\') // e.g. \WINDOWS - return false; - if (path[0] && path[1] == ':') // drive syntax - return false; - // ... TODO: handle long NT paths - return true; // all others -} -template -static inline bool relpath (const std::basic_string & s) { return relpath (s.c_str()); } - -// trim from start -static inline std::string <rim(std::string &s) { - s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::ptr_fun(std::isspace)))); - return s; -} - -// trim from end -static inline std::string &rtrim(std::string &s) { - s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::ptr_fun(std::isspace))).base(), s.end()); - return s; -} - -// trim from both ends -static inline std::string &trim(std::string &s) { - return ltrim(rtrim(s)); -} - -vector sep_string(const string & str, const string & sep); - -#endif // _FILEUTIL_ diff --git a/DataReader/Kaldi2Reader/htkfeatio.h b/DataReader/Kaldi2Reader/htkfeatio.h index e2f83d53553f..9baf06ba7440 100644 --- a/DataReader/Kaldi2Reader/htkfeatio.h +++ b/DataReader/Kaldi2Reader/htkfeatio.h @@ -7,6 +7,7 @@ #pragma once +#include "Basics.h" #include "basetypes.h" #include "fileutil.h" #include "simple_checked_arrays.h" diff --git a/Makefile b/Makefile index a6c3f2aee9b1..bdbba9285123 100644 --- a/Makefile +++ b/Makefile @@ -370,15 +370,15 @@ KALDIREADER_SRC = \ KALDIREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(KALDIREADER_SRC)) KALDIREADER:=$(LIBDIR)/KaldiReader.so -ALL+=$(KALDIREADER) -SRC+=$(KALDIREADER_SRC) +#ALL+=$(KALDIREADER) +#SRC+=$(KALDIREADER_SRC) $(KALDIREADER): $(KALDIREADER_OBJ) | $(CNTKMATH_LIB) @echo $(SEPARATOR) $(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(KALDI_LIBPATH) $(LIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(KALDI_LIBPATH) $(LIBPATH)) -o $@ $^ -l$(CNTKMATH) $(KALDI_LIBS) -KALDIWRITER:=$(LIBDIR)/KaldiWriter.so -ALL+=$(KALDIWRITER) +#KALDIWRITER:=$(LIBDIR)/KaldiWriter.so +#ALL+=$(KALDIWRITER) $(KALDIWRITER): $(KALDIREADER_OBJ) | $(CNTKMATH_LIB) @echo $(SEPARATOR)