Skip to content

Commit

Permalink
Turtle parser: better code, bug fixes, and new unit tests (ad-freibur…
Browse files Browse the repository at this point in the history
…g#1039)

1. There is now a common base class for our Turtle parsers. That way, methods involved in index building no longer have to be templated with the parser class.
2. The parallel parser now accepts an arbitrary sequence of `\n` and `\r` as newline, and there may be arbitrary whitespace after the final dot in a line. So far, only `\n` was accepted as newline, and a `\t` after the final dot did not work.
3. There is now a Boolean option `parallel-parsing` in the `settings.json` configuration file. It is `true` by default, which is consistent with the behavior so far.
4. Add unit tests for the Turtle parser (both parallel and not parallel). So far, there were unit tests for individual rules, but not for the parser as a whole. In particular, the batching and parallelization were untested so far.
5. Remove the mmap parser because it was no longer needed (and probably no longer worked either).
  • Loading branch information
joka921 authored Jul 20, 2023
1 parent ad6f15e commit 0b74fa1
Show file tree
Hide file tree
Showing 21 changed files with 629 additions and 593 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/code-coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ jobs:
- name: Configure CMake
# Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
# See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
run: cmake -B ${{github.workspace}}/build ${{env.cmake-flags}} -DCMAKE_BUILD_TYPE=${{env.build-type}} -DLOGLEVEL=DEBUG -DCMAKE_TOOLCHAIN_FILE="$(pwd)/build/conan_toolchain.cmake" -DADDITIONAL_COMPILER_FLAGS="${{env.warnings}} ${{env.asan-flags}} ${{env.ubsan-flags}} ${{env.coverage-flags}}" -DADDITIONAL_LINKER_FLAGS="${{env.coverage-flags}}" -DUSE_PARALLEL=false -DRUN_EXPENSIVE_TESTS=false -DSINGLE_TEST_BINARY=ON -DENABLE_EXPENSIVE_CHECKS=true
run: cmake -B ${{github.workspace}}/build ${{env.cmake-flags}} -DCMAKE_BUILD_TYPE=${{env.build-type}} -DLOGLEVEL=TIMING -DCMAKE_TOOLCHAIN_FILE="$(pwd)/build/conan_toolchain.cmake" -DADDITIONAL_COMPILER_FLAGS="${{env.warnings}} ${{env.asan-flags}} ${{env.ubsan-flags}} ${{env.coverage-flags}}" -DADDITIONAL_LINKER_FLAGS="${{env.coverage-flags}}" -DUSE_PARALLEL=false -DRUN_EXPENSIVE_TESTS=false -DSINGLE_TEST_BINARY=ON -DENABLE_EXPENSIVE_CHECKS=true

- name: Build
# Build your program with the given configuration
Expand Down
3 changes: 2 additions & 1 deletion examples/wikidata.settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@
"ignore-punctuation": true
},
"ascii-prefixes-only": true,
"num-triples-per-batch" : 10000000
"num-triples-per-batch" : 10000000,
"parallel-parsing" : true
}
6 changes: 2 additions & 4 deletions src/TurtleParserMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ void writeNTImpl(std::ostream& out, const std::string& filename) {
TurtleTriple triple;
size_t numTriples = 0;
while (p.getLine(triple)) {
out << triple._subject << " " << triple._predicate << " "
<< triple._object.toRdfLiteral() << " .\n";
out << triple.subject_ << " " << triple.predicate_ << " "
<< triple.object_.toRdfLiteral() << " .\n";
numTriples++;
if (numTriples % 10000000 == 0) {
LOG(INFO) << "Parsed " << numTriples << " triples" << std::endl;
Expand Down Expand Up @@ -72,8 +72,6 @@ void writeNT(std::ostream& out, const string& fileFormat,
if (fileFormat == "ttl" || fileFormat == "nt") {
// writeLabel<TurtleStreamParser<Tokenizer_T>>(out, filename);
writeNTImpl<TurtleStreamParser<Tokenizer_T>>(out, filename);
} else if (fileFormat == "mmap") {
writeNTImpl<TurtleMmapParser<Tokenizer_T>>(out, filename);
} else {
LOG(ERROR) << "writeNT was called with unknown file format " << fileFormat
<< ". This should never happen, terminating" << std::endl;
Expand Down
6 changes: 6 additions & 0 deletions src/global/Constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,12 @@ static const std::string WARNING_ASCII_ONLY_PREFIXES =
// "literals, and the regex \". *\\n\" only matches at the end of a triple. "
// "Most Turtle files fulfill these properties (e.g. that from Wikidata), "
// "but not all";

static const std::string WARNING_PARALLEL_PARSING =
"You specified \"parallel-parsing = true\", which enables faster parsing "
"for TTL files that don't include multiline literals with unescaped "
"newline characters and that have newline characters after the end of "
"triples.";
static const std::string LOCALE_DEFAULT_LANG = "en";
static const std::string LOCALE_DEFAULT_COUNTRY = "US";
static constexpr bool LOCALE_DEFAULT_IGNORE_PUNCTUATION = false;
Expand Down
10 changes: 1 addition & 9 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,10 @@ Index::Index(Index&&) noexcept = default;
Index::~Index() = default;

// ____________________________________________________________________________
template <class Parser>
void Index::createFromFile(const std::string& filename) {
pimpl_->template createFromFile<Parser>(filename);
pimpl_->createFromFile(filename);
}

// Explicit instantiations.
template void Index::createFromFile<TurtleStreamParser<Tokenizer>>(
const string& filename);
template void Index::createFromFile<TurtleMmapParser<Tokenizer>>(
const string& filename);
template void Index::createFromFile<TurtleParserAuto>(const string& filename);

// ____________________________________________________________________________
void Index::addPatternsToExistingIndex() {
pimpl_->addPatternsToExistingIndex();
Expand Down
9 changes: 0 additions & 9 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,6 @@ class IdTable;
class TextBlockMetaData;
class IndexImpl;

/**
* Used as a template argument to the `createFromFile` method, when we do not
* yet know which tokenizer specialization of the `TurtleParser` we are going
* to use.
*/
class TurtleParserAuto {};

class Index {
private:
// Pimpl to reduce compile times.
Expand Down Expand Up @@ -76,8 +69,6 @@ class Index {
// Create an index from a file. Will write vocabulary and on-disk index data.
// NOTE: The index can not directly be used after this call, but has to be
// setup by `createFromOnDiskIndex` after this call.
// TODO<joka921> Make the parser options also a plain enum!
template <class Parser>
void createFromFile(const std::string& filename);

void addPatternsToExistingIndex();
Expand Down
29 changes: 5 additions & 24 deletions src/index/IndexBuilderMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ int main(int argc, char** argv) {
string filetype;
string inputFile;
bool noPrefixCompression = false;
bool onDiskLiterals = false;
bool noPatterns = false;
bool onlyAddTextIndex = false;
bool keepTemporaryFiles = false;
Expand All @@ -97,7 +96,7 @@ int main(int argc, char** argv) {
"will read from stdin.");
add("file-format,F", po::value(&filetype),
"The format of the input file with the knowledge graph data. Must be one "
"of [tsv|nt|ttl]. If not set, QLever will try to deduce it from the "
"of [nt|ttl]. If not set, QLever will try to deduce it from the "
"filename suffix.");
add("kg-index-name,K", po::value(&kbIndexName),
"The name of the knowledge graph index (default: basename of "
Expand All @@ -119,9 +118,6 @@ int main(int argc, char** argv) {
"the same `index-basename` already exists.");

// Options for the knowledge graph index.
add("externalize-literals,l", po::bool_switch(&onDiskLiterals),
"An unused and deprecated option that will be removed from a future "
"version of qlever");
add("settings-file,s", po::value(&settingsFile),
"A JSON file, where various settings can be specified (see the QLever "
"documentation).");
Expand Down Expand Up @@ -160,13 +156,6 @@ int main(int argc, char** argv) {
1024ul * 1024ul * 1024ul * stxxlMemoryGB.value();
}

if (onDiskLiterals) {
LOG(WARN) << EMPH_ON
<< "Warning, the -l command line option has no effect anymore "
"and will be removed from a future version of QLever"
<< EMPH_OFF << std::endl;
}

// If no text index name was specified, take the part of the wordsfile after
// the last slash.
if (textIndexName.empty() && !wordsfile.empty()) {
Expand Down Expand Up @@ -213,10 +202,7 @@ int main(int argc, char** argv) {
<< ad_utility::getUppercase(filetype) << std::endl;
} else {
bool filetypeDeduced = false;
if (inputFile.ends_with(".tsv")) {
filetype = "tsv";
filetypeDeduced = true;
} else if (inputFile.ends_with(".nt")) {
if (inputFile.ends_with(".nt")) {
filetype = "nt";
filetypeDeduced = true;
} else if (inputFile.ends_with(".ttl")) {
Expand All @@ -239,18 +225,13 @@ int main(int argc, char** argv) {
if (filetype == "ttl") {
LOG(DEBUG) << "Parsing uncompressed TTL from: " << inputFile
<< std::endl;
index.createFromFile<TurtleParserAuto>(inputFile);
index.createFromFile(inputFile);
} else if (filetype == "nt") {
LOG(DEBUG) << "Parsing uncompressed N-Triples from: " << inputFile
<< " (using the Turtle parser)" << std::endl;
index.createFromFile<TurtleParserAuto>(inputFile);
} else if (filetype == "mmap") {
LOG(DEBUG) << "Parsing uncompressed TTL from from: " << inputFile
<< " (using mmap, which only works for files, not for "
<< "streams)" << std::endl;
index.createFromFile<TurtleMmapParser<Tokenizer>>(inputFile);
index.createFromFile(inputFile);
} else {
LOG(ERROR) << "File format must be one of: nt ttl mmap" << std::endl;
LOG(ERROR) << "File format must be one of: nt ttl" << std::endl;
std::cerr << boostOptions << std::endl;
exit(1);
}
Expand Down
95 changes: 39 additions & 56 deletions src/index/IndexImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@
#include <cstdio>
#include <future>
#include <optional>
#include <stxxl/algorithm>
#include <stxxl/map>
#include <unordered_map>

#include "CompilationInfo.h"
Expand All @@ -35,11 +33,10 @@ IndexImpl::IndexImpl(ad_utility::AllocatorWithLimit<Id> allocator)
: allocator_{std::move(allocator)} {};

// _____________________________________________________________________________
template <class Parser>
IndexBuilderDataAsPsoSorter IndexImpl::createIdTriplesAndVocab(
const string& ntFile) {
std::shared_ptr<TurtleParserBase> parser) {
auto indexBuilderData =
passFileForVocabulary<Parser>(ntFile, numTriplesPerBatch_);
passFileForVocabulary(std::move(parser), numTriplesPerBatch_);
// first save the total number of words, this is needed to initialize the
// dense IndexMetaData variants
totalVocabularySize_ = indexBuilderData.vocabularyMetaData_.numWordsTotal_;
Expand Down Expand Up @@ -79,29 +76,33 @@ void createPatternsFromSpoTriplesView(auto&& spoTriplesView,
}

// _____________________________________________________________________________
template <class Parser>
void IndexImpl::createFromFile(const string& filename) {
LOG(INFO) << "Processing input triples from " << filename << " ..."
<< std::endl;
string indexFilename = onDiskBase_ + ".index";

readIndexBuilderSettingsFromFile<Parser>();
readIndexBuilderSettingsFromFile();

IndexBuilderDataAsPsoSorter indexBuilderData;
if constexpr (std::is_same_v<std::decay_t<Parser>, TurtleParserAuto>) {
auto setTokenizer = [this,
&filename]<template <typename> typename ParserTemplate>()
-> std::unique_ptr<TurtleParserBase> {
if (onlyAsciiTurtlePrefixes_) {
LOG(DEBUG) << "Using the CTRE library for tokenization" << std::endl;
indexBuilderData =
createIdTriplesAndVocab<TurtleParallelParser<TokenizerCtre>>(
filename);
return std::make_unique<ParserTemplate<TokenizerCtre>>(filename);
} else {
LOG(DEBUG) << "Using the Google RE2 library for tokenization"
<< std::endl;
indexBuilderData =
createIdTriplesAndVocab<TurtleParallelParser<Tokenizer>>(filename);
return std::make_unique<ParserTemplate<Tokenizer>>(filename);
}
};

} else {
indexBuilderData = createIdTriplesAndVocab<Parser>(filename);
}
std::unique_ptr<TurtleParserBase> parser = [&setTokenizer, this]() {
if (useParallelParser_) {
return setTokenizer.template operator()<TurtleParallelParser>();
} else {
return setTokenizer.template operator()<TurtleStreamParser>();
}
}();

IndexBuilderDataAsPsoSorter indexBuilderData =
createIdTriplesAndVocab(std::move(parser));

// If we have no compression, this will also copy the whole vocabulary.
// but since we expect compression to be the default case, this should not
Expand All @@ -119,7 +120,7 @@ void IndexImpl::createFromFile(const string& filename) {
LOG(INFO) << "Writing compressed vocabulary to disk ..." << std::endl;

vocab_.buildCodebookForPrefixCompression(prefixes);
auto wordReader = vocab_.makeUncompressedDiskIterator(vocabFile);
auto wordReader = RdfsVocabulary::makeUncompressedDiskIterator(vocabFile);
auto wordWriter = vocab_.makeCompressedWordWriter(vocabFileTmp);
for (const auto& word : wordReader) {
wordWriter.push(word);
Expand Down Expand Up @@ -229,21 +230,9 @@ void IndexImpl::createFromFile(const string& filename) {
LOG(INFO) << "Index build completed" << std::endl;
}

// Explicit instantiations.
template void IndexImpl::createFromFile<TurtleStreamParser<Tokenizer>>(
const string& filename);
template void IndexImpl::createFromFile<TurtleMmapParser<Tokenizer>>(
const string& filename);
template void IndexImpl::createFromFile<TurtleParserAuto>(
const string& filename);

// _____________________________________________________________________________
template <class Parser>
IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary(
const string& filename, size_t linesPerPartial) {
LOG(INFO) << "Processing input triples from " << filename << " ..."
<< std::endl;
auto parser = std::make_shared<Parser>(filename);
std::shared_ptr<TurtleParserBase> parser, size_t linesPerPartial) {
parser->integerOverflowBehavior() = turtleParserIntegerOverflowBehavior_;
parser->invalidLiteralsAreSkipped() = turtleParserSkipIllegalLiterals_;
std::unique_ptr<TripleVec> idTriples(new TripleVec());
Expand Down Expand Up @@ -308,9 +297,7 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary(
<< std::endl;
}

if constexpr (requires(Parser p) { p.printAndResetQueueStatistics(); }) {
parser->printAndResetQueueStatistics();
}
parser->printAndResetQueueStatistics();
}

// localWriter.finish();
Expand Down Expand Up @@ -917,20 +904,20 @@ LangtagAndTriple IndexImpl::tripleToInternalRepresentation(
TurtleTriple&& triple) const {
LangtagAndTriple result{"", {}};
auto& resultTriple = result._triple;
resultTriple[0] = std::move(triple._subject);
resultTriple[1] = std::move(triple._predicate);
resultTriple[0] = std::move(triple.subject_);
resultTriple[1] = std::move(triple.predicate_);

// If the object of the triple can be directly folded into an ID, do so. Note
// that the actual folding is done by the `TripleComponent`.
std::optional<Id> idIfNotString = triple._object.toValueIdIfNotString();
std::optional<Id> idIfNotString = triple.object_.toValueIdIfNotString();

// TODO<joka921> The following statement could be simplified by a helper
// function "optionalCast";
if (idIfNotString.has_value()) {
resultTriple[2] = idIfNotString.value();
} else {
// `toRdfLiteral` handles literals as well as IRIs correctly.
resultTriple[2] = std::move(triple._object).toRdfLiteral();
resultTriple[2] = std::move(triple.object_).toRdfLiteral();
}

for (size_t i = 0; i < 3; ++i) {
Expand All @@ -954,7 +941,6 @@ LangtagAndTriple IndexImpl::tripleToInternalRepresentation(
}

// ___________________________________________________________________________
template <class Parser>
void IndexImpl::readIndexBuilderSettingsFromFile() {
json j; // if we have no settings, we still have to initialize some default
// values
Expand Down Expand Up @@ -1018,20 +1004,17 @@ void IndexImpl::readIndexBuilderSettingsFromFile() {
configurationJson_["languages-internal"] = j["languages-internal"];
}
if (j.count("ascii-prefixes-only")) {
if constexpr (std::is_same_v<std::decay_t<Parser>, TurtleParserAuto>) {
bool v{j["ascii-prefixes-only"]};
if (v) {
LOG(INFO) << WARNING_ASCII_ONLY_PREFIXES << std::endl;
onlyAsciiTurtlePrefixes_ = true;
} else {
onlyAsciiTurtlePrefixes_ = false;
}
} else {
LOG(WARN) << "You specified the ascii-prefixes-only but a parser that is "
"not the Turtle stream parser. This means that this setting "
"is ignored."
<< std::endl;
}
onlyAsciiTurtlePrefixes_ = static_cast<bool>(j["ascii-prefixes-only"]);
}
if (onlyAsciiTurtlePrefixes_) {
LOG(INFO) << WARNING_ASCII_ONLY_PREFIXES << std::endl;
}

if (j.count("parallel-parsing")) {
useParallelParser_ = static_cast<bool>(j["parallel-parsing"]);
}
if (useParallelParser_) {
LOG(INFO) << WARNING_PARALLEL_PARSING << std::endl;
}

if (j.count("num-triples-per-batch")) {
Expand Down
12 changes: 5 additions & 7 deletions src/index/IndexImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ class IndexImpl {
string onDiskBase_;
string settingsFileName_;
bool onlyAsciiTurtlePrefixes_ = false;
bool useParallelParser_ = true;
TurtleParserIntegerOverflowBehavior turtleParserIntegerOverflowBehavior_ =
TurtleParserIntegerOverflowBehavior::Error;
bool turtleParserSkipIllegalLiterals_ = false;
Expand Down Expand Up @@ -204,7 +205,6 @@ class IndexImpl {
// Will write vocabulary and on-disk index data.
// !! The index can not directly be used after this call, but has to be setup
// by createFromOnDiskIndex after this call.
template <class Parser>
void createFromFile(const string& filename);

void addPatternsToExistingIndex();
Expand Down Expand Up @@ -426,13 +426,12 @@ class IndexImpl {
// permutations. Member vocab_ will be empty after this because it is not
// needed for index creation once the TripleVec is set up and it would be a
// waste of RAM.
template <class Parser>
IndexBuilderDataAsPsoSorter createIdTriplesAndVocab(const string& ntFile);
IndexBuilderDataAsPsoSorter createIdTriplesAndVocab(
std::shared_ptr<TurtleParserBase> parser);

// ___________________________________________________________________
template <class Parser>
IndexBuilderDataAsStxxlVector passFileForVocabulary(const string& ntFile,
size_t linesPerPartial);
IndexBuilderDataAsStxxlVector passFileForVocabulary(
std::shared_ptr<TurtleParserBase> parser, size_t linesPerPartial);

/**
* @brief Everything that has to be done when we have seen all the triples
Expand Down Expand Up @@ -617,7 +616,6 @@ class IndexImpl {
void readConfiguration();

// initialize the index-build-time settings for the vocabulary
template <class Parser>
void readIndexBuilderSettingsFromFile();

/**
Expand Down
Loading

0 comments on commit 0b74fa1

Please sign in to comment.