Turtle parser: better code, bug fixes, and new unit tests (ad-freibur…

…g#1039) 1. There is now a common base class for our Turtle parsers. That way, methods involved in index building no longer have to be templated with the parser class. 2. The parallel parser now accepts an arbitrary sequence of `\n` and `\r` as newline, and there may be arbitrary whitespace after the final dot in a line. So far, only `\n` was accepted as newline, and a `\t` after the final dot did not work. 3. There is now a Boolean option `parallel-parsing` in the `settings.json` configuration file. It is `true` by default, which is consistent with the behavior so far. 4. Add unit tests for the Turtle parser (both parallel and not parallel). So far, there were unit tests for individual rules, but not for the parser as a whole. In particular, the batching and parallelization were untested so far. 5. Remove the mmap parser because it was no longer needed (and probably also not working any longer).
schlegan · Jul 20, 2023 · 0b74fa1 · 0b74fa1
1 parent ad6f15e
commit 0b74fa1
Show file tree

Hide file tree

Showing 21 changed files with 629 additions and 593 deletions.
diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml
@@ -70,7 +70,7 @@ jobs:
     - name: Configure CMake
       # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
       # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
-      run: cmake -B ${{github.workspace}}/build ${{env.cmake-flags}} -DCMAKE_BUILD_TYPE=${{env.build-type}} -DLOGLEVEL=DEBUG -DCMAKE_TOOLCHAIN_FILE="$(pwd)/build/conan_toolchain.cmake" -DADDITIONAL_COMPILER_FLAGS="${{env.warnings}} ${{env.asan-flags}} ${{env.ubsan-flags}} ${{env.coverage-flags}}" -DADDITIONAL_LINKER_FLAGS="${{env.coverage-flags}}" -DUSE_PARALLEL=false -DRUN_EXPENSIVE_TESTS=false -DSINGLE_TEST_BINARY=ON -DENABLE_EXPENSIVE_CHECKS=true
+      run: cmake -B ${{github.workspace}}/build ${{env.cmake-flags}} -DCMAKE_BUILD_TYPE=${{env.build-type}} -DLOGLEVEL=TIMING -DCMAKE_TOOLCHAIN_FILE="$(pwd)/build/conan_toolchain.cmake" -DADDITIONAL_COMPILER_FLAGS="${{env.warnings}} ${{env.asan-flags}} ${{env.ubsan-flags}} ${{env.coverage-flags}}" -DADDITIONAL_LINKER_FLAGS="${{env.coverage-flags}}" -DUSE_PARALLEL=false -DRUN_EXPENSIVE_TESTS=false -DSINGLE_TEST_BINARY=ON -DENABLE_EXPENSIVE_CHECKS=true
 
     - name: Build
         # Build your program with the given configuration

diff --git a/examples/wikidata.settings.json b/examples/wikidata.settings.json
@@ -11,5 +11,6 @@
 	  "ignore-punctuation": true
   },
   "ascii-prefixes-only": true,
-  "num-triples-per-batch" : 10000000
+  "num-triples-per-batch" : 10000000,
+  "parallel-parsing" : true
 }
diff --git a/src/TurtleParserMain.cpp b/src/TurtleParserMain.cpp
@@ -27,8 +27,8 @@ void writeNTImpl(std::ostream& out, const std::string& filename) {
   TurtleTriple triple;
   size_t numTriples = 0;
   while (p.getLine(triple)) {
-    out << triple._subject << " " << triple._predicate << " "
-        << triple._object.toRdfLiteral() << " .\n";
+    out << triple.subject_ << " " << triple.predicate_ << " "
+        << triple.object_.toRdfLiteral() << " .\n";
     numTriples++;
     if (numTriples % 10000000 == 0) {
       LOG(INFO) << "Parsed " << numTriples << " triples" << std::endl;
@@ -72,8 +72,6 @@ void writeNT(std::ostream& out, const string& fileFormat,
   if (fileFormat == "ttl" || fileFormat == "nt") {
     // writeLabel<TurtleStreamParser<Tokenizer_T>>(out, filename);
     writeNTImpl<TurtleStreamParser<Tokenizer_T>>(out, filename);
-  } else if (fileFormat == "mmap") {
-    writeNTImpl<TurtleMmapParser<Tokenizer_T>>(out, filename);
   } else {
     LOG(ERROR) << "writeNT was called with unknown file format " << fileFormat
                << ". This should never happen, terminating" << std::endl;

diff --git a/src/global/Constants.h b/src/global/Constants.h
@@ -125,6 +125,12 @@ static const std::string WARNING_ASCII_ONLY_PREFIXES =
 // "literals, and the regex \". *\\n\" only matches at the end of a triple. "
 // "Most Turtle files fulfill these properties (e.g. that from Wikidata), "
 // "but not all";
+
+static const std::string WARNING_PARALLEL_PARSING =
+    "You specified \"parallel-parsing = true\", which enables faster parsing "
+    "for TTL files that don't include multiline literals with unescaped "
+    "newline characters and that have newline characters after the end of "
+    "triples.";
 static const std::string LOCALE_DEFAULT_LANG = "en";
 static const std::string LOCALE_DEFAULT_COUNTRY = "US";
 static constexpr bool LOCALE_DEFAULT_IGNORE_PUNCTUATION = false;

diff --git a/src/index/Index.cpp b/src/index/Index.cpp
@@ -19,18 +19,10 @@ Index::Index(Index&&) noexcept = default;
 Index::~Index() = default;
 
 // ____________________________________________________________________________
-template <class Parser>
 void Index::createFromFile(const std::string& filename) {
-  pimpl_->template createFromFile<Parser>(filename);
+  pimpl_->createFromFile(filename);
 }
 
-// Explicit instantiations.
-template void Index::createFromFile<TurtleStreamParser<Tokenizer>>(
-    const string& filename);
-template void Index::createFromFile<TurtleMmapParser<Tokenizer>>(
-    const string& filename);
-template void Index::createFromFile<TurtleParserAuto>(const string& filename);
-
 // ____________________________________________________________________________
 void Index::addPatternsToExistingIndex() {
   pimpl_->addPatternsToExistingIndex();

diff --git a/src/index/Index.h b/src/index/Index.h
@@ -22,13 +22,6 @@ class IdTable;
 class TextBlockMetaData;
 class IndexImpl;
 
-/**
- * Used as a template argument to the `createFromFile` method, when we do not
- * yet know which tokenizer specialization of the `TurtleParser` we are going
- * to use.
- */
-class TurtleParserAuto {};
-
 class Index {
  private:
   // Pimpl to reduce compile times.
@@ -76,8 +69,6 @@ class Index {
   // Create an index from a file. Will write vocabulary and on-disk index data.
   // NOTE: The index can not directly be used after this call, but has to be
   // setup by `createFromOnDiskIndex` after this call.
-  // TODO<joka921> Make the parser options also a plain enum!
-  template <class Parser>
   void createFromFile(const std::string& filename);
 
   void addPatternsToExistingIndex();

diff --git a/src/index/IndexBuilderMain.cpp b/src/index/IndexBuilderMain.cpp
@@ -73,7 +73,6 @@ int main(int argc, char** argv) {
   string filetype;
   string inputFile;
   bool noPrefixCompression = false;
-  bool onDiskLiterals = false;
   bool noPatterns = false;
   bool onlyAddTextIndex = false;
   bool keepTemporaryFiles = false;
@@ -97,7 +96,7 @@ int main(int argc, char** argv) {
       "will read from stdin.");
   add("file-format,F", po::value(&filetype),
       "The format of the input file with the knowledge graph data. Must be one "
-      "of [tsv|nt|ttl]. If not set, QLever will try to deduce it from the "
+      "of [nt|ttl]. If not set, QLever will try to deduce it from the "
       "filename suffix.");
   add("kg-index-name,K", po::value(&kbIndexName),
       "The name of the knowledge graph index (default: basename of "
@@ -119,9 +118,6 @@ int main(int argc, char** argv) {
       "the same `index-basename` already exists.");
 
   // Options for the knowledge graph index.
-  add("externalize-literals,l", po::bool_switch(&onDiskLiterals),
-      "An unused and deprecated option that will be removed from a future "
-      "version of qlever");
   add("settings-file,s", po::value(&settingsFile),
       "A JSON file, where various settings can be specified (see the QLever "
       "documentation).");
@@ -160,13 +156,6 @@ int main(int argc, char** argv) {
         1024ul * 1024ul * 1024ul * stxxlMemoryGB.value();
   }
 
-  if (onDiskLiterals) {
-    LOG(WARN) << EMPH_ON
-              << "Warning, the -l command line option has no effect anymore "
-                 "and will be removed from a future version of QLever"
-              << EMPH_OFF << std::endl;
-  }
-
   // If no text index name was specified, take the part of the wordsfile after
   // the last slash.
   if (textIndexName.empty() && !wordsfile.empty()) {
@@ -213,10 +202,7 @@ int main(int argc, char** argv) {
                   << ad_utility::getUppercase(filetype) << std::endl;
       } else {
         bool filetypeDeduced = false;
-        if (inputFile.ends_with(".tsv")) {
-          filetype = "tsv";
-          filetypeDeduced = true;
-        } else if (inputFile.ends_with(".nt")) {
+        if (inputFile.ends_with(".nt")) {
           filetype = "nt";
           filetypeDeduced = true;
         } else if (inputFile.ends_with(".ttl")) {
@@ -239,18 +225,13 @@ int main(int argc, char** argv) {
       if (filetype == "ttl") {
         LOG(DEBUG) << "Parsing uncompressed TTL from: " << inputFile
                    << std::endl;
-        index.createFromFile<TurtleParserAuto>(inputFile);
+        index.createFromFile(inputFile);
       } else if (filetype == "nt") {
         LOG(DEBUG) << "Parsing uncompressed N-Triples from: " << inputFile
                    << " (using the Turtle parser)" << std::endl;
-        index.createFromFile<TurtleParserAuto>(inputFile);
-      } else if (filetype == "mmap") {
-        LOG(DEBUG) << "Parsing uncompressed TTL from from: " << inputFile
-                   << " (using mmap, which only works for files, not for "
-                   << "streams)" << std::endl;
-        index.createFromFile<TurtleMmapParser<Tokenizer>>(inputFile);
+        index.createFromFile(inputFile);
       } else {
-        LOG(ERROR) << "File format must be one of: nt ttl mmap" << std::endl;
+        LOG(ERROR) << "File format must be one of: nt ttl" << std::endl;
         std::cerr << boostOptions << std::endl;
         exit(1);
       }

diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp
@@ -11,8 +11,6 @@
 #include <cstdio>
 #include <future>
 #include <optional>
-#include <stxxl/algorithm>
-#include <stxxl/map>
 #include <unordered_map>
 
 #include "CompilationInfo.h"
@@ -35,11 +33,10 @@ IndexImpl::IndexImpl(ad_utility::AllocatorWithLimit<Id> allocator)
     : allocator_{std::move(allocator)} {};
 
 // _____________________________________________________________________________
-template <class Parser>
 IndexBuilderDataAsPsoSorter IndexImpl::createIdTriplesAndVocab(
-    const string& ntFile) {
+    std::shared_ptr<TurtleParserBase> parser) {
   auto indexBuilderData =
-      passFileForVocabulary<Parser>(ntFile, numTriplesPerBatch_);
+      passFileForVocabulary(std::move(parser), numTriplesPerBatch_);
   // first save the total number of words, this is needed to initialize the
   // dense IndexMetaData variants
   totalVocabularySize_ = indexBuilderData.vocabularyMetaData_.numWordsTotal_;
@@ -79,29 +76,33 @@ void createPatternsFromSpoTriplesView(auto&& spoTriplesView,
 }
 
 // _____________________________________________________________________________
-template <class Parser>
 void IndexImpl::createFromFile(const string& filename) {
+  LOG(INFO) << "Processing input triples from " << filename << " ..."
+            << std::endl;
   string indexFilename = onDiskBase_ + ".index";
 
-  readIndexBuilderSettingsFromFile<Parser>();
+  readIndexBuilderSettingsFromFile();
 
-  IndexBuilderDataAsPsoSorter indexBuilderData;
-  if constexpr (std::is_same_v<std::decay_t<Parser>, TurtleParserAuto>) {
+  auto setTokenizer = [this,
+                       &filename]<template <typename> typename ParserTemplate>()
+      -> std::unique_ptr<TurtleParserBase> {
     if (onlyAsciiTurtlePrefixes_) {
-      LOG(DEBUG) << "Using the CTRE library for tokenization" << std::endl;
-      indexBuilderData =
-          createIdTriplesAndVocab<TurtleParallelParser<TokenizerCtre>>(
-              filename);
+      return std::make_unique<ParserTemplate<TokenizerCtre>>(filename);
     } else {
-      LOG(DEBUG) << "Using the Google RE2 library for tokenization"
-                 << std::endl;
-      indexBuilderData =
-          createIdTriplesAndVocab<TurtleParallelParser<Tokenizer>>(filename);
+      return std::make_unique<ParserTemplate<Tokenizer>>(filename);
     }
+  };
 
-  } else {
-    indexBuilderData = createIdTriplesAndVocab<Parser>(filename);
-  }
+  std::unique_ptr<TurtleParserBase> parser = [&setTokenizer, this]() {
+    if (useParallelParser_) {
+      return setTokenizer.template operator()<TurtleParallelParser>();
+    } else {
+      return setTokenizer.template operator()<TurtleStreamParser>();
+    }
+  }();
+
+  IndexBuilderDataAsPsoSorter indexBuilderData =
+      createIdTriplesAndVocab(std::move(parser));
 
   // If we have no compression, this will also copy the whole vocabulary.
   // but since we expect compression to be the default case, this  should not
@@ -119,7 +120,7 @@ void IndexImpl::createFromFile(const string& filename) {
   LOG(INFO) << "Writing compressed vocabulary to disk ..." << std::endl;
 
   vocab_.buildCodebookForPrefixCompression(prefixes);
-  auto wordReader = vocab_.makeUncompressedDiskIterator(vocabFile);
+  auto wordReader = RdfsVocabulary::makeUncompressedDiskIterator(vocabFile);
   auto wordWriter = vocab_.makeCompressedWordWriter(vocabFileTmp);
   for (const auto& word : wordReader) {
     wordWriter.push(word);
@@ -229,21 +230,9 @@ void IndexImpl::createFromFile(const string& filename) {
   LOG(INFO) << "Index build completed" << std::endl;
 }
 
-// Explicit instantiations.
-template void IndexImpl::createFromFile<TurtleStreamParser<Tokenizer>>(
-    const string& filename);
-template void IndexImpl::createFromFile<TurtleMmapParser<Tokenizer>>(
-    const string& filename);
-template void IndexImpl::createFromFile<TurtleParserAuto>(
-    const string& filename);
-
 // _____________________________________________________________________________
-template <class Parser>
 IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary(
-    const string& filename, size_t linesPerPartial) {
-  LOG(INFO) << "Processing input triples from " << filename << " ..."
-            << std::endl;
-  auto parser = std::make_shared<Parser>(filename);
+    std::shared_ptr<TurtleParserBase> parser, size_t linesPerPartial) {
   parser->integerOverflowBehavior() = turtleParserIntegerOverflowBehavior_;
   parser->invalidLiteralsAreSkipped() = turtleParserSkipIllegalLiterals_;
   std::unique_ptr<TripleVec> idTriples(new TripleVec());
@@ -308,9 +297,7 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary(
                     << std::endl;
       }
 
-      if constexpr (requires(Parser p) { p.printAndResetQueueStatistics(); }) {
-        parser->printAndResetQueueStatistics();
-      }
+      parser->printAndResetQueueStatistics();
     }
 
     // localWriter.finish();
@@ -917,20 +904,20 @@ LangtagAndTriple IndexImpl::tripleToInternalRepresentation(
     TurtleTriple&& triple) const {
   LangtagAndTriple result{"", {}};
   auto& resultTriple = result._triple;
-  resultTriple[0] = std::move(triple._subject);
-  resultTriple[1] = std::move(triple._predicate);
+  resultTriple[0] = std::move(triple.subject_);
+  resultTriple[1] = std::move(triple.predicate_);
 
   // If the object of the triple can be directly folded into an ID, do so. Note
   // that the actual folding is done by the `TripleComponent`.
-  std::optional<Id> idIfNotString = triple._object.toValueIdIfNotString();
+  std::optional<Id> idIfNotString = triple.object_.toValueIdIfNotString();
 
   // TODO<joka921> The following statement could be simplified by a helper
   // function "optionalCast";
   if (idIfNotString.has_value()) {
     resultTriple[2] = idIfNotString.value();
   } else {
     // `toRdfLiteral` handles literals as well as IRIs correctly.
-    resultTriple[2] = std::move(triple._object).toRdfLiteral();
+    resultTriple[2] = std::move(triple.object_).toRdfLiteral();
   }
 
   for (size_t i = 0; i < 3; ++i) {
@@ -954,7 +941,6 @@ LangtagAndTriple IndexImpl::tripleToInternalRepresentation(
 }
 
 // ___________________________________________________________________________
-template <class Parser>
 void IndexImpl::readIndexBuilderSettingsFromFile() {
   json j;  // if we have no settings, we still have to initialize some default
            // values
@@ -1018,20 +1004,17 @@ void IndexImpl::readIndexBuilderSettingsFromFile() {
     configurationJson_["languages-internal"] = j["languages-internal"];
   }
   if (j.count("ascii-prefixes-only")) {
-    if constexpr (std::is_same_v<std::decay_t<Parser>, TurtleParserAuto>) {
-      bool v{j["ascii-prefixes-only"]};
-      if (v) {
-        LOG(INFO) << WARNING_ASCII_ONLY_PREFIXES << std::endl;
-        onlyAsciiTurtlePrefixes_ = true;
-      } else {
-        onlyAsciiTurtlePrefixes_ = false;
-      }
-    } else {
-      LOG(WARN) << "You specified the ascii-prefixes-only but a parser that is "
-                   "not the Turtle stream parser. This means that this setting "
-                   "is ignored."
-                << std::endl;
-    }
+    onlyAsciiTurtlePrefixes_ = static_cast<bool>(j["ascii-prefixes-only"]);
+  }
+  if (onlyAsciiTurtlePrefixes_) {
+    LOG(INFO) << WARNING_ASCII_ONLY_PREFIXES << std::endl;
+  }
+
+  if (j.count("parallel-parsing")) {
+    useParallelParser_ = static_cast<bool>(j["parallel-parsing"]);
+  }
+  if (useParallelParser_) {
+    LOG(INFO) << WARNING_PARALLEL_PARSING << std::endl;
   }
 
   if (j.count("num-triples-per-batch")) {

diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h
@@ -109,6 +109,7 @@ class IndexImpl {
   string onDiskBase_;
   string settingsFileName_;
   bool onlyAsciiTurtlePrefixes_ = false;
+  bool useParallelParser_ = true;
   TurtleParserIntegerOverflowBehavior turtleParserIntegerOverflowBehavior_ =
       TurtleParserIntegerOverflowBehavior::Error;
   bool turtleParserSkipIllegalLiterals_ = false;
@@ -204,7 +205,6 @@ class IndexImpl {
   // Will write vocabulary and on-disk index data.
   // !! The index can not directly be used after this call, but has to be setup
   // by createFromOnDiskIndex after this call.
-  template <class Parser>
   void createFromFile(const string& filename);
 
   void addPatternsToExistingIndex();
@@ -426,13 +426,12 @@ class IndexImpl {
   // permutations. Member vocab_ will be empty after this because it is not
   // needed for index creation once the TripleVec is set up and it would be a
   // waste of RAM.
-  template <class Parser>
-  IndexBuilderDataAsPsoSorter createIdTriplesAndVocab(const string& ntFile);
+  IndexBuilderDataAsPsoSorter createIdTriplesAndVocab(
+      std::shared_ptr<TurtleParserBase> parser);
 
   // ___________________________________________________________________
-  template <class Parser>
-  IndexBuilderDataAsStxxlVector passFileForVocabulary(const string& ntFile,
-                                                      size_t linesPerPartial);
+  IndexBuilderDataAsStxxlVector passFileForVocabulary(
+      std::shared_ptr<TurtleParserBase> parser, size_t linesPerPartial);
 
   /**
    * @brief Everything that has to be done when we have seen all the triples
@@ -617,7 +616,6 @@ class IndexImpl {
   void readConfiguration();
 
   // initialize the index-build-time settings for the vocabulary
-  template <class Parser>
   void readIndexBuilderSettingsFromFile();
 
   /**