More consistent filenames and settings parameters (ad-freiburg#612)

Author: schlegan · Feb 24, 2022
Commit: a9f4961 (1 parent: 25d5d64)

Showing 15 changed files with 47 additions and 162 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -249,6 +249,3 @@ target_link_libraries(VocabularyMergerMain index ${CMAKE_THREAD_LIBS_INIT})
 
 add_executable(PermutationExporterMain src/index/PermutationExporterMain.cpp)
 target_link_libraries(PermutationExporterMain index ${CMAKE_THREAD_LIBS_INIT})
-
-add_executable(VocabularyConverterMain src/VocabularyConverterMain.cpp)
-target_link_libraries(VocabularyConverterMain index ${CMAKE_THREAD_LIBS_INIT})
diff --git a/e2e/e2e-build-settings.json b/e2e/e2e-build-settings.json
@@ -1,5 +1,5 @@
 {
-  "num-triples-per-partial-vocab" : 40000,
+  "num-triples-per-batch" : 40000,
   "parser-batch-size" : 1000,
   "ascii-prefixes-only":false
 }
diff --git a/examples/olympics.settings.json b/examples/olympics.settings.json
@@ -1,4 +1,4 @@
 {
   "ascii-prefixes-only": true,
-  "num-triples-per-partial-vocab": 50000000
+  "num-triples-per-batch": 50000000
 }
diff --git a/examples/scientists.settings.json b/examples/scientists.settings.json
@@ -1,4 +1,4 @@
 {
   "ascii-prefixes-only": true,
-  "num-triples-per-partial-vocab": 10000000
+  "num-triples-per-batch": 10000000
 }
diff --git a/examples/wikidata.settings.json b/examples/wikidata.settings.json
@@ -11,5 +11,5 @@
 	  "ignore-punctuation": true
   },
   "ascii-prefixes-only": true,
-  "num-triples-per-partial-vocab" : 50000000
+  "num-triples-per-batch" : 50000000
 }
diff --git a/master.Makefile b/master.Makefile
@@ -117,7 +117,7 @@ index:
 	fi
 
 remove_index:
-	rm -f $(DB).index.* $(DB).literals-index $(DB).vocabulary $(DB).prefixes $(DB).meta-data.json $(DB).index-log.txt
+	rm -f $(DB).index.* $(DB).vocabulary.* $(DB).prefixes $(DB).meta-data.json $(DB).index-log.txt
 
 # Create wordsfile and docsfile from all literals of the given NT file.
 # Using this as input for a SPARQL+Text index build will effectively enable
@@ -183,7 +183,7 @@ clear_unpinned:
 # STATISTICS on cache, memory, and the number of triples per predicate.
 
 disk_usage:
-	du -hc $(DB).index.* $(DB).literals-index $(DB).vocabulary $(DB).prefixes $(DB).meta-data.json
+	du -hc $(DB).index.* $(DB).vocabulary.* $(DB).prefixes $(DB).meta-data.json
 
 cachestats:
 	@curl -Gs $(QLEVER_API) --data-urlencode "cmd=cachestats" \

diff --git a/src/VocabularyConverterMain.cpp b/src/VocabularyConverterMain.cpp
diff --git a/src/VocabularyMergerMain.cpp b/src/VocabularyMergerMain.cpp
@@ -19,7 +19,7 @@ int main(int argc, char** argv) {
   size_t numFiles = atoi(argv[2]);
 
   VocabularyMerger m;
-  std::ofstream file(basename + ".vocabulary");
+  std::ofstream file(basename + INTERNAL_VOCAB_SUFFIX);
   AD_CHECK(file.is_open());
   auto internalVocabularyAction = [&file](const auto& word) {
     file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n';

diff --git a/src/global/Constants.h b/src/global/Constants.h
@@ -64,6 +64,8 @@ static const int DEFAULT_NOF_VALUE_EXPONENT_DIGITS = 20;
 static const int DEFAULT_NOF_VALUE_MANTISSA_DIGITS = 30;
 static const int DEFAULT_NOF_DATE_YEAR_DIGITS = 19;
 
+static const std::string INTERNAL_VOCAB_SUFFIX = ".vocabulary.internal";
+static const std::string EXTERNAL_VOCAB_SUFFIX = ".vocabulary.external";
 static const std::string MMAP_FILE_SUFFIX = ".meta";
 static const std::string CONFIGURATION_FILE = ".meta-data.json";
 static const std::string PREFIX_FILE = ".prefixes";

diff --git a/src/index/Index.Text.cpp b/src/index/Index.Text.cpp
@@ -140,8 +140,9 @@ void Index::passContextFileIntoVector(const string& contextFile,
   // initialized before
   _vocab = RdfsVocabulary{};
   readConfiguration();
-  _vocab.readFromFile(_onDiskBase + ".vocabulary",
-                      _onDiskLiterals ? _onDiskBase + ".literals-index" : "");
+  _vocab.readFromFile(
+      _onDiskBase + INTERNAL_VOCAB_SUFFIX,
+      _onDiskLiterals ? _onDiskBase + EXTERNAL_VOCAB_SUFFIX : "");
 
   TextVec::bufwriter_type writer(vec);
   ad_utility::HashMap<Id, Score> wordsInContext;

diff --git a/src/index/Index.cpp b/src/index/Index.cpp
@@ -34,11 +34,11 @@ Index::Index() : _usePatterns(false) {}
 template <class Parser>
 IndexBuilderDataAsPsoSorter Index::createIdTriplesAndVocab(
     const string& ntFile) {
-  auto vocabData =
-      passFileForVocabulary<Parser>(ntFile, _numTriplesPerPartialVocab);
+  auto indexBuilderData =
+      passFileForVocabulary<Parser>(ntFile, _numTriplesPerBatch);
   // first save the total number of words, this is needed to initialize the
   // dense IndexMetaData variants
-  _totalVocabularySize = vocabData.nofWords;
+  _totalVocabularySize = indexBuilderData.nofWords;
   LOG(DEBUG) << "Number of words in internal and external vocabulary: "
              << _totalVocabularySize << std::endl;
 
@@ -47,18 +47,18 @@ IndexBuilderDataAsPsoSorter Index::createIdTriplesAndVocab(
   if (_onDiskLiterals) {
     _vocab.externalizeLiteralsFromTextFile(
         _onDiskBase + EXTERNAL_LITS_TEXT_FILE_NAME,
-        _onDiskBase + ".literals-index");
+        _onDiskBase + EXTERNAL_VOCAB_SUFFIX);
   }
   deleteTemporaryFile(_onDiskBase + EXTERNAL_LITS_TEXT_FILE_NAME);
   // clear vocabulary to save ram (only information from partial binary files
   // used from now on). This will preserve information about externalized
   // Prefixes etc.
   _vocab.clear();
-  auto psoSorter = convertPartialToGlobalIds(*vocabData.idTriples,
-                                             vocabData.actualPartialSizes,
-                                             NUM_TRIPLES_PER_PARTIAL_VOCAB);
+  auto psoSorter = convertPartialToGlobalIds(
+      *indexBuilderData.idTriples, indexBuilderData.actualPartialSizes,
+      NUM_TRIPLES_PER_PARTIAL_VOCAB);
 
-  return {vocabData, std::move(psoSorter)};
+  return {indexBuilderData, std::move(psoSorter)};
 }
 namespace {
 // Return a lambda that takes a triple of IDs and returns true iff the predicate
@@ -117,14 +117,14 @@ void Index::createFromFile(const string& filename) {
   // If we have no compression, this will also copy the whole vocabulary.
   // but since we expect compression to be the default case, this  should not
   // hurt.
-  string vocabFile = _onDiskBase + ".vocabulary";
+  string vocabFile = _onDiskBase + INTERNAL_VOCAB_SUFFIX;
   string vocabFileTmp = _onDiskBase + ".vocabularyTmp";
   std::vector<string> prefixes;
   if (_vocabPrefixCompressed) {
     // We have to use the "normally" sorted vocabulary for the prefix
     // compression.
     std::string vocabFileForPrefixCalculation =
-        _onDiskBase + TMP_BASENAME_COMPRESSION + ".vocabulary";
+        _onDiskBase + TMP_BASENAME_COMPRESSION + INTERNAL_VOCAB_SUFFIX;
     prefixes = calculatePrefixes(vocabFileForPrefixCalculation,
                                  NUM_COMPRESSION_PREFIXES, 1, true);
     deleteTemporaryFile(vocabFileForPrefixCalculation);
@@ -346,7 +346,7 @@ IndexBuilderDataAsStxxlVector Index::passFileForVocabulary(
               << "(internal only) ..." << std::endl;
     VocabularyMerger m;
     std::ofstream compressionOutfile(_onDiskBase + TMP_BASENAME_COMPRESSION +
-                                     ".vocabulary");
+                                     INTERNAL_VOCAB_SUFFIX);
     AD_CHECK(compressionOutfile.is_open());
     auto internalVocabularyActionCompression =
         [&compressionOutfile](const auto& word) {
@@ -371,7 +371,7 @@ IndexBuilderDataAsStxxlVector Index::passFileForVocabulary(
       return (*cmp)(a, b, decltype(_vocab)::SortLevel::TOTAL);
     };
     auto wordWriter =
-        _vocab.makeUncompressingWordWriter(_onDiskBase + ".vocabulary");
+        _vocab.makeUncompressingWordWriter(_onDiskBase + INTERNAL_VOCAB_SUFFIX);
     auto internalVocabularyAction = [&wordWriter](const auto& word) {
       wordWriter.push(word.data(), word.size());
     };
@@ -649,8 +649,9 @@ void Index::addPatternsToExistingIndex() {
 void Index::createFromOnDiskIndex(const string& onDiskBase) {
   setOnDiskBase(onDiskBase);
   readConfiguration();
-  _vocab.readFromFile(_onDiskBase + ".vocabulary",
-                      _onDiskLiterals ? _onDiskBase + ".literals-index" : "");
+  _vocab.readFromFile(
+      _onDiskBase + INTERNAL_VOCAB_SUFFIX,
+      _onDiskLiterals ? _onDiskBase + EXTERNAL_VOCAB_SUFFIX : "");
 
   _totalVocabularySize = _vocab.size() + _vocab.getExternalVocab().size();
   LOG(DEBUG) << "Number of words in internal and external vocabulary: "
@@ -1043,11 +1044,10 @@ void Index::initializeVocabularySettingsBuild() {
     }
   }
 
-  if (j.count("num-triples-per-partial-vocab")) {
-    _numTriplesPerPartialVocab = size_t{j["num-triples-per-partial-vocab"]};
+  if (j.count("num-triples-per-batch")) {
+    _numTriplesPerBatch = size_t{j["num-triples-per-batch"]};
     LOG(INFO)
-        << "You specified \"num-triples-per-partial-vocab = "
-        << _numTriplesPerPartialVocab
+        << "You specified \"num-triples-per-batch = " << _numTriplesPerBatch
         << "\", choose a lower value if the index builder runs out of memory"
         << std::endl;
   }

diff --git a/src/index/Index.h b/src/index/Index.h
@@ -309,6 +309,10 @@ class Index {
 
   void setPrefixCompression(bool compressed);
 
+  void setNumTriplesPerBatch(uint64_t numTriplesPerBatch) {
+    _numTriplesPerBatch = numTriplesPerBatch;
+  }
+
   const string& getTextName() const { return _textMeta.getName(); }
 
   const string& getKbName() const { return _PSO.metaData().getName(); }
@@ -477,7 +481,7 @@ class Index {
   size_t _fullHasPredicateSize;
 
   size_t _parserBatchSize = PARSER_BATCH_SIZE;
-  size_t _numTriplesPerPartialVocab = NUM_TRIPLES_PER_PARTIAL_VOCAB;
+  size_t _numTriplesPerBatch = NUM_TRIPLES_PER_PARTIAL_VOCAB;
   /**
    * @brief Maps pattern ids to sets of predicate ids.
    */

diff --git a/test/GroupByTest.cpp b/test/GroupByTest.cpp
@@ -44,6 +44,7 @@ class GroupByTest : public ::testing::Test {
       _index.setKbName("group_by_test");
       _index.setTextName("group_by_test");
       _index.setOnDiskBase("group_ty_test");
+      _index.setNumTriplesPerBatch(2);
       _index.createFromFile<TurtleParserAuto>("group_by_test.nt");
       _index.addTextFromContextFile("group_by_test.words");
       _index.buildDocsDB("group_by_test.documents");

diff --git a/test/IndexTest.cpp b/test/IndexTest.cpp
@@ -71,6 +71,7 @@ TEST(IndexTest, createFromTsvTest) {
     {
       Index index;
       index.setOnDiskBase("_testindex");
+      index.setNumTriplesPerBatch(2);
       index.createFromFile<TsvParser>("_testtmp2.tsv");
     }
     Index index;
@@ -145,6 +146,7 @@ TEST(IndexTest, createFromTsvTest) {
     {
       Index index;
       index.setOnDiskBase("_testindex");
+      index.setNumTriplesPerBatch(2);
       index.createFromFile<TsvParser>("_testtmp2.tsv");
     }
     Index index;
@@ -243,6 +245,7 @@ TEST_F(CreatePatternsFixture, createPatterns) {
       Index index;
       index.setUsePatterns(true);
       index.setOnDiskBase("_testindex");
+      index.setNumTriplesPerBatch(2);
       index.createFromFile<TsvParser>("_testtmppatterns.tsv");
     }
     Index index;
@@ -307,6 +310,7 @@ TEST(IndexTest, createFromOnDiskIndexTest) {
   {
     Index indexPrim;
     indexPrim.setOnDiskBase("_testindex2");
+    indexPrim.setNumTriplesPerBatch(2);
     indexPrim.createFromFile<TsvParser>("_testtmp3.tsv");
   }
 
@@ -357,6 +361,7 @@ TEST(IndexTest, scanTest) {
     {
       Index index;
       index.setOnDiskBase("_testindex");
+      index.setNumTriplesPerBatch(2);
       index.createFromFile<TsvParser>("_testtmp2.tsv");
     }
 
@@ -442,6 +447,7 @@ TEST(IndexTest, scanTest) {
     {
       Index index;
       index.setOnDiskBase("_testindex");
+      index.setNumTriplesPerBatch(2);
       index.createFromFile<TsvParser>("_testtmp2.tsv");
     }
     Index index;