Speed up first phase of index build (ad-freiburg#1124)

The first phase of the index build (where the input triples are parser and the partial vocabularies are built) is now more than twice as fast. The main change is that the space for the hash maps for the partial vocabulary (one per batch) is now reused (via a new special-purpose class CachingMemoyResource.). There are also various other small improvements. For example, various unnecessary string copies from the previous code are now removed. At one place, multiple hash maps are converted to a large vector, which is now parallelized. IMPORTANT: To make the best use of these improvements, when feeding input files compressed with BZ2, use lbzcat -n 4 instead of bzcat in order to decompress in parallel (the argument of the -n is the number of decompressor threads).
Flixtastic · Nov 9, 2023 · 4cc4e7c · 4cc4e7c
1 parent cf2d0b1
commit 4cc4e7c
Show file tree

Hide file tree

Showing 21 changed files with 767 additions and 347 deletions.
diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h
@@ -28,12 +28,12 @@ static const size_t PARSER_BATCH_SIZE = 1'000'000;
 // That many triples does the turtle parser have to buffer before the call to
 // getline returns (unless our input reaches EOF). This makes parsing from
 // streams faster.
-static const size_t PARSER_MIN_TRIPLES_AT_ONCE = 100'000;
+static const size_t PARSER_MIN_TRIPLES_AT_ONCE = 10'000;
 
 // When reading from a file, Chunks of this size will
-// be fed to the parser at once (100 MiB)
+// be fed to the parser at once (10 MiB)
 inline std::atomic<size_t>& FILE_BUFFER_SIZE() {
-  static std::atomic<size_t> fileBufferSize = 100 * (1ul << 20);
+  static std::atomic<size_t> fileBufferSize = 10 * (1ul << 20);
   return fileBufferSize;
 }
 
@@ -63,11 +63,11 @@ static const std::string TMP_BASENAME_COMPRESSION =
 // unique elements of the vocabulary are identified via hash maps. Typically, 6
 // is a good value. On systems with very few CPUs, a lower value might be
 // beneficial.
-constexpr size_t NUM_PARALLEL_ITEM_MAPS = 6;
+constexpr size_t NUM_PARALLEL_ITEM_MAPS = 10;
 
 // The number of threads that are parsing in parallel, when the parallel Turtle
 // parser is used.
-constexpr size_t NUM_PARALLEL_PARSER_THREADS = 5;
+constexpr size_t NUM_PARALLEL_PARSER_THREADS = 8;
 
 // Increasing the following two constants increases the RAM usage without much
 // benefit to the performance.

diff --git a/src/index/IndexBuilderTypes.h b/src/index/IndexBuilderTypes.h
@@ -1,52 +1,54 @@
 //  Copyright 2020, University of Freiburg,
-//  Chair of Algorithms and Data Structures.
+//                  Chair of Algorithms and Data Structures.
 //  Author: Johannes Kalmbach <[email protected]>
 
 // Common classes / Typedefs that are used during Index Creation
 
-#include "../global/Constants.h"
-#include "../global/Id.h"
-#include "../util/Conversions.h"
-#include "../util/HashMap.h"
-#include "../util/Serializer/Serializer.h"
-#include "../util/TupleHelpers.h"
-#include "../util/TypeTraits.h"
-#include "./ConstantsIndexBuilding.h"
-#include "./StringSortComparator.h"
-
 #ifndef QLEVER_INDEXBUILDERTYPES_H
 #define QLEVER_INDEXBUILDERTYPES_H
 
+#include <memory_resource>
+
+#include "global/Constants.h"
+#include "global/Id.h"
+#include "index/ConstantsIndexBuilding.h"
+#include "index/StringSortComparator.h"
+#include "util/Conversions.h"
+#include "util/HashMap.h"
+#include "util/Serializer/Serializer.h"
+#include "util/TupleHelpers.h"
+#include "util/TypeTraits.h"
+
 // An IRI or a literal together with the information, whether it should be part
 // of the external vocabulary
 struct PossiblyExternalizedIriOrLiteral {
   PossiblyExternalizedIriOrLiteral(std::string iriOrLiteral,
                                    bool isExternal = false)
-      : _iriOrLiteral{std::move(iriOrLiteral)}, _isExternal{isExternal} {}
+      : iriOrLiteral_{std::move(iriOrLiteral)}, isExternal_{isExternal} {}
   PossiblyExternalizedIriOrLiteral() = default;
-  std::string _iriOrLiteral;
-  bool _isExternal = false;
+  std::string iriOrLiteral_;
+  bool isExternal_ = false;
 
   AD_SERIALIZE_FRIEND_FUNCTION(PossiblyExternalizedIriOrLiteral) {
-    serializer | arg._iriOrLiteral;
-    serializer | arg._isExternal;
+    serializer | arg.iriOrLiteral_;
+    serializer | arg.isExternal_;
   }
 };
 
 struct TripleComponentWithIndex {
-  std::string _iriOrLiteral;
-  bool _isExternal = false;
-  uint64_t _index = 0;
+  std::string iriOrLiteral_;
+  bool isExternal_ = false;
+  uint64_t index_ = 0;
 
-  [[nodiscard]] const auto& isExternal() const { return _isExternal; }
-  [[nodiscard]] auto& isExternal() { return _isExternal; }
-  [[nodiscard]] const auto& iriOrLiteral() const { return _iriOrLiteral; }
-  [[nodiscard]] auto& iriOrLiteral() { return _iriOrLiteral; }
+  [[nodiscard]] const auto& isExternal() const { return isExternal_; }
+  [[nodiscard]] auto& isExternal() { return isExternal_; }
+  [[nodiscard]] const auto& iriOrLiteral() const { return iriOrLiteral_; }
+  [[nodiscard]] auto& iriOrLiteral() { return iriOrLiteral_; }
 
   AD_SERIALIZE_FRIEND_FUNCTION(TripleComponentWithIndex) {
-    serializer | arg._iriOrLiteral;
-    serializer | arg._isExternal;
-    serializer | arg._index;
+    serializer | arg.iriOrLiteral_;
+    serializer | arg.isExternal_;
+    serializer | arg.index_;
   }
 };
 
@@ -64,13 +66,68 @@ inline Triple makeTriple(std::array<std::string, 3>&& t) {
 
 /// The index of a word and the corresponding `SplitVal`.
 struct LocalVocabIndexAndSplitVal {
-  uint64_t m_id;
-  TripleComponentComparator::SplitVal m_splitVal;
+  uint64_t id_;
+  TripleComponentComparator::SplitValNonOwningWithSortKey splitVal_;
+};
+
+// During the first phase of the index building we use hash maps from strings
+// (entries in the vocabulary) to their `LocalVocabIndexAndSplitVal` (see
+// above). In the hash map we will only store pointers (`string_view` as the
+// key, and the `LocalVocabIndexAndSplitVal` also is a non-owning pointer type)
+// and manage the memory separately, s.t. we can deallocate all the strings of a
+// single phase at once as soon as we are finished with them.
+
+// Allocator type for the hash map
+using ItemAlloc = std::pmr::polymorphic_allocator<
+    std::pair<const std::string_view, LocalVocabIndexAndSplitVal>>;
+
+// The actual hash map type.
+using ItemMap = ad_utility::HashMap<
+    std::string_view, LocalVocabIndexAndSplitVal,
+    absl::container_internal::hash_default_hash<std::string_view>,
+    absl::container_internal::hash_default_eq<std::string_view>, ItemAlloc>;
+
+// A vector that stores the same values as the hash map.
+using ItemVec =
+    std::vector<std::pair<std::string_view, LocalVocabIndexAndSplitVal>>;
+
+// A buffer that very efficiently handles a set of strings that is deallocated
+// at once when the buffer goes out of scope.
+class MonotonicBuffer {
+  std::unique_ptr<std::pmr::monotonic_buffer_resource> buffer_ =
+      std::make_unique<std::pmr::monotonic_buffer_resource>();
+  std::unique_ptr<std::pmr::polymorphic_allocator<char>> charAllocator_ =
+      std::make_unique<std::pmr::polymorphic_allocator<char>>(buffer_.get());
+
+ public:
+  // Access to the underlying allocator.
+  std::pmr::polymorphic_allocator<char>& charAllocator() {
+    return *charAllocator_;
+  }
+  // Append a string to the buffer and return a `string_view` that points into
+  // the buffer.
+  std::string_view addString(std::string_view input) {
+    auto ptr = charAllocator_->allocate(input.size());
+    std::ranges::copy(input, ptr);
+    return {ptr, ptr + input.size()};
+  }
+};
+
+// The hash map (which only stores pointers) together with the `MonotonicBuffer`
+// that manages the actual strings.
+struct ItemMapAndBuffer {
+  ItemMap map_;
+  MonotonicBuffer buffer_;
+
+  explicit ItemMapAndBuffer(ItemAlloc alloc) : map_{alloc} {}
+  ItemMapAndBuffer(ItemMapAndBuffer&&) noexcept = default;
+  // We have to delete the move-assignment as it would have the wrong semantics
+  // (the monotonic buffer wouldn't be moved, this is one of the oddities of the
+  // `std::pmr` types.
+  ItemMapAndBuffer& operator=(ItemMapAndBuffer&&) noexcept = delete;
 };
 
-using ItemMap = ad_utility::HashMap<std::string, LocalVocabIndexAndSplitVal>;
-using ItemMapArray = std::array<ItemMap, NUM_PARALLEL_ITEM_MAPS>;
-using ItemVec = std::vector<std::pair<std::string, LocalVocabIndexAndSplitVal>>;
+using ItemMapArray = std::array<ItemMapAndBuffer, NUM_PARALLEL_ITEM_MAPS>;
 
 /**
  * Manage a HashMap of string->Id to create unique Ids for strings.
@@ -80,14 +137,13 @@ using ItemVec = std::vector<std::pair<std::string, LocalVocabIndexAndSplitVal>>;
 // Align each ItemMapManager on its own cache line to avoid false sharing.
 struct alignas(256) ItemMapManager {
   /// Construct by assigning the minimum ID that should be returned by the map.
-  explicit ItemMapManager(uint64_t minId, const TripleComponentComparator* cmp)
-      : _map(), _minId(minId), m_comp(cmp) {}
-  /// Minimum Id is 0
-  ItemMapManager() = default;
+  explicit ItemMapManager(uint64_t minId, const TripleComponentComparator* cmp,
+                          ItemAlloc alloc)
+      : map_(alloc), minId_(minId), comparator_(cmp) {}
 
   /// Move the held HashMap out as soon as we are done inserting and only need
-  /// the actual vocabulary
-  ItemMap&& moveMap() && { return std::move(_map); }
+  /// the actual vocabulary.
+  ItemMapAndBuffer&& moveMap() && { return std::move(map_); }
 
   /// If the key was seen before, return its preassigned ID. Else assign the
   /// next free ID to the string, store and return it.
@@ -96,40 +152,47 @@ struct alignas(256) ItemMapManager {
       return std::get<Id>(keyOrId);
     }
     const auto& key = std::get<PossiblyExternalizedIriOrLiteral>(keyOrId);
-    if (!_map.count(key._iriOrLiteral)) {
-      uint64_t res = _map.size() + _minId;
-      _map[key._iriOrLiteral] = {
-          res, m_comp->extractAndTransformComparable(
-                   key._iriOrLiteral, TripleComponentComparator::Level::TOTAL,
-                   key._isExternal)};
+    auto& map = map_.map_;
+    auto& buffer = map_.buffer_;
+    auto it = map.find(key.iriOrLiteral_);
+    if (it == map.end()) {
+      uint64_t res = map.size() + minId_;
+      // We have to first add the string to the buffer, otherwise we don't have
+      // a persistent `string_view` to add to the `map`.
+      auto keyView = buffer.addString(key.iriOrLiteral_);
+      map.try_emplace(
+          keyView, LocalVocabIndexAndSplitVal{
+                       res, comparator_->extractAndTransformComparableNonOwning(
+                                key.iriOrLiteral_,
+                                TripleComponentComparator::Level::TOTAL,
+                                key.isExternal_, &buffer.charAllocator())});
       return Id::makeFromVocabIndex(VocabIndex::make(res));
     } else {
-      return Id::makeFromVocabIndex(
-          VocabIndex::make(_map[key._iriOrLiteral].m_id));
+      return Id::makeFromVocabIndex(VocabIndex::make(it->second.id_));
     }
   }
 
   /// call getId for each of the Triple elements.
   std::array<Id, 3> getId(const Triple& t) {
     return {getId(t[0]), getId(t[1]), getId(t[2])};
   }
-  ItemMap _map;
-  uint64_t _minId = 0;
-  const TripleComponentComparator* m_comp = nullptr;
+  ItemMapAndBuffer map_;
+  uint64_t minId_ = 0;
+  const TripleComponentComparator* comparator_ = nullptr;
 };
 
 /// Combines a triple (three strings) together with the (possibly empty)
 /// language tag of its object.
 struct LangtagAndTriple {
-  std::string _langtag;
-  Triple _triple;
+  std::string langtag_;
+  Triple triple_;
 };
 
 /**
  * @brief Get the tuple of lambda functions that is needed for the String-> Id
  * step of the Index building Pipeline
  *
- * return a tuple of <Parallelism> lambda functions, each lambda does the
+ * return a tuple of <NumThreads> lambda functions, each lambda does the
  * following
  *
  * given an index idx, returns a lambda that
@@ -153,19 +216,26 @@ struct LangtagAndTriple {
  * ranges for the individual HashMaps
  * @return A Tuple of lambda functions (see above)
  */
-template <size_t Parallelism>
-auto getIdMapLambdas(std::array<ItemMapManager, Parallelism>* itemArrayPtr,
-                     size_t maxNumberOfTriples,
-                     const TripleComponentComparator* comp) {
+template <size_t NumThreads>
+auto getIdMapLambdas(
+    std::array<std::optional<ItemMapManager>, NumThreads>* itemArrayPtr,
+    size_t maxNumberOfTriples, const TripleComponentComparator* comp,
+    auto* indexPtr, ItemAlloc alloc) {
   // that way the different ids won't interfere
   auto& itemArray = *itemArrayPtr;
-  for (size_t j = 0; j < Parallelism; ++j) {
-    itemArray[j] = ItemMapManager(j * 100 * maxNumberOfTriples, comp);
+  for (size_t j = 0; j < NumThreads; ++j) {
+    itemArray[j].emplace(j * 100 * maxNumberOfTriples, comp, alloc);
+    // This `reserve` is for a guaranteed upper bound that stays the same during
+    // the whole index building. That's why we use the `CachingMemoryResource`
+    // as an underlying memory pool for the allocator of the hash map to make
+    // the allocation and deallocation of these hash maps (that are newly
+    // created for each batch) much cheaper (see `CachingMemoryResource.h` and
+    // `IndexImpl.cpp`).
+    itemArray[j]->map_.map_.reserve(5 * maxNumberOfTriples / NumThreads);
     // The LANGUAGE_PREDICATE gets the first ID in each map. TODO<joka921>
     // This is not necessary for the actual QLever code, but certain unit tests
     // currently fail without it.
-    itemArray[j].getId(LANGUAGE_PREDICATE);
-    itemArray[j]._map.reserve(2 * maxNumberOfTriples);
+    itemArray[j]->getId(LANGUAGE_PREDICATE);
   }
   using OptionalIds = std::array<std::optional<std::array<Id, 3>>, 3>;
 
@@ -177,23 +247,24 @@ auto getIdMapLambdas(std::array<ItemMapManager, Parallelism>* itemArrayPtr,
    * tag)
    * - All Ids are assigned according to itemArray[idx]
    */
-  const auto itemMapLamdaCreator = [&itemArray](const size_t idx) {
-    return [&map = itemArray[idx]](LangtagAndTriple&& lt) {
+  const auto itemMapLamdaCreator = [&itemArray, indexPtr](const size_t idx) {
+    return [&map = *itemArray[idx], indexPtr](ad_utility::Rvalue auto&& tr) {
+      auto lt = indexPtr->tripleToInternalRepresentation(AD_FWD(tr));
       OptionalIds res;
       // get Ids for the actual triple and store them in the result.
-      res[0] = map.getId(lt._triple);
-      if (!lt._langtag.empty()) {  // the object of the triple was a literal
+      res[0] = map.getId(lt.triple_);
+      if (!lt.langtag_.empty()) {  // the object of the triple was a literal
                                    // with a language tag
         // get the Id for the corresponding langtag Entity
         auto langTagId =
-            map.getId(ad_utility::convertLangtagToEntityUri(lt._langtag));
+            map.getId(ad_utility::convertLangtagToEntityUri(lt.langtag_));
         // get the Id for the tagged predicate, e.g. @en@rdfs:label
         auto langTaggedPredId =
             map.getId(ad_utility::convertToLanguageTaggedPredicate(
-                std::get<PossiblyExternalizedIriOrLiteral>(lt._triple[1])
-                    ._iriOrLiteral,
-                lt._langtag));
-        auto& spoIds = *(res[0]);  // ids of original triple
+                std::get<PossiblyExternalizedIriOrLiteral>(lt.triple_[1])
+                    .iriOrLiteral_,
+                lt.langtag_));
+        auto& spoIds = *res[0];  // ids of original triple
         // TODO replace the std::array by an explicit IdTriple class,
         //  then the emplace calls don't need the explicit type.
         // extra triple <subject> @language@<predicate> <object>
@@ -210,8 +281,7 @@ auto getIdMapLambdas(std::array<ItemMapManager, Parallelism>* itemArrayPtr,
   // setup a tuple with one lambda function per map in the itemArray
   // (the first lambda will assign ids according to itemArray[1]...
   auto itemMapLambdaTuple =
-      ad_tuple_helpers::setupTupleFromCallable<Parallelism>(
-          itemMapLamdaCreator);
+      ad_tuple_helpers::setupTupleFromCallable<NumThreads>(itemMapLamdaCreator);
   return itemMapLambdaTuple;
 }
 #endif  // QLEVER_INDEXBUILDERTYPES_H