For small index scans, avoid spurious copy of whole block (ad-freiburg#1260)

For small index scans, which read only a part of a single block, QLever previously made a copy of the whole block, even when the block was already in the block cache. This spurious copy is now avoided, so that, when the block is already in the cache, only the requested part is copied. In particular, this makes full index scans much faster and also helps other queries with ?s ?p ?o where ?s is bound by the rest of the query. Note: There is potential for more performance gains for such scans, but we have other issues for now.
schlegan · Feb 6, 2024 · 60bf149 · 60bf149
1 parent ac9db29
commit 60bf149
Show file tree

Hide file tree

Showing 4 changed files with 42 additions and 28 deletions.
diff --git a/e2e/e2e.sh b/e2e/e2e.sh
@@ -111,7 +111,7 @@ fi
 # here because then we can't easily get the SERVER_PID out of that subshell
 pushd "$BINARY_DIR"
 echo "Launching server from path $(pwd)"
-./ServerMain -i "$INDEX" -p 9099 -m 1GB -t --default-query-timeout 500s &> server_log.txt &
+./ServerMain -i "$INDEX" -p 9099 -m 1GB -t --default-query-timeout 30s &> server_log.txt &
 SERVER_PID=$!
 popd
 

diff --git a/src/index/CompressedRelation.cpp b/src/index/CompressedRelation.cpp
@@ -423,13 +423,15 @@ DecompressedBlock CompressedRelationReader::readPossiblyIncompleteBlock(
       std::back_inserter(allColumns));
   // A block is uniquely identified by its start position in the file.
   auto cacheKey = blockMetadata.offsetsAndCompressedSize_.at(0).offsetInFile_;
-  DecompressedBlock block = blockCache_
-                                .computeOnce(cacheKey,
-                                             [&]() {
-                                               return readAndDecompressBlock(
-                                                   blockMetadata, allColumns);
-                                             })
-                                ._resultPointer->clone();
+  auto sharedResultFromCache =
+      blockCache_
+          .computeOnce(cacheKey,
+                       [&]() {
+                         return readAndDecompressBlock(blockMetadata,
+                                                       allColumns);
+                       })
+          ._resultPointer;
+  const DecompressedBlock& block = *sharedResultFromCache;
   const auto& col1Column = block.getColumn(0);
 
   // Find the range in the blockMetadata, that belongs to the same relation
@@ -452,17 +454,22 @@ DecompressedBlock CompressedRelationReader::readPossiblyIncompleteBlock(
     }
   }();
   auto numResults = subBlock.size();
-  block.erase(block.begin(),
-              block.begin() + (subBlock.begin() - col1Column.begin()));
-  block.resize(numResults);
-
+  auto beginIndex = subBlock.begin() - col1Column.begin();
+  auto endIndex = subBlock.end() - col1Column.begin();
+
+  DecompressedBlock result{columnIndices.size(), allocator_};
+  result.resize(numResults);
+  for (auto i : ad_utility::integerRange(columnIndices.size())) {
+    const auto& inputCol = block.getColumn(columnIndices[i]);
+    std::ranges::copy(inputCol.begin() + beginIndex,
+                      inputCol.begin() + endIndex, result.getColumn(i).begin());
+  }
   if (scanMetadata.has_value()) {
     auto& details = scanMetadata.value().get();
     ++details.numBlocksRead_;
-    details.numElementsRead_ += block.numRows();
+    details.numElementsRead_ += result.numRows();
   }
-  block.setColumnSubset(columnIndices);
-  return block;
+  return result;
 };
 
 // _____________________________________________________________________________

diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h
@@ -515,7 +515,10 @@ class CompressedRelationReader {
   // the `col1Id` doesn't match. For this to work, the block has to be one of
   // the blocks that actually store triples from the given `relationMetadata`'s
   // relation, else the behavior is undefined. Only return the columns specified
-  // by the `columnIndices`.
+  // by the `columnIndices`. Note: Do not call this function for blocks of which
+  // you know that you need them completely, as then this function wastes some
+  // time and space. It is only typically needed for the first and last block of
+  // certain scans.
   DecompressedBlock readPossiblyIncompleteBlock(
       const CompressedRelationMetadata& relationMetadata,
       std::optional<Id> col1Id, const CompressedBlockMetadata& blockMetadata,

diff --git a/test/engine/IndexScanTest.cpp b/test/engine/IndexScanTest.cpp
@@ -6,6 +6,7 @@
 
 #include "../IndexTestHelpers.h"
 #include "../util/GTestHelpers.h"
+#include "../util/IdTableHelpers.h"
 #include "engine/IndexScan.h"
 #include "parser/ParsedQuery.h"
 
@@ -323,24 +324,27 @@ TEST(IndexScan, additionalColumn) {
   auto qec = getQec("<x> <y> <z>.");
   using V = Variable;
   SparqlTriple triple{V{"?x"}, "<y>", V{"?z"}};
-  triple._additionalScanColumns.emplace_back(1, V{"?blib"});
-  triple._additionalScanColumns.emplace_back(0, V{"?blub"});
+  triple._additionalScanColumns.emplace_back(
+      ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN, V{"?xpattern"});
+  triple._additionalScanColumns.emplace_back(
+      ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN, V{"?ypattern"});
   auto scan = IndexScan{qec, Permutation::PSO, triple};
   ASSERT_EQ(scan.getResultWidth(), 4);
   auto col = makeAlwaysDefinedColumn;
   VariableToColumnMap expected = {{V{"?x"}, col(0)},
                                   {V{"?z"}, col(1)},
-                                  {V("?blib"), col(2)},
-                                  {V("?blub"), col(3)}};
+                                  {V("?xpattern"), col(2)},
+                                  {V("?ypattern"), col(3)}};
   ASSERT_THAT(scan.getExternallyVisibleVariableColumns(),
               ::testing::UnorderedElementsAreArray(expected));
   ASSERT_THAT(scan.getCacheKey(),
-              ::testing::ContainsRegex("Additional Columns: 1 0"));
-  // Executing such a query that has the same column multiple times is currently
-  // not supported and fails with an exception inside the `IdTable.h` module
-  // TODO<joka921> Add proper tests as soon as we can properly add additional
-  // columns. Maybe we cann add additional columns generically during the index
-  // build by adding a generic transformation function etc.
-  AD_EXPECT_THROW_WITH_MESSAGE(scan.computeResultOnlyForTesting(),
-                               ::testing::ContainsRegex("IdTable.h"));
+              ::testing::ContainsRegex("Additional Columns: 2 3"));
+  auto res = scan.computeResultOnlyForTesting();
+  auto getId = makeGetId(qec->getIndex());
+  auto I = IntId;
+  // <x> is the only subject, so it has pattern 0, <z> doesn't appear as a
+  // subject, so it has no pattern.
+  auto exp = makeIdTableFromVector(
+      {{getId("<x>"), getId("<z>"), I(0), I(NO_PATTERN)}});
+  EXPECT_THAT(res.idTable(), ::testing::ElementsAreArray(exp));
 }