Skip to content

Commit

Permalink
ql:contains-word now can show the score of the word match in the re…
Browse files Browse the repository at this point in the history
…spective text (ad-freiburg#1397)

The fulltext index of QLever has always been able to associate the occurrence of a word in a text with a score. 
This PR adds the functionality to actually retrieve this score and to use it in the remainder of the query.
Currently the score is bound to a variable whose name is automatically determined from the involved literals and variables. The easiest way to discover the names of these variables is to use `SELECT *` or to look at the runtime information tree.
  • Loading branch information
Flixtastic authored Dec 16, 2024
1 parent 27f4799 commit a97905e
Show file tree
Hide file tree
Showing 20 changed files with 400 additions and 98 deletions.
33 changes: 24 additions & 9 deletions e2e/scientists_queries.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,31 +55,43 @@ queries:
?t ql:contains-word "RElaT* phySIKalische rela*"
}
checks:
- num_cols: 5
- selected: [ "?x", "?ql_score_t_var_x", "?t", "?ql_matchingword_t_relat", "?ql_matchingword_t_rela" ]
- num_cols: 8
- selected: [ "?x", "?ql_score_t_var_x", "?t", "?ql_score_prefix_t_RElaT", "?ql_matchingword_t_relat", "?ql_score_word_t_phySIKalische", "?ql_score_prefix_t_rela", "?ql_matchingword_t_rela" ]
- contains_row:
- "<Albert_Einstein>"
- null
- null
- null
- "relationship"
- null
- null
- "relationship"
- contains_row:
- "<Albert_Einstein>"
- null
- null
- null
- "relationship"
- null
- null
- "relativity"
- contains_row:
- "<Albert_Einstein>"
- null
- null
- null
- "relativity"
- null
- null
- "relationship"
- contains_row:
- "<Albert_Einstein>"
- null
- null
- null
- "relativity"
- null
- null
- "relativity"

- query: algo-star-female-scientists
Expand Down Expand Up @@ -151,7 +163,7 @@ queries:
}
TEXTLIMIT 2
checks:
- num_cols: 7
- num_cols: 9
- num_rows: 18

- query: algor-star-female-born-before-1940
Expand Down Expand Up @@ -192,7 +204,7 @@ queries:
}
ORDER BY DESC(?ql_score_text_fixedEntity__60_Ada_95_Lovelace_62_)
checks:
- num_cols: 5
- num_cols: 6
- num_rows: 7
- contains_row:
- "<Ada_Lovelace>"
Expand All @@ -202,6 +214,7 @@ queries:
Charles Babbage, also known as' the father of computers', and in
particular, Babbage's work on the Analytical Engine."
- null
- null
- "relationship"
- order_numeric: {"dir": "DESC",
"var" : "?ql_score_text_fixedEntity__60_Ada_95_Lovelace_62_"}
Expand All @@ -219,7 +232,7 @@ queries:
ORDER BY DESC(?ql_score_text_fixedEntity__60_Ada_95_Lovelace_62_)
TEXTLIMIT 2
checks:
- num_cols: 5
- num_cols: 6
- num_rows: 3
- contains_row:
- "<Ada_Lovelace>"
Expand All @@ -229,6 +242,7 @@ queries:
Charles Babbage, also known as' the father of computers', and in
particular, Babbage's work on the Analytical Engine."
- null
- null
- "relationship"
- order_numeric: {"dir": "DESC",
"var" : "?ql_score_text_fixedEntity__60_Ada_95_Lovelace_62_"}
Expand All @@ -246,7 +260,7 @@ queries:
}
TEXTLIMIT 1
checks:
- num_cols: 6
- num_cols: 7
- num_rows: 2
- contains_row:
- "<Ada_Lovelace>"
Expand All @@ -255,6 +269,7 @@ queries:
with Somerville to visit Babbage as often as she could."
- null
- null
- null
- "relationship"


Expand Down Expand Up @@ -1391,10 +1406,10 @@ queries:
?t ql:contains-word "algo* herm* primary"
}
checks:
- num_cols: 5
- num_cols: 8
- num_rows: 1
- selected: [ "?x", "?ql_score_t_var_x", "?t", "?ql_matchingword_t_algo", "?ql_matchingword_t_herm" ]
- contains_row: [ "<Grete_Hermann>",null,"Hermann's algorithm for primary decomposition is still in use now.","algorithm","hermann" ]
- selected: [ "?x", "?ql_score_t_var_x", "?t", "?ql_score_prefix_t_algo", "?ql_matchingword_t_algo", "?ql_score_prefix_t_herm", "?ql_matchingword_t_herm", "?ql_score_word_t_primary" ]
- contains_row: [ "<Grete_Hermann>",null,"Hermann's algorithm for primary decomposition is still in use now.",null,"algorithm",null,"hermann",null ]


- query : select_asterisk_regex-lastname-stein
Expand Down
4 changes: 2 additions & 2 deletions src/engine/QueryPlanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1002,14 +1002,14 @@ QueryPlanner::SubtreePlan QueryPlanner::getTextLeafPlan(
: *(node._variables.begin());
plan = makeSubtreePlan<TextIndexScanForEntity>(_qec, cvar, evar, word);
textLimits[cvar].entityVars_.push_back(evar);
textLimits[cvar].scoreVars_.push_back(cvar.getScoreVariable(evar));
textLimits[cvar].scoreVars_.push_back(cvar.getEntityScoreVariable(evar));
} else {
// Fixed entity case
AD_CORRECTNESS_CHECK(node._variables.size() == 1);
plan = makeSubtreePlan<TextIndexScanForEntity>(
_qec, cvar, node.triple_.o_.toString(), word);
textLimits[cvar].scoreVars_.push_back(
cvar.getScoreVariable(node.triple_.o_.toString()));
cvar.getEntityScoreVariable(node.triple_.o_.toString()));
}
} else {
plan = makeSubtreePlan<TextIndexScanForWord>(_qec, cvar, word);
Expand Down
4 changes: 2 additions & 2 deletions src/engine/TextIndexScanForEntity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ VariableToColumnMap TextIndexScanForEntity::computeVariableToColumnMap() const {
};
addDefinedVar(textRecordVar_);
if (hasFixedEntity()) {
addDefinedVar(textRecordVar_.getScoreVariable(fixedEntity()));
addDefinedVar(textRecordVar_.getEntityScoreVariable(fixedEntity()));
} else {
addDefinedVar(entityVariable());
addDefinedVar(textRecordVar_.getScoreVariable(entityVariable()));
addDefinedVar(textRecordVar_.getEntityScoreVariable(entityVariable()));
}
return vcmap;
}
Expand Down
14 changes: 7 additions & 7 deletions src/engine/TextIndexScanForWord.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,12 @@ ProtoResult TextIndexScanForWord::computeResult(
IdTable idTable = getExecutionContext()->getIndex().getWordPostingsForTerm(
word_, getExecutionContext()->getAllocator());

// This filters out the word column. When the searchword is a prefix this
// column shows the word the prefix got extended to
if (!isPrefix_) {
IdTable smallIdTable{getExecutionContext()->getAllocator()};
smallIdTable.setNumColumns(1);
smallIdTable.resize(idTable.numRows());
ql::ranges::copy(idTable.getColumn(0), smallIdTable.getColumn(0).begin());

return {std::move(smallIdTable), resultSortedOn(), LocalVocab{}};
using CI = ColumnIndex;
idTable.setColumnSubset(std::array{CI{0}, CI{2}});
return {std::move(idTable), resultSortedOn(), LocalVocab{}};
}

// Add details to the runtimeInfo. This is has no effect on the result.
Expand All @@ -46,12 +45,13 @@ VariableToColumnMap TextIndexScanForWord::computeVariableToColumnMap() const {
addDefinedVar(textRecordVar_.getMatchingWordVariable(
std::string_view(word_).substr(0, word_.size() - 1)));
}
addDefinedVar(textRecordVar_.getWordScoreVariable(word_, isPrefix_));
return vcmap;
}

// _____________________________________________________________________________
size_t TextIndexScanForWord::getResultWidth() const {
return 1 + (isPrefix_ ? 1 : 0);
return 2 + (isPrefix_ ? 1 : 0);
}

// _____________________________________________________________________________
Expand Down
7 changes: 5 additions & 2 deletions src/index/FTSAlgorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,21 @@
// _____________________________________________________________________________
IdTable FTSAlgorithms::filterByRange(const IdRange<WordVocabIndex>& idRange,
const IdTable& idTablePreFilter) {
AD_CONTRACT_CHECK(idTablePreFilter.numColumns() == 2);
AD_CONTRACT_CHECK(idTablePreFilter.numColumns() == 3);
LOG(DEBUG) << "Filtering " << idTablePreFilter.getColumn(0).size()
<< " elements by ID range...\n";

IdTable idTableResult{idTablePreFilter.getAllocator()};
idTableResult.setNumColumns(2);
idTableResult.setNumColumns(3);
idTableResult.resize(idTablePreFilter.getColumn(0).size());

decltype(auto) resultCidColumn = idTableResult.getColumn(0);
decltype(auto) resultWidColumn = idTableResult.getColumn(1);
decltype(auto) resultSidColumn = idTableResult.getColumn(2);
size_t nofResultElements = 0;
decltype(auto) preFilterCidColumn = idTablePreFilter.getColumn(0);
decltype(auto) preFilterWidColumn = idTablePreFilter.getColumn(1);
decltype(auto) preFilterSidColumn = idTablePreFilter.getColumn(2);
// TODO<C++23> Use views::zip.
for (size_t i = 0; i < preFilterWidColumn.size(); ++i) {
// TODO<joka921> proper Ids for the text stuff.
Expand All @@ -36,6 +38,7 @@ IdTable FTSAlgorithms::filterByRange(const IdRange<WordVocabIndex>& idRange,
preFilterWidColumn[i].getWordVocabIndex() <= idRange.last()) {
resultCidColumn[nofResultElements] = preFilterCidColumn[i];
resultWidColumn[nofResultElements] = preFilterWidColumn[i];
resultSidColumn[nofResultElements] = preFilterSidColumn[i];
nofResultElements++;
}
}
Expand Down
5 changes: 5 additions & 0 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,11 @@ size_t Index::getNofEntityPostings() const {
return pimpl_->getNofEntityPostings();
}

// ____________________________________________________________________________
size_t Index::getNofNonLiteralsInTextIndex() const {
return pimpl_->getNofNonLiteralsInTextIndex();
}

// ____________________________________________________________________________
Index::NumNormalAndInternal Index::numDistinctSubjects() const {
return pimpl_->numDistinctSubjects();
Expand Down
1 change: 1 addition & 0 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ class Index {
size_t getNofTextRecords() const;
size_t getNofWordPostings() const;
size_t getNofEntityPostings() const;
size_t getNofNonLiteralsInTextIndex() const;

NumNormalAndInternal numDistinctSubjects() const;
NumNormalAndInternal numDistinctObjects() const;
Expand Down
25 changes: 20 additions & 5 deletions src/index/IndexImpl.Text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ cppcoro::generator<ContextFileParser::Line> IndexImpl::wordsInTextRecords(
if (!isLiteral(text)) {
continue;
}
ContextFileParser::Line entityLine{text, true, contextId, 1};
ContextFileParser::Line entityLine{text, true, contextId, 1, true};
co_yield entityLine;
std::string_view textView = text;
textView = textView.substr(0, textView.rfind('"'));
Expand Down Expand Up @@ -235,10 +235,12 @@ void IndexImpl::processWordsForInvertedLists(const string& contextFile,
ad_utility::HashMap<WordIndex, Score> wordsInContext;
ad_utility::HashMap<Id, Score> entitiesInContext;
auto currentContext = TextRecordIndex::make(0);
// The nofContexts can be misleading since it also counts empty contexts
size_t nofContexts = 0;
size_t nofWordPostings = 0;
size_t nofEntityPostings = 0;
size_t entityNotFoundErrorMsgCount = 0;
size_t nofLiterals = 0;

for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) {
if (line._contextId != currentContext) {
Expand All @@ -258,6 +260,9 @@ void IndexImpl::processWordsForInvertedLists(const string& contextFile,
// Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
// to be contiguous.
entitiesInContext[Id::makeFromVocabIndex(eid)] += line._score;
if (line._isLiteralEntity) {
++nofLiterals;
}
} else {
if (entityNotFoundErrorMsgCount < 20) {
LOG(WARN) << "Entity from text not in KB: " << line._word << '\n';
Expand Down Expand Up @@ -294,6 +299,10 @@ void IndexImpl::processWordsForInvertedLists(const string& contextFile,
textMeta_.setNofTextRecords(nofContexts);
textMeta_.setNofWordPostings(nofWordPostings);
textMeta_.setNofEntityPostings(nofEntityPostings);
nofNonLiteralsInTextIndex_ = nofContexts - nofLiterals;
configurationJson_["num-non-literals-text-index"] =
nofNonLiteralsInTextIndex_;
writeConfiguration();

writer.finish();
LOG(TRACE) << "END IndexImpl::passContextFileIntoVector" << std::endl;
Expand Down Expand Up @@ -415,7 +424,7 @@ ContextListMetaData IndexImpl::writePostings(ad_utility::File& out,

size_t n = 0;

WordToCodeMap wordCodeMap;
WordCodeMap wordCodeMap;
WordCodebook wordCodebook;
ScoreCodeMap scoreCodeMap;
ScoreCodebook scoreCodebook;
Expand Down Expand Up @@ -646,10 +655,11 @@ size_t IndexImpl::writeList(Numeric* data, size_t nofElements,

// _____________________________________________________________________________
void IndexImpl::createCodebooks(const vector<IndexImpl::Posting>& postings,
IndexImpl::WordToCodeMap& wordCodemap,
IndexImpl::WordCodeMap& wordCodemap,
IndexImpl::WordCodebook& wordCodebook,
IndexImpl::ScoreCodeMap& scoreCodemap,
IndexImpl::ScoreCodebook& scoreCodebook) const {
// There should be a more efficient way to do this (Felix Meisen)
ad_utility::HashMap<WordIndex, size_t> wfMap;
ad_utility::HashMap<Score, size_t> sfMap;
for (const auto& p : postings) {
Expand Down Expand Up @@ -718,7 +728,7 @@ std::string_view IndexImpl::wordIdToString(WordIndex wordIndex) const {
IdTable IndexImpl::readWordCl(
const TextBlockMetaData& tbmd,
const ad_utility::AllocatorWithLimit<Id>& allocator) const {
IdTable idTable{2, allocator};
IdTable idTable{3, allocator};
vector<TextRecordIndex> cids = readGapComprList<TextRecordIndex>(
tbmd._cl._nofElements, tbmd._cl._startContextlist,
static_cast<size_t>(tbmd._cl._startWordlist - tbmd._cl._startContextlist),
Expand All @@ -734,6 +744,11 @@ IdTable IndexImpl::readWordCl(
idTable.getColumn(1).begin(), [](WordIndex id) {
return Id::makeFromWordVocabIndex(WordVocabIndex::make(id));
});
std::ranges::transform(
readFreqComprList<Score>(tbmd._cl._nofElements, tbmd._cl._startScorelist,
static_cast<size_t>(tbmd._cl._lastByte + 1 -
tbmd._cl._startScorelist)),
idTable.getColumn(2).begin(), &Id::makeFromInt);
return idTable;
}

Expand Down Expand Up @@ -772,7 +787,7 @@ IdTable IndexImpl::getWordPostingsForTerm(
const ad_utility::AllocatorWithLimit<Id>& allocator) const {
LOG(DEBUG) << "Getting word postings for term: " << term << '\n';
IdTable idTable{allocator};
idTable.setNumColumns(term.ends_with('*') ? 2 : 1);
idTable.setNumColumns(term.ends_with('*') ? 3 : 2);
auto optionalTbmd = getTextBlockMetadataForWordOrPrefix(term);
if (!optionalTbmd.has_value()) {
return idTable;
Expand Down
1 change: 1 addition & 0 deletions src/index/IndexImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1128,6 +1128,7 @@ void IndexImpl::readConfiguration() {
loadDataMember("num-subjects", numSubjects_, NumNormalAndInternal{});
loadDataMember("num-objects", numObjects_, NumNormalAndInternal{});
loadDataMember("num-triples", numTriples_, NumNormalAndInternal{});
loadDataMember("num-non-literals-text-index", nofNonLiteralsInTextIndex_, 0);

// Initialize BlankNodeManager
uint64_t numBlankNodesTotal;
Expand Down
15 changes: 13 additions & 2 deletions src/index/IndexImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,11 @@ class IndexImpl {
NumNormalAndInternal numTriples_;
string indexId_;

// Keeps track of the number of nonLiteral contexts in the index this is used
// in the test retrieval of the texts. This only works reliably if the
// wordsFile.tsv starts with contextId 1 and is continuous.
size_t nofNonLiteralsInTextIndex_;

// Global static pointers to the currently active index and comparator.
// Those are used to compare LocalVocab entries with each other as well as
// with Vocab entries.
Expand Down Expand Up @@ -424,6 +429,9 @@ class IndexImpl {
size_t getNofEntityPostings() const {
return textMeta_.getNofEntityPostings();
}
size_t getNofNonLiteralsInTextIndex() const {
return nofNonLiteralsInTextIndex_;
}

bool hasAllPermutations() const { return SPO().isLoaded(); }

Expand Down Expand Up @@ -624,14 +632,17 @@ class IndexImpl {
ad_utility::File& file) const;

// TODO<joka921> understand what the "codes" are, are they better just ints?
typedef ad_utility::HashMap<WordIndex, CompressionCode> WordToCodeMap;
// After using createCodebooks on these types, the lowest codes refer to the
// most frequent WordIndex/Score. The maps are mapping those codes to their
// respective frequency.
typedef ad_utility::HashMap<WordIndex, CompressionCode> WordCodeMap;
typedef ad_utility::HashMap<Score, Score> ScoreCodeMap;
typedef vector<CompressionCode> WordCodebook;
typedef vector<Score> ScoreCodebook;

//! Creates codebooks for lists that are supposed to be entropy encoded.
void createCodebooks(const vector<Posting>& postings,
WordToCodeMap& wordCodemap, WordCodebook& wordCodebook,
WordCodeMap& wordCodemap, WordCodebook& wordCodebook,
ScoreCodeMap& scoreCodemap,
ScoreCodebook& scoreCodebook) const;

Expand Down
1 change: 1 addition & 0 deletions src/parser/ContextFileParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class ContextFileParser {
bool _isEntity;
TextRecordIndex _contextId;
Score _score;
bool _isLiteralEntity = false;
};

explicit ContextFileParser(const string& contextFile,
Expand Down
Loading

0 comments on commit a97905e

Please sign in to comment.