Only one RAND() per group in a GROUP BY (ad-freiburg#1050)

Special treatment of `RAND()` when it is part of a `GROUP BY`, notably in a query of the form `... GROUP BY <some variables> ORDER BY RAND()`. We then expect a random order of the groups, and a new random order each time we repeat the query. In the code so far, this gave an error message. For Blazegraph, the order remains the same when repeating the query. This fixes ad-freiburg#1049.
schlegan · Aug 16, 2023 · d8039e1 · d8039e1
1 parent 4b81883
commit d8039e1
Show file tree

Hide file tree

Showing 5 changed files with 36 additions and 0 deletions.
diff --git a/e2e/scientists_queries.yaml b/e2e/scientists_queries.yaml
@@ -157,6 +157,22 @@ queries:
       - contains_row: [ 280, "<New_York_City>" ]
       - order_numeric: { "dir": "DESC", "var": "?count" }
 
+  - query: group-by-and-rand
+    type: no-text
+    sparql: |
+      SELECT (COUNT(?x) as ?count) ?place WHERE {
+          ?x <is-a> <Scientist> .
+          ?x <Place_of_birth> ?place .
+          ?x <Gender> <Female>
+      }
+      GROUP BY ?place
+      ORDER BY RAND()
+    checks:
+      - num_cols: 2
+      - num_rows : 638
+      - selected: [ "?count", "?place" ]
+      - contains_row: [ 27, "<New_York_City>" ]
+
 
   # TODO<joka> reintroduce order by DESC((COUNT(?x) as ?count)) as soon as
   #  ordering by an expression while grouping is implemented

diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp
@@ -253,6 +253,9 @@ void GroupBy::doGroupBy(const IdTable& dynInput,
       getInternallyVisibleVariableColumns();
   evaluationContext._previousResultsFromSameGroup.resize(getResultWidth());
 
+  // Let the evaluation know that we are part of a GROUP BY.
+  evaluationContext._isPartOfGroupBy = true;
+
   auto processNextBlock = [&](size_t blockStart, size_t blockEnd) {
     result.emplace_back();
     size_t rowIdx = result.size() - 1;

diff --git a/src/engine/sparqlExpressions/RandomExpression.h b/src/engine/sparqlExpressions/RandomExpression.h
@@ -22,6 +22,12 @@ class RandomExpression : public SparqlExpression {
     const size_t numElements = context->_endIndex - context->_beginIndex;
     result.reserve(numElements);
     FastRandomIntGenerator<int64_t> randInt;
+
+    // As part of a GROUP BY we only return one value per group.
+    if (context->_isPartOfGroupBy) {
+      return Id::makeFromInt(randInt() >> Id::numDatatypeBits);
+    }
+
     for (size_t i = 0; i < numElements; ++i) {
       result.push_back(Id::makeFromInt(randInt() >> Id::numDatatypeBits));
     }

diff --git a/src/engine/sparqlExpressions/SparqlExpressionTypes.h b/src/engine/sparqlExpressions/SparqlExpressionTypes.h
@@ -173,6 +173,10 @@ struct EvaluationContext {
   // `_variableToColumnMapPreviousResults`.
   std::vector<ExpressionResult> _previousResultsFromSameGroup;
 
+  // Used to modify the behavior of the RAND() expression when it is evaluated
+  // as part of a GROUP BY clause.
+  bool _isPartOfGroupBy = false;
+
   /// Constructor for evaluating an expression on the complete input.
   EvaluationContext(const QueryExecutionContext& qec,
                     const VariableToColumnMap& variableToColumnMap,

diff --git a/test/RandomExpressionTest.cpp b/test/RandomExpressionTest.cpp
@@ -39,6 +39,13 @@ TEST(RandomExpression, evaluate) {
   }
   ASSERT_GE(numSwaps, 100);
   ASSERT_LE(numSwaps, 900);
+
+  // When we are part of a GROUP BY, we don't expect a vector but a single ID.
+  {
+    evaluationContext._isPartOfGroupBy = true;
+    auto resultAsVariant2 = RandomExpression{}.evaluate(&evaluationContext);
+    ASSERT_TRUE(std::holds_alternative<Id>(resultAsVariant2));
+  }
 }
 
 TEST(RandomExpression, simpleMemberFunctions) {