Skip to content

Commit

Permalink
[SPARK-24381][TESTING] Add unit tests for NOT IN subquery around null…
Browse files Browse the repository at this point in the history
… values

## What changes were proposed in this pull request?
This PR adds several unit tests along the `cols NOT IN (subquery)` pathway. There are a scattering of tests here and there which cover this codepath, but there doesn't seem to be a unified unit test of the correctness of null-aware anti joins anywhere. I have also added a brief explanation of how this expression behaves in SubquerySuite. Lastly, I made some clarifying changes in the NOT IN pathway in RewritePredicateSubquery.

## How was this patch tested?
Added unit tests! There should be no behavioral change in this PR.

Author: Miles Yucht <[email protected]>

Closes apache#21425 from mgyucht/spark-24381.
  • Loading branch information
mgyucht authored and gatorsmile committed May 27, 2018
1 parent ed1a654 commit d440699
Show file tree
Hide file tree
Showing 10 changed files with 714 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -116,15 +116,16 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper {
// (a1,a2,...) = (b1,b2,...)
// to
// (a1=b1 OR isnull(a1=b1)) AND (a2=b2 OR isnull(a2=b2)) AND ...
val joinConds = splitConjunctivePredicates(joinCond.get)
val baseJoinConds = splitConjunctivePredicates(joinCond.get)
val nullAwareJoinConds = baseJoinConds.map(c => Or(c, IsNull(c)))
// After that, add back the correlated join predicate(s) in the subquery
// Example:
// SELECT ... FROM A WHERE A.A1 NOT IN (SELECT B.B1 FROM B WHERE B.B2 = A.A2 AND B.B3 > 1)
// will have the final conditions in the LEFT ANTI as
// (A.A1 = B.B1 OR ISNULL(A.A1 = B.B1)) AND (B.B2 = A.A2)
val pairs = (joinConds.map(c => Or(c, IsNull(c))) ++ conditions).reduceLeft(And)
// (A.A1 = B.B1 OR ISNULL(A.A1 = B.B1)) AND (B.B2 = A.A2) AND B.B3 > 1
val finalJoinCond = (nullAwareJoinConds ++ conditions).reduceLeft(And)
// Deduplicate conflicting attributes if any.
dedupJoin(Join(outerPlan, sub, LeftAnti, Option(pairs)))
dedupJoin(Join(outerPlan, sub, LeftAnti, Option(finalJoinCond)))
case (p, predicate) =>
val (newCond, inputPlan) = rewriteExistentialExpr(Seq(predicate), p)
Project(p.output, Filter(newCond.get, inputPlan))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
-- Unit tests for simple NOT IN predicate subquery across multiple columns.
--
-- See not-in-single-column-unit-tests.sql for an introduction.
-- This file has the same test cases as not-in-unit-tests-multi-column.sql with literals instead of
-- subqueries. Small changes have been made to the literals to make them typecheck.

CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES
(null, null),
(null, 1.0),
(2, 3.0),
(4, 5.0)
AS m(a, b);

-- Case 1 (not possible to write a literal with no rows, so we ignore it.)
-- (subquery is empty -> row is returned)

-- Cases 2, 3 and 4 are currently broken, so I have commented them out here.
-- Filed https://issues.apache.org/jira/browse/SPARK-24395 to fix and restore these test cases.

-- Case 5
-- (one null column with no match -> row is returned)
SELECT *
FROM m
WHERE b = 1.0 -- Matches (null, 1.0)
AND (a, b) NOT IN ((2, 3.0));

-- Case 6
-- (no null columns with match -> row not returned)
SELECT *
FROM m
WHERE b = 3.0 -- Matches (2, 3.0)
AND (a, b) NOT IN ((2, 3.0));

-- Case 7
-- (no null columns with no match -> row is returned)
SELECT *
FROM m
WHERE b = 5.0 -- Matches (4, 5.0)
AND (a, b) NOT IN ((2, 3.0));
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
-- Unit tests for simple NOT IN predicate subquery across multiple columns.
--
-- See not-in-single-column-unit-tests.sql for an introduction.
--
-- Test cases for multi-column ``WHERE a NOT IN (SELECT c FROM r ...)'':
-- | # | does subquery include null? | do filter columns contain null? | a = c? | b = d? | row included in result? |
-- | 1 | empty | * | * | * | yes |
-- | 2 | 1+ row has null for all columns | * | * | * | no |
-- | 3 | no row has null for all columns | (yes, yes) | * | * | no |
-- | 4 | no row has null for all columns | (no, yes) | yes | * | no |
-- | 5 | no row has null for all columns | (no, yes) | no | * | yes |
-- | 6 | no | (no, no) | yes | yes | no |
-- | 7 | no | (no, no) | _ | _ | yes |
--
-- This can be generalized to include more tests for more columns, but it covers the main cases
-- when there is more than one column.

CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES
(null, null),
(null, 1.0),
(2, 3.0),
(4, 5.0)
AS m(a, b);

CREATE TEMPORARY VIEW s AS SELECT * FROM VALUES
(null, null),
(0, 1.0),
(2, 3.0),
(4, null)
AS s(c, d);

-- Case 1
-- (subquery is empty -> row is returned)
SELECT *
FROM m
WHERE (a, b) NOT IN (SELECT *
FROM s
WHERE d > 5.0) -- Matches no rows
;

-- Case 2
-- (subquery contains a row with null in all columns -> row not returned)
SELECT *
FROM m
WHERE (a, b) NOT IN (SELECT *
FROM s
WHERE c IS NULL AND d IS NULL) -- Matches only (null, null)
;

-- Case 3
-- (probe-side columns are all null -> row not returned)
SELECT *
FROM m
WHERE a IS NULL AND b IS NULL -- Matches only (null, null)
AND (a, b) NOT IN (SELECT *
FROM s
WHERE c IS NOT NULL) -- Matches (0, 1.0), (2, 3.0), (4, null)
;

-- Case 4
-- (one column null, other column matches a row in the subquery result -> row not returned)
SELECT *
FROM m
WHERE b = 1.0 -- Matches (null, 1.0)
AND (a, b) NOT IN (SELECT *
FROM s
WHERE c IS NOT NULL) -- Matches (0, 1.0), (2, 3.0), (4, null)
;

-- Case 5
-- (one null column with no match -> row is returned)
SELECT *
FROM m
WHERE b = 1.0 -- Matches (null, 1.0)
AND (a, b) NOT IN (SELECT *
FROM s
WHERE c = 2) -- Matches (2, 3.0)
;

-- Case 6
-- (no null columns with match -> row not returned)
SELECT *
FROM m
WHERE b = 3.0 -- Matches (2, 3.0)
AND (a, b) NOT IN (SELECT *
FROM s
WHERE c = 2) -- Matches (2, 3.0)
;

-- Case 7
-- (no null columns with no match -> row is returned)
SELECT *
FROM m
WHERE b = 5.0 -- Matches (4, 5.0)
AND (a, b) NOT IN (SELECT *
FROM s
WHERE c = 2) -- Matches (2, 3.0)
;
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
-- Unit tests for simple NOT IN with a literal expression of a single column
--
-- More information can be found in not-in-unit-tests-single-column.sql.
-- This file has the same test cases as not-in-unit-tests-single-column.sql with literals instead of
-- subqueries.

CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES
(null, 1.0),
(2, 3.0),
(4, 5.0)
AS m(a, b);

-- Uncorrelated NOT IN Subquery test cases
-- Case 1 (not possible to write a literal with no rows, so we ignore it.)
-- (empty subquery -> all rows returned)

-- Case 2
-- (subquery includes null -> no rows returned)
SELECT *
FROM m
WHERE a NOT IN (null);

-- Case 3
-- (probe column is null -> row not returned)
SELECT *
FROM m
WHERE b = 1.0 -- Only matches (null, 1.0)
AND a NOT IN (2);

-- Case 4
-- (probe column matches subquery row -> row not returned)
SELECT *
FROM m
WHERE b = 3.0 -- Only matches (2, 3.0)
AND a NOT IN (2);

-- Case 5
-- (probe column does not match subquery row -> row is returned)
SELECT *
FROM m
WHERE b = 3.0 -- Only matches (2, 3.0)
AND a NOT IN (6);
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
-- Unit tests for simple NOT IN predicate subquery across a single column.
--
-- ``col NOT IN expr'' is quite difficult to reason about. There are many edge cases, some of the
-- rules are confusing to the uninitiated, and precedence and treatment of null values is plain
-- unintuitive. To make this simpler to understand, I've come up with a plain English way of
-- describing the expected behavior of this query.
--
-- - If the subquery is empty (i.e. returns no rows), the row should be returned, regardless of
-- whether the filtered columns include nulls.
-- - If the subquery contains a result with all columns null, then the row should not be returned.
-- - If for all non-null filter columns there exists a row in the subquery in which each column
-- either
-- 1. is equal to the corresponding filter column or
-- 2. is null
-- then the row should not be returned. (This includes the case where all filter columns are
-- null.)
-- - Otherwise, the row should be returned.
--
-- Using these rules, we can come up with a set of test cases for single-column and multi-column
-- NOT IN test cases.
--
-- Test cases for single-column ``WHERE a NOT IN (SELECT c FROM r ...)'':
-- | # | does subquery include null? | is a null? | a = c? | row with a included in result? |
-- | 1 | empty | | | yes |
-- | 2 | yes | | | no |
-- | 3 | no | yes | | no |
-- | 4 | no | no | yes | no |
-- | 5 | no | no | no | yes |
--
-- There are also some considerations around correlated subqueries. Correlated subqueries can
-- cause cases 2, 3, or 4 to be reduced to case 1 by limiting the number of rows returned by the
-- subquery, so the row from the parent table should always be included in the output.

CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES
(null, 1.0),
(2, 3.0),
(4, 5.0)
AS m(a, b);

CREATE TEMPORARY VIEW s AS SELECT * FROM VALUES
(null, 1.0),
(2, 3.0),
(6, 7.0)
AS s(c, d);

-- Uncorrelated NOT IN Subquery test cases
-- Case 1
-- (empty subquery -> all rows returned)
SELECT *
FROM m
WHERE a NOT IN (SELECT c
FROM s
WHERE d > 10.0) -- (empty subquery)
;

-- Case 2
-- (subquery includes null -> no rows returned)
SELECT *
FROM m
WHERE a NOT IN (SELECT c
FROM s
WHERE d = 1.0) -- Only matches (null, 1.0)
;

-- Case 3
-- (probe column is null -> row not returned)
SELECT *
FROM m
WHERE b = 1.0 -- Only matches (null, 1.0)
AND a NOT IN (SELECT c
FROM s
WHERE d = 3.0) -- Matches (2, 3.0)
;

-- Case 4
-- (probe column matches subquery row -> row not returned)
SELECT *
FROM m
WHERE b = 3.0 -- Only matches (2, 3.0)
AND a NOT IN (SELECT c
FROM s
WHERE d = 3.0) -- Matches (2, 3.0)
;

-- Case 5
-- (probe column does not match subquery row -> row is returned)
SELECT *
FROM m
WHERE b = 3.0 -- Only matches (2, 3.0)
AND a NOT IN (SELECT c
FROM s
WHERE d = 7.0) -- Matches (6, 7.0)
;

-- Correlated NOT IN subquery test cases
-- Case 2->1
-- (subquery had nulls but they are removed by correlated subquery -> all rows returned)
SELECT *
FROM m
WHERE a NOT IN (SELECT c
FROM s
WHERE d = b + 10) -- Matches no row
;

-- Case 3->1
-- (probe column is null but subquery returns no rows -> row is returned)
SELECT *
FROM m
WHERE b = 1.0 -- Only matches (null, 1.0)
AND a NOT IN (SELECT c
FROM s
WHERE d = b + 10) -- Matches no row
;

-- Case 4->1
-- (probe column matches row which is filtered out by correlated subquery -> row is returned)
SELECT *
FROM m
WHERE b = 3.0 -- Only matches (2, 3.0)
AND a NOT IN (SELECT c
FROM s
WHERE d = b + 10) -- Matches no row
;
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 4


-- !query 0
CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES
(null, null),
(null, 1.0),
(2, 3.0),
(4, 5.0)
AS m(a, b)
-- !query 0 schema
struct<>
-- !query 0 output



-- !query 1
-- Case 5
-- (one null column with no match -> row is returned)
SELECT *
FROM m
WHERE b = 1.0 -- Matches (null, 1.0)
AND (a, b) NOT IN ((2, 3.0))
-- !query 1 schema
struct<a:int,b:decimal(2,1)>
-- !query 1 output
NULL 1


-- !query 2
-- Case 6
-- (no null columns with match -> row not returned)
SELECT *
FROM m
WHERE b = 3.0 -- Matches (2, 3.0)
AND (a, b) NOT IN ((2, 3.0))
-- !query 2 schema
struct<a:int,b:decimal(2,1)>
-- !query 2 output



-- !query 3
-- Case 7
-- (no null columns with no match -> row is returned)
SELECT *
FROM m
WHERE b = 5.0 -- Matches (4, 5.0)
AND (a, b) NOT IN ((2, 3.0))
-- !query 3 schema
struct<a:int,b:decimal(2,1)>
-- !query 3 output
4 5
Loading

0 comments on commit d440699

Please sign in to comment.