HIVE-15905 : Inefficient plan for correlated subqueries (Vineet Garg via Ashutosh Chauhan)

Signed-off-by: Ashutosh Chauhan <[email protected]>
akitanaka · Feb 15, 2017 · bddf5a7 · bddf5a7
1 parent b14ef6d
commit bddf5a7
Show file tree

Hide file tree

Showing 30 changed files with 6,219 additions and 16,882 deletions.
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelDecorrelator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelDecorrelator.java
diff --git a/ql/src/test/queries/clientpositive/subquery_multi.q b/ql/src/test/queries/clientpositive/subquery_multi.q
@@ -58,8 +58,12 @@ explain select * from part_null where p_name IN (select p_name from part_null) A
 select * from part_null where p_name IN (select p_name from part_null) AND NOT EXISTS (select c from tempty);
 
 -- corr, mix of IN/NOT IN
-explain select * from part_null where p_name IN ( select p_name from part where part.p_type = part_null.p_type) AND p_brand NOT IN (select p_container from part where part.p_type = part_null.p_type AND p_brand IN (select p_brand from part pp where part.p_type = pp.p_type));
-select * from part_null where p_name IN ( select p_name from part where part.p_type = part_null.p_type) AND p_brand NOT IN (select p_container from part where part.p_type = part_null.p_type AND p_brand IN (select p_brand from part pp where part.p_type = pp.p_type));
+explain select * from part_null where p_name IN ( select p_name from part where part.p_type = part_null.p_type)
+        AND p_brand NOT IN (select p_container from part where part.p_type = part_null.p_type
+                                AND p_brand IN (select p_brand from part pp where part.p_type = pp.p_type));
+select * from part_null where p_name IN ( select p_name from part where part.p_type = part_null.p_type)
+        AND p_brand NOT IN (select p_container from part where part.p_type = part_null.p_type
+                                AND p_brand IN (select p_brand from part pp where part.p_type = pp.p_type));
 
 -- mix of corr and uncorr
 explain select * from part_null where p_name IN ( select p_name from part) AND p_brand IN (select p_brand from part where part.p_type = part_null.p_type);

diff --git a/ql/src/test/results/clientpositive/constprog_partitioner.q.out b/ql/src/test/results/clientpositive/constprog_partitioner.q.out
@@ -80,95 +80,10 @@ WHERE li.l_linenumber = 1 AND
  li.l_orderkey IN (SELECT l_orderkey FROM lineitem WHERE l_shipmode = 'AIR' AND l_linenumber = li.l_linenumber)
 POSTHOOK: type: QUERY
 STAGE DEPENDENCIES:
-  Stage-3 is a root stage
-  Stage-2 depends on stages: Stage-3
-  Stage-1 depends on stages: Stage-2
+  Stage-1 is a root stage
   Stage-0 depends on stages: Stage-1
 
 STAGE PLANS:
-  Stage: Stage-3
-    Map Reduce
-      Map Operator Tree:
-          TableScan
-            alias: li
-            Statistics: Num rows: 100 Data size: 11999 Basic stats: COMPLETE Column stats: NONE
-            Select Operator
-              expressions: l_linenumber (type: int)
-              outputColumnNames: l_linenumber
-              Statistics: Num rows: 100 Data size: 11999 Basic stats: COMPLETE Column stats: NONE
-              Group By Operator
-                keys: l_linenumber (type: int)
-                mode: hash
-                outputColumnNames: _col0
-                Statistics: Num rows: 100 Data size: 11999 Basic stats: COMPLETE Column stats: NONE
-                Reduce Output Operator
-                  key expressions: _col0 (type: int)
-                  sort order: +
-                  Map-reduce partition columns: _col0 (type: int)
-                  Statistics: Num rows: 100 Data size: 11999 Basic stats: COMPLETE Column stats: NONE
-      Reduce Operator Tree:
-        Group By Operator
-          keys: KEY._col0 (type: int)
-          mode: mergepartial
-          outputColumnNames: _col0
-          Statistics: Num rows: 50 Data size: 5999 Basic stats: COMPLETE Column stats: NONE
-          File Output Operator
-            compressed: false
-            table:
-                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
-                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
-                serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
-
-  Stage: Stage-2
-    Map Reduce
-      Map Operator Tree:
-          TableScan
-            alias: lineitem
-            Statistics: Num rows: 100 Data size: 11999 Basic stats: COMPLETE Column stats: NONE
-            Filter Operator
-              predicate: (l_shipmode = 'AIR') (type: boolean)
-              Statistics: Num rows: 50 Data size: 5999 Basic stats: COMPLETE Column stats: NONE
-              Select Operator
-                expressions: l_orderkey (type: int), l_linenumber (type: int)
-                outputColumnNames: _col0, _col1
-                Statistics: Num rows: 50 Data size: 5999 Basic stats: COMPLETE Column stats: NONE
-                Reduce Output Operator
-                  key expressions: _col1 (type: int)
-                  sort order: +
-                  Map-reduce partition columns: _col1 (type: int)
-                  Statistics: Num rows: 50 Data size: 5999 Basic stats: COMPLETE Column stats: NONE
-                  value expressions: _col0 (type: int)
-          TableScan
-            Reduce Output Operator
-              key expressions: _col0 (type: int)
-              sort order: +
-              Map-reduce partition columns: _col0 (type: int)
-              Statistics: Num rows: 50 Data size: 5999 Basic stats: COMPLETE Column stats: NONE
-      Reduce Operator Tree:
-        Join Operator
-          condition map:
-               Inner Join 0 to 1
-          keys:
-            0 _col1 (type: int)
-            1 _col0 (type: int)
-          outputColumnNames: _col0, _col3
-          Statistics: Num rows: 55 Data size: 6598 Basic stats: COMPLETE Column stats: NONE
-          Select Operator
-            expressions: _col0 (type: int), _col3 (type: int)
-            outputColumnNames: _col0, _col1
-            Statistics: Num rows: 55 Data size: 6598 Basic stats: COMPLETE Column stats: NONE
-            Group By Operator
-              keys: _col0 (type: int), _col1 (type: int)
-              mode: hash
-              outputColumnNames: _col0, _col1
-              Statistics: Num rows: 55 Data size: 6598 Basic stats: COMPLETE Column stats: NONE
-              File Output Operator
-                compressed: false
-                table:
-                    input format: org.apache.hadoop.mapred.SequenceFileInputFormat
-                    output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
-                    serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
-
   Stage: Stage-1
     Map Reduce
       Map Operator Tree:
@@ -189,11 +104,25 @@ STAGE PLANS:
                   Statistics: Num rows: 50 Data size: 5999 Basic stats: COMPLETE Column stats: NONE
                   value expressions: _col1 (type: int), _col2 (type: int)
           TableScan
-            Reduce Output Operator
-              key expressions: _col0 (type: int), _col1 (type: int)
-              sort order: ++
-              Map-reduce partition columns: _col0 (type: int), _col1 (type: int)
-              Statistics: Num rows: 55 Data size: 6598 Basic stats: COMPLETE Column stats: NONE
+            alias: lineitem
+            Statistics: Num rows: 100 Data size: 11999 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: ((l_shipmode = 'AIR') and (l_linenumber = l_linenumber)) (type: boolean)
+              Statistics: Num rows: 25 Data size: 2999 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                expressions: l_orderkey (type: int), l_linenumber (type: int)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 25 Data size: 2999 Basic stats: COMPLETE Column stats: NONE
+                Group By Operator
+                  keys: _col0 (type: int), _col1 (type: int)
+                  mode: hash
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 25 Data size: 2999 Basic stats: COMPLETE Column stats: NONE
+                  Reduce Output Operator
+                    key expressions: _col0 (type: int), _col1 (type: int)
+                    sort order: ++
+                    Map-reduce partition columns: _col0 (type: int), _col1 (type: int)
+                    Statistics: Num rows: 25 Data size: 2999 Basic stats: COMPLETE Column stats: NONE
       Reduce Operator Tree:
         Join Operator
           condition map:
@@ -202,14 +131,14 @@ STAGE PLANS:
             0 _col0 (type: int), 1 (type: int)
             1 _col0 (type: int), _col1 (type: int)
           outputColumnNames: _col1, _col2
-          Statistics: Num rows: 60 Data size: 7257 Basic stats: COMPLETE Column stats: NONE
+          Statistics: Num rows: 55 Data size: 6598 Basic stats: COMPLETE Column stats: NONE
           Select Operator
             expressions: _col1 (type: int), _col2 (type: int)
             outputColumnNames: _col0, _col1
-            Statistics: Num rows: 60 Data size: 7257 Basic stats: COMPLETE Column stats: NONE
+            Statistics: Num rows: 55 Data size: 6598 Basic stats: COMPLETE Column stats: NONE
             File Output Operator
               compressed: false
-              Statistics: Num rows: 60 Data size: 7257 Basic stats: COMPLETE Column stats: NONE
+              Statistics: Num rows: 55 Data size: 6598 Basic stats: COMPLETE Column stats: NONE
               table:
                   input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat