HIVE-16368 : Unexpected java.lang.ArrayIndexOutOfBoundsException from query with Lateral View operation for Hive on MR. (Zhihai Xu via Ashutosh Chauhan)

Signed-off-by: Ashutosh Chauhan <[email protected]>
zhihaixu2012 authored and ashutoshc committed Apr 10, 2017
1 parent c3aba15 commit 79d2eb7
Showing 3 changed files with 219 additions and 13 deletions.
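
The crux of the fix, visible in the Java hunks from Hive's column-pruning optimizer below: when pruning runs over a SEL(*) branch feeding a Lateral View, the pruned output must keep the select branch's schema order rather than the order in which downstream operators happened to request columns. The following stand-alone Java sketch is illustrative only (names like pruneByRequest/pruneBySchema are not from the patch); it contrasts the two iteration orders and reproduces the _col1, _col2, _col0, _col3 versus _col0, _col1, _col2, _col3 difference visible in the plan diff further down.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Illustrative only, not Hive code: contrasts pruning driven by the
// requested-column order with pruning driven by the schema order.
public class PruneOrderSketch {

  // Pre-patch style: iterate the requested columns, so the result
  // follows the (arbitrary) request order.
  static List<String> pruneByRequest(List<String> schema, List<String> requested) {
    List<String> out = new ArrayList<>();
    for (String col : requested) {
      if (schema.contains(col)) {
        out.add(col);
      }
    }
    return out;
  }

  // Post-patch style: iterate the schema, so the result preserves the
  // positional order that downstream operators index into.
  static List<String> pruneBySchema(List<String> schema, List<String> requested) {
    List<String> out = new ArrayList<>();
    for (String col : schema) {
      if (requested.contains(col)) {
        out.add(col);
      }
    }
    return out;
  }

  public static void main(String[] args) {
    List<String> schema = Arrays.asList("_col0", "_col1", "_col2", "_col3");
    List<String> requested = Arrays.asList("_col1", "_col2", "_col0", "_col3");
    System.out.println(pruneByRequest(schema, requested)); // [_col1, _col2, _col0, _col3]
    System.out.println(pruneBySchema(schema, requested));  // [_col0, _col1, _col2, _col3]
  }
}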
@@ -671,10 +671,11 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
 
       List<FieldNode> colsAfterReplacement = new ArrayList<>();
       List<FieldNode> newCols = new ArrayList<>();
-      for (FieldNode col : cols) {
-        int index = outputCols.indexOf(col.getFieldName());
+      for (int index = 0; index < numSelColumns; index++) {
+        String colName = outputCols.get(index);
+        FieldNode col = lookupColumn(cols, colName);
         // colExprMap.size() == size of cols from SEL(*) branch
-        if (index >= 0 && index < numSelColumns) {
+        if (col != null) {
           ExprNodeDesc transformed = colExprMap.get(col.getFieldName());
           colsAfterReplacement = mergeFieldNodesWithDesc(colsAfterReplacement, transformed);
           newCols.add(col);
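
The rewritten loop walks select positions 0 through numSelColumns - 1 directly, so every index is valid by construction and the old bounds check collapses into a null check. It leans on a name-based lookup; below is a minimal sketch of the contract that lookupColumn appears to satisfy here. The stand-in FieldNode type is hypothetical, reduced to the one accessor the loop uses; the real Hive implementation may differ.

import java.util.Collection;

// Hypothetical stand-in for Hive's FieldNode, reduced to what the sketch needs.
class FieldNode {
  private final String fieldName;
  FieldNode(String fieldName) { this.fieldName = fieldName; }
  String getFieldName() { return fieldName; }
}

class LookupColumnSketch {
  // Presumed contract of lookupColumn as used above: return the entry in
  // cols whose field name equals colName, or null when the column was
  // pruned away by downstream operators.
  static FieldNode lookupColumn(Collection<FieldNode> cols, String colName) {
    for (FieldNode col : cols) {
      if (col.getFieldName().equals(colName)) {
        return col;
      }
    }
    return null;
  }
}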
@@ -713,12 +714,14 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
       RowSchema rs = op.getSchema();
       ArrayList<ExprNodeDesc> colList = new ArrayList<>();
       List<FieldNode> outputCols = new ArrayList<>();
-      for (FieldNode col : cols) {
-        // revert output cols of SEL(*) to ExprNodeColumnDesc
-        ColumnInfo colInfo = rs.getColumnInfo(col.getFieldName());
-        ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(colInfo);
-        colList.add(colExpr);
-        outputCols.add(col);
+      for (ColumnInfo colInfo : rs.getSignature()) {
+        FieldNode col = lookupColumn(cols, colInfo.getInternalName());
+        if (col != null) {
+          // revert output cols of SEL(*) to ExprNodeColumnDesc
+          ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(colInfo);
+          colList.add(colExpr);
+          outputCols.add(col);
+        }
       }
       // replace SEL(*) to SEL(exprs)
       ((SelectDesc)select.getConf()).setSelStarNoCompute(false);
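
The second hunk applies the same inversion when SEL(*) is rewritten into an explicit select: instead of resolving each requested column against the row schema, it walks rs.getSignature() in schema order and keeps only the surviving columns, so colList and outputCols stay position-aligned with the schema. A reduced sketch of that pattern follows; plain String stands in for Hive's ColumnInfo and ExprNodeColumnDesc types, purely for illustration.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Illustrative sketch of the second hunk's shape: filter the schema in
// order, building two parallel lists that downstream operators can index
// consistently.
public class SchemaOrderRebuildSketch {
  public static void main(String[] args) {
    List<String> signature = Arrays.asList("_col0", "_col1", "_col2", "_col3");
    List<String> needed = Arrays.asList("_col3", "_col0"); // arbitrary request order

    List<String> colList = new ArrayList<>();    // stands in for the ExprNodeColumnDesc list
    List<String> outputCols = new ArrayList<>(); // stands in for the pruned output names
    for (String colInfo : signature) {           // schema order, as rs.getSignature() provides
      if (needed.contains(colInfo)) {            // stands in for lookupColumn(...) != null
        colList.add("column(" + colInfo + ")");
        outputCols.add(colInfo);
      }
    }
    System.out.println(colList);    // [column(_col0), column(_col3)]
    System.out.println(outputCols); // [_col0, _col3]
  }
}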
6 changes: 6 additions & 0 deletions ql/src/test/queries/clientpositive/lateral_view_onview.q
@@ -22,3 +22,9 @@ SELECT SIZE(c2),c3,TRIM(c1),c4,myCol from lv_view LATERAL VIEW explode(array(1,2
 
 SELECT SIZE(c2),c3,TRIM(c1),c4,myCol from lv_view LATERAL VIEW explode(array(1,2,3)) myTab as myCol limit 3;
 
+CREATE TABLE lv_table1( c1 STRING, c3 INT, c4 CHAR(1), c5 STRING, c6 STRING, c7 STRING, c8 STRING, c9 STRING, c10 STRING, c11 STRING, c12 STRING, c13 STRING);
+CREATE TABLE lv_table2( c1 STRING, c2 ARRAY<INT>);
+INSERT OVERWRITE TABLE lv_table1 SELECT 'abc ', 100, 't', 'test', 'test', 'test', 'test', 'test', 'test', 'test', 'test', 'test' FROM src;
+INSERT OVERWRITE TABLE lv_table2 SELECT 'abc ', array(1,2,3) FROM src;
+EXPLAIN WITH lv_view1 AS (SELECT lv_table1.*, c2 FROM lv_table1 JOIN lv_table2 ON lv_table1.c1 = lv_table2.c1), lv_view2 AS (SELECT * FROM lv_view1 LATERAL VIEW explode(c2) myTable AS myCol) SELECT * FROM lv_view2 SORT BY c1 ASC, myCol ASC LIMIT 1;
+WITH lv_view1 AS (SELECT lv_table1.*, c2 FROM lv_table1 JOIN lv_table2 ON lv_table1.c1 = lv_table2.c1), lv_view2 AS (SELECT * FROM lv_view1 LATERAL VIEW explode(c2) myTable AS myCol) SELECT * FROM lv_view2 SORT BY c1 ASC, myCol ASC LIMIT 1;
205 changes: 201 additions & 4 deletions ql/src/test/results/clientpositive/lateral_view_onview.q.out
@@ -545,11 +545,11 @@ STAGE PLANS:
 Lateral View Forward
 Statistics: Num rows: 500 Data size: 8500 Basic stats: COMPLETE Column stats: NONE
 Select Operator
-expressions: _col1 (type: array<int>), _col2 (type: int), _col0 (type: string), _col3 (type: char(1))
-outputColumnNames: _col1, _col2, _col0, _col3
+expressions: _col0 (type: string), _col1 (type: array<int>), _col2 (type: int), _col3 (type: char(1))
+outputColumnNames: _col0, _col1, _col2, _col3
 Statistics: Num rows: 500 Data size: 8500 Basic stats: COMPLETE Column stats: NONE
 Lateral View Join Operator
-outputColumnNames: _col1, _col2, _col0, _col3, _col4
+outputColumnNames: _col0, _col1, _col2, _col3, _col4
 Statistics: Num rows: 1000 Data size: 17000 Basic stats: COMPLETE Column stats: NONE
 Select Operator
 expressions: size(_col1) (type: int), _col2 (type: int), trim(_col0) (type: string), _col3 (type: char(1)), _col4 (type: int)
@@ -573,7 +573,7 @@ STAGE PLANS:
 Statistics: Num rows: 500 Data size: 8500 Basic stats: COMPLETE Column stats: NONE
 function name: explode
 Lateral View Join Operator
-outputColumnNames: _col1, _col2, _col0, _col3, _col4
+outputColumnNames: _col0, _col1, _col2, _col3, _col4
 Statistics: Num rows: 1000 Data size: 17000 Basic stats: COMPLETE Column stats: NONE
 Select Operator
 expressions: size(_col1) (type: int), _col2 (type: int), trim(_col0) (type: string), _col3 (type: char(1)), _col4 (type: int)
@@ -609,3 +609,200 @@ POSTHOOK: Input: default@lv_view
3 100 abc t 1
3 100 abc t 2
3 100 abc t 3
PREHOOK: query: CREATE TABLE lv_table1( c1 STRING, c3 INT, c4 CHAR(1), c5 STRING, c6 STRING, c7 STRING, c8 STRING, c9 STRING, c10 STRING, c11 STRING, c12 STRING, c13 STRING)
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@lv_table1
POSTHOOK: query: CREATE TABLE lv_table1( c1 STRING, c3 INT, c4 CHAR(1), c5 STRING, c6 STRING, c7 STRING, c8 STRING, c9 STRING, c10 STRING, c11 STRING, c12 STRING, c13 STRING)
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@lv_table1
PREHOOK: query: CREATE TABLE lv_table2( c1 STRING, c2 ARRAY<INT>)
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@lv_table2
POSTHOOK: query: CREATE TABLE lv_table2( c1 STRING, c2 ARRAY<INT>)
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@lv_table2
PREHOOK: query: INSERT OVERWRITE TABLE lv_table1 SELECT 'abc ', 100, 't', 'test', 'test', 'test', 'test', 'test', 'test', 'test', 'test', 'test' FROM src
PREHOOK: type: QUERY
PREHOOK: Input: default@src
PREHOOK: Output: default@lv_table1
POSTHOOK: query: INSERT OVERWRITE TABLE lv_table1 SELECT 'abc ', 100, 't', 'test', 'test', 'test', 'test', 'test', 'test', 'test', 'test', 'test' FROM src
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
POSTHOOK: Output: default@lv_table1
POSTHOOK: Lineage: lv_table1.c1 SIMPLE []
POSTHOOK: Lineage: lv_table1.c10 SIMPLE []
POSTHOOK: Lineage: lv_table1.c11 SIMPLE []
POSTHOOK: Lineage: lv_table1.c12 SIMPLE []
POSTHOOK: Lineage: lv_table1.c13 SIMPLE []
POSTHOOK: Lineage: lv_table1.c3 SIMPLE []
POSTHOOK: Lineage: lv_table1.c4 EXPRESSION []
POSTHOOK: Lineage: lv_table1.c5 SIMPLE []
POSTHOOK: Lineage: lv_table1.c6 SIMPLE []
POSTHOOK: Lineage: lv_table1.c7 SIMPLE []
POSTHOOK: Lineage: lv_table1.c8 SIMPLE []
POSTHOOK: Lineage: lv_table1.c9 SIMPLE []
PREHOOK: query: INSERT OVERWRITE TABLE lv_table2 SELECT 'abc ', array(1,2,3) FROM src
PREHOOK: type: QUERY
PREHOOK: Input: default@src
PREHOOK: Output: default@lv_table2
POSTHOOK: query: INSERT OVERWRITE TABLE lv_table2 SELECT 'abc ', array(1,2,3) FROM src
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
POSTHOOK: Output: default@lv_table2
POSTHOOK: Lineage: lv_table2.c1 SIMPLE []
POSTHOOK: Lineage: lv_table2.c2 EXPRESSION []
PREHOOK: query: EXPLAIN WITH lv_view1 AS (SELECT lv_table1.*, c2 FROM lv_table1 JOIN lv_table2 ON lv_table1.c1 = lv_table2.c1), lv_view2 AS (SELECT * FROM lv_view1 LATERAL VIEW explode(c2) myTable AS myCol) SELECT * FROM lv_view2 SORT BY c1 ASC, myCol ASC LIMIT 1
PREHOOK: type: QUERY
POSTHOOK: query: EXPLAIN WITH lv_view1 AS (SELECT lv_table1.*, c2 FROM lv_table1 JOIN lv_table2 ON lv_table1.c1 = lv_table2.c1), lv_view2 AS (SELECT * FROM lv_view1 LATERAL VIEW explode(c2) myTable AS myCol) SELECT * FROM lv_view2 SORT BY c1 ASC, myCol ASC LIMIT 1
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
Stage-1 is a root stage
Stage-2 depends on stages: Stage-1
Stage-3 depends on stages: Stage-2
Stage-0 depends on stages: Stage-3

STAGE PLANS:
Stage: Stage-1
Map Reduce
Map Operator Tree:
TableScan
alias: lv_table1
Statistics: Num rows: 500 Data size: 28000 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: c1 is not null (type: boolean)
Statistics: Num rows: 500 Data size: 28000 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
key expressions: c1 (type: string)
sort order: +
Map-reduce partition columns: c1 (type: string)
Statistics: Num rows: 500 Data size: 28000 Basic stats: COMPLETE Column stats: NONE
value expressions: c3 (type: int), c4 (type: char(1)), c5 (type: string), c6 (type: string), c7 (type: string), c8 (type: string), c9 (type: string), c10 (type: string), c11 (type: string), c12 (type: string), c13 (type: string)
TableScan
alias: lv_table2
Statistics: Num rows: 500 Data size: 5500 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: c1 is not null (type: boolean)
Statistics: Num rows: 500 Data size: 5500 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
key expressions: c1 (type: string)
sort order: +
Map-reduce partition columns: c1 (type: string)
Statistics: Num rows: 500 Data size: 5500 Basic stats: COMPLETE Column stats: NONE
value expressions: c2 (type: array<int>)
Reduce Operator Tree:
Join Operator
condition map:
Inner Join 0 to 1
keys:
0 c1 (type: string)
1 c1 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col16
Statistics: Num rows: 550 Data size: 30800 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: _col0 (type: string), _col1 (type: int), _col2 (type: char(1)), _col3 (type: string), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: string), _col11 (type: string), _col16 (type: array<int>)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12
Statistics: Num rows: 550 Data size: 30800 Basic stats: COMPLETE Column stats: NONE
Lateral View Forward
Statistics: Num rows: 550 Data size: 30800 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: _col0 (type: string), _col1 (type: int), _col2 (type: char(1)), _col3 (type: string), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: string), _col11 (type: string), _col12 (type: array<int>)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12
Statistics: Num rows: 550 Data size: 30800 Basic stats: COMPLETE Column stats: NONE
Lateral View Join Operator
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13
Statistics: Num rows: 1100 Data size: 61600 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
Select Operator
expressions: _col12 (type: array<int>)
outputColumnNames: _col0
Statistics: Num rows: 550 Data size: 30800 Basic stats: COMPLETE Column stats: NONE
UDTF Operator
Statistics: Num rows: 550 Data size: 30800 Basic stats: COMPLETE Column stats: NONE
function name: explode
Lateral View Join Operator
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13
Statistics: Num rows: 1100 Data size: 61600 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe

Stage: Stage-2
Map Reduce
Map Operator Tree:
TableScan
Reduce Output Operator
key expressions: _col0 (type: string), _col13 (type: int)
sort order: ++
Statistics: Num rows: 1100 Data size: 61600 Basic stats: COMPLETE Column stats: NONE
TopN Hash Memory Usage: 0.1
value expressions: _col1 (type: int), _col2 (type: char(1)), _col3 (type: string), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: string), _col11 (type: string), _col12 (type: array<int>)
Reduce Operator Tree:
Select Operator
expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: int), VALUE._col1 (type: char(1)), VALUE._col2 (type: string), VALUE._col3 (type: string), VALUE._col4 (type: string), VALUE._col5 (type: string), VALUE._col6 (type: string), VALUE._col7 (type: string), VALUE._col8 (type: string), VALUE._col9 (type: string), VALUE._col10 (type: string), VALUE._col11 (type: array<int>), KEY.reducesinkkey1 (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13
Statistics: Num rows: 1100 Data size: 61600 Basic stats: COMPLETE Column stats: NONE
Limit
Number of rows: 1
Statistics: Num rows: 1 Data size: 56 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe

Stage: Stage-3
Map Reduce
Map Operator Tree:
TableScan
Reduce Output Operator
key expressions: _col0 (type: string), _col13 (type: int)
sort order: ++
Statistics: Num rows: 1 Data size: 56 Basic stats: COMPLETE Column stats: NONE
TopN Hash Memory Usage: 0.1
value expressions: _col1 (type: int), _col2 (type: char(1)), _col3 (type: string), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: string), _col11 (type: string), _col12 (type: array<int>)
Reduce Operator Tree:
Select Operator
expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: int), VALUE._col1 (type: char(1)), VALUE._col2 (type: string), VALUE._col3 (type: string), VALUE._col4 (type: string), VALUE._col5 (type: string), VALUE._col6 (type: string), VALUE._col7 (type: string), VALUE._col8 (type: string), VALUE._col9 (type: string), VALUE._col10 (type: string), VALUE._col11 (type: array<int>), KEY.reducesinkkey1 (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13
Statistics: Num rows: 1 Data size: 56 Basic stats: COMPLETE Column stats: NONE
Limit
Number of rows: 1
Statistics: Num rows: 1 Data size: 56 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
Statistics: Num rows: 1 Data size: 56 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

Stage: Stage-0
Fetch Operator
limit: 1
Processor Tree:
ListSink

PREHOOK: query: WITH lv_view1 AS (SELECT lv_table1.*, c2 FROM lv_table1 JOIN lv_table2 ON lv_table1.c1 = lv_table2.c1), lv_view2 AS (SELECT * FROM lv_view1 LATERAL VIEW explode(c2) myTable AS myCol) SELECT * FROM lv_view2 SORT BY c1 ASC, myCol ASC LIMIT 1
PREHOOK: type: QUERY
PREHOOK: Input: default@lv_table1
PREHOOK: Input: default@lv_table2
#### A masked pattern was here ####
POSTHOOK: query: WITH lv_view1 AS (SELECT lv_table1.*, c2 FROM lv_table1 JOIN lv_table2 ON lv_table1.c1 = lv_table2.c1), lv_view2 AS (SELECT * FROM lv_view1 LATERAL VIEW explode(c2) myTable AS myCol) SELECT * FROM lv_view2 SORT BY c1 ASC, myCol ASC LIMIT 1
POSTHOOK: type: QUERY
POSTHOOK: Input: default@lv_table1
POSTHOOK: Input: default@lv_table2
#### A masked pattern was here ####
abc 100 t test test test test test test test test test [1,2,3] 1
