Hive: Vectorized ORC reads for Hive (apache#2613)
szlta authored Jun 7, 2021
1 parent a6fb658 commit 282b6f9
Showing 14 changed files with 847 additions and 79 deletions.
84 changes: 71 additions & 13 deletions build.gradle
@@ -551,6 +551,60 @@ project(':iceberg-mr') {
}

if (jdkVersion == '8') {
// The purpose of this module is to re-shade org.apache.orc.storage back to the original
// org.apache.hadoop.hive package name. Hive 3 uses this bundle for features such as
// vectorization. (A sketch of the effect follows this diff.)
project(':iceberg-hive3-orc-bundle') {

apply plugin: 'com.github.johnrengelman.shadow'

tasks.jar.dependsOn tasks.shadowJar

dependencies {
compile project(':iceberg-data')
compile project(':iceberg-orc')

testCompileOnly project(path: ':iceberg-data', configuration: 'testArtifacts')
testCompileOnly project(path: ':iceberg-orc', configuration: 'testArtifacts')
}

shadowJar {
configurations = [project.configurations.compile, project.configurations.compileOnly, project.configurations.testCompileOnly]

zip64 true

// include the LICENSE and NOTICE files for the shaded Jar
from(projectDir) {
include 'LICENSE'
include 'NOTICE'
}

// Relocate dependencies to avoid conflicts
relocate 'org.apache.orc.storage', 'org.apache.hadoop.hive'

// We really only need the Iceberg and ORC classes, but with relocated references to the storage-api classes (see above).
// Unfortunately the include list feature of this shadow plugin doesn't work as expected, so we exclude instead:
exclude 'com/**/*'
exclude 'edu/**/*'
exclude 'io/**'
exclude 'javax/**'
exclude 'org/apache/avro/**/*'
exclude 'org/apache/commons/**/*'
exclude 'org/checkerframework/**/*'
exclude 'org/codehaus/**/*'
exclude 'org/intellij/**/*'
exclude 'org/jetbrains/**/*'
exclude 'org/slf4j/**/*'
exclude 'org/threeten/**/*'

classifier null
}

jar {
enabled = false
}

}

project(':iceberg-hive3') {

// run the tests in iceberg-mr with Hive3 dependencies
@@ -569,13 +623,13 @@ if (jdkVersion == '8') {
}

dependencies {
-  compile project(':iceberg-api')
-  compile project(':iceberg-core')
-  compile project(':iceberg-data')
-  compile project(':iceberg-hive-metastore')
-  compile project(':iceberg-orc')
-  compile project(':iceberg-parquet')
-  compile project(':iceberg-mr')
+  compileOnly project(':iceberg-api')
+  compileOnly project(':iceberg-core')
+  compileOnly project(':iceberg-hive-metastore')
+  compileOnly project(':iceberg-parquet')
+  compileOnly project(':iceberg-hive3-orc-bundle')
+  compileOnly project(':iceberg-mr')


compileOnly("org.apache.hadoop:hadoop-client:3.1.0") {
exclude group: 'org.apache.avro', module: 'avro'
@@ -592,10 +646,13 @@ if (jdkVersion == '8') {
exclude group: 'org.pentaho' // missing dependency
exclude group: 'org.slf4j', module: 'slf4j-log4j12'
}
compileOnly("org.apache.hive:hive-metastore:3.1.2")
compileOnly("org.apache.hive:hive-serde:3.1.2")
compileOnly("org.apache.hive:hive-metastore:3.1.2") {
exclude group: 'org.apache.orc'
}
compileOnly("org.apache.hive:hive-serde:3.1.2") {
exclude group: 'org.apache.orc'
}

testCompile project(path: ':iceberg-data', configuration: 'testArtifacts')
testCompile project(path: ':iceberg-api', configuration: 'testArtifacts')
testCompile project(path: ':iceberg-core', configuration: 'testArtifacts')
testCompile project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts')
@@ -606,6 +663,7 @@ if (jdkVersion == '8') {
testCompile("com.fasterxml.jackson.core:jackson-annotations:2.6.5")
testCompile("org.apache.hive:hive-service:3.1.2") {
exclude group: 'org.apache.hive', module: 'hive-exec'
+  exclude group: 'org.apache.orc'
}
testCompile("org.apache.tez:tez-dag:0.9.1")
testCompile("org.apache.tez:tez-mapreduce:0.9.1")
@@ -628,7 +686,7 @@ project(':iceberg-hive-runtime') {
exclude group: 'com.github.stephenc.findbugs'
exclude group: 'commons-pool'
exclude group: 'javax.annotation'
exclude group: 'javax.xml.bind'
exclude group: 'org.apache.commons'
exclude group: 'org.slf4j'
exclude group: 'org.xerial.snappy'
@@ -645,7 +703,7 @@ project(':iceberg-hive-runtime') {
}
compile project(':iceberg-aws')
}

shadowJar {
configurations = [project.configurations.compile]

@@ -659,7 +717,7 @@

// Relocate dependencies to avoid conflicts
relocate 'org.apache.avro', 'org.apache.iceberg.shaded.org.apache.avro'
relocate 'org.apache.parquet', 'org.apache.iceberg.shaded.org.apache.parquet'
relocate 'com.google', 'org.apache.iceberg.shaded.com.google'
relocate 'com.fasterxml', 'org.apache.iceberg.shaded.com.fasterxml'
relocate 'com.github.benmanes', 'org.apache.iceberg.shaded.com.github.benmanes'
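For context on the re-shading above: once the relocate rule rewrites org.apache.orc.storage references back to org.apache.hadoop.hive, the storage-api types used by iceberg-orc are the very classes Hive 3 ships, so vectorized row batches can be exchanged directly. A minimal, hypothetical sanity check (not part of this commit; the class name RelocationCheck is invented for illustration):

public class RelocationCheck {
  public static void main(String[] args) throws ClassNotFoundException {
    // With iceberg-hive3-orc-bundle on the classpath, the re-shaded storage-api
    // classes resolve under Hive's original package name rather than the
    // org.apache.orc.storage name that iceberg-orc was compiled against:
    Class<?> batch = Class.forName("org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch");
    System.out.println("Resolved " + batch.getName());
  }
}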
217 changes: 217 additions & 0 deletions CompatibilityHiveVectorUtils.java (new file)
@@ -0,0 +1,217 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.iceberg.mr.hive.vector;

import java.sql.Date;
import java.sql.Timestamp;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveIntervalDayTime;
import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.tez.DagUtils;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.IntervalDayTimeColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.mapred.JobConf;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Contains code snippets ported from later Hive sources. This class should be removed as soon as
* Hive 4 is released and Iceberg depends on that version.
*/
public class CompatibilityHiveVectorUtils {

private static final Logger LOG = LoggerFactory.getLogger(CompatibilityHiveVectorUtils.class);

private CompatibilityHiveVectorUtils() {

}

/**
* Returns the MapWork instance deserialized from a job conf. Ported from Hive source code
* LlapHiveUtils#findMapWork.
*
* @param job JobConf instance
* @return the MapWork for this input, or null if it cannot be determined
*/
public static MapWork findMapWork(JobConf job) {
String inputName = job.get(Utilities.INPUT_NAME, null);
if (LOG.isDebugEnabled()) {
LOG.debug("Initializing for input {}", inputName);
}
String prefixes = job.get(DagUtils.TEZ_MERGE_WORK_FILE_PREFIXES);
if (prefixes != null && !StringUtils.isBlank(prefixes)) {
// Currently SMB is broken, so we cannot check if it's compatible with IO elevator.
// So, we don't use the below code that would get the correct MapWork. See HIVE-16985.
return null;
}

BaseWork work = null;
// HIVE-16985: try to find the fake merge work for SMB join, that is really another MapWork.
if (inputName != null) {
if (prefixes == null || !Lists.newArrayList(prefixes.split(",")).contains(inputName)) {
inputName = null;
}
}
if (inputName != null) {
work = Utilities.getMergeWork(job, inputName);
}

if (!(work instanceof MapWork)) {
work = Utilities.getMapWork(job);
}
return (MapWork) work;
}


/**
* Fills a column vector with a constant partition value for the whole batch. Ported from Hive
* source code VectorizedRowBatchCtx#addPartitionColsToBatch. (A usage sketch follows this file.)
*
* @param col ColumnVector to write the partition value into
* @param value partition value
* @param partitionColumnName partition key
* @param rowColumnTypeInfo column type description
*/
// @SuppressWarnings({"AvoidNestedBlocks", "FallThrough", "MethodLength", "CyclomaticComplexity", "Indentation"})
public static void addPartitionColsToBatch(ColumnVector col, Object value, String partitionColumnName,
TypeInfo rowColumnTypeInfo) {
PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) rowColumnTypeInfo;

if (value == null) {
col.noNulls = false;
col.isNull[0] = true;
col.isRepeating = true;
return;
}

switch (primitiveTypeInfo.getPrimitiveCategory()) {
case BOOLEAN:
LongColumnVector booleanColumnVector = (LongColumnVector) col;
booleanColumnVector.fill((Boolean) value ? 1 : 0);
booleanColumnVector.isNull[0] = false;
break;

case BYTE:
LongColumnVector byteColumnVector = (LongColumnVector) col;
byteColumnVector.fill((Byte) value);
byteColumnVector.isNull[0] = false;
break;

case SHORT:
LongColumnVector shortColumnVector = (LongColumnVector) col;
shortColumnVector.fill((Short) value);
shortColumnVector.isNull[0] = false;
break;

case INT:
LongColumnVector intColumnVector = (LongColumnVector) col;
intColumnVector.fill((Integer) value);
intColumnVector.isNull[0] = false;
break;

case LONG:
LongColumnVector longColumnVector = (LongColumnVector) col;
longColumnVector.fill((Long) value);
longColumnVector.isNull[0] = false;
break;

case DATE:
LongColumnVector dateColumnVector = (LongColumnVector) col;
dateColumnVector.fill(DateWritable.dateToDays((Date) value));
dateColumnVector.isNull[0] = false;
break;

case TIMESTAMP:
TimestampColumnVector timeStampColumnVector = (TimestampColumnVector) col;
timeStampColumnVector.fill((Timestamp) value);
timeStampColumnVector.isNull[0] = false;
break;

case INTERVAL_YEAR_MONTH:
LongColumnVector intervalYearMonthColumnVector = (LongColumnVector) col;
intervalYearMonthColumnVector.fill(((HiveIntervalYearMonth) value).getTotalMonths());
intervalYearMonthColumnVector.isNull[0] = false;
break;

case INTERVAL_DAY_TIME:
IntervalDayTimeColumnVector intervalDayTimeColumnVector = (IntervalDayTimeColumnVector) col;
intervalDayTimeColumnVector.fill((HiveIntervalDayTime) value);
intervalDayTimeColumnVector.isNull[0] = false;
break;

case FLOAT:
DoubleColumnVector floatColumnVector = (DoubleColumnVector) col;
floatColumnVector.fill((Float) value);
floatColumnVector.isNull[0] = false;
break;

case DOUBLE:
DoubleColumnVector doubleColumnVector = (DoubleColumnVector) col;
doubleColumnVector.fill((Double) value);
doubleColumnVector.isNull[0] = false;
break;

case DECIMAL:
DecimalColumnVector decimalColumnVector = (DecimalColumnVector) col;
HiveDecimal hd = (HiveDecimal) value;
decimalColumnVector.set(0, hd);
decimalColumnVector.isRepeating = true;
decimalColumnVector.isNull[0] = false;
break;

case BINARY:
BytesColumnVector binaryColumnVector = (BytesColumnVector) col;
byte[] bytes = (byte[]) value;
binaryColumnVector.fill(bytes);
binaryColumnVector.isNull[0] = false;
break;

case STRING:
case CHAR:
case VARCHAR:
BytesColumnVector bytesColumnVector = (BytesColumnVector) col;
String sVal = value.toString();
if (sVal == null) {
bytesColumnVector.noNulls = false;
bytesColumnVector.isNull[0] = true;
bytesColumnVector.isRepeating = true;
} else {
bytesColumnVector.setVal(0, sVal.getBytes());
bytesColumnVector.isRepeating = true;
}
break;

default:
throw new RuntimeException("Unable to recognize the partition type " +
primitiveTypeInfo.getPrimitiveCategory() + " for column " + partitionColumnName);
}

}
}
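
For illustration, a minimal usage sketch of the utility above (hypothetical, not part of this commit): findMapWork locates the possibly merged MapWork for the current input, and addPartitionColsToBatch fills a batch's partition column with a constant value. The sketch fills an INT partition column; the class name PartitionColSketch, the column name "part_id", and the value 42 are invented for the example, while TypeInfoFactory.intTypeInfo is Hive's standard type descriptor for int.

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.iceberg.mr.hive.vector.CompatibilityHiveVectorUtils;

public class PartitionColSketch {
  public static void main(String[] args) {
    // Hive backs BOOLEAN/BYTE/SHORT/INT/LONG/DATE columns with LongColumnVector.
    LongColumnVector col = new LongColumnVector(1024);
    // Write the constant partition value 42 into the vector for column "part_id".
    CompatibilityHiveVectorUtils.addPartitionColsToBatch(col, 42, "part_id",
        TypeInfoFactory.intTypeInfo);
    // fill() marks the vector repeating, so every row reads vector[0].
    System.out.println(col.isRepeating + " -> " + col.vector[0]); // true -> 42
  }
}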