Spark: Remove common module, iceberg-spark (apache#3313)
This also fixes some instances of source incompatibility in Spark 3
aokolnychyi authored Oct 19, 2021
1 parent edc6985 commit f3e6770
Showing 342 changed files with 32,809 additions and 79 deletions.
8 changes: 0 additions & 8 deletions jmh.gradle
@@ -45,14 +45,6 @@ configure(jmhProjects) {
zip64 true
}

// Path is relative to either the spark2 or spark3 folder, depending on the project being tested
sourceSets {
jmh {
java.srcDirs = ['src/jmh/java', '../../../spark/src/jmh/java']
compileClasspath += sourceSets.main.runtimeClasspath
}
}
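Note: with the shared '../../../spark/src/jmh/java' source directory removed, each Spark version module carries its own copy of the benchmark sources, which is consistent with the large number of added files in this commit; the SparkBenchmarkUtil class later in this diff is one example.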

jmhCompileGeneratedClasses {
pluginManager.withPlugin('com.palantir.baseline-error-prone') {
options.errorprone.enabled = false
66 changes: 0 additions & 66 deletions spark/build.gradle
@@ -17,71 +17,6 @@
* under the License.
*/

project(':iceberg-spark') {
configurations.all {
resolutionStrategy {
// Spark 2.4.4 works only with the DataNucleus versions below; the versions
// introduced by Hive 2.3.6 cause many unexpected issues, so force the
// versions introduced by Hive 1.2.1.
force 'org.datanucleus:datanucleus-api-jdo:3.2.6'
force 'org.datanucleus:datanucleus-core:3.2.10'
force 'org.datanucleus:datanucleus-rdbms:3.2.9'
}
}
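Note: Gradle's resolutionStrategy.force pins these artifacts to the listed versions even when transitive dependencies pull in different ones.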

dependencies {
implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow')
api project(':iceberg-api')
implementation project(':iceberg-common')
implementation project(':iceberg-core')
api project(':iceberg-data')
implementation project(':iceberg-orc')
implementation project(':iceberg-parquet')
implementation project(':iceberg-arrow')
implementation project(':iceberg-hive-metastore')

compileOnly "com.google.errorprone:error_prone_annotations"
compileOnly "org.apache.avro:avro"
compileOnly("org.apache.spark:spark-hive_2.11") {
exclude group: 'org.apache.avro', module: 'avro'
}

implementation("org.apache.orc:orc-core::nohive") {
exclude group: 'org.apache.hadoop'
exclude group: 'commons-lang'
// These artifacts are shaded and included in the orc-core fat jar
exclude group: 'com.google.protobuf', module: 'protobuf-java'
exclude group: 'org.apache.hive', module: 'hive-storage-api'
}

implementation("org.apache.arrow:arrow-vector") {
exclude group: 'io.netty', module: 'netty-buffer'
exclude group: 'io.netty', module: 'netty-common'
exclude group: 'com.google.code.findbugs', module: 'jsr305'
}

testImplementation("org.apache.hadoop:hadoop-minicluster") {
exclude group: 'org.apache.avro', module: 'avro'
}
testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts')
testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts')
testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts')
testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts')
}

test {
// For vectorized reads
// Allow unsafe memory access to avoid the costly bounds check Arrow performs on every index access
systemProperty("arrow.enable_unsafe_memory_access", "true")
// Disable the expensive null check for every get(index) call.
// Iceberg manages nullability checks itself instead of relying on Arrow.
systemProperty("arrow.enable_null_check_for_get", "false")

// Vectorized reads need more memory
maxHeapSize '2500m'
}
}
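The two Arrow settings above are ordinary JVM system properties, so the same configuration can be reproduced outside Gradle. A minimal sketch, assuming Arrow's behavior of reading these properties once when its classes are first loaded (the class below is illustrative, not part of this commit):

public class ArrowTestFlags {
  public static void main(String[] args) {
    // Must run before any org.apache.arrow classes are initialized,
    // because Arrow caches both properties in static fields.
    System.setProperty("arrow.enable_unsafe_memory_access", "true"); // skip bounds checks
    System.setProperty("arrow.enable_null_check_for_get", "false");  // skip per-get null checks
    // ... launch vectorized-read benchmarks here ...
  }
}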

// add enabled Spark version modules to the build
def sparkVersions = (System.getProperty("sparkVersions") != null ? System.getProperty("sparkVersions") : System.getProperty("defaultSparkVersions")).split(",")

@@ -92,4 +92,3 @@ if (jdkVersion == '8' && sparkVersions.contains("2.4")) {
if (sparkVersions.contains("3.0")) {
apply from: file("$projectDir/v3.0/build.gradle")
}
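The Spark modules applied above are selected through the sparkVersions system property, falling back to defaultSparkVersions, so a build can be limited to one version, for example by passing -DsparkVersions=3.0 on the Gradle command line.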

17 changes: 15 additions & 2 deletions spark/v2.4/build.gradle
@@ -43,21 +43,34 @@ project(':iceberg-spark:iceberg-spark2') {
implementation project(':iceberg-parquet')
implementation project(':iceberg-arrow')
implementation project(':iceberg-hive-metastore')
implementation project(':iceberg-spark')
implementation "com.github.ben-manes.caffeine:caffeine"

compileOnly "com.google.errorprone:error_prone_annotations"
compileOnly "org.apache.avro:avro"
compileOnly("org.apache.spark:spark-hive_2.11") {
exclude group: 'org.apache.avro', module: 'avro'
}

testImplementation project(path: ':iceberg-spark', configuration: 'testArtifacts')
implementation("org.apache.orc:orc-core::nohive") {
exclude group: 'org.apache.hadoop'
exclude group: 'commons-lang'
// These artifacts are shaded and included in the orc-core fat jar
exclude group: 'com.google.protobuf', module: 'protobuf-java'
exclude group: 'org.apache.hive', module: 'hive-storage-api'
}

implementation("org.apache.arrow:arrow-vector") {
exclude group: 'io.netty', module: 'netty-buffer'
exclude group: 'io.netty', module: 'netty-common'
exclude group: 'com.google.code.findbugs', module: 'jsr305'
}

testImplementation("org.apache.hadoop:hadoop-minicluster") {
exclude group: 'org.apache.avro', module: 'avro'
}
testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts')
testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts')
testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts')
testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts')
}
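A note on the notation above: in org.apache.orc:orc-core::nohive, the empty third segment omits the version, leaving it to the project's dependency constraints, and nohive selects the classifier, that is, the ORC artifact built without bundled Hive classes.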

13 changes: 10 additions & 3 deletions spark/v3.0/build.gradle
@@ -37,27 +37,34 @@ project(':iceberg-spark:iceberg-spark3') {
implementation project(':iceberg-parquet')
implementation project(':iceberg-arrow')
implementation project(':iceberg-hive-metastore')
implementation project(':iceberg-spark')

compileOnly "com.google.errorprone:error_prone_annotations"
compileOnly "org.apache.avro:avro"
compileOnly("org.apache.spark:spark-hive_2.12:${project.ext.Spark30Version}") {
exclude group: 'org.apache.avro', module: 'avro'
exclude group: 'org.apache.arrow'
}

implementation("org.apache.orc:orc-core::nohive") {
exclude group: 'org.apache.hadoop'
exclude group: 'commons-lang'
// These artifacts are shaded and included in the orc-core fat jar
exclude group: 'com.google.protobuf', module: 'protobuf-java'
exclude group: 'org.apache.hive', module: 'hive-storage-api'
}

implementation("org.apache.arrow:arrow-vector") {
exclude group: 'io.netty', module: 'netty-buffer'
exclude group: 'io.netty', module: 'netty-common'
exclude group: 'com.google.code.findbugs', module: 'jsr305'
}

testImplementation project(path: ':iceberg-spark', configuration: 'testArtifacts')

testImplementation("org.apache.hadoop:hadoop-minicluster") {
exclude group: 'org.apache.avro', module: 'avro'
}
testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts')
testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts')
testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts')
testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts')
testImplementation "org.xerial:sqlite-jdbc"
}
58 changes: 58 additions & 0 deletions SparkBenchmarkUtil.java (new file in package org.apache.iceberg.spark)
@@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.iceberg.spark;

import java.util.List;
import org.apache.iceberg.Schema;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.catalyst.expressions.Attribute;
import org.apache.spark.sql.catalyst.expressions.AttributeReference;
import org.apache.spark.sql.catalyst.expressions.Expression;
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection;
import org.apache.spark.sql.types.StructType;
import scala.collection.JavaConverters;

public class SparkBenchmarkUtil {

private SparkBenchmarkUtil() {
}

public static UnsafeProjection projection(Schema expectedSchema, Schema actualSchema) {
StructType struct = SparkSchemaUtil.convert(actualSchema);

List<AttributeReference> refs = JavaConverters.seqAsJavaListConverter(struct.toAttributes()).asJava();
List<Attribute> attrs = Lists.newArrayListWithExpectedSize(struct.fields().length);
List<Expression> exprs = Lists.newArrayListWithExpectedSize(struct.fields().length);

// Bind one attribute per column of the actual (read) schema
for (AttributeReference ref : refs) {
attrs.add(ref.toAttribute());
}

// Look up each expected column in the actual schema by name so the
// resulting projection emits columns in expectedSchema order
for (Types.NestedField field : expectedSchema.columns()) {
int indexInIterSchema = struct.fieldIndex(field.name());
exprs.add(refs.get(indexInIterSchema));
}

return UnsafeProjection.create(
JavaConverters.asScalaBufferConverter(exprs).asScala().toSeq(),
JavaConverters.asScalaBufferConverter(attrs).asScala().toSeq());
}
}
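A hypothetical usage sketch (the helper class and variable names below are illustrative, not part of this commit): the projection turns a row whose columns follow actualSchema into one whose columns follow expectedSchema.

import org.apache.iceberg.Schema;
import org.apache.iceberg.spark.SparkBenchmarkUtil;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection;

class ProjectionExample {
  // Reorders a row read in actualSchema column order into expectedSchema order.
  static InternalRow reorder(Schema expectedSchema, Schema actualSchema, InternalRow row) {
    UnsafeProjection projection = SparkBenchmarkUtil.projection(expectedSchema, actualSchema);
    return projection.apply(row); // returns an UnsafeRow with columns reordered
  }
}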