Spark 2.4: Remove module (apache#7385)
Fokko authored Apr 20, 2023
1 parent 403efd1 commit 37ec2df
Showing 208 changed files with 16 additions and 40,461 deletions.
26 changes: 0 additions & 26 deletions .github/workflows/spark-ci.yml
@@ -55,32 +55,6 @@ concurrency:
   cancel-in-progress: ${{ github.event_name == 'pull_request' }}
 
 jobs:
-  spark2-tests:
-    runs-on: ubuntu-22.04
-    env:
-      SPARK_LOCAL_IP: localhost
-    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-java@v3
-        with:
-          distribution: zulu
-          java-version: 8
-      - uses: actions/cache@v3
-        with:
-          path: |
-            ~/.gradle/caches
-            ~/.gradle/wrapper
-          key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle*', '**/gradle-wrapper.properties') }}
-          restore-keys: ${{ runner.os }}-gradle-
-      - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts
-      - run: ./gradlew -DsparkVersions=2.4 -DhiveVersions= -DflinkVersions= :iceberg-spark:check :iceberg-spark:iceberg-spark-2.4:check :iceberg-spark:iceberg-spark-runtime-2.4:check -Pquick=true -x javadoc
-      - uses: actions/upload-artifact@v3
-        if: failure()
-        with:
-          name: test logs
-          path: |
-            **/build/testlogs
   spark-3x-scala-2-12-tests:
     runs-on: ubuntu-22.04
     strategy:
1 change: 0 additions & 1 deletion .gitignore
@@ -28,7 +28,6 @@ lib/
 site/site
 
 # benchmark output
-spark/v2.4/spark/benchmark/*
 spark/v3.1/spark/benchmark/*
 spark/v3.2/spark/benchmark/*
 spark/v3.3/spark/benchmark/*
2 changes: 1 addition & 1 deletion dev/stage-binaries.sh
@@ -20,7 +20,7 @@
 
 SCALA_VERSION=2.12
 FLINK_VERSIONS=1.15,1.16,1.17
-SPARK_VERSIONS=2.4,3.1,3.2,3.3,3.4
+SPARK_VERSIONS=3.1,3.2,3.3,3.4
 HIVE_VERSIONS=2,3
 
 ./gradlew -Prelease -DscalaVersion=$SCALA_VERSION -DflinkVersions=$FLINK_VERSIONS -DsparkVersions=$SPARK_VERSIONS -DhiveVersions=$HIVE_VERSIONS publishApachePublicationToMavenRepository
10 changes: 0 additions & 10 deletions docs/spark-configuration.md
@@ -124,13 +124,6 @@ spark.sql.catalog.custom_prod.catalog-impl = com.my.custom.CatalogImpl
 spark.sql.catalog.custom_prod.my-additional-catalog-config = my-value
 ```
 
-### Catalogs in Spark 2.4
-
-When using Iceberg 0.11.0 and later, Spark 2.4 can load tables from multiple Iceberg catalogs or from table locations.
-
-Catalogs in 2.4 are configured just like catalogs in 3.x, but only Iceberg catalogs are supported.
-
-
 ## SQL Extensions
 
 Iceberg 0.11.0 and later add an extension module to Spark to add new SQL commands, like `CALL` for stored procedures or `ALTER TABLE ... WRITE ORDERED BY`.
@@ -142,9 +135,6 @@ Using those SQL commands requires adding Iceberg extensions to your Spark enviro
 |---------------------------|---------------------------------------------------------------------|
 | `spark.sql.extensions`    | `org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions` |
 
-SQL extensions are not available for Spark 2.4.
-
-
 ## Runtime configuration
 
 ### Read options
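For reference, the `spark.sql.extensions` setting retained above is applied when building a session. A minimal Scala sketch, with the app name and builder wiring illustrative:

```scala
import org.apache.spark.sql.SparkSession

// Enable Iceberg's SQL extensions so commands like CALL and
// ALTER TABLE ... WRITE ORDERED BY are available in Spark SQL
val spark = SparkSession.builder()
  .appName("iceberg-sql-extensions") // assumed name
  .config("spark.sql.extensions",
    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
  .getOrCreate()
```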
10 changes: 2 additions & 8 deletions docs/spark-ddl.md
@@ -27,13 +27,7 @@ menu:
 
 # Spark DDL
 
-To use Iceberg in Spark, first configure [Spark catalogs](../spark-configuration).
-
-Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. Spark DSv2 is an evolving API with different levels of support in Spark versions. Spark 2.4 does not support SQL DDL.
-
-{{< hint info >}}
-Spark 2.4 can't create Iceberg tables with DDL, instead use Spark 3 or the [Iceberg API](..//java-api-quickstart).
-{{< /hint >}}
+To use Iceberg in Spark, first configure [Spark catalogs](../spark-configuration). Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations.
 
 ## `CREATE TABLE`
 
@@ -256,7 +250,7 @@ ADD COLUMN points.value.b int
 
 Note: Altering a map 'key' column by adding columns is not allowed. Only map values can be updated.
 
-In Spark 2.4.4 and later, you can add columns in any position by adding `FIRST` or `AFTER` clauses:
+Add columns in any position by adding `FIRST` or `AFTER` clauses:
 
 ```sql
 ALTER TABLE prod.db.sample
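To illustrate the retained `FIRST`/`AFTER` syntax when executed from Scala, a minimal sketch; the column names are hypothetical, the table is the docs' `prod.db.sample`, and an active `spark` session is assumed:

```scala
// Add a column before all others, then another after an existing column
spark.sql("ALTER TABLE prod.db.sample ADD COLUMN row_id bigint FIRST")
spark.sql("ALTER TABLE prod.db.sample ADD COLUMN note string AFTER row_id")
```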
46 changes: 2 additions & 44 deletions docs/spark-queries.md
@@ -27,22 +27,7 @@ menu:
 
 # Spark Queries
 
-To use Iceberg in Spark, first configure [Spark catalogs](../spark-configuration).
-
-Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. Spark DSv2 is an evolving API with different levels of support in Spark versions:
-
-| Feature support                                  | Spark 3   | Spark 2.4  | Notes                                           |
-|--------------------------------------------------|-----------|------------|-------------------------------------------------|
-| [`SELECT`](#querying-with-sql)                   | ✔️        |            |                                                 |
-| [DataFrame reads](#querying-with-dataframes)     | ✔️        | ✔️         |                                                 |
-| [Metadata table `SELECT`](#inspecting-tables)    | ✔️        |            |                                                 |
-| [History metadata table](#history)               | ✔️        | ✔️         |                                                 |
-| [Snapshots metadata table](#snapshots)           | ✔️        | ✔️         |                                                 |
-| [Files metadata table](#files)                   | ✔️        | ✔️         |                                                 |
-| [Manifests metadata table](#manifests)           | ✔️        | ✔️         |                                                 |
-| [Partitions metadata table](#partitions)         | ✔️        | ✔️         |                                                 |
-| [All metadata tables](#all-metadata-tables)      | ✔️        | ✔️         |                                                 |
-
+To use Iceberg in Spark, first configure [Spark catalogs](../spark-configuration). Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations.
 
 ## Querying with SQL
 
@@ -75,8 +60,6 @@ val df = spark.table("prod.db.table")
 
 ### Catalogs with DataFrameReader
 
-Iceberg 0.11.0 adds multi-catalog support to `DataFrameReader` in both Spark 3 and 2.4.
-
 Paths and table names can be loaded with Spark's `DataFrameReader` interface. How tables are loaded depends on how
 the identifier is specified. When using `spark.read.format("iceberg").load(table)` or `spark.table(table)` the `table`
 variable can take a number of forms as listed below:
@@ -205,38 +188,13 @@ Incremental read works with both V1 and V2 format-version.
 Incremental read is not supported by Spark's SQL syntax.
 {{< /hint >}}
 
-### Spark 2.4
-
-Spark 2.4 requires using the DataFrame reader with `iceberg` as a format, because 2.4 does not support direct SQL queries:
-
-```scala
-// named metastore table
-spark.read.format("iceberg").load("catalog.db.table")
-// Hadoop path table
-spark.read.format("iceberg").load("hdfs://nn:8020/path/to/table")
-```
-
-#### Spark 2.4 with SQL
-
-To run SQL `SELECT` statements on Iceberg tables in 2.4, register the DataFrame as a temporary table:
-
-```scala
-val df = spark.read.format("iceberg").load("db.table")
-df.createOrReplaceTempView("table")
-
-spark.sql("""select count(1) from table""").show()
-```
-
-
 ## Inspecting tables
 
 To inspect a table's history, snapshots, and other metadata, Iceberg supports metadata tables.
 
 Metadata tables are identified by adding the metadata table name after the original table name. For example, history for `db.table` is read using `db.table.history`.
 
 {{< hint info >}}
-For Spark 2.4, use the `DataFrameReader` API to [inspect tables](#inspecting-with-dataframes).
-
 For Spark 3, prior to 3.2, the Spark [session catalog](../spark-configuration#replacing-the-session-catalog) does not support table names with multipart identifiers such as `catalog.database.table.metadata`. As a workaround, configure an `org.apache.iceberg.spark.SparkCatalog`, or use the Spark `DataFrameReader` API.
 {{< /hint >}}
 
@@ -422,7 +380,7 @@ SELECT * FROM prod.db.table.refs;
 
 ### Inspecting with DataFrames
 
-Metadata tables can be loaded in Spark 2.4 or Spark 3 using the DataFrameReader API:
+Metadata tables can be loaded using the DataFrameReader API:
 
 ```scala
 // named metastore table
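For context, the DataFrameReader path kept by this change loads a metadata table by suffixing its name onto the table identifier. A minimal sketch with a hypothetical table name:

```scala
// Load the files metadata table for db.table and inspect it
val files = spark.read.format("iceberg").load("db.table.files")
files.show(truncate = false)
```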
4 changes: 0 additions & 4 deletions docs/spark-structured-streaming.md
@@ -32,10 +32,6 @@ with different levels of support in Spark versions.
 
 As of Spark 3, DataFrame reads and writes are supported.
 
-| Feature support                                  | Spark 3   | Spark 2.4  | Notes                                           |
-|--------------------------------------------------|-----------|------------|-------------------------------------------------|
-| [DataFrame write](#streaming-writes)             | ✔️        | ✔️         |                                                 |
-
 ## Streaming Reads
 
 Iceberg supports processing incremental data in Spark structured streaming jobs, starting from a historical timestamp:
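The streaming write path that remains is Spark 3's `writeStream` with the `iceberg` format. A minimal sketch, assuming an existing streaming `DataFrame` and illustrative table and checkpoint names:

```scala
import java.util.concurrent.TimeUnit

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.Trigger

// Append micro-batches into an Iceberg table once a minute
def startIcebergStream(data: DataFrame): Unit = {
  data.writeStream
    .format("iceberg")
    .outputMode("append")
    .trigger(Trigger.ProcessingTime(1, TimeUnit.MINUTES))
    .option("path", "db.table")                              // target table (assumed name)
    .option("checkpointLocation", "/tmp/checkpoints/db.table") // assumed path
    .start()
}
```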
48 changes: 10 additions & 38 deletions docs/spark-writes.md
@@ -33,16 +33,16 @@ Some plans are only available when using [Iceberg SQL extensions](../spark-confi
 
 Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. Spark DSv2 is an evolving API with different levels of support in Spark versions:
 
-| Feature support                                  | Spark 3   | Spark 2.4  | Notes                                          |
-|--------------------------------------------------|-----------|------------|------------------------------------------------|
-| [SQL insert into](#insert-into)                  | ✔️        |            |                                                |
-| [SQL merge into](#merge-into)                    | ✔️        |            | ⚠ Requires Iceberg Spark extensions            |
-| [SQL insert overwrite](#insert-overwrite)        | ✔️        |            |                                                |
-| [SQL delete from](#delete-from)                  | ✔️        |            | ⚠ Row-level delete requires Spark extensions   |
-| [SQL update](#update)                            | ✔️        |            | ⚠ Requires Iceberg Spark extensions            |
-| [DataFrame append](#appending-data)              | ✔️        | ✔️         |                                                |
-| [DataFrame overwrite](#overwriting-data)         | ✔️        | ✔️         | ⚠ Behavior changed in Spark 3                  |
-| [DataFrame CTAS and RTAS](#creating-tables)      | ✔️        |            |                                                |
+| Feature support                                  | Spark 3   | Notes                                         |
+|--------------------------------------------------|-----------|-----------------------------------------------|
+| [SQL insert into](#insert-into)                  | ✔️        |                                               |
+| [SQL merge into](#merge-into)                    | ✔️        | ⚠ Requires Iceberg Spark extensions           |
+| [SQL insert overwrite](#insert-overwrite)        | ✔️        |                                               |
+| [SQL delete from](#delete-from)                  | ✔️        | ⚠ Row-level delete requires Spark extensions  |
+| [SQL update](#update)                            | ✔️        | ⚠ Requires Iceberg Spark extensions           |
+| [DataFrame append](#appending-data)              | ✔️        |                                               |
+| [DataFrame overwrite](#overwriting-data)         | ✔️        |                                               |
+| [DataFrame CTAS and RTAS](#creating-tables)      | ✔️        |                                               |
 
 
 ## Writing with SQL
@@ -234,17 +234,6 @@ val data: DataFrame = ...
 data.writeTo("prod.db.table").append()
 ```
 
-#### Spark 2.4
-
-In Spark 2.4, use the v1 API with `append` mode and `iceberg` format:
-
-```scala
-data.write
-    .format("iceberg")
-    .mode("append")
-    .save("db.table")
-```
-
 ### Overwriting data
 
 To overwrite partitions dynamically, use `overwritePartitions()`:
@@ -260,23 +249,6 @@ To explicitly overwrite partitions, use `overwrite` to supply a filter:
 data.writeTo("prod.db.table").overwrite($"level" === "INFO")
 ```
 
-#### Spark 2.4
-
-In Spark 2.4, overwrite values in an Iceberg table with `overwrite` mode and `iceberg` format:
-
-```scala
-data.write
-    .format("iceberg")
-    .mode("overwrite")
-    .save("db.table")
-```
-
-{{< hint danger >}}
-**The behavior of overwrite mode changed between Spark 2.4 and Spark 3**.
-{{< /hint >}}
-
-The behavior of DataFrameWriter overwrite mode was undefined in Spark 2.4, but is required to overwrite the entire table in Spark 3. Because of this new requirement, the Iceberg source's behavior changed in Spark 3. In Spark 2.4, the behavior was to dynamically overwrite partitions. To use the Spark 2.4 behavior, add option `overwrite-mode=dynamic`.
-
 ### Creating tables
 
 To run a CTAS or RTAS, use `create`, `replace`, or `createOrReplace` operations:
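The Spark 3 DataFrameWriterV2 calls in the surviving context replace the removed v1 `write.format("iceberg")` path. A minimal sketch, where each call is an independent example rather than a sequence and `data` is assumed:

```scala
import org.apache.spark.sql.DataFrame

def writeToIceberg(data: DataFrame): Unit = {
  // Append rows to an existing table
  data.writeTo("prod.db.table").append()

  // Dynamically overwrite only the partitions touched by this DataFrame,
  // the explicit replacement for Spark 2.4's overwrite-mode=dynamic option
  data.writeTo("prod.db.table").overwritePartitions()
}
```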
2 changes: 1 addition & 1 deletion gradle.properties
@@ -21,7 +21,7 @@ systemProp.knownFlinkVersions=1.15,1.16,1.17
 systemProp.defaultHiveVersions=2
 systemProp.knownHiveVersions=2,3
 systemProp.defaultSparkVersions=3.4
-systemProp.knownSparkVersions=2.4,3.1,3.2,3.3,3.4
+systemProp.knownSparkVersions=3.1,3.2,3.3,3.4
 systemProp.defaultScalaVersion=2.12
 systemProp.knownScalaVersions=2.12,2.13
 org.gradle.parallel=true
4 changes: 0 additions & 4 deletions jmh.gradle
@@ -25,10 +25,6 @@ def sparkVersions = (System.getProperty("sparkVersions") != null ? System.getProperty("sparkVersions") : System.getProperty("defaultSparkVersions")).split(",")
 def scalaVersion = System.getProperty("scalaVersion") != null ? System.getProperty("scalaVersion") : System.getProperty("defaultScalaVersion")
 def jmhProjects = [project(":iceberg-core")]
 
-if (jdkVersion == '8' && sparkVersions.contains("2.4")) {
-  jmhProjects.add(project(":iceberg-spark:iceberg-spark-2.4"))
-}
-
 if (sparkVersions.contains("3.1")) {
   jmhProjects.add(project(":iceberg-spark:iceberg-spark-3.1_2.12"))
 }
10 changes: 0 additions & 10 deletions settings.gradle
@@ -183,16 +183,6 @@ if (hiveVersions.contains("2") || hiveVersions.contains("3")) {
 }
 
 if (JavaVersion.current() == JavaVersion.VERSION_1_8) {
-  if (sparkVersions.contains("2.4")) {
-    include ':iceberg-spark:spark-2.4'
-    include ':iceberg-spark:spark-runtime-2.4'
-
-    project(':iceberg-spark:spark-2.4').projectDir = file('spark/v2.4/spark')
-    project(':iceberg-spark:spark-2.4').name = 'iceberg-spark-2.4'
-    project(':iceberg-spark:spark-runtime-2.4').projectDir = file('spark/v2.4/spark-runtime')
-    project(':iceberg-spark:spark-runtime-2.4').name = 'iceberg-spark-runtime-2.4'
-  }
-
   if (hiveVersions.contains("3")) {
     include 'hive3'
     include 'hive3-orc-bundle'
4 changes: 0 additions & 4 deletions spark/build.gradle
@@ -20,10 +20,6 @@
 // add enabled Spark version modules to the build
 def sparkVersions = (System.getProperty("sparkVersions") != null ? System.getProperty("sparkVersions") : System.getProperty("defaultSparkVersions")).split(",")
 
-if (jdkVersion == '8' && sparkVersions.contains("2.4")) {
-  apply from: file("$projectDir/v2.4/build.gradle")
-}
-
 if (sparkVersions.contains("3.1")) {
   apply from: file("$projectDir/v3.1/build.gradle")
 }