Commit 37ec2df
Spark 2.4: Remove module (apache#7385)
1 parent 403efd1 commit 37ec2df

208 files changed: +16 −40461 lines

.github/workflows/spark-ci.yml (-26)

@@ -55,32 +55,6 @@ concurrency:
   cancel-in-progress: ${{ github.event_name == 'pull_request' }}
 
 jobs:
-  spark2-tests:
-    runs-on: ubuntu-22.04
-    env:
-      SPARK_LOCAL_IP: localhost
-    steps:
-    - uses: actions/checkout@v3
-    - uses: actions/setup-java@v3
-      with:
-        distribution: zulu
-        java-version: 8
-    - uses: actions/cache@v3
-      with:
-        path: |
-          ~/.gradle/caches
-          ~/.gradle/wrapper
-        key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle*', '**/gradle-wrapper.properties') }}
-        restore-keys: ${{ runner.os }}-gradle-
-    - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts
-    - run: ./gradlew -DsparkVersions=2.4 -DhiveVersions= -DflinkVersions= :iceberg-spark:check :iceberg-spark:iceberg-spark-2.4:check :iceberg-spark:iceberg-spark-runtime-2.4:check -Pquick=true -x javadoc
-    - uses: actions/upload-artifact@v3
-      if: failure()
-      with:
-        name: test logs
-        path: |
-          **/build/testlogs
-
   spark-3x-scala-2-12-tests:
     runs-on: ubuntu-22.04
     strategy:

.gitignore (-1)

@@ -28,7 +28,6 @@ lib/
 site/site
 
 # benchmark output
-spark/v2.4/spark/benchmark/*
 spark/v3.1/spark/benchmark/*
 spark/v3.2/spark/benchmark/*
 spark/v3.3/spark/benchmark/*

dev/stage-binaries.sh (+1 -1)

@@ -20,7 +20,7 @@
 
 SCALA_VERSION=2.12
 FLINK_VERSIONS=1.15,1.16,1.17
-SPARK_VERSIONS=2.4,3.1,3.2,3.3,3.4
+SPARK_VERSIONS=3.1,3.2,3.3,3.4
 HIVE_VERSIONS=2,3
 
 ./gradlew -Prelease -DscalaVersion=$SCALA_VERSION -DflinkVersions=$FLINK_VERSIONS -DsparkVersions=$SPARK_VERSIONS -DhiveVersions=$HIVE_VERSIONS publishApachePublicationToMavenRepository

docs/spark-configuration.md (-10)

@@ -124,13 +124,6 @@ spark.sql.catalog.custom_prod.catalog-impl = com.my.custom.CatalogImpl
 spark.sql.catalog.custom_prod.my-additional-catalog-config = my-value
 ```
 
-### Catalogs in Spark 2.4
-
-When using Iceberg 0.11.0 and later, Spark 2.4 can load tables from multiple Iceberg catalogs or from table locations.
-
-Catalogs in 2.4 are configured just like catalogs in 3.x, but only Iceberg catalogs are supported.
-
-
 ## SQL Extensions
 
 Iceberg 0.11.0 and later add an extension module to Spark to add new SQL commands, like `CALL` for stored procedures or `ALTER TABLE ... WRITE ORDERED BY`.

@@ -142,9 +135,6 @@ Using those SQL commands requires adding Iceberg extensions to your Spark enviro
 |---------------------------|---------------------------------------------------------------------|
 | `spark.sql.extensions`    | `org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions` |
 
-SQL extensions are not available for Spark 2.4.
-
-
 ## Runtime configuration
 
 ### Read options
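
For context, the `spark.sql.extensions` property shown in the docs table above is applied when the Spark session is built. A minimal sketch, assuming a local session (the app name and master are illustrative, not part of this commit):

```scala
import org.apache.spark.sql.SparkSession

// Enable the Iceberg SQL extensions listed in the docs table above.
// The app name and master are assumptions for a self-contained example.
val spark = SparkSession.builder()
  .appName("iceberg-extensions-example")
  .master("local[*]")
  .config("spark.sql.extensions",
    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
  .getOrCreate()
```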

docs/spark-ddl.md (+2 -8)

@@ -27,13 +27,7 @@ menu:
 
 # Spark DDL
 
-To use Iceberg in Spark, first configure [Spark catalogs](../spark-configuration).
-
-Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. Spark DSv2 is an evolving API with different levels of support in Spark versions. Spark 2.4 does not support SQL DDL.
-
-{{< hint info >}}
-Spark 2.4 can't create Iceberg tables with DDL, instead use Spark 3 or the [Iceberg API](../java-api-quickstart).
-{{< /hint >}}
+To use Iceberg in Spark, first configure [Spark catalogs](../spark-configuration). Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations.
 
 ## `CREATE TABLE`
 

@@ -256,7 +250,7 @@ ADD COLUMN points.value.b int
 
 Note: Altering a map 'key' column by adding columns is not allowed. Only map values can be updated.
 
-In Spark 2.4.4 and later, you can add columns in any position by adding `FIRST` or `AFTER` clauses:
+Add columns in any position by adding `FIRST` or `AFTER` clauses:
 
 ```sql
 ALTER TABLE prod.db.sample
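
The second hunk is cut off inside the doc's SQL example. As a hedged sketch of the `FIRST`/`AFTER` clauses it describes, issued through `spark.sql` (the column names are assumed):

```scala
// Add a column after an existing column, and another at the first position.
// Table and column names are assumptions for illustration.
spark.sql("ALTER TABLE prod.db.sample ADD COLUMN new_column bigint AFTER data")
spark.sql("ALTER TABLE prod.db.sample ADD COLUMN first_column bigint FIRST")
```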

docs/spark-queries.md (+2 -44)

@@ -27,22 +27,7 @@ menu:
 
 # Spark Queries
 
-To use Iceberg in Spark, first configure [Spark catalogs](../spark-configuration).
-
-Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. Spark DSv2 is an evolving API with different levels of support in Spark versions:
-
-| Feature support                               | Spark 3 | Spark 2.4 | Notes |
-|-----------------------------------------------|---------|-----------|-------|
-| [`SELECT`](#querying-with-sql)                | ✔️      |           |       |
-| [DataFrame reads](#querying-with-dataframes)  | ✔️      | ✔️        |       |
-| [Metadata table `SELECT`](#inspecting-tables) | ✔️      |           |       |
-| [History metadata table](#history)            | ✔️      | ✔️        |       |
-| [Snapshots metadata table](#snapshots)        | ✔️      | ✔️        |       |
-| [Files metadata table](#files)                | ✔️      | ✔️        |       |
-| [Manifests metadata table](#manifests)        | ✔️      | ✔️        |       |
-| [Partitions metadata table](#partitions)      | ✔️      | ✔️        |       |
-| [All metadata tables](#all-metadata-tables)   | ✔️      | ✔️        |       |
-
+To use Iceberg in Spark, first configure [Spark catalogs](../spark-configuration). Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations.
 
 ## Querying with SQL
 

@@ -75,8 +60,6 @@ val df = spark.table("prod.db.table")
 
 ### Catalogs with DataFrameReader
 
-Iceberg 0.11.0 adds multi-catalog support to `DataFrameReader` in both Spark 3 and 2.4.
-
 Paths and table names can be loaded with Spark's `DataFrameReader` interface. How tables are loaded depends on how
 the identifier is specified. When using `spark.read.format("iceberg").load(table)` or `spark.table(table)` the `table`
 variable can take a number of forms as listed below:

@@ -205,38 +188,13 @@ Incremental read works with both V1 and V2 format-version.
 Incremental read is not supported by Spark's SQL syntax.
 {{< /hint >}}
 
-### Spark 2.4
-
-Spark 2.4 requires using the DataFrame reader with `iceberg` as a format, because 2.4 does not support direct SQL queries:
-
-```scala
-// named metastore table
-spark.read.format("iceberg").load("catalog.db.table")
-// Hadoop path table
-spark.read.format("iceberg").load("hdfs://nn:8020/path/to/table")
-```
-
-#### Spark 2.4 with SQL
-
-To run SQL `SELECT` statements on Iceberg tables in 2.4, register the DataFrame as a temporary table:
-
-```scala
-val df = spark.read.format("iceberg").load("db.table")
-df.createOrReplaceTempView("table")
-
-spark.sql("""select count(1) from table""").show()
-```
-
-
 ## Inspecting tables
 
 To inspect a table's history, snapshots, and other metadata, Iceberg supports metadata tables.
 
 Metadata tables are identified by adding the metadata table name after the original table name. For example, history for `db.table` is read using `db.table.history`.
 
 {{< hint info >}}
-For Spark 2.4, use the `DataFrameReader` API to [inspect tables](#inspecting-with-dataframes).
-
 For Spark 3, prior to 3.2, the Spark [session catalog](../spark-configuration#replacing-the-session-catalog) does not support table names with multipart identifiers such as `catalog.database.table.metadata`. As a workaround, configure an `org.apache.iceberg.spark.SparkCatalog`, or use the Spark `DataFrameReader` API.
 {{< /hint >}}

@@ -422,7 +380,7 @@ SELECT * FROM prod.db.table.refs;
 
 ### Inspecting with DataFrames
 
-Metadata tables can be loaded in Spark 2.4 or Spark 3 using the DataFrameReader API:
+Metadata tables can be loaded using the DataFrameReader API:
 
 ```scala
 // named metastore table
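
The final hunk ends just inside the doc's example. A short sketch of loading a metadata table through the DataFrameReader API, as the updated line describes (the table name is assumed):

```scala
// Read the history metadata table of prod.db.table by appending ".history".
val history = spark.read.format("iceberg").load("prod.db.table.history")
history.show(truncate = false)
```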

docs/spark-structured-streaming.md (-4)

@@ -32,10 +32,6 @@ with different levels of support in Spark versions.
 
 As of Spark 3, DataFrame reads and writes are supported.
 
-| Feature support                      | Spark 3 | Spark 2.4 | Notes |
-|--------------------------------------|---------|-----------|-------|
-| [DataFrame write](#streaming-writes) | ✔️      | ✔️        |       |
-
 ## Streaming Reads
 
 Iceberg supports processing incremental data in spark structured streaming jobs which starts from a historical timestamp:
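
The kept context line describes streaming reads that start from a historical timestamp. A sketch of such a read, assuming the `stream-from-timestamp` read option (milliseconds since epoch, per the Iceberg docs) and an illustrative table name:

```scala
// Stream changes committed after a point in time (here: roughly one day ago).
val fromTimestampMillis = System.currentTimeMillis() - 24 * 60 * 60 * 1000L

val stream = spark.readStream
  .format("iceberg")
  .option("stream-from-timestamp", fromTimestampMillis.toString)
  .load("prod.db.table")
```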

docs/spark-writes.md (+10 -38)

@@ -33,16 +33,16 @@ Some plans are only available when using [Iceberg SQL extensions](../spark-confi
 
 Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. Spark DSv2 is an evolving API with different levels of support in Spark versions:
 
-| Feature support                             | Spark 3 | Spark 2.4 | Notes                                         |
-|---------------------------------------------|---------|-----------|-----------------------------------------------|
-| [SQL insert into](#insert-into)             | ✔️      |           |                                               |
-| [SQL merge into](#merge-into)               | ✔️      |           | ⚠ Requires Iceberg Spark extensions           |
-| [SQL insert overwrite](#insert-overwrite)   | ✔️      |           |                                               |
-| [SQL delete from](#delete-from)             | ✔️      |           | ⚠ Row-level delete requires Spark extensions  |
-| [SQL update](#update)                       | ✔️      |           | ⚠ Requires Iceberg Spark extensions           |
-| [DataFrame append](#appending-data)         | ✔️      | ✔️        |                                               |
-| [DataFrame overwrite](#overwriting-data)    | ✔️      | ✔️        | ⚠ Behavior changed in Spark 3                 |
-| [DataFrame CTAS and RTAS](#creating-tables) | ✔️      |           |                                               |
+| Feature support                             | Spark 3 | Notes                                         |
+|---------------------------------------------|---------|-----------------------------------------------|
+| [SQL insert into](#insert-into)             | ✔️      |                                               |
+| [SQL merge into](#merge-into)               | ✔️      | ⚠ Requires Iceberg Spark extensions           |
+| [SQL insert overwrite](#insert-overwrite)   | ✔️      |                                               |
+| [SQL delete from](#delete-from)             | ✔️      | ⚠ Row-level delete requires Spark extensions  |
+| [SQL update](#update)                       | ✔️      | ⚠ Requires Iceberg Spark extensions           |
+| [DataFrame append](#appending-data)         | ✔️      |                                               |
+| [DataFrame overwrite](#overwriting-data)    | ✔️      |                                               |
+| [DataFrame CTAS and RTAS](#creating-tables) | ✔️      |                                               |
 
 
 ## Writing with SQL

@@ -234,17 +234,6 @@ val data: DataFrame = ...
 data.writeTo("prod.db.table").append()
 ```
 
-#### Spark 2.4
-
-In Spark 2.4, use the v1 API with `append` mode and `iceberg` format:
-
-```scala
-data.write
-    .format("iceberg")
-    .mode("append")
-    .save("db.table")
-```
-
 ### Overwriting data
 
 To overwrite partitions dynamically, use `overwritePartitions()`:

@@ -260,23 +249,6 @@ To explicitly overwrite partitions, use `overwrite` to supply a filter:
 data.writeTo("prod.db.table").overwrite($"level" === "INFO")
 ```
 
-#### Spark 2.4
-
-In Spark 2.4, overwrite values in an Iceberg table with `overwrite` mode and `iceberg` format:
-
-```scala
-data.write
-    .format("iceberg")
-    .mode("overwrite")
-    .save("db.table")
-```
-
-{{< hint danger >}}
-**The behavior of overwrite mode changed between Spark 2.4 and Spark 3**.
-{{< /hint >}}
-
-The behavior of DataFrameWriter overwrite mode was undefined in Spark 2.4, but is required to overwrite the entire table in Spark 3. Because of this new requirement, the Iceberg source's behavior changed in Spark 3. In Spark 2.4, the behavior was to dynamically overwrite partitions. To use the Spark 2.4 behavior, add option `overwrite-mode=dynamic`.
-
 ### Creating tables
 
 To run a CTAS or RTAS, use `create`, `replace`, or `createOrReplace` operations:
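
For reference, the Spark 3 DataFrameWriterV2 calls that the surviving docs describe, collected into one sketch (the source and target table names are assumptions):

```scala
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col

val data: DataFrame = spark.table("prod.db.source") // assumed source table

data.writeTo("prod.db.table").append()                           // append rows
data.writeTo("prod.db.table").overwritePartitions()              // dynamic partition overwrite
data.writeTo("prod.db.table").overwrite(col("level") === "INFO") // explicit overwrite by filter
```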

gradle.properties (+1 -1)

@@ -21,7 +21,7 @@ systemProp.knownFlinkVersions=1.15,1.16,1.17
 systemProp.defaultHiveVersions=2
 systemProp.knownHiveVersions=2,3
 systemProp.defaultSparkVersions=3.4
-systemProp.knownSparkVersions=2.4,3.1,3.2,3.3,3.4
+systemProp.knownSparkVersions=3.1,3.2,3.3,3.4
 systemProp.defaultScalaVersion=2.12
 systemProp.knownScalaVersions=2.12,2.13
 org.gradle.parallel=true

jmh.gradle (-4)

@@ -25,10 +25,6 @@ def sparkVersions = (System.getProperty("sparkVersions") != null ? System.getProperty("sparkVersions") : System.getProperty("defaultSparkVersions")).split(",")
 def scalaVersion = System.getProperty("scalaVersion") != null ? System.getProperty("scalaVersion") : System.getProperty("defaultScalaVersion")
 def jmhProjects = [project(":iceberg-core")]
 
-if (jdkVersion == '8' && sparkVersions.contains("2.4")) {
-  jmhProjects.add(project(":iceberg-spark:iceberg-spark-2.4"))
-}
-
 if (sparkVersions.contains("3.1")) {
   jmhProjects.add(project(":iceberg-spark:iceberg-spark-3.1_2.12"))
 }

settings.gradle (-10)

@@ -183,16 +183,6 @@ if (hiveVersions.contains("2") || hiveVersions.contains("3")) {
 }
 
 if (JavaVersion.current() == JavaVersion.VERSION_1_8) {
-  if (sparkVersions.contains("2.4")) {
-    include ':iceberg-spark:spark-2.4'
-    include ':iceberg-spark:spark-runtime-2.4'
-
-    project(':iceberg-spark:spark-2.4').projectDir = file('spark/v2.4/spark')
-    project(':iceberg-spark:spark-2.4').name = 'iceberg-spark-2.4'
-    project(':iceberg-spark:spark-runtime-2.4').projectDir = file('spark/v2.4/spark-runtime')
-    project(':iceberg-spark:spark-runtime-2.4').name = 'iceberg-spark-runtime-2.4'
-  }
-
   if (hiveVersions.contains("3")) {
     include 'hive3'
     include 'hive3-orc-bundle'

spark/build.gradle (-4)

@@ -20,10 +20,6 @@
 // add enabled Spark version modules to the build
 def sparkVersions = (System.getProperty("sparkVersions") != null ? System.getProperty("sparkVersions") : System.getProperty("defaultSparkVersions")).split(",")
 
-if (jdkVersion == '8' && sparkVersions.contains("2.4")) {
-  apply from: file("$projectDir/v2.4/build.gradle")
-}
-
 if (sparkVersions.contains("3.1")) {
   apply from: file("$projectDir/v3.1/build.gradle")
 }
