forked from databricks/docker-spark-iceberg
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request databricks#26 from tabular-io/table-maintenance
Add table maintenance notebook and nyc taxi parquet files
- Loading branch information
Showing
4 changed files
with
349 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
330 changes: 330 additions & 0 deletions
330
spark/notebooks/Iceberg - Table Maintenance Spark Procedures.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,330 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "1041ae6f", | ||
"metadata": {}, | ||
"source": [ | ||
"![iceberg-logo](https://www.apache.org/logos/res/iceberg/iceberg.png)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "247fb2ab", | ||
"metadata": {}, | ||
"source": [ | ||
"### [Table Maintenance: The Key To Keeping Your Iceberg Tables Healthy and Performant](https://tabular.io/blog/table-maintenance/)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "6a5c8206", | ||
"metadata": { | ||
"scrolled": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"spark" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "1dab5ef0", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"spark.sql(\"DROP TABLE IF EXISTS demo.nyc.taxis_sample\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "49a45d0b", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"spark.sql(\"\"\"\n", | ||
"CREATE TABLE demo.nyc.taxis_sample (\n", | ||
" `VendorID` BIGINT,\n", | ||
" `tpep_pickup_datetime` TIMESTAMP,\n", | ||
" `tpep_dropoff_datetime` TIMESTAMP,\n", | ||
" `passenger_count` DOUBLE,\n", | ||
" `trip_distance` DOUBLE,\n", | ||
" `RatecodeID` DOUBLE,\n", | ||
" `store_and_fwd_flag` STRING,\n", | ||
" `PULocationID` BIGINT,\n", | ||
" `DOLocationID` BIGINT,\n", | ||
" `payment_type` BIGINT,\n", | ||
" `fare_amount` DOUBLE,\n", | ||
" `extra` DOUBLE,\n", | ||
" `mta_tax` DOUBLE,\n", | ||
" `tip_amount` DOUBLE,\n", | ||
" `tolls_amount` DOUBLE,\n", | ||
" `improvement_surcharge` DOUBLE,\n", | ||
" `total_amount` DOUBLE,\n", | ||
" `congestion_surcharge` DOUBLE,\n", | ||
" `airport_fee` DOUBLE)\n", | ||
"USING iceberg\n", | ||
"TBLPROPERTIES(\n", | ||
" 'write.target-file-size-bytes'='5242880'\n", | ||
")\n", | ||
"\"\"\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "f1f30120", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"!ls -Shs /home/iceberg/data/yellow_tripdata_*.parquet" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "997bb9df", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"val df_202201 = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-01.parquet\")\n", | ||
"val df_202202 = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-02.parquet\")\n", | ||
"val df_202203 = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-03.parquet\")\n", | ||
"val df_q1 = df_202201.union(df_202202).union(df_202203)\n", | ||
"df_q1.write.insertInto(\"nyc.taxis_sample\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "78cab088", | ||
"metadata": {}, | ||
"source": [ | ||
"## Rewriting Data Files" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "7ad64e6b", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"spark.sql(\"SELECT file_path, file_size_in_bytes FROM nyc.taxis_sample.files\").show(100)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "d5d10355", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"spark.sql(\"ALTER TABLE nyc.taxis_sample UNSET TBLPROPERTIES ('write.target-file-size-bytes')\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "f26228a5", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"spark.sql(\"CALL demo.system.rewrite_data_files(table => 'nyc.taxis_sample', options => map('target-file-size-bytes','52428800'))\").show()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "43a9ed67", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"spark.sql(\"SELECT file_path, file_size_in_bytes FROM nyc.taxis_sample.files\").show(100)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "523eb893", | ||
"metadata": {}, | ||
"source": [ | ||
"## Expiring Snapshots" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "a8f21e73", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"!ls -Shs /home/iceberg/warehouse/nyc/taxis_sample/data" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "98e8c5db", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"spark.sql(\"SELECT committed_at, snapshot_id, operation FROM nyc.taxis_sample.snapshots\").show(truncate=false)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "b264c989", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"val now = java.util.Calendar.getInstance().getTime()\n", | ||
"val format = new java.text.SimpleDateFormat(\"yyyy-MM-dd HH:mm:ss.SSS\")\n", | ||
"val now_str = format.format(now)\n", | ||
"\n", | ||
"spark.sql(s\"CALL demo.system.expire_snapshots(table => 'nyc.taxis_sample', older_than => TIMESTAMP '$now_str', retain_last => 1)\").show()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "131e1f09", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"spark.sql(\"SELECT committed_at, snapshot_id, operation FROM nyc.taxis_sample.snapshots\").show(truncate=false)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "f810ebb1", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"!ls -Shs /home/iceberg/warehouse/nyc/taxis_sample/data" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "c60631b7", | ||
"metadata": {}, | ||
"source": [ | ||
"## Removing Orphan Files" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "3dd93ad5", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"spark.sql(s\"CALL demo.system.remove_orphan_files(table => 'nyc.taxis_sample', dry_run => true)\").show(truncate=false)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "9998166a", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"!echo \"orphan data\" | tee -a /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-1.parquet /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-2.parquet /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-3.parquet" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "94ad504b", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"!touch -d \"2022-01-01 00:00:00.000000000\" /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-1.parquet /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-2.parquet /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-3.parquet" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "226f74da", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"!ls -Shs /home/iceberg/warehouse/nyc/taxis_sample/data/" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "f22673aa", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"spark.sql(s\"CALL demo.system.remove_orphan_files(table => 'nyc.taxis_sample', dry_run => true)\").show(truncate=false)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "3444042b", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"spark.sql(s\"CALL demo.system.remove_orphan_files(table => 'nyc.taxis_sample')\").show(truncate=false)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "2d157881", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"!ls -Shs /home/iceberg/warehouse/nyc/taxis_sample/data/" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "181212b6", | ||
"metadata": {}, | ||
"source": [ | ||
"## Rewriting Manifest Files" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "49290e56", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"spark.sql(\"CALL demo.system.rewrite_manifests('nyc.taxis_sample')\").show()" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "spylon-kernel", | ||
"language": "scala", | ||
"name": "spylon-kernel" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": "text/x-scala", | ||
"file_extension": ".scala", | ||
"help_links": [ | ||
{ | ||
"text": "MetaKernel Magics", | ||
"url": "https://metakernel.readthedocs.io/en/latest/source/README.html" | ||
} | ||
], | ||
"mimetype": "text/x-scala", | ||
"name": "scala", | ||
"pygments_lexer": "scala", | ||
"version": "0.4.1" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |