Skip to content

Commit

Permalink
Merge pull request databricks#26 from tabular-io/table-maintenance
Browse files Browse the repository at this point in the history
Add table maintenance notebook and nyc taxi parquet files
  • Loading branch information
samredai authored Jul 18, 2022
2 parents 79b5855 + 685aa82 commit 8635800
Show file tree
Hide file tree
Showing 4 changed files with 349 additions and 8 deletions.
2 changes: 0 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ services:
image: tabulario/spark-iceberg
depends_on:
- postgres
container_name: spark-iceberg
volumes:
- ./warehouse:/home/iceberg/warehouse
- ./notebooks:/home/iceberg/notebooks/notebooks
Expand All @@ -15,7 +14,6 @@ services:
- 18080:18080
postgres:
image: postgres:13.4-bullseye
container_name: postgres
environment:
- POSTGRES_USER=admin
- POSTGRES_PASSWORD=password
Expand Down
23 changes: 18 additions & 5 deletions spark/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME}
WORKDIR ${SPARK_HOME}

# Download spark
RUN curl https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz -o spark-3.2.1-bin-hadoop3.2.tgz \
RUN curl https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz -o spark-3.2.1-bin-hadoop3.2.tgz \
&& tar xvzf spark-3.2.1-bin-hadoop3.2.tgz --directory /opt/spark --strip-components 1 \
&& rm -rf spark-3.2.1-bin-hadoop3.2.tgz

Expand Down Expand Up @@ -69,13 +69,26 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2
ENV IJAVA_CLASSPATH=/opt/spark/jars/*

RUN mkdir -p /home/iceberg/data \
&& curl https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-04.csv -o /home/iceberg/data/yello_tripdata_2020-04.csv \
&& curl https://data.cityofnewyork.us/resource/tg4x-b46p.json > /home/iceberg/data/nyc_film_permits.json

RUN mkdir -p /home/iceberg/localwarehouse /home/iceberg/notebooks /home/iceberg/warehouse /home/iceberg/spark-events
&& curl https://data.cityofnewyork.us/resource/tg4x-b46p.json > /home/iceberg/data/nyc_film_permits.json \
&& curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-04.parquet -o /home/iceberg/data/yellow_tripdata_2022-04.parquet \
&& curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-03.parquet -o /home/iceberg/data/yellow_tripdata_2022-03.parquet \
&& curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2022-02.parquet \
&& curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -o /home/iceberg/data/yellow_tripdata_2022-01.parquet \
&& curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-12.parquet \
&& curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-11.parquet \
&& curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-10.parquet \
&& curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-09.parquet \
&& curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-08.parquet \
&& curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-07.parquet \
&& curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-06.parquet \
&& curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-05.parquet \
&& curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-04.parquet

RUN mkdir -p /home/iceberg/localwarehouse /home/iceberg/notebooks /home/iceberg/warehouse /home/iceberg/spark-events /home/iceberg
COPY ["notebooks/Iceberg - Getting Started.ipynb", "/home/iceberg/notebooks"]
COPY ["notebooks/Iceberg - Integrated Audits Demo.ipynb", "/home/iceberg/notebooks"]
COPY ["notebooks/Iceberg - An Introduction to the Iceberg Java API.ipynb", "/home/iceberg/notebooks"]
COPY ["notebooks/Iceberg - Table Maintenance Spark Procedures.ipynb", "/home/iceberg/notebooks"]

# Add a notebook command
RUN echo '#! /bin/sh' >> /bin/notebook \
Expand Down
2 changes: 1 addition & 1 deletion spark/notebooks/Iceberg - Getting Started.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
"metadata": {},
"outputs": [],
"source": [
"df = spark.read.option(\"header\", True).option(\"inferSchema\", \"true\").csv(\"/home/iceberg/data/yello_tripdata_2020-04.csv\")\n",
"df = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2021-04.parquet\")\n",
"df.write.saveAsTable(\"nyc.taxis\")"
]
},
Expand Down
330 changes: 330 additions & 0 deletions spark/notebooks/Iceberg - Table Maintenance Spark Procedures.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,330 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1041ae6f",
"metadata": {},
"source": [
"![iceberg-logo](https://www.apache.org/logos/res/iceberg/iceberg.png)"
]
},
{
"cell_type": "markdown",
"id": "247fb2ab",
"metadata": {},
"source": [
"### [Table Maintenance: The Key To Keeping Your Iceberg Tables Healthy and Performant](https://tabular.io/blog/table-maintenance/)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6a5c8206",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"spark"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1dab5ef0",
"metadata": {},
"outputs": [],
"source": [
"spark.sql(\"DROP TABLE IF EXISTS demo.nyc.taxis_sample\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "49a45d0b",
"metadata": {},
"outputs": [],
"source": [
"spark.sql(\"\"\"\n",
"CREATE TABLE demo.nyc.taxis_sample (\n",
" `VendorID` BIGINT,\n",
" `tpep_pickup_datetime` TIMESTAMP,\n",
" `tpep_dropoff_datetime` TIMESTAMP,\n",
" `passenger_count` DOUBLE,\n",
" `trip_distance` DOUBLE,\n",
" `RatecodeID` DOUBLE,\n",
" `store_and_fwd_flag` STRING,\n",
" `PULocationID` BIGINT,\n",
" `DOLocationID` BIGINT,\n",
" `payment_type` BIGINT,\n",
" `fare_amount` DOUBLE,\n",
" `extra` DOUBLE,\n",
" `mta_tax` DOUBLE,\n",
" `tip_amount` DOUBLE,\n",
" `tolls_amount` DOUBLE,\n",
" `improvement_surcharge` DOUBLE,\n",
" `total_amount` DOUBLE,\n",
" `congestion_surcharge` DOUBLE,\n",
" `airport_fee` DOUBLE)\n",
"USING iceberg\n",
"TBLPROPERTIES(\n",
" 'write.target-file-size-bytes'='5242880'\n",
")\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f1f30120",
"metadata": {},
"outputs": [],
"source": [
"!ls -Shs /home/iceberg/data/yellow_tripdata_*.parquet"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "997bb9df",
"metadata": {},
"outputs": [],
"source": [
"val df_202201 = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-01.parquet\")\n",
"val df_202202 = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-02.parquet\")\n",
"val df_202203 = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-03.parquet\")\n",
"val df_q1 = df_202201.union(df_202202).union(df_202203)\n",
"df_q1.write.insertInto(\"nyc.taxis_sample\")"
]
},
{
"cell_type": "markdown",
"id": "78cab088",
"metadata": {},
"source": [
"## Rewriting Data Files"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7ad64e6b",
"metadata": {},
"outputs": [],
"source": [
"spark.sql(\"SELECT file_path, file_size_in_bytes FROM nyc.taxis_sample.files\").show(100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d5d10355",
"metadata": {},
"outputs": [],
"source": [
"spark.sql(\"ALTER TABLE nyc.taxis_sample UNSET TBLPROPERTIES ('write.target-file-size-bytes')\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f26228a5",
"metadata": {},
"outputs": [],
"source": [
"spark.sql(\"CALL demo.system.rewrite_data_files(table => 'nyc.taxis_sample', options => map('target-file-size-bytes','52428800'))\").show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43a9ed67",
"metadata": {},
"outputs": [],
"source": [
"spark.sql(\"SELECT file_path, file_size_in_bytes FROM nyc.taxis_sample.files\").show(100)"
]
},
{
"cell_type": "markdown",
"id": "523eb893",
"metadata": {},
"source": [
"## Expiring Snapshots"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8f21e73",
"metadata": {},
"outputs": [],
"source": [
"!ls -Shs /home/iceberg/warehouse/nyc/taxis_sample/data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "98e8c5db",
"metadata": {},
"outputs": [],
"source": [
"spark.sql(\"SELECT committed_at, snapshot_id, operation FROM nyc.taxis_sample.snapshots\").show(truncate=false)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b264c989",
"metadata": {},
"outputs": [],
"source": [
"val now = java.util.Calendar.getInstance().getTime()\n",
"val format = new java.text.SimpleDateFormat(\"yyyy-MM-dd HH:mm:ss.SSS\")\n",
"val now_str = format.format(now)\n",
"\n",
"spark.sql(s\"CALL demo.system.expire_snapshots(table => 'nyc.taxis_sample', older_than => TIMESTAMP '$now_str', retain_last => 1)\").show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "131e1f09",
"metadata": {},
"outputs": [],
"source": [
"spark.sql(\"SELECT committed_at, snapshot_id, operation FROM nyc.taxis_sample.snapshots\").show(truncate=false)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f810ebb1",
"metadata": {},
"outputs": [],
"source": [
"!ls -Shs /home/iceberg/warehouse/nyc/taxis_sample/data"
]
},
{
"cell_type": "markdown",
"id": "c60631b7",
"metadata": {},
"source": [
"## Removing Orphan Files"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3dd93ad5",
"metadata": {},
"outputs": [],
"source": [
"spark.sql(s\"CALL demo.system.remove_orphan_files(table => 'nyc.taxis_sample', dry_run => true)\").show(truncate=false)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9998166a",
"metadata": {},
"outputs": [],
"source": [
"!echo \"orphan data\" | tee -a /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-1.parquet /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-2.parquet /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-3.parquet"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "94ad504b",
"metadata": {},
"outputs": [],
"source": [
"!touch -d \"2022-01-01 00:00:00.000000000\" /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-1.parquet /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-2.parquet /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-3.parquet"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "226f74da",
"metadata": {},
"outputs": [],
"source": [
"!ls -Shs /home/iceberg/warehouse/nyc/taxis_sample/data/"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f22673aa",
"metadata": {},
"outputs": [],
"source": [
"spark.sql(s\"CALL demo.system.remove_orphan_files(table => 'nyc.taxis_sample', dry_run => true)\").show(truncate=false)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3444042b",
"metadata": {},
"outputs": [],
"source": [
"spark.sql(s\"CALL demo.system.remove_orphan_files(table => 'nyc.taxis_sample')\").show(truncate=false)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d157881",
"metadata": {},
"outputs": [],
"source": [
"!ls -Shs /home/iceberg/warehouse/nyc/taxis_sample/data/"
]
},
{
"cell_type": "markdown",
"id": "181212b6",
"metadata": {},
"source": [
"## Rewriting Manifest Files"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "49290e56",
"metadata": {},
"outputs": [],
"source": [
"spark.sql(\"CALL demo.system.rewrite_manifests('nyc.taxis_sample')\").show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "spylon-kernel",
"language": "scala",
"name": "spylon-kernel"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".scala",
"help_links": [
{
"text": "MetaKernel Magics",
"url": "https://metakernel.readthedocs.io/en/latest/source/README.html"
}
],
"mimetype": "text/x-scala",
"name": "scala",
"pygments_lexer": "scala",
"version": "0.4.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit 8635800

Please sign in to comment.