Merge pull request databricks#26 from tabular-io/table-maintenance

Add table maintenance notebook and nyc taxi parquet files
cheewaphat · Jul 18, 2022 · 8635800 · 8635800
2 parents 79b5855 + 685aa82
commit 8635800
Show file tree

Hide file tree

Showing 4 changed files with 349 additions and 8 deletions.
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -5,7 +5,6 @@ services:
     image: tabulario/spark-iceberg
     depends_on:
       - postgres
-    container_name: spark-iceberg
     volumes:
       - ./warehouse:/home/iceberg/warehouse
       - ./notebooks:/home/iceberg/notebooks/notebooks
@@ -15,7 +14,6 @@ services:
       - 18080:18080
   postgres:
     image: postgres:13.4-bullseye
-    container_name: postgres
     environment:
       - POSTGRES_USER=admin
       - POSTGRES_PASSWORD=password

diff --git a/spark/Dockerfile b/spark/Dockerfile
@@ -37,7 +37,7 @@ RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME}
 WORKDIR ${SPARK_HOME}
 
 # Download spark
-RUN curl https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz -o spark-3.2.1-bin-hadoop3.2.tgz \
+RUN curl https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz -o spark-3.2.1-bin-hadoop3.2.tgz \
  && tar xvzf spark-3.2.1-bin-hadoop3.2.tgz --directory /opt/spark --strip-components 1 \
  && rm -rf spark-3.2.1-bin-hadoop3.2.tgz
 
@@ -69,13 +69,26 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2
 ENV IJAVA_CLASSPATH=/opt/spark/jars/*
 
 RUN mkdir -p /home/iceberg/data \
- && curl https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-04.csv -o /home/iceberg/data/yello_tripdata_2020-04.csv \
- && curl https://data.cityofnewyork.us/resource/tg4x-b46p.json > /home/iceberg/data/nyc_film_permits.json
-
-RUN mkdir -p /home/iceberg/localwarehouse /home/iceberg/notebooks /home/iceberg/warehouse /home/iceberg/spark-events
+ && curl https://data.cityofnewyork.us/resource/tg4x-b46p.json > /home/iceberg/data/nyc_film_permits.json \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-04.parquet -o /home/iceberg/data/yellow_tripdata_2022-04.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-03.parquet -o /home/iceberg/data/yellow_tripdata_2022-03.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2022-02.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -o /home/iceberg/data/yellow_tripdata_2022-01.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-12.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-11.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-10.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-09.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-08.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-07.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-06.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-05.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2021-04.parquet
+
+RUN mkdir -p /home/iceberg/localwarehouse /home/iceberg/notebooks /home/iceberg/warehouse /home/iceberg/spark-events /home/iceberg
 COPY ["notebooks/Iceberg - Getting Started.ipynb", "/home/iceberg/notebooks"]
 COPY ["notebooks/Iceberg - Integrated Audits Demo.ipynb", "/home/iceberg/notebooks"]
 COPY ["notebooks/Iceberg - An Introduction to the Iceberg Java API.ipynb", "/home/iceberg/notebooks"]
+COPY ["notebooks/Iceberg - Table Maintenance Spark Procedures.ipynb", "/home/iceberg/notebooks"]
 
 # Add a notebook command
 RUN echo '#! /bin/sh' >> /bin/notebook \

diff --git a/spark/notebooks/Iceberg - Getting Started.ipynb b/spark/notebooks/Iceberg - Getting Started.ipynb
@@ -63,7 +63,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = spark.read.option(\"header\", True).option(\"inferSchema\", \"true\").csv(\"/home/iceberg/data/yello_tripdata_2020-04.csv\")\n",
+    "df = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2021-04.parquet\")\n",
     "df.write.saveAsTable(\"nyc.taxis\")"
    ]
   },

diff --git a/spark/notebooks/Iceberg - Table Maintenance Spark Procedures.ipynb b/spark/notebooks/Iceberg - Table Maintenance Spark Procedures.ipynb
@@ -0,0 +1,330 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "1041ae6f",
+   "metadata": {},
+   "source": [
+    "![iceberg-logo](https://www.apache.org/logos/res/iceberg/iceberg.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "247fb2ab",
+   "metadata": {},
+   "source": [
+    "### [Table Maintenance: The Key To Keeping Your Iceberg Tables Healthy and Performant](https://tabular.io/blog/table-maintenance/)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6a5c8206",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "spark"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1dab5ef0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark.sql(\"DROP TABLE IF EXISTS demo.nyc.taxis_sample\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "49a45d0b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark.sql(\"\"\"\n",
+    "CREATE TABLE demo.nyc.taxis_sample (\n",
+    "  `VendorID` BIGINT,\n",
+    "  `tpep_pickup_datetime` TIMESTAMP,\n",
+    "  `tpep_dropoff_datetime` TIMESTAMP,\n",
+    "  `passenger_count` DOUBLE,\n",
+    "  `trip_distance` DOUBLE,\n",
+    "  `RatecodeID` DOUBLE,\n",
+    "  `store_and_fwd_flag` STRING,\n",
+    "  `PULocationID` BIGINT,\n",
+    "  `DOLocationID` BIGINT,\n",
+    "  `payment_type` BIGINT,\n",
+    "  `fare_amount` DOUBLE,\n",
+    "  `extra` DOUBLE,\n",
+    "  `mta_tax` DOUBLE,\n",
+    "  `tip_amount` DOUBLE,\n",
+    "  `tolls_amount` DOUBLE,\n",
+    "  `improvement_surcharge` DOUBLE,\n",
+    "  `total_amount` DOUBLE,\n",
+    "  `congestion_surcharge` DOUBLE,\n",
+    "  `airport_fee` DOUBLE)\n",
+    "USING iceberg\n",
+    "TBLPROPERTIES(\n",
+    "  'write.target-file-size-bytes'='5242880'\n",
+    ")\n",
+    "\"\"\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f1f30120",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!ls -Shs /home/iceberg/data/yellow_tripdata_*.parquet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "997bb9df",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "val df_202201 = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-01.parquet\")\n",
+    "val df_202202 = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-02.parquet\")\n",
+    "val df_202203 = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-03.parquet\")\n",
+    "val df_q1 = df_202201.union(df_202202).union(df_202203)\n",
+    "df_q1.write.insertInto(\"nyc.taxis_sample\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "78cab088",
+   "metadata": {},
+   "source": [
+    "## Rewriting Data Files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7ad64e6b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark.sql(\"SELECT file_path, file_size_in_bytes FROM nyc.taxis_sample.files\").show(100)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5d10355",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark.sql(\"ALTER TABLE nyc.taxis_sample UNSET TBLPROPERTIES ('write.target-file-size-bytes')\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f26228a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark.sql(\"CALL demo.system.rewrite_data_files(table => 'nyc.taxis_sample', options => map('target-file-size-bytes','52428800'))\").show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43a9ed67",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark.sql(\"SELECT file_path, file_size_in_bytes FROM nyc.taxis_sample.files\").show(100)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "523eb893",
+   "metadata": {},
+   "source": [
+    "## Expiring Snapshots"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a8f21e73",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!ls -Shs /home/iceberg/warehouse/nyc/taxis_sample/data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "98e8c5db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark.sql(\"SELECT committed_at, snapshot_id, operation FROM nyc.taxis_sample.snapshots\").show(truncate=false)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b264c989",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "val now = java.util.Calendar.getInstance().getTime()\n",
+    "val format = new java.text.SimpleDateFormat(\"yyyy-MM-dd HH:mm:ss.SSS\")\n",
+    "val now_str = format.format(now)\n",
+    "\n",
+    "spark.sql(s\"CALL demo.system.expire_snapshots(table => 'nyc.taxis_sample', older_than => TIMESTAMP '$now_str', retain_last => 1)\").show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "131e1f09",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark.sql(\"SELECT committed_at, snapshot_id, operation FROM nyc.taxis_sample.snapshots\").show(truncate=false)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f810ebb1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!ls -Shs /home/iceberg/warehouse/nyc/taxis_sample/data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c60631b7",
+   "metadata": {},
+   "source": [
+    "## Removing Orphan Files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3dd93ad5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark.sql(s\"CALL demo.system.remove_orphan_files(table => 'nyc.taxis_sample', dry_run => true)\").show(truncate=false)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9998166a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!echo \"orphan data\" | tee -a /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-1.parquet /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-2.parquet /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-3.parquet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "94ad504b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!touch -d \"2022-01-01 00:00:00.000000000\" /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-1.parquet /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-2.parquet /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-3.parquet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "226f74da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!ls -Shs /home/iceberg/warehouse/nyc/taxis_sample/data/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f22673aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark.sql(s\"CALL demo.system.remove_orphan_files(table => 'nyc.taxis_sample', dry_run => true)\").show(truncate=false)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3444042b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark.sql(s\"CALL demo.system.remove_orphan_files(table => 'nyc.taxis_sample')\").show(truncate=false)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2d157881",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!ls -Shs /home/iceberg/warehouse/nyc/taxis_sample/data/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "181212b6",
+   "metadata": {},
+   "source": [
+    "## Rewriting Manifest Files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "49290e56",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark.sql(\"CALL demo.system.rewrite_manifests('nyc.taxis_sample')\").show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "spylon-kernel",
+   "language": "scala",
+   "name": "spylon-kernel"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-scala",
+   "file_extension": ".scala",
+   "help_links": [
+    {
+     "text": "MetaKernel Magics",
+     "url": "https://metakernel.readthedocs.io/en/latest/source/README.html"
+    }
+   ],
+   "mimetype": "text/x-scala",
+   "name": "scala",
+   "pygments_lexer": "scala",
+   "version": "0.4.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}