Commit

add single quotes around parameters
chengyineng38 committed Dec 21, 2020
2 parents eb8500d + ccd45a0 commit e6bb857
Showing 37 changed files with 105 additions and 13 deletions.


@@ -1 +1 @@
{"cells":[{"cell_type":"markdown","source":["## Plotting My Area\n\nUse the starter code below to explore the data for your area"],"metadata":{}},{"cell_type":"code","source":["import datetime\nimport glob\nimport pandas as pd\n\npath = \"/dbfs/databricks-datasets/COVID/CSSEGISandData/csse_covid_19_data/csse_covid_19_daily_reports\"\nall_files = glob.glob(path + \"/*.csv\")\n\ndfs = []\n\nfor filename in all_files:\n temp_df = pd.read_csv(filename)\n temp_df.columns = [c.replace(\"/\", \"_\") for c in temp_df.columns]\n temp_df.columns = [c.replace(\" \", \"_\") for c in temp_df.columns]\n \n month, day, year = filename.split(\"/\")[-1].replace(\".csv\", \"\").split(\"-\")\n d = datetime.date(int(year), int(month), int(day))\n temp_df[\"Date\"] = d\n\n dfs.append(temp_df)\n \nall_days_df = pd.concat(dfs, axis=0, ignore_index=True, sort=False)\nall_days_df = all_days_df.drop([\"Latitude\", \"Longitude\", \"Lat\", \"Long_\", \"FIPS\", \"Combined_Key\", \"Last_Update\"], axis=1)\n\nall_days_df.head(10)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Province_State</th>\n <th>Country_Region</th>\n <th>Confirmed</th>\n <th>Deaths</th>\n <th>Recovered</th>\n <th>Date</th>\n <th>Admin2</th>\n <th>Active</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Anhui</td>\n <td>Mainland China</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2020-01-22</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Beijing</td>\n <td>Mainland China</td>\n <td>14.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2020-01-22</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Chongqing</td>\n <td>Mainland China</td>\n <td>6.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2020-01-22</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Fujian</td>\n <td>Mainland China</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2020-01-22</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Gansu</td>\n <td>Mainland China</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2020-01-22</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>5</th>\n <td>Guangdong</td>\n <td>Mainland China</td>\n <td>26.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2020-01-22</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>6</th>\n <td>Guangxi</td>\n <td>Mainland China</td>\n <td>2.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2020-01-22</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>7</th>\n <td>Guizhou</td>\n <td>Mainland China</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2020-01-22</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>8</th>\n <td>Hainan</td>\n <td>Mainland China</td>\n <td>4.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2020-01-22</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>9</th>\n <td>Hebei</td>\n <td>Mainland China</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2020-01-22</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n </tbody>\n</table>\n</div>"]}}],"execution_count":2},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":3}],"metadata":{"name":"Plotting My Area Lab","notebookId":17365945},"nbformat":4,"nbformat_minor":0}
{"cells":[{"cell_type":"markdown","source":["## Plotting My Area\n\nUse the starter code below to explore the data for your area"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"6ac80b32-96fe-419b-8de0-78435d8152e4"}}},{"cell_type":"code","source":["src_path_base = \"dbfs:/databricks-datasets/COVID/CSSEGISandData/csse_covid_19_data/csse_covid_19_daily_reports/\"\ndest_path_base = \"file:////tmp/covid_daily_reports/\"\n\nfiles = [\n '11-21-2020.csv',\n '11-22-2020.csv',\n '11-23-2020.csv',\n '11-24-2020.csv',\n '11-25-2020.csv',\n '11-26-2020.csv',\n '11-27-2020.csv',\n '11-28-2020.csv',\n '11-29-2020.csv',\n '11-30-2020.csv'\n]\n\nall_files = []\n\nfor file in files:\n filename = dest_path_base+file\n dbutils.fs.cp(src_path_base+file, filename)\n all_files.append(filename)\n\nall_files"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"7698baa0-81eb-476b-811c-8367680430eb"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"<div class=\"ansiout\">Out[3]: [&#39;file:////tmp/covid_daily_reports/11-21-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-22-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-23-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-24-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-25-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-26-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-27-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-28-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-29-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-30-2020.csv&#39;]</div>","removedWidgets":[],"addedWidgets":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>\n<div class=\"ansiout\">Out[3]: [&#39;file:////tmp/covid_daily_reports/11-21-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-22-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-23-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-24-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-25-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-26-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-27-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-28-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-29-2020.csv&#39;,\n &#39;file:////tmp/covid_daily_reports/11-30-2020.csv&#39;]</div>"]}}],"execution_count":0},{"cell_type":"code","source":["import datetime\nimport pandas as pd\n\ndfs = []\n\nfor filename in all_files:\n temp_df = pd.read_csv(filename)\n temp_df.columns = [c.replace(\"/\", \"_\") for c in temp_df.columns]\n temp_df.columns = [c.replace(\" \", \"_\") for c in temp_df.columns]\n \n month, day, year = filename.split(\"/\")[-1].replace(\".csv\", \"\").split(\"-\")\n d = datetime.date(int(year), int(month), int(day))\n temp_df[\"Date\"] = d\n\n dfs.append(temp_df)\n \nall_days_df = pd.concat(dfs, axis=0, ignore_index=True, sort=False)\nall_days_df = all_days_df.drop([\"Lat\", \"Long_\", \"FIPS\", \"Combined_Key\", \"Last_Update\"], 
axis=1)\n\nall_days_df.head(10)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"0c71a076-4e9e-434c-97b4-75bc5fceff57"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Admin2</th>\n <th>Province_State</th>\n <th>Country_Region</th>\n <th>Confirmed</th>\n <th>Deaths</th>\n <th>Recovered</th>\n <th>Active</th>\n <th>Incident_Rate</th>\n <th>Case_Fatality_Ratio</th>\n <th>Date</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>Afghanistan</td>\n <td>44503</td>\n <td>1675</td>\n <td>35422</td>\n <td>7406.0</td>\n <td>114.320310</td>\n <td>3.763791</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>1</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>Albania</td>\n <td>32196</td>\n <td>685</td>\n <td>15469</td>\n <td>16042.0</td>\n <td>1118.771284</td>\n <td>2.127593</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>2</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>Algeria</td>\n <td>73774</td>\n <td>2255</td>\n <td>48183</td>\n <td>23336.0</td>\n <td>168.237732</td>\n <td>3.056632</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>3</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>Andorra</td>\n <td>6207</td>\n <td>76</td>\n <td>5290</td>\n <td>841.0</td>\n <td>8033.391574</td>\n <td>1.224424</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>4</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>Angola</td>\n <td>14413</td>\n <td>336</td>\n <td>7273</td>\n <td>6804.0</td>\n <td>43.853473</td>\n <td>2.331229</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>5</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>Antigua and Barbuda</td>\n <td>139</td>\n <td>4</td>\n <td>128</td>\n <td>7.0</td>\n <td>141.941018</td>\n <td>2.877698</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>6</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>Argentina</td>\n <td>1366182</td>\n <td>36902</td>\n <td>1187053</td>\n <td>142227.0</td>\n <td>3022.808967</td>\n <td>2.701104</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>7</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>Armenia</td>\n <td>124839</td>\n <td>1931</td>\n <td>92829</td>\n <td>30079.0</td>\n <td>4212.930872</td>\n <td>1.546792</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>8</th>\n <td>NaN</td>\n <td>Australian Capital Territory</td>\n <td>Australia</td>\n <td>115</td>\n <td>3</td>\n <td>111</td>\n <td>1.0</td>\n <td>26.862883</td>\n <td>2.608696</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>9</th>\n <td>NaN</td>\n <td>New South Wales</td>\n <td>Australia</td>\n <td>4538</td>\n <td>53</td>\n <td>3173</td>\n <td>1312.0</td>\n <td>55.900468</td>\n <td>1.167915</td>\n <td>2020-11-21</td>\n </tr>\n </tbody>\n</table>\n</div>","textData":"<div class=\"ansiout\">Out[5]: </div>","removedWidgets":[],"addedWidgets":{},"type":"htmlSandbox","arguments":{}}},"output_type":"display_data","data":{"text/html":["<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Admin2</th>\n <th>Province_State</th>\n 
<th>Country_Region</th>\n <th>Confirmed</th>\n <th>Deaths</th>\n <th>Recovered</th>\n <th>Active</th>\n <th>Incident_Rate</th>\n <th>Case_Fatality_Ratio</th>\n <th>Date</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>Afghanistan</td>\n <td>44503</td>\n <td>1675</td>\n <td>35422</td>\n <td>7406.0</td>\n <td>114.320310</td>\n <td>3.763791</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>1</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>Albania</td>\n <td>32196</td>\n <td>685</td>\n <td>15469</td>\n <td>16042.0</td>\n <td>1118.771284</td>\n <td>2.127593</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>2</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>Algeria</td>\n <td>73774</td>\n <td>2255</td>\n <td>48183</td>\n <td>23336.0</td>\n <td>168.237732</td>\n <td>3.056632</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>3</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>Andorra</td>\n <td>6207</td>\n <td>76</td>\n <td>5290</td>\n <td>841.0</td>\n <td>8033.391574</td>\n <td>1.224424</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>4</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>Angola</td>\n <td>14413</td>\n <td>336</td>\n <td>7273</td>\n <td>6804.0</td>\n <td>43.853473</td>\n <td>2.331229</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>5</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>Antigua and Barbuda</td>\n <td>139</td>\n <td>4</td>\n <td>128</td>\n <td>7.0</td>\n <td>141.941018</td>\n <td>2.877698</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>6</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>Argentina</td>\n <td>1366182</td>\n <td>36902</td>\n <td>1187053</td>\n <td>142227.0</td>\n <td>3022.808967</td>\n <td>2.701104</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>7</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>Armenia</td>\n <td>124839</td>\n <td>1931</td>\n <td>92829</td>\n <td>30079.0</td>\n <td>4212.930872</td>\n <td>1.546792</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>8</th>\n <td>NaN</td>\n <td>Australian Capital Territory</td>\n <td>Australia</td>\n <td>115</td>\n <td>3</td>\n <td>111</td>\n <td>1.0</td>\n <td>26.862883</td>\n <td>2.608696</td>\n <td>2020-11-21</td>\n </tr>\n <tr>\n <th>9</th>\n <td>NaN</td>\n <td>New South Wales</td>\n <td>Australia</td>\n <td>4538</td>\n <td>53</td>\n <td>3173</td>\n <td>1312.0</td>\n <td>55.900468</td>\n <td>1.167915</td>\n <td>2020-11-21</td>\n </tr>\n </tbody>\n</table>\n</div>"]}}],"execution_count":0},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b238a91a-46a8-4189-a0db-229e482b48f7"}},"outputs":[],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"Plotting My Area Lab","dashboards":[],"language":"python","widgets":{},"notebookOrigID":4179985937936567}},"nbformat":4,"nbformat_minor":0}
@@ -0,0 +1,20 @@
## DAISEU20: Unpacking the Transaction Log V2

2020-11-18 | [Watch the video]() | This folder contains the notebooks used in this session.

The transaction log is key to understanding Delta Lake because it is the common thread that runs through many of its most important features, including ACID transactions, scalable metadata handling, time travel, and more. In this session, we'll explore what the Delta Lake transaction log is, how it works at the file level, and how it offers an elegant solution to the problem of multiple concurrent reads and writes.

In this tech talk you will learn about:

* What is the Delta Lake transaction log?
* What is the transaction log used for?
* How does the transaction log work?
* Reviewing the Delta Lake transaction log at the file level (see the sketch after this list)
* Dealing with multiple concurrent reads and writes
* How the Delta Lake transaction log enables other use cases, including time travel, data lineage, and debugging
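
As a small illustration of that file-level view, here is a sketch of how each write to a Delta table appends a numbered commit file under `_delta_log`. The table path and version number are assumptions for the example, not taken from the session notebooks; it is meant to run in a Databricks notebook where `spark`, `dbutils`, and `display` are available.

```python
# Write a Delta table twice; each write adds one commit (a JSON file) to _delta_log.
table_path = "/tmp/delta/txn_log_demo"  # hypothetical path
spark.range(0, 5).write.format("delta").mode("overwrite").save(table_path)
spark.range(5, 10).write.format("delta").mode("append").save(table_path)

# The transaction log is just ordered JSON files: 00000000000000000000.json, 00000000000000000001.json, ...
display(dbutils.fs.ls(table_path + "/_delta_log"))

# Each commit file records actions (commitInfo, metaData, add/remove file) as JSON lines.
spark.read.json(table_path + "/_delta_log/00000000000000000000.json").show(truncate=False)

# Readers reconstruct the table state by replaying the log, which is also what powers time travel.
spark.read.format("delta").option("versionAsOf", 0).load(table_path).show()
```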

### Speakers ###

Burak Yavuz is a software engineer and Apache Spark committer at Databricks. He has been developing Structured Streaming and Delta Lake to simplify the lives of data engineers. Burak received his MS in Management Science & Engineering from Stanford and his BS in Mechanical Engineering from Bogazici University, Istanbul.

Denny Lee is a developer advocate at Databricks, where he works on Delta Lake, Apache Spark, data science, and healthcare and life sciences. He previously built enterprise DW/BI and big data systems at Microsoft, including Azure Cosmos DB, Project Isotope (HDInsight), and SQL Server, and served as Senior Director of Data Sciences Engineering at SAP Concur. Denny holds a Master's in Biomedical Informatics from Oregon Health Sciences University.
@@ -0,0 +1,17 @@
## Tech Talk: Faster Spark SQL: Adaptive Query Execution in Databricks

2020-12-03 | [Watch the video](https://youtu.be/bQ33bwUE-ms) | This folder contains the notebooks used in this tutorial.

Over the years, there has been extensive and continuous effort to improve Spark SQL's query optimizer and planner in order to generate high-quality query execution plans. One of the biggest improvements is the cost-based optimization framework, which collects and leverages a variety of data statistics (row count, number of distinct values, NULL values, max/min values, etc.) to help Spark make better decisions in picking an optimal query plan.

Examples of these cost-based optimizations include choosing the right join type (broadcast hash join vs. sort-merge join), selecting the correct build side in a hash join, and adjusting the join order in a multi-way join. However, data statistics can be out of date and cardinality estimates can be inaccurate, which may lead to a suboptimal query plan. Adaptive Query Execution, new in Spark 3.0, tackles such issues by re-optimizing and adjusting query plans based on runtime statistics collected during query execution.

This talk introduces the adaptive query execution framework along with a few of the optimizations it employs to address major performance challenges the industry faces when using Spark SQL. We will illustrate through query examples how these statistics-guided optimizations accelerate execution. Finally, we will share the significant performance improvement we have seen on the TPC-DS benchmark with Adaptive Query Execution.
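
As a rough sketch of what enabling AQE looks like in practice (the table sizes and column names below are made up for illustration), the relevant Spark 3.0 configuration flags and a join that can be re-planned at runtime might look like this:

```python
# Enable Adaptive Query Execution and two of its optimizations.
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")  # merge small shuffle partitions
spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")            # split skewed partitions

# The small side of this join is only known to be small after the filter runs,
# so static planning may pick a sort-merge join; AQE can switch to a broadcast
# hash join once the runtime size is observed.
large_df = spark.range(0, 10_000_000).withColumnRenamed("id", "key")
small_df = spark.range(0, 10_000_000).withColumnRenamed("id", "key").filter("key < 100")

joined = large_df.join(small_df, "key")
joined.explain()   # the plan is wrapped in AdaptiveSparkPlan
print(joined.count())
```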

Check out this technical blog Allison and Maryann wrote: https://databricks.com/blog/2020/10/21/faster-sql-adaptive-query-execution-in-databricks.html

### Speakers ###

Maryann Xue
Maryann is a staff software engineer at Databricks and a committer and PMC member of Apache Calcite and Apache Phoenix. Previously, she worked on a number of big data and compiler projects at Intel.

Allison Wang
Allison is a software engineer at Databricks, primarily focusing on Spark SQL. Previously she was on the data team at Robinhood. She holds a Bachelor’s degree in Computer Science from Carnegie Mellon University.
19 changes: 9 additions & 10 deletions 2020-12-10 | Fatal Force: Exploring Police Shootings with SQL Analytics/01_Ingest_Data.html
100755 → 100644


@@ -4,7 +4,9 @@

Download the `Police.dbc` file, and import it into your Databricks workspace. There you will find all the notebooks necessary to reproduce our analysis. Please follow the steps outlined in the notebooks for how to import the necessary data/tables into Databricks.

Additionally, we've included a `state_mappings.csv` file that maps state abbreviations to state names to make the analysis a bit more readible.
Alternatively, if you do not have access to a Databricks workspace, you can download the individual `.html` files and open them in your browser to view the notebooks.

Additionally, we've included a `state_mappings.csv` file that maps state abbreviations to state names to make the analysis more readable.
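
As a rough sketch of how that mapping file might be used (the paths, table name, and column names here are assumptions; the real schema is defined in the notebooks inside `Police.dbc`):

```python
# Load the state abbreviation -> state name mapping (assumed two-column layout).
state_names = (spark.read
    .option("header", "true")
    .csv("/FileStore/tables/state_mappings.csv")   # hypothetical upload location
    .toDF("state_abbrev", "state_name"))

# Hypothetical table created from the shootings data in the ingest notebook.
shootings = spark.table("police_shootings")

# Replace the two-letter state code with the full state name for readability.
readable = (shootings
    .join(state_names, shootings.state == state_names.state_abbrev, "left")
    .drop("state_abbrev"))

display(readable.limit(10))
```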


