forked from LearningJournal/Spark-Programming-In-Python
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
c42be84
commit 33d16ad
Showing
6 changed files
with
226 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"cells":[{"cell_type":"code","source":["diamonds_df = spark.read.format(\"csv\") \\\n .option(\"header\", \"true\") \\\n .option(\"inferSchema\", \"true\") \\\n .load(\"/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv\")\n\ndiamonds_df.show(10)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b1b0d245-342b-472b-9c14-9e7883cf73f4"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["from pyspark.sql.functions import avg\n\nresults_df = diamonds_df.select(\"color\", \"price\") \\\n .groupBy(\"color\") \\\n .agg(avg(\"price\")) \\\n .sort(\"color\")\n\nresults_df.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"015c9e24-1834-4494-a61e-f842bf41371d"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n 
}\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["display(results_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"c6d78c3a-7eb6-4376-af28-4bde47c83b4c"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"32c660e0-a73f-43d4-9695-affcc19a0a2d"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"01-getting-started","dashboards":[],"notebookMetadata":{"pythonIndentUnit":4},"language":"python","widgets":{},"notebookOrigID":2879982568079096}},"nbformat":4,"nbformat_minor":0} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"cells":[{"cell_type":"code","source":["raw_fire_df = spark.read \\\n .format(\"csv\") \\\n .option(\"header\", \"true\") \\\n .option(\"inferSchema\", \"true\") \\\n .load(\"/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"1493b35d-a05e-4322-949c-2c6a7db9e146"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["raw_fire_df.show(10)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"33c9f1f1-299b-45d9-b7cf-7940ac9e1d80"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n 
}\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["display(raw_fire_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b333860d-334b-42a4-b073-a98bc58b1c43"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["raw_fire_df.createGlobalTempView(\"fire_service_calls_view\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"56d2f0d6-90a7-4399-8f09-9dead0bbf526"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["%sql\nselect * from 
global_temp.fire_service_calls_view"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"75c57597-0f81-40a7-885f-64829f3db180"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"19f40c42-622a-43a2-bd56-d182e528fe6b"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"02-spark-dataframe-demo","dashboards":[],"notebookMetadata":{"pythonIndentUnit":4},"language":"python","widgets":{},"notebookOrigID":2787702214819532}},"nbformat":4,"nbformat_minor":0} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
-- Databricks notebook source | ||
-- Reset the demo environment so the notebook can be re-run from scratch.
-- demo_db is a database (it is created with `create database` below), so it has
-- to be removed with `drop database`. The previous `drop view if exists demo_db`
-- only looked for a view of that name and left the database in place.
-- `cascade` also drops any tables still registered in the database.
drop table if exists demo_db.fire_service_calls_tbl;
drop database if exists demo_db cascade;
|
||
-- COMMAND ---------- | ||
|
||
-- MAGIC %fs rm -r /user/hive/warehouse/demo_db.db | ||
|
||
-- COMMAND ---------- | ||
|
||
-- Create the demo database on first run; a no-op when it already exists.
create database if not exists demo_db;
|
||
-- COMMAND ---------- | ||
|
||
-- Target table for the fire-service call records, stored as Parquet.
-- Created only when it does not already exist.
-- Date/time columns (CallDate, WatchDate, AvailableDtTm) are declared as strings.
-- Column order matters: the inserts in this notebook fill the table by position.
create table if not exists demo_db.fire_service_calls_tbl (
    CallNumber                  integer,
    UnitID                      string,
    IncidentNumber              integer,
    CallType                    string,
    CallDate                    string,
    WatchDate                   string,
    CallFinalDisposition        string,
    AvailableDtTm               string,
    Address                     string,
    City                        string,
    Zipcode                     integer,
    Battalion                   string,
    StationArea                 string,
    Box                         string,
    OriginalPriority            string,
    Priority                    string,
    FinalPriority               integer,
    ALSUnit                     boolean,
    CallTypeGroup               string,
    NumAlarms                   integer,
    UnitType                    string,
    UnitSequenceInCallDispatch  integer,
    FirePreventionDistrict      string,
    SupervisorDistrict          string,
    Neighborhood                string,
    Location                    string,
    RowID                       string,
    Delay                       float
)
using parquet;
|
||
-- COMMAND ---------- | ||
|
||
-- Insert one placeholder row: CallNumber = 1234, the other 27 columns null.
-- Values are positional, in the column order declared by the table.
insert into demo_db.fire_service_calls_tbl
values (
    1234,
    null, null, null, null, null, null, null, null, null,
    null, null, null, null, null, null, null, null, null,
    null, null, null, null, null, null, null, null, null
);
|
||
-- COMMAND ---------- | ||
|
||
-- Show the current contents of the table.
select *
from demo_db.fire_service_calls_tbl;
|
||
-- COMMAND ---------- | ||
|
||
-- Remove every row from the table; the table definition itself is kept.
truncate table demo_db.fire_service_calls_tbl;
|
||
-- COMMAND ---------- | ||
|
||
-- Bulk-load the table from the global temp view.
-- Columns are matched by position, so the view's column order has to line up
-- with the table definition.
-- NOTE(review): the view is not created in this notebook; presumably it is the
-- one registered by 02-spark-dataframe-demo. Confirm it exists in the session.
insert into demo_db.fire_service_calls_tbl
select *
from global_temp.fire_service_calls_view;
|
||
-- COMMAND ---------- | ||
|
||
-- Show the table contents.
select *
from demo_db.fire_service_calls_tbl;
|
||
-- COMMAND ---------- | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
-- Databricks notebook source | ||
-- Preview up to 100 rows of the table (no ordering is applied).
select *
from demo_db.fire_service_calls_tbl
limit 100;
|
||
-- COMMAND ---------- | ||
|
||
-- Drop the cache view if an earlier run left it behind, so it can be recreated.
drop view if exists fire_service_calls_tbl_cache;
|
||
-- COMMAND ---------- | ||
|
||
-- Register fire_service_calls_tbl_cache over the full table and mark it for
-- caching. `lazy` defers the caching work until the cached data is first used.
cache lazy table fire_service_calls_tbl_cache as
select *
from demo_db.fire_service_calls_tbl;
|
||
-- COMMAND ---------- | ||
|
||
-- Total number of rows in the table.
-- NOTE(review): this counts the base table, not fire_service_calls_tbl_cache;
-- confirm whether the intent was to query the cache view so the lazy cache
-- gets materialized.
select count(*)
from demo_db.fire_service_calls_tbl;
|
||
-- COMMAND ---------- | ||
|
||
-- MAGIC %md | ||
-- MAGIC ##### Q1. How many distinct types of calls were made to the Fire Department? | ||
|
||
-- COMMAND ---------- | ||
|
||
-- Q1: number of different call types, ignoring rows with no call type.
select
    count(distinct callType) as distinct_call_type_count
from demo_db.fire_service_calls_tbl
where callType is not null;
|
||
-- COMMAND ---------- | ||
|
||
-- MAGIC %md | ||
-- MAGIC ##### Q2. What were distinct types of calls made to the Fire Department? | ||
|
||
-- COMMAND ---------- | ||
|
||
-- Q2: one row per call type, ignoring rows with no call type.
select distinct
    callType as distinct_call_types
from demo_db.fire_service_calls_tbl
where callType is not null;
|
||
-- COMMAND ---------- | ||
|
||
-- MAGIC %md | ||
-- MAGIC ##### Q3. Find all responses with delay times greater than 5 minutes. | ||
|
||
-- COMMAND ---------- | ||
|
||
-- Q3: calls whose Delay value exceeds 5 (minutes, per the question heading).
-- Rows with a null Delay do not satisfy the comparison and are left out.
select
    callNumber,
    Delay
from demo_db.fire_service_calls_tbl
where Delay > 5;
|
||
-- COMMAND ---------- | ||
|
||
-- MAGIC %md | ||
-- MAGIC ##### Q4. What were the most common call types? | ||
|
||
-- COMMAND ---------- | ||
|
||
-- Q4: number of calls per call type, most frequent first.
-- Rows with no call type are excluded before grouping.
select
    callType,
    count(*) as count
from demo_db.fire_service_calls_tbl
where callType is not null
group by callType
order by count desc;
|
||
-- COMMAND ---------- | ||
|
||
-- MAGIC %md | ||
-- MAGIC ##### Q5. What zip codes accounted for most common calls? | ||
|
||
-- COMMAND ---------- | ||
|
||
-- Q5: number of calls per (call type, zip code) pair, largest groups first.
-- Rows with no call type are excluded before grouping.
select
    callType,
    zipCode,
    count(*) as count
from demo_db.fire_service_calls_tbl
where callType is not null
group by
    callType,
    zipCode
order by count desc;
|
||
-- COMMAND ---------- | ||
|
||
-- MAGIC %md | ||
-- MAGIC ##### Q6. What San Francisco neighborhoods are in the zip codes 94102 and 94103? | ||
|
||
-- COMMAND ---------- | ||
|
||
-- Q6: list each (zip code, neighborhood) pair once for zip codes 94102 and 94103.
-- `distinct` collapses the per-call rows; without it the result repeats a pair
-- for every call recorded there, which does not answer "which neighborhoods".
select distinct
    zipCode,
    neighborhood
from demo_db.fire_service_calls_tbl
where zipCode in (94102, 94103);
|
||
-- COMMAND ---------- | ||
|
||
-- MAGIC %md | ||
-- MAGIC ##### Q7. What was the sum of all call alarms, average, min, and max of the call response times? | ||
|
||
-- COMMAND ---------- | ||
|
||
-- Q7: total number of alarms, plus average / minimum / maximum Delay,
-- computed over the whole table. Aggregates skip null inputs.
select
    sum(NumAlarms),
    avg(Delay),
    min(Delay),
    max(Delay)
from demo_db.fire_service_calls_tbl;
|
||
-- COMMAND ---------- | ||
|
||
-- MAGIC %md | ||
-- MAGIC ##### Q8. How many distinct years of data are in the data set? | ||
|
||
-- COMMAND ---------- | ||
|
||
-- Q8: the distinct calendar years present in the data, in ascending order.
-- callDate is stored as a string, so it is parsed with the MM/dd/yyyy pattern
-- before the year is extracted.
select distinct
    year(to_date(callDate, "MM/dd/yyyy")) as year_num
from demo_db.fire_service_calls_tbl
order by year_num;
|
||
-- COMMAND ---------- | ||
|
||
-- MAGIC %md | ||
-- MAGIC ##### Q9. What week of the year in 2018 had the most fire calls? | ||
|
||
-- COMMAND ---------- | ||
|
||
-- Q9: number of calls per week of the year for 2018, busiest week first.
-- callDate is a string, so it is parsed with the MM/dd/yyyy pattern both for
-- the year filter and for the week number.
select
    weekofyear(to_date(callDate, "MM/dd/yyyy")) as week_year,
    count(*) as count
from demo_db.fire_service_calls_tbl
where year(to_date(callDate, "MM/dd/yyyy")) == 2018
group by week_year
order by count desc;
|
||
-- COMMAND ---------- | ||
|
||
-- MAGIC %md | ||
-- MAGIC ##### Q10. What neighborhoods in San Francisco had the worst response time in 2018? | ||
|
||
-- COMMAND ---------- | ||
|
||
-- Q10: neighborhood and delay for every 2018 call, longest delay first.
-- Returns one row per call; the neighborhoods with the worst response times
-- are the ones at the top of the result.
select
    neighborhood,
    delay
from demo_db.fire_service_calls_tbl
where year(to_date(callDate, "MM/dd/yyyy")) == 2018
order by delay desc;
|
||
-- COMMAND ---------- | ||
|
||
|
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from pyspark.sql import * | ||
|
||
if __name__ == "__main__":
    # Entry point: start a local Spark session, build a small two-column
    # DataFrame from in-memory tuples, and print it.
    spark = SparkSession.builder \
        .appName("Hello Spark") \
        .master("local[2]") \
        .getOrCreate()

    try:
        data_list = [("Ravi", 28),
                     ("David", 45),
                     ("Abdul", 27)]

        # toDF assigns the column names "Name" and "Age" to the two tuple fields.
        df = spark.createDataFrame(data_list).toDF("Name", "Age")
        df.show()
    finally:
        # Release the session even when building or showing the DataFrame
        # fails; the original script never stopped it.
        spark.stop()