forked from databricks/devrel
-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
828d0bb
commit 18df060
Showing
10 changed files
with
6,034 additions
and
0 deletions.
There are no files selected for viewing
43 changes: 43 additions & 0 deletions
43
Live Webinars/MLOps Virtual Event - Standardizing MLOps at Scale/Python/01. EDA.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# Databricks notebook source
# MAGIC %md
# MAGIC ## Import Data
# MAGIC [More info around pyspark.pandas](https://databricks.com/blog/2021/10/04/pandas-api-on-upcoming-apache-spark-3-2.html)

# COMMAND ----------

import pyspark.pandas as pd
#import databricks.koalas as pd # for spark less than 3.2

# Source CSV on DBFS; quotechar/escapechar handle quotes embedded in the
# free-text listing columns.
data_file = "/mnt/training/airbnb-sf-listings.csv"
airbnb_sf_listings = pd.read_csv(data_file, quotechar='"', escapechar='"')
display(airbnb_sf_listings)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Explore the data

# COMMAND ----------

# Summary statistics (count/mean/std/min/quartiles/max) for numeric columns.
airbnb_sf_listings.describe()

# COMMAND ----------

# Histogram of nightly price. Pass the bin count by keyword: in the plain
# pandas plot API the first positional argument of plot.hist() is `by`,
# not `bins`, so a bare positional 100 would be misinterpreted if this
# cell were ever run against plain pandas.
airbnb_sf_listings['price'].plot.hist(bins=100)

# COMMAND ----------

airbnb_spark_df = airbnb_sf_listings.to_spark()  # conversion from pandas-on-Spark dataframe to Spark dataframe

dbutils.data.summarize(airbnb_spark_df)  # Data profiling with Spark dataframes

# COMMAND ----------

# NOTE(review): pandas_profiling has since been renamed to ydata-profiling;
# this import only works on clusters with the legacy package installed.
from pandas_profiling import ProfileReport

airbnb_pandas_df = airbnb_sf_listings.to_pandas()  # collect to plain pandas on the driver

df_profile = ProfileReport(airbnb_pandas_df, title="Profiling Report", progress_bar=False, infer_dtypes=False)
profile_html = df_profile.to_html()

displayHTML(profile_html)
93 changes: 93 additions & 0 deletions
93
Live Webinars/MLOps Virtual Event - Standardizing MLOps at Scale/Python/02. Data Prep.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
# Databricks notebook source
# MAGIC %md
# MAGIC ##Import Data

# COMMAND ----------

import pyspark.pandas as pd

# Load the raw listings CSV from DBFS. quotechar/escapechar deal with
# quotes embedded inside free-text columns.
data_file = "/mnt/training/airbnb-sf-listings.csv"
airbnb_sf_listings = pd.read_csv(data_file, quotechar='"', escapechar='"')
display(airbnb_sf_listings)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Featurization

# COMMAND ----------

# Combine host_id and id into a single unique key, then drop the columns
# no longer needed (host_id is folded into the key; neighbourhood_group
# is unused downstream).
airbnb_sf_listings['id'] = airbnb_sf_listings['host_id'] + '-' + airbnb_sf_listings['id']
airbnb_sf_listings = airbnb_sf_listings.drop(['host_id', 'neighbourhood_group'])

# COMMAND ----------

display(airbnb_sf_listings)
def index_categorical( cat_col ):
    """Map each distinct non-null label in *cat_col* to a stable integer index.

    Indices are assigned in first-appearance order of the labels; null
    entries stay null after the mapping (they are absent from the map).
    """
    distinct_labels = cat_col.dropna().drop_duplicates().to_list()
    label_to_index = {label: position for position, label in enumerate(distinct_labels)}
    return cat_col.map(label_to_index)
# Apply the indexer to each categorical column, adding an integer
# companion column named <col>_idx alongside the original.
cat_columns = ['neighbourhood', 'room_type']
indexed_listings = airbnb_sf_listings
for col in cat_columns:
    idx_col = col + '_idx'
    indexed_listings[idx_col] = pd.to_numeric(index_categorical(indexed_listings[col])).astype(int)

# COMMAND ----------

# Keep the key column plus the model features; rows with any missing
# value are dropped rather than imputed.
feature_cols = ['id', 'neighbourhood_idx', 'room_type_idx', 'price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365' ]
featurized = indexed_listings[feature_cols]
featurized = featurized.dropna()

# COMMAND ----------

display(featurized)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Build a Feature Table
# MAGIC [Databricks Feature Store](https://docs.databricks.com/applications/machine-learning/feature-store/index.html)

# COMMAND ----------

# MAGIC %sql
# MAGIC
# MAGIC CREATE DATABASE IF NOT EXISTS airbnb;

# COMMAND ----------

from databricks.feature_store import FeatureStoreClient

fs = FeatureStoreClient()
feature_table_name = 'airbnb.features'

# Create the table on first run. get_feature_table raises when the table
# does not exist yet; catch Exception — never a bare `except:`, which
# would also swallow KeyboardInterrupt/SystemExit and mask notebook
# cancellation.
try:
    fs.get_feature_table(feature_table_name)
except Exception:
    featurized_fs = fs.create_feature_table(
        name = feature_table_name,
        keys = 'id',
        schema = featurized.spark.schema(),
        description = 'These features are derived from the airbnb.features table in the lakehouse. I created dummy variables for the categorical columns, cleaned up their names, and dropped the neighbourhood_group_idx . No aggregations were performed.'
    )

# Merge (upsert) the latest feature values into the table, keyed on `id`.
fs.write_table(
    name = feature_table_name,
    df = featurized.to_spark(),
    mode = 'merge'
)

# COMMAND ----------
Oops, something went wrong.