forked from databricks/devrel
-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
828d0bb
commit 18df060
Showing
10 changed files
with
6,034 additions
and
0 deletions.
There are no files selected for viewing
43 changes: 43 additions & 0 deletions
43
Live Webinars/MLOps Virtual Event - Standardizing MLOps at Scale/Python/01. EDA.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# Databricks notebook source
# MAGIC %md
# MAGIC ## Import Data
# MAGIC [More info around pyspark.pandas](https://databricks.com/blog/2021/10/04/pandas-api-on-upcoming-apache-spark-3-2.html)

# COMMAND ----------

import pyspark.pandas as pd
#import databricks.koalas as pd # for spark less than 3.2

# Source CSV on DBFS; quotechar/escapechar handle quotes embedded in the
# free-text listing columns.
data_file = "/mnt/training/airbnb-sf-listings.csv"
airbnb_sf_listings = pd.read_csv(data_file, quotechar='"', escapechar='"')
display(airbnb_sf_listings)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Explore the data

# COMMAND ----------

# Summary statistics (count/mean/std/min/quartiles/max) for numeric columns.
airbnb_sf_listings.describe()

# COMMAND ----------

# Histogram of nightly price. Pass the bin count by keyword: in the plain
# pandas plot API the first positional argument of plot.hist() is `by`,
# not `bins`, so a bare positional 100 would be misinterpreted if this
# cell were ever run against plain pandas.
airbnb_sf_listings['price'].plot.hist(bins=100)

# COMMAND ----------

airbnb_spark_df = airbnb_sf_listings.to_spark()  # conversion from pandas-on-Spark dataframe to Spark dataframe

dbutils.data.summarize(airbnb_spark_df)  # Data profiling with Spark dataframes

# COMMAND ----------

# NOTE(review): pandas_profiling has since been renamed to ydata-profiling;
# this import only works on clusters with the legacy package installed.
from pandas_profiling import ProfileReport

airbnb_pandas_df = airbnb_sf_listings.to_pandas()  # collect to plain pandas on the driver

df_profile = ProfileReport(airbnb_pandas_df, title="Profiling Report", progress_bar=False, infer_dtypes=False)
profile_html = df_profile.to_html()

displayHTML(profile_html)
93 changes: 93 additions & 0 deletions
93
Live Webinars/MLOps Virtual Event - Standardizing MLOps at Scale/Python/02. Data Prep.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
# Databricks notebook source
# MAGIC %md
# MAGIC ##Import Data

# COMMAND ----------

import pyspark.pandas as pd

# Load the raw listings CSV from DBFS. quotechar/escapechar deal with
# quotes embedded inside free-text columns.
data_file = "/mnt/training/airbnb-sf-listings.csv"
airbnb_sf_listings = pd.read_csv(data_file, quotechar='"', escapechar='"')
display(airbnb_sf_listings)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Featurization

# COMMAND ----------

# Combine host_id and id into a single unique key, then drop the columns
# no longer needed (host_id is folded into the key; neighbourhood_group
# is unused downstream).
airbnb_sf_listings['id'] = airbnb_sf_listings['host_id'] + '-' + airbnb_sf_listings['id']
airbnb_sf_listings = airbnb_sf_listings.drop(['host_id', 'neighbourhood_group'])

# COMMAND ----------

display(airbnb_sf_listings)
def index_categorical( cat_col ):
    """Map each distinct non-null label in *cat_col* to a stable integer index.

    Indices are assigned in first-appearance order of the labels; null
    entries stay null after the mapping (they are absent from the map).
    """
    distinct_labels = cat_col.dropna().drop_duplicates().to_list()
    label_to_index = {label: position for position, label in enumerate(distinct_labels)}
    return cat_col.map(label_to_index)
# Apply the indexer to each categorical column, adding an integer
# companion column named <col>_idx alongside the original.
cat_columns = ['neighbourhood', 'room_type']
indexed_listings = airbnb_sf_listings
for col in cat_columns:
    idx_col = col + '_idx'
    indexed_listings[idx_col] = pd.to_numeric(index_categorical(indexed_listings[col])).astype(int)

# COMMAND ----------

# Keep the key column plus the model features; rows with any missing
# value are dropped rather than imputed.
feature_cols = ['id', 'neighbourhood_idx', 'room_type_idx', 'price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365' ]
featurized = indexed_listings[feature_cols]
featurized = featurized.dropna()

# COMMAND ----------

display(featurized)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Build a Feature Table
# MAGIC [Databricks Feature Store](https://docs.databricks.com/applications/machine-learning/feature-store/index.html)

# COMMAND ----------

# MAGIC %sql
# MAGIC
# MAGIC CREATE DATABASE IF NOT EXISTS airbnb;

# COMMAND ----------

from databricks.feature_store import FeatureStoreClient

fs = FeatureStoreClient()
feature_table_name = 'airbnb.features'

# Create the table on first run. get_feature_table raises when the table
# does not exist yet; catch Exception — never a bare `except:`, which
# would also swallow KeyboardInterrupt/SystemExit and mask notebook
# cancellation.
try:
    fs.get_feature_table(feature_table_name)
except Exception:
    featurized_fs = fs.create_feature_table(
        name = feature_table_name,
        keys = 'id',
        schema = featurized.spark.schema(),
        description = 'These features are derived from the airbnb.features table in the lakehouse. I created dummy variables for the categorical columns, cleaned up their names, and dropped the neighbourhood_group_idx . No aggregations were performed.'
    )

# Merge (upsert) the latest feature values into the table, keyed on `id`.
fs.write_table(
    name = feature_table_name,
    df = featurized.to_spark(),
    mode = 'merge'
)

# COMMAND ----------
Oops, something went wrong.