Skip to content

Commit

Permalink
Commit 12/16 MLOps Commit
Browse files Browse the repository at this point in the history
  • Loading branch information
diganparikh-dp committed Dec 17, 2021
1 parent 828d0bb commit 18df060
Show file tree
Hide file tree
Showing 10 changed files with 6,034 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Databricks notebook source
# MAGIC %md
# MAGIC ## Import Data
# MAGIC [More information about pyspark.pandas](https://databricks.com/blog/2021/10/04/pandas-api-on-upcoming-apache-spark-3-2.html)

# COMMAND ----------

import pyspark.pandas as pd
#import databricks.koalas as pd # for spark less than 3.2

# Path of the Airbnb San Francisco listings CSV.
data_file = "/mnt/training/airbnb-sf-listings.csv"
# The double quote is passed as both the quoting and the escape character.
# NOTE(review): presumably the file escapes embedded quotes by doubling them — confirm against the data.
airbnb_sf_listings = pd.read_csv( data_file, quotechar='"', escapechar='"' )
# Render the loaded frame in the notebook output.
display(airbnb_sf_listings)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Explore the data

# COMMAND ----------

# Summary statistics of the listings frame; as the cell's only expression,
# the result is what the notebook shows for this cell.
airbnb_sf_listings.describe()

# COMMAND ----------

# Distribution of listing prices, bucketed into 100 bins.
airbnb_sf_listings['price'].plot.hist(bins=100)

# COMMAND ----------

# Convert the pandas-on-Spark frame into a Spark DataFrame, which is what
# the summarize call below is given.
airbnb_spark_df = airbnb_sf_listings.to_spark() # pandas-on-Spark frame -> Spark DataFrame

# Databricks notebook utility that renders a data-profiling summary of the frame.
dbutils.data.summarize(airbnb_spark_df) # data profiling on the Spark DataFrame

# COMMAND ----------

from pandas_profiling import ProfileReport

# Convert the pandas-on-Spark frame into a regular pandas DataFrame to hand to ProfileReport.
# NOTE(review): to_pandas() materializes the whole dataset in driver memory — confirm the data is small enough.
airbnb_pandas_df = airbnb_sf_listings.to_pandas() # pandas-on-Spark frame -> pandas DataFrame

# Build the profiling report with the progress bar and dtype inference switched off.
df_profile = ProfileReport(airbnb_pandas_df, title="Profiling Report", progress_bar=False, infer_dtypes=False)
# Export the report as an HTML string so it can be embedded in the notebook.
profile_html = df_profile.to_html()

# Render the HTML report in the cell output.
displayHTML(profile_html)
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Databricks notebook source
# MAGIC %md
# MAGIC ## Import Data

# COMMAND ----------

import pyspark.pandas as pd

# Path of the Airbnb San Francisco listings CSV.
data_file = "/mnt/training/airbnb-sf-listings.csv"
# The double quote is passed as both the quoting and the escape character.
# NOTE(review): presumably the file escapes embedded quotes by doubling them — confirm against the data.
airbnb_sf_listings = pd.read_csv( data_file, quotechar='"', escapechar='"' )
# Render the loaded frame in the notebook output.
display(airbnb_sf_listings)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Featurization

# COMMAND ----------

# Combine the id columns: build a composite "<host_id>-<id>" key and store it
# in the 'id' column. Both parts are cast to string first so the '-'
# concatenation also works when the CSV reader inferred numeric id columns.
airbnb_sf_listings['id'] = (
    airbnb_sf_listings['host_id'].astype(str)
    + '-'
    + airbnb_sf_listings['id'].astype(str)
)
# Drop by column name explicitly: the default axis of drop() is not the same
# across koalas / pyspark.pandas releases, so a bare label list may be
# interpreted as row labels instead of columns.
airbnb_sf_listings = airbnb_sf_listings.drop(columns=['host_id', 'neighbourhood_group'])

# COMMAND ----------

# Show the listings frame after the id columns were combined and the unused columns dropped.
display(airbnb_sf_listings)

# COMMAND ----------

# index categorical variables
def index_categorical( cat_col ):
    """Replace each category label in *cat_col* with an integer index.

    Args:
        cat_col: a Series-like column supporting ``dropna``,
            ``drop_duplicates``, ``to_list`` and ``map``.

    Returns:
        The column with every label mapped to its index. Values that are not
        in the mapping (missing values, which are excluded when the labels
        are collected) are left to ``map``'s handling of unmapped keys.

    Indices are assigned in sorted label order so that the same label gets
    the same index on every run; the order in which distinct values come back
    may not be stable on a distributed backend, and the indices end up merged
    into a persistent feature table.
    NOTE(review): sorting assumes the labels are mutually comparable
    (e.g. all strings) — confirm for any new categorical column.
    """
    # distinct, non-null category labels in a reproducible order
    cat_labels = sorted(cat_col.dropna().drop_duplicates().to_list())
    # inverse mapping: label -> position in the sorted label list
    cat_map = {label: index for index, label in enumerate(cat_labels)}
    # transform the column by applying the mapping
    return cat_col.map(cat_map)

# Categorical columns that each get an integer-encoded "<name>_idx" companion.
cat_columns = ['neighbourhood', 'room_type']
# Same frame object as airbnb_sf_listings (no copy), so the new columns are
# added to that frame as well.
indexed_listings = airbnb_sf_listings
for cat_name in cat_columns:
    # Encode the labels, then coerce the result to an integer column.
    encoded = index_categorical(indexed_listings[cat_name])
    indexed_listings[cat_name + '_idx'] = pd.to_numeric(encoded).astype(int)

# COMMAND ----------

# Columns kept for the feature table: the composite id key, the two indexed
# categoricals, and the numeric listing attributes.
feature_cols = [
    'id',
    'neighbourhood_idx',
    'room_type_idx',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]
# Project onto the feature columns, then discard rows with any missing value.
featurized = indexed_listings[feature_cols].dropna()

# COMMAND ----------

# Show the selected feature columns after rows with missing values were removed.
display(featurized)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Build a Feature Table
# MAGIC [Databricks Feature Store](https://docs.databricks.com/applications/machine-learning/feature-store/index.html)

# COMMAND ----------

# MAGIC %sql
# MAGIC
# MAGIC CREATE DATABASE IF NOT EXISTS airbnb;

# COMMAND ----------

from databricks.feature_store import FeatureStoreClient

# Client for the Databricks Feature Store and the fully qualified
# (database.table) name of the feature table to maintain.
fs = FeatureStoreClient()
feature_table_name = 'airbnb.features'

# create table if it doesn't exist
try:
    # Used purely as an existence probe; the returned metadata is not needed.
    fs.get_feature_table(feature_table_name)
except Exception:
    # Catch Exception rather than using a bare except so KeyboardInterrupt /
    # SystemExit are not swallowed while probing for the table.
    # NOTE(review): any lookup failure (not only "table missing") lands here;
    # narrow this once the exact exception type raised by the client is confirmed.
    # NOTE(review): the description below mentions dummy variables, while the
    # notebook integer-indexes the categorical columns — confirm the wording.
    fs.create_feature_table(
        name = feature_table_name,
        keys = 'id',
        schema = featurized.spark.schema(),
        description = 'These features are derived from the airbnb.features table in the lakehouse. I created dummy variables for the categorical columns, cleaned up their names, and dropped the neighbourhood_group_idx . No aggregations were performed.'
    )

# Write the current feature rows into the table in 'merge' mode.
fs.write_table(
    name = feature_table_name,
    df = featurized.to_spark(),
    mode = 'merge'
)

# COMMAND ----------


Loading

0 comments on commit 18df060

Please sign in to comment.