
Commit 259430c

Authored May 13, 2021
Merge pull request #1 from gro-intelligence/gro-model-example
Gro model example
2 parents 5ce3a93 + 7bdae22 · commit 259430c

File tree

6 files changed: +147 -0 lines changed


MLproject

+11

name: yiqing-test

docker_env:
  # this is a Docker image on Docker Hub that needs to be accessed using credentials
  image: grointelligence/worker

entry_points:
  main:
    parameters:
      model_input: {type: float, default: 0.1}
    command: "python mlflowtest/train.py {model_input}"
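
Not part of the committed files, but as a usage note: with the entry point above, a single run of this project can also be launched from Python instead of the `mlflow run` CLI. A minimal sketch, assuming `mlflow` is installed and Docker is logged in so the `grointelligence/worker` image can be pulled:

```python
# Minimal sketch (illustrative, not in this repo): one local run of the "main" entry point.
import mlflow.projects

submitted = mlflow.projects.run(
    uri=".",                          # root of gro-mlproject, where MLproject lives
    entry_point="main",
    parameters={"model_input": 0.1},  # substituted into "python mlflowtest/train.py {model_input}"
    synchronous=True,                 # block until the run finishes
)
print(submitted.run_id, submitted.get_status())
```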

README.md

+17

# gro-mlproject
The goal of this project is to demo what we want to achieve with this POC.

## Problem description
We want to run thousands of experiments to tweak a parameter, alpha. To run one experiment locally, you can run from the command line
```
python mlflowtest/train.py 0.5
```
which is not scalable if we want to run the training script with 1000 different values for alpha.

Alternatively, we can send the training script to Databricks with something like the following:
```
mlflow run ../gro-mlproject -b databricks --backend-config cluster-spec.json --experiment-id 4069866474349730 -P alpha=0.5
```
In order for Databricks to run the script, one needs to specify the environment using one of the three options [here](https://www.mlflow.org/docs/latest/projects.html#project-environments). Specifically:
1. conda: `mlflow run` works when we specify `conda_env` in `MLproject`. However, it is not a feasible option because our model requires more complicated dependencies than a `conda.yaml` file can hold. For example, we might require certain system packages to be installed.
2. docker container: `mlflow run` does not work with the current setup; it fails with ```ERROR mlflow.cli: === Running docker-based projects on Databricks is not yet supported. ===```
3. system: not an option, since we want to run the experiments on Databricks.
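
For illustration only (none of this is in the committed files): the `mlflow run ... -b databricks` command above has a Python-API equivalent, `mlflow.projects.run`, which is roughly what a sweep over many alpha values would look like if it could be scripted. A hedged sketch follows; note that it would currently hit the docker-backend limitation described in option 2, and that the `MLproject` in this commit declares the parameter as `model_input` while the CLI example passes `alpha`:

```python
# Hypothetical sketch: sweeping alpha on Databricks via MLflow's Projects API.
# Mirrors: mlflow run ../gro-mlproject -b databricks --backend-config cluster-spec.json ...
import numpy as np
import mlflow.projects

for alpha in np.linspace(0.01, 1.0, 1000):
    mlflow.projects.run(
        uri="../gro-mlproject",
        entry_point="main",
        parameters={"alpha": float(alpha)},  # or "model_input", matching MLproject
        backend="databricks",
        backend_config="cluster-spec.json",
        experiment_id="4069866474349730",
        synchronous=False,                   # queue this run and move on to the next alpha
    )
```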

cluster-spec.json

+6

{
  "spark_version": "7.3.x-scala2.12",
  "num_workers": 1,
  "node_type_id": "i3.xlarge"
}

conda.yaml

+10

name: yiqing-test
channels:
  - defaults
dependencies:
  - numpy>=1.14.3
  - pandas>=1.0.0
  - scikit-learn=0.19.1
  - pip
  - pip:
      - mlflow

mlflowtest/train.py

+67

# The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality
# P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
# Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.

import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet

import mlflow
import mlflow.sklearn


def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2


if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Read the wine-quality csv file (make sure you're running this from the root of MLflow!)
    wine_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "wine-quality.csv")
    data = pd.read_csv(wine_path)

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train.drop(["quality"], axis=1)
    test_x = test.drop(["quality"], axis=1)
    train_y = train[["quality"]]
    test_y = test[["quality"]]

    alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.5
    l1_ratio = float(sys.argv[2]) if len(sys.argv) > 2 else 0.5

    with mlflow.start_run():
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        predicted_qualities = lr.predict(test_x)

        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        mlflow.sklearn.log_model(lr, "model")
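
One follow-up worth noting (not part of this commit): because the script logs `alpha`, `l1_ratio`, and the three metrics to MLflow, the results of a sweep can be pulled back as a pandas DataFrame with `mlflow.search_runs`. A minimal sketch, assuming the runs were logged to the default local `./mlruns` store under the default experiment:

```python
# Hypothetical sketch: rank completed runs by RMSE after a sweep.
import mlflow

runs = mlflow.search_runs(experiment_ids=["0"])  # "0" is the default experiment id
cols = ["params.alpha", "params.l1_ratio", "metrics.rmse", "metrics.mae", "metrics.r2"]
print(runs[cols].sort_values("metrics.rmse").head(10))
```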

mlflowtest/wine-quality.csv

+36

"fixed acidity","volatile acidity","citric acid","residual sugar","chlorides","free sulfur dioxide","total sulfur dioxide","density","pH","sulphates","alcohol","quality"
7,0.27,0.36,20.7,0.045,45,170,1.001,3,0.45,8.8,6
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
6.2,0.32,0.16,7,0.045,30,136,0.9949,3.18,0.47,9.6,6
7,0.27,0.36,20.7,0.045,45,170,1.001,3,0.45,8.8,6
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
8.1,0.22,0.43,1.5,0.044,28,129,0.9938,3.22,0.45,11,6
8.1,0.27,0.41,1.45,0.033,11,63,0.9908,2.99,0.56,12,5
8.6,0.23,0.4,4.2,0.035,17,109,0.9947,3.14,0.53,9.7,5
7.9,0.18,0.37,1.2,0.04,16,75,0.992,3.18,0.63,10.8,5
6.6,0.16,0.4,1.5,0.044,48,143,0.9912,3.54,0.52,12.4,7
8.3,0.42,0.62,19.25,0.04,41,172,1.0002,2.98,0.67,9.7,5
6.6,0.17,0.38,1.5,0.032,28,112,0.9914,3.25,0.55,11.4,7
6.3,0.48,0.04,1.1,0.046,30,99,0.9928,3.24,0.36,9.6,6
6.2,0.66,0.48,1.2,0.029,29,75,0.9892,3.33,0.39,12.8,8
7.4,0.34,0.42,1.1,0.033,17,171,0.9917,3.12,0.53,11.3,6
6.5,0.31,0.14,7.5,0.044,34,133,0.9955,3.22,0.5,9.5,5
6.2,0.66,0.48,1.2,0.029,29,75,0.9892,3.33,0.39,12.8,8
6.4,0.31,0.38,2.9,0.038,19,102,0.9912,3.17,0.35,11,7
6.8,0.26,0.42,1.7,0.049,41,122,0.993,3.47,0.48,10.5,8
7.6,0.67,0.14,1.5,0.074,25,168,0.9937,3.05,0.51,9.3,5
6.6,0.27,0.41,1.3,0.052,16,142,0.9951,3.42,0.47,10,6
7,0.25,0.32,9,0.046,56,245,0.9955,3.25,0.5,10.4,6
6.9,0.24,0.35,1,0.052,35,146,0.993,3.45,0.44,10,6
7,0.28,0.39,8.7,0.051,32,141,0.9961,3.38,0.53,10.5,6
7.4,0.27,0.48,1.1,0.047,17,132,0.9914,3.19,0.49,11.6,6
7.2,0.32,0.36,2,0.033,37,114,0.9906,3.1,0.71,12.3,7
8.5,0.24,0.39,10.4,0.044,20,142,0.9974,3.2,0.53,10,6
8.3,0.14,0.34,1.1,0.042,7,47,0.9934,3.47,0.4,10.2,6
7.4,0.25,0.36,2.05,0.05,31,100,0.992,3.19,0.44,10.8,6
6.2,0.12,0.34,1.5,0.045,43,117,0.9939,3.42,0.51,9,6
5.8,0.27,0.2,14.95,0.044,22,179,0.9962,3.37,0.37,10.2,5
