
Commit 259430c

Authored May 13, 2021
Merge pull request #1 from gro-intelligence/gro-model-example
Gro model example
2 parents 5ce3a93 + 7bdae22 · commit 259430c

File tree

6 files changed: +147 -0 lines changed


MLproject

+11

name: yiqing-test

docker_env:
  # this is a Docker image on Docker Hub that needs to be accessed using credentials
  image: grointelligence/worker

entry_points:
  main:
    parameters:
      model_input: {type: float, default: 0.1}
    command: "python mlflowtest/train.py {model_input}"
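
Not part of the committed files, but as a usage note: with the entry point above, a single run of this project can also be launched from Python instead of the `mlflow run` CLI. A minimal sketch, assuming `mlflow` is installed and Docker is logged in so the `grointelligence/worker` image can be pulled:

```python
# Minimal sketch (illustrative, not in this repo): one local run of the "main" entry point.
import mlflow.projects

submitted = mlflow.projects.run(
    uri=".",                          # root of gro-mlproject, where MLproject lives
    entry_point="main",
    parameters={"model_input": 0.1},  # substituted into "python mlflowtest/train.py {model_input}"
    synchronous=True,                 # block until the run finishes
)
print(submitted.run_id, submitted.get_status())
```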

README.md

+17

# gro-mlproject
The goal of this project is to demo what we want to achieve with this POC.

## Problem description
We want to run thousands of experiments to tweak a parameter, alpha. To run one experiment locally, you can run from the command line
```
python mlflowtest/train.py 0.5
```
which is not scalable if we want to run the training script with 1000 different values for alpha.

Alternatively, we can send the training script to Databricks with something like the following:
```
mlflow run ../gro-mlproject -b databricks --backend-config cluster-spec.json --experiment-id 4069866474349730 -P alpha=0.5
```
In order for Databricks to run the script, one needs to specify the environment using one of the three options [here](https://www.mlflow.org/docs/latest/projects.html#project-environments). Specifically:
1. conda: `mlflow run` works when we specify `conda_env` in `MLproject`. However, it is not a feasible option because our model requires more complicated dependencies than a `conda.yaml` file can hold. For example, we might require certain system packages to be installed.
2. docker container: `mlflow run` does not work with the current setup; it fails with ```ERROR mlflow.cli: === Running docker-based projects on Databricks is not yet supported. ===```
3. system: not an option, since we want to run the experiments on Databricks.
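
For illustration only (none of this is in the committed files): the `mlflow run ... -b databricks` command above has a Python-API equivalent, `mlflow.projects.run`, which is roughly what a sweep over many alpha values would look like if it could be scripted. A hedged sketch follows; note that it would currently hit the docker-backend limitation described in option 2, and that the `MLproject` in this commit declares the parameter as `model_input` while the CLI example passes `alpha`:

```python
# Hypothetical sketch: sweeping alpha on Databricks via MLflow's Projects API.
# Mirrors: mlflow run ../gro-mlproject -b databricks --backend-config cluster-spec.json ...
import numpy as np
import mlflow.projects

for alpha in np.linspace(0.01, 1.0, 1000):
    mlflow.projects.run(
        uri="../gro-mlproject",
        entry_point="main",
        parameters={"alpha": float(alpha)},  # or "model_input", matching MLproject
        backend="databricks",
        backend_config="cluster-spec.json",
        experiment_id="4069866474349730",
        synchronous=False,                   # queue this run and move on to the next alpha
    )
```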

cluster-spec.json

+6

{
  "spark_version": "7.3.x-scala2.12",
  "num_workers": 1,
  "node_type_id": "i3.xlarge"
}

conda.yaml

+10

name: yiqing-test
channels:
  - defaults
dependencies:
  - numpy>=1.14.3
  - pandas>=1.0.0
  - scikit-learn=0.19.1
  - pip
  - pip:
      - mlflow

mlflowtest/train.py

+67

# The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality
# P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
# Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.

import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet

import mlflow
import mlflow.sklearn


def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2


if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Read the wine-quality csv file (make sure you're running this from the root of MLflow!)
    wine_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "wine-quality.csv")
    data = pd.read_csv(wine_path)

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train.drop(["quality"], axis=1)
    test_x = test.drop(["quality"], axis=1)
    train_y = train[["quality"]]
    test_y = test[["quality"]]

    alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.5
    l1_ratio = float(sys.argv[2]) if len(sys.argv) > 2 else 0.5

    with mlflow.start_run():
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        predicted_qualities = lr.predict(test_x)

        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        mlflow.sklearn.log_model(lr, "model")
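
One follow-up worth noting (not part of this commit): because the script logs `alpha`, `l1_ratio`, and the three metrics to MLflow, the results of a sweep can be pulled back as a pandas DataFrame with `mlflow.search_runs`. A minimal sketch, assuming the runs were logged to the default local `./mlruns` store under the default experiment:

```python
# Hypothetical sketch: rank completed runs by RMSE after a sweep.
import mlflow

runs = mlflow.search_runs(experiment_ids=["0"])  # "0" is the default experiment id
cols = ["params.alpha", "params.l1_ratio", "metrics.rmse", "metrics.mae", "metrics.r2"]
print(runs[cols].sort_values("metrics.rmse").head(10))
```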

mlflowtest/wine-quality.csv

+36

"fixed acidity","volatile acidity","citric acid","residual sugar","chlorides","free sulfur dioxide","total sulfur dioxide","density","pH","sulphates","alcohol","quality"
7,0.27,0.36,20.7,0.045,45,170,1.001,3,0.45,8.8,6
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
6.2,0.32,0.16,7,0.045,30,136,0.9949,3.18,0.47,9.6,6
7,0.27,0.36,20.7,0.045,45,170,1.001,3,0.45,8.8,6
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
8.1,0.22,0.43,1.5,0.044,28,129,0.9938,3.22,0.45,11,6
8.1,0.27,0.41,1.45,0.033,11,63,0.9908,2.99,0.56,12,5
8.6,0.23,0.4,4.2,0.035,17,109,0.9947,3.14,0.53,9.7,5
7.9,0.18,0.37,1.2,0.04,16,75,0.992,3.18,0.63,10.8,5
6.6,0.16,0.4,1.5,0.044,48,143,0.9912,3.54,0.52,12.4,7
8.3,0.42,0.62,19.25,0.04,41,172,1.0002,2.98,0.67,9.7,5
6.6,0.17,0.38,1.5,0.032,28,112,0.9914,3.25,0.55,11.4,7
6.3,0.48,0.04,1.1,0.046,30,99,0.9928,3.24,0.36,9.6,6
6.2,0.66,0.48,1.2,0.029,29,75,0.9892,3.33,0.39,12.8,8
7.4,0.34,0.42,1.1,0.033,17,171,0.9917,3.12,0.53,11.3,6
6.5,0.31,0.14,7.5,0.044,34,133,0.9955,3.22,0.5,9.5,5
6.2,0.66,0.48,1.2,0.029,29,75,0.9892,3.33,0.39,12.8,8
6.4,0.31,0.38,2.9,0.038,19,102,0.9912,3.17,0.35,11,7
6.8,0.26,0.42,1.7,0.049,41,122,0.993,3.47,0.48,10.5,8
7.6,0.67,0.14,1.5,0.074,25,168,0.9937,3.05,0.51,9.3,5
6.6,0.27,0.41,1.3,0.052,16,142,0.9951,3.42,0.47,10,6
7,0.25,0.32,9,0.046,56,245,0.9955,3.25,0.5,10.4,6
6.9,0.24,0.35,1,0.052,35,146,0.993,3.45,0.44,10,6
7,0.28,0.39,8.7,0.051,32,141,0.9961,3.38,0.53,10.5,6
7.4,0.27,0.48,1.1,0.047,17,132,0.9914,3.19,0.49,11.6,6
7.2,0.32,0.36,2,0.033,37,114,0.9906,3.1,0.71,12.3,7
8.5,0.24,0.39,10.4,0.044,20,142,0.9974,3.2,0.53,10,6
8.3,0.14,0.34,1.1,0.042,7,47,0.9934,3.47,0.4,10.2,6
7.4,0.25,0.36,2.05,0.05,31,100,0.992,3.19,0.44,10.8,6
6.2,0.12,0.34,1.5,0.045,43,117,0.9939,3.42,0.51,9,6
5.8,0.27,0.2,14.95,0.044,22,179,0.9962,3.37,0.37,10.2,5
