Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
allwefantasy committed Oct 7, 2018
1 parent 0b554a9 commit 13c32ab
Show file tree
Hide file tree
Showing 15 changed files with 5,374 additions and 27 deletions.
21 changes: 21 additions & 0 deletions examples/sklearn_elasticnet_wine/MLproject
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Project definition: a project name, the conda environment file to use, and
# the entry points with their parameters and shell commands.
name: tutorial

conda_env: conda.yaml

# Three entry points are declared below (train, batch_predict, api_predict).
# Each declares the same two float parameters, alpha and l1_ratio.
entry_points:
main:
train:
parameters:
alpha: {type: float, default: 0.5}
l1_ratio: {type: float, default: 0.1}
# NOTE(review): the command passes the literals 0.5 and 0.1 instead of
# referencing the alpha / l1_ratio parameters declared above, so overriding
# those parameters presumably has no effect on training — confirm intent.
command: "python train.py 0.5 0.1"
batch_predict:
parameters:
alpha: {type: float, default: 0.5}
l1_ratio: {type: float, default: 0.1}
# The command takes no arguments; the declared parameters are not referenced.
command: "python batchPredict.py"
api_predict:
parameters:
alpha: {type: float, default: 0.5}
l1_ratio: {type: float, default: 0.1}
# The command takes no arguments; the declared parameters are not referenced.
command: "python predict.py"
26 changes: 26 additions & 0 deletions examples/sklearn_elasticnet_wine/batchPredict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import mlsql
import pickle
import json
import os
from pyspark.ml.linalg import VectorUDT, Vectors

# Batch prediction: read the data, model and output locations from the
# "internalSystemParam" entry of mlsql.params(), score every JSON line of the
# data file and write one JSON prediction per line to the output file.
isp = mlsql.params()["internalSystemParam"]
tempDataLocalPath = isp["tempDataLocalPath"]
tempModelLocalPath = isp["tempModelLocalPath"]
tempOutputLocalPath = isp["tempOutputLocalPath"]

print("tempModelLocalPath:%s" % (tempModelLocalPath))
# NOTE(review): unpickling executes arbitrary code; this is only safe while
# model.pkl is produced by a trusted training step.
# The context manager closes the model file instead of leaking the handle.
with open(tempModelLocalPath + "/model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

print("tempDataLocalPath:%s" % (tempDataLocalPath))
with open(tempOutputLocalPath, "w") as o, open(tempDataLocalPath) as f:
    # Stream the file line by line rather than loading it all with readlines().
    for line in f:
        # A blank line is not a record; skip it instead of failing in json.loads.
        if not line.strip():
            continue
        obj = json.loads(line)
        # Every attribute except the "quality" label is a feature; the feature
        # order is the key order of the JSON object.
        features = [
            value for attribute, value in obj.items() if attribute != "quality"
        ]
        y = model.predict([features])
        o.write(json.dumps({"predict": y.tolist()}) + "\n")
12 changes: 12 additions & 0 deletions examples/sklearn_elasticnet_wine/conda.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Conda environment for the project: Python 3.6 with numpy, pandas and
# scikit-learn from the defaults channel, plus pip-installed packages.
name: tutorial
channels:
- defaults
dependencies:
- python=3.6
- numpy=1.14.3
- pandas=0.22.0
- scikit-learn=0.19.1
- pip:
# NOTE(review): mlflow is the only dependency without a version pin, so the
# environment is not fully reproducible — consider pinning it.
- mlflow
- kafka-python==1.4.3
- pyspark==2.3.1
17 changes: 17 additions & 0 deletions examples/sklearn_elasticnet_wine/predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from pyspark.ml.linalg import VectorUDT, Vectors
import pickle
import os
import python_fun


def predict(index, s):
    """Score one feature vector with the pickled model referenced by the input.

    Args:
        index: Not used by this function (presumably supplied by the
            python_fun UDF runtime; confirm against the caller).
        s: Iterable whose first item is a pickled, serialized vector and whose
            second item is a pickled sequence; the first element of that
            sequence is the directory containing ``model.pkl``.

    Returns:
        A one-element list holding the serialized dense vector of predictions.
    """
    items = list(s)
    feature = VectorUDT().deserialize(pickle.loads(items[0]))

    # Decode the model directory once and reuse it for logging and loading.
    model_dir = pickle.loads(items[1])[0]
    print(model_dir)

    # NOTE(review): unpickling executes arbitrary code; only safe while the
    # payload and model.pkl come from a trusted source.
    # The context manager closes the model file instead of leaking the handle.
    with open(model_dir + "/model.pkl", "rb") as model_file:
        model = pickle.load(model_file)

    y = model.predict([feature.toArray()])
    print("------------")
    return [VectorUDT().serialize(Vectors.dense(y))]


# Hand the predict function to python_fun.udf (presumably this registers it as
# the UDF entry point — confirm against the python_fun module).
python_fun.udf(predict)
191 changes: 191 additions & 0 deletions examples/sklearn_elasticnet_wine/train.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MLflow Training Tutorial\n",
"\n",
"This `train.ipynb` Jupyter notebook predicts the quality of wine using [sklearn.linear_model.ElasticNet](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html). \n",
"\n",
"> This is the Jupyter notebook version of the `train.py` example\n",
"\n",
"Attribution\n",
"* The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality\n",
"* P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.\n",
"* Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Wine Quality Sample\n",
"def train(in_alpha, in_l1_ratio):\n",
"    \"\"\"Train an ElasticNet model on wine-quality.csv and log the run to MLflow.\n",
"\n",
"    in_alpha and in_l1_ratio are converted with float() and passed to ElasticNet\n",
"    as alpha and l1_ratio; passing None for either one selects its default of 0.5.\n",
"    \"\"\"\n",
"    import warnings\n",
"\n",
"    import pandas as pd\n",
"    import numpy as np\n",
"    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n",
"    from sklearn.model_selection import train_test_split\n",
"    from sklearn.linear_model import ElasticNet\n",
"\n",
"    import mlflow\n",
"    import mlflow.sklearn\n",
"\n",
"    def eval_metrics(actual, pred):\n",
"        rmse = np.sqrt(mean_squared_error(actual, pred))\n",
"        mae = mean_absolute_error(actual, pred)\n",
"        r2 = r2_score(actual, pred)\n",
"        return rmse, mae, r2\n",
"\n",
"\n",
"    warnings.filterwarnings(\"ignore\")\n",
"    np.random.seed(40)\n",
"\n",
"    # Read the wine-quality csv file.\n",
"    # Assumes wine-quality.csv is located in the current working directory\n",
"    # (normally the same folder as the notebook).\n",
"    wine_path = \"wine-quality.csv\"\n",
"    data = pd.read_csv(wine_path)\n",
"\n",
"    # Split the data into training and test sets. (0.75, 0.25) split.\n",
"    train_df, test_df = train_test_split(data)\n",
"\n",
"    # The predicted column is \"quality\" which is a scalar from [3, 9]\n",
"    train_x = train_df.drop([\"quality\"], axis=1)\n",
"    test_x = test_df.drop([\"quality\"], axis=1)\n",
"    train_y = train_df[[\"quality\"]]\n",
"    test_y = test_df[[\"quality\"]]\n",
"\n",
"    # Fall back to the defaults when no value is provided. The None check must\n",
"    # happen before float(): float(None) raises TypeError and a float is never None.\n",
"    alpha = 0.5 if in_alpha is None else float(in_alpha)\n",
"    l1_ratio = 0.5 if in_l1_ratio is None else float(in_l1_ratio)\n",
"\n",
"    # Useful for multiple runs (only doing one run in this sample notebook) \n",
"    with mlflow.start_run():\n",
"        # Execute ElasticNet\n",
"        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)\n",
"        lr.fit(train_x, train_y)\n",
"\n",
"        # Evaluate Metrics\n",
"        predicted_qualities = lr.predict(test_x)\n",
"        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)\n",
"\n",
"        # Print out metrics\n",
"        print(\"Elasticnet model (alpha=%f, l1_ratio=%f):\" % (alpha, l1_ratio))\n",
"        print(\" RMSE: %s\" % rmse)\n",
"        print(\" MAE: %s\" % mae)\n",
"        print(\" R2: %s\" % r2)\n",
"\n",
"        # Log parameter, metrics, and model to MLflow\n",
"        mlflow.log_param(\"alpha\", alpha)\n",
"        mlflow.log_param(\"l1_ratio\", l1_ratio)\n",
"        mlflow.log_metric(\"rmse\", rmse)\n",
"        mlflow.log_metric(\"r2\", r2)\n",
"        mlflow.log_metric(\"mae\", mae)\n",
"\n",
"        mlflow.sklearn.log_model(lr, \"model\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Elasticnet model (alpha=0.500000, l1_ratio=0.500000):\n",
" RMSE: 0.82224284975954\n",
" MAE: 0.6278761410160691\n",
" R2: 0.12678721972772689\n"
]
}
],
"source": [
"train(0.5, 0.5)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Elasticnet model (alpha=0.200000, l1_ratio=0.200000):\n",
" RMSE: 0.7859129997062342\n",
" MAE: 0.6155290394093894\n",
" R2: 0.20224631822892092\n"
]
}
],
"source": [
"train(0.2, 0.2)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Elasticnet model (alpha=0.100000, l1_ratio=0.100000):\n",
" RMSE: 0.7792546522251949\n",
" MAE: 0.6112547988118587\n",
" R2: 0.2157063843066196\n"
]
}
],
"source": [
"train(0.1, 0.1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
71 changes: 71 additions & 0 deletions examples/sklearn_elasticnet_wine/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality
# P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
# Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.

import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet

import mlflow
import mlflow.sklearn

import mlsql


def eval_metrics(actual, pred):
    """Return the (rmse, mae, r2) regression metrics for the predictions.

    Args:
        actual: Observed target values.
        pred: Predicted target values.

    Returns:
        A tuple of the square root of the mean squared error, the mean
        absolute error and the r2 score, in that order.
    """
    squared_error = mean_squared_error(actual, pred)
    return (
        np.sqrt(squared_error),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )


# Script entry point: train an ElasticNet model on wine-quality.csv, log the
# run to MLflow, then save the model to the path supplied by mlsql.params().
# Usage: python train.py [alpha] [l1_ratio]  (both default to 0.5 when omitted).
if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # The CSV is resolved relative to this script's own directory, so the
    # current working directory does not matter.
    wine_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "wine-quality.csv")
    data = pd.read_csv(wine_path)

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train.drop(["quality"], axis=1)
    test_x = test.drop(["quality"], axis=1)
    train_y = train[["quality"]]
    test_y = test[["quality"]]

    # Hyper-parameters come from the command line when present.
    alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.5
    l1_ratio = float(sys.argv[2]) if len(sys.argv) > 2 else 0.5

    with mlflow.start_run():
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        predicted_qualities = lr.predict(test_x)

        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print(" RMSE: %s" % rmse)
        print(" MAE: %s" % mae)
        print(" R2: %s" % r2)

        # Record the parameters, metrics and model on the active MLflow run.
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        mlflow.sklearn.log_model(lr, "model")

        # Fetch the MLSQL parameters once and reuse them for both the debug
        # print and the model output path.
        params = mlsql.params()
        print(params)
        isp = params["internalSystemParam"]
        tempModelLocalPath = isp["tempModelLocalPath"]
        # Also write the model to the MLSQL-provided path; batchPredict.py
        # loads "model.pkl" from that same directory.
        mlflow.sklearn.save_model(lr, tempModelLocalPath)
Loading

0 comments on commit 13c32ab

Please sign in to comment.