update

reyesyang · Oct 7, 2018 · 13c32ab · 13c32ab
1 parent 0b554a9
commit 13c32ab
Show file tree

Hide file tree

Showing 15 changed files with 5,374 additions and 27 deletions.
diff --git a/examples/sklearn_elasticnet_wine/MLproject b/examples/sklearn_elasticnet_wine/MLproject
@@ -0,0 +1,21 @@
+name: tutorial
+
+conda_env: conda.yaml
+
+entry_points:
+  main:
+    train:
+        parameters:
+          alpha: {type: float, default: 0.5}
+          l1_ratio: {type: float, default: 0.1}
+        command: "python train.py 0.5 0.1"
+    batch_predict:
+        parameters:
+          alpha: {type: float, default: 0.5}
+          l1_ratio: {type: float, default: 0.1}
+        command: "python batchPredict.py"
+    api_predict:
+        parameters:
+          alpha: {type: float, default: 0.5}
+          l1_ratio: {type: float, default: 0.1}
+        command: "python predict.py"
diff --git a/examples/sklearn_elasticnet_wine/batchPredict.py b/examples/sklearn_elasticnet_wine/batchPredict.py
@@ -0,0 +1,26 @@
+import mlsql
+import pickle
+import json
+import os
+from pyspark.ml.linalg import VectorUDT, Vectors
+
+# get information from mlsql
+isp = mlsql.params()["internalSystemParam"]
+tempDataLocalPath = isp["tempDataLocalPath"]
+tempModelLocalPath = isp["tempModelLocalPath"]
+tempOutputLocalPath = isp["tempOutputLocalPath"]
+
+print("tempModelLocalPath:%s" % (tempModelLocalPath))
+model = pickle.load(open(tempModelLocalPath + "/model.pkl", "rb"))
+
+print("tempDataLocalPath:%s" % (tempDataLocalPath))
+with open(tempOutputLocalPath, "w") as o:
+    with open(tempDataLocalPath) as f:
+        for line in f.readlines():
+            obj = json.loads(line)
+            features = []
+            for attribute, value in obj.items():
+                if attribute != "quality":
+                    features.append(value)
+            y = model.predict([features])
+            o.write(json.dumps({"predict": y.tolist()}) + "\n")
diff --git a/examples/sklearn_elasticnet_wine/conda.yaml b/examples/sklearn_elasticnet_wine/conda.yaml
@@ -0,0 +1,12 @@
+name: tutorial
+channels:
+  - defaults
+dependencies:
+  - python=3.6
+  - numpy=1.14.3
+  - pandas=0.22.0
+  - scikit-learn=0.19.1
+  - pip:
+    - mlflow
+    - kafka-python==1.4.3
+    - pyspark==2.3.1
diff --git a/examples/sklearn_elasticnet_wine/predict.py b/examples/sklearn_elasticnet_wine/predict.py
@@ -0,0 +1,17 @@
+from pyspark.ml.linalg import VectorUDT, Vectors
+import pickle
+import os
+import python_fun
+
+
+def predict(index, s):
+    items = [i for i in s]
+    feature = VectorUDT().deserialize(pickle.loads(items[0]))
+    print(pickle.loads(items[1])[0])
+    model = pickle.load(open(pickle.loads(items[1])[0] + "/model.pkl", "rb"))
+    y = model.predict([feature.toArray()])
+    print("------------")
+    return [VectorUDT().serialize(Vectors.dense(y))]
+
+
+python_fun.udf(predict)
diff --git a/examples/sklearn_elasticnet_wine/train.ipynb b/examples/sklearn_elasticnet_wine/train.ipynb
@@ -0,0 +1,191 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# MLflow Training Tutorial\n",
+    "\n",
+    "This `train.ipynb` Jupyter notebook predicts the quality of wine using [sklearn.linear_model.ElasticNet](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html).  \n",
+    "\n",
+    "> This is the Jupyter notebook version of the `train.py` example\n",
+    "\n",
+    "Attribution\n",
+    "* The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality\n",
+    "* P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.\n",
+    "* Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Wine Quality Sample\n",
+    "def train(in_alpha, in_l1_ratio):\n",
+    "    \"\"\"Fit an ElasticNet model on the wine-quality data and log the run to MLflow.\n",
+    "\n",
+    "    Args:\n",
+    "        in_alpha: Value converted to float and passed to ElasticNet as alpha;\n",
+    "            None selects the default 0.5.\n",
+    "        in_l1_ratio: Value converted to float and passed to ElasticNet as\n",
+    "            l1_ratio; None selects the default 0.5.\n",
+    "    \"\"\"\n",
+    "    import os\n",
+    "    import warnings\n",
+    "    import sys\n",
+    "\n",
+    "    import pandas as pd\n",
+    "    import numpy as np\n",
+    "    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n",
+    "    from sklearn.model_selection import train_test_split\n",
+    "    from sklearn.linear_model import ElasticNet\n",
+    "\n",
+    "    import mlflow\n",
+    "    import mlflow.sklearn\n",
+    "\n",
+    "    def eval_metrics(actual, pred):\n",
+    "        rmse = np.sqrt(mean_squared_error(actual, pred))\n",
+    "        mae = mean_absolute_error(actual, pred)\n",
+    "        r2 = r2_score(actual, pred)\n",
+    "        return rmse, mae, r2\n",
+    "\n",
+    "\n",
+    "    warnings.filterwarnings(\"ignore\")\n",
+    "    np.random.seed(40)\n",
+    "\n",
+    "    # Read the wine-quality csv file.\n",
+    "    #  Assumes wine-quality.csv is located in the same folder as the notebook\n",
+    "    wine_path = \"wine-quality.csv\"\n",
+    "    data = pd.read_csv(wine_path)\n",
+    "\n",
+    "    # Split the data into training and test sets. (0.75, 0.25) split.\n",
+    "    train, test = train_test_split(data)\n",
+    "\n",
+    "    # The predicted column is \"quality\" which is a scalar from [3, 9]\n",
+    "    train_x = train.drop([\"quality\"], axis=1)\n",
+    "    test_x = test.drop([\"quality\"], axis=1)\n",
+    "    train_y = train[[\"quality\"]]\n",
+    "    test_y = test[[\"quality\"]]\n",
+    "\n",
+    "    # Set default values if no alpha is provided.\n",
+    "    # Test the raw argument: float(None) raises TypeError and a float is never None.\n",
+    "    if in_alpha is None:\n",
+    "        alpha = 0.5\n",
+    "    else:\n",
+    "        alpha = float(in_alpha)\n",
+    "\n",
+    "    # Set default values if no l1_ratio is provided\n",
+    "    if in_l1_ratio is None:\n",
+    "        l1_ratio = 0.5\n",
+    "    else:\n",
+    "        l1_ratio = float(in_l1_ratio)\n",
+    "\n",
+    "    # Useful for multiple runs (only doing one run in this sample notebook)\n",
+    "    with mlflow.start_run():\n",
+    "        # Execute ElasticNet\n",
+    "        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)\n",
+    "        lr.fit(train_x, train_y)\n",
+    "\n",
+    "        # Evaluate Metrics\n",
+    "        predicted_qualities = lr.predict(test_x)\n",
+    "        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)\n",
+    "\n",
+    "        # Print out metrics\n",
+    "        print(\"Elasticnet model (alpha=%f, l1_ratio=%f):\" % (alpha, l1_ratio))\n",
+    "        print(\"  RMSE: %s\" % rmse)\n",
+    "        print(\"  MAE: %s\" % mae)\n",
+    "        print(\"  R2: %s\" % r2)\n",
+    "\n",
+    "        # Log parameter, metrics, and model to MLflow\n",
+    "        mlflow.log_param(\"alpha\", alpha)\n",
+    "        mlflow.log_param(\"l1_ratio\", l1_ratio)\n",
+    "        mlflow.log_metric(\"rmse\", rmse)\n",
+    "        mlflow.log_metric(\"r2\", r2)\n",
+    "        mlflow.log_metric(\"mae\", mae)\n",
+    "\n",
+    "        mlflow.sklearn.log_model(lr, \"model\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Elasticnet model (alpha=0.500000, l1_ratio=0.500000):\n",
+      "  RMSE: 0.82224284975954\n",
+      "  MAE: 0.6278761410160691\n",
+      "  R2: 0.12678721972772689\n"
+     ]
+    }
+   ],
+   "source": [
+    "train(0.5, 0.5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Elasticnet model (alpha=0.200000, l1_ratio=0.200000):\n",
+      "  RMSE: 0.7859129997062342\n",
+      "  MAE: 0.6155290394093894\n",
+      "  R2: 0.20224631822892092\n"
+     ]
+    }
+   ],
+   "source": [
+    "train(0.2, 0.2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Elasticnet model (alpha=0.100000, l1_ratio=0.100000):\n",
+      "  RMSE: 0.7792546522251949\n",
+      "  MAE: 0.6112547988118587\n",
+      "  R2: 0.2157063843066196\n"
+     ]
+    }
+   ],
+   "source": [
+    "train(0.1, 0.1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/sklearn_elasticnet_wine/train.py b/examples/sklearn_elasticnet_wine/train.py
@@ -0,0 +1,71 @@
+# The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality
+# P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
+# Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.
+
+import os
+import warnings
+import sys
+
+import pandas as pd
+import numpy as np
+from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import ElasticNet
+
+import mlflow
+import mlflow.sklearn
+
+import mlsql
+
+
+def eval_metrics(actual, pred):
+    rmse = np.sqrt(mean_squared_error(actual, pred))
+    mae = mean_absolute_error(actual, pred)
+    r2 = r2_score(actual, pred)
+    return rmse, mae, r2
+
+
+if __name__ == "__main__":
+    warnings.filterwarnings("ignore")
+    np.random.seed(40)
+
+    # Read the wine-quality csv file (make sure you're running this from the root of MLflow!)
+    wine_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "wine-quality.csv")
+    data = pd.read_csv(wine_path)
+
+    # Split the data into training and test sets. (0.75, 0.25) split.
+    train, test = train_test_split(data)
+
+    # The predicted column is "quality" which is a scalar from [3, 9]
+    train_x = train.drop(["quality"], axis=1)
+    test_x = test.drop(["quality"], axis=1)
+    train_y = train[["quality"]]
+    test_y = test[["quality"]]
+
+    alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.5
+    l1_ratio = float(sys.argv[2]) if len(sys.argv) > 2 else 0.5
+
+    with mlflow.start_run():
+        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
+        lr.fit(train_x, train_y)
+
+        predicted_qualities = lr.predict(test_x)
+
+        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)
+
+        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
+        print("  RMSE: %s" % rmse)
+        print("  MAE: %s" % mae)
+        print("  R2: %s" % r2)
+
+        mlflow.log_param("alpha", alpha)
+        mlflow.log_param("l1_ratio", l1_ratio)
+        mlflow.log_metric("rmse", rmse)
+        mlflow.log_metric("r2", r2)
+        mlflow.log_metric("mae", mae)
+
+        mlflow.sklearn.log_model(lr, "model")
+        print(mlsql.params())
+        isp = mlsql.params()["internalSystemParam"]
+        tempModelLocalPath = isp["tempModelLocalPath"]
+        mlflow.sklearn.save_model(lr, tempModelLocalPath)