
updated ai exp 3 lab - replaced get-data with tabular dataset to run local experiment
roxanagoidaci committed Dec 19, 2019
1 parent 63a98e7 commit d5b03c1
Showing 1 changed file with 36 additions and 48 deletions.
84 changes: 36 additions & 48 deletions lab-files/ai/3/predict-battery-life-with-AML.ipynb
@@ -32,7 +32,7 @@
"To get these values, do the following:\n",
"1. Navigate to the Azure Portal and login with the credentials provided.\n",
"2. From the left hand menu, under Favorites, select `Resource Groups`.\n",
"3. In the list, select the resource group with the name similar to `tech_immersion_XXXXX`.\n",
"3. In the list, select the resource group with the name similar to `tech-immersion-XXXXX`.\n",
"4. From the Overview tab, capture the desired values.\n",
"\n",
"Execute the following cell by selecting the `>|Run` button in the command bar above.\n",
@@ -52,10 +52,10 @@
"subscription_id = \"\" # <- needs to be the subscription with the resource group\n",
"\n",
"#Provide values for the existing Resource Group \n",
"resource_group = \"tech_immersion_XXXXX\" # <- replace XXXXX with your unique identifier\n",
"resource_group = \"tech-immersion-XXXXX\" # <- replace XXXXX with your unique identifier\n",
"\n",
"#Provide the Workspace Name and Azure Region of the Azure Machine Learning Workspace\n",
"workspace_name = \"tech_immersion_aml_XXXXX\" # <- replace XXXXX with your unique identifier (should be lowercase)\n",
"workspace_name = \"tech-immersion-aml-XXXXX\" # <- replace XXXXX with your unique identifier (should be lowercase)\n",
"workspace_region = \"eastus2\" # <- region of your resource group\n",
"#other options for region include eastus, westcentralus, southeastasia, australiaeast, westeurope"
]
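The hyphenated, lowercase names introduced in this hunk matter because some Azure resource names are case- and character-sensitive. A minimal sketch of a name check (the `is_valid_name` helper and its exact pattern are assumptions for illustration, not part of the lab):

```python
import re

def is_valid_name(name: str) -> bool:
    # Hypothetical helper: accept only lowercase letters, digits, and
    # hyphens, the convention this lab uses (e.g. tech-immersion-aml-12345).
    return bool(re.fullmatch(r"[a-z0-9][a-z0-9-]*", name))

print(is_valid_name("tech-immersion-aml-12345"))  # True
print(is_valid_name("tech_immersion_AML_12345"))  # False
```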
@@ -105,17 +105,6 @@
"In Azure Notebooks, all of the libraries needed for Azure Machine Learning are pre-installed. To use them, you just need to import them. Run the following cell to do so:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install --upgrade azureml-sdk==1.0.74\n",
"!pip install azureml-automl-core==1.0.74\n",
"!pip install azureml-train-automl==1.0.74 "
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -128,23 +117,27 @@
"import os\n",
"import random\n",
"import re\n",
"import urllib.request\n",
"\n",
"from matplotlib import pyplot as plt\n",
"from matplotlib.pyplot import imshow\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn import datasets\n",
"\n",
"import azureml.core\n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.core.compute import ComputeTarget\n",
"from azureml.core.webservice import Webservice\n",
"from azureml.core.compute import AksCompute, ComputeTarget\n",
"from azureml.core.webservice import Webservice, AksWebservice\n",
"from azureml.core.image import Image\n",
"from azureml.core.model import Model\n",
"from azureml.train.automl import AutoMLConfig\n",
"from azureml.train.automl.run import AutoMLRun\n",
"from azureml.core import Workspace"
"from azureml.core import Workspace\n",
"from azureml.data.azure_storage_datastore import AzureBlobDatastore\n",
"from azureml.core import Dataset\n",
"\n",
"# Check core SDK version number\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
@@ -285,18 +278,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create the data loading script for remote compute"
"### Create Azure Machine Learning TabularDataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The Azure Machine Learning Compute cluster needs to know how to get the data to train against. You can package this logic in a script that will be executed by the compute when it starts executing the training.\n",
"\n",
"Run the following cells to locally create the **get_data.py** script that will be deployed to remote compute. You will also use this script when you want train the model locally. \n",
"\n",
"Observe that the get_data method returns the features (`X`) and the labels (`Y`) in an object. This structure is expected later when you will configure Auto ML."
"Download the training dataset to the project_folder, and then upload the data to the default workspace datastore, which is backed by Azure Blob storage. Next, using the training data saved in the default workspace datastore, we will create an unregistered TabularDataset pointing to its path in the datastore. This dataset reference will let us access the training data seamlessly during model training, without worrying about connection strings or data paths."
]
},
{
@@ -309,30 +298,27 @@
"source": [
"# create project folder\n",
"if not os.path.exists(project_folder):\n",
" os.makedirs(project_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"trusted": true
},
"outputs": [],
"source": [
"%%writefile $project_folder/get_data.py\n",
" os.makedirs(project_folder)\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"# download the training dataset from data_url to the project folder\n",
"urllib.request.urlretrieve(data_url, os.path.join(project_folder, 'training-formatted.csv'))\n",
"\n",
"def get_data():\n",
" \n",
" data = pd.read_csv(\"https://databricksdemostore.blob.core.windows.net/data/connected-car/training-formatted.csv\")\n",
" \n",
" X = data.iloc[:,1:73]\n",
" Y = data.iloc[:,0].values.flatten()\n",
"# upload training dataset to default workspace datastore\n",
"datastore = ws.get_default_datastore()\n",
"datastore.upload_files(files = [os.path.join(project_folder, 'training-formatted.csv')],\n",
" target_path = 'train-dataset/tabular/',\n",
" overwrite = True,\n",
" show_progress = True)\n",
"\n",
"# create TabularDataset reference\n",
"dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, \n",
" 'train-dataset/tabular/training-formatted.csv')])\n",
"\n",
"# target or label column name\n",
"target_column_name = 'Survival_In_Days'\n",
"\n",
" return { \"X\" : X, \"y\" : Y }"
"# preview the first 5 rows of the dataset\n",
"dataset.take(5).to_pandas_dataframe()"
]
},
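The cell above hands AutoML one table plus the label column name, instead of the pre-split `X`/`y` that the old `get_data.py` returned. The split AutoML performs is straightforward; here is a stand-in using only the standard library (the feature column names are invented for illustration, only `Survival_In_Days` comes from the lab):

```python
import csv
import io

# Tiny stand-in for training-formatted.csv; feature columns are hypothetical.
sample = """Survival_In_Days,Trip_Length_Mean,Trip_Length_Sigma
1500,12.3,4.1
900,8.7,2.2
"""

rows = list(csv.DictReader(io.StringIO(sample)))
target_column_name = "Survival_In_Days"

# Separate the label column from the feature columns, as AutoML does
# when given training_data plus label_column_name.
y = [float(row.pop(target_column_name)) for row in rows]
X = rows

print(y)     # [1500.0, 900.0]
print(X[0])  # {'Trip_Length_Mean': '12.3', 'Trip_Length_Sigma': '4.1'}
```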
{
@@ -426,7 +412,8 @@
" n_cross_validations = 5,\n",
" debug_log = 'automl.log',\n",
" verbosity = logging.DEBUG,\n",
" data_script = project_folder + \"/get_data.py\",\n",
" training_data = dataset, \n",
" label_column_name=target_column_name,\n",
" path = project_folder)"
]
},
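The `n_cross_validations = 5` setting in the config above asks AutoML to score each candidate model with 5-fold cross-validation. A rough sketch of how row indices get partitioned into folds (plain Python, not Azure ML code; the helper name is invented):

```python
def kfold_indices(n_rows: int, n_folds: int = 5):
    # Partition row indices into n_folds contiguous, near-equal folds;
    # each fold serves once as the held-out validation set.
    fold_sizes = [n_rows // n_folds + (1 if i < n_rows % n_folds else 0)
                  for i in range(n_folds)]
    folds, start = [], 0
    for size in fold_sizes:
        folds.append(list(range(start, start + size)))
        start += size
    return folds

print(kfold_indices(10))  # [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]
```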
@@ -491,7 +478,8 @@
" n_cross_validations = 5,\n",
" debug_log = 'automl.log',\n",
" verbosity = logging.DEBUG,\n",
" data_script = project_folder + \"/get_data.py\",\n",
" training_data = dataset, \n",
" label_column_name=target_column_name,\n",
" compute_target = compute_target,\n",
" path = project_folder)\n",
"remote_run = experiment.submit(automl_config, show_output=False)\n",
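Because `show_output=False` returns immediately, the notebook then polls the submitted run until it reaches a terminal state. A minimal sketch of that polling pattern, with `get_status` standing in for the Azure ML status query (the function and state names here are illustrative, not the SDK's API):

```python
import time

TERMINAL_STATES = {"Completed", "Failed", "Canceled"}

def poll_until_done(get_status, poll_secs: float = 0.01):
    # Poll the run's status until it reaches a terminal state, roughly
    # what waiting on a remote run does against the Azure ML service.
    while (status := get_status()) not in TERMINAL_STATES:
        time.sleep(poll_secs)
    return status

# Simulate a run that completes after a few polls.
states = iter(["NotStarted", "Running", "Running", "Completed"])
print(poll_until_done(lambda: next(states)))  # Completed
```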
