Fix 2020_06_04 catboost tutorial

Note: mandatory check (NEED_CHECK) was skipped ref:fdfe4258918442af8f0eec5e5b60c005eeaee7a9
blatr · Jun 4, 2020 · 25c7f81 · 25c7f81
1 parent 2514389
commit 25c7f81
Showing 1 changed file with 31 additions and 54 deletions.
diff --git a/events/2020_06_04_catboost_tutorial/catboost_features.ipynb b/events/2020_06_04_catboost_tutorial/catboost_features.ipynb
@@ -191,27 +191,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Dataset Quantization"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Features quantization. It is effective to quantize features single time before several trainings."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pool1.quantize(\n",
-    "    border_count=254,\n",
-    "    feature_border_type='GreedyLogSum',\n",
-    "    per_float_feature_quantization=['0:border_count=1024']\n",
-    ")"
+    "## Split your data into train and validation"
    ]
   },
   {
@@ -220,31 +200,27 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from catboost.utils import quantize\n",
+    "from sklearn.model_selection import train_test_split\n",
     "\n",
-    "pool2 = quantize(\n",
-    "    data_path=os.path.join(dataset_dir, 'train.csv'),\n",
-    "    delimiter=',',\n",
-    "    column_description=os.path.join(dataset_dir, 'train.cd'),\n",
-    "    has_header=True,\n",
-    ")"
+    "data = train_test_split(X, y, train_size=0.8, random_state=0)\n",
+    "X_train, X_validation, y_train, y_validation = data\n",
+    "\n",
+    "train_pool = Pool(data=X_train, label=y_train)\n",
+    "validation_pool = Pool(data=X_validation, label=y_validation)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "You can dump borders and use it for another dataset."
+    "## Dataset Quantization"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "pool2.save_quantization_borders('borders.tsv')\n",
-    "!head -20 borders.tsv"
+    "Feature quantization. It is efficient to quantize the features a single time and then reuse them across several training runs."
    ]
   },
   {
@@ -253,15 +229,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "pool1 = Pool(data=X, label=y)\n",
-    "pool1.quantize(input_borders='borders.tsv')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Split your data into train and validation"
+    "train_pool.quantize(\n",
+    "    border_count=254,\n",
+    "    # per_float_feature_quantization=['0:border_count=1024']\n",
+    ")\n",
+    "\n",
+    "train_pool.save_quantization_borders('borders.tsv')\n",
+    "\n",
+    "validation_pool.quantize(input_borders='borders.tsv')"
    ]
   },
   {
@@ -270,13 +245,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from sklearn.model_selection import train_test_split\n",
-    "\n",
-    "data = train_test_split(X, y, train_size=0.8, random_state=0)\n",
-    "X_train, X_validation, y_train, y_validation = data\n",
+    "from catboost.utils import quantize\n",
     "\n",
-    "train_pool = Pool(data=X_train, label=y_train)\n",
-    "validation_pool = Pool(data=X_validation, label=y_validation)"
+    "pool2 = quantize(\n",
+    "    data_path=os.path.join(dataset_dir, 'train.csv'),\n",
+    "    delimiter=',',\n",
+    "    column_description=os.path.join(dataset_dir, 'train.cd'),\n",
+    "    has_header=True,\n",
+    ")"
    ]
   },
   {
@@ -402,9 +378,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "pool = Pool(data=X_train, label=y_train)\n",
     "model = CatBoostRegressor(iterations=10, eval_metric='MAE')\n",
-    "grid = {'learning_rate': [0.001, 0.01, 0.1, 1.0, 10.0], 'depth': [4, 5, 6]}\n",
-    "result = model.grid_search(grid, train_pool)"
+    "grid = {'learning_rate': [0.001, 0.01, 0.1], 'depth': [4, 5, 6]}\n",
+    "result = model.grid_search(grid, pool)"
    ]
   },
   {
@@ -446,7 +423,7 @@
    "outputs": [],
    "source": [
     "model = CatBoostRegressor(iterations=100, eval_metric='MAE')\n",
-    "model.grid_search(grid, train_pool, plot=True, verbose=False);"
+    "model.grid_search(grid, pool, plot=True, verbose=False);"
    ]
   },
   {
@@ -628,7 +605,7 @@
    "source": [
     "x = [0, 7, 2]\n",
     "\n",
-    "raw_pred = model.predict([x], prediction_type='RawFormulaVal')\n",
+    "raw_pred = model.predict([x])\n",
     "print(raw_pred)"
    ]
   },
@@ -658,7 +635,7 @@
    },
    "outputs": [],
    "source": [
-    "model = CatBoostRegressor(iterations=2, depth=4, grow_policy='Lossguide')\n",
+    "model = CatBoostRegressor(iterations=2, max_leaves=16, grow_policy='Lossguide')\n",
     "model.fit(features, labels, verbose=False);\n",
     "display(model.plot_tree(0))"
    ]