FIX typos in imbalance learn
tomMoral committed Nov 13, 2020
1 parent d66713e commit cfeefc0
Showing 1 changed file with 38 additions and 21 deletions.
59 changes: 38 additions & 21 deletions 05_imbalance_learning/01_imbalanced_learning.ipynb
@@ -165,6 +165,7 @@
"outputs": [],
"source": [
"from sklearn.dummy import DummyClassifier\n",
"\n",
"dummy_clf = DummyClassifier(strategy='most_frequent')\n",
"dummy_clf.fit(data, target)"
]
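As a quick sanity check (an added sketch, assuming the `data` and `target` variables defined earlier in the notebook): the accuracy of a `most_frequent` dummy is exactly the relative frequency of the majority class.

```python
# The dummy's accuracy equals the majority-class frequency (sketch; assumes
# `data` and `target` from the cells above, with `target` a pandas Series).
majority_freq = target.value_counts(normalize=True).max()
assert abs(dummy_clf.score(data, target) - majority_freq) < 1e-12
```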
@@ -308,8 +309,10 @@
" index=dummy_clf.classes_\n",
")\n",
"sns.set(font_scale=1.4) # for label size\n",
"sns.heatmap(cm_df, annot=True, annot_kws={\"size\": 16},\n",
" cmap='Oranges',)\n",
"sns.heatmap(\n",
" cm_df, annot=True, annot_kws={\"size\": 16},\n",
" cmap='Oranges'\n",
")\n",
"\n",
"plt.xlim(0, 2)\n",
"plt.ylim(0, 2)"
@@ -340,9 +343,11 @@
"from sklearn.metrics import recall_score\n",
"\n",
"precision = precision_score(\n",
" target, y_pred, pos_label=' >50K')\n",
" target, y_pred, pos_label=' >50K'\n",
")\n",
"recall = recall_score(\n",
" target, y_pred, pos_label=' >50K')\n",
" target, y_pred, pos_label=' >50K'\n",
")\n",
"\n",
"print(f\"The recall of the dummy model is \"\n",
" f\"{recall}\")\n",
@@ -410,7 +415,8 @@
" display(HTML(\n",
" f\"<h5>{name}</h5>\"\n",
" f\"Test Accuracy: {score:7.2%} <br/>\"\n",
" f\"Balanced accuracy: {balanced_score:7.2%} <br/>\"))"
" f\"Balanced accuracy: {balanced_score:7.2%} <br/>\"\n",
" ))"
]
},
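For reference (an added note; the notebook presumably computes `balanced_score` with `sklearn.metrics.balanced_accuracy_score`): balanced accuracy is the unweighted mean of per-class recalls, so a majority-class dummy scores 0.5 on a binary problem.

```python
# Balanced accuracy == mean of per-class recalls (illustration on toy labels).
import numpy as np
from sklearn.metrics import balanced_accuracy_score, recall_score

y_true = np.array([0, 0, 0, 0, 1])
y_pred = np.array([0, 0, 0, 0, 0])          # always predict the majority class
per_class_recall = recall_score(y_true, y_pred, average=None)  # [1.0, 0.0]
assert balanced_accuracy_score(y_true, y_pred) == per_class_recall.mean()  # 0.5
```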
{
@@ -438,11 +444,11 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.2 Make use of the `class_weight` parameter\n",
"### 2.2 Make use of the `class_weight` parameter to modify the training cost\n",
"\n",
"A first class of methods rely on sample weights to correct the imbalance. The core idea here is to weight prediction mistakes on the minority class higher than mistakes on the most common class.\n",
"\n",
"#### In linear model\n",
"#### In linear model - modifying the cost function\n",
"\n",
"In `scikit-learn`, some estimators have a `class_weight` parameter that permits to do this. The idea is that the ERM is changed such that\n",
"$$\n",
@@ -459,11 +465,14 @@
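To make the weighting concrete, here is a small illustration (not part of the notebook) of the `'balanced'` heuristic that scikit-learn uses, and of passing it to a linear model:

```python
# 'balanced' weights each class inversely to its frequency:
#   w_c = n_samples / (n_classes * n_samples_in_class_c)
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight

y = np.array([0] * 90 + [1] * 10)   # a 90/10 imbalanced toy target
weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
print(weights)                      # [0.556 5.   ]: minority mistakes cost ~9x more

clf = LogisticRegression(class_weight='balanced')  # same reweighting inside the loss
```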
"outputs": [],
"source": [
"binary_encoding_columns = ['sex']\n",
"one_hot_encoding_columns = ['workclass', 'education', 'marital-status',\n",
" 'occupation', 'relationship',\n",
" 'race', 'native-country']\n",
"scaling_columns = ['age', 'education-num', 'hours-per-week',\n",
" 'capital-gain', 'capital-loss']"
"one_hot_encoding_columns = [\n",
" 'workclass', 'education', 'marital-status',\n",
" 'occupation', 'relationship', 'race', 'native-country'\n",
"]\n",
"scaling_columns = [\n",
" 'age', 'education-num', 'hours-per-week',\n",
" 'capital-gain', 'capital-loss'\n",
"]"
]
},
{
@@ -525,7 +534,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"#### In tree-based model\n",
"#### In tree-based model - modifying the purity criterion\n",
"\n",
"In tree based models, the `class_weight` option is used to chose on the splits. Indeed, the purity criterion (which is minimize for to chose the split) is computed using these weights. In the leaf, the weights are used to compute the class to output."
]
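To illustrate (a sketch, not from the notebook), the weighted Gini impurity that drives the split choice can be written explicitly: each sample contributes its class weight instead of counting as 1.

```python
import numpy as np

def weighted_gini(y, class_weight):
    """Gini impurity where each sample counts with its class weight."""
    classes, counts = np.unique(y, return_counts=True)
    w = np.array([class_weight[c] * n for c, n in zip(classes, counts)])
    p = w / w.sum()                  # weighted class proportions in the node
    return 1.0 - np.sum(p ** 2)

y_node = np.array([0] * 95 + [1] * 5)
print(weighted_gini(y_node, {0: 1.0, 1: 1.0}))   # ~0.095: node looks almost pure
print(weighted_gini(y_node, {0: 1.0, 1: 19.0}))  # 0.5: the minority now matters
```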
@@ -553,7 +562,7 @@
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"preprocessor_rf = ColumnTransformer([\n",
" ('binary-encoder', OrdinalEncoder(), ordinal_encoding_columns),\n",
" ('ordinal-encoder', OrdinalEncoder(), ordinal_encoding_columns),\n",
" ('standard-scaler', FunctionTransformer(validate=False), scaling_columns)\n",
"])\n",
"model_rf = make_pipeline(\n",
@@ -580,7 +589,8 @@
"source": [
"model_rf_balanced = clone(model_rf)\n",
"model_rf_balanced.set_params(\n",
" randomforestclassifier__class_weight='balanced')\n",
" randomforestclassifier__class_weight='balanced'\n",
")\n",
"model_rf_balanced.name = \"Balanced Random Forest\"\n",
"\n",
"evaluate_classifier(model_rf_balanced)"
@@ -601,7 +611,8 @@
"source": [
"model_rf_subbalanced = clone(model_rf)\n",
"model_rf_subbalanced.set_params(\n",
" randomforestclassifier__class_weight='balanced_subsample')\n",
" randomforestclassifier__class_weight='balanced_subsample'\n",
")\n",
"model_rf_subbalanced.name = \"Balanced Subsample Random Forest\"\n",
"\n",
"evaluate_classifier(model_rf_subbalanced)"
@@ -768,7 +779,9 @@
"\n",
"model_balanced_rf = make_pipeline(\n",
" preprocessor_rf,\n",
" BalancedRandomForestClassifier(n_estimators=100, random_state=42)\n",
" BalancedRandomForestClassifier(\n",
" n_estimators=100, random_state=42\n",
" )\n",
")\n",
"model_balanced_rf.name = \"Balanced Random Forest\""
]
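Behind the scenes (an added note): `BalancedRandomForestClassifier` draws a bootstrap sample for each tree and then randomly under-samples it, so every tree is fit on a balanced subset. The resampling step is equivalent in spirit to:

```python
# Random under-sampling discards majority rows until classes are balanced
# (illustration of what each tree sees, on toy arrays).
import numpy as np
from imblearn.under_sampling import RandomUnderSampler

X = np.arange(100).reshape(-1, 1)
y = np.array([0] * 90 + [1] * 10)
X_res, y_res = RandomUnderSampler(random_state=42).fit_resample(X, y)
print(np.bincount(y_res))  # [10 10]: both classes equally represented
```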
@@ -810,8 +823,10 @@
"source": [
"model_bagging = make_pipeline(\n",
" preprocessor_rf,\n",
" BaggingClassifier(base_estimator=HistGradientBoostingClassifier(),\n",
" n_estimators=10, random_state=42)\n",
" BaggingClassifier(\n",
" base_estimator=HistGradientBoostingClassifier(),\n",
" n_estimators=10, random_state=42\n",
" )\n",
")\n",
"model_bagging.name = \"Bagging Model\""
]
@@ -841,8 +856,10 @@
"from imblearn.ensemble import BalancedBaggingClassifier\n",
"model_balanced_bagging = make_pipeline(\n",
" preprocessor_rf,\n",
" BalancedBaggingClassifier(base_estimator=HistGradientBoostingClassifier(),\n",
" n_estimators=10, random_state=42)\n",
" BalancedBaggingClassifier(\n",
" base_estimator=HistGradientBoostingClassifier(),\n",
" n_estimators=10, random_state=42\n",
" )\n",
")\n",
"model_balanced_bagging.name = \"Balanced Bagging Model\""
]