missing solutions + pass on 06

yangzhang33 · Jan 9, 2022 · fcd6b58 · fcd6b58
1 parent b99f4a9
commit fcd6b58
Show file tree

Hide file tree

Showing 5 changed files with 100 additions and 9 deletions.
diff --git a/02_pipelines_and_column_transformers/solutions/01-count_encoder.py b/02_pipelines_and_column_transformers/solutions/01-count_encoder.py
@@ -0,0 +1,50 @@
+
+from collections import Counter
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+
+class CountEncoder(BaseEstimator, TransformerMixin):
+    """Encode every value by how often it occurred in the fitted data.
+
+    ``fit`` builds one ``collections.Counter`` per column of ``X``;
+    ``transform`` replaces each entry by the count stored for it in the
+    counter of its column.
+
+    Attributes
+    ----------
+    counters_ : list of collections.Counter
+        One counter per column seen during ``fit``, in column order.
+    """
+
+    def __init__(self):
+        pass
+
+    def fit(self, X, y=None):
+        """Count the occurrences of each distinct value, column by column.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Data whose entries are hashable (required by ``Counter``).
+        y : ignored
+            Accepted only so the signature matches other estimators.
+
+        Returns
+        -------
+        self : CountEncoder
+        """
+        X = np.asarray(X)
+        # Iterating over X.T walks the columns of the 2-D input.
+        self.counters_ = [Counter(column) for column in X.T]
+        return self
+
+    def transform(self, X):
+        """Replace each entry of X by its count recorded during ``fit``.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Must have as many columns as the data passed to ``fit``.
+
+        Returns
+        -------
+        X_t : numpy.ndarray of int64, shape (n_samples, n_features)
+            Count-encoded data. Values that were not present in the
+            corresponding column during ``fit`` are encoded as 0.
+
+        Raises
+        ------
+        ValueError
+            If the number of columns differs from the fitted data.
+        """
+        X = np.asarray(X)
+        if X.shape[1] != len(self.counters_):
+            raise ValueError(
+                "X has %d features, but CountEncoder was fitted with %d."
+                % (X.shape[1], len(self.counters_))
+            )
+        # A dedicated integer array keeps the counts numeric even when X
+        # holds strings (writing into a copy of X would cast them to str).
+        X_t = np.empty(X.shape, dtype=np.int64)
+        for k, counter in enumerate(self.counters_):
+            # Counter returns 0 for a missing key, so categories unseen
+            # during fit no longer cause a shape-mismatch error.
+            X_t[:, k] = [counter[value] for value in X[:, k]]
+        return X_t
+
+# Toy input: four rows, two integer-coded columns.
+X = np.array([[0, 2], [1, 3], [1, 1], [1, 1]])
+ce = CountEncoder()
+# Fit on X and print the count-encoded version of the same array.
+print(ce.fit_transform(X))
+
+# Let's put this now in a Pipeline
+# Categorical branch: fill missing entries with the constant 'missing',
+# then count-encode the result.
+cat_pipeline = Pipeline(steps=[
+    ('imputer', SimpleImputer(strategy="constant", fill_value="missing")),
+    ('count_encoder', CountEncoder()),
+])
+
+# Apply the categorical branch to the columns listed in `cat_col`
+# (defined outside this snippet).
+categorical_preprocessing = ColumnTransformer(transformers=[
+    ('categorical_preproc', cat_pipeline, cat_col),
+])
+
+# Full model: preprocessing followed by a 100-tree random forest.
+model = Pipeline(steps=[
+    ('categorical_preproc', categorical_preprocessing),
+    ('classifier', RandomForestClassifier(n_estimators=100)),
+])
+model.fit(X_train, y_train)
+# Last expression of the cell: the score on the held-out data.
+model.score(X_test, y_test)
diff --git a/02_pipelines_and_column_transformers/solutions/01-pandas_fillna_test.py b/02_pipelines_and_column_transformers/solutions/01-pandas_fillna_test.py
@@ -0,0 +1,3 @@
+
+# Fill the gaps of the test data with statistics computed on the training
+# data only (presumably per-column means of a DataFrame; confirm).
+X_test_num_imputed = X_test_num.fillna(value=X_train_num.mean())
+# Last expression of the cell: score of the already-fitted model.
+model.score(X_test_num_imputed, y_test)
diff --git a/02_pipelines_and_column_transformers/solutions/01b-full_column_transformer.py b/02_pipelines_and_column_transformers/solutions/01b-full_column_transformer.py
@@ -0,0 +1,23 @@
+
+from sklearn.preprocessing import OrdinalEncoder
+
+# Column groups. Note that "pclass" is listed in both groups, so it is
+# passed through the categorical and the numerical branch.
+cat_cols = ["sex", "embarked", "pclass"]
+num_cols = ["pclass", "age", "parch", "fare"]
+
+# Categorical branch: constant imputation, then ordinal encoding.
+cat_pipeline = Pipeline(steps=[
+    ('imputer', SimpleImputer(strategy="constant", fill_value="missing")),
+    ('ordinal_encoder', OrdinalEncoder()),
+])
+
+# Numerical branch: SimpleImputer with its default settings.
+preprocessor = ColumnTransformer(transformers=[
+    ('categorical_preproc', cat_pipeline, cat_cols),
+    ('numerical_preproc', SimpleImputer(), num_cols),
+])
+
+# Full model: preprocessing followed by a random forest.
+model = Pipeline(steps=[
+    ('preprocessor', preprocessor),
+    ('classifier', RandomForestClassifier(max_depth=10, n_estimators=500)),
+])
+model.fit(X_train, y_train)
+# Last expression of the cell: the score on the held-out data.
+model.score(X_test, y_test)
diff --git a/02_pipelines_and_column_transformers/solutions/01c-splitter.py b/02_pipelines_and_column_transformers/solutions/01c-splitter.py
@@ -0,0 +1,22 @@
+
+import numpy as np
+from sklearn.model_selection import BaseCrossValidator
+
+class IndexBasedSplitter(BaseCrossValidator):
+    """Cross-validator producing one fold per distinct label in ``y.index``.
+
+    For each distinct value found in ``y.index.values``, the samples
+    carrying that value form the test set and all other samples form the
+    training set. ``y`` must therefore expose an ``index`` attribute with
+    a ``values`` array (presumably a pandas Series; confirm with callers).
+    """
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def _index_values(y):
+        """Return ``y.index.values``, failing clearly when ``y`` is None."""
+        if y is None:
+            raise ValueError(
+                "The 'y' parameter should not be None: "
+                "folds are derived from y.index."
+            )
+        return y.index.values
+
+    def get_n_splits(self, X=None, y=None, groups=None):
+        """Return the number of folds, i.e. distinct labels in ``y.index``.
+
+        ``X`` and ``groups`` are unused. ``y`` keeps its ``None`` default
+        for signature compatibility but is required.
+
+        Raises
+        ------
+        ValueError
+            If ``y`` is None.
+        """
+        return len(np.unique(self._index_values(y)))
+
+    def split(self, X, y, groups=None):
+        """Yield ``(train_idx, test_idx)`` position arrays, one per label.
+
+        Labels are visited in the sorted order returned by ``np.unique``.
+        ``groups`` is unused.
+
+        Raises
+        ------
+        ValueError
+            If ``y`` is None (raised when iteration starts, since this is
+            a generator).
+        """
+        index_values = self._index_values(y)
+        idx = np.arange(len(X))
+        for k in np.unique(index_values):
+            # Rows whose index label equals k are held out for testing.
+            mask = index_values == k
+            yield idx[~mask], idx[mask]
+
+# Instantiate the index-based splitter defined above.
+cv = IndexBasedSplitter()
+# plot_cv_indices, X_df and y_with_provenance come from outside this
+# snippet; presumably this draws the train/test membership of each split
+# (confirm against the notebook).
+plot_cv_indices(cv, X_df, y_with_provenance)
diff --git a/06_feature_engineering/01-feature_engineering.ipynb b/06_feature_engineering/01-feature_engineering.ipynb
@@ -873,18 +873,11 @@
    "source": [
     "### Now do better !"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -898,7 +891,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.8.6"
   }
  },
  "nbformat": 4,
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@

		X_test_num_imputed = X_test_num.fillna(X_train_num.mean())
		model.score(X_test_num_imputed, y_test)