Skip to content

Commit

Permalink
missing solutions + pass on 06
Browse files Browse the repository at this point in the history
  • Loading branch information
agramfort committed Jan 9, 2022
1 parent b99f4a9 commit fcd6b58
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 9 deletions.
50 changes: 50 additions & 0 deletions 02_pipelines_and_column_transformers/solutions/01-count_encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@

from collections import Counter
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class CountEncoder(BaseEstimator, TransformerMixin):
    """Encode each category by the number of times it was seen in ``fit``.

    Every column is handled independently. Categories that were not
    observed while fitting are encoded as ``0``.

    Attributes
    ----------
    counters_ : list of collections.Counter
        One counter per training column, mapping each observed value to
        its number of occurrences.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Count the occurrences of every value in each column of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data; converted with ``np.asarray``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : CountEncoder
        """
        X = np.asarray(X)
        self.counters_ = [Counter(X[:, k]) for k in range(X.shape[1])]
        return self

    def transform(self, X):
        """Replace every value of ``X`` by its count learned in ``fit``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to encode; must have as many columns as the data seen
            in ``fit``.

        Returns
        -------
        X_t : ndarray of shape (n_samples, n_features), dtype int64
            Per-column counts; ``0`` for values never seen in ``fit``.

        Raises
        ------
        ValueError
            If the number of columns differs from the fitted data.
        """
        X = np.asarray(X)
        if X.shape[1] != len(self.counters_):
            raise ValueError(
                f"X has {X.shape[1]} features, but CountEncoder was "
                f"fitted with {len(self.counters_)} features."
            )
        # A dedicated integer array keeps counts numeric even when X holds
        # fixed-width strings (writing into a copy of X would stringify or
        # truncate them).
        X_t = np.empty(X.shape, dtype=np.int64)
        for k, counter in enumerate(self.counters_):
            # Counter returns 0 for missing keys, so unseen categories map
            # to 0 instead of breaking the row alignment.
            X_t[:, k] = [counter[value] for value in X[:, k]]
        return X_t

# Small integer matrix used to try the encoder: 4 samples, 2 columns
X = np.array([[0, 2], [1, 3], [1, 1], [1, 1]])

# Fit on X and print the encoded version of the same data
ce = CountEncoder()
print(ce.fit_transform(X))

# Let's put this now in a Pipeline:
# constant-value imputation followed by the count encoder
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("count_encoder", CountEncoder()),
])

# Route the columns listed in `cat_col` through that pipeline
categorical_preprocessing = ColumnTransformer(transformers=[
    ("categorical_preproc", cat_pipeline, cat_col),
])

# Full model: preprocessing, then a 100-tree random forest
model = Pipeline(steps=[
    ("categorical_preproc", categorical_preprocessing),
    ("classifier", RandomForestClassifier(n_estimators=100)),
])

# Train on the training split, then evaluate on the test split
model.fit(X_train, y_train)
model.score(X_test, y_test)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

# Fill missing test values with the column means computed on X_train_num,
# then score the already-fitted model on the imputed test data
train_means = X_train_num.mean()
X_test_num_imputed = X_test_num.fillna(value=train_means)
model.score(X_test_num_imputed, y_test)
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@

from sklearn.preprocessing import OrdinalEncoder

# Columns handled as categorical and as numerical
# ('pclass' appears in both lists)
cat_cols = ["sex", "embarked", "pclass"]
num_cols = ["pclass", "age", "parch", "fare"]

# Categorical branch: constant-value imputation, then ordinal encoding
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("ordinal_encoder", OrdinalEncoder()),
])

# Categorical columns go through the pipeline above; numerical columns
# go through a SimpleImputer with default settings
preprocessor = ColumnTransformer(transformers=[
    ("categorical_preproc", cat_pipeline, cat_cols),
    ("numerical_preproc", SimpleImputer(), num_cols),
])

# Full model: preprocessing followed by a random forest
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(max_depth=10, n_estimators=500)),
])

# Train on the training split, then evaluate on the test split
model.fit(X_train, y_train)
model.score(X_test, y_test)
22 changes: 22 additions & 0 deletions 02_pipelines_and_column_transformers/solutions/01c-splitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@

import numpy as np
from sklearn.model_selection import BaseCrossValidator

class IndexBasedSplitter(BaseCrossValidator):
    """Cross-validator that holds out one index label of ``y`` per fold.

    The folds are defined by the distinct values found in
    ``y.index.values``; ``y`` is presumably a pandas Series (it must
    expose ``.index.values``).
    """

    def __init__(self):
        pass

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of distinct labels in ``y.index``.

        ``X`` and ``groups`` are not used.
        """
        index_labels = y.index.values
        return np.unique(index_labels).size

    def split(self, X, y, groups=None):
        """Yield one ``(train_idx, test_idx)`` pair per distinct index label.

        For each label, the test indices are the positions whose index
        equals that label and the train indices are all other positions.
        ``groups`` is not used.
        """
        index_labels = y.index.values
        positions = np.arange(len(X))
        for label in np.unique(index_labels):
            held_out = index_labels == label
            yield positions[~held_out], positions[held_out]

# Build the index-based splitter and hand it to plot_cv_indices together
# with the data and the target.
# NOTE(review): y_with_provenance presumably carries the fold labels in its
# index (the splitter reads y.index.values) — confirm against the notebook.
cv = IndexBasedSplitter()
plot_cv_indices(cv, X_df, y_with_provenance)
11 changes: 2 additions & 9 deletions 06_feature_engineering/01-feature_engineering.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -873,18 +873,11 @@
"source": [
"### Now do better !"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -898,7 +891,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.8.6"
}
},
"nbformat": 4,
Expand Down

0 comments on commit fcd6b58

Please sign in to comment.