Section 6.3 Connecting the Pipeline
ChristopherGS committed Mar 29, 2020 · 1 parent 75b48f5 · commit 53bc67c
Showing 9 changed files with 348 additions and 68 deletions.
Empty file.
92 changes: 92 additions & 0 deletions packages/regression_model/regression_model/config/config.py
@@ -0,0 +1,92 @@
import pathlib

import regression_model


PACKAGE_ROOT = pathlib.Path(regression_model.__file__).resolve().parent
TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models"
DATASET_DIR = PACKAGE_ROOT / "datasets"

# data
TESTING_DATA_FILE = "test.csv"
TRAINING_DATA_FILE = "train.csv"
TARGET = "SalePrice"


# variables
FEATURES = [
"MSSubClass",
"MSZoning",
"Neighborhood",
"OverallQual",
"OverallCond",
"YearRemodAdd",
"RoofStyle",
"MasVnrType",
"BsmtQual",
"BsmtExposure",
"HeatingQC",
"CentralAir",
"1stFlrSF",
"GrLivArea",
"BsmtFullBath",
"KitchenQual",
"Fireplaces",
"FireplaceQu",
"GarageType",
"GarageFinish",
"GarageCars",
"PavedDrive",
"LotFrontage",
# this one is only to calculate temporal variable:
"YrSold",
]

# this variable is to calculate the temporal variable,
# can be dropped afterwards
DROP_FEATURES = "YrSold"

# numerical variables with NA in train set
NUMERICAL_VARS_WITH_NA = ["LotFrontage"]

# categorical variables with NA in train set
CATEGORICAL_VARS_WITH_NA = [
"MasVnrType",
"BsmtQual",
"BsmtExposure",
"FireplaceQu",
"GarageType",
"GarageFinish",
]

TEMPORAL_VARS = "YearRemodAdd"

# variables to log transform
NUMERICALS_LOG_VARS = ["LotFrontage", "1stFlrSF", "GrLivArea"]

# categorical variables to encode
CATEGORICAL_VARS = [
"MSZoning",
"Neighborhood",
"RoofStyle",
"MasVnrType",
"BsmtQual",
"BsmtExposure",
"HeatingQC",
"CentralAir",
"KitchenQual",
"FireplaceQu",
"GarageType",
"GarageFinish",
"PavedDrive",
]

NUMERICAL_NA_NOT_ALLOWED = [
feature
for feature in FEATURES
if feature not in CATEGORICAL_VARS + NUMERICAL_VARS_WITH_NA
]

CATEGORICAL_NA_NOT_ALLOWED = [
feature for feature in CATEGORICAL_VARS if feature not in CATEGORICAL_VARS_WITH_NA
]
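
The config module centralises every path and variable list in one place. As a rough sketch of how downstream code might consume it (the load_dataset helper below is an assumption for illustration, not part of this commit), the packaged CSVs resolve through DATASET_DIR and the frame is sliced with FEATURES and TARGET:

# Sketch only: a hypothetical helper showing how the config constants
# could be consumed. Not part of this commit.
import pandas as pd

from regression_model.config import config


def load_dataset(*, file_name: str) -> pd.DataFrame:
    # DATASET_DIR is a pathlib.Path, so the / operator builds the full path
    return pd.read_csv(config.DATASET_DIR / file_name)


data = load_dataset(file_name=config.TRAINING_DATA_FILE)
X = data[config.FEATURES]
y = data[config.TARGET]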
55 changes: 35 additions & 20 deletions packages/regression_model/regression_model/pipeline.py
@@ -1,26 +1,41 @@
 from sklearn.linear_model import Lasso
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import MinMaxScaler
 
-import preprocessors as pp
+from regression_model.processing import preprocessors as pp
+from regression_model.config import config
 
 
-CATEGORICAL_VARS = ['MSZoning',
-                    'Neighborhood',
-                    'RoofStyle',
-                    'MasVnrType',
-                    'BsmtQual',
-                    'BsmtExposure',
-                    'HeatingQC',
-                    'CentralAir',
-                    'KitchenQual',
-                    'FireplaceQu',
-                    'GarageType',
-                    'GarageFinish',
-                    'PavedDrive']
-
-PIPELINE_NAME = 'lasso_regression'
-
 price_pipe = Pipeline(
     [
-        ('categorical_imputer',
-         pp.CategoricalImputer(variables=CATEGORICAL_VARS)),
-    ])
+        (
+            "categorical_imputer",
+            pp.CategoricalImputer(variables=config.CATEGORICAL_VARS_WITH_NA),
+        ),
+        (
+            "numerical_imputer",
+            pp.NumericalImputer(variables=config.NUMERICAL_VARS_WITH_NA),
+        ),
+        (
+            "temporal_variable",
+            pp.TemporalVariableEstimator(
+                variables=config.TEMPORAL_VARS, reference_variable=config.DROP_FEATURES
+            ),
+        ),
+        (
+            "rare_label_encoder",
+            pp.RareLabelCategoricalEncoder(tol=0.01, variables=config.CATEGORICAL_VARS),
+        ),
+        (
+            "categorical_encoder",
+            pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS),
+        ),
+        ("log_transformer", pp.LogTransformer(variables=config.NUMERICALS_LOG_VARS)),
+        (
+            "drop_features",
+            pp.DropUnecessaryFeatures(variables_to_drop=config.DROP_FEATURES),
+        ),
+        ("scaler", MinMaxScaler()),
+        ("Linear_model", Lasso(alpha=0.005, random_state=0)),
+    ]
+)
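
With every step wired to the config module, the pipeline can be fitted end to end. A minimal training sketch, assuming the target is modelled in log space (consistent with the log-transform conventions in this commit); the split parameters and names here are illustrative, not part of this commit:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from regression_model import pipeline
from regression_model.config import config

data = pd.read_csv(config.DATASET_DIR / config.TRAINING_DATA_FILE)
X_train, X_test, y_train, y_test = train_test_split(
    data[config.FEATURES], data[config.TARGET], test_size=0.1, random_state=0
)

# train on the log of the sale price; invert with np.exp at prediction time
pipeline.price_pipe.fit(X_train, np.log(y_train))
predictions = np.exp(pipeline.price_pipe.predict(X_test))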
26 changes: 0 additions & 26 deletions packages/regression_model/regression_model/preprocessors.py

This file was deleted.

Empty file.
192 changes: 192 additions & 0 deletions packages/regression_model/regression_model/processing/preprocessors.py
@@ -0,0 +1,192 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class CategoricalImputer(BaseEstimator, TransformerMixin):
"""Categorical data missing value imputer."""

def __init__(self, variables=None) -> None:
if not isinstance(variables, list):
self.variables = [variables]
else:
self.variables = variables

def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "CategoricalImputer":
"""Fit statement to accomodate the sklearn pipeline."""

return self

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
"""Apply the transforms to the dataframe."""

X = X.copy()
for feature in self.variables:
X[feature] = X[feature].fillna("Missing")

return X


class NumericalImputer(BaseEstimator, TransformerMixin):
"""Numerical missing value imputer."""

def __init__(self, variables=None):
if not isinstance(variables, list):
self.variables = [variables]
else:
self.variables = variables

def fit(self, X, y=None):
# persist mode in a dictionary
self.imputer_dict_ = {}
for feature in self.variables:
self.imputer_dict_[feature] = X[feature].mode()[0]
return self

    def transform(self, X):
        X = X.copy()
        for feature in self.variables:
            # assign rather than fillna(inplace=True), which warns on modern pandas
            X[feature] = X[feature].fillna(self.imputer_dict_[feature])
        return X


class TemporalVariableEstimator(BaseEstimator, TransformerMixin):
"""Temporal variable calculator."""

def __init__(self, variables=None, reference_variable=None):
if not isinstance(variables, list):
self.variables = [variables]
else:
self.variables = variables

self.reference_variables = reference_variable

def fit(self, X, y=None):
# we need this step to fit the sklearn pipeline
return self

def transform(self, X):
X = X.copy()
for feature in self.variables:
X[feature] = X[self.reference_variables] - X[feature]

return X


class RareLabelCategoricalEncoder(BaseEstimator, TransformerMixin):
"""Rare label categorical encoder"""

def __init__(self, tol=0.05, variables=None):
self.tol = tol
if not isinstance(variables, list):
self.variables = [variables]
else:
self.variables = variables

def fit(self, X, y=None):
# persist frequent labels in dictionary
self.encoder_dict_ = {}

for var in self.variables:
# the encoder will learn the most frequent categories
            # np.float was removed from NumPy; plain division is sufficient here
            t = pd.Series(X[var].value_counts() / len(X))
# frequent labels:
self.encoder_dict_[var] = list(t[t >= self.tol].index)

return self

def transform(self, X):
X = X.copy()
for feature in self.variables:
X[feature] = np.where(
X[feature].isin(self.encoder_dict_[feature]), X[feature], "Rare"
)

return X


class CategoricalEncoder(BaseEstimator, TransformerMixin):
"""String to numbers categorical encoder."""

def __init__(self, variables=None):
if not isinstance(variables, list):
self.variables = [variables]
else:
self.variables = variables

def fit(self, X, y):
temp = pd.concat([X, y], axis=1)
temp.columns = list(X.columns) + ["target"]

# persist transforming dictionary
self.encoder_dict_ = {}

for var in self.variables:
t = temp.groupby([var])["target"].mean().sort_values(ascending=True).index
self.encoder_dict_[var] = {k: i for i, k in enumerate(t, 0)}

return self

def transform(self, X):
# encode labels
X = X.copy()
for feature in self.variables:
X[feature] = X[feature].map(self.encoder_dict_[feature])

        # check if the transformer has introduced NaN
        if X[self.variables].isnull().any().any():
            null_counts = X[self.variables].isnull().any()
            vars_ = list(null_counts[null_counts].index)
            raise ValueError(
                f"Categorical encoder has introduced NaN when "
                f"transforming categorical variables: {vars_}"
            )

return X


class LogTransformer(BaseEstimator, TransformerMixin):
"""Logarithm transformer."""

def __init__(self, variables=None):
if not isinstance(variables, list):
self.variables = [variables]
else:
self.variables = variables

def fit(self, X, y=None):
        # to accommodate the sklearn pipeline
return self

def transform(self, X):
X = X.copy()

        # check that the values are strictly positive for the log transform
        if not (X[self.variables] > 0).all().all():
            vars_ = list(X[self.variables].columns[(X[self.variables] <= 0).any()])
            raise ValueError(
                f"Variables contain zero or negative values, "
                f"can't apply log for vars: {vars_}"
            )

for feature in self.variables:
X[feature] = np.log(X[feature])

return X


class DropUnecessaryFeatures(BaseEstimator, TransformerMixin):
    """Drop variables that were only needed to derive other features."""

    def __init__(self, variables_to_drop=None):
        self.variables = variables_to_drop

def fit(self, X, y=None):
return self

    def transform(self, X):
        # drop the helper variables that are no longer needed
X = X.copy()
X = X.drop(self.variables, axis=1)

return X
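
Each transformer follows the sklearn fit/transform contract, so it can be exercised in isolation. A toy sketch (made-up data, not part of this commit) showing the rare-label encoder learning its frequent categories and grouping everything else under "Rare":

import pandas as pd

from regression_model.processing.preprocessors import RareLabelCategoricalEncoder

# 90% 'CollgCr', 10% 'Blueste' -- with tol=0.2 only 'CollgCr' is frequent
df = pd.DataFrame({"Neighborhood": ["CollgCr"] * 9 + ["Blueste"]})

encoder = RareLabelCategoricalEncoder(tol=0.2, variables=["Neighborhood"])
encoder.fit(df)
print(encoder.encoder_dict_)  # {'Neighborhood': ['CollgCr']}
print(encoder.transform(df)["Neighborhood"].unique())  # ['CollgCr' 'Rare']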