Section 6.3 Connecting the Pipeline
ChristopherGS committed Mar 29, 2020 · 1 parent 75b48f5 · commit 53bc67c
Showing 9 changed files with 348 additions and 68 deletions.
Empty file.
92 changes: 92 additions & 0 deletions packages/regression_model/regression_model/config/config.py
@@ -0,0 +1,92 @@
import pathlib

import regression_model


PACKAGE_ROOT = pathlib.Path(regression_model.__file__).resolve().parent
TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models"
DATASET_DIR = PACKAGE_ROOT / "datasets"

# data
TESTING_DATA_FILE = "test.csv"
TRAINING_DATA_FILE = "train.csv"
TARGET = "SalePrice"


# variables
FEATURES = [
"MSSubClass",
"MSZoning",
"Neighborhood",
"OverallQual",
"OverallCond",
"YearRemodAdd",
"RoofStyle",
"MasVnrType",
"BsmtQual",
"BsmtExposure",
"HeatingQC",
"CentralAir",
"1stFlrSF",
"GrLivArea",
"BsmtFullBath",
"KitchenQual",
"Fireplaces",
"FireplaceQu",
"GarageType",
"GarageFinish",
"GarageCars",
"PavedDrive",
"LotFrontage",
# this one is only to calculate temporal variable:
"YrSold",
]

# this variable is to calculate the temporal variable,
# can be dropped afterwards
DROP_FEATURES = "YrSold"

# numerical variables with NA in train set
NUMERICAL_VARS_WITH_NA = ["LotFrontage"]

# categorical variables with NA in train set
CATEGORICAL_VARS_WITH_NA = [
"MasVnrType",
"BsmtQual",
"BsmtExposure",
"FireplaceQu",
"GarageType",
"GarageFinish",
]

TEMPORAL_VARS = "YearRemodAdd"

# variables to log transform
NUMERICALS_LOG_VARS = ["LotFrontage", "1stFlrSF", "GrLivArea"]

# categorical variables to encode
CATEGORICAL_VARS = [
"MSZoning",
"Neighborhood",
"RoofStyle",
"MasVnrType",
"BsmtQual",
"BsmtExposure",
"HeatingQC",
"CentralAir",
"KitchenQual",
"FireplaceQu",
"GarageType",
"GarageFinish",
"PavedDrive",
]

NUMERICAL_NA_NOT_ALLOWED = [
feature
for feature in FEATURES
if feature not in CATEGORICAL_VARS + NUMERICAL_VARS_WITH_NA
]

CATEGORICAL_NA_NOT_ALLOWED = [
feature for feature in CATEGORICAL_VARS if feature not in CATEGORICAL_VARS_WITH_NA
]
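
The config module centralises every path and variable list in one place. As a rough sketch of how downstream code might consume it (the load_dataset helper below is an assumption for illustration, not part of this commit), the packaged CSVs resolve through DATASET_DIR and the frame is sliced with FEATURES and TARGET:

# Sketch only: a hypothetical helper showing how the config constants
# could be consumed. Not part of this commit.
import pandas as pd

from regression_model.config import config


def load_dataset(*, file_name: str) -> pd.DataFrame:
    # DATASET_DIR is a pathlib.Path, so the / operator builds the full path
    return pd.read_csv(config.DATASET_DIR / file_name)


data = load_dataset(file_name=config.TRAINING_DATA_FILE)
X = data[config.FEATURES]
y = data[config.TARGET]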
55 changes: 35 additions & 20 deletions packages/regression_model/regression_model/pipeline.py
@@ -1,26 +1,41 @@
 from sklearn.linear_model import Lasso
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import MinMaxScaler
 
-import preprocessors as pp
+from regression_model.processing import preprocessors as pp
+from regression_model.config import config
 
 
-CATEGORICAL_VARS = ['MSZoning',
-                    'Neighborhood',
-                    'RoofStyle',
-                    'MasVnrType',
-                    'BsmtQual',
-                    'BsmtExposure',
-                    'HeatingQC',
-                    'CentralAir',
-                    'KitchenQual',
-                    'FireplaceQu',
-                    'GarageType',
-                    'GarageFinish',
-                    'PavedDrive']
-
-PIPELINE_NAME = 'lasso_regression'
-
 price_pipe = Pipeline(
     [
-        ('categorical_imputer',
-         pp.CategoricalImputer(variables=CATEGORICAL_VARS)),
-    ])
+        (
+            "categorical_imputer",
+            pp.CategoricalImputer(variables=config.CATEGORICAL_VARS_WITH_NA),
+        ),
+        (
+            "numerical_imputer",
+            pp.NumericalImputer(variables=config.NUMERICAL_VARS_WITH_NA),
+        ),
+        (
+            "temporal_variable",
+            pp.TemporalVariableEstimator(
+                variables=config.TEMPORAL_VARS, reference_variable=config.DROP_FEATURES
+            ),
+        ),
+        (
+            "rare_label_encoder",
+            pp.RareLabelCategoricalEncoder(tol=0.01, variables=config.CATEGORICAL_VARS),
+        ),
+        (
+            "categorical_encoder",
+            pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS),
+        ),
+        ("log_transformer", pp.LogTransformer(variables=config.NUMERICALS_LOG_VARS)),
+        (
+            "drop_features",
+            pp.DropUnecessaryFeatures(variables_to_drop=config.DROP_FEATURES),
+        ),
+        ("scaler", MinMaxScaler()),
+        ("Linear_model", Lasso(alpha=0.005, random_state=0)),
+    ]
+)
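
With every step wired to the config module, the pipeline can be fitted end to end. A minimal training sketch, assuming the target is modelled in log space (consistent with the log-transform conventions in this commit); the split parameters and names here are illustrative, not part of this commit:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from regression_model import pipeline
from regression_model.config import config

data = pd.read_csv(config.DATASET_DIR / config.TRAINING_DATA_FILE)
X_train, X_test, y_train, y_test = train_test_split(
    data[config.FEATURES], data[config.TARGET], test_size=0.1, random_state=0
)

# train on the log of the sale price; invert with np.exp at prediction time
pipeline.price_pipe.fit(X_train, np.log(y_train))
predictions = np.exp(pipeline.price_pipe.predict(X_test))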
26 changes: 0 additions & 26 deletions packages/regression_model/regression_model/preprocessors.py

This file was deleted.

Empty file.
192 changes: 192 additions & 0 deletions packages/regression_model/regression_model/processing/preprocessors.py
@@ -0,0 +1,192 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class CategoricalImputer(BaseEstimator, TransformerMixin):
"""Categorical data missing value imputer."""

def __init__(self, variables=None) -> None:
if not isinstance(variables, list):
self.variables = [variables]
else:
self.variables = variables

def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "CategoricalImputer":
"""Fit statement to accomodate the sklearn pipeline."""

return self

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
"""Apply the transforms to the dataframe."""

X = X.copy()
for feature in self.variables:
X[feature] = X[feature].fillna("Missing")

return X


class NumericalImputer(BaseEstimator, TransformerMixin):
"""Numerical missing value imputer."""

def __init__(self, variables=None):
if not isinstance(variables, list):
self.variables = [variables]
else:
self.variables = variables

def fit(self, X, y=None):
# persist mode in a dictionary
self.imputer_dict_ = {}
for feature in self.variables:
self.imputer_dict_[feature] = X[feature].mode()[0]
return self

    def transform(self, X):
        X = X.copy()
        for feature in self.variables:
            # assign rather than fillna(inplace=True), which warns on modern pandas
            X[feature] = X[feature].fillna(self.imputer_dict_[feature])
        return X


class TemporalVariableEstimator(BaseEstimator, TransformerMixin):
"""Temporal variable calculator."""

def __init__(self, variables=None, reference_variable=None):
if not isinstance(variables, list):
self.variables = [variables]
else:
self.variables = variables

self.reference_variables = reference_variable

def fit(self, X, y=None):
# we need this step to fit the sklearn pipeline
return self

def transform(self, X):
X = X.copy()
for feature in self.variables:
X[feature] = X[self.reference_variables] - X[feature]

return X


class RareLabelCategoricalEncoder(BaseEstimator, TransformerMixin):
"""Rare label categorical encoder"""

def __init__(self, tol=0.05, variables=None):
self.tol = tol
if not isinstance(variables, list):
self.variables = [variables]
else:
self.variables = variables

def fit(self, X, y=None):
# persist frequent labels in dictionary
self.encoder_dict_ = {}

for var in self.variables:
# the encoder will learn the most frequent categories
            # np.float was removed from NumPy; plain division is sufficient here
            t = pd.Series(X[var].value_counts() / len(X))
# frequent labels:
self.encoder_dict_[var] = list(t[t >= self.tol].index)

return self

def transform(self, X):
X = X.copy()
for feature in self.variables:
X[feature] = np.where(
X[feature].isin(self.encoder_dict_[feature]), X[feature], "Rare"
)

return X


class CategoricalEncoder(BaseEstimator, TransformerMixin):
"""String to numbers categorical encoder."""

def __init__(self, variables=None):
if not isinstance(variables, list):
self.variables = [variables]
else:
self.variables = variables

def fit(self, X, y):
temp = pd.concat([X, y], axis=1)
temp.columns = list(X.columns) + ["target"]

# persist transforming dictionary
self.encoder_dict_ = {}

for var in self.variables:
t = temp.groupby([var])["target"].mean().sort_values(ascending=True).index
self.encoder_dict_[var] = {k: i for i, k in enumerate(t, 0)}

return self

def transform(self, X):
# encode labels
X = X.copy()
for feature in self.variables:
X[feature] = X[feature].map(self.encoder_dict_[feature])

        # check if the transformer has introduced NaN
        if X[self.variables].isnull().any().any():
            null_counts = X[self.variables].isnull().any()
            vars_ = list(null_counts[null_counts].index)
            raise ValueError(
                f"Categorical encoder has introduced NaN when "
                f"transforming categorical variables: {vars_}"
            )

return X


class LogTransformer(BaseEstimator, TransformerMixin):
"""Logarithm transformer."""

def __init__(self, variables=None):
if not isinstance(variables, list):
self.variables = [variables]
else:
self.variables = variables

def fit(self, X, y=None):
        # to accommodate the sklearn pipeline
return self

def transform(self, X):
X = X.copy()

        # check that the values are strictly positive for the log transform
        if not (X[self.variables] > 0).all().all():
            vars_ = list(X[self.variables].columns[(X[self.variables] <= 0).any()])
            raise ValueError(
                f"Variables contain zero or negative values, "
                f"can't apply log for vars: {vars_}"
            )

for feature in self.variables:
X[feature] = np.log(X[feature])

return X


class DropUnecessaryFeatures(BaseEstimator, TransformerMixin):
    """Drop variables that were only needed to derive other features."""

    def __init__(self, variables_to_drop=None):
        self.variables = variables_to_drop

def fit(self, X, y=None):
return self

    def transform(self, X):
        # drop the helper variables that are no longer needed
X = X.copy()
X = X.drop(self.variables, axis=1)

return X
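
Each transformer follows the sklearn fit/transform contract, so it can be exercised in isolation. A toy sketch (made-up data, not part of this commit) showing the rare-label encoder learning its frequent categories and grouping everything else under "Rare":

import pandas as pd

from regression_model.processing.preprocessors import RareLabelCategoricalEncoder

# 90% 'CollgCr', 10% 'Blueste' -- with tol=0.2 only 'CollgCr' is frequent
df = pd.DataFrame({"Neighborhood": ["CollgCr"] * 9 + ["Blueste"]})

encoder = RareLabelCategoricalEncoder(tol=0.2, variables=["Neighborhood"])
encoder.fit(df)
print(encoder.encoder_dict_)  # {'Neighborhood': ['CollgCr']}
print(encoder.transform(df)["Neighborhood"].unique())  # ['CollgCr' 'Rare']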