hi.py

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import warnings
import xgboost as xgb
import lightgbm as lgb
from scipy.stats import skew
from scipy import stats
from scipy.stats.stats import pearsonr
from scipy.stats import norm
from collections import Counter
from sklearn.linear_model import LinearRegression,LassoCV, Ridge, LassoLarsCV,ElasticNetCV
from sklearn.model_selection import GridSearchCV, cross_val_score, learning_curve
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, Normalizer, RobustScaler
sns.set(style='white', context='notebook', palette='deep')

class hi():
    def temp(train):
        cat = len(train.select_dtypes(include=['object']).columns)
        num = len(train.select_dtypes(include=['int64', 'float64']).columns)
        return cat, num

    train = pd.read_csv("../resources/train.csv")
    test = pd.read_csv("../resources/test.csv")

    print("The train data size before dropping Id feature is : {} ".format(train.shape))
    print("The test data size before dropping Id feature is : {} ".format(test.shape))
    train_ID = train['Id']
    test_ID = test['Id']
    train.drop("Id", axis=1, inplace=True)
    test.drop("Id", axis=1, inplace=True)

    print("\nThe train data size after dropping Id feature is : {} ".format(train.shape))
    print("The test data size after dropping Id feature is : {} ".format(test.shape))
    train.head()
    test.head()
    train['SalePrice'].describe()
    sns.distplot(train['SalePrice'], fit=norm);
    (mu, sigma) = norm.fit(train['SalePrice'])
    print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
    plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
               loc='best')
    plt.ylabel('Frequency')
    plt.title('SalePrice distribution')

    fig = plt.figure()
    res = stats.probplot(train['SalePrice'], plot=plt)
    plt.show()

    print("Skewness: %f" % train['SalePrice'].skew())
    print("Kurtosis: %f" % train['SalePrice'].kurt())

    train.select_dtypes(include=['object']).columns
    train.select_dtypes(include=['int64', 'float64']).columns

    # ...
    cat, num = temp(train)

    print('Total Features: ', cat, 'categorical', '+',
          num, 'numerical', '=', cat + num, 'features')

    corrmat = train.corr()
    f, ax = plt.subplots(figsize=(12, 9))
    sns.heatmap(corrmat, vmax=1.0, square=True);

    cols = corrmat.nlargest(10, 'SalePrice')['SalePrice'].index
    cm = np.corrcoef(train[cols].values.T)
    print(cm)
    sns.set(font_scale=1.25)
    hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values,
                     xticklabels=cols.values)
    plt.show()

    most_corr = pd.DataFrame(cols)
    most_corr.columns = ['Most Correlated Features']
    most_corr

    var = 'OverallQual'
    data = pd.concat([train['SalePrice'], train[var]], axis=1)
    f, ax = plt.subplots(figsize=(8, 6))
    fig = sns.boxplot(x=var, y="SalePrice", data=data)
    fig.axis(ymin=0, ymax=800000);

    sns.jointplot(x=train['GrLivArea'], y=train['SalePrice'], kind='reg')

    train = train.drop(train[(train['GrLivArea'] > 4000)
                             & (train['SalePrice'] < 300000)].index).reset_index(drop=True)

    sns.jointplot(x=train['GrLivArea'], y=train['SalePrice'], kind='reg')

    sns.boxplot(x=train['GarageCars'], y=train['SalePrice'])

    train = train.drop(train[(train['GarageCars'] > 3)
                             & (train['SalePrice'] < 300000)].index).reset_index(drop=True)

    sns.boxplot(x=train['GarageCars'], y=train['SalePrice'])

    sns.jointplot(x=train['GarageArea'], y=train['SalePrice'], kind='reg')

    train = train.drop(train[(train['GarageArea'] > 1000)
                             & (train['SalePrice'] < 300000)].index).reset_index(drop=True)

    sns.jointplot(x=train['GarageArea'], y=train['SalePrice'], kind='reg')

    sns.jointplot(x=train['TotalBsmtSF'], y=train['SalePrice'], kind='reg')

    sns.jointplot(x=train['1stFlrSF'], y=train['SalePrice'], kind='reg')

    sns.boxplot(x=train['TotRmsAbvGrd'], y=train['SalePrice'])

    var = 'YearBuilt'
    data = pd.concat([train['SalePrice'], train[var]], axis=1)
    f, ax = plt.subplots(figsize=(16, 8))
    fig = sns.boxplot(x=var, y="SalePrice", data=data)
    fig.axis(ymin=0, ymax=800000);
    plt.xticks(rotation=90);

    ntrain = train.shape[0]
    ntest = test.shape[0]
    y_train = train.SalePrice.values
    all_data = pd.concat((train, test)).reset_index(drop=True)
    all_data.drop(['SalePrice'], axis=1, inplace=True)
    print("Train data size is : {}".format(train.shape))
    print("Test data size is : {}".format(test.shape))
    print("Combined dataset size is : {}".format(all_data.shape))

    all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
    all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
    missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
    missing_data
    f, ax = plt.subplots(figsize=(15, 12))
    plt.xticks(rotation='90')
    sns.barplot(x=all_data_na.index, y=all_data_na)
    plt.xlabel('Features', fontsize=15)
    plt.ylabel('Percent of missing values', fontsize=15)
    plt.title('Percent missing data by feature', fontsize=15)

    all_data["PoolQC"] = all_data["PoolQC"].fillna("None")
    all_data["MiscFeature"] = all_data["MiscFeature"].fillna("None")
    all_data["Alley"] = all_data["Alley"].fillna("None")
    all_data["Fence"] = all_data["Fence"].fillna("None")
    all_data["FireplaceQu"] = all_data["FireplaceQu"].fillna("None")
    all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
    for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
        all_data[col] = all_data[col].fillna('None')
    for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
        all_data[col] = all_data[col].fillna(0)
    for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
        all_data[col] = all_data[col].fillna(0)
    for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
        all_data[col] = all_data[col].fillna('None')
    all_data["MasVnrType"] = all_data["MasVnrType"].fillna("None")
    all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)
    all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
    all_data = all_data.drop(['Utilities'], axis=1)
    all_data["Functional"] = all_data["Functional"].fillna("Typ")
    all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
    all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])
    all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])
    all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])
    all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])
    all_data['MSSubClass'] = all_data['MSSubClass'].fillna("None")

    all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
    all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
    missing_data.head()

    all_data['MSSubClass'].describe()

    all_data['MSSubClass'] = all_data['MSSubClass'].apply(str)

    # Changing OverallCond into a categorical variable
    all_data['OverallCond'] = all_data['OverallCond'].astype(str)

    # Year and month sold are transformed into categorical features.
    all_data['YrSold'] = all_data['YrSold'].astype(str)
    all_data['MoSold'] = all_data['MoSold'].astype(str)

    all_data['KitchenQual'].unique()

    from sklearn.preprocessing import LabelEncoder
    cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
            'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
            'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
            'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
            'YrSold', 'MoSold')
    # Process columns and apply LabelEncoder to categorical features
    for c in cols:
        lbl = LabelEncoder()
        lbl.fit(list(all_data[c].values))
        all_data[c] = lbl.transform(list(all_data[c].values))

    # Check shape
    print('Shape all_data: {}'.format(all_data.shape))

    all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

    train["SalePrice"] = np.log1p(train["SalePrice"])

    sns.distplot(train['SalePrice'], fit=norm);

    # Get the fitted parameters used by the function
    (mu, sigma) = norm.fit(train['SalePrice'])
    print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
    plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
               loc='best')
    plt.ylabel('Frequency')
    plt.title('SalePrice distribution')

    fig = plt.figure()
    res = stats.probplot(train['SalePrice'], plot=plt)
    plt.show()

    y_train = train.SalePrice.values

    print("Skewness: %f" % train['SalePrice'].skew())
    print("Kurtosis: %f" % train['SalePrice'].kurt())

    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

    skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
    skewness = pd.DataFrame({'Skewed Features': skewed_feats})
    skewness.head()

    skewness = skewness[abs(skewness) > 0.75]
    print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

    from scipy.special import boxcox1p
    skewed_features = skewness.index
    lam = 0.15
    for feat in skewed_features:
        all_data[feat] = boxcox1p(all_data[feat], lam)
        all_data[feat] += 1

    all_data = pd.get_dummies(all_data)
    print(all_data.shape)

    train = all_data[:ntrain]
    test = all_data[ntrain:]

    from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import RobustScaler
    from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
    from sklearn.model_selection import KFold, cross_val_score, train_test_split
    from sklearn.metrics import mean_squared_error
    import xgboost as xgb
    import lightgbm as lgb

    n_folds = 5

    def rmsle_cv(model):
        kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
        rmse = np.sqrt(-cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv=kf))
        return (rmse)

    lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
    ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
    KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
    GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                       max_depth=4, max_features='sqrt',
                                       min_samples_leaf=15, min_samples_split=10,
                                       loss='huber', random_state=5)
    model_xgb = xgb.XGBRegressor(colsample_bytree=0.2, gamma=0.0,
                                 learning_rate=0.05, max_depth=6,
                                 min_child_weight=1.5, n_estimators=7200,
                                 reg_alpha=0.9, reg_lambda=0.6,
                                 subsample=0.2, seed=42, silent=1,
                                 random_state=7)
    model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=5,
                                  learning_rate=0.05, n_estimators=720,
                                  max_bin=55, bagging_fraction=0.8,
                                  bagging_freq=5, feature_fraction=0.2319,
                                  feature_fraction_seed=9, bagging_seed=9,
                                  min_data_in_leaf=6, min_sum_hessian_in_leaf=11)

    score = rmsle_cv(lasso)
    print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
    score = rmsle_cv(ENet)
    print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
    score = rmsle_cv(KRR)
    print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
    score = rmsle_cv(GBoost)
    print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
    score = rmsle_cv(model_xgb)
    print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
    score = rmsle_cv(model_lgb)
    print("LGBM score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

    class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
        def __init__(self, models):
            self.models = models

        # we define clones of the original models to fit the data in
        def fit(self, X, y):
            self.models_ = [clone(x) for x in self.models]

            # Train cloned base models
            for model in self.models_:
                model.fit(X, y)

            return self

        # Now we do the predictions for cloned models and average them
        def predict(self, X):
            predictions = np.column_stack([
                model.predict(X) for model in self.models_
            ])
            return np.mean(predictions, axis=1)

    averaged_models = AveragingModels(models=(ENet, GBoost, KRR, lasso))
    score = rmsle_cv(averaged_models)
    print("Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

    class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
        def __init__(self, base_models, meta_model, n_folds=5):
            self.base_models = base_models
            self.meta_model = meta_model
            self.n_folds = n_folds

        # We again fit the data on clones of the original models
        def fit(self, X, y):
            self.base_models_ = [list() for x in self.base_models]
            self.meta_model_ = clone(self.meta_model)
            kfold = KFold(n_splits=self.n_folds, shuffle=True)

            # Train cloned base models then create out-of-fold predictions
            # that are needed to train the cloned meta-model
            out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
            for i, clf in enumerate(self.base_models):
                for train_index, holdout_index in kfold.split(X, y):
                    instance = clone(clf)
                    self.base_models_[i].append(instance)
                    instance.fit(X[train_index], y[train_index])
                    y_pred = instance.predict(X[holdout_index])
                    out_of_fold_predictions[holdout_index, i] = y_pred

            # Now train the cloned  meta-model using the out-of-fold predictions
            self.meta_model_.fit(out_of_fold_predictions, y)
            return self

        def predict(self, X):
            meta_features = np.column_stack([
                np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
                for base_models in self.base_models_])
            return self.meta_model_.predict(meta_features)

    stacked_averaged_models = StackingAveragedModels(base_models=(ENet, GBoost, KRR),
                                                     meta_model=lasso)

    score = rmsle_cv(stacked_averaged_models)
    print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

    def rmsle(y, y_pred):
        return np.sqrt(mean_squared_error(y, y_pred))

    stacked_averaged_models.fit(train.values, y_train)
    stacked_train_pred = stacked_averaged_models.predict(train.values)
    stacked_pred = np.expm1(stacked_averaged_models.predict(test.values))
    print(rmsle(y_train, stacked_train_pred))

    model_xgb.fit(train, y_train)
    xgb_train_pred = model_xgb.predict(train)
    xgb_pred = np.expm1(model_xgb.predict(test))
    print(rmsle(y_train, xgb_train_pred))

    model_lgb.fit(train, y_train)
    lgb_train_pred = model_lgb.predict(train)
    lgb_pred = np.expm1(model_lgb.predict(test.values))
    print(rmsle(y_train, lgb_train_pred))

    print('RMSLE score on train data:')
    print(rmsle(y_train, stacked_train_pred * 0.70 +
                xgb_train_pred * 0.10 + lgb_train_pred * 0.20))

    Stacked = 1 / (0.1077)
    XGBoost = 1 / (0.1177)
    LGBM = 1 / (0.1159)
    Sum = Stacked + XGBoost + LGBM
    Stacked = Stacked / Sum
    XGBoost = XGBoost / Sum
    LGBM = LGBM / Sum
    print(Stacked, XGBoost, LGBM)

    print('RMSLE score on train data:')
    print(rmsle(y_train, stacked_train_pred * Stacked +
                xgb_train_pred * XGBoost + lgb_train_pred * LGBM))

    ensemble = stacked_pred * Stacked + xgb_pred * XGBoost + lgb_pred * LGBM

    sub = pd.DataFrame()
    sub['Id'] = test_ID
    sub['SalePrice'] = ensemble
    sub.to_csv('submission.csv', index=False)