
Commit

Faster XGBoost
AutoViML committed Mar 21, 2021
1 parent 9717a4e commit 5d6de60
Showing 12 changed files with 172 additions and 38 deletions.
build/lib/featurewiz/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -19,7 +19,7 @@
from .featurewiz import FE_create_categorical_feature_crosses, EDA_find_skewed_variables
from .featurewiz import FE_kmeans_resampler, FE_find_and_cap_outliers, EDA_find_outliers
from .featurewiz import split_data_n_ways, FE_concatenate_multiple_columns
from .featurewiz import simple_XGBoost_model
from .featurewiz import simple_XGBoost_model, FE_discretize_numeric_variables
################################################################################
if __name__ == "__main__":
module_type = 'Running'
build/lib/featurewiz/__version__.py (2 changes: 1 addition & 1 deletion)
@@ -5,6 +5,6 @@
__author__ = "Ram Seshadri"
__description__ = "Advanced Feature Engineering and Feature Selection for any data set, any size"
__url__ = "https://github.com/Auto_ViML/featurewiz.git"
__version__ = "0.0.25"
__version__ = "0.0.26"
__license__ = "Apache License 2.0"
__copyright__ = "2020-21 Google"
build/lib/featurewiz/featurewiz.py (89 changes: 68 additions & 21 deletions)
@@ -367,7 +367,7 @@ def add(a,b):
from sklearn.feature_selection import chi2, mutual_info_regression, mutual_info_classif
from sklearn.feature_selection import SelectKBest
##################################################################################
def load_file_dataframe(dataname, sep=",", header=0, verbose=0, nrows='all'):
def load_file_dataframe(dataname, sep=",", header=0, verbose=0, nrows='all',parse_dates=False):
start_time = time.time()
########################### This is where we load file or data frame ###############
if isinstance(dataname,str):
@@ -377,9 +377,11 @@ def load_file_dataframe(dataname, sep=",", header=0, verbose=0, nrows='all'):
for code in codex:
try:
if isinstance(nrows, str):
dfte = pd.read_csv(dataname,sep=sep,index_col=None,encoding=code)
dfte = pd.read_csv(dataname,sep=sep,index_col=None,encoding=code,
parse_dates=parse_dates)
else:
dfte = pd.read_csv(dataname,sep=sep,index_col=None,encoding=code, nrows=nrows)
dfte = pd.read_csv(dataname,sep=sep,index_col=None,encoding=code,
nrows=nrows, parse_dates=parse_dates)
print(' Encoder %s chosen to read CSV file' %code)
print('Shape of your Data Set loaded: %s' %(dfte.shape,))
if len(np.array(list(dfte))[dfte.columns.duplicated()]) > 0:
@@ -392,9 +394,9 @@ def load_file_dataframe(dataname, sep=",", header=0, verbose=0, nrows='all'):
elif dataname.endswith(('xlsx','xls','txt')):
#### It's very important to get header rows in Excel since people put headers anywhere in Excel#
if isinstance(nrows, str):
dfte = pd.read_excel(dataname,header=header)
dfte = pd.read_excel(dataname,header=header, parse_dates=parse_dates)
else:
dfte = pd.read_excel(dataname,header=header, nrows=nrows)
dfte = pd.read_excel(dataname,header=header, nrows=nrows, parse_dates=parse_dates)
print('Shape of your Data Set loaded: %s' %(dfte.shape,))
return dfte
else:
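A minimal usage sketch of the new parse_dates pass-through (the file and column names below are made up, and the import path assumes featurewiz is installed):

```python
# Hedged sketch: the new parse_dates argument is simply forwarded to
# pd.read_csv / pd.read_excel. The file and column names are illustrative.
import pandas as pd
from featurewiz.featurewiz import load_file_dataframe

# Write a tiny CSV so the example is self-contained.
pd.DataFrame({"order_date": ["2021-01-01", "2021-02-01"],
              "sales": [10, 12]}).to_csv("sales.csv", index=False)

df = load_file_dataframe("sales.csv", sep=",", header=0, nrows="all",
                         parse_dates=["order_date"])
print(df.dtypes)  # order_date should now be datetime64[ns]
```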
@@ -1634,18 +1636,20 @@ def transform(self, dft ):
print('Error in groupby function: returning dataframe as is')
return dft
return dft
###################################################################################

def FE_add_groupby_features_aggregated_to_dataframe(train,
agg_types,groupby_column,ignore_variables, test=""):
agg_types,groupby_columns,ignore_variables, test=""):
"""
FE stands for Feature Engineering. That means this function performs feature engineering on data.
######################################################################################
### This function is a very fast function that will iteratively compute aggregates for all numeric columns
### It returns original dataframe with added features using numeric variables grouped and aggregated
### What do you mean aggregate? aggregates can be "count, "mean", "median", "mode", "min", "max", etc.
### What do you aggregate? all numeric columns in your data
### What do you groupby? a groupby column
### except those numeric variables you designate in the ignore_variables list. Can be empty.
### What do you groupby? one groupby column at a time or multiple columns one by one
### -- if you give it a list of columns, it will execute the grouping one by one
### What is the ignore_variables for? it will ignore these variables from grouping.
######################################################################################
### Inputs:
### train: Just sent in the data frame df that you want features added to
@@ -1654,8 +1658,8 @@ def FE_add_groupby_features_aggregated_to_dataframe(train,
### List of aggregates available: {'count','sum','mean','mad','median','min','max','mode','abs',
### 'prod','std','var','sem','skew','kurt',
### 'quantile','cumsum','cumprod','cummax','cummin'}
### groupby_column: can be a string representing a single column or a list of multiple columns
### - it will groupby all the numeric features and compute aggregates by this.
### groupby_columns: can be a string representing a single column or a list of multiple columns
### - it will groupby all the numeric features using one groupby column at a time in a loop.
### ignore_variables: list of variables to ignore among numeric variables in data since they may be ID variables.
### test: (optional) a data frame that you want features added to based on train
### Outputs:
@@ -1664,16 +1668,26 @@ def FE_add_groupby_features_aggregated_to_dataframe(train,
######################################################################################
### Make sure you reduce correlated variables by using FE_remove_variables_using_SULOV_method()
"""
train = copy.deepcopy(train)
test = copy.deepcopy(test)
MGB = My_Groupby_Encoder(groupby_column, agg_types, ignore_variables)
trainm = MGB.fit_transform(train)
train_copy = copy.deepcopy(train)
test_copy = copy.deepcopy(test)
if isinstance(groupby_columns, str):
groupby_columns = [groupby_columns]
for groupby_column in groupby_columns:
MGB = My_Groupby_Encoder(groupby_column, agg_types, ignore_variables)
train1 = MGB.fit_transform(train)
addl_cols = left_subtract(train1.columns,train.columns)
train_copy = pd.concat([train_copy,train1[addl_cols]],axis=1)
if isinstance(test, str) or test is None:
pass
else:
test1 = MGB.transform(test)
addl_cols = left_subtract(test1.columns,test.columns)
test_copy = pd.concat([test_copy,test1[addl_cols]],axis=1)
### return the dataframes ###########
if isinstance(test, str) or test is None:
return trainm
return train_copy
else:
testm = MGB.transform(test)
return trainm, testm

return train_copy, test_copy
#####################################################################################################
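A hedged usage sketch of the reworked helper, which now loops over a list of groupby columns (the dataframe and column names are made up, and the behaviour of My_Groupby_Encoder is assumed from its use in the diff):

```python
# Hedged sketch: aggregate all numeric columns (except ignored ones) once per
# groupby column in the list, and concatenate the new features to the frame.
import pandas as pd
from featurewiz.featurewiz import FE_add_groupby_features_aggregated_to_dataframe

train = pd.DataFrame({
    "store":  ["A", "A", "B", "B"],
    "region": ["N", "S", "N", "S"],
    "sales":  [10.0, 12.0, 7.0, 9.0],
    "row_id": [1, 2, 3, 4],
})

train_fe = FE_add_groupby_features_aggregated_to_dataframe(
    train,
    agg_types=["mean", "max"],
    groupby_columns=["store", "region"],  # grouped one column at a time
    ignore_variables=["row_id"],          # ID-like columns to leave out
)
print(train_fe.columns.tolist())
```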
def FE_combine_rare_categories(train_df, categorical_features, test_df=""):
"""
@@ -2603,7 +2617,7 @@ def FE_concatenate_multiple_columns(df, cols, filler=" ", drop=True):
from sklearn.metrics import mean_squared_log_error, balanced_accuracy_score
from scipy import stats

def simple_XGBoost_model(X_XGB, Y_XGB, X_XGB_test, modeltype):
def simple_XGBoost_model(X_XGB, Y_XGB, X_XGB_test, modeltype,verbose=0):
"""
Easy-to-use XGBoost model. Just send in X_train, y_train, and the data you want predictions for, X_test.
It will automatically split X_train into multiple folds (10) and train and predict each time on X_test.
@@ -2674,8 +2688,9 @@ def simple_XGBoost_model(X_XGB, Y_XGB, X_XGB_test, modeltype):
score = balanced_accuracy_score(y_test, preds)
print('Balanced Accuracy score in fold %d = %0.1f%%' %(folds+1, score*100))
scores.append(score)
plot_importances_XGB(train_set=X_XGB, labels=Y_XGB, ls=ls, y_preds=pred_xgbs,
modeltype=modeltype)
if verbose:
plot_importances_XGB(train_set=X_XGB, labels=Y_XGB, ls=ls, y_preds=pred_xgbs,
modeltype=modeltype)
print('final predictions', pred_xgbs)
print("Average scores are: ", np.sum(scores)/len(scores))
return pred_xgbs
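A sketch of how the helper might be called with the new verbose flag. The data is synthetic, and because the regression path exponentiates its predictions, the target is kept positive here; treat it as illustrative only.

```python
# Hedged sketch: 10-fold XGBoost training with the new verbose switch.
import numpy as np
import pandas as pd
from featurewiz.featurewiz import simple_XGBoost_model

rng = np.random.default_rng(0)
X_train = pd.DataFrame(rng.normal(size=(200, 4)), columns=list("abcd"))
y_train = pd.Series(np.exp(X_train["a"]))          # positive target
X_test  = pd.DataFrame(rng.normal(size=(50, 4)), columns=list("abcd"))

preds = simple_XGBoost_model(X_train, y_train, X_test,
                             modeltype="Regression",
                             verbose=0)  # set to 1 to also plot importances
```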
@@ -2707,3 +2722,35 @@ def plot_importances_XGB(train_set, labels, ls, y_preds, modeltype):
ax2=plt.subplot(2, 2, 2)
pd.Series(y_preds).plot(ax=ax2, color='b');
##################################################################################
from sklearn.preprocessing import KBinsDiscretizer
def FE_discretize_numeric_variables(df, bin_dict, strategy='kmeans',verbose=0):
"""
This handy function discretizes numeric variables into binned variables using kmeans algorithm.
You need to provide the names of the variables and the numbers of bins for each variable in a dictionary.
It will return the same dataframe with new binned variables that it has created.
Inputs:
----------
df : pandas dataframe - please ensure it is a dataframe. No arrays please.
bin_dict: dictionary of names of variables and the bins that you want for each variable.
strategy: default is 'kmeans': but you can choose any one of {‘uniform’, ‘quantile’, ‘kmeans’}
Outputs:
----------
df: pandas dataframe with new variables with names such as: variable+'_discrete'
"""
num_cols = len(bin_dict)
nrows = int((num_cols/2)+0.5)
print('nrows',nrows)
if verbose:
fig = plt.figure(figsize=(10,3*num_cols))
for i, (col, binvalue) in enumerate(bin_dict.items()):
kbd = KBinsDiscretizer(n_bins=binvalue, encode='ordinal', strategy=strategy)
new_col = col+'_discrete'
df[new_col] = kbd.fit_transform(df[[col]]).astype(int)
if verbose:
ax1 = plt.subplot(nrows,2,i+1)
ax1.scatter(df[col],df[new_col])
ax1.set_title(new_col)
return df
##################################################################################
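A short usage sketch for the binning helper (the column names and bin counts are made up):

```python
# Hedged sketch: k-means binning of two numeric columns into ordinal codes.
import numpy as np
import pandas as pd
from featurewiz.featurewiz import FE_discretize_numeric_variables

rng = np.random.default_rng(1)
df = pd.DataFrame({"age": rng.integers(18, 80, size=100),
                   "income": rng.normal(50_000, 15_000, size=100)})

df = FE_discretize_numeric_variables(df, bin_dict={"age": 4, "income": 5})
print(df[["age_discrete", "income_discrete"]].head())
```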
Binary file removed dist/featurewiz-0.0.25-py3-none-any.whl
Binary file removed dist/featurewiz-0.0.25.tar.gz
Binary file added dist/featurewiz-0.0.26-py3-none-any.whl
Binary file added dist/featurewiz-0.0.26.tar.gz
featurewiz.egg-info/PKG-INFO (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: featurewiz
Version: 0.0.25
Version: 0.0.26
Summary: Select Best Features from your data set - any size - now with XGBoost!
Home-page: https://github.com/AutoViML/featurewiz
Author: Ram Seshadri
featurewiz/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -19,7 +19,7 @@
from .featurewiz import FE_create_categorical_feature_crosses, EDA_find_skewed_variables
from .featurewiz import FE_kmeans_resampler, FE_find_and_cap_outliers, EDA_find_outliers
from .featurewiz import split_data_n_ways, FE_concatenate_multiple_columns
from .featurewiz import simple_XGBoost_model, FE_discretize_numeric_variables
from .featurewiz import simple_XGBoost_model, FE_discretize_numeric_variables,FE_transform_numeric_columns
################################################################################
if __name__ == "__main__":
module_type = 'Running'
featurewiz/__version__.py (2 changes: 1 addition & 1 deletion)
@@ -5,6 +5,6 @@
__author__ = "Ram Seshadri"
__description__ = "Advanced Feature Engineering and Feature Selection for any data set, any size"
__url__ = "https://github.com/Auto_ViML/featurewiz.git"
__version__ = "0.0.26"
__version__ = "0.0.27"
__license__ = "Apache License 2.0"
__copyright__ = "2020-21 Google"
featurewiz/featurewiz.py (109 changes: 98 additions & 11 deletions)
@@ -2638,11 +2638,39 @@ def simple_XGBoost_model(X_XGB, Y_XGB, X_XGB_test, modeltype,verbose=0):
"""

if modeltype == 'Regression':
xgb=XGBRegressor(learning_rate=0.03,max_depth=7,min_child_weight=1,
n_estimators=200,subsample=0.7)
xgb = XGBRegressor(
colsample_bytree=0.5,
alpha=0.01563,
#gamma=0.0,
learning_rate=0.1,
max_depth=15,
min_child_weight=2,
n_estimators=4000,
#reg_alpha=0.9,
reg_lambda=0.003,
subsample=0.7,
random_state=2020,
#metric_period=100,
verbosity = 0,
n_jobs=-1,
silent = True)
else:
xgb=XGBClassifier(learning_rate=0.03,max_depth=7,min_child_weight=1,
n_estimators=200,subsample=0.7)
xgb = XGBClassifier(
colsample_bytree=0.5,
alpha=0.01563,
#gamma=0.0,
learning_rate=0.1,
max_depth=15,
min_child_weight=2,
n_estimators=4000,
#reg_alpha=0.9,
reg_lambda=0.003,
subsample=0.7,
random_state=2020,
#metric_period=100,
n_jobs=-1,
verbosity = 0,
silent = True)

#testing for xgbregressor
n_splits = 10
@@ -2661,7 +2689,8 @@ def simple_XGBoost_model(X_XGB, Y_XGB, X_XGB_test, modeltype,verbose=0):
y_train, y_test = Y_XGB.values[train_index], Y_XGB.values[test_index]

model = xgb
model.fit(x_train, y_train)
model.fit(x_train, y_train, early_stopping_rounds=6,
eval_set=[(x_test, np.log(y_test))], verbose=0)
if modeltype == 'Regression':
preds = np.exp(model.predict(x_test))
else:
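The commit replaces the fixed 200-tree models with 4000-tree models that rely on early stopping against the held-out fold. A standalone sketch of that pattern, written against the same fit-time early_stopping_rounds API the diff uses (newer xgboost releases move this to the constructor); the data is synthetic:

```python
# Hedged, standalone sketch of "many trees + early stopping per fold".
import numpy as np
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 5))
y = 2.0 * X[:, 0] + rng.normal(scale=0.1, size=500)

kfold = KFold(n_splits=10, shuffle=True, random_state=99)
for train_idx, test_idx in kfold.split(X):
    model = XGBRegressor(n_estimators=4000, learning_rate=0.1, max_depth=15,
                         subsample=0.7, n_jobs=-1, verbosity=0)
    model.fit(X[train_idx], y[train_idx],
              early_stopping_rounds=6,               # stop once the eval score stalls
              eval_set=[(X[test_idx], y[test_idx])],
              verbose=False)
    print("best iteration:", model.best_iteration)
```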
@@ -2723,7 +2752,9 @@ def plot_importances_XGB(train_set, labels, ls, y_preds, modeltype):
pd.Series(y_preds).plot(ax=ax2, color='b');
##################################################################################
from sklearn.preprocessing import KBinsDiscretizer
def FE_discretize_numeric_variables(df, bin_dict, strategy='kmeans',verbose=0):
from sklearn.mixture import GaussianMixture

def FE_discretize_numeric_variables(train, bin_dict, test='', strategy='kmeans',verbose=0):
"""
This handy function discretizes numeric variables into binned variables using kmeans algorithm.
You need to provide the names of the variables and the numbers of bins for each variable in a dictionary.
@@ -2733,24 +2764,80 @@ def FE_discretize_numeric_variables(df, bin_dict, strategy='kmeans',verbose=0):
----------
df : pandas dataframe - please ensure it is a dataframe. No arrays please.
bin_dict: dictionary of names of variables and the bins that you want for each variable.
strategy: default is 'kmeans': but you can choose any one of {‘uniform’, ‘quantile’, ‘kmeans}
strategy: default is 'kmeans': but you can choose: {'gaussian', 'uniform', 'quantile', 'kmeans'}
Outputs:
----------
df: pandas dataframe with new variables with names such as: variable+'_discrete'
"""
df = copy.deepcopy(train)
test = copy.deepcopy(test)
num_cols = len(bin_dict)
nrows = int((num_cols/2)+0.5)
print('nrows',nrows)
#print('nrows',nrows)
if verbose:
fig = plt.figure(figsize=(10,3*num_cols))
for i, (col, binvalue) in enumerate(bin_dict.items()):
kbd = KBinsDiscretizer(n_bins=binvalue, encode='ordinal', strategy=strategy)
new_col = col+'_discrete'
df[new_col] = kbd.fit_transform(df[[col]]).astype(int)
if strategy == 'gaussian':
kbd = GaussianMixture(n_components=binvalue, random_state=99)
df[new_col] = kbd.fit_predict(df[[col]]).astype(int)
if not isinstance(test, str):
test[new_col] = kbd.predict(test[[col]]).astype(int)
else:
kbd = KBinsDiscretizer(n_bins=binvalue, encode='ordinal', strategy=strategy)
df[new_col] = kbd.fit_transform(df[[col]]).astype(int)
if not isinstance(test, str):
test[new_col] = kbd.transform(test[[col]]).astype(int)
if verbose:
ax1 = plt.subplot(nrows,2,i+1)
ax1.scatter(df[col],df[new_col])
ax1.set_title(new_col)
return df
if not isinstance(test, str):
return df, test
else:
return df
##################################################################################
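A hedged sketch of the extended signature: binning train and test together with the newly added 'gaussian' (GaussianMixture) strategy; the dataframes and column name are made up:

```python
# Hedged sketch: fit the binner on train, apply the same binning to test.
import numpy as np
import pandas as pd
from featurewiz.featurewiz import FE_discretize_numeric_variables

rng = np.random.default_rng(0)
train = pd.DataFrame({"income": rng.normal(50_000, 15_000, size=200)})
test  = pd.DataFrame({"income": rng.normal(50_000, 15_000, size=50)})

train, test = FE_discretize_numeric_variables(train, bin_dict={"income": 4},
                                              test=test, strategy="gaussian")
print(train["income_discrete"].value_counts())
```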
def FE_transform_numeric_columns(df, bin_dict, verbose=0):
"""
This handy function transforms numeric variables using simple functions such as log, log10, sqrt or max-abs scaling.
You need to provide the names of the variables and the transformation you want for each variable in a dictionary.
It will return the same dataframe with the new transformed variables that it has created.
Inputs:
----------
df : pandas dataframe - please ensure it is a dataframe. No arrays please.
bin_dict: dictionary of names of variables and the kind of transformation you want
default is 'log': but you can choose: {'log','log10', 'sqrt', 'max-abs'}
Outputs:
----------
df: pandas dataframe with new variables with names such as: variable+'_log' or variable+'_sqrt'
"""
num_cols = len(bin_dict)
nrows = int((num_cols/2)+0.5)
if verbose:
fig = plt.figure(figsize=(10,3*num_cols))
for i, (col, binvalue) in enumerate(bin_dict.items()):
new_col = col+'_'+binvalue
if binvalue == 'log':
df[new_col] = np.log(df[col]).values
elif binvalue == 'log10':
df[new_col] = np.log10(df[col]).values
elif binvalue == 'sqrt':
df[new_col] = np.sqrt(df[col]).values
elif binvalue == 'max-abs':
col_max = df[col].max()
if col_max == 0:
col_max = 1
df[new_col] = (df[col]/col_max).values
else:
df[new_col] = np.log(df[col]).values
if verbose:
ax1 = plt.subplot(nrows,2,i+1)
df[col].plot.kde(ax=ax1, label=col,alpha=0.5,color='r')
ax2 = ax1.twiny()
df[new_col].plot.kde(ax=ax2,label=new_col,alpha=0.5,color='b')
plt.legend();
return df
#################################################################################
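A short usage sketch for the new transform helper (the column names and transform choices are made up):

```python
# Hedged sketch: per-column log10 and sqrt transforms, added as new columns.
import pandas as pd
from featurewiz.featurewiz import FE_transform_numeric_columns

df = pd.DataFrame({"price": [10.0, 100.0, 1000.0], "weight": [1.0, 4.0, 9.0]})
df = FE_transform_numeric_columns(df, bin_dict={"price": "log10", "weight": "sqrt"})
print(df[["price_log10", "weight_sqrt"]])
```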
setup.py (2 changes: 1 addition & 1 deletion)
@@ -7,7 +7,7 @@

setuptools.setup(
name="featurewiz",
version="0.0.26",
version="0.0.27",
author="Ram Seshadri",
author_email="[email protected]",
description="Select Best Features from your data set - any size - now with XGBoost!",
