Skip to content

Commit

Permalink
Updated feature engineering for multi-class settings.
Browse files Browse the repository at this point in the history
  • Loading branch information
AutoViML committed Dec 24, 2020
1 parent 88cce54 commit d4344e1
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 6 deletions.
2 changes: 1 addition & 1 deletion featurewiz/__version__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@
__author__ = "Ram Seshadri"
__description__ = "Fast Feature Engineering and Feature Selection for any data set, any size"
__url__ = "https://github.com/Auto_ViML/featurewiz.git"
__version__ = "0.0.10"
__version__ = "0.0.11"
__license__ = "Apache License 2.0"
__copyright__ = "2020 Google"
31 changes: 28 additions & 3 deletions featurewiz/databunch.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,14 +109,26 @@ def __init__(self,
le = LabelEncoder()
if self.check_data_format(y_train):
if settings.multi_label:
### if the model is multi-label, don't transform it since it won't work
self.y_train_source = y_train
else:
self.y_train_source = le.fit_transform(y_train)
if y_train.dtype == 'object' or str(y_train.dtype) == 'category':
self.y_train_source = le.fit_transform(y_train)
else:
if settings.modeltype == 'Multi_Classification':
rare_class = find_rare_class(y_train)
if rare_class != 0:
### if the rare class is not zero, then transform it using Label Encoder
y_train = le.fit_transform(y_train)
self.y_train_source = copy.deepcopy(y_train)
else:
if settings.multi_label:
self.y_train_source = pd.DataFrame(y_train)
else:
self.y_train_source = le.fit_transform(pd.DataFrame(y_train))
if y_train.dtype == 'object' or str(y_train.dtype) == 'category':
self.y_train_source = le.fit_transform(pd.DataFrame(y_train))
else:
self.y_train_source = copy.deepcopy(y_train)
else:
print("No target data found!")
return
Expand Down Expand Up @@ -270,7 +282,10 @@ def gen_target_encodet_features(self, x_data, y_data=None, cat_encoder_name=''):
encoder = self.target_encoders_names[cat_encoder_name][0](cols=self.cat_features, drop_invariant=True)
if settings.modeltype == 'Multi_Classification':
### you must put a Polynomial Wrapper on the cat_encoder in case the model is multi-class
encoder = PolynomialWrapper(encoder)
if cat_encoder_name in ['WOEEncoder']:
encoder = PolynomialWrapper(encoder)
### All other encoders (TargetEncoder, CatBoostEncoder, GLMMEncoder) don't need
### Polynomial Wrappers since they handle multi-class (label encoded) targets very well!
data_encodet = encoder.fit_transform(x_data, y_data)
data_encodet = data_encodet.add_prefix(cat_encoder_name + '_')
else:
Expand Down Expand Up @@ -567,3 +582,13 @@ def preproc_data(self, X_train=None,
print('New X_train shape: ', X_train.shape, '| X_test shape: ', X_test.shape)

return (X_train, X_test)
################################################################################
def find_rare_class(series, verbose=0):
    """
    Return the label of the rarest class in a target variable.

    Works for binary and multi-class targets. The returned label can be
    used as ``pos_label`` in binary and multi-class problems.

    Args:
        series: object exposing ``value_counts()`` (e.g. a pandas Series)
            that holds the class labels.
        verbose: when >= 1, print the count and percentage share of each
            class before returning. The default (0) prints nothing.

    Returns:
        The class label with the minimum member count, taken as the last
        entry of ``series.value_counts()`` (sorted by descending count).

    Raises:
        IndexError: if ``value_counts()`` is empty (no classes to rank).
    """
    class_counts = series.value_counts()
    if verbose >= 1:
        # The docstring has always promised a percentage report; only emit
        # it on request so that default callers stay silent.
        total = class_counts.sum()
        print('       Class  -> Counts -> Percent')
        for label, count in class_counts.items():
            print("%12s: % 7d  ->  % 5.1f%%" % (label, count, count / total * 100))
    # value_counts() orders by descending frequency, so the rarest is last.
    return class_counts.index[-1]
#################################################################################
2 changes: 1 addition & 1 deletion featurewiz/featurewiz.py
Original file line number Diff line number Diff line change
Expand Up @@ -958,7 +958,7 @@ def featurewiz(dataname, target, corr_limit=0.7, verbose=0, sep=",", header=0,
if settings.modeltype == 'Multi_Classification':
### you must put a Polynomial Wrapper on the cat_encoder in case the model is multi-class
if final_cat_encoders:
final_cat_encoders = [PolynomialWrapper(x) for x in final_cat_encoders if x in target_encoders_names]
final_cat_encoders = [PolynomialWrapper(x) for x in final_cat_encoders if x in settings.target_encoders_names]
elif settings.modeltype == 'Regression':
if final_cat_encoders:
if 'WOEEncoder' in final_cat_encoders:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

setuptools.setup(
name="featurewiz",
version="0.0.10",
version="0.0.11",
author="Ram Seshadri",
author_email="[email protected]",
description="Select Best Features from your data set - any size - now with XGBoost!",
Expand Down

0 comments on commit d4344e1

Please sign in to comment.