Updated feature engineering for multi-class settings.

melihekici · Dec 24, 2020 · d4344e1 · d4344e1
1 parent 88cce54
commit d4344e1
Show file tree

Hide file tree

Showing 4 changed files with 31 additions and 6 deletions.
diff --git a/featurewiz/__version__.py b/featurewiz/__version__.py
@@ -5,6 +5,6 @@
 __author__ = "Ram Seshadri"
 __description__ = "Fast Feature Engineering and Feature Selection for any data set, any size"
 __url__ = "https://github.com/Auto_ViML/featurewiz.git"
-__version__ = "0.0.10"
+__version__ = "0.0.11"
 __license__ = "Apache License 2.0"
 __copyright__ = "2020 Google"
diff --git a/featurewiz/databunch.py b/featurewiz/databunch.py
@@ -109,14 +109,26 @@ def __init__(self,
             le = LabelEncoder()
             if self.check_data_format(y_train):
                 if settings.multi_label:
+                    ### if the model is multi-label, don't transform it since it won't work
                     self.y_train_source = y_train
                 else:
-                    self.y_train_source =  le.fit_transform(y_train)
+                    if y_train.dtype == 'object' or str(y_train.dtype) == 'category':
+                        self.y_train_source =  le.fit_transform(y_train)
+                    else:
+                        if settings.modeltype == 'Multi_Classification':
+                            rare_class = find_rare_class(y_train)
+                            if rare_class != 0:
+                                ### if the rare class is not zero, then transform it using Label Encoder
+                                y_train =  le.fit_transform(y_train)
+                        self.y_train_source =  copy.deepcopy(y_train)
             else:
                 if settings.multi_label:
                     self.y_train_source = pd.DataFrame(y_train)
                 else:
-                    self.y_train_source = le.fit_transform(pd.DataFrame(y_train))
+                    if y_train.dtype == 'object' or str(y_train.dtype) == 'category':
+                        self.y_train_source = le.fit_transform(pd.DataFrame(y_train))
+                    else:
+                        self.y_train_source =  copy.deepcopy(y_train)
         else:
             print("No target data found!")
             return
@@ -270,7 +282,10 @@ def gen_target_encodet_features(self, x_data, y_data=None, cat_encoder_name=''):
                 encoder = self.target_encoders_names[cat_encoder_name][0](cols=self.cat_features, drop_invariant=True)
                 if settings.modeltype == 'Multi_Classification':
                     ### you must put a Polynomial Wrapper on the cat_encoder in case the model is multi-class
-                    encoder = PolynomialWrapper(encoder)
+                    if cat_encoder_name in ['WOEEncoder']:
+                        encoder = PolynomialWrapper(encoder)
+                ### All other encoders (TargetEncoder, CatBoostEncoder, GLMMEncoder) don't need
+                ### Polynomial Wrappers since they handle multi-class (label encoded) targets very well!
                 data_encodet = encoder.fit_transform(x_data, y_data)
                 data_encodet = data_encodet.add_prefix(cat_encoder_name + '_')
             else:
@@ -567,3 +582,13 @@ def preproc_data(self, X_train=None,
             print('New X_train shape: ', X_train.shape, '| X_test shape: ', X_test.shape)
 
         return (X_train, X_test)
+################################################################################
+def find_rare_class(series, verbose=0):
+    ######### Find the least frequent (rare) class in a Target variable  #####
+    """
+    Works on Multi Class too. Does not print anything; `verbose` is currently unused.
+    It returns the name of the Rare class (the one with the minimum class member count).
+    This can also be helpful in using it as pos_label in Binary and Multi Class problems.
+    """
+    return series.value_counts().index[-1]
+#################################################################################
diff --git a/featurewiz/featurewiz.py b/featurewiz/featurewiz.py
@@ -958,7 +958,7 @@ def featurewiz(dataname, target, corr_limit=0.7, verbose=0, sep=",", header=0,
     if settings.modeltype == 'Multi_Classification':
         ### you must put a Polynomial Wrapper on the cat_encoder in case the model is multi-class
         if final_cat_encoders:
-            final_cat_encoders = [PolynomialWrapper(x) for x in final_cat_encoders if x in target_encoders_names]
+            final_cat_encoders = [PolynomialWrapper(x) for x in final_cat_encoders if x in settings.target_encoders_names]
     elif settings.modeltype == 'Regression':
         if final_cat_encoders:
             if 'WOEEncoder' in final_cat_encoders:

diff --git a/setup.py b/setup.py
@@ -7,7 +7,7 @@
 
 setuptools.setup(
     name="featurewiz",
-    version="0.0.10",
+    version="0.0.11",
     author="Ram Seshadri",
     author_email="[email protected]",
     description="Select Best Features from your data set - any size - now with XGBoost!",