Feature engineering updated with minor bug fixes.

melihekici · Dec 26, 2020 · 47b4099 · 47b4099
1 parent d93aa17
commit 47b4099
Show file tree

Hide file tree

Showing 5 changed files with 38 additions and 19 deletions.
diff --git a/featurewiz/__init__.py b/featurewiz/__init__.py
@@ -14,17 +14,17 @@
 if __name__ == "__main__":
     version_number = __version__
     print("""Running featurewiz: Auto_ViML's feature engg and selection library. Version=%s
-output_tuple = featurewiz(dataname, target, corr_limit=0.70,
+output = featurewiz(dataname, target, corr_limit=0.70,
                     verbose=2, sep=',', header=0, test_data='',
                     feature_engg='', category_encoders='')
-Let featurewiz add features to your data! Set feature_engg as: 'interactions' or 'groupby' or 'target'
+Let featurewiz add features to your data! Set 'feature_engg' as: 'interactions' or 'groupby' or 'target'
                                 """ %version_number)
 else:
     version_number = __version__
     print("""Imported featurewiz: Auto_ViML's feature engg and selection library. Version=%s
-output_tuple = featurewiz(dataname, target, corr_limit=0.70,
+output = featurewiz(dataname, target, corr_limit=0.70,
                     verbose=2,  sep=',', header=0, test_data='',
                     feature_engg='', category_encoders='')
-Let featurewiz add features to your data! Set feature_engg as: 'interactions' or 'groupby' or 'target'
+Let featurewiz add features to your data! Set 'feature_engg' as: 'interactions' or 'groupby' or 'target'
 """ %version_number)
 ################################################################################
diff --git a/featurewiz/__version__.py b/featurewiz/__version__.py
@@ -5,6 +5,6 @@
 __author__ = "Ram Seshadri"
 __description__ = "Fast Feature Engineering and Feature Selection for any data set, any size"
 __url__ = "https://github.com/Auto_ViML/featurewiz.git"
-__version__ = "0.0.12"
+__version__ = "0.0.13"
 __license__ = "Apache License 2.0"
 __copyright__ = "2020 Google"
diff --git a/featurewiz/databunch.py b/featurewiz/databunch.py
@@ -104,23 +104,27 @@ def __init__(self,
         if X_test is not None:
             if self.check_data_format(X_test):
                 self.X_test_source = pd.DataFrame(X_test)
-
+        ### There is a chance for an error in this - so worth watching!
         if y_train is not None:
             le = LabelEncoder()
             if self.check_data_format(y_train):
                 if settings.multi_label:
                     ### if the model is mult-Label, don't transform it since it won't work
                     self.y_train_source = y_train
                 else:
-                    if y_train.dtype == 'object' or str(y_train.dtype) == 'category':
-                        self.y_train_source =  le.fit_transform(y_train)
+                    if not isinstance(y_train, pd.DataFrame):
+                        if y_train.dtype == 'object' or str(y_train.dtype) == 'category':
+                            self.y_train_source =  le.fit_transform(y_train)
+                        else:
+                            if settings.modeltype == 'Multi_Classification':
+                                rare_class = find_rare_class(y_train)
+                                if rare_class != 0:
+                                    ### if the rare class is not zero, then transform it using Label Encoder
+                                    y_train =  le.fit_transform(y_train)
+                            self.y_train_source =  copy.deepcopy(y_train)
                     else:
-                        if settings.modeltype == 'Multi_Classification':
-                            rare_class = find_rare_class(y_train)
-                            if rare_class != 0:
-                                ### if the rare class is not zero, then transform it using Label Encoder
-                                y_train =  le.fit_transform(y_train)
-                        self.y_train_source =  copy.deepcopy(y_train)
+                        print('Error: y_train should be a series. Skipping target encoding for dataset...')
+                        target_enc_cat_features = False
             else:
                 if settings.multi_label:
                     self.y_train_source = pd.DataFrame(y_train)

diff --git a/featurewiz/featurewiz.py b/featurewiz/featurewiz.py
@@ -751,12 +751,19 @@ def convert_all_object_columns_to_numeric(train, test=""):
     """
     #######################################################################################
     This is a utility that converts string columns to numeric WITHOUT LABEL ENCODER.
+    Make sure test and train have the same number of columns. If you have the target in train,
+    remove it before sending it through this utility. Otherwise, it might blow up during test transform.
     The beauty of this utility is that it does not blow up when it finds strings in test not in train.
     #######################################################################################
     """
     train = copy.deepcopy(train)
     lis = []
     lis = train.select_dtypes('object').columns.tolist() + train.select_dtypes('category').columns.tolist()
+    if not isinstance(test, str):
+        lis_test = test.select_dtypes('object').columns.tolist() + test.select_dtypes('category').columns.tolist()
+        if len(left_subtract(lis, lis_test)) > 0:
+            ### if there is an extra column in train that is not in test, then remove it from consideration
+            lis = copy.deepcopy(lis_test)
     if not (len(lis)==0):
         for everycol in lis:
             #print('    Converting %s to numeric' %everycol)
@@ -970,8 +977,16 @@ def featurewiz(dataname, target, corr_limit=0.7, verbose=0, sep=",", header=0,
     if feature_gen or feature_type:
         print('Starting feature engineering...this will take time...')
         if test is None:
-            X_train, X_test, y_train, y_test = train_test_split(train[preds],
-                                                            train[target],
+            if settings.multi_label:
+                ### if it is a multi_label problem, leave target as it is - a list!
+                X_train, X_test, y_train, y_test = train_test_split(train[preds],
+                                                                train[target],
+                                                                test_size=0.2,
+                                                                random_state=RANDOM_SEED)
+            else:
+                ### if it is not a multi_label problem, use target[0] as the target
+                X_train, X_test, y_train, y_test = train_test_split(train[preds],
+                                                            train[target[0]],
                                                             test_size=0.2,
                                                             random_state=RANDOM_SEED)
         else:
@@ -1045,8 +1060,8 @@ def featurewiz(dataname, target, corr_limit=0.7, verbose=0, sep=",", header=0,
     preds = final_list+important_cats
     #######You must convert category variables into integers ###############
     if len(important_cats) > 0:
-        train, traindict = convert_all_object_columns_to_numeric(train, "")
-        if not isinstance(test, str) or test is not None:
+        train, traindict = convert_all_object_columns_to_numeric(train,  "")
+        if test is not None:
             test, _ = convert_all_object_columns_to_numeric(test, traindict)
     ########   Dont move this train and y definition anywhere else ########
     y = train[target]

diff --git a/setup.py b/setup.py
@@ -7,7 +7,7 @@
 
 setuptools.setup(
     name="featurewiz",
-    version="0.0.12",
+    version="0.0.13",
     author="Ram Seshadri",
     author_email="[email protected]",
     description="Select Best Features from your data set - any size - now with XGBoost!",