major new upgrade 5.0 version with autoencoders
AutoViML committed Dec 20, 2023
1 parent 0e4cf54 commit 1007b94
Showing 8 changed files with 1,658 additions and 37 deletions.
698 changes: 698 additions & 0 deletions examples/Featurewiz_with_AutoEncoder_Demo.ipynb

Large diffs are not rendered by default.
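The demo notebook's contents are not rendered in this view. As orientation for what an autoencoder-based feature extractor does, here is a generic denoising-autoencoder sketch in Keras; it illustrates the technique only and is not featurewiz's own DenoisingAutoEncoder class, whose API this diff does not show.

import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

def make_dae(n_features, code_dim=8, noise_std=0.1):
    # Gaussian noise corrupts inputs at train time; reconstructing the
    # clean inputs forces the bottleneck code to learn robust structure.
    inputs = keras.Input(shape=(n_features,))
    noisy = layers.GaussianNoise(noise_std)(inputs)
    code = layers.Dense(code_dim, activation="relu")(noisy)   # bottleneck
    outputs = layers.Dense(n_features)(code)                  # reconstruction
    autoencoder = keras.Model(inputs, outputs)
    encoder = keras.Model(inputs, code)
    autoencoder.compile(optimizer="adam", loss="mse")
    return autoencoder, encoder

X = np.random.normal(size=(500, 20)).astype("float32")
autoencoder, encoder = make_dae(n_features=20)
autoencoder.fit(X, X, epochs=5, batch_size=32, verbose=0)   # target is the clean X
dae_features = encoder.predict(X, verbose=0)   # 8 new columns to append to X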

8 changes: 6 additions & 2 deletions featurewiz/__init__.py
@@ -20,7 +20,8 @@
 from .featurewiz import FE_kmeans_resampler, FE_find_and_cap_outliers, EDA_find_outliers
 from .featurewiz import split_data_n_ways, FE_concatenate_multiple_columns
 from .featurewiz import FE_discretize_numeric_variables, reduce_mem_usage
-from .ml_models import simple_XGBoost_model, simple_LightGBM_model, complex_XGBoost_model, complex_LightGBM_model,data_transform
+from .ml_models import simple_XGBoost_model, simple_LightGBM_model, complex_XGBoost_model
+from .ml_models import complex_LightGBM_model,data_transform, get_class_weights
 from .my_encoders import My_LabelEncoder, Groupby_Aggregator, My_LabelEncoder_Pipe, Ranking_Aggregator, DateTime_Transformer
 from .my_encoders import Rare_Class_Combiner, Rare_Class_Combiner_Pipe, FE_create_time_series_features, Binning_Transformer
 from .my_encoders import Column_Names_Transformer, FE_convert_all_object_columns_to_numeric, Numeric_Transformer
@@ -32,6 +33,8 @@
 from .featurewiz import FE_transform_numeric_columns_to_bins, FE_create_interaction_vars
 from .stacking_models import Stacking_Classifier, Blending_Regressor, Stacking_Regressor, stacking_models_list
 from .stacking_models import StackingClassifier_Multi, analyze_problem_type_array
+from .stacking_models import DenoisingAutoEncoder, VariationalAutoEncoder
+from .stacking_models import GAN, GANAugmenter
 from .featurewiz import EDA_binning_numeric_column_displaying_bins, FE_calculate_duration_from_timestamp
 from .featurewiz import FE_convert_mixed_datatypes_to_string, FE_drop_rows_with_infinity
 from .featurewiz import EDA_find_remove_columns_with_infinity, FE_split_list_into_columns
@@ -47,7 +50,8 @@
 version_number = __version__
 print("""%s featurewiz %s. Use the following syntax:
     >>> wiz = FeatureWiz(feature_engg = '', nrows=None, transform_target=True, scalers="std",
-        category_encoders="auto", add_missing=False, verbose=0)
+        category_encoders="auto", add_missing=False, verbose=0, imbalanced=False,
+        ae_options={})
     >>> X_train_selected, y_train = wiz.fit_transform(X_train, y_train)
     >>> X_test_selected = wiz.transform(X_test)
     >>> selected_features = wiz.features
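For quick reference, here is that call pattern as a self-contained script. The toy data is illustrative only; imbalanced and ae_options are the two arguments this commit introduces, and passing an empty ae_options dict presumably leaves the new autoencoder stage disabled (an assumption, since the diff does not document its keys).

import numpy as np
import pandas as pd
from featurewiz import FeatureWiz

# Illustrative toy data -- any tabular X/y works here.
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f"f{i}" for i in range(5)])
y = pd.Series(rng.integers(0, 2, size=200), name="target")
X_train, X_test, y_train = X.iloc[:150], X.iloc[150:], y.iloc[:150]

# The 0.5.0 signature printed above, spelled out with its defaults.
wiz = FeatureWiz(feature_engg='', nrows=None, transform_target=True,
                 scalers="std", category_encoders="auto", add_missing=False,
                 verbose=0, imbalanced=False, ae_options={})
X_train_selected, y_train_out = wiz.fit_transform(X_train, y_train)
X_test_selected = wiz.transform(X_test)
print(wiz.features)    # names of the selected features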
2 changes: 1 addition & 1 deletion featurewiz/__version__.py
@@ -5,6 +5,6 @@
 __author__ = "Ram Seshadri"
 __description__ = "Advanced Feature Engineering and Feature Selection for any data set, any size"
 __url__ = "https://github.com/Auto_ViML/featurewiz.git"
-__version__ = "0.4.8"
+__version__ = "0.5.0"
 __license__ = "Apache License 2.0"
 __copyright__ = "2020-23 Google"
187 changes: 171 additions & 16 deletions featurewiz/featurewiz.py

Large diffs are not rendered by default.

29 changes: 14 additions & 15 deletions featurewiz/ml_models.py
@@ -1,6 +1,8 @@
 import pandas as pd
 import numpy as np
 np.random.seed(99)
+import random
+random.seed(42)
 import pandas as pd
 ################################################################################
 import warnings
 warnings.filterwarnings("ignore")
@@ -781,38 +783,35 @@ def get_sample_weight_array(y_train):
     return wt_array
 ###############################################################################
 from collections import OrderedDict
 from collections import Counter
 from sklearn.utils.class_weight import compute_class_weight
 import copy
 def get_class_weights(y_input):
+    ### get_class_weights has lower ROC_AUC but higher F1 scores than get_class_distribution
     y_input = copy.deepcopy(y_input)
     if isinstance(y_input, np.ndarray):
-        class_weights = compute_class_weight('balanced', classes=np.unique(y_input), y=y_input.reshape(-1))
+        y_input = pd.Series(y_input)
     elif isinstance(y_input, pd.Series):
-        class_weights = compute_class_weight('balanced', classes=np.unique(y_input.values), y=y_input.values.reshape(-1))
+        pass
     elif isinstance(y_input, pd.DataFrame):
         ### if it is a dataframe, use it only if it is a one-column dataframe ##
         y_input = y_input.iloc[:,0]
-        class_weights = compute_class_weight('balanced', classes=np.unique(y_input.values), y=y_input.values.reshape(-1))
     else:
         ### if you cannot detect the type or if it is a multi-column dataframe, ignore it
         return None
+    classes = np.unique(y_input)
+    rare_class = find_rare_class(y_input)
+    xp = Counter(y_input)
+    class_weights = compute_class_weight('balanced', classes=classes, y=y_input)

-    xp = Counter(y_input)
     if len(class_weights[(class_weights < 1)]) > 0:
         ### if the weights are less than 1, then divide them until the lowest weight is 1.
         class_weights = class_weights/min(class_weights)
     else:
         class_weights = (class_weights)
-    ### even after you change weights, if they are all below 1.5, do this ##
-    #if (class_weights<=1.5).all():
-    #    class_weights = np.around(class_weights+0.49)

+    ### This is the best version that returns correct weights ###
+    class_weights = class_weights.astype(int)
+    class_weights[(class_weights<1)]=1
-    class_rows = class_weights*[xp[x] for x in classes]
-    class_rows = class_rows.astype(int)
-    class_weighted_rows = dict(zip(classes,class_weights))
-    return class_weighted_rows
+    class_weights_dict_corrected = dict(zip(classes,class_weights))
+    return class_weights_dict_corrected
 ##################################################################################
 from collections import OrderedDict
 def get_scale_pos_weight(y_input):
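To make the new weighting behaviour concrete, here is a condensed standalone replica of get_class_weights applied to a 90/10 binary target (the find_rare_class and Counter bookkeeping in the original does not affect the returned dict and is omitted here):

import numpy as np
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight

def get_class_weights_sketch(y_input):
    y_input = pd.Series(y_input)
    classes = np.unique(y_input)
    class_weights = compute_class_weight('balanced', classes=classes, y=y_input)
    if (class_weights < 1).any():
        # rescale so the most frequent class lands exactly at weight 1
        class_weights = class_weights / class_weights.min()
    class_weights = class_weights.astype(int)
    class_weights[class_weights < 1] = 1
    return dict(zip(classes, class_weights))

y = np.array([0] * 90 + [1] * 10)
# 'balanced' weights are 100/(2*90)=0.56 and 100/(2*10)=5.0; dividing by the
# minimum and truncating to int yields weight 1 for class 0 and 9 for class 1:
print(get_class_weights_sketch(y))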
