major new upgrade 5.0 version with autoencoders
AutoViML committed Dec 20, 2023
1 parent 0e4cf54 commit 1007b94
Showing 8 changed files with 1,658 additions and 37 deletions.
698 changes: 698 additions & 0 deletions examples/Featurewiz_with_AutoEncoder_Demo.ipynb

Large diffs are not rendered by default.
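The demo notebook's contents are not rendered in this view. As orientation for what an autoencoder-based feature extractor does, here is a generic denoising-autoencoder sketch in Keras; it illustrates the technique only and is not featurewiz's own DenoisingAutoEncoder class, whose API this diff does not show.

import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

def make_dae(n_features, code_dim=8, noise_std=0.1):
    # Gaussian noise corrupts inputs at train time; reconstructing the
    # clean inputs forces the bottleneck code to learn robust structure.
    inputs = keras.Input(shape=(n_features,))
    noisy = layers.GaussianNoise(noise_std)(inputs)
    code = layers.Dense(code_dim, activation="relu")(noisy)   # bottleneck
    outputs = layers.Dense(n_features)(code)                  # reconstruction
    autoencoder = keras.Model(inputs, outputs)
    encoder = keras.Model(inputs, code)
    autoencoder.compile(optimizer="adam", loss="mse")
    return autoencoder, encoder

X = np.random.normal(size=(500, 20)).astype("float32")
autoencoder, encoder = make_dae(n_features=20)
autoencoder.fit(X, X, epochs=5, batch_size=32, verbose=0)   # target is the clean X
dae_features = encoder.predict(X, verbose=0)   # 8 new columns to append to X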

8 changes: 6 additions & 2 deletions featurewiz/__init__.py
@@ -20,7 +20,8 @@
 from .featurewiz import FE_kmeans_resampler, FE_find_and_cap_outliers, EDA_find_outliers
 from .featurewiz import split_data_n_ways, FE_concatenate_multiple_columns
 from .featurewiz import FE_discretize_numeric_variables, reduce_mem_usage
-from .ml_models import simple_XGBoost_model, simple_LightGBM_model, complex_XGBoost_model, complex_LightGBM_model,data_transform
+from .ml_models import simple_XGBoost_model, simple_LightGBM_model, complex_XGBoost_model
+from .ml_models import complex_LightGBM_model,data_transform, get_class_weights
 from .my_encoders import My_LabelEncoder, Groupby_Aggregator, My_LabelEncoder_Pipe, Ranking_Aggregator, DateTime_Transformer
 from .my_encoders import Rare_Class_Combiner, Rare_Class_Combiner_Pipe, FE_create_time_series_features, Binning_Transformer
 from .my_encoders import Column_Names_Transformer, FE_convert_all_object_columns_to_numeric, Numeric_Transformer
@@ -32,6 +33,8 @@
 from .featurewiz import FE_transform_numeric_columns_to_bins, FE_create_interaction_vars
 from .stacking_models import Stacking_Classifier, Blending_Regressor, Stacking_Regressor, stacking_models_list
 from .stacking_models import StackingClassifier_Multi, analyze_problem_type_array
+from .stacking_models import DenoisingAutoEncoder, VariationalAutoEncoder
+from .stacking_models import GAN, GANAugmenter
 from .featurewiz import EDA_binning_numeric_column_displaying_bins, FE_calculate_duration_from_timestamp
 from .featurewiz import FE_convert_mixed_datatypes_to_string, FE_drop_rows_with_infinity
 from .featurewiz import EDA_find_remove_columns_with_infinity, FE_split_list_into_columns
@@ -47,7 +50,8 @@
 version_number = __version__
 print("""%s featurewiz %s. Use the following syntax:
     >>> wiz = FeatureWiz(feature_engg = '', nrows=None, transform_target=True, scalers="std",
-        category_encoders="auto", add_missing=False, verbose=0)
+        category_encoders="auto", add_missing=False, verbose=0, imbalanced=False,
+        ae_options={})
     >>> X_train_selected, y_train = wiz.fit_transform(X_train, y_train)
     >>> X_test_selected = wiz.transform(X_test)
     >>> selected_features = wiz.features
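For quick reference, here is that call pattern as a self-contained script. The toy data is illustrative only; imbalanced and ae_options are the two arguments this commit introduces, and passing an empty ae_options dict presumably leaves the new autoencoder stage disabled (an assumption, since the diff does not document its keys).

import numpy as np
import pandas as pd
from featurewiz import FeatureWiz

# Illustrative toy data -- any tabular X/y works here.
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f"f{i}" for i in range(5)])
y = pd.Series(rng.integers(0, 2, size=200), name="target")
X_train, X_test, y_train = X.iloc[:150], X.iloc[150:], y.iloc[:150]

# The 0.5.0 signature printed above, spelled out with its defaults.
wiz = FeatureWiz(feature_engg='', nrows=None, transform_target=True,
                 scalers="std", category_encoders="auto", add_missing=False,
                 verbose=0, imbalanced=False, ae_options={})
X_train_selected, y_train_out = wiz.fit_transform(X_train, y_train)
X_test_selected = wiz.transform(X_test)
print(wiz.features)    # names of the selected features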
2 changes: 1 addition & 1 deletion featurewiz/__version__.py
@@ -5,6 +5,6 @@
 __author__ = "Ram Seshadri"
 __description__ = "Advanced Feature Engineering and Feature Selection for any data set, any size"
 __url__ = "https://github.com/Auto_ViML/featurewiz.git"
-__version__ = "0.4.8"
+__version__ = "0.5.0"
 __license__ = "Apache License 2.0"
 __copyright__ = "2020-23 Google"
187 changes: 171 additions & 16 deletions featurewiz/featurewiz.py

Large diffs are not rendered by default.

29 changes: 14 additions & 15 deletions featurewiz/ml_models.py
@@ -1,6 +1,8 @@
 import pandas as pd
 import numpy as np
 np.random.seed(99)
+import random
+random.seed(42)
 import pandas as pd
 ################################################################################
 import warnings
 warnings.filterwarnings("ignore")
@@ -781,38 +783,35 @@ def get_sample_weight_array(y_train):
     return wt_array
 ###############################################################################
 from collections import OrderedDict
 from collections import Counter
 from sklearn.utils.class_weight import compute_class_weight
 import copy
 def get_class_weights(y_input):
+    ### get_class_weights has lower ROC_AUC but higher F1 scores than get_class_distribution
     y_input = copy.deepcopy(y_input)
     if isinstance(y_input, np.ndarray):
-        class_weights = compute_class_weight('balanced', classes=np.unique(y_input), y=y_input.reshape(-1))
+        y_input = pd.Series(y_input)
     elif isinstance(y_input, pd.Series):
-        class_weights = compute_class_weight('balanced', classes=np.unique(y_input.values), y=y_input.values.reshape(-1))
+        pass
     elif isinstance(y_input, pd.DataFrame):
         ### if it is a dataframe, use it only if it is a one-column dataframe ##
         y_input = y_input.iloc[:,0]
-        class_weights = compute_class_weight('balanced', classes=np.unique(y_input.values), y=y_input.values.reshape(-1))
     else:
         ### if you cannot detect the type or if it is a multi-column dataframe, ignore it
         return None
+    classes = np.unique(y_input)
+    rare_class = find_rare_class(y_input)
+    xp = Counter(y_input)
+    class_weights = compute_class_weight('balanced', classes=classes, y=y_input)

-    xp = Counter(y_input)
     if len(class_weights[(class_weights < 1)]) > 0:
         ### if the weights are less than 1, then divide them until the lowest weight is 1.
         class_weights = class_weights/min(class_weights)
     else:
         class_weights = (class_weights)
-    ### even after you change weights, if they are all below 1.5, do this ##
-    #if (class_weights<=1.5).all():
-    #    class_weights = np.around(class_weights+0.49)

+    ### This is the best version that returns correct weights ###
+    class_weights = class_weights.astype(int)
+    class_weights[(class_weights<1)]=1
-    class_rows = class_weights*[xp[x] for x in classes]
-    class_rows = class_rows.astype(int)
-    class_weighted_rows = dict(zip(classes,class_weights))
-    return class_weighted_rows
+    class_weights_dict_corrected = dict(zip(classes,class_weights))
+    return class_weights_dict_corrected
 ##################################################################################
 from collections import OrderedDict
 def get_scale_pos_weight(y_input):
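To make the new weighting behaviour concrete, here is a condensed standalone replica of get_class_weights applied to a 90/10 binary target (the find_rare_class and Counter bookkeeping in the original does not affect the returned dict and is omitted here):

import numpy as np
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight

def get_class_weights_sketch(y_input):
    y_input = pd.Series(y_input)
    classes = np.unique(y_input)
    class_weights = compute_class_weight('balanced', classes=classes, y=y_input)
    if (class_weights < 1).any():
        # rescale so the most frequent class lands exactly at weight 1
        class_weights = class_weights / class_weights.min()
    class_weights = class_weights.astype(int)
    class_weights[class_weights < 1] = 1
    return dict(zip(classes, class_weights))

y = np.array([0] * 90 + [1] * 10)
# 'balanced' weights are 100/(2*90)=0.56 and 100/(2*10)=5.0; dividing by the
# minimum and truncating to int yields weight 1 for class 0 and 9 for class 1:
print(get_class_weights_sketch(y))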
