Skip to content

Commit

Permalink
feature engineering Updated with minor bug fixes.
Browse files Browse the repository at this point in the history
  • Loading branch information
AutoViML committed Dec 26, 2020
1 parent d93aa17 commit 47b4099
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 19 deletions.
8 changes: 4 additions & 4 deletions featurewiz/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,17 @@
if __name__ == "__main__":
version_number = __version__
print("""Running featurewiz: Auto_ViML's feature engg and selection library. Version=%s
output_tuple = featurewiz(dataname, target, corr_limit=0.70,
output = featurewiz(dataname, target, corr_limit=0.70,
verbose=2, sep=',', header=0, test_data='',
feature_engg='', category_encoders='')
Let featurewiz add features to your data! Set feature_engg as: 'interactions' or 'groupby' or 'target'
Let featurewiz add features to your data! Set 'feature_engg' as: 'interactions' or 'groupby' or 'target'
""" %version_number)
else:
version_number = __version__
print("""Imported featurewiz: Auto_ViML's feature engg and selection library. Version=%s
output_tuple = featurewiz(dataname, target, corr_limit=0.70,
output = featurewiz(dataname, target, corr_limit=0.70,
verbose=2, sep=',', header=0, test_data='',
feature_engg='', category_encoders='')
Let featurewiz add features to your data! Set feature_engg as: 'interactions' or 'groupby' or 'target'
Let featurewiz add features to your data! Set 'feature_engg' as: 'interactions' or 'groupby' or 'target'
""" %version_number)
################################################################################
2 changes: 1 addition & 1 deletion featurewiz/__version__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@
__author__ = "Ram Seshadri"
__description__ = "Fast Feature Engineering and Feature Selection for any data set, any size"
__url__ = "https://github.com/Auto_ViML/featurewiz.git"
__version__ = "0.0.12"
__version__ = "0.0.13"
__license__ = "Apache License 2.0"
__copyright__ = "2020 Google"
22 changes: 13 additions & 9 deletions featurewiz/databunch.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,23 +104,27 @@ def __init__(self,
if X_test is not None:
if self.check_data_format(X_test):
self.X_test_source = pd.DataFrame(X_test)

### There is a chance for an error in this - so worth watching!
if y_train is not None:
le = LabelEncoder()
if self.check_data_format(y_train):
if settings.multi_label:
### if the model is multi-label, don't transform it since it won't work
self.y_train_source = y_train
else:
if y_train.dtype == 'object' or str(y_train.dtype) == 'category':
self.y_train_source = le.fit_transform(y_train)
if not isinstance(y_train, pd.DataFrame):
if y_train.dtype == 'object' or str(y_train.dtype) == 'category':
self.y_train_source = le.fit_transform(y_train)
else:
if settings.modeltype == 'Multi_Classification':
rare_class = find_rare_class(y_train)
if rare_class != 0:
### if the rare class is not zero, then transform it using Label Encoder
y_train = le.fit_transform(y_train)
self.y_train_source = copy.deepcopy(y_train)
else:
if settings.modeltype == 'Multi_Classification':
rare_class = find_rare_class(y_train)
if rare_class != 0:
### if the rare class is not zero, then transform it using Label Encoder
y_train = le.fit_transform(y_train)
self.y_train_source = copy.deepcopy(y_train)
print('Error: y_train should be a series. Skipping target encoding for dataset...')
target_enc_cat_features = False
else:
if settings.multi_label:
self.y_train_source = pd.DataFrame(y_train)
Expand Down
23 changes: 19 additions & 4 deletions featurewiz/featurewiz.py
Original file line number Diff line number Diff line change
Expand Up @@ -751,12 +751,19 @@ def convert_all_object_columns_to_numeric(train, test=""):
"""
#######################################################################################
This is a utility that converts string columns to numeric WITHOUT LABEL ENCODER.
Make sure test and train have the same number of columns. If you have target in train,
remove it before sending it through this utility. Otherwise, it might blow up during the test transform.
The beauty of this utility is that it does not blow up when it finds strings in test not in train.
#######################################################################################
"""
train = copy.deepcopy(train)
lis = []
lis = train.select_dtypes('object').columns.tolist() + train.select_dtypes('category').columns.tolist()
if not isinstance(test, str):
lis_test = test.select_dtypes('object').columns.tolist() + test.select_dtypes('category').columns.tolist()
if len(left_subtract(lis, lis_test)) > 0:
### if there is an extra column in train that is not in test, then remove it from consideration
lis = copy.deepcopy(lis_test)
if not (len(lis)==0):
for everycol in lis:
#print(' Converting %s to numeric' %everycol)
Expand Down Expand Up @@ -970,8 +977,16 @@ def featurewiz(dataname, target, corr_limit=0.7, verbose=0, sep=",", header=0,
if feature_gen or feature_type:
print('Starting feature engineering...this will take time...')
if test is None:
X_train, X_test, y_train, y_test = train_test_split(train[preds],
train[target],
if settings.multi_label:
### if it is a multi_label problem, leave target as it is - a list!
X_train, X_test, y_train, y_test = train_test_split(train[preds],
train[target],
test_size=0.2,
random_state=RANDOM_SEED)
else:
### if it is not a multi_label problem, use target[0] as the target
X_train, X_test, y_train, y_test = train_test_split(train[preds],
train[target[0]],
test_size=0.2,
random_state=RANDOM_SEED)
else:
Expand Down Expand Up @@ -1045,8 +1060,8 @@ def featurewiz(dataname, target, corr_limit=0.7, verbose=0, sep=",", header=0,
preds = final_list+important_cats
#######You must convert category variables into integers ###############
if len(important_cats) > 0:
train, traindict = convert_all_object_columns_to_numeric(train, "")
if not isinstance(test, str) or test is not None:
train, traindict = convert_all_object_columns_to_numeric(train, "")
if test is not None:
test, _ = convert_all_object_columns_to_numeric(test, traindict)
######## Don't move this train and y definition anywhere else ########
y = train[target]
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

setuptools.setup(
name="featurewiz",
version="0.0.12",
version="0.0.13",
author="Ram Seshadri",
author_email="[email protected]",
description="Select Best Features from your data set - any size - now with XGBoost!",
Expand Down

0 comments on commit 47b4099

Please sign in to comment.