Skip to content

Commit

Permalink
new version
Browse files Browse the repository at this point in the history
  • Loading branch information
AutoViML committed May 16, 2021
1 parent 0f30fad commit 1abc95d
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 192 deletions.
1 change: 0 additions & 1 deletion featurewiz/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
from .stacking_models import Stacking_Classifier, Blending_Regressor
from .featurewiz import EDA_binning_numeric_column_displaying_bins, FE_add_lagged_targets_by_date_category
from .featurewiz import NLP_Pipeline, FE_convert_mixed_datatypes_to_string
from .featurewiz import FE_add_time_series_features
################################################################################
if __name__ == "__main__":
module_type = 'Running'
Expand Down
32 changes: 31 additions & 1 deletion featurewiz/databunch.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,13 @@ def __init__(self,
# check X_train, y_train, X_test
if self.check_data_format(X_train):
self.X_train_source = pd.DataFrame(X_train)
self.X_train_source = remove_duplicate_cols_in_dataset(self.X_train_source)
if X_test is not None:
if self.check_data_format(X_test):
self.X_test_source = pd.DataFrame(X_test)
self.X_test_source = remove_duplicate_cols_in_dataset(self.X_test_source)


### There is a chance for an error in this - so worth watching!
if y_train is not None:
le = LabelEncoder()
Expand Down Expand Up @@ -397,6 +401,8 @@ def preproc_data(self, X_train=None,
X_test (pd.DataFrame)
"""
#### Sometimes there are duplicates in column names. You must remove them here. ###
cat_features = find_remove_duplicates(cat_features)

# concat datasets for correct processing.
df_train = X_train.copy()
Expand All @@ -406,8 +412,11 @@ def preproc_data(self, X_train=None,
test_data = None ### Set test_data to None if X_test is None
else:
test_data = X_test.copy()
test_data = remove_duplicate_cols_in_dataset(test_data)
data = copy.deepcopy(df_train)

data = remove_duplicate_cols_in_dataset(data)

# object & num features
object_features = list(data.columns[(data.dtypes == 'object') | (data.dtypes == 'category')])
num_features = list(set(data.columns) - set(cat_features) - set(object_features) - {'test'})
Expand All @@ -421,7 +430,9 @@ def preproc_data(self, X_train=None,
self.binary_features_names = []

# LabelEncode all Binary Features - leave the rest alone
for feature in data.columns:
cols = data.columns.tolist()
#### This sometimes errors because there are duplicate columns in a dataset ###
for feature in cols:
if (feature != 'test') and (data[feature].nunique(dropna=False) < 3):
data[feature] = data[feature].astype('category').cat.codes
if test_data is not None:
Expand Down Expand Up @@ -617,3 +628,22 @@ def left_subtract(l1,l2):
lst.append(i)
return lst
#################################################################################
def remove_duplicate_cols_in_dataset(df):
    """Return a copy of *df* with duplicate column names removed.

    Only the first occurrence of each duplicated column name is kept
    (pandas `columns.duplicated()` marks second-and-later occurrences).

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe; it is deep-copied so the caller's frame is untouched.

    Returns
    -------
    pd.DataFrame
        Frame with unique column names, in original order.
    """
    df = copy.deepcopy(df)
    ### boolean mask sums directly to the count of duplicate names ###
    number_duplicates = df.columns.duplicated().sum()
    if number_duplicates > 0:
        print('Detected %d duplicate columns in dataset. Removing duplicates...' %number_duplicates)
        ### keep only the first occurrence of each column name ###
        df = df.loc[:,~df.columns.duplicated()]
    return df
###########################################################################
# Removes duplicates from a list to return unique values - USED ONLY ONCE
def find_remove_duplicates(values):
    """Return the unique items of *values*, preserving first-seen order.

    Equivalent to an order-preserving set: later repeats of an item are
    dropped. Items must be hashable.
    """
    ### dict keys are unique and keep insertion order (Python 3.7+) ###
    return list(dict.fromkeys(values))
#################################################################################
Loading

0 comments on commit 1abc95d

Please sign in to comment.