-
Notifications
You must be signed in to change notification settings - Fork 54
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #8 from bruckwubete/bugfix/issue_7
fixing package name
- Loading branch information
Showing
5 changed files
with
726 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from AutoClean.version import __version__ | ||
from AutoClean.autoclean import AutoClean |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
03-07-2022 20:50:11.82 - INFO - Started validation of input parameters... | ||
03-07-2022 20:50:11.82 - INFO - Completed validation of input parameters | ||
03-07-2022 20:50:11.82 - INFO - Skipped handling of duplicates | ||
03-07-2022 20:50:11.82 - INFO - Started handling of missing values... | ||
03-07-2022 20:50:11.82 - INFO - Found a total of 183 missing value(s) | ||
03-07-2022 20:50:11.82 - INFO - Started handling of CATEGORICAL missing values... Method: "DELETE" | ||
03-07-2022 20:50:11.83 - DEBUG - Deletion of 95 CATEGORICAL missing value(s) succeeded | ||
03-07-2022 20:50:11.83 - INFO - Completed handling of missing values in 0.005795 seconds | ||
03-07-2022 20:50:11.83 - INFO - Skipped handling of outliers | ||
03-07-2022 20:50:11.83 - INFO - Skipped datetime feature conversion | ||
03-07-2022 20:50:11.83 - INFO - Skipped encoding of categorical features | ||
03-07-2022 20:50:11.83 - INFO - Started feature type conversion... | ||
03-07-2022 20:50:11.83 - DEBUG - Conversion to type INT succeeded for feature "cylinders" | ||
03-07-2022 20:50:11.83 - DEBUG - Conversion to type FLOAT succeeded for feature "displacement" | ||
03-07-2022 20:50:11.83 - DEBUG - Conversion to type INT succeeded for feature "weight" | ||
03-07-2022 20:50:11.83 - DEBUG - Conversion to type FLOAT succeeded for feature "acceleration" | ||
03-07-2022 20:50:11.83 - DEBUG - Conversion to type INT succeeded for feature "model.year" | ||
03-07-2022 20:50:11.83 - DEBUG - Conversion to type INT succeeded for feature "origin" | ||
03-07-2022 20:50:11.83 - INFO - Completed feature type conversion for 6 feature(s) in 0.006115 seconds | ||
03-07-2022 20:50:11.83 - INFO - AutoClean process completed in 0.014983 seconds |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,167 @@ | ||
# https://github.com/elisemercury/AutoClean | ||
|
||
import os | ||
import sys | ||
from timeit import default_timer as timer | ||
import pandas as pd | ||
from loguru import logger | ||
from AutoClean.modules import * | ||
from AutoClean.version import __version__ | ||
|
||
class AutoClean:
    '''
    Automated cleaning pipeline for a Pandas dataframe.

    Creating an instance runs the whole pipeline once: the parameters are validated, the
    cleaning steps are applied in a fixed order (see '_clean_data') and the result is stored
    in the 'output' attribute.
    '''

    # loguru message format shared by the console sink and the logfile sink
    _LOG_FORMAT = '{time:DD-MM-YYYY HH:mm:ss.SS} - {level} - {message}'

    def __init__(self, input_data, mode='auto', duplicates=False, missing_num=False, missing_categ=False, encode_categ=False, extract_datetime=False, outliers=False, outlier_param=1.5, logfile=True, verbose=False):
        '''
        input_data (dataframe)..........Pandas dataframe
        mode (str)......................define in which mode you want to run AutoClean
                                        'auto' = sets all parameters to 'auto' and lets AutoClean do the data cleaning automatically
                                        'manual' = lets you choose which parameters/cleaning steps you want to perform
        duplicates (str)................define if duplicates in the data should be handled
                                        duplicates are rows where all features are identical
                                        'auto' = automated handling, deletes all copies of duplicates except one
                                        False = skips this step
        missing_num (str)...............define how NUMERICAL missing values are handled
                                        'auto' = automated handling
                                        'linreg' = uses Linear Regression for predicting missing values
                                                   (NOTE: currently rejected by the parameter validation)
                                        'knn' = uses K-NN algorithm for imputation
                                        'mean','median' or 'most_frequent' = uses mean/median/mode imputation
                                        'delete' = deletes observations with missing values
                                        False = skips this step
        missing_categ (str).............define how CATEGORICAL missing values are handled
                                        'auto' = automated handling
                                        'logreg' = uses Logistic Regression for predicting missing values
                                                   (NOTE: currently rejected by the parameter validation)
                                        'knn' = uses K-NN algorithm for imputation
                                        'most_frequent' = uses mode imputation
                                        'delete' = deletes observations with missing values
                                        False = skips this step
        encode_categ (list).............encode CATEGORICAL features, takes a list as input
                                        ['auto'] = automated encoding
                                        ['onehot'] = one-hot-encode all CATEGORICAL features
                                        ['label'] = label-encode all categ. features
                                        to encode only specific features add the column name or index: ['onehot', ['col1', 2]]
                                        False = skips this step
        extract_datetime (str)..........define whether DATETIME type features should be extracted into separate features
                                        to define granularity set to 'D'= day, 'M'= month, 'Y'= year, 'h'= hour, 'm'= minute or 's'= second
                                        False = skips this step
        outliers (str)..................define how outliers are handled
                                        'winz' = replaces outliers through winsorization
                                        'delete' = deletes observations containing outliers
                                        observations are considered outliers if they are outside the lower and upper bound [Q1-1.5*IQR, Q3+1.5*IQR], where IQR is the interquartile range
                                        to set a custom multiplier use the 'outlier_param' parameter
                                        False = skips this step
        outlier_param (int, float)......define the multiplier for the outlier bounds
        logfile (bool)..................define whether to create a logfile during the AutoClean process
                                        logfile will be saved in working directory as "autoclean.log"
        verbose (bool)..................define whether AutoClean logs will be printed in console
        OUTPUT (dataframe)..............a cleaned Pandas dataframe, accessible through the 'output' instance attribute
        RAISES..........................ValueError if any of the parameters has an invalid value
        '''
        start = timer()
        self._initialize_logger(verbose, logfile)

        if mode == 'auto':
            # 'auto' mode overrides every cleaning parameter that was passed by the caller
            duplicates, missing_num, missing_categ = 'auto', 'auto', 'auto'
            outliers, encode_categ, extract_datetime = 'winz', ['auto'], 's'

        self.mode = mode
        self.duplicates = duplicates
        self.missing_num = missing_num
        self.missing_categ = missing_categ
        self.outliers = outliers
        self.encode_categ = encode_categ
        self.extract_datetime = extract_datetime
        self.outlier_param = outlier_param

        # validate BEFORE copying: calling .copy() on something that is not a dataframe
        # would otherwise fail with an unrelated error instead of the ValueError below
        self._validate_params(input_data, verbose, logfile)

        # the pipeline works on a copy; the original is only handed over as a reference
        output_data = input_data.copy()
        self.output = self._clean_data(output_data, input_data)

        elapsed = round(timer() - start, 6)
        logger.info('AutoClean process completed in {} seconds', elapsed)

        # in verbose mode the logger has already printed the line above to the console
        if not verbose:
            print('AutoClean process completed in', elapsed, 'seconds')
        if logfile:
            print('Logfile saved to:', os.path.join(os.getcwd(), 'autoclean.log'))

    @staticmethod
    def help():
        '''
        Print some basic usage information.

        Declared as a staticmethod so that it can be called on the class
        (AutoClean.help()) as well as on an instance.
        '''
        help_msg = '\n'.join((
            '',
            f'**** Welcome to AutoClean! {__version__} ****',
            "Run AutoClean by selecting your input data (Pandas dataframe) and setting the 'mode' parameter to:",
            "\t* 'auto' (default) or",
            "\t* 'manual'",
            "If set to 'auto', AutoClean will start the automated cleaning process.",
            "If set to 'manual', you can customize your AutoClean pipeline by defining some of the optional parameters:",
            '\tduplicates, missing_num, missing_categ, outliers, encode_categ, extract_datetime',
            '📋 For detailed documentation and usage guide, please visit the official GitHub Repo: https://github.com/elisemercury/AutoClean',
            '',
        ))
        print(help_msg)

    def _initialize_logger(self, verbose, logfile):
        '''
        Reset the loguru logger and attach the sinks that were requested.

        verbose (bool)..................True adds a console sink on sys.stderr
        logfile (bool)..................True adds a file sink "autoclean.log" (opened with mode 'w')
        '''
        # drop the handlers that are already registered on the shared loguru logger
        logger.remove()
        # strict comparison on purpose: this runs before the parameters are validated,
        # so a non-bool value must not open a sink before the ValueError is raised
        if verbose is True:
            logger.add(sys.stderr, format=self._LOG_FORMAT)
        if logfile is True:
            logger.add('autoclean.log', mode='w', format=self._LOG_FORMAT)

    @staticmethod
    def _validate_encode_categ(encode_categ):
        '''
        Validate the 'encode_categ' parameter.

        Accepted values are False, 'auto', a one-element list [method] or a two-element
        list [method, [columns]] where method is 'auto', 'onehot' or 'label'.
        Raises a ValueError for everything else.
        '''
        message = 'Invalid value for "encode_categ" parameter.'
        if not isinstance(encode_categ, list):
            if encode_categ not in ['auto', False]:
                raise ValueError(message)
            return

        # the size is checked first so that an empty list never reaches the [0] lookup
        if not 1 <= len(encode_categ) <= 2:
            raise ValueError(message)
        if encode_categ[0] not in ['auto', 'onehot', 'label']:
            raise ValueError(message)
        # the optional second element selects the features and has to be a list itself
        if len(encode_categ) == 2 and not isinstance(encode_categ[1], list):
            raise ValueError(message)

    def _validate_params(self, df, verbose, logfile):
        '''
        Validate the input parameters of the AutoClean process.

        df (dataframe)..................the data that is going to be cleaned
        verbose, logfile (bool).........the logging switches passed to the constructor
        All other parameters are read from the instance attributes set in '__init__'.
        Raises a ValueError naming the first parameter that holds an invalid value.
        '''
        logger.info('Started validation of input parameters...')

        if not isinstance(df, pd.DataFrame):
            raise ValueError('Invalid value for "df" parameter.')
        if self.mode not in ['manual', 'auto']:
            # show the usage information before rejecting an unknown mode
            AutoClean.help()
            raise ValueError('Invalid value for "mode" parameter.')
        if self.duplicates not in [False, 'auto']:
            raise ValueError('Invalid value for "duplicates" parameter.')
        # NOTE(review): the constructor docstring also lists 'linreg' (missing_num) and
        # 'logreg' (missing_categ), which are rejected here - confirm against the
        # MissingValues handler which of the two is correct
        if self.missing_num not in [False, 'auto', 'knn', 'mean', 'median', 'most_frequent', 'delete']:
            raise ValueError('Invalid value for "missing_num" parameter.')
        if self.missing_categ not in [False, 'auto', 'knn', 'most_frequent', 'delete']:
            raise ValueError('Invalid value for "missing_categ" parameter.')
        if self.outliers not in [False, 'auto', 'winz', 'delete']:
            raise ValueError('Invalid value for "outliers" parameter.')
        self._validate_encode_categ(self.encode_categ)
        if not isinstance(self.outlier_param, (int, float)):
            raise ValueError('Invalid value for "outlier_param" parameter.')
        if self.extract_datetime not in [False, 'auto', 'D', 'M', 'Y', 'h', 'm', 's']:
            raise ValueError('Invalid value for "extract_datetime" parameter.')
        if not isinstance(verbose, bool):
            raise ValueError('Invalid value for "verbose" parameter.')
        if not isinstance(logfile, bool):
            raise ValueError('Invalid value for "logfile" parameter.')

        logger.info('Completed validation of input parameters')

    def _clean_data(self, df, input_data):
        '''
        Run the cleaning steps on 'df' in their fixed order and return the result.

        df (dataframe)..................working copy of the data
        input_data (dataframe)..........the untouched original, only passed on to Adjust.round_values
        The step handlers come from AutoClean.modules and receive this instance as first
        argument (presumably to read the settings stored on it - verify in modules).
        '''
        df = df.reset_index(drop=True)
        df = Duplicates.handle(self, df)
        df = MissingValues.handle(self, df)
        df = Outliers.handle(self, df)
        df = Adjust.convert_datetime(self, df)
        df = EncodeCateg.handle(self, df)
        df = Adjust.round_values(self, df, input_data)
        return df
Oops, something went wrong.