Skip to content

Commit

Permalink
Merge pull request #8 from bruckwubete/bugfix/issue_7
Browse files Browse the repository at this point in the history
fixing package name
  • Loading branch information
elisemercury authored Jul 29, 2022
2 parents 96f0430 + 5825d7b commit b9434cf
Show file tree
Hide file tree
Showing 5 changed files with 726 additions and 0 deletions.
2 changes: 2 additions & 0 deletions AutoClean/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from AutoClean.version import __version__
from AutoClean.autoclean import AutoClean
20 changes: 20 additions & 0 deletions AutoClean/autoclean.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
03-07-2022 20:50:11.82 - INFO - Started validation of input parameters...
03-07-2022 20:50:11.82 - INFO - Completed validation of input parameters
03-07-2022 20:50:11.82 - INFO - Skipped handling of duplicates
03-07-2022 20:50:11.82 - INFO - Started handling of missing values...
03-07-2022 20:50:11.82 - INFO - Found a total of 183 missing value(s)
03-07-2022 20:50:11.82 - INFO - Started handling of CATEGORICAL missing values... Method: "DELETE"
03-07-2022 20:50:11.83 - DEBUG - Deletion of 95 CATEGORICAL missing value(s) succeeded
03-07-2022 20:50:11.83 - INFO - Completed handling of missing values in 0.005795 seconds
03-07-2022 20:50:11.83 - INFO - Skipped handling of outliers
03-07-2022 20:50:11.83 - INFO - Skipped datetime feature conversion
03-07-2022 20:50:11.83 - INFO - Skipped encoding of categorical features
03-07-2022 20:50:11.83 - INFO - Started feature type conversion...
03-07-2022 20:50:11.83 - DEBUG - Conversion to type INT succeeded for feature "cylinders"
03-07-2022 20:50:11.83 - DEBUG - Conversion to type FLOAT succeeded for feature "displacement"
03-07-2022 20:50:11.83 - DEBUG - Conversion to type INT succeeded for feature "weight"
03-07-2022 20:50:11.83 - DEBUG - Conversion to type FLOAT succeeded for feature "acceleration"
03-07-2022 20:50:11.83 - DEBUG - Conversion to type INT succeeded for feature "model.year"
03-07-2022 20:50:11.83 - DEBUG - Conversion to type INT succeeded for feature "origin"
03-07-2022 20:50:11.83 - INFO - Completed feature type conversion for 6 feature(s) in 0.006115 seconds
03-07-2022 20:50:11.83 - INFO - AutoClean process completed in 0.014983 seconds
167 changes: 167 additions & 0 deletions AutoClean/autoclean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
# https://github.com/elisemercury/AutoClean

import os
import sys
from timeit import default_timer as timer
import pandas as pd
from loguru import logger
from AutoClean.modules import *
from AutoClean.version import __version__

class AutoClean:
    '''
    Automated cleaning pipeline for a Pandas dataframe.

    Instantiating the class validates the parameters, runs every enabled
    cleaning step on a copy of the input and stores the cleaned dataframe
    in the 'output' attribute.
    '''

    def __init__(self, input_data, mode='auto', duplicates=False, missing_num=False, missing_categ=False, encode_categ=False, extract_datetime=False, outliers=False, outlier_param=1.5, logfile=True, verbose=False):
        '''
        input_data (dataframe)..........Pandas dataframe
        mode (str)......................define in which mode you want to run AutoClean
                                        'auto' = sets all parameters to 'auto' and lets AutoClean do the data cleaning automatically
                                        'manual' = lets you choose which parameters/cleaning steps you want to perform
        duplicates (str)................define if duplicates in the data should be handled
                                        duplicates are rows where all features are identical
                                        'auto' = automated handling, deletes all copies of duplicates except one
                                        False = skips this step
        missing_num (str)...............define how NUMERICAL missing values are handled
                                        'auto' = automated handling
                                        'linreg' = uses Linear Regression for predicting missing values
                                        'knn' = uses K-NN algorithm for imputation
                                        'mean', 'median' or 'most_frequent' = uses mean/median/mode imputation
                                        'delete' = deletes observations with missing values
                                        False = skips this step
        missing_categ (str).............define how CATEGORICAL missing values are handled
                                        'auto' = automated handling
                                        'logreg' = uses Logistic Regression for predicting missing values
                                        'knn' = uses K-NN algorithm for imputation
                                        'most_frequent' = uses mode imputation
                                        'delete' = deletes observations with missing values
                                        False = skips this step
        encode_categ (list).............encode CATEGORICAL features, takes a list as input
                                        ['auto'] = automated encoding
                                        ['onehot'] = one-hot-encode all CATEGORICAL features
                                        ['label'] = label-encode all categ. features
                                        to encode only specific features add the column name or index: ['onehot', ['col1', 2]]
                                        False = skips this step
        extract_datetime (str)..........define whether DATETIME type features should be extracted into separate features
                                        to define granularity set to 'D'= day, 'M'= month, 'Y'= year, 'h'= hour, 'm'= minute or 's'= second
                                        False = skips this step
        outliers (str)..................define how outliers are handled
                                        'winz' = replaces outliers through winsorization
                                        'delete' = deletes observations containing outliers
                                        observations are considered outliers if they are outside the lower and upper bound [Q1-1.5*IQR, Q3+1.5*IQR], where IQR is the interquartile range
                                        to set a custom multiplier use the 'outlier_param' parameter
                                        False = skips this step
        outlier_param (int, float)......define the multiplier for the outlier bounds
        logfile (bool)..................define whether to create a logfile during the AutoClean process
                                        logfile will be saved in working directory as "autoclean.log"
        verbose (bool)..................define whether AutoClean logs will be printed in console
        OUTPUT (dataframe)..............a cleaned Pandas dataframe, accessible through the 'output' instance

        Raises ValueError if any parameter fails validation.

        NOTE(review): 'linreg' and 'logreg' are documented above but are currently
        rejected by _validate_params; the bare string 'auto' is accepted for
        'outliers', 'extract_datetime' and 'encode_categ' although it is not
        documented above - confirm the intended sets against the handling modules.
        '''
        start = timer()
        self._initialize_logger(verbose, logfile)

        # 'auto' mode overrides every individual step setting
        if mode == 'auto':
            duplicates, missing_num, missing_categ = 'auto', 'auto', 'auto'
            outliers, encode_categ, extract_datetime = 'winz', ['auto'], 's'

        self.mode = mode
        self.duplicates = duplicates
        self.missing_num = missing_num
        self.missing_categ = missing_categ
        self.outliers = outliers
        self.encode_categ = encode_categ
        self.extract_datetime = extract_datetime
        self.outlier_param = outlier_param

        # validate BEFORE copying, so that a non-dataframe input is reported as
        # the ValueError below instead of failing inside '.copy()'
        self._validate_params(input_data, verbose, logfile)

        # work on a copy so that the caller's dataframe is never modified
        output_data = input_data.copy()

        # start the autoclean process
        self.output = self._clean_data(output_data, input_data)

        end = timer()
        elapsed = round(end - start, 6)
        logger.info('AutoClean process completed in {} seconds', elapsed)

        # without a console logger the user would otherwise get no feedback
        if not verbose:
            print('AutoClean process completed in', elapsed, 'seconds')
        if logfile:
            print('Logfile saved to:', os.path.join(os.getcwd(), 'autoclean.log'))

    @staticmethod
    def help():
        '''
        Prints some basic usage information.
        Declared as a static method so that it can be called on the class
        as well as on an instance.
        '''
        help_msg = f"""
**** Welcome to AutoClean! {__version__} ****
Run AutoClean by selecting your input data (Pandas dataframe) and setting the 'mode' parameter to:
\t* 'auto' (default) or
\t* 'manual'
If set to 'auto', AutoClean will start the automated cleaning process.
If set to 'manual', you can customize your AutoClean pipeline by defining some of the optional parameters:
\tduplicates, missing_num, missing_categ, outliers, encode_categ, extract_datetime
📋 For detailed documentation and usage guide, please visit the official GitHub Repo: https://github.com/elisemercury/AutoClean
"""
        print(help_msg)

    def _initialize_logger(self, verbose, logfile):
        '''
        Initializes the logging process: removes all previously registered
        handlers, then logs to stderr if 'verbose' is True and to the file
        "autoclean.log" (opened in write mode) if 'logfile' is True.
        '''
        log_format = '{time:DD-MM-YYYY HH:mm:ss.SS} - {level} - {message}'
        logger.remove()
        # only the literal True enables a sink; any non-bool value is rejected
        # afterwards by _validate_params
        if verbose is True:
            logger.add(sys.stderr, format=log_format)
        if logfile is True:
            logger.add('autoclean.log', mode='w', format=log_format)

    @staticmethod
    def _is_valid_encode_categ(encode_categ):
        '''
        Returns True if 'encode_categ' is an accepted value:
            * 'auto' or False, or
            * a list of one or two elements whose first element is 'auto',
              'onehot' or 'label' and whose optional second element is a
              list (of column names/indices)
        '''
        if not isinstance(encode_categ, list):
            return encode_categ in ['auto', False]
        # an empty list or a list with more than two elements is never valid
        if not 1 <= len(encode_categ) <= 2:
            return False
        if encode_categ[0] not in ('auto', 'onehot', 'label'):
            return False
        return len(encode_categ) == 1 or isinstance(encode_categ[1], list)

    def _validate_params(self, df, verbose, logfile):
        '''
        Validates the input parameters of the autoclean process.
        Raises ValueError naming the first parameter that holds an
        unsupported value.
        '''
        logger.info('Started validation of input parameters...')

        # isinstance also accepts subclasses of the Pandas dataframe
        if not isinstance(df, pd.DataFrame):
            raise ValueError('Invalid value for "df" parameter.')
        if self.mode not in ['manual', 'auto']:
            AutoClean.help()
            raise ValueError('Invalid value for "mode" parameter.')
        if self.duplicates not in [False, 'auto']:
            raise ValueError('Invalid value for "duplicates" parameter.')
        if self.missing_num not in [False, 'auto', 'knn', 'mean', 'median', 'most_frequent', 'delete']:
            raise ValueError('Invalid value for "missing_num" parameter.')
        if self.missing_categ not in [False, 'auto', 'knn', 'most_frequent', 'delete']:
            raise ValueError('Invalid value for "missing_categ" parameter.')
        if self.outliers not in [False, 'auto', 'winz', 'delete']:
            raise ValueError('Invalid value for "outliers" parameter.')
        if not AutoClean._is_valid_encode_categ(self.encode_categ):
            raise ValueError('Invalid value for "encode_categ" parameter.')
        if not isinstance(self.outlier_param, (int, float)):
            raise ValueError('Invalid value for "outlier_param" parameter.')
        if self.extract_datetime not in [False, 'auto', 'D', 'M', 'Y', 'h', 'm', 's']:
            raise ValueError('Invalid value for "extract_datetime" parameter.')
        if not isinstance(verbose, bool):
            raise ValueError('Invalid value for "verbose" parameter.')
        if not isinstance(logfile, bool):
            raise ValueError('Invalid value for "logfile" parameter.')

        logger.info('Completed validation of input parameters')

    def _clean_data(self, df, input_data):
        '''
        Runs the autoclean process: resets the index of 'df' and passes it
        through the handlers in a fixed order. 'input_data' (the original
        dataframe) is handed to the final rounding step only.
        Returns the resulting dataframe.
        '''
        df = df.reset_index(drop=True)
        df = Duplicates.handle(self, df)
        df = MissingValues.handle(self, df)
        df = Outliers.handle(self, df)
        df = Adjust.convert_datetime(self, df)
        df = EncodeCateg.handle(self, df)
        df = Adjust.round_values(self, df, input_data)
        return df
Loading

0 comments on commit b9434cf

Please sign in to comment.