Skip to content

Commit

Permalink
remove Bayesian data type discovery
Browse files Browse the repository at this point in the history
  • Loading branch information
Ji-Zhang committed Nov 28, 2019
1 parent e6213bb commit 20630ab
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 89 deletions.
165 changes: 84 additions & 81 deletions datacleanbot/dataclean.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from scipy.io import savemat
from datacleanbot.bayesian.bin import abda
# from datacleanbot.bayesian.bin import abda

from sklearn.preprocessing import Imputer
from fancyimpute import KNN, IterativeImputer, MatrixFactorization
Expand Down Expand Up @@ -288,83 +288,84 @@ def discover_type_heuristic(data):
#print ("Result:",inferredType) #For debugging purposes
return result

def generate_mat(Xy, extra_cardinality=1):
    """Convert data to the .mat format required by the Bayesian model.

    Writes ``data.mat`` containing the data matrix ``X``, the per-column
    meta types ``T`` and the per-column cardinalities ``R``.
    """
    data = Xy
    simple_types = discover_type_heuristic(data)
    # Map simple (dtype-level) types onto the Bayesian model's meta types:
    #   1: real (w positive: all real | positive | interval)
    #   2: real (w/o positive: all real | interval)
    #   3: binary data
    #   4: discrete (non-binary: categorical | ordinal | count)
    # NOTE: the implemented Bayesian version (by Isabel) only distinguishes
    # real, positive real, categorical and count, so binary columns are
    # mapped to meta type 4 (discrete) instead of 3. This may change if an
    # extended version is implemented.
    meta_types = []
    for col, simple_type in enumerate(simple_types):
        if simple_type == "bool":
            meta_types.append(4)  # may change in the future
        elif simple_type in ("int64", "float64"):
            column = data[:, col]
            n_unique = len(np.unique(column))
            # Few distinct values relative to sample size -> treat as discrete.
            if n_unique < 0.02 * len(column) and n_unique < 50:
                meta_types.append(4)
            elif (column > 0).all():
                meta_types.append(1)  # strictly positive reals
            else:
                meta_types.append(2)
        else:
            meta_types.append(1)
    # Cardinality: max observed value plus extra_cardinality for discrete
    # features, 1 for everything else.
    discrete_cardinality = []
    for col, meta_type in enumerate(meta_types):
        if meta_type == 4:
            discrete_cardinality.append(int(np.max(data[:, col])) + extra_cardinality)
        else:
            discrete_cardinality.append(1)
    data_dict = {'X': data,
                 'T': np.asarray(meta_types),
                 'R': np.asarray(discrete_cardinality)}
    savemat('data.mat', data_dict, oned_as='row')

def discover_type_bayesian(Xy):
    """Infer data types for each feature using the Bayesian model.

    Retrieve the key with the highest value from 'weights' of the output of
    the Bayesian model. The retrieved key is the statistical type of the
    corresponding feature.

    Parameters
    ----------
    Xy : numpy array
        Xy can only be numeric in order to run the Bayesian model.

    Returns
    -------
    result : list
        List of data types.
    """
    statistical_types = []
    # Serialize Xy (plus inferred meta types/cardinalities) to data.mat,
    # the input format abda.main expects.
    generate_mat(Xy)
    # with HiddenPrints():
    # NOTE(review): abda.main is run inside NoStdStreams, presumably to
    # suppress its console output — original indentation was lost, confirm
    # the call belongs inside this context manager.
    with NoStdStreams():
        print("This will not be printed")
        weights = abda.main(seed=1337, dataset='data.mat', exp_id=None, args_output='./exp/temp/', args_miss=None, verbose=1,
                            args_col_split_threshold=0.8, args_min_inst_slice=500, args_leaf_type='pm',
                            args_type_param_map='spicky-prior-1', args_param_init='default', args_param_weight_init='uniform',
                            args_n_iters=5, args_burn_in=4000, args_w_unif_prior=100, args_save_samples=1,
                            args_ll_history=1, args_omega_prior='uniform', args_plot_iter=10, args_omega_unif_prior=10,
                            args_leaf_omega_unif_prior=0.1, args_cat_unif_prior=1);
    # For each feature, the statistical type is the key with the largest
    # weight in that feature's weight dict.
    for i in range(len(weights)):
        # print(max(weights[i], key=weights[i].get))
        statistical_types.append(str(max(weights[i], key=weights[i].get)))
    return statistical_types

def discover_types(Xy):
"""Discover types for numpy array.
Expand All @@ -387,11 +388,11 @@ def discover_types(Xy):
display(HTML('<h2>Discover Data Types</h2>'))
display(HTML('<h4>Simple Data Types</h4>'))
print(discover_type_heuristic(Xy))
display(HTML('<h4>Statistical Data Types</h4>'))
try:
print(discover_type_bayesian(Xy))
except:
print("Failed to run the Bayesian model.")
# display(HTML('<h4>Statistical Data Types</h4>'))
# try:
# print(discover_type_bayesian(Xy))
# except:
# print("Failed to run the Bayesian model.")
################################

##### Duplicated Rows ######
Expand Down Expand Up @@ -848,6 +849,7 @@ def clean_missing(df,features):
print("")
print("Choose the missing mechanism [a/b/c/d]:")
print("a.MCAR b.MAR c.MNAR d.Skip")
time.sleep(0.05)
ans = input()
if ans == 'a':
recommend = deal_mcar(df_preprocessed)
Expand Down Expand Up @@ -1013,6 +1015,7 @@ def compute_metafeatures(X, y):
if len(np.unique(y)) > 100 or len(np.unique(y)) > 0.1*y.shape[0]:
print("regression")
print("meta features cannot be extracted as the target is not categorical")
metafeatures = None
# if classification
else:
# print("classification")
Expand Down
10 changes: 2 additions & 8 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="datacleanbot",
version="0.4",
version="0.5",
author="Ji Zhang",
author_email="",
description="automated data cleaning tool",
Expand All @@ -21,13 +21,7 @@
'seaborn>=0.8',
'matplotlib>=2.2.2',
'missingno>=0.4.0',
'fancyimpute',
'numba>=0.27',
'pystruct>=0.2.4',
'cvxopt>=1.1.9',
'pymc3>=3.4',
'pyro-ppl>=0.2',
'rpy2==2.9.4'],
'fancyimpute'],
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
Expand Down

0 comments on commit 20630ab

Please sign in to comment.