Skip to content

Commit

Permalink
remove Bayesian data type discovery
Browse files Browse the repository at this point in the history
  • Loading branch information
Ji-Zhang committed Nov 28, 2019
1 parent e6213bb commit 20630ab
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 89 deletions.
165 changes: 84 additions & 81 deletions datacleanbot/dataclean.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from scipy.io import savemat
from datacleanbot.bayesian.bin import abda
# from datacleanbot.bayesian.bin import abda

from sklearn.preprocessing import Imputer
from fancyimpute import KNN, IterativeImputer, MatrixFactorization
Expand Down Expand Up @@ -288,83 +288,84 @@ def discover_type_heuristic(data):
#print ("Result:",inferredType) #For debugging purposes
return result

def generate_mat(Xy, extra_cardinality=1):
    """Convert data to the .mat format required by the Bayesian model.

    Writes ``data.mat`` containing the data matrix ``X``, the per-column
    meta types ``T`` and the per-column cardinalities ``R``.
    """
    data = Xy
    simple_types = discover_type_heuristic(data)
    # Map simple (dtype-level) types onto the Bayesian model's meta types:
    #   1: real (w positive: all real | positive | interval)
    #   2: real (w/o positive: all real | interval)
    #   3: binary data
    #   4: discrete (non-binary: categorical | ordinal | count)
    # NOTE: the implemented Bayesian version (by Isabel) only distinguishes
    # real, positive real, categorical and count, so binary columns are
    # mapped to meta type 4 (discrete) instead of 3. This may change if an
    # extended version is implemented.
    meta_types = []
    for col, simple_type in enumerate(simple_types):
        if simple_type == "bool":
            meta_types.append(4)  # may change in the future
        elif simple_type in ("int64", "float64"):
            column = data[:, col]
            n_unique = len(np.unique(column))
            # Few distinct values relative to sample size -> treat as discrete.
            if n_unique < 0.02 * len(column) and n_unique < 50:
                meta_types.append(4)
            elif (column > 0).all():
                meta_types.append(1)  # strictly positive reals
            else:
                meta_types.append(2)
        else:
            meta_types.append(1)
    # Cardinality: max observed value plus extra_cardinality for discrete
    # features, 1 for everything else.
    discrete_cardinality = []
    for col, meta_type in enumerate(meta_types):
        if meta_type == 4:
            discrete_cardinality.append(int(np.max(data[:, col])) + extra_cardinality)
        else:
            discrete_cardinality.append(1)
    data_dict = {'X': data,
                 'T': np.asarray(meta_types),
                 'R': np.asarray(discrete_cardinality)}
    savemat('data.mat', data_dict, oned_as='row')

def discover_type_bayesian(Xy):
    """Infer data types for each feature using the Bayesian model.

    Retrieve the key with the highest value from 'weights' of the output of
    the Bayesian model. The retrieved key is the statistical type of the
    corresponding feature.

    Parameters
    ----------
    Xy : numpy array
        Xy can only be numeric in order to run the Bayesian model.

    Returns
    -------
    result : list
        List of data types.
    """
    statistical_types = []
    # Serialize Xy (plus inferred meta types/cardinalities) to data.mat,
    # the input format abda.main expects.
    generate_mat(Xy)
    # with HiddenPrints():
    # NOTE(review): abda.main is run inside NoStdStreams, presumably to
    # suppress its console output — original indentation was lost, confirm
    # the call belongs inside this context manager.
    with NoStdStreams():
        print("This will not be printed")
        weights = abda.main(seed=1337, dataset='data.mat', exp_id=None, args_output='./exp/temp/', args_miss=None, verbose=1,
                            args_col_split_threshold=0.8, args_min_inst_slice=500, args_leaf_type='pm',
                            args_type_param_map='spicky-prior-1', args_param_init='default', args_param_weight_init='uniform',
                            args_n_iters=5, args_burn_in=4000, args_w_unif_prior=100, args_save_samples=1,
                            args_ll_history=1, args_omega_prior='uniform', args_plot_iter=10, args_omega_unif_prior=10,
                            args_leaf_omega_unif_prior=0.1, args_cat_unif_prior=1);
    # For each feature, the statistical type is the key with the largest
    # weight in that feature's weight dict.
    for i in range(len(weights)):
        # print(max(weights[i], key=weights[i].get))
        statistical_types.append(str(max(weights[i], key=weights[i].get)))
    return statistical_types

def discover_types(Xy):
"""Discover types for numpy array.
Expand All @@ -387,11 +388,11 @@ def discover_types(Xy):
display(HTML('<h2>Discover Data Types</h2>'))
display(HTML('<h4>Simple Data Types</h4>'))
print(discover_type_heuristic(Xy))
display(HTML('<h4>Statistical Data Types</h4>'))
try:
print(discover_type_bayesian(Xy))
except:
print("Failed to run the Bayesian model.")
# display(HTML('<h4>Statistical Data Types</h4>'))
# try:
# print(discover_type_bayesian(Xy))
# except:
# print("Failed to run the Bayesian model.")
################################

##### Duplicated Rows ######
Expand Down Expand Up @@ -848,6 +849,7 @@ def clean_missing(df,features):
print("")
print("Choose the missing mechanism [a/b/c/d]:")
print("a.MCAR b.MAR c.MNAR d.Skip")
time.sleep(0.05)
ans = input()
if ans == 'a':
recommend = deal_mcar(df_preprocessed)
Expand Down Expand Up @@ -1013,6 +1015,7 @@ def compute_metafeatures(X, y):
if len(np.unique(y)) > 100 or len(np.unique(y)) > 0.1*y.shape[0]:
print("regression")
print("meta features cannot be extracted as the target is not categorical")
metafeatures = None
# if classification
else:
# print("classification")
Expand Down
10 changes: 2 additions & 8 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="datacleanbot",
version="0.4",
version="0.5",
author="Ji Zhang",
author_email="",
description="automated data cleaning tool",
Expand All @@ -21,13 +21,7 @@
'seaborn>=0.8',
'matplotlib>=2.2.2',
'missingno>=0.4.0',
'fancyimpute',
'numba>=0.27',
'pystruct>=0.2.4',
'cvxopt>=1.1.9',
'pymc3>=3.4',
'pyro-ppl>=0.2',
'rpy2==2.9.4'],
'fancyimpute'],
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
Expand Down

0 comments on commit 20630ab

Please sign in to comment.