forked from HoloClean/holoclean
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy patheval.py
168 lines (149 loc) · 7.26 KB
/
eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import logging
import time
import os
import pandas as pd
from string import Template
from dataset import AuxTables
from dataset.table import Table, Source
errors_template = Template('SELECT count(*) '\
'FROM $init_table as t1, $grdt_table as t2 '\
'WHERE t1._tid_ = t2._tid_ '\
'AND t2._attribute_ = \'$attr\' '\
'AND t1.$attr != t2._value_')
correct_repairs_template = Template('SELECT COUNT(*) FROM'\
'(SELECT t2._tid_, t2._attribute_, t2._value_ '\
'FROM $init_table as t1, $grdt_table as t2 '\
'WHERE t1._tid_ = t2._tid_ '\
'AND t2._attribute_ = \'$attr\' '\
'AND t1.$attr != t2._value_ ) as errors, $inf_dom as repairs '\
'WHERE errors._tid_ = repairs._tid_ '\
'AND errors._attribute_ = repairs.attribute '\
'AND errors._value_ = repairs.rv_value')
class EvalEngine:
def __init__(self, env, dataset):
self.env = env
self.ds = dataset
def load_data(self, name, f_path, f_name, get_tid, get_attr, get_val, na_values=None):
tic = time.clock()
try:
raw_data = pd.read_csv(os.path.join(f_path,f_name), na_values=na_values)
raw_data.fillna('_nan_',inplace=True)
raw_data['_tid_'] = raw_data.apply(get_tid, axis=1)
raw_data['_attribute_'] = raw_data.apply(get_attr, axis=1)
raw_data['_value_'] = raw_data.apply(get_val, axis=1)
raw_data = raw_data[['_tid_', '_attribute_', '_value_']]
# Normalize string to lower-case and strip whitespaces.
raw_data['_attribute_'] = raw_data['_attribute_'].apply(lambda x: x.lower())
raw_data['_value_'] = raw_data['_value_'].apply(lambda x: x.strip())
self.clean_data = Table(name, Source.DF, raw_data)
self.clean_data.store_to_db(self.ds.engine.engine)
self.clean_data.create_db_index(self.ds.engine, ['_tid_'])
self.clean_data.create_db_index(self.ds.engine, ['_attribute_'])
status = 'DONE Loading '+f_name
except Exception:
logging.error('load_data for table %s', name)
raise
toc = time.clock()
load_time = toc - tic
return status, load_time
def evaluate_repairs(self):
self.compute_total_repairs()
self.compute_total_repairs_grdt()
self.compute_total_errors()
self.compute_detected_errors()
self.compute_correct_repairs()
prec = self.compute_precision()
rec = self.compute_recall()
rep_recall = self.compute_repairing_recall()
f1 = self.compute_f1()
rep_f1 = self.compute_repairing_f1()
return prec, rec, rep_recall, f1, rep_f1
def eval_report(self):
tic = time.clock()
prec, rec, rep_recall, f1, rep_f1 = self.evaluate_repairs()
report = "Precision = %.2f, Recall = %.2f, Repairing Recall = %.2f, F1 = %.2f, Repairing F1 = %.2f, Detected Errors = %d, Total Errors = %d, Correct Repairs = %d, Total Repairs = %d, Total Repairs (Grdth present) = %d" % (
prec, rec, rep_recall, f1, rep_f1, self.detected_errors, self.total_errors, self.correct_repairs, self.total_repairs, self.total_repairs_grdt)
toc = time.clock()
report_time = toc - tic
return report, report_time
def compute_total_repairs(self):
query = "SELECT count(*) FROM " \
"(SELECT _vid_ " \
"FROM %s as t1, %s as t2 " \
"WHERE t1._tid_ = t2._tid_ " \
"AND t1.attribute = t2.attribute " \
"AND t1.init_value != t2.rv_value) AS t"\
%(AuxTables.cell_domain.name, AuxTables.inf_values_dom.name)
res = self.ds.engine.execute_query(query)
self.total_repairs = float(res[0][0])
def compute_total_repairs_grdt(self):
query = "SELECT count(*) FROM " \
"(SELECT _vid_ " \
"FROM %s as t1, %s as t2, %s as t3 " \
"WHERE t1._tid_ = t2._tid_ " \
"AND t1.attribute = t2.attribute " \
"AND t1.init_value != t2.rv_value " \
"AND t1._tid_ = t3._tid_ " \
"AND t1.attribute = t3._attribute_) AS t"\
%(AuxTables.cell_domain.name, AuxTables.inf_values_dom.name, self.clean_data.name)
res = self.ds.engine.execute_query(query)
self.total_repairs_grdt = float(res[0][0])
def compute_total_errors(self):
queries = []
total_errors = 0.0
for attr in self.ds.get_attributes():
query = errors_template.substitute(init_table=self.ds.raw_data.name, grdt_table=self.clean_data.name,
attr=attr)
queries.append(query)
results = self.ds.engine.execute_queries(queries)
for res in results:
total_errors += float(res[0][0])
self.total_errors = total_errors
def compute_total_errors_grdt(self):
queries = []
total_errors = 0.0
for attr in self.ds.get_attributes():
query = errors_template.substitute(init_table=self.ds.raw_data.name, grdt_table=self.clean_data.name,
attr=attr)
queries.append(query)
results = self.ds.engine.execute_queries(queries)
for res in results:
total_errors += float(res[0][0])
self.total_errors = total_errors
def compute_detected_errors(self):
query = "SELECT count(*) FROM " \
"(SELECT _vid_ " \
"FROM %s as t1, %s as t2, %s as t3 " \
"WHERE t1._tid_ = t2._tid_ AND t1._cid_ = t3._cid_ " \
"AND t1.attribute = t2._attribute_ " \
"AND t1.init_value != t2._value_) AS t" \
% (AuxTables.cell_domain.name, self.clean_data.name, AuxTables.dk_cells.name)
res = self.ds.engine.execute_query(query)
self.detected_errors = float(res[0][0])
def compute_correct_repairs(self):
queries = []
correct_repairs = 0.0
for attr in self.ds.get_attributes():
query = correct_repairs_template.substitute(init_table=self.ds.raw_data.name, grdt_table=self.clean_data.name,
attr=attr, inf_dom=AuxTables.inf_values_dom.name)
queries.append(query)
results = self.ds.engine.execute_queries(queries)
for res in results:
correct_repairs += float(res[0][0])
self.correct_repairs = correct_repairs
def compute_recall(self):
return self.correct_repairs / self.total_errors
def compute_repairing_recall(self):
return self.correct_repairs / self.detected_errors
def compute_precision(self):
return self.correct_repairs / self.total_repairs_grdt
def compute_f1(self):
prec = self.compute_precision()
rec = self.compute_recall()
f1 = 2*(prec*rec)/(prec+rec)
return f1
def compute_repairing_f1(self):
prec = self.compute_precision()
rec = self.compute_repairing_recall()
f1 = 2*(prec*rec)/(prec+rec)
return f1