-
Notifications
You must be signed in to change notification settings - Fork 32
/
Copy pathpreprocessing.py
144 lines (113 loc) · 4.22 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import sys
import csv
import gzip
import datetime
import re
import hashlib
# ==============================================
# PROJECT CONFIGURATIONS
# ==============================================
PROJECTS = {
"tiramisu": {
"name": "tiramisu",
"files": "dbp*postgresql-*.anonymized.gz",
"mysql": False,
"query_index": 3,
"param_index": 4,
"conn_pos": 1,
"time_stamp_format": "%Y-%m-%d %H:%M:%S",
"schema": "../mysql/combinedTiramisuSchema.sql",
},
"admission": {
"name": "admission",
"files": "magneto.log.*.anonymized.gz",
"mysql": True,
"type_index": 3,
"query_index": 4,
"conn_pos": 2,
"time_stamp_format": "%Y-%m-%d %H:%M:%S",
'schema': "../mysql/gradAdmissions2009New.sql",
},
"oli": {
"name": "oli",
"files": "db*logfile*.anonymized.gz",
"mysql": True,
"type_index": 2,
"query_index": 3,
"conn_pos": 1,
"time_stamp_format": "%y%m%d %H:%M:%S",
'schema': "../mysql/olischema.sql",
}
}
STATEMENTS = ['select', 'SELECT', 'INSERT', 'insert', 'UPDATE', 'update', 'delete', 'DELETE']
SALT = "I fucking hate anonymizing queries"
SALT = SALT.encode('utf-8')
def GetEnumDict(schema_file):
enum_dict = dict()
sql_schema = open(schema_file, 'r')
for line in sql_schema:
enum_match = re.search("(ENUM|enum)\((.*)\)", line)
#print(line)
if enum_match is not None:
enums = enum_match.group(2)
enums = re.split(", | |,", enums)
for enum in enums:
data_length = str(len(enum) - 2)
cleaned = hashlib.md5(SALT +
enum.encode("utf-8")).hexdigest()
clean_enum = "'" + (data_length + "\\" + str(cleaned)) + "'"
enum_dict[clean_enum] = enum
return enum_dict
def preprocess(config, path, num_logs = None):
# input: string of path to csv file
# output: prints lines consisting of timestamp and query (comma separated)
enum_dict = GetEnumDict(config['schema'])
processed_queries = 0
f = gzip.open(path, mode='rt')
reader = csv.reader(f, delimiter=',')
for query_info in reader:
processed_queries += 1
if (not num_logs is None) and processed_queries > num_logs:
break
if config['name'] == 'tiramisu':
time_stamp = query_info[0]
time_stamp = time_stamp[: -8] # remove milliseconds and the time zone
else:
if query_info[config['type_index']] != 'Query': # skip if not a query
continue
# create timestamp
if config['name'] == 'admission':
day = query_info[0]
time = query_info[1].split(".")[0] # removes the milliseconds
time_stamp = day + " " + time
if config['name'] == 'oli':
time_stamp = query_info[0]
if time_stamp[7] == ' ':
time_stamp = time_stamp[0: 7] + '0' + time_stamp[8: -1]
#IF
time_stamp = datetime.datetime.strptime(
time_stamp, config['time_stamp_format'])
time_stamp = time_stamp.replace(second=0) # accurate to the minute
# Format query
query = query_info[config['query_index']]
for stmt in STATEMENTS:
idx = query.find(stmt)
if idx >= 0:
break
if idx < 0:
continue
# put back all the params for unnamed prepared statements...
# this is nasty...
if (not config['mysql']) and "execute" in query:
params = query_info[config['param_index']]
params = re.findall("'.+?'", params)
for i, param in reversed(list(enumerate(params))):
query = query.replace("${}".format(i + 1), param)
query = query[idx:]
if query[-1] != ";":
query += ";"
for clean_enum in enum_dict:
query = query.replace(clean_enum, enum_dict[clean_enum])
print(str(time_stamp) + ',' + query_info[config['conn_pos']] + ',' + query)
if __name__ == '__main__':
preprocess(PROJECTS[sys.argv[1]], sys.argv[2], 1100000000)