forked from mozilla-services/socorro
-
Notifications
You must be signed in to change notification settings - Fork 0
/
hive_adi_test.py
101 lines (89 loc) · 2.95 KB
/
hive_adi_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python
import codecs
import datetime
import optparse
import os
import pyhs2
import tempfile
import unicodedata
import urllib2
# Example command-line usage:
# $ . /etc/socorro/socorrorc
# $ $PYTHON hive_adi_test.py -d '2015-01-21' -s peach-gw.peach.metrics.scl3.mozilla.com -o /tmp/output_adi.txt
def main():
    """Dump one day of aggregated blocklist request counts from Hive.

    Command-line options:
      --target-date / -d      date compared against the ``ds`` column, in
                              YYYY-MM-DD form; defaults to today.
      --host / -s             Hive server host; defaults to ``localhost``.
      --user / -u             Hive user name; defaults to ``socorro``.
      --output-filename / -o  file to write; defaults to a dated file in
                              the system temporary directory.

    The query groups ``v2_raw_logs`` rows for ``/blocklist/3/`` requests by
    several ``/``-separated components of ``request_url`` and counts them.
    Each result row is written to the output file (UTF-8) as one
    tab-separated line. Rows containing a NULL column are skipped.

    Exits with status 2 (via ``OptionParser.error``) when the target date is
    not a valid YYYY-MM-DD date.
    """
    storage_date = datetime.date.today().isoformat()
    # Default to a dated temp file for output.
    raw_adi_logs_pathname = os.path.join(
        tempfile.gettempdir(),
        "%s.raw_adi_logs.TEMPORARY%s" % (
            storage_date,
            '.txt'
        )
    )

    p = optparse.OptionParser()
    p.add_option('--target-date', '-d', default=storage_date)
    p.add_option('--host', '-s', default='localhost')
    p.add_option('--user', '-u', default='socorro')
    p.add_option('--output-filename', '-o', default=raw_adi_logs_pathname)
    options, _ = p.parse_args()

    # The date is spliced into the HQL text below, so reject anything that
    # is not a plain date before it can break (or alter) the query.
    try:
        datetime.datetime.strptime(options.target_date, '%Y-%m-%d')
    except ValueError:
        p.error(
            "--target-date must be a YYYY-MM-DD date, got %r"
            % (options.target_date,)
        )

    # NOTE: literal percent signs are doubled because the text goes through
    # %-formatting to insert the target date.
    query = """
        select
            ds,
            split(request_url,'/')[5],
            split(split(request_url,'/')[10], '%%20')[0],
            split(split(request_url,'/')[10], '%%20')[1],
            split(request_url,'/')[4],
            split(request_url,'/')[6],
            split(request_url,'/')[9],
            split(request_url,'/')[3],
            count(*)
        FROM v2_raw_logs
        WHERE
            (domain='addons.mozilla.org' OR domain='blocklist.addons.mozilla.org')
            and http_status_code = '200'
            and request_url like '/blocklist/3/%%'
            and ds='%s'
        GROUP BY
            ds,
            split(request_url,'/')[5],
            split(split(request_url,'/')[10], '%%20')[0],
            split(split(request_url,'/')[10], '%%20')[1],
            split(request_url,'/')[4],
            split(request_url,'/')[6],
            split(request_url,'/')[9],
            split(request_url,'/')[3]
    """ % options.target_date

    def remove_control_characters(s):
        """Return ``s`` as unicode with every category-C character removed."""
        if isinstance(s, str):
            s = unicode(s, 'utf-8', errors='replace')
        return ''.join(c for c in s if unicodedata.category(c)[0] != "C")

    def format_value(v):
        """Render one result column as text for the tab-separated output."""
        if not isinstance(v, basestring):
            return str(v)
        cleaned = remove_control_characters(urllib2.unquote(v))
        # Backslashes are doubled; presumably the consumer of this file
        # treats backslash as an escape character -- TODO confirm.
        return cleaned.replace('\\', '\\\\')

    # Both the connection and the cursor are context managers; using them
    # guarantees the Hive session is closed even if the query or a write
    # fails part-way through.
    with pyhs2.connect(
        host=options.host,
        port=10000,
        authMechanism='PLAIN',
        user=options.user,
        password='ignored',
        database='default',
        # the underlying TSocket setTimeout() wants milliseconds
        timeout=30 * 60 * 1000
    ) as hive:
        with codecs.open(options.output_filename, 'w', 'utf-8') as f:
            with hive.cursor() as cur:
                cur.execute(query)
                for row in cur:
                    # Skip rows where any column came back NULL.
                    if None in row:
                        continue
                    f.write("\t".join(format_value(v) for v in row))
                    f.write("\n")
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()