# analytics.py
import argparse
import subprocess

from fabric import Connection

# Analytics modules
from _Apr import *
from _Curl import *
from _Memcached import *
from _Redis import *
from _Zeromq import *
# from _Lighttpd import *
from _Lighttpd2 import *
# from _Beanstalkd import *
from _Git import *
# from _Diffutils import *
from _Binutils import *
from _BinutilsGdb import *
# from _Findutils import *
from _Dovecot import *
from _Squid import *
from _Vim import *

# Flow of control:
# Analytics() sets up a cycle of containers using Container() + Subclass(Container)
# Subclass() runs the tests and stores the results in a Collector()
# Collector() is passed to DataHandler(), which dumps the data to a CSV output file
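#
# Illustrative driver (a sketch, assuming the benchmark classes imported above):
#   a = Analytics.run_last(Redis, 'baseline', 100)  # the 100 most recent commits
#   a.go('Redis', 'Redis')                          # collects into data/Redis/Redis.csv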
""" cleaning functions to clean old/running containers """


def clean_r(c):
    """ Stop all running containers. """
    # Connection.local() runs the command on the local machine; 'docker ps -q'
    # prints bare container IDs, so the table header is not piped into
    # 'docker stop', and 'xargs -r' skips the command when there is no input
    c.local("docker ps -q | xargs -r docker stop")


def clean_s(c):
    """ Delete all containers run so far. """
    # every stopped container shows an 'ago' timestamp in 'docker ps -a'
    c.local("docker ps -a | grep 'ago' | awk '{print $1}' | xargs -r docker rm")


def clean_a():
    """ Stop and remove all containers. """
    # only local commands are issued here, so a placeholder host is enough
    c = Connection('localhost')
    clean_r(c)
    clean_s(c)
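

# A possible cleanup call between runs (sketch), e.g. from an interactive session:
#   >>> from analytics import clean_a
#   >>> clean_a()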


class Analytics(object):
    """ Main class """

    def __init__(self, _pclass, _image, _commits):
        # the Container subclass to instantiate
        self.pclass = _pclass
        # docker image name
        self.image = _image
        # list of commits to process
        self.commits = _commits
        # dummy local connection
        self.conn = Connection('host')

    @classmethod
    def run_last(cls, _pclass, _image, _commit):
        """ Process the last n commits. """
        r = _pclass(_image, 'root', 'root')
        r.spawn()
        clist = r.get_commits(_commit)
        r.halt()
        return cls(_pclass, _image, clist)

    @classmethod
    def run_custom(cls, _pclass, _image, _commit, _count, _startaftercommit=None, _maxcommits=0, _repeats=1):
        """ Process a custom range of commits, given as (last commit, count). """
        r = _pclass(_image, 'root', 'root', _repeats)
        r.spawn()
        # each commit comes back with its timestamp and author attached
        clist = r.get_commits(_count + 1, _commit)
        r.halt()
        if _startaftercommit:
            # keep the whole list by default
            startindex = len(clist)
            for index, c in enumerate(clist):
                if c.startswith("%s__" % _startaftercommit):
                    startindex = index
                    break
            print("Retaining %d revisions" % startindex)
            clist = clist[:startindex + 1]
        if _maxcommits:
            clist = clist[-(_maxcommits + 1):]
        print("Will analyse %d commits" % len(clist))
        return cls(_pclass, _image, clist)
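
    # Illustrative call (sketch; the revision and count come from the
    # benchmarks table in main()):
    #   Analytics.run_custom(Curl, 'baseline', 'b3e55bf', 500, _maxcommits=50)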

    def go(self, outputfolder, outputfile, repeats=1):
        """ Run all the tests for every specified version, each in a new container. """
        # create a data/program-name directory where data will be collected
        self.conn.local('mkdir -p data/' + outputfolder)
        # uncovered files (and corresponding lines) from i revisions ago
        prev_uncovered_list = [([], [])] * 10
        # check the oldest commit first: this makes it easier to check patch
        # coverage in subsequent versions
        prev_commit_id = self.commits[-1].split('__')[0]
        del self.commits[-1]
        self.commits.reverse()
        for i in self.commits:
            # entries look like 'commit.id__timestamp__author.name' (see the unpacking below)
            print(i)
            a = i.split('__')
            commit_id = a[0]
            timestamp = a[1]
            author_name = a[2]
            c = self.pclass(self.image, 'root', 'root', repeats)
            c.outputfolder = outputfolder
            c.spawn()
            if c.offline:  # TODO: find out why offline mode changes the commit hash length we get
                prev_commit_id = prev_commit_id[:7]
                commit_id = commit_id[:7]
            try:
                c.checkout(prev_commit_id, commit_id)
                if not c.emptyCommit:
                    c.tsize_compute()
                    if not c.offline:
                        # the long steps: build the program and run its test suite
                        c.compile()
                        c.make_test()
                    c.overall_coverage()
                    if not c.offline:
                        c.backup(commit_id)
                    c.patch_coverage(prev_commit_id)
                    for j, (files, lines) in enumerate(prev_uncovered_list):
                        prev_uncovered_list[j] = c.prev_patch_coverage(j, files, lines)
                    print(c.changed_files, c.uncovered_lines_list)
                    prev_uncovered_list.insert(0, (c.changed_files, c.uncovered_lines_list))
                    prev_uncovered_list.pop()
                c.collect(author_name, timestamp, outputfolder, outputfile)
            finally:
                c.halt()
            if not c.compileError:
                prev_commit_id = commit_id
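

# Command-line entry point. Typical invocations (a sketch):
#   python analytics.py redis 100 --endatcommit 347ab78
#   python analytics.py --offline --resume redis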


def main():
    parser = argparse.ArgumentParser(prog='Analytics')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--image', default='baseline',
                       help="specify docker image (default: %(default)s)")
    group.add_argument('--offline', action="store_true",
                       help="process the revisions reusing previous coverage information")
    parser.add_argument('--resume', action="store_true",
                        help="resume processing from the last revision found in the data file (e.g. data/<program>/<program>.csv)")
    parser.add_argument('--endatcommit', help="process revisions up to this commit")
    parser.add_argument('--limit', type=int, help="limit processing to n revisions")
    parser.add_argument('--output', help="output file name")
    parser.add_argument('--repeats', type=int, default=1,
                        help="number of times to repeat the test suite (default: %(default)s)")
    parser.add_argument('program', help="program to analyse")
    parser.add_argument('revisions', type=int, nargs='?', default=0, help="number of revisions to process")
    args = parser.parse_args()

    benchmarks = {
        "apr": {"class": Apr, "revision": "d54e362", "n": 500},
        "curl": {"class": Curl, "revision": "b3e55bf", "n": 500},
        # "beanstalkd": {"class": Beanstalkd, "revision": "fb0a466", "n": 600},
        # "lighttpd": {"class": Lighttpd, "revision": "c8fbc16", "n": 600},
        "lighttpd2": {"class": Lighttpd2, "revision": "0d40b25", "n": 400},
        "memcached": {"class": Memcached, "revision": "87e2f36", "n": 409},
        "zeromq": {"class": Zeromq, "revision": "573d7b0", "n": 500},
        "redis": {"class": Redis, "revision": "347ab78", "n": 500},
        "binutils": {"class": Binutils, "revision": "a0a1bb07", "n": 6000},
        # in reality only ~2500 commits are relevant (inside binutils/), but
        # binutils-gdb contains many other projects
        "binutils-gdb": {"class": BinutilsGdb, "revision": "26be601", "n": 36106},
        # "diffutils": {"class": Diffutils, "revision": "b2f1e4b", "n": 350},
        # "dovecot": {"class": Dovecot, "revision": "fbf5813", "n": 1000},
        # matches the mercurial/git-hg commits for ffbf5813; some commits don't work
        # since they have external dependencies we can't roll back to (Unicode)
        "dovecot": {"class": Dovecot, "revision": "121b017", "n": 1000},
        # 3896 commits from 1st Jan 2013 (f1402d4) to 6 May 2023
        "squid": {"class": Squid, "revision": "ae64d121", "n": 1500},
        "git": {"class": Git, "revision": "d7aced9", "n": 500},
        # for Vim, Jun 2013 revision; the last v7 revision is edeb846c
        "vim": {"class": Vim, "revision": "f751255", "n": 500},
    }

    # a membership check keeps a KeyError raised inside the analysis itself
    # from being misreported as an unknown program name
    if args.program not in benchmarks:
        print("Unrecognized program name %s" % args.program)
        return
    b = benchmarks[args.program]
    outputfolder = args.output if args.output else b["class"].__name__
    outputfile = b["class"].__name__
    if args.offline:
        outputfile += "Offline"
        print('running offline, requires previous coverage information (data/<program>/coverage-<revision>.tar.bz2)')
    output = "data/%s/%s.csv" % (outputfolder, outputfile)
    lastrev = None
    if args.resume:
        # the first field of the last CSV record is the last processed revision
        lastrecord = subprocess.check_output(["tail", "-1", output])
        lastrecord = lastrecord.decode().split(',')
        if lastrecord[0]:
            lastrev = lastrecord[0]
    container = Analytics.run_custom(b["class"],
                                     args.image if not args.offline else None,
                                     args.endatcommit if args.endatcommit else b["revision"],
                                     args.revisions if args.revisions else b["n"], lastrev,
                                     args.limit)
    container.go(outputfolder, outputfile, repeats=args.repeats)


if __name__ == "__main__":
    main()