forked from kubernetes/test-infra
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclassifier.py
441 lines (371 loc) · 15.9 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import logging
import re
import google.appengine.ext.ndb as ndb
import models
# Matches links to Gubernator build pages on either hostname. The single
# capture group is the path that follows "/build": a slash, a run of characters
# that are not "]", ")" or whitespace, then a final all-digit path segment.
XREF_RE = re.compile(r'(?:k8s-gubernator\.appspot\.com|gubernator\.k8s\.io)/build(/[^])\s]+/\d+)')
# Matches the HTML comment carrying approver metadata. The capture group is the
# raw text between the square brackets (quotes and commas still included).
APPROVERS_RE = re.compile(r'<!-- META={"?approvers"?:\[([^]]*)\]} -->')
def classify_issue(repo, number):
    """
    Classify one issue or pull request from the webhook events stored in
    Datastore.

    Args:
        repo: string
        number: int
    Returns:
        A list of five items:
        is_pr: bool
        is_open: bool
        involved: list of strings representing usernames involved
        payload: a dict, see full description for classify below.
        last_event_timestamp: the newest timestamp seen among the loaded
            events and the statuses fetched for the head SHA.
    """
    resource_key = models.GithubResource.make_key(repo, number)
    logging.info('finding webhooks for %s %s', repo, number)
    webhook_query = models.GithubWebhookRaw.query(ancestor=resource_key)
    webhook_query = webhook_query.order(models.GithubWebhookRaw.timestamp)
    event_keys = list(webhook_query.fetch(keys_only=True))
    logging.info('classifying %s %s (%d events)', repo, number, len(event_keys))

    # Kept in a one-element list so the nested helpers can update it in place.
    newest = [datetime.datetime(2000, 1, 1)]
    batch_size = 100

    def saw(timestamp):
        """Remember `timestamp` if it is later than anything seen so far."""
        newest[0] = max(newest[0], timestamp)

    def events_iterator():
        """Yield the events as lists of tuples, one list per batch of keys."""
        for offset in xrange(0, len(event_keys), batch_size):
            batch = ndb.get_multi(event_keys[offset:offset + batch_size])
            for entity in batch:
                saw(entity.timestamp)
            yield [entity.to_tuple() for entity in batch]

    def get_status_for(sha):
        """Return {context: [state, target_url, description]} for `sha`."""
        statuses = {}
        for status in models.GHStatus.query_for_sha(repo, sha):
            saw(status.updated_at)
            statuses[status.context] = [
                status.state, status.target_url, status.description]
        return statuses

    result = list(classify_from_iterator(events_iterator(), status_fetcher=get_status_for))
    # Appended only after classification, once every timestamp has been seen.
    result.extend(newest)
    return result
def get_merged(events, merged=None):
    """
    Fold a series of events into the most recent known view of the issue.

    Events differ in how much they carry (comments lack head SHA
    information, pull request events lack labels, and so on), so every
    'issue' or 'pull_request' object found in an event body is layered on
    top of what is already known, in event order.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        merged: the result of a previous invocation; a non-empty dict is
            updated in place.
    Returns:
        body: a dict representing the issue's latest state.
    """
    state = merged or {}
    for _kind, payload, _when in events:
        # 'issue' is applied before 'pull_request', so the latter wins on
        # keys that both provide within a single event.
        for section in ('issue', 'pull_request'):
            if section in payload:
                state.update(payload[section])
    return state
def get_labels(events, labels=None):
    """
    Work out which labels are currently applied to an issue.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        labels: the result of a previous invocation, if any.
    Returns:
        labels: the currently applied labels as {label_name: label_color}
    """
    current = labels or {}
    for kind, payload, _when in events:
        if 'issue' in payload:
            # Issue objects carry the complete label set: replace wholesale.
            current = dict(
                (entry['name'], entry['color'])
                for entry in payload['issue']['labels'])

        # Pull request events never carry the full label set, so individual
        # label changes on them are tracked one at a time.
        action = payload.get('action')
        if kind != 'pull_request' or action not in ('labeled', 'unlabeled'):
            continue
        try:
            if 'label' not in payload:
                logging.warning('label event with no labels (multiple changes?)')
            elif action == 'labeled':
                added = payload['label']
                if added['name'] not in current:
                    current[added['name']] = added['color']
            else:
                current.pop(payload['label']['name'], None)
        except:
            # Deliberately broad: record the offending payload, then re-raise.
            logging.exception('??? %r', payload)
            raise
    return current
def get_skip_comments(events, skip_users=None):
    """
    Collect the ids of comments that should be ignored: those that were
    deleted, and those whose event was sent by a user in skip_users.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        skip_users: logins whose comments should be ignored (default: none).
    Returns:
        comment_ids: a set of comment ids that were deleted or made by
            users that should be skipped.
    """
    ignored_users = skip_users or []
    comment_kinds = ('issue_comment', 'pull_request_review_comment')
    ignored_ids = set()
    for kind, payload, _when in events:
        if kind not in comment_kinds:
            continue
        comment_id = payload['comment']['id']
        was_deleted = payload.get('action') == 'deleted'
        # The sender is only consulted when the comment was not deleted.
        if was_deleted or payload['sender']['login'] in ignored_users:
            ignored_ids.add(comment_id)
    return ignored_ids
def classify(events, status_fetcher=None):
    """
    Process the complete event stream of an issue and determine its state
    and who, if anyone, needs to act on it.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        status_fetcher: a function that returns statuses for the given SHA.
    Returns:
        is_pr: bool
        is_open: bool
        involved: list of strings representing usernames involved
        payload: a dictionary of additional information, including:
            {
                'author': str author_name,
                'assignees': sorted list of assignee and reviewer names,
                'title': str issue title,
                'labels': {label_name: label_color},
                'xrefs': list of builds referenced (by GCS path),
                'attn': {user_name: reason},
            }
            plus, when applicable, 'needs_rebase', 'additions',
            'deletions', 'head', 'approvers', 'status' and 'milestone'.
    """
    # Each extractor scans the same event list; arguments are evaluated left
    # to right, so the extractors run in the order they are listed.
    return _classify_internal(
        get_merged(events),
        get_labels(events),
        get_comments(events),
        get_reviewers(events),
        distill_events(events),
        status_fetcher)
def classify_from_iterator(events_iterator, status_fetcher=None):
    """
    Like classify(), but consume the events in batches.

    Args:
        events_iterator: an iterable yielding successive lists of
            (event_type str, event_body dict, timestamp).
        status_fetcher: a function that returns statuses for the given SHA.
    Returns:
        The same (is_pr, is_open, involved, payload) tuple as classify().
    """
    # Each reducer takes (batch, previous_result) and returns the new result.
    reducers = (get_merged, get_labels, get_comments, get_reviewers, distill_events)
    # Everything starts as None; if the iterator yields no batches at all,
    # these Nones are handed to _classify_internal unchanged.
    state = [None] * len(reducers)
    for batch in events_iterator:
        state = [reduce_batch(batch, previous)
                 for reduce_batch, previous in zip(reducers, state)]
    merged, labels, comments, reviewers, distilled_events = state
    return _classify_internal(
        merged, labels, comments, reviewers, distilled_events, status_fetcher)
def _classify_internal(merged, labels, comments, reviewers, distilled_events, status_fetcher):
    """
    Build the classification result from the already-extracted pieces.

    Args:
        merged: dict with the issue's latest state (see get_merged).
        labels: {label_name: label_color}
        comments: list of comment dicts (see get_comments).
        reviewers: set of reviewer usernames (see get_reviewers).
        distilled_events: list of (action, user, timestamp) tuples.
        status_fetcher: optional function returning statuses for a SHA.
    Returns:
        (is_pr, is_open, involved, payload)
    """
    approvers = get_approvers(comments)

    is_pr = any(marker in merged for marker in ('head', 'pull_request'))
    is_open = merged['state'] != 'closed'
    author = merged['user']['login']
    assigned = set(entry['login'] for entry in merged['assignees'])
    assignees = sorted(assigned | reviewers)
    everyone = set([author] + assignees + approvers)
    involved = sorted(name.lower() for name in everyone)

    title = merged['title']
    xrefs = get_xrefs(comments, merged)
    payload = {
        'author': author,
        'assignees': assignees,
        'title': title,
        'labels': labels,
        'xrefs': xrefs,
    }

    if is_pr:
        if is_open:
            # NOTE(review): this compares against the *string* 'false';
            # confirm that is how 'mergeable' is represented in the events.
            unmergeable = merged.get('mergeable') == 'false'
            payload['needs_rebase'] = 'needs-rebase' in labels or unmergeable
        for counter in ('additions', 'deletions'):
            payload[counter] = merged.get(counter, 0)
        if 'head' in merged:
            payload['head'] = merged['head']['sha']

    if approvers:
        payload['approvers'] = approvers

    # Statuses can only be looked up when a head SHA is known.
    if status_fetcher and 'head' in payload:
        payload['status'] = status_fetcher(payload['head'])

    milestone = merged.get('milestone')
    if milestone:
        payload['milestone'] = milestone['title']

    # Computed last because it reads the fields filled in above.
    payload['attn'] = calculate_attention(distilled_events, payload)

    return is_pr, is_open, involved, payload
def get_xrefs(comments, merged):
    """
    Collect the build paths referenced in the issue body and its comments.

    Args:
        comments: list of comment dicts with a 'comment' text field.
        merged: dict with the issue's latest state; 'body' may be missing
            or empty.
    Returns:
        A sorted list of the distinct paths captured by XREF_RE.
    """
    texts = [merged.get('body') or '']
    texts.extend(entry['comment'] for entry in comments)
    found = set()
    for text in texts:
        found.update(XREF_RE.findall(text))
    return sorted(found)
def get_comments(events, comments=None):
    """
    Pick comments and pull-request review comments out of a list of events.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        comments: the previous output of this function, if the events are
            being processed in batches. Comments from earlier batches are
            kept unless a later event deletes or replaces them.
    Returns:
        comments: a list of dict(author=..., comment=..., timestamp=...,
            id=...), ordered with the earliest comment first.
    """
    # Index earlier results by id so later events can replace or delete them.
    # (Previously this index was unconditionally reset to {}, which dropped
    # every comment from earlier batches.)
    by_id = {c['id']: c for c in comments} if comments else {}
    for event, body, _timestamp in events:
        if event not in ('issue_comment', 'pull_request_review_comment'):
            continue
        comment = body['comment']
        comment_id = comment['id']
        if body.get('action') == 'deleted':
            by_id.pop(comment_id, None)
        else:
            # Any other action (re)writes the entry with the latest content.
            by_id[comment_id] = {
                'author': comment['user']['login'],
                'comment': comment['body'],
                'timestamp': comment['created_at'],
                'id': comment_id,
            }
    return sorted(by_id.values(), key=lambda c: c['timestamp'])
def get_reviewers(events, reviewers=None):
    """
    Return the set of users that have a code review requested or completed.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        reviewers: the result of a previous invocation, if any; a non-empty
            set is updated in place.
    Returns:
        reviewers: a set of usernames.
    """
    reviewers = reviewers or set()
    for event, body, _timestamp in events:
        action = body.get('action')
        if event == 'pull_request':
            if action == 'review_requested':
                if 'requested_reviewer' not in body:
                    logging.warning('no reviewer present -- self-review?')
                    continue
                reviewers.add(body['requested_reviewer']['login'])
            elif action == 'review_request_removed':
                # Tolerate a missing reviewer here just as for requests
                # above, instead of failing the whole classification with
                # a KeyError.
                if 'requested_reviewer' not in body:
                    logging.warning('review request removed with no reviewer present')
                    continue
                reviewers.discard(body['requested_reviewer']['login'])
        elif event == 'pull_request_review':
            if action == 'submitted':
                reviewers.add(body['sender']['login'])
    return reviewers
def get_approvers(comments):
    """
    Return approvers requested in comments.

    Only comments authored by 'k8s-merge-robot' are considered. When several
    of them carry approver metadata, the last one in the list wins.

    This MUST be kept in sync with mungegithub's getGubernatorMetadata().

    Args:
        comments: list of comment dicts with 'author' and 'comment' fields.
    Returns:
        A list of approver usernames; empty when there is no metadata or the
        metadata lists no approvers.
    """
    approvers = []
    for comment in comments:
        if comment['author'] != 'k8s-merge-robot':
            continue
        m = APPROVERS_RE.search(comment['comment'])
        if m:
            names = m.group(1).replace('"', '').split(',')
            # ''.split(',') is [''], so an empty metadata list used to yield
            # a single empty-string approver; drop blank entries.
            approvers = [name.strip() for name in names if name.strip()]
    return approvers
def distill_events(events, distilled_events=None):
    """
    Reduce raw events to the (action, user, timestamp) tuples that matter
    for working out each user's state.

    Args:
        events: a list of (event_type str, event_body dict, timestamp).
        distilled_events: the result of a previous invocation, if any; a
            non-empty list is appended to in place.
    Returns:
        A list of tuples whose action is 'comment', 'push', or
        'label <lowercased label name>'.
    """
    bots = [
        'google-oss-robot',
        'istio-testing',
        'k8s-bot',
        'k8s-ci-robot',
        'k8s-merge-robot',
        'k8s-oncall',
        'k8s-reviewable',
    ]
    # Comments that were deleted or sent by a bot do not count as activity.
    ignored_comments = get_skip_comments(events, bots)
    comment_kinds = ('issue_comment', 'pull_request_review_comment')

    timeline = distilled_events or []
    for kind, payload, when in events:
        action = payload.get('action')
        actor = payload.get('sender', {}).get('login')
        if kind in comment_kinds:
            if payload['comment']['id'] not in ignored_comments and action == 'created':
                timeline.append(('comment', actor, when))
        elif kind == 'pull_request_review':
            if action == 'submitted':
                # this is morally equivalent to a comment
                timeline.append(('comment', actor, when))
        elif kind == 'pull_request':
            if action in ('opened', 'reopened', 'synchronize'):
                timeline.append(('push', actor, when))
            if action == 'labeled' and 'label' in payload:
                timeline.append(('label ' + payload['label']['name'].lower(), actor, when))
    return timeline
def evaluate_fsm(events, start, transitions):
    """
    Run a small state machine over a series of event tuples.

    Args:
        events: an iterable of (action, user, timestamp) tuples.
        start: the initial state.
        transitions: a list of tuples
            (state_before str, state_after str, condition str or callable).
            A state_before of None matches any state. The transition fires
            if condition equals the action (as a str), or if
            condition(action, user) is True. For each event only the first
            transition that fires is applied.
    Returns:
        (state, state_start, state_last): the final state, the time it was
        entered, and the time of the last transition into it
        (self-transitions are allowed). Both times are 0 if no transition
        ever fired.
    """
    current = start
    entered_at = 0        # time that we entered the current state
    last_entered_at = 0   # time of the last transition into the current state

    def fires(condition, action, user):
        """True if `condition` accepts this (action, user) pair."""
        return condition == action or (callable(condition) and condition(action, user))

    for action, user, timestamp in events:
        for source, target, condition in transitions:
            if source is not None and source != current:
                continue
            if not fires(condition, action, user):
                continue
            # A self-transition refreshes last_entered_at but not entered_at.
            if target != current:
                entered_at = timestamp
            current = target
            last_entered_at = timestamp
            break
    return current, entered_at, last_entered_at
def get_author_state(author, distilled_events):
    """
    Determine the state of the author given a series of distilled events.

    The author starts out 'waiting', moves to 'address comments' whenever
    somebody else comments, and goes back to 'waiting' after pushing or
    commenting themselves.

    Returns:
        (state, state_start, state_last) as produced by evaluate_fsm.
    """
    def someone_else_commented(action, user):
        return action == 'comment' and user != author

    def author_commented(action, user):
        return action == 'comment' and user == author

    transitions = [
        # before, after, condition
        (None, 'address comments', someone_else_commented),
        ('address comments', 'waiting', 'push'),
        ('address comments', 'waiting', author_commented),
    ]
    return evaluate_fsm(distilled_events, start='waiting', transitions=transitions)
def get_assignee_state(assignee, author, distilled_events):
    """
    Determine the state of an assignee given a series of distilled events.

    The assignee starts out in 'needs review', moves to 'waiting' once they
    comment or apply the lgtm label, and returns to 'needs review' on every
    push or comment from the author.

    Returns:
        (state, state_start, state_last) as produced by evaluate_fsm.
    """
    def assignee_responded(action, user):
        return user == assignee and action in ('comment', 'label lgtm')

    def author_commented(action, user):
        return action == 'comment' and user == author

    transitions = [
        # before, after, condition
        ('needs review', 'waiting', assignee_responded),
        (None, 'needs review', 'push'),
        (None, 'needs review', author_commented),
    ]
    return evaluate_fsm(distilled_events, start='needs review', transitions=transitions)
def calculate_attention(distilled_events, payload):
    """
    Given information about an issue, determine who should look at it.

    Reasons may carry the start and last-update time of a state:
    "address comments#123#456" means that something has been in
    'address comments' since 123, and that some other event put it in
    'address comments' again at 456.

    Args:
        distilled_events: list of (action, user, timestamp) tuples.
        payload: dict with at least 'author', 'assignees' and 'labels';
            'status', 'approvers' and 'needs_rebase' are used when present.
    Returns:
        attn: {user_name: reason}. Each user gets a single reason; checks
        that run later overwrite earlier ones for the same user.
    """
    author = payload['author']
    assignees = payload['assignees']
    attn = {}

    statuses = payload.get('status', {}).values()
    if any(state == 'failure' for state, _url, _desc in statuses):
        attn[author] = 'fix tests'

    for approver in payload.get('approvers', []):
        attn[approver] = 'needs approval'

    def record(user, fsm_result):
        """Flag `user` unless their state machine ended up in 'waiting'."""
        state, first, last = fsm_result
        if state != 'waiting':
            attn[user] = '%s#%s#%s' % (state, first, last)

    for assignee in assignees:
        record(assignee, get_assignee_state(assignee, author, distilled_events))
    record(author, get_author_state(author, distilled_events))

    if payload.get('needs_rebase'):
        attn[author] = 'needs rebase'
    if 'do-not-merge/release-note-label-needed' in payload['labels']:
        attn[author] = 'needs release-note label'

    return attn