hasher.py
# Copyright 2023 Google LLC
#
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file or at
# https://developers.google.com/open-source/licenses/bsd
"""Hasher to generate signature hashes for the given code snippets."""
import functools
from typing import Mapping, Optional, Sequence, Tuple
from absl import logging
import mmh3
# For experimental purpose, the n-gram size can be adjusted, but note that any
# change on the line n-gram size requires regeneration of entire signatures.
# Generally, decreasing n-gram size may end up with increase of findings,
# including both true positives and false positives.
_LINE_SIGNATURE_NGRAM_SIZE = 4
_HASH = functools.partial(mmh3.hash128, seed=0, x64arch=True, signed=False)


class _LineNgram:
  """Class for maintaining an n-gram where the units are code lines."""

  def __init__(self,
               normalized_code: Mapping[int, str],
               line_numbers: Sequence[int],
               is_first: Optional[bool] = False,
               is_last: Optional[bool] = False):
    """Initializes the line n-gram.

    The first and last n-gram of a file must be explicitly marked through
    |is_first| and |is_last| in order to cover patch hunks that add lines at
    the top or the bottom of the file. Internally, the first n-gram is
    regarded as ranging from the line number negative infinity, and the last
    n-gram as ranging to the line number infinity.

    Args:
      normalized_code: dictionary of normalized code lines including (but not
        limited to) the lines for the n-gram. Each key is a line number, and
        the value is the normalized line as a string.
      line_numbers: the list of line numbers comprising the n-gram. Each line
        number must be a valid line number existing in |normalized_code|.
      is_first: True if the n-gram is the first n-gram of the target file.
      is_last: True if the n-gram is the last n-gram of the target file.

    Raises:
      ValueError: if any line number in |line_numbers| is not a valid line
        number in |normalized_code|.
    """
    self._normalized_code = normalized_code
    self._line_numbers = sorted(line_numbers)
    self._is_first = is_first
    self._is_last = is_last

  def is_overlapping(self, line_range: Tuple[int, int]) -> bool:
    """Returns True if the n-gram's line range overlaps with |line_range|."""
    if self._is_first and self._is_last:
      # This n-gram is both the first and the last, covering the entire file.
      return True
    range_start, range_end = line_range
    if range_start > range_end:
      raise ValueError(f'line_range: start ({range_start}) cannot be greater '
                       f'than end ({range_end})')
    # For given ranges r1 and r2, if r1.start <= r2.end and r1.end >= r2.start,
    # r1 and r2 overlap.
    if self._is_first:
      return self._line_numbers[-1] >= range_start
    if self._is_last:
      return self._line_numbers[0] <= range_end
    return (self._line_numbers[0] <= range_end and
            self._line_numbers[-1] >= range_start)

  def get_ngram_string(self) -> str:
    """Returns the actual string of the n-gram."""
    try:
      return ' '.join([
          self._normalized_code[line_number]
          for line_number in self._line_numbers
      ])
    except KeyError as e:
      raise KeyError(
          f'Invalid line numbers for ngram: {self._line_numbers}. This is a '
          'bug and should never have happened. A _LineNgram object should '
          'only be initialized with line_numbers being a subset of '
          'normalized_code.') from e

  def get_line_numbers(self) -> Sequence[int]:
    """Returns the line numbers comprising the n-gram."""
    return self._line_numbers
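

# Illustrative sketch (not part of the original module) of the is_overlapping
# semantics. With valid lines {3, 5, 8, 9, 12} and an n-gram size of 4, the
# n-grams are [3, 5, 8, 9] (is_first) and [5, 8, 9, 12] (is_last). An affected
# range of (10, 11) overlaps only the last n-gram, while a range of (1, 2)
# overlaps only the first n-gram, because a first n-gram is treated as
# extending back to negative infinity (covering lines a patch prepends to the
# file) and a last n-gram as extending forward to infinity.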


def hash_function_chunk(normalized_code: str) -> int:
  """Computes the hash for the normalized code of a function chunk.

  A function chunk signature is a Murmur3 128-bit x64 hash of the normalized
  function code.

  Args:
    normalized_code: the normalized function code as a string.

  Returns:
    The 128-bit hash as an integer.
  """
  return _HASH(normalized_code)
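

# A minimal usage sketch (illustrative only; the normalized snippet below is a
# hypothetical example, not produced by this module):
#
#   chunk_signature = hash_function_chunk(
#       'int add ( int a , int b ) { return a + b ; }')
#   # chunk_signature is a 128-bit unsigned integer (mmh3.hash128 with seed 0).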


def hash_line_chunk(
    normalized_code: Mapping[int, str],
    affected_line_ranges: Sequence[Tuple[int, int]]
) -> Tuple[Sequence[int], Sequence[int]]:
  """Computes hashes for the normalized code of a line chunk.

  A line chunk signature is a set of n-gram line hashes. Each n-gram consists
  of affected lines and their context lines (up to n - 1 lines before and
  after the affected lines). Note that empty lines and comment lines are not
  regarded as valid lines, so the actual context lines may be located more
  than n - 1 lines away from an affected line.

  Args:
    normalized_code: the normalized code of a line chunk, as a mapping from
      line number to normalized line.
    affected_line_ranges: list of ranges indicating the lines changed by the
      patch in the chunk's target file. The line numbers are based on the
      unpatched file. Inclusive.

  Returns:
    A tuple of the hash list and the used-line list. The hash list is a list
    of 128-bit line n-gram hashes. The used-line list is a list of integer
    line numbers used as elements of the n-grams.
  """
  valid_line_numbers = sorted(normalized_code.keys())
  if not valid_line_numbers:
    logging.debug('No valid line found from the normalized code. Returning '
                  'empty lists.')
    return [], []

  if not affected_line_ranges:
    # If no affected line range is specified, regard all lines as affected.
    affected_line_ranges = [(valid_line_numbers[0], valid_line_numbers[-1])]

  # Make a list of all valid line n-grams.
  ngrams = []
  if len(valid_line_numbers) < _LINE_SIGNATURE_NGRAM_SIZE:
    # If the number of valid lines in the file is smaller than the n-gram
    # size, just use all valid lines.
    ngrams.append(
        _LineNgram(
            normalized_code, valid_line_numbers, is_first=True, is_last=True))
  else:
    ngram_first_line_indices = range(
        len(valid_line_numbers) - _LINE_SIGNATURE_NGRAM_SIZE + 1)
    for line_index in ngram_first_line_indices:
      ngram_line_numbers = valid_line_numbers[
          line_index:_LINE_SIGNATURE_NGRAM_SIZE + line_index]
      is_first = line_index == ngram_first_line_indices[0]
      is_last = line_index == ngram_first_line_indices[-1]
      ngrams.append(
          _LineNgram(normalized_code, ngram_line_numbers, is_first, is_last))

  # Compute n-gram hashes only for "valid & affected" n-grams.
  line_hashes = []
  used_lines = set()
  for affected_range in affected_line_ranges:
    for ngram in ngrams.copy():
      if ngram.is_overlapping(affected_range):
        ngram_hash = _HASH(ngram.get_ngram_string())
        line_hashes.append(ngram_hash)
        used_lines.update(ngram.get_line_numbers())
        ngrams.remove(ngram)
  return line_hashes, sorted(used_lines)
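

# The demo below is an illustrative sketch and not part of the original
# module. It runs hash_line_chunk over a small hand-written "normalized" line
# mapping; real callers would obtain normalized_code from the project's
# normalizer, and affected_line_ranges from the patch being analyzed.
if __name__ == '__main__':
  demo_normalized_code = {
      10: 'int a = 0 ;',
      12: 'a += read ( ) ;',
      15: 'if ( a > MAX ) {',
      16: 'return - 1 ;',
      18: '}',
  }
  # Assume the patch touches lines 12-16 of the unpatched file.
  demo_hashes, demo_used_lines = hash_line_chunk(
      demo_normalized_code, [(12, 16)])
  print(f'{len(demo_hashes)} n-gram hashes over lines {demo_used_lines}')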