# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for data (frame) transformations."""
from analysis import stat_tests


def validate_data(experiment_df):
    """Checks if the experiment data is valid."""
    if experiment_df.empty:
        raise ValueError('Empty experiment data.')

    expected_columns = {
        'experiment', 'benchmark', 'fuzzer', 'trial_id', 'time_started',
        'time_ended', 'time', 'edges_covered'
    }
    missing_columns = expected_columns.difference(experiment_df.columns)
    if missing_columns:
        raise ValueError(
            'Missing columns in experiment data: {}'.format(missing_columns))


def drop_uninteresting_columns(experiment_df):
    """Returns table with only interesting columns."""
    return experiment_df[[
        'benchmark', 'fuzzer', 'trial_id', 'time', 'edges_covered'
    ]]


def clobber_experiments_data(df, experiments):
    """Clobbers experiment data that is part of lower priority (generally
    earlier) versions of the same trials in |df|. For example, in experiment-1
    we may test fuzzer-a on benchmark-1. In experiment-2 we may again test
    fuzzer-a on benchmark-1 because fuzzer-a was updated. This function will
    remove the snapshots from fuzzer-a,benchmark-1,experiment-1 from |df|
    because we want the report to only contain the up-to-date data. Experiment
    priority is determined by the order of each experiment in |experiments|,
    with the highest priority experiment coming last in that list."""
    # We don't call |df| "experiment_df" because it is a misnomer and leads to
    # confusion in this case where it contains data from multiple experiments.

    # Include everything from the last experiment.
    experiments = experiments.copy()  # Copy so we don't mutate experiments.
    experiments.reverse()
    highest_rank_experiment = experiments[0]
    result = df[df.experiment == highest_rank_experiment]

    for experiment in experiments[1:]:
        # Include data for not yet covered benchmark/fuzzer pairs.
        covered_pairs = result[['benchmark', 'fuzzer']].drop_duplicates()
        covered_pairs = covered_pairs.apply(tuple, axis=1)
        experiment_data = df[df.experiment == experiment]
        experiment_pairs = experiment_data[['benchmark',
                                            'fuzzer']].apply(tuple, axis=1)
        to_include = experiment_data[~experiment_pairs.isin(covered_pairs)]
        result = result.append(to_include)
    return result
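

# Illustrative usage sketch (not part of the original module; the experiment
# names are hypothetical):
#
#   df = clobber_experiments_data(df, ['experiment-1', 'experiment-2'])
#
# Rows for benchmark/fuzzer pairs covered by 'experiment-2' are kept only from
# 'experiment-2'; 'experiment-1' rows survive only for pairs it alone covers.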


def filter_fuzzers(experiment_df, included_fuzzers):
    """Returns table with only rows where fuzzer is in |included_fuzzers|."""
    return experiment_df[experiment_df['fuzzer'].isin(included_fuzzers)]


def filter_benchmarks(experiment_df, included_benchmarks):
    """Returns table with only rows where benchmark is in
    |included_benchmarks|."""
    return experiment_df[experiment_df['benchmark'].isin(included_benchmarks)]


def label_fuzzers_by_experiment(experiment_df):
    """Returns table where every fuzzer is labeled by the experiment it
    was run in."""
    experiment_df['fuzzer'] = (experiment_df['fuzzer'] + '-' +
                               experiment_df['experiment'])
    return experiment_df
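

# Illustrative example (not part of the original module; fuzzer/experiment
# names are hypothetical): a row with fuzzer 'afl' from 'experiment-1' is
# relabeled as 'afl-experiment-1', so the same fuzzer from different
# experiments can appear side by side in one report.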


def filter_max_time(experiment_df, max_time):
    """Returns table with snapshots that have time less than or equal to
    |max_time|."""
    return experiment_df[experiment_df['time'] <= max_time]


# Creating "snapshots" (see README.md for definition).
_DEFAULT_BENCHMARK_SAMPLE_NUM_THRESHOLD = 0.8


def get_benchmark_snapshot(benchmark_df,
                           threshold=_DEFAULT_BENCHMARK_SAMPLE_NUM_THRESHOLD):
    """Finds the latest time where 80% of the trials were still running. In
    most cases, this is the end of the experiment. In this case, we won't
    consider the <20% of the trials that ended early for our analysis. If more
    than 20% of the trials ended early, it's better to pick an earlier
    snapshot time. The 80% can be overridden using the |threshold| argument.
    E.g., to find the latest time where all trials were still running, set
    |threshold| to 1.0.

    Returns a data frame that only contains the measurements of the picked
    snapshot time.
    """
    num_trials = benchmark_df.trial_id.nunique()
    trials_running_at_time = benchmark_df.time.value_counts()
    criteria = trials_running_at_time > threshold * num_trials
    ok_times = trials_running_at_time[criteria]
    latest_ok_time = ok_times.index.max()
    benchmark_snapshot_df = benchmark_df[benchmark_df.time == latest_ok_time]
    return benchmark_snapshot_df
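

# Illustrative sketch (not part of the original module): for a single
# benchmark's data frame, the snapshot keeps only the rows measured at the
# chosen snapshot time, e.g.
#
#   snapshot_df = get_benchmark_snapshot(benchmark_df, threshold=0.9)
#   assert snapshot_df.time.nunique() == 1
#
# With the hypothetical |threshold| of 0.9, the snapshot time is the latest
# time at which more than 90% of the trials were still running.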


_DEFAULT_FUZZER_SAMPLE_NUM_THRESHOLD = 0.8


def get_fuzzers_with_not_enough_samples(
        benchmark_snapshot_df, threshold=_DEFAULT_FUZZER_SAMPLE_NUM_THRESHOLD):
    """Returns fuzzers that didn't have enough trials running at snapshot time.

    It takes a benchmark snapshot and finds the fuzzers that have a sample size
    smaller than 80% of the largest sample size. The default threshold can be
    overridden.
    """
    samples_per_fuzzer = benchmark_snapshot_df.fuzzer.value_counts()
    max_samples = samples_per_fuzzer.max()
    few_sample_criteria = samples_per_fuzzer < threshold * max_samples
    few_sample_fuzzers = samples_per_fuzzer[few_sample_criteria].index
    return few_sample_fuzzers.tolist()
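

# Illustrative sketch (not part of the original module): fuzzers reported here
# can be dropped from a benchmark snapshot before ranking, e.g.
#
#   under_sampled = get_fuzzers_with_not_enough_samples(benchmark_snapshot_df)
#   filtered_df = benchmark_snapshot_df[
#       ~benchmark_snapshot_df.fuzzer.isin(under_sampled)]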


def get_experiment_snapshots(experiment_df):
    """Finds a good snapshot time for each benchmark in the experiment data.

    Returns the data frame that only contains the measurements made at these
    snapshot times.
    """
    benchmark_groups = experiment_df.groupby('benchmark')
    experiment_snapshots = benchmark_groups.apply(get_benchmark_snapshot)
    # We don't need the extra index added by the groupby('benchmark').
    experiment_snapshots.reset_index(drop=True, inplace=True)
    return experiment_snapshots


# Summary tables containing statistics on the samples.
def benchmark_summary(benchmark_snapshot_df):
    """Creates summary table for a benchmark snapshot with columns:
    |fuzzer|time||count|mean|std|min|25%|median|75%|max|
    """
    groups = benchmark_snapshot_df.groupby(['fuzzer', 'time'])
    summary = groups['edges_covered'].describe()
    summary.rename(columns={'50%': 'median'}, inplace=True)
    return summary.sort_values(('median'), ascending=False)


def experiment_summary(experiment_snapshots_df):
    """Creates summary table for all benchmarks in experiment, i.e. table like:
    |benchmark|| < benchmark level summary >
    """
    groups = experiment_snapshots_df.groupby('benchmark')
    summaries = groups.apply(benchmark_summary)
    return summaries
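

# Illustrative sketch (not part of the original module): the per-benchmark
# summary is a DataFrame indexed by (fuzzer, time) whose 'median' column holds
# the median edge coverage of each fuzzer at the snapshot time, e.g.
#
#   summary = benchmark_summary(benchmark_snapshot_df)
#   best_fuzzer = summary['median'].idxmax()[0]  # First element of the
#                                                # (fuzzer, time) index tuple.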


# Per-benchmark fuzzer ranking options.
def benchmark_rank_by_mean(benchmark_snapshot_df):
    """Returns ranking of fuzzers based on mean coverage."""
    assert benchmark_snapshot_df.time.nunique() == 1, 'Not a snapshot!'
    means = benchmark_snapshot_df.groupby('fuzzer')['edges_covered'].mean()
    means.rename('mean cov', inplace=True)
    return means.sort_values(ascending=False)


def benchmark_rank_by_median(benchmark_snapshot_df):
    """Returns ranking of fuzzers based on median coverage."""
    assert benchmark_snapshot_df.time.nunique() == 1, 'Not a snapshot!'
    medians = benchmark_snapshot_df.groupby('fuzzer')['edges_covered'].median()
    medians.rename('median cov', inplace=True)
    return medians.sort_values(ascending=False)


def benchmark_rank_by_average_rank(benchmark_snapshot_df):
    """Ranks all coverage measurements in the snapshot across fuzzers.

    Returns the average rank by fuzzer.
    """
    # Make a copy of the dataframe view, because we want to add a new column.
    measurements = benchmark_snapshot_df[['fuzzer', 'edges_covered']].copy()
    measurements['rank'] = measurements['edges_covered'].rank()
    avg_rank = measurements.groupby('fuzzer').mean()
    avg_rank.rename(columns={'rank': 'avg rank'}, inplace=True)
    avg_rank.sort_values('avg rank', ascending=False, inplace=True)
    return avg_rank['avg rank']
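

# Illustrative example (not part of the original module): ranks are assigned
# over the pooled per-trial measurements, e.g. for the hypothetical coverage
# values [100, 120, 90, 110], .rank() yields [2.0, 4.0, 1.0, 3.0] (higher
# coverage gets a higher rank); each fuzzer's score is then the mean rank of
# its own trials, and higher is better.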


def benchmark_rank_by_stat_test_wins(benchmark_snapshot_df):
    """Carries out one-tailed statistical tests for each fuzzer pair.

    Returns ranking according to the number of statistical test wins.
    """
    p_values = stat_tests.one_sided_u_test(benchmark_snapshot_df)

    # Turn "significant" p-values into 1-s.
    better_than = p_values.applymap(
        lambda p: p < stat_tests.SIGNIFICANCE_THRESHOLD)
    better_than = better_than.applymap(int)

    score = better_than.sum(axis=1).sort_values(ascending=False)
    score.rename('stat wins', inplace=True)
    return score


def create_better_than_table(benchmark_snapshot_df):
    """Creates table showing whether fuzzer in row is statistically
    significantly better than the fuzzer in the column."""
    p_values = stat_tests.one_sided_u_test(benchmark_snapshot_df)

    # Turn "significant" p-values into 1-s.
    better_than = p_values.applymap(
        lambda p: p < stat_tests.SIGNIFICANCE_THRESHOLD)
    better_than = better_than.applymap(int)

    # Order rows and columns of matrix according to score ranking.
    score = better_than.sum(axis=1).sort_values(ascending=False)
    better_than = better_than.reindex(index=score.index,
                                      columns=score.index[::-1])
    return better_than
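

# Illustrative sketch (not part of the original module): both functions above
# derive from the same "better than" matrix, e.g. for hypothetical fuzzers
# 'afl' and 'libfuzzer':
#
#   wins = benchmark_rank_by_stat_test_wins(benchmark_snapshot_df)
#   table = create_better_than_table(benchmark_snapshot_df)
#
# A value of 1 at table.loc['afl', 'libfuzzer'] indicates afl's coverage is
# statistically significantly higher than libfuzzer's at the configured
# significance threshold.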


# Experiment level ranking of fuzzers (across-benchmarks).
# Experiment level ranking depends on the per-benchmark ranking method.
def experiment_pivot_table(experiment_snapshots_df,
                           benchmark_level_ranking_function):
    """Creates a pivot table according to a given per benchmark ranking, where
    the columns are the fuzzers, the rows are the benchmarks, and the values
    are the scores according to the per benchmark ranking."""
    benchmark_blocks = experiment_snapshots_df.groupby('benchmark')
    groups_ranked = benchmark_blocks.apply(benchmark_level_ranking_function)

    already_unstacked = groups_ranked.index.names == ['benchmark']
    pivot_df = groups_ranked if already_unstacked else groups_ranked.unstack()
    return pivot_df
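

# Illustrative sketch (not part of the original module): any of the
# benchmark_rank_by_* functions above can serve as the per-benchmark ranking,
# e.g.
#
#   pivot = experiment_pivot_table(experiment_snapshots_df,
#                                  benchmark_rank_by_median)
#
# yields a benchmarks-by-fuzzers table of median coverage values.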


def experiment_rank_by_average_rank(experiment_pivot_df):
    """Creates experiment level ranking of fuzzers.

    Takes a pivot table representing per benchmark ranking scores. Ranks
    fuzzers per benchmark, then takes the average rank across benchmarks
    (smaller is better).
    """
    # Rank fuzzers in each benchmark block.
    pivot_ranked = experiment_pivot_df.rank('columns',
                                            na_option='keep',
                                            ascending=False)

    average_ranks = pivot_ranked.mean().sort_values()
    return average_ranks.rename('average rank')


def experiment_rank_by_num_firsts(experiment_pivot_df):
    """Creates experiment level ranking by number of first places in per
    benchmark rankings (higher is better)."""
    # Rank fuzzers in each benchmark block.
    pivot_ranked = experiment_pivot_df.rank('columns',
                                            na_option='keep',
                                            ascending=False)
    # Count first places for each fuzzer.
    firsts = pivot_ranked[pivot_ranked == 1]
    num_firsts = firsts.sum().sort_values(ascending=False)
    return num_firsts.rename('number of wins')


def experiment_rank_by_average_normalized_score(experiment_pivot_df):
    """Creates experiment level ranking by taking the average of normalized
    per benchmark scores from 0 to 100, where 100 is the highest reached
    coverage."""
    # Normalize coverage values.
    benchmark_maximum = experiment_pivot_df.max(axis='columns')
    normalized_score = experiment_pivot_df.div(benchmark_maximum,
                                               axis='index').mul(100)

    average_score = normalized_score.mean().sort_values(ascending=False)
    return average_score.rename('average normalized score')
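

# Illustrative example (not part of the original module): on a hypothetical
# benchmark where the best fuzzer covers 2000 edges, a fuzzer covering 1500
# edges gets a normalized score of 1500 / 2000 * 100 = 75 for that benchmark;
# the experiment level score is the mean of these per-benchmark values.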


def experiment_level_ranking(experiment_snapshots_df,
                             benchmark_level_ranking_function,
                             experiment_level_ranking_function):
    """Returns an aggregate ranking of fuzzers across all benchmarks."""
    pivot_table = experiment_pivot_table(experiment_snapshots_df,
                                         benchmark_level_ranking_function)
    return experiment_level_ranking_function(pivot_table)
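

# Illustrative end-to-end sketch (not part of the original module): a minimal
# report pipeline built from the helpers above, assuming |experiment_df| is a
# pandas DataFrame loaded elsewhere with the columns checked by
# validate_data().
#
#   validate_data(experiment_df)
#   experiment_df = drop_uninteresting_columns(experiment_df)
#   snapshots_df = get_experiment_snapshots(experiment_df)
#   ranking = experiment_level_ranking(snapshots_df,
#                                      benchmark_rank_by_median,
#                                      experiment_rank_by_average_rank)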