adds Vargha-Delaney A measure for effect size (google#998)
- adds a separate table under per-benchmark statistics to complement
  the Mann-Whitney U test.
- current implementation uses a continuous palette instead of
  discrete significance levels.
- plan to integrate the A12 measure into benchmark- and experiment-level
  summary statistics.
wideglide authored Dec 24, 2020
1 parent 6e9dd77 commit c6788ec
Showing 6 changed files with 273 additions and 62 deletions.
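
Before the per-file diffs, a rough illustration of the statistic being added (not part of the commit; the coverage numbers and variable names below are made up). The A12 value reported in the new table is the probability that a run of one fuzzer yields a higher metric than a run of another, computed with the same rank-based formula the commit introduces in stat_tests.py:

import numpy as np
import scipy.stats as ss

# Made-up coverage measurements for two hypothetical fuzzers.
coverage_f1 = [2100, 2250, 2300, 2180]
coverage_f2 = [2000, 2050, 1980, 2120]

# Rank all measurements together, then sum the ranks of the first sample.
ranked = ss.rankdata(np.concatenate((coverage_f1, coverage_f2)))
rank_sum_f1 = ranked[:len(coverage_f1)].sum()
n_1, n_2 = len(coverage_f1), len(coverage_f2)

# Formula (14) from Vargha and Delaney (2000), rearranged as in the commit.
a12 = (2 * rank_sum_f1 - n_1 * (n_1 + 1)) / (2 * n_1 * n_2)
print(a12)  # 0.9375: fuzzer 1 outperforms fuzzer 2 in ~94% of pairings.
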
51 changes: 43 additions & 8 deletions analysis/benchmark_results.py
@@ -188,21 +188,56 @@ def bug_mann_whitney_p_values(self):
                                            key='bugs_covered')
 
     @property
-    def mann_whitney_plot(self):
-        """Mann Whitney U test plot."""
-        plot_filename = self._prefix_with_benchmark('mann_whitney_plot.svg')
-        self._plotter.write_heatmap_plot(self.mann_whitney_p_values,
+    @functools.lru_cache()
+    def vargha_delaney_a12_values(self):
+        """Vargha Delaney A12 measure results (code coverage)."""
+        return stat_tests.a12_measure_test(self._benchmark_snapshot_df)
+
+    @property
+    @functools.lru_cache()
+    def bug_vargha_delaney_a12_values(self):
+        """Vargha Delaney A12 measure results (bug coverage)."""
+        return stat_tests.a12_measure_test(self._benchmark_snapshot_df,
+                                           key='bugs_covered')
+
+    def _mann_whitney_plot(self, filename, p_values):
+        """Generic Mann Whitney U test plot."""
+        plot_filename = self._prefix_with_benchmark(filename)
+        self._plotter.write_heatmap_plot(p_values,
                                          self._get_full_path(plot_filename))
         return plot_filename
 
     @property
+    def mann_whitney_plot(self):
+        """Mann Whitney U test plot (code coverage)."""
+        return self._mann_whitney_plot('mann_whitney_plot.svg',
+                                       self.mann_whitney_p_values)
+
+    @property
     def bug_mann_whitney_plot(self):
-        """Mann Whitney U test plot based on bugs covered."""
-        plot_filename = self._prefix_with_benchmark('bug_mann_whitney_plot.svg')
-        self._plotter.write_heatmap_plot(self.bug_mann_whitney_p_values,
-                                         self._get_full_path(plot_filename))
+        """Mann Whitney U test plot (bug coverage)."""
+        return self._mann_whitney_plot('bug_mann_whitney_plot.svg',
+                                       self.bug_mann_whitney_p_values)
+
+    def _vargha_delaney_plot(self, filename, a12_values):
+        """Generic Vargha Delaney A12 measure plot."""
+        plot_filename = self._prefix_with_benchmark(filename)
+        self._plotter.write_a12_heatmap_plot(a12_values,
+                                             self._get_full_path(plot_filename))
         return plot_filename
 
     @property
+    def vargha_delaney_plot(self):
+        """Vargha Delaney A12 measure plot (code coverage)."""
+        return self._vargha_delaney_plot('varga_delaney_a12_plot.svg',
+                                         self.vargha_delaney_a12_values)
+
+    @property
+    def bug_vargha_delaney_plot(self):
+        """Vargha Delaney A12 measure plot (bug coverage)."""
+        return self._vargha_delaney_plot('bug_varga_delaney_a12_plot.svg',
+                                         self.bug_vargha_delaney_a12_values)
+
+    @property
     def anova_p_value(self):
         """ANOVA test result."""
90 changes: 80 additions & 10 deletions analysis/plotting.py
@@ -14,9 +14,9 @@
 """Plotting functions."""
 
 import matplotlib.pyplot as plt
+import matplotlib.colors as colors
 import numpy as np
 import Orange
-import scikit_posthocs as sp
 import seaborn as sns
 
 from analysis import data_utils
@@ -378,33 +378,103 @@ def write_better_than_plot(self, better_than_table, image_path):
         self._write_plot_to_image(self.better_than_plot, better_than_table,
                                   image_path)
 
-    def heatmap_plot(self, p_values, axes=None, symmetric=False):
+    @staticmethod
+    def _generic_heatmap_plot(values, axes, args, shrink_cbar=0.2):
+        """Custom heatmap plot which mimics scikit-posthocs' sign_plot."""
+        args.update({'linewidths': 0.5, 'linecolor': '0.5', 'square': True})
+        # Annotate with values only if there are fewer than 12 fuzzers.
+        if values.shape[0] > 11 and args.get('annot'):
+            args['annot'] = False
+
+        axis = sns.heatmap(values, ax=axes, **args)
+        axis.set_ylabel("")
+        axis.set_xlabel("")
+        label_args = {'rotation': 0, 'horizontalalignment': 'right'}
+        axis.set_yticklabels(axis.get_yticklabels(), **label_args)
+        label_args = {'rotation': 270, 'horizontalalignment': 'right'}
+        axis.set_xticklabels(axis.get_xticklabels(), **label_args)
+
+        cbar_ax = axis.collections[0].colorbar
+        cbar_ax.outline.set_linewidth(1)
+        cbar_ax.outline.set_edgecolor('0.5')
+
+        pos_bbox = cbar_ax.ax.get_position()
+        pos_bbox.y0 += shrink_cbar
+        pos_bbox.y1 -= shrink_cbar
+        cbar_ax.ax.set_position(pos_bbox)
+        return axis
+
+    def _pvalue_heatmap_plot(self, p_values, axes=None, symmetric=False):
         """Draws heatmap plot for visualizing statistical test results.
 
         If |symmetric| is enabled, it masks out the upper triangle of the
         p-value table (as it is redundant with the lower triangle).
         """
+        cmap_colors = ['#005a32', '#238b45', '#a1d99b', '#fbd7d4']
+        cmap = colors.ListedColormap(cmap_colors)
+
+        boundaries = [0, 0.001, 0.01, 0.05, 1]
+        norm = colors.BoundaryNorm(boundaries, cmap.N)
+
         if symmetric:
             mask = np.zeros_like(p_values)
             mask[np.triu_indices_from(p_values)] = True
 
         heatmap_args = {
-            'linewidths': 0.5,
-            'linecolor': '0.5',
-            'clip_on': False,
-            'square': True,
-            'cbar_ax_bbox': [0.85, 0.35, 0.04, 0.3],
-            'mask': mask if symmetric else None
+            'cmap': cmap,
+            'mask': mask if symmetric else None,
+            'fmt': ".3f",
+            'norm': norm
         }
-        sp.sign_plot(p_values, ax=axes, **heatmap_args)
+
+        axis = self._generic_heatmap_plot(p_values, axes, heatmap_args)
+
+        cbar_ax = axis.collections[0].colorbar
+        cbar_ax.set_ticklabels(['p < 0.001', 'p < 0.01', 'p < 0.05', 'NS'])
+        cbar_ax.set_ticks([0.0005, 0.005, 0.03, 0.5])
+        cbar_ax.ax.tick_params(size=0)
+        return axis
 
     def write_heatmap_plot(self, p_values, image_path, symmetric=False):
         """Writes heatmap plot."""
-        self._write_plot_to_image(self.heatmap_plot,
+        self._write_plot_to_image(self._pvalue_heatmap_plot,
                                   p_values,
                                   image_path,
                                   symmetric=symmetric)
 
+    def _a12_heatmap_plot(self, a12_values, axes=None):
+        """Draws heatmap plot for visualizing effect size results."""
+
+        palette_args = {
+            'h_neg': 12,
+            'h_pos': 128,
+            's': 99,
+            'l': 47,
+            'sep': 20,
+            'as_cmap': True
+        }
+
+        rdgn = sns.diverging_palette(**palette_args)
+
+        heatmap_args = {
+            'cmap': rdgn,
+            'vmin': 0.0,
+            'vmax': 1.0,
+            'square': True,
+            'annot': True,
+            'fmt': ".2f"
+        }
+        return self._generic_heatmap_plot(a12_values,
+                                          axes,
+                                          heatmap_args,
+                                          shrink_cbar=0.1)
+
+    def write_a12_heatmap_plot(self, a12_values, image_path):
+        """Writes A12 heatmap plot."""
+        self._write_plot_to_image(self._a12_heatmap_plot, a12_values,
+                                  image_path)
+
     def write_critical_difference_plot(self, average_ranks, num_of_benchmarks,
                                        image_path):
         """Writes critical difference diagram."""
24 changes: 22 additions & 2 deletions analysis/report_templates/default.html
@@ -259,7 +259,17 @@ <h5 class="center-align">Bug coverage sample statistics</h5>
       <br>
 
       <div class="row">
-        <div class="col s6 offset-s3">
+        <div class="col s6">
+          <h5 class="center-align">Vargha-Delaney A12 measure</h5>
+          <img class="responsive-img materialboxed"
+               src="{{ benchmark.bug_vargha_delaney_plot }}">
+          The table summarizes the A12 values from the pairwise
+          Vargha-Delaney A measure of effect size. Each cell gives the
+          probability that the fuzzer in the row outperforms the fuzzer
+          in the column; greener cells mean higher probabilities.
+        </div>
+
+        <div class="col s6">
           <h5 class="center-align">Mann-Whitney U test</h4>
           <img class="responsive-img materialboxed"
                src="{{ benchmark.bug_mann_whitney_plot }}">
@@ -288,7 +298,17 @@ <h5 class="center-align">Code coverage sample statistics</h5>
       <br>
 
       <div class="row">
-        <div class="col s6 offset-s3">
+        <div class="col s6">
+          <h5 class="center-align">Vargha-Delaney A12 measure</h5>
+          <img class="responsive-img materialboxed"
+               src="{{ benchmark.vargha_delaney_plot }}">
+          The table summarizes the A12 values from the pairwise
+          Vargha-Delaney A measure of effect size. Each cell gives the
+          probability that the fuzzer in the row outperforms the fuzzer
+          in the column; greener cells mean higher probabilities.
+        </div>
+
+        <div class="col s6">
           <h5 class="center-align">Mann-Whitney U test</h4>
           <img class="responsive-img materialboxed"
                src="{{ benchmark.mann_whitney_plot }}">
105 changes: 69 additions & 36 deletions analysis/stat_tests.py
@@ -21,10 +21,7 @@
 SIGNIFICANCE_THRESHOLD = 0.05
 
 
-def _create_p_value_table(benchmark_snapshot_df,
-                          key,
-                          statistical_test,
-                          alternative="two-sided"):
+def _create_pairwise_table(benchmark_snapshot_df, key, statistical_test):
     """Given a benchmark snapshot data frame and a statistical test function,
     returns a p-value table. The |alternative| parameter defines the alternative
     hypothesis to be tested. Use "two-sided" for two-tailed (default), and
@@ -35,11 +32,6 @@ def _create_p_value_table(benchmark_snapshot_df,
     statistical test of the fuzzer in the row and column of the cell.
     """
 
-    def test_pair(measurements_x, measurements_y):
-        return statistical_test(measurements_x,
-                                measurements_y,
-                                alternative=alternative).pvalue
-
     groups = benchmark_snapshot_df.groupby('fuzzer')
     samples = groups[key].apply(list)
     fuzzers = samples.index
@@ -48,51 +40,46 @@ def test_pair(measurements_x, measurements_y):
     for f_i in fuzzers:
         row = []
         for f_j in fuzzers:
-            if f_i == f_j:
-                # TODO(lszekeres): With Pandas 1.0.0+, switch to:
-                # p_value = pd.NA
-                p_value = np.nan
-            elif set(samples[f_i]) == set(samples[f_j]):
-                p_value = np.nan
-            else:
-                p_value = test_pair(samples[f_i], samples[f_j])
-            row.append(p_value)
+            value = np.nan
+            if f_i != f_j and set(samples[f_i]) != set(samples[f_j]):
+                value = statistical_test(samples[f_i], samples[f_j])
+            row.append(value)
         data.append(row)
 
-    p_values = pd.DataFrame(data, index=fuzzers, columns=fuzzers)
-    return p_values
+    return pd.DataFrame(data, index=fuzzers, columns=fuzzers)
 
 
 def one_sided_u_test(benchmark_snapshot_df, key):
     """Returns p-value table for one-tailed Mann-Whitney U test."""
-    return _create_p_value_table(benchmark_snapshot_df,
-                                 key,
-                                 ss.mannwhitneyu,
-                                 alternative='greater')
+    return _create_pairwise_table(
+        benchmark_snapshot_df, key,
+        lambda xs, ys: ss.mannwhitneyu(xs, ys, alternative='greater').pvalue)
 
 
 def two_sided_u_test(benchmark_snapshot_df, key):
     """Returns p-value table for two-tailed Mann-Whitney U test."""
-    return _create_p_value_table(benchmark_snapshot_df,
-                                 key,
-                                 ss.mannwhitneyu,
-                                 alternative='two-sided')
+    return _create_pairwise_table(
+        benchmark_snapshot_df, key,
+        lambda xs, ys: ss.mannwhitneyu(xs, ys, alternative='two-sided').pvalue)
 
 
 def one_sided_wilcoxon_test(benchmark_snapshot_df, key):
     """Returns p-value table for one-tailed Wilcoxon signed-rank test."""
-    return _create_p_value_table(benchmark_snapshot_df,
-                                 key,
-                                 ss.wilcoxon,
-                                 alternative='greater')
+    return _create_pairwise_table(
+        benchmark_snapshot_df, key,
+        lambda xs, ys: ss.wilcoxon(xs, ys, alternative='greater').pvalue)
 
 
 def two_sided_wilcoxon_test(benchmark_snapshot_df, key):
     """Returns p-value table for two-tailed Wilcoxon signed-rank test."""
-    return _create_p_value_table(benchmark_snapshot_df,
-                                 key,
-                                 ss.wilcoxon,
-                                 alternative='two-sided')
+    return _create_pairwise_table(
+        benchmark_snapshot_df, key,
+        lambda xs, ys: ss.wilcoxon(xs, ys, alternative='two-sided').pvalue)
+
+
+def a12_measure_test(benchmark_snapshot_df, key='edges_covered'):
+    """Returns a Vargha-Delaney A12 measure table."""
+    return _create_pairwise_table(benchmark_snapshot_df, key, a12)
 
 
 def anova_test(benchmark_snapshot_df, key):
@@ -179,3 +166,49 @@ def friedman_posthoc_tests(experiment_pivot_df):
     posthoc_tests['conover'] = sp.posthoc_conover_friedman(experiment_pivot_df)
     posthoc_tests['nemenyi'] = sp.posthoc_nemenyi_friedman(experiment_pivot_df)
     return posthoc_tests
+
+
+def a12(measurements_x, measurements_y):
+    """Returns Vargha-Delaney A12 measure effect size for two distributions.
+    A. Vargha and H. D. Delaney.
+    A critique and improvement of the CL common language effect size statistics
+    of McGraw and Wong.
+    Journal of Educational and Behavioral Statistics, 25(2):101-132, 2000
+    The Vargha and Delaney A12 statistic is a non-parametric effect size
+    measure.
+    Given observations of a metric (edges_covered or bugs_covered) for
+    fuzzer 1 (F1) and fuzzer 2 (F2), the A12 measures the probability that
+    running F1 will yield a higher metric than running F2.
+    Significance levels from the original paper:
+    Large is > 0.714
+    Medium is > 0.638
+    Small is > 0.556
+    """
+
+    x_array = np.asarray(measurements_x)
+    y_array = np.asarray(measurements_y)
+    x_size, y_size = x_array.size, y_array.size
+    ranked = ss.rankdata(np.concatenate((x_array, y_array)))
+    rank_x = ranked[0:x_size]  # get the x-ranks
+
+    rank_x_sum = rank_x.sum()
+    # A = (R1/n1 - (n1+1)/2)/n2  # formula (14) in Vargha and Delaney, 2000
+    # The formula to compute A has been transformed to minimize accuracy errors.
+    # See: http://mtorchiano.wordpress.com/2014/05/19/effect-size-of-r-precision/
+
+    a12_measure = (2 * rank_x_sum - x_size * (x_size + 1)) / (
+        2 * y_size * x_size)  # equivalent formula to avoid accuracy errors
+    return a12_measure
+
+
+def benchmark_a12(benchmark_snapshot_df, f1_name, f2_name, key='edges_covered'):
+    """Compute Vargha-Delaney A measure given a benchmark snapshot and the names
+    of two fuzzers to compare."""
+    df = benchmark_snapshot_df
+    f1_metric = df[df.fuzzer == f1_name][key]
+    f2_metric = df[df.fuzzer == f2_name][key]
+    return a12(f1_metric, f2_metric)
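
For orientation, a minimal usage sketch of the new helpers (not part of the commit): the data frame, coverage numbers, and fuzzer names 'afl' and 'libfuzzer' below are made up, and it assumes the repository's analysis package is importable.

import pandas as pd

from analysis import stat_tests

# Hypothetical snapshot: three trials each for two placeholder fuzzers.
snapshot_df = pd.DataFrame({
    'fuzzer': ['afl', 'afl', 'afl', 'libfuzzer', 'libfuzzer', 'libfuzzer'],
    'edges_covered': [2100, 2250, 2300, 2000, 2050, 1980],
})

# Pairwise A12 table as used by the new report section (NaN on the diagonal).
a12_table = stat_tests.a12_measure_test(snapshot_df)

# Single pairwise comparison; 1.0 here because every 'afl' sample covers more.
print(stat_tests.benchmark_a12(snapshot_df, 'afl', 'libfuzzer'))
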

