adds Vargha-Delaney A measure for effect size (google#998)
- adds a separate table under per-benchmark statistics to complement
  the Mann-Whitney U test.
- current implementation uses a continuous palette instead of
  discrete significance levels.
- plan to integrate the A12 measure into benchmark- and experiment-level
  summary statistics.
wideglide authored Dec 24, 2020
1 parent 6e9dd77 commit c6788ec
Showing 6 changed files with 273 additions and 62 deletions.
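
Before the per-file diffs, a rough illustration of the statistic being added (not part of the commit; the coverage numbers and variable names below are made up). The A12 value reported in the new table is the probability that a run of one fuzzer yields a higher metric than a run of another, computed with the same rank-based formula the commit introduces in stat_tests.py:

import numpy as np
import scipy.stats as ss

# Made-up coverage measurements for two hypothetical fuzzers.
coverage_f1 = [2100, 2250, 2300, 2180]
coverage_f2 = [2000, 2050, 1980, 2120]

# Rank all measurements together, then sum the ranks of the first sample.
ranked = ss.rankdata(np.concatenate((coverage_f1, coverage_f2)))
rank_sum_f1 = ranked[:len(coverage_f1)].sum()
n_1, n_2 = len(coverage_f1), len(coverage_f2)

# Formula (14) from Vargha and Delaney (2000), rearranged as in the commit.
a12 = (2 * rank_sum_f1 - n_1 * (n_1 + 1)) / (2 * n_1 * n_2)
print(a12)  # 0.9375: fuzzer 1 outperforms fuzzer 2 in ~94% of pairings.
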
51 changes: 43 additions & 8 deletions analysis/benchmark_results.py
@@ -188,21 +188,56 @@ def bug_mann_whitney_p_values(self):
                                            key='bugs_covered')
 
     @property
-    def mann_whitney_plot(self):
-        """Mann Whitney U test plot."""
-        plot_filename = self._prefix_with_benchmark('mann_whitney_plot.svg')
-        self._plotter.write_heatmap_plot(self.mann_whitney_p_values,
+    @functools.lru_cache()
+    def vargha_delaney_a12_values(self):
+        """Vargha Delaney A12 measure results (code coverage)."""
+        return stat_tests.a12_measure_test(self._benchmark_snapshot_df)
+
+    @property
+    @functools.lru_cache()
+    def bug_vargha_delaney_a12_values(self):
+        """Vargha Delaney A12 measure results (bug coverage)."""
+        return stat_tests.a12_measure_test(self._benchmark_snapshot_df,
+                                           key='bugs_covered')
+
+    def _mann_whitney_plot(self, filename, p_values):
+        """Generic Mann Whitney U test plot."""
+        plot_filename = self._prefix_with_benchmark(filename)
+        self._plotter.write_heatmap_plot(p_values,
                                          self._get_full_path(plot_filename))
         return plot_filename
 
     @property
+    def mann_whitney_plot(self):
+        """Mann Whitney U test plot (code coverage)."""
+        return self._mann_whitney_plot('mann_whitney_plot.svg',
+                                       self.mann_whitney_p_values)
+
+    @property
     def bug_mann_whitney_plot(self):
-        """Mann Whitney U test plot based on bugs covered."""
-        plot_filename = self._prefix_with_benchmark('bug_mann_whitney_plot.svg')
-        self._plotter.write_heatmap_plot(self.bug_mann_whitney_p_values,
-                                         self._get_full_path(plot_filename))
+        """Mann Whitney U test plot (bug coverage)."""
+        return self._mann_whitney_plot('bug_mann_whitney_plot.svg',
+                                       self.bug_mann_whitney_p_values)
+
+    def _vargha_delaney_plot(self, filename, a12_values):
+        """Generic Vargha Delaney A12 measure plot."""
+        plot_filename = self._prefix_with_benchmark(filename)
+        self._plotter.write_a12_heatmap_plot(a12_values,
+                                             self._get_full_path(plot_filename))
         return plot_filename
 
     @property
+    def vargha_delaney_plot(self):
+        """Vargha Delaney A12 measure plot (code coverage)."""
+        return self._vargha_delaney_plot('varga_delaney_a12_plot.svg',
+                                         self.vargha_delaney_a12_values)
+
+    @property
+    def bug_vargha_delaney_plot(self):
+        """Vargha Delaney A12 measure plot (bug coverage)."""
+        return self._vargha_delaney_plot('bug_varga_delaney_a12_plot.svg',
+                                         self.bug_vargha_delaney_a12_values)
+
+    @property
     def anova_p_value(self):
         """ANOVA test result."""
90 changes: 80 additions & 10 deletions analysis/plotting.py
@@ -14,9 +14,9 @@
 """Plotting functions."""
 
 import matplotlib.pyplot as plt
+import matplotlib.colors as colors
 import numpy as np
 import Orange
-import scikit_posthocs as sp
 import seaborn as sns
 
 from analysis import data_utils
@@ -378,33 +378,103 @@ def write_better_than_plot(self, better_than_table, image_path):
         self._write_plot_to_image(self.better_than_plot, better_than_table,
                                   image_path)
 
-    def heatmap_plot(self, p_values, axes=None, symmetric=False):
+    @staticmethod
+    def _generic_heatmap_plot(values, axes, args, shrink_cbar=0.2):
+        """Custom heatmap plot which mimics scikit-posthocs' sign_plot."""
+        args.update({'linewidths': 0.5, 'linecolor': '0.5', 'square': True})
+        # Annotate with values only if there are fewer than 12 fuzzers.
+        if values.shape[0] > 11 and args.get('annot'):
+            args['annot'] = False
+
+        axis = sns.heatmap(values, ax=axes, **args)
+        axis.set_ylabel("")
+        axis.set_xlabel("")
+        label_args = {'rotation': 0, 'horizontalalignment': 'right'}
+        axis.set_yticklabels(axis.get_yticklabels(), **label_args)
+        label_args = {'rotation': 270, 'horizontalalignment': 'right'}
+        axis.set_xticklabels(axis.get_xticklabels(), **label_args)
+
+        cbar_ax = axis.collections[0].colorbar
+        cbar_ax.outline.set_linewidth(1)
+        cbar_ax.outline.set_edgecolor('0.5')
+
+        pos_bbox = cbar_ax.ax.get_position()
+        pos_bbox.y0 += shrink_cbar
+        pos_bbox.y1 -= shrink_cbar
+        cbar_ax.ax.set_position(pos_bbox)
+        return axis
+
+    def _pvalue_heatmap_plot(self, p_values, axes=None, symmetric=False):
         """Draws heatmap plot for visualizing statistical test results.
 
         If |symmetric| is enabled, it masks out the upper triangle of the
         p-value table (as it is redundant with the lower triangle).
         """
+        cmap_colors = ['#005a32', '#238b45', '#a1d99b', '#fbd7d4']
+        cmap = colors.ListedColormap(cmap_colors)
+
+        boundaries = [0, 0.001, 0.01, 0.05, 1]
+        norm = colors.BoundaryNorm(boundaries, cmap.N)
+
         if symmetric:
             mask = np.zeros_like(p_values)
             mask[np.triu_indices_from(p_values)] = True
 
         heatmap_args = {
-            'linewidths': 0.5,
-            'linecolor': '0.5',
-            'clip_on': False,
-            'square': True,
-            'cbar_ax_bbox': [0.85, 0.35, 0.04, 0.3],
-            'mask': mask if symmetric else None
+            'cmap': cmap,
+            'mask': mask if symmetric else None,
+            'fmt': ".3f",
+            'norm': norm
         }
-        sp.sign_plot(p_values, ax=axes, **heatmap_args)
+
+        axis = self._generic_heatmap_plot(p_values, axes, heatmap_args)
+
+        cbar_ax = axis.collections[0].colorbar
+        cbar_ax.set_ticklabels(['p < 0.001', 'p < 0.01', 'p < 0.05', 'NS'])
+        cbar_ax.set_ticks([0.0005, 0.005, 0.03, 0.5])
+        cbar_ax.ax.tick_params(size=0)
+        return axis
 
     def write_heatmap_plot(self, p_values, image_path, symmetric=False):
         """Writes heatmap plot."""
-        self._write_plot_to_image(self.heatmap_plot,
+        self._write_plot_to_image(self._pvalue_heatmap_plot,
                                   p_values,
                                   image_path,
                                   symmetric=symmetric)
 
+    def _a12_heatmap_plot(self, a12_values, axes=None):
+        """Draws heatmap plot for visualizing effect size results."""
+
+        palette_args = {
+            'h_neg': 12,
+            'h_pos': 128,
+            's': 99,
+            'l': 47,
+            'sep': 20,
+            'as_cmap': True
+        }
+
+        rdgn = sns.diverging_palette(**palette_args)
+
+        heatmap_args = {
+            'cmap': rdgn,
+            'vmin': 0.0,
+            'vmax': 1.0,
+            'square': True,
+            'annot': True,
+            'fmt': ".2f"
+        }
+        return self._generic_heatmap_plot(a12_values,
+                                          axes,
+                                          heatmap_args,
+                                          shrink_cbar=0.1)
+
+    def write_a12_heatmap_plot(self, a12_values, image_path):
+        """Writes A12 heatmap plot."""
+        self._write_plot_to_image(self._a12_heatmap_plot, a12_values,
+                                  image_path)
+
     def write_critical_difference_plot(self, average_ranks, num_of_benchmarks,
                                        image_path):
         """Writes critical difference diagram."""
24 changes: 22 additions & 2 deletions analysis/report_templates/default.html
@@ -259,7 +259,17 @@ <h5 class="center-align">Bug coverage sample statistics</h5>
       <br>
 
       <div class="row">
-        <div class="col s6 offset-s3">
+        <div class="col s6">
+          <h5 class="center-align">Vargha-Delaney A12 measure</h5>
+          <img class="responsive-img materialboxed"
+               src="{{ benchmark.bug_vargha_delaney_plot }}">
+          The table summarizes the A12 values from the pairwise
+          Vargha-Delaney A measure of effect size. Each cell gives the
+          probability that the fuzzer in the row outperforms the fuzzer
+          in the column; greener cells mean higher probabilities.
+        </div>
+
+        <div class="col s6">
           <h5 class="center-align">Mann-Whitney U test</h4>
           <img class="responsive-img materialboxed"
                src="{{ benchmark.bug_mann_whitney_plot }}">
@@ -288,7 +298,17 @@ <h5 class="center-align">Code coverage sample statistics</h5>
       <br>
 
       <div class="row">
-        <div class="col s6 offset-s3">
+        <div class="col s6">
+          <h5 class="center-align">Vargha-Delaney A12 measure</h5>
+          <img class="responsive-img materialboxed"
+               src="{{ benchmark.vargha_delaney_plot }}">
+          The table summarizes the A12 values from the pairwise
+          Vargha-Delaney A measure of effect size. Each cell gives the
+          probability that the fuzzer in the row outperforms the fuzzer
+          in the column; greener cells mean higher probabilities.
+        </div>
+
+        <div class="col s6">
           <h5 class="center-align">Mann-Whitney U test</h4>
           <img class="responsive-img materialboxed"
                src="{{ benchmark.mann_whitney_plot }}">
105 changes: 69 additions & 36 deletions analysis/stat_tests.py
@@ -21,10 +21,7 @@
 SIGNIFICANCE_THRESHOLD = 0.05
 
 
-def _create_p_value_table(benchmark_snapshot_df,
-                          key,
-                          statistical_test,
-                          alternative="two-sided"):
+def _create_pairwise_table(benchmark_snapshot_df, key, statistical_test):
     """Given a benchmark snapshot data frame and a statistical test function,
     returns a p-value table. The |alternative| parameter defines the alternative
     hypothesis to be tested. Use "two-sided" for two-tailed (default), and
@@ -35,11 +32,6 @@ def _create_p_value_table(benchmark_snapshot_df,
     statistical test of the fuzzer in the row and column of the cell.
     """
 
-    def test_pair(measurements_x, measurements_y):
-        return statistical_test(measurements_x,
-                                measurements_y,
-                                alternative=alternative).pvalue
-
     groups = benchmark_snapshot_df.groupby('fuzzer')
     samples = groups[key].apply(list)
     fuzzers = samples.index
@@ -48,51 +40,46 @@ def test_pair(measurements_x, measurements_y):
     for f_i in fuzzers:
         row = []
         for f_j in fuzzers:
-            if f_i == f_j:
-                # TODO(lszekeres): With Pandas 1.0.0+, switch to:
-                # p_value = pd.NA
-                p_value = np.nan
-            elif set(samples[f_i]) == set(samples[f_j]):
-                p_value = np.nan
-            else:
-                p_value = test_pair(samples[f_i], samples[f_j])
-            row.append(p_value)
+            value = np.nan
+            if f_i != f_j and set(samples[f_i]) != set(samples[f_j]):
+                value = statistical_test(samples[f_i], samples[f_j])
+            row.append(value)
         data.append(row)
 
-    p_values = pd.DataFrame(data, index=fuzzers, columns=fuzzers)
-    return p_values
+    return pd.DataFrame(data, index=fuzzers, columns=fuzzers)
 
 
 def one_sided_u_test(benchmark_snapshot_df, key):
     """Returns p-value table for one-tailed Mann-Whitney U test."""
-    return _create_p_value_table(benchmark_snapshot_df,
-                                 key,
-                                 ss.mannwhitneyu,
-                                 alternative='greater')
+    return _create_pairwise_table(
+        benchmark_snapshot_df, key,
+        lambda xs, ys: ss.mannwhitneyu(xs, ys, alternative='greater').pvalue)
 
 
 def two_sided_u_test(benchmark_snapshot_df, key):
     """Returns p-value table for two-tailed Mann-Whitney U test."""
-    return _create_p_value_table(benchmark_snapshot_df,
-                                 key,
-                                 ss.mannwhitneyu,
-                                 alternative='two-sided')
+    return _create_pairwise_table(
+        benchmark_snapshot_df, key,
+        lambda xs, ys: ss.mannwhitneyu(xs, ys, alternative='two-sided').pvalue)
 
 
 def one_sided_wilcoxon_test(benchmark_snapshot_df, key):
     """Returns p-value table for one-tailed Wilcoxon signed-rank test."""
-    return _create_p_value_table(benchmark_snapshot_df,
-                                 key,
-                                 ss.wilcoxon,
-                                 alternative='greater')
+    return _create_pairwise_table(
+        benchmark_snapshot_df, key,
+        lambda xs, ys: ss.wilcoxon(xs, ys, alternative='greater').pvalue)
 
 
 def two_sided_wilcoxon_test(benchmark_snapshot_df, key):
     """Returns p-value table for two-tailed Wilcoxon signed-rank test."""
-    return _create_p_value_table(benchmark_snapshot_df,
-                                 key,
-                                 ss.wilcoxon,
-                                 alternative='two-sided')
+    return _create_pairwise_table(
+        benchmark_snapshot_df, key,
+        lambda xs, ys: ss.wilcoxon(xs, ys, alternative='two-sided').pvalue)
+
+
+def a12_measure_test(benchmark_snapshot_df, key='edges_covered'):
+    """Returns a Vargha-Delaney A12 measure table."""
+    return _create_pairwise_table(benchmark_snapshot_df, key, a12)
 
 
 def anova_test(benchmark_snapshot_df, key):
@@ -179,3 +166,49 @@ def friedman_posthoc_tests(experiment_pivot_df):
     posthoc_tests['conover'] = sp.posthoc_conover_friedman(experiment_pivot_df)
     posthoc_tests['nemenyi'] = sp.posthoc_nemenyi_friedman(experiment_pivot_df)
     return posthoc_tests
+
+
+def a12(measurements_x, measurements_y):
+    """Returns Vargha-Delaney A12 measure effect size for two distributions.
+    A. Vargha and H. D. Delaney.
+    A critique and improvement of the CL common language effect size statistics
+    of McGraw and Wong.
+    Journal of Educational and Behavioral Statistics, 25(2):101-132, 2000
+    The Vargha and Delaney A12 statistic is a non-parametric effect size
+    measure.
+    Given observations of a metric (edges_covered or bugs_covered) for
+    fuzzer 1 (F1) and fuzzer 2 (F2), the A12 measures the probability that
+    running F1 will yield a higher metric than running F2.
+    Significance levels from the original paper:
+    Large is > 0.714
+    Medium is > 0.638
+    Small is > 0.556
+    """
+
+    x_array = np.asarray(measurements_x)
+    y_array = np.asarray(measurements_y)
+    x_size, y_size = x_array.size, y_array.size
+    ranked = ss.rankdata(np.concatenate((x_array, y_array)))
+    rank_x = ranked[0:x_size]  # get the x-ranks
+
+    rank_x_sum = rank_x.sum()
+    # A = (R1/n1 - (n1+1)/2)/n2  # formula (14) in Vargha and Delaney, 2000
+    # The formula to compute A has been transformed to minimize accuracy errors.
+    # See: http://mtorchiano.wordpress.com/2014/05/19/effect-size-of-r-precision/
+
+    a12_measure = (2 * rank_x_sum - x_size * (x_size + 1)) / (
+        2 * y_size * x_size)  # equivalent formula to avoid accuracy errors
+    return a12_measure
+
+
+def benchmark_a12(benchmark_snapshot_df, f1_name, f2_name, key='edges_covered'):
+    """Compute Vargha-Delaney A measure given a benchmark snapshot and the names
+    of two fuzzers to compare."""
+    df = benchmark_snapshot_df
+    f1_metric = df[df.fuzzer == f1_name][key]
+    f2_metric = df[df.fuzzer == f2_name][key]
+    return a12(f1_metric, f2_metric)
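
For orientation, a minimal usage sketch of the new helpers (not part of the commit): the data frame, coverage numbers, and fuzzer names 'afl' and 'libfuzzer' below are made up, and it assumes the repository's analysis package is importable.

import pandas as pd

from analysis import stat_tests

# Hypothetical snapshot: three trials each for two placeholder fuzzers.
snapshot_df = pd.DataFrame({
    'fuzzer': ['afl', 'afl', 'afl', 'libfuzzer', 'libfuzzer', 'libfuzzer'],
    'edges_covered': [2100, 2250, 2300, 2000, 2050, 1980],
})

# Pairwise A12 table as used by the new report section (NaN on the diagonal).
a12_table = stat_tests.a12_measure_test(snapshot_df)

# Single pairwise comparison; 1.0 here because every 'afl' sample covers more.
print(stat_tests.benchmark_a12(snapshot_df, 'afl', 'libfuzzer'))
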

