utils/compare.py

#!/usr/bin/env python
"""Tool to filter, organize, compare and display benchmarking results. Usefull
for smaller datasets. It works great with a few dozen runs, but it is not
designed to deal with hundreds.
Requires the pandas and scipy libraries to be installed."""
from __future__ import print_function

import pandas as pd
from scipy import stats
import sys
import os.path
import re
import numbers
import argparse

# Label used by add_geomean_row() both as the index and as the "Program" value
# of the summary row it appends to the output table.
GEOMEAN_ROW = "Geomean difference"


def read_lit_json(filename):
    """Load a lit JSON results file into a DataFrame with one row per test.

    The file must contain a top-level "tests" list. For every test that has a
    "metrics" dictionary, each metric becomes a column, as does every
    test-level key listed in ``info_columns`` (currently only "hash"). Tests
    without "metrics" are skipped with a warning on stdout. Rows are labelled
    with the test's "shortname" when present and its "name" otherwise; the
    index is called "Program". Cells with no value are NaN.

    Exits the process with status 1 when the "tests" key is missing, when a
    test has no "name", or when two tests share the same name.
    """
    import json

    # Use a context manager so the file handle is closed deterministically.
    with open(filename) as jsonfile:
        jsondata = json.load(jsonfile)
    columns = []
    columnindexes = {}
    names = set()
    info_columns = ["hash"]

    def add_column(column):
        # Register a column the first time it is seen, preserving order.
        if column not in columnindexes:
            columnindexes[column] = len(columns)
            columns.append(column)

    # Pass1: Figure out metrics (= the column index)
    if "tests" not in jsondata:
        sys.stderr.write("%s: Could not find toplevel 'tests' key\n" % filename)
        sys.exit(1)
    for test in jsondata["tests"]:
        name = test.get("name")
        if name is None:
            sys.stderr.write("Error: Found unnamed test\n")
            sys.exit(1)
        if name in names:
            sys.stderr.write("Error: Multiple tests with name '%s'\n" % name)
            sys.exit(1)
        if "metrics" not in test:
            print("Warning: '%s' has no metrics, skipping!" % name)
            continue
        names.add(name)
        for metricname in test["metrics"]:
            add_column(metricname)
        for key in test:
            if key in info_columns:
                add_column(key)

    # Pass2 actual data construction
    nan = float("NaN")
    data = []
    testnames = []
    for test in jsondata["tests"]:
        if "metrics" not in test:
            continue
        testnames.append(test.get("shortname", test["name"]))

        datarow = [nan] * len(columns)
        for metricname, value in test["metrics"].items():
            datarow[columnindexes[metricname]] = value
        # Test-level keys are written after the metrics, so a test-level key
        # wins over a metric of the same name.
        for key, value in test.items():
            column = columnindexes.get(key)
            if column is not None:
                datarow[column] = value
        data.append(datarow)
    index = pd.Index(testnames, name="Program")
    return pd.DataFrame(data=data, index=index, columns=columns)


def read_report_simple_csv(filename):
    """Read a CSV report: row 0 is the header, column 0 the index, "*" is NaN."""
    csv_options = {"na_values": ["*"], "index_col": 0, "header": 0}
    return pd.read_csv(filename, **csv_options)


def read(name):
    """Load a results file, picking the reader from the file extension.

    ".json" files go through read_lit_json and ".csv" files through
    read_report_simple_csv; any other name raises an Exception.
    """
    is_json = name.endswith(".json")
    is_csv = name.endswith(".csv")
    if not (is_json or is_csv):
        raise Exception("Cannot determine file format")
    if is_json:
        return read_lit_json(name)
    return read_report_simple_csv(name)


def readmulti(filenames):
    """Read several result files and stack them into a single DataFrame.

    Each file is labelled with its basename minus a trailing ".csv"/".json";
    a numeric suffix is appended when the label is already taken. The labels
    become the outer index level, named "run". A warning is written to stderr
    when consecutive files use different index names.
    """
    run_labels = []
    frames = []
    last_index = None
    for path in filenames:
        frame = read(path)
        frames.append(frame)

        # drop .json/.csv suffix; TODO: Should we rather do this in the printing
        # logic?
        label = os.path.basename(path)
        for extension in (".csv", ".json"):
            if label.endswith(extension):
                label = label[: -len(extension)]

        # Make the label unique by appending 0, 1, 2, ... until it is unused.
        suffix = ""
        counter = 0
        while label + suffix in run_labels:
            suffix = str(counter)
            counter += 1
        run_labels.append(label + suffix)

        # Warn if index names are different
        if last_index is not None and last_index.name != frame.index.name:
            sys.stderr.write(
                "Warning: Mismatched index names: '%s' vs '%s'\n"
                % (last_index.name, frame.index.name)
            )
        last_index = frame.index

    # Merge datasets
    return pd.concat(frames, axis=0, names=["run"], keys=run_labels)


def get_values(values):
    """Return the two series to compare for one metric.

    Any "diff" column is ignored. With exactly two remaining columns they are
    returned in order; otherwise the row-wise minimum and maximum are returned.
    """
    # Create data view without diff column.
    columns = values.columns
    if "diff" in columns:
        kept = [column for column in columns if column != "diff"]
        values = values[kept]
    if len(values.columns) != 2:
        return (values.min(axis=1), values.max(axis=1))
    first = values.iloc[:, 0]
    second = values.iloc[:, 1]
    return (first, second)


def add_diff_column(metric, values, absolute_diff=False):
    """Add a (metric, "diff") column to `values` and return `values`.

    The two series picked by get_values() have their NaNs replaced by 0.0 in
    place. The diff is their difference when absolute_diff is set and the
    quotient minus one otherwise.
    """
    baseline, contender = get_values(values[metric])
    for series in (baseline, contender):
        series.fillna(0.0, inplace=True)
    diff_column = (metric, "diff")
    # Quotient or absolute difference?
    if not absolute_diff:
        values[diff_column] = (contender / baseline) - 1.0
    else:
        values[diff_column] = contender - baseline
    return values


def add_geomean_row(metrics, data, dataout):
    """
    Return dataout with a summary row appended that holds, for every metric,
    the geometric mean of values1 / values0 minus one in its "diff" column.
    """
    summary = pd.DataFrame(
        index=[GEOMEAN_ROW], columns=dataout.columns, dtype="float64"
    )
    nan = float("NaN")
    for metric in metrics:
        baseline, contender = get_values(data[metric])
        # A zero baseline would give an infinite ratio and break the geometric
        # mean, so turn zeros into NaN and drop those ratios instead.
        baseline = baseline.replace({0: nan})
        ratios = (contender / baseline).dropna()
        summary[(metric, "diff")] = stats.gmean(ratios) - 1.0
    summary.Program = GEOMEAN_ROW
    return pd.concat([dataout, summary])


def filter_failed(data, key="Exec"):
    """Keep only the rows whose `key` column equals "pass"."""
    passed = data[key] == "pass"
    return data.loc[passed]


def filter_short(data, threshold, key="Exec_Time"):
    """Keep only the rows whose `key` column is at least `threshold`."""
    long_enough = data[key] >= threshold
    return data.loc[long_enough]


def filter_same_hash(data, key="hash"):
    """Drop every program (index level 1) whose `key` value is the same in all rows.

    Requires the `key` column to exist and index level 0 to hold more than one
    distinct value.
    """
    assert key in data.columns
    outer_labels = data.index.get_level_values(0)
    assert outer_labels.nunique() > 1

    def hash_differs(group):
        return group[key].nunique() != 1

    return data.groupby(level=1).filter(hash_differs)


def filter_blacklist(data, blacklist):
    """Drop the rows whose index level 1 value appears in `blacklist`."""
    programs = data.index.get_level_values(1)
    keep = ~programs.isin(blacklist)
    return data.loc[keep]


def print_filter_stats(reason, before, after):
    """Print how many index level 1 groups a filter removed, if any."""
    groups_before = len(before.groupby(level=1))
    groups_after = len(after.groupby(level=1))
    removed = groups_before - groups_after
    if removed == 0:
        return
    print("%s: %s (filtered out)" % (reason, removed))


# Truncate a string to a maximum length of prefix_len + suffix_len + 3 by
# keeping a prefix, a suffix and ... in the middle. Strings that already fit
# are returned unchanged, so the result is never longer than the input.
def truncate(string, prefix_len, suffix_len):
    ellipsis = "..."
    if len(string) <= prefix_len + suffix_len + len(ellipsis):
        return string
    # Slice from an explicit start offset: string[-0:] would be the whole
    # string rather than an empty suffix.
    suffix = string[len(string) - suffix_len :]
    return string[:prefix_len] + ellipsis + suffix


# Search for common prefixes and suffixes in a collection of names and return
# a (prefix,suffix) tuple that specifies how many characters can be dropped
# for the prefix/suffix. A common suffix is only dropped when it starts with
# "." or "_", and it is shortened so that at least min_len characters of the
# shortest name remain between the prefix and the suffix. The prefix is only
# capped so that prefix and suffix together never exceed the shortest name.
# `names` may be a pandas Series or any other sized iterable of strings.
def determine_common_prefix_suffix(names, min_len=8):
    if len(names) <= 1:
        return (0, 0)
    names = list(names)
    name0 = names[0]
    prefix_len = len(name0)
    suffix_len = len(name0)
    shortest_name = len(name0)
    for name in names:
        shortest_name = min(shortest_name, len(name))
        # Shrink the candidate prefix/suffix of name0 until `name` shares it.
        while prefix_len > 0 and name[:prefix_len] != name0[:prefix_len]:
            prefix_len -= 1
        while suffix_len > 0 and name[-suffix_len:] != name0[-suffix_len:]:
            suffix_len -= 1

    # Only look at the first suffix character when a common suffix exists;
    # this also covers an empty first name.
    if suffix_len > 0 and name0[-suffix_len] not in (".", "_"):
        suffix_len = 0
    suffix_len = max(0, min(shortest_name - prefix_len - min_len, suffix_len))
    prefix_len = max(0, min(shortest_name - suffix_len, prefix_len))
    return (prefix_len, suffix_len)


def format_relative_diff(value):
    """Format a diff cell: integers left-aligned as-is, anything else as a percentage."""
    if isinstance(value, numbers.Integral):
        return "%-5d" % value
    percent = value * 100.0
    return "%4.1f%%" % percent


def print_result(
    d,
    limit_output=True,
    shorten_names=True,
    minimal_names=False,
    show_diff_column=True,
    sortkey="diff",
    sort_by_abs=True,
    absolute_diff=False,
):
    """Sort, format and print the comparison table, followed by summary stats.

    Args:
        d: DataFrame whose columns are a two-level MultiIndex. Level 0 is the
            metric name; level 1 holds the value columns plus "diff". The row
            index holds the program names.
        limit_output: If true, print only the first 15 rows after sorting.
        shorten_names: If true, shorten the program names before printing.
        minimal_names: With shorten_names, keep only the last "/"-separated
            component (minus a trailing ".test") instead of dropping the
            common prefix/suffix and truncating.
        show_diff_column: If false, drop every "diff" column from the output.
        sortkey: Level-1 column of the first metric to sort by (descending).
        sort_by_abs: If true, sort by the absolute value of that column.
        absolute_diff: If true, the diff columns are not formatted as
            percentages and no geomean row is added.

    Side effects: prints the table and ``d.describe()`` to stdout and changes
    the global pandas options "display.max_colwidth" and "display.width".
    """
    metrics = d.columns.levels[0]
    # Sorting always uses the first metric in the level-0 index.
    if sort_by_abs:
        d = d.sort_values(by=(metrics[0], sortkey), key=pd.Series.abs, ascending=False)
    else:
        d = d.sort_values(by=(metrics[0], sortkey), ascending=False)

    # Ensure that the columns are grouped by metric (rather than having the
    # diffs at the end of the line).
    d = d.reindex(columns=d.columns.levels[0], level=0)

    if not show_diff_column:
        # Remove all diff columns (using level=1 since level 0 is the metric).
        d.drop(labels="diff", level=1, axis=1, inplace=True)
    dataout = d
    if limit_output:
        # Take 15 topmost elements
        dataout = dataout.head(15)

    # Per-column formatters for to_string(); only relative diffs get one.
    formatters = dict()
    if not absolute_diff:
        for m in metrics:
            formatters[(m, "diff")] = format_relative_diff
    # Turn index into a column so we can format it...
    formatted_program = dataout.index.to_series()
    if shorten_names:

        def format_name(name, common_prefix, common_suffix):
            # Drop the shared prefix/suffix, then truncate and pad to 45 chars.
            name = name[common_prefix:]
            if common_suffix > 0:
                name = name[:-common_suffix]
            return "%-45s" % truncate(name, 10, 30)

        def strip_name_fully(name):
            # Keep only the last path component, without a ".test" extension.
            name = name.split("/")[-1]
            if name.endswith(".test"):
                name = name[:-5]
            return name

        # The to_string formatters argument appears to be ignored for
        # dtype=object, so transform the program column manually.
        if minimal_names:
            formatted_program = formatted_program.map(strip_name_fully)
        else:
            drop_prefix, drop_suffix = determine_common_prefix_suffix(formatted_program)
            formatted_program = formatted_program.map(
                lambda name: format_name(name, drop_prefix, drop_suffix)
            )
    # NOTE: when limit_output is false, dataout is the same object as d, so
    # this in-place insert also adds the "Program" column to d.
    dataout.insert(0, "Program", formatted_program)
    # Add the geometric mean row after we have formatted the program names
    # as it will otherwise interfere with common prefix/suffix computation.
    if show_diff_column and not absolute_diff:
        # geometric mean only makes sense for relative differences.
        dataout = add_geomean_row(metrics, d, dataout)

    def float_format(x):
        # Leave empty cells empty; print every other float with two decimals.
        if x == "":
            return ""
        return "%6.2f" % (x,)

    # These set_option calls change pandas' global display settings.
    pd.set_option("display.max_colwidth", 0)
    pd.set_option("display.width", 0)
    # Print an empty value instead of NaN (for the geomean row).
    out = dataout.to_string(
        index=False,
        justify="left",
        na_rep="",
        float_format=float_format,
        formatters=formatters,
    )
    print(out)
    # Summary statistics are computed on d, not on the (possibly truncated)
    # dataout.
    print(d.describe())


def _build_argument_parser():
    """Build the argparse parser describing the compare.py command line."""
    parser = argparse.ArgumentParser(prog="compare.py")
    parser.add_argument("-a", "--all", action="store_true")
    parser.add_argument("-f", "--full", action="store_true")
    parser.add_argument("-m", "--metric", action="append", dest="metrics", default=[])
    # --nodiff is registered first so that its default (None, meaning "decide
    # from the number of input files") is the one argparse applies.
    parser.add_argument(
        "--nodiff", action="store_false", dest="show_diff", default=None
    )
    parser.add_argument("--diff", action="store_true", dest="show_diff")
    parser.add_argument(
        "--absolute-diff",
        action="store_true",
        help="Use an absolute instead of a relative difference",
    )
    # const is what argparse stores when the option is given without a value;
    # without it a bare --filter-short would be indistinguishable from the
    # option being absent.
    parser.add_argument(
        "--filter-short",
        nargs="?",
        dest="filter_short",
        const="1.0",
        default=None,
        help="Filter benchmarks with execution times less than N seconds (default 1.0s)",
    )
    parser.add_argument(
        "--no-filter-failed", action="store_false", dest="filter_failed", default=True
    )
    parser.add_argument(
        "--filter-hash", action="store_true", dest="filter_hash", default=False
    )
    parser.add_argument("--filter-blacklist", dest="filter_blacklist", default=None)
    parser.add_argument(
        "--merge-average",
        action="store_const",
        dest="merge_function",
        const=pd.DataFrame.mean,
        default=pd.DataFrame.min,
    )
    parser.add_argument(
        "--merge-min",
        action="store_const",
        dest="merge_function",
        const=pd.DataFrame.min,
    )
    parser.add_argument(
        "--merge-max",
        action="store_const",
        dest="merge_function",
        const=pd.DataFrame.max,
    )
    parser.add_argument(
        "--lhs-name", default="lhs", help="Name used to describe left side in 'vs' mode"
    )
    parser.add_argument(
        "--rhs-name",
        default="rhs",
        help="Name used to describe right side in 'vs' mode",
    )
    parser.add_argument(
        "files",
        metavar="FILE",
        nargs="+",
        help="To compare two groups of results, put 'vs' between them",
    )
    parser.add_argument(
        "--minimal-names", action="store_true", dest="minimal_names", default=False
    )
    parser.add_argument(
        "--no-abs-sort",
        action="store_true",
        dest="no_abs_sort",
        default=False,
        help="Don't use abs() when sorting results",
    )
    return parser


def main():
    """Command-line entry point: read, filter, diff and print the results.

    Parses the command line, loads the result files (optionally as two groups
    separated by "vs", each merged per program), selects the metrics, applies
    the requested filters, adds a diff column per metric and prints the table.
    Exits with status 1 when no usable metric is found.
    """
    config = _build_argument_parser().parse_args()

    # If only --filter-short is provided, i.e. its optional argument is
    # omitted, we default to threshold of 1 second to filter out apps and
    # results with a execution time less than that.
    filter_short_threshold = 1.0

    # If the optional argument to --filter-short is omitted, we need to take
    # care of this case and command line:
    #     --filter-short FILE [FILE ...]
    # I.e., we need to recognise that FILE is not the optional argument to
    # --filter-short. The way we do this, is to try converting the option value
    # to a float, and if that fails, we insert it back into the files list (in
    # the first position).
    if config.filter_short is not None:
        try:
            filter_short_threshold = float(config.filter_short)
        except ValueError:
            config.files.insert(0, config.filter_short)

    # Decide the default only now, after a misparsed FILE has been put back
    # into the list, so that the file count is accurate.
    if config.show_diff is None:
        config.show_diff = len(config.files) > 1

    # Read inputs
    files = config.files
    if "vs" in files:
        split = files.index("vs")
        lhs = files[0:split]
        rhs = files[split + 1 :]

        # Merge the runs of each side per program with the chosen function.
        lhs_d = readmulti(lhs)
        lhs_merged = lhs_d.groupby(level=1).apply(config.merge_function)
        rhs_d = readmulti(rhs)
        rhs_merged = rhs_d.groupby(level=1).apply(config.merge_function)

        # Combine to new dataframe
        data = pd.concat(
            [lhs_merged, rhs_merged],
            names=["l/r"],
            keys=[config.lhs_name, config.rhs_name],
        )
    else:
        data = readmulti(files)

    # Decide which metric to display / what is our "main" metric
    metrics = config.metrics
    if not metrics:
        defaults = ["Exec_Time", "exec_time", "Value", "Runtime"]
        for defkey in defaults:
            if defkey in data.columns:
                metrics = [defkey]
                break
    if not metrics:
        sys.stderr.write("No default metric found and none specified\n")
        sys.stderr.write("Available metrics:\n")
        for column in data.columns:
            sys.stderr.write("\t%s\n" % column)
        sys.exit(1)
    # Report every unknown metric before giving up, not just the first one.
    unknown_metrics = [metric for metric in metrics if metric not in data.columns]
    for metric in unknown_metrics:
        sys.stderr.write("Unknown metric '%s'\n" % metric)
    if unknown_metrics:
        sys.exit(1)

    # Filter data
    proggroup = data.groupby(level=1)
    initial_size = len(proggroup.indices)
    print("Tests: %s" % (initial_size,))
    if config.filter_failed and "Exec" in data.columns:
        newdata = filter_failed(data)
        print_filter_stats("Failed", data, newdata)
        # The axis must be given by keyword; pandas 2 dropped the positional
        # form.
        data = newdata.drop("Exec", axis=1)
    if config.filter_short is not None:
        # The threshold is applied to the last requested metric.
        newdata = filter_short(data, filter_short_threshold, metrics[-1])
        print_filter_stats("Short Running", data, newdata)
        data = newdata
    if (
        config.filter_hash
        and "hash" in data.columns
        and data.index.get_level_values(0).nunique() > 1
    ):
        newdata = filter_same_hash(data)
        print_filter_stats("Same hash", data, newdata)
        data = newdata
    if config.filter_blacklist:
        with open(config.filter_blacklist) as blacklist_file:
            blacklist = [line.strip() for line in blacklist_file]
        newdata = filter_blacklist(data, blacklist)
        print_filter_stats("In Blacklist", data, newdata)
        data = newdata
    final_size = len(data.groupby(level=1))
    if final_size != initial_size:
        print("Remaining: %d" % (final_size,))

    # Reduce / add columns
    print("Metric: %s" % (",".join(metrics),))
    data = data[metrics]

    # Move the outer index level (run, or lhs/rhs) into the columns so each
    # program is a single row.
    data = data.unstack(level=0)

    for metric in data.columns.levels[0]:
        data = add_diff_column(metric, data, absolute_diff=config.absolute_diff)

    sortkey = "diff"
    # TODO: should we still be sorting by diff even if the diff is hidden?
    if len(config.files) == 1:
        sortkey = data.columns.levels[1][0]

    # Print data
    print("")
    shorten_names = not config.full
    limit_output = (not config.all) and (not config.full)
    print_result(
        data,
        limit_output=limit_output,
        shorten_names=shorten_names,
        minimal_names=config.minimal_names,
        show_diff_column=config.show_diff,
        sortkey=sortkey,
        # --no-abs-sort turns the absolute-value sort off.
        sort_by_abs=not config.no_abs_sort,
        absolute_diff=config.absolute_diff,
    )


# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()