tfhe-rs/ci/benchmark_parser.py

"""
benchmark_parser
----------------

Parse criterion benchmark or keys size results.
"""
import argparse
import csv
import pathlib
import json
import sys


ONE_HOUR_IN_NANOSECONDS = 3600E9

parser = argparse.ArgumentParser()
parser.add_argument('results',
                    help='Location of criterion benchmark results directory.'
                         'If the --key-size option is used, then the value would have to point to'
                         'a CSV file.')
parser.add_argument('output_file', help='File storing parsed results')
parser.add_argument('-d', '--database', dest='database',
                    help='Name of the database used to store results')
parser.add_argument('-w', '--hardware', dest='hardware',
                    help='Hardware reference used to perform benchmark')
parser.add_argument('-V', '--project-version', dest='project_version',
                    help='Commit hash reference')
parser.add_argument('-b', '--branch', dest='branch',
                    help='Git branch name on which benchmark was performed')
parser.add_argument('--commit-date', dest='commit_date',
                    help='Timestamp of commit hash used in project_version')
parser.add_argument('--bench-date', dest='bench_date',
                    help='Timestamp when benchmark was run')
parser.add_argument('--name-suffix', dest='name_suffix', default='',
                    help='Suffix to append to each of the result test names')
parser.add_argument('--append-results', dest='append_results', action='store_true',
                    help='Append parsed results to an existing file')
parser.add_argument('--walk-subdirs', dest='walk_subdirs', action='store_true',
                    help='Check for results in subdirectories')
parser.add_argument('--key-sizes', dest='key_sizes', action='store_true',
                    help='Parse only the results regarding keys size measurements')
parser.add_argument('--throughput', dest='throughput', action='store_true',
                    help='Compute and append number of operations per millisecond and'
                         'operations per dollar')


def recursive_parse(directory, walk_subdirs=False, name_suffix="", compute_throughput=False,
                    hardware_hourly_cost=None):
    """
    Parse all the benchmark results in a directory. It will attempt to parse all the files having a
    .json extension at the top-level of this directory.

    :param directory: path to directory that contains raw results as :class:`pathlib.Path`
    :param walk_subdirs: traverse results subdirectories if parameters changes for benchmark case.
    :param name_suffix: a :class:`str` suffix to apply to each test name found
    :param compute_throughput: compute number of operations per millisecond and operations per
        dollar
    :param hardware_hourly_cost: hourly cost of the hardware used in dollar

    :return: :class:`list` of data points
    """
    excluded_directories = ["child_generate", "fork", "parent_generate", "report"]
    result_values = []
    for dire in directory.iterdir():
        if dire.name in excluded_directories or not dire.is_dir():
            continue
        for subdir in dire.iterdir():
            if walk_subdirs:
                subdir = subdir.joinpath("new")
                if not subdir.exists():
                    continue
            elif subdir.name != "new":
                continue

            test_name = parse_benchmark_file(subdir)
            for stat_name, value in parse_estimate_file(subdir).items():
                test_name_parts = list(filter(None, [test_name, stat_name, name_suffix]))
                result_values.append({"value": value, "test": "_".join(test_name_parts)})

                if stat_name == "mean" and compute_throughput:
                    test_name_parts.append("ops-per-ms")
                    result_values.append({"value": compute_ops_per_millisecond(value),
                                          "test": "_".join(test_name_parts)})
                    test_name_parts.pop()

                    if hardware_hourly_cost is not None:
                        test_name_parts.append("ops-per-dollar")
                        result_values.append({
                            "value": compute_ops_per_dollar(value, hardware_hourly_cost),
                            "test": "_".join(test_name_parts)})

    return result_values


def parse_benchmark_file(directory):
    """
    Parse file containing details of the parameters used for a benchmark.

    :param directory: directory where a benchmark case results are located as :class:`pathlib.Path`

    :return: name of the test as :class:`str`
    """
    raw_res = _parse_file_to_json(directory, "benchmark.json")
    return raw_res["full_id"].replace(" ", "_")


def parse_estimate_file(directory):
    """
    Parse file containing timing results for a benchmark.

    :param directory: directory where a benchmark case results are located as :class:`pathlib.Path`

    :return: :class:`dict` of data points
    """
    raw_res = _parse_file_to_json(directory, "estimates.json")
    return {
        stat_name: raw_res[stat_name]["point_estimate"]
        for stat_name in ("mean", "std_dev")
    }


def parse_key_sizes(result_file):
    """
    Parse file containing key sizes results. The file must be formatted as CSV.

    :param result_file: results file as :class:`pathlib.Path`

    :return: :class:`list` of data points
    """
    result_values = []
    with result_file.open() as csv_file:
        reader = csv.reader(csv_file)
        for (test_name, value) in reader:
            result_values.append({"value": int(value), "test": test_name})

    return result_values


def compute_ops_per_dollar(data_point, product_hourly_cost):
    """
    Compute numbers of operations per dollar for a given ``data_point``.

    :param data_point: timing value measured during benchmark in nanoseconds
    :param product_hourly_cost: cost in dollar per hour of hardware used

    :return: number of operations per dollar
    """
    return ONE_HOUR_IN_NANOSECONDS / (product_hourly_cost * data_point)


def compute_ops_per_millisecond(data_point):
    """
    Compute numbers of operations per millisecond for a given ``data_point``.

    :param data_point: timing value measured during benchmark in nanoseconds

    :return: number of operations per millisecond
    """
    return 1E6 / data_point


def _parse_file_to_json(directory, filename):
    result_file = directory.joinpath(filename)
    return json.loads(result_file.read_text())


def dump_results(parsed_results, filename, input_args):
    """
    Dump parsed results formatted as JSON to file.

    :param parsed_results: :class:`list` of data points
    :param filename: filename for dump file as :class:`pathlib.Path`
    :param input_args: CLI input arguments
    """
    if input_args.append_results:
        parsed_content = json.loads(filename.read_text())
        parsed_content["points"].extend(parsed_results)
        filename.write_text(json.dumps(parsed_content))
    else:
        filename.parent.mkdir(parents=True, exist_ok=True)
        series = {
            "database": input_args.database,
            "hardware": input_args.hardware,
            "project_version": input_args.project_version,
            "branch": input_args.branch,
            "insert_date": input_args.bench_date,
            "commit_date": input_args.commit_date,
            "points": parsed_results,
        }
        filename.write_text(json.dumps(series))


def check_mandatory_args(input_args):
    """
    Check for availability of required input arguments, the program will exit if one of them is
    not present. If `append_results` flag is set, all the required arguments will be ignored.

    :param input_args: CLI input arguments
    """
    if input_args.append_results:
        return

    missing_args = []
    for arg_name in vars(input_args):
        if arg_name in ["results_dir", "output_file", "name_suffix",
                        "append_results", "walk_subdirs", "key_sizes",
                        "throughput"]:
            continue
        if not getattr(input_args, arg_name):
            missing_args.append(arg_name)

    if missing_args:
        for arg_name in missing_args:
            print(f"Missing required argument: --{arg_name.replace('_', '-')}")
        sys.exit(1)


if __name__ == "__main__":
    args = parser.parse_args()
    check_mandatory_args(args)

    raw_results = pathlib.Path(args.results)
    if not args.key_sizes:
        print("Parsing benchmark results... ")
        hardware_cost = None
        if args.throughput:
            print("Throughput computation enabled")
            ec2_costs = json.loads(
                pathlib.Path("ci/ec2_products_cost.json").read_text(encoding="utf-8"))
            try:
                hardware_cost = abs(ec2_costs[args.hardware])
                print(f"Hardware hourly cost: {hardware_cost} $/h")
            except KeyError:
                print(f"Cannot find hardware hourly cost for '{args.hardware}'")
                sys.exit(1)

        results = recursive_parse(raw_results, args.walk_subdirs, args.name_suffix, args.throughput,
                                  hardware_cost)
    else:
        print("Parsing key sizes results... ")
        results = parse_key_sizes(raw_results)
    print("Parsing results done")

    output_file = pathlib.Path(args.output_file)
    print(f"Dump parsed results into '{output_file.resolve()}' ... ", end="")
    dump_results(results, output_file, args)

    print("Done")