From ee19320ee8dd18f825399dc44a3337080411569c Mon Sep 17 00:00:00 2001
From: Alexey Shekhirin
Date: Wed, 18 Feb 2026 15:33:31 +0000
Subject: [PATCH] ci(bench): use ABBA run order to reduce variance (#22321)

---
Review notes (commentary only; everything between the "---" line and the
first "diff --git" is ignored when the patch is applied):

* bench-reth-charts.py
  - Adds merge_csvs(paths): rows returned by parse_combined_csv() for every
    path are grouped by row["block_number"] and emitted in ascending block
    order. A block seen once is passed through unchanged. A block seen more
    than once is replaced by a new dict holding only block_number, the
    int()-truncated mean of gas_used and new_payload_latency_us, and the
    int()-truncated mean of persistence_wait_us / execution_cache_wait_us /
    sparse_trie_wait_us taken over the non-None values (None when every
    value is None). Averaged rows therefore carry only these six keys,
    while pass-through rows keep whatever parse_combined_csv() returned.
  - CLI change: the positional "combined_csv" argument is replaced by a
    required "--feature" option taking one or more paths, and "--baseline"
    now takes one or more paths as well. Callers using the old positional
    form must be updated; the workflow step below is updated in this patch.

* bench-reth-summary.py
  - compute_paired_stats() records len(pairs) for each baseline/feature run
    pair and returns the maximum as "blocks". The early "return {}" still
    happens before that key is built, so max() is never called on an empty
    list.
  - generate_comparison_table() now reads paired["blocks"] instead of
    paired["n"] for its local n.

* bench.yml
  - The baseline ref is captured once in BASELINE_REF and passed to
    "git fetch origin" explicitly, then reused for the checkout.
  - The chart step passes two CSVs per side
    (/tmp/bench-results-{feature,baseline}-{1,2}/combined_latency.csv).

* NOTE(review): the diffstat is fully accounted for by the hunks below, and
  none of them changes the order in which benchmark runs execute or creates
  the "-1"/"-2" result directories -- presumably that is done by a step or
  script outside this patch; confirm before relying on it.
* NOTE(review): this copy of the patch had lost its line structure. Line
  breaks and leading whitespace (especially the YAML context lines) were
  reconstructed from the hunk line counts and conventional formatting;
  verify with "git apply --check" before applying.

 .github/scripts/bench-reth-charts.py  | 38 ++++++++++++++++++++++-----
 .github/scripts/bench-reth-summary.py |  5 +++-
 .github/workflows/bench.yml           | 10 ++++---
 3 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/.github/scripts/bench-reth-charts.py b/.github/scripts/bench-reth-charts.py
index b0c114a470..86d457cf4d 100644
--- a/.github/scripts/bench-reth-charts.py
+++ b/.github/scripts/bench-reth-charts.py
@@ -188,30 +188,56 @@ def plot_gas_vs_latency(
     plt.close(fig)
 
 
+def merge_csvs(paths: list[str]) -> list[dict]:
+    """Parse and merge multiple CSVs, averaging values for duplicate blocks."""
+    by_block: dict[int, list[dict]] = {}
+    for path in paths:
+        for row in parse_combined_csv(path):
+            by_block.setdefault(row["block_number"], []).append(row)
+
+    merged = []
+    for bn in sorted(by_block):
+        rows = by_block[bn]
+        if len(rows) == 1:
+            merged.append(rows[0])
+        else:
+            avg = {"block_number": bn}
+            for key in ("gas_used", "new_payload_latency_us"):
+                avg[key] = int(sum(r[key] for r in rows) / len(rows))
+            for key in ("persistence_wait_us", "execution_cache_wait_us", "sparse_trie_wait_us"):
+                vals = [r[key] for r in rows if r[key] is not None]
+                avg[key] = int(sum(vals) / len(vals)) if vals else None
+            merged.append(avg)
+    return merged
+
+
 def main():
     parser = argparse.ArgumentParser(description="Generate benchmark charts")
-    parser.add_argument("combined_csv", help="Path to combined_latency.csv (feature)")
+    parser.add_argument(
+        "--feature", nargs="+", required=True,
+        help="Path(s) to feature combined_latency.csv",
+    )
     parser.add_argument(
         "--output-dir", required=True, help="Output directory for PNG charts"
     )
     parser.add_argument(
-        "--baseline", help="Path to baseline combined_latency.csv"
+        "--baseline", nargs="+", help="Path(s) to baseline combined_latency.csv"
     )
     parser.add_argument("--baseline-name", default="baseline", help="Label for baseline")
     parser.add_argument("--feature-name", "--branch-name", default="feature", help="Label for feature")
     args = parser.parse_args()
 
-    feature = parse_combined_csv(args.combined_csv)
+    feature = merge_csvs(args.feature)
     if not feature:
-        print("No results found in combined CSV", file=sys.stderr)
+        print("No results found in feature CSV(s)", file=sys.stderr)
         sys.exit(1)
 
     baseline = None
     if args.baseline:
-        baseline = parse_combined_csv(args.baseline)
+        baseline = merge_csvs(args.baseline)
         if not baseline:
             print(
-                "Warning: no results in baseline CSV, skipping comparison",
+                "Warning: no results in baseline CSV(s), skipping comparison",
                 file=sys.stderr,
             )
             baseline = None
diff --git a/.github/scripts/bench-reth-summary.py b/.github/scripts/bench-reth-summary.py
index 4d7621d6f2..50b053f527 100755
--- a/.github/scripts/bench-reth-summary.py
+++ b/.github/scripts/bench-reth-summary.py
@@ -183,11 +183,13 @@ def compute_paired_stats(
     all_pairs = []
     all_lat_diffs = []
     all_mgas_diffs = []
+    blocks_per_pair = []
     for baseline, feature in zip(baseline_runs, feature_runs):
         pairs, lat_diffs, mgas_diffs = _paired_data(baseline, feature)
         all_pairs.extend(pairs)
         all_lat_diffs.extend(lat_diffs)
         all_mgas_diffs.extend(mgas_diffs)
+        blocks_per_pair.append(len(pairs))
 
     if not all_lat_diffs:
         return {}
@@ -237,6 +239,7 @@ def compute_paired_stats(
         "p99_ci_ms": (p99_boot[hi] - p99_boot[lo]) / 2,
         "mean_mgas_diff": mean_mgas_diff,
         "mgas_ci": mgas_ci,
+        "blocks": max(blocks_per_pair),
     }
 
 
@@ -298,7 +301,7 @@ def generate_comparison_table(
     feature_sha: str,
 ) -> str:
     """Generate a markdown comparison table between baseline and feature."""
-    n = paired["n"]
+    n = paired["blocks"]
 
     def pct(base: float, feat: float) -> float:
         return (feat - base) / base * 100.0 if base > 0 else 0.0
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index cdd7ec2f01..4e7b5286c7 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -476,12 +476,13 @@ jobs:
 
       - name: Prepare source dirs
         run: |
+          BASELINE_REF="${{ steps.refs.outputs.baseline-ref }}"
           if [ -d ../reth-baseline ]; then
-            git -C ../reth-baseline fetch origin
+            git -C ../reth-baseline fetch origin "$BASELINE_REF"
           else
             git clone . ../reth-baseline
           fi
-          git -C ../reth-baseline checkout "${{ steps.refs.outputs.baseline-ref }}"
+          git -C ../reth-baseline checkout "$BASELINE_REF"
           ln -sfn "$(pwd)" ../reth-feature
 
       - name: Build baseline and feature binaries in parallel
@@ -611,8 +612,9 @@ jobs:
           BASELINE_NAME: ${{ steps.refs.outputs.baseline-name }}
           FEATURE_NAME: ${{ steps.refs.outputs.feature-name }}
         run: |
-          CHART_ARGS="/tmp/bench-results-feature/combined_latency.csv --output-dir /tmp/bench-charts"
-          CHART_ARGS="$CHART_ARGS --baseline /tmp/bench-results-baseline/combined_latency.csv"
+          CHART_ARGS="--output-dir /tmp/bench-charts"
+          CHART_ARGS="$CHART_ARGS --feature /tmp/bench-results-feature-1/combined_latency.csv /tmp/bench-results-feature-2/combined_latency.csv"
+          CHART_ARGS="$CHART_ARGS --baseline /tmp/bench-results-baseline-1/combined_latency.csv /tmp/bench-results-baseline-2/combined_latency.csv"
           CHART_ARGS="$CHART_ARGS --baseline-name ${BASELINE_NAME}"
           CHART_ARGS="$CHART_ARGS --feature-name ${FEATURE_NAME}"
           # shellcheck disable=SC2086