ci(bench): rename main/branch to baseline/feature, add ref args (#22284)

Co-authored-by: Georgios Konstantopoulos <me@gakonst.com>
Co-authored-by: Amp <amp@ampcode.com>
Alexey Shekhirin
2026-02-17 23:00:01 +00:00
committed by GitHub
parent aeb2c6e731
commit b49cadb346
4 changed files with 171 additions and 114 deletions

View File

@@ -2,15 +2,15 @@
#
# Builds (or fetches from cache) reth binaries for benchmarking.
#
# Usage: bench-reth-build.sh <main|branch> <commit> [branch-sha]
# Usage: bench-reth-build.sh <baseline|feature> <commit> [branch-sha]
#
# main — build/fetch the baseline binary at <commit> (merge-base)
# branch — build/fetch the candidate binary + reth-bench at <commit>
# optional branch-sha is the PR head commit for cache key
# baseline — build/fetch the baseline binary at <commit> (merge-base)
# feature — build/fetch the candidate binary + reth-bench at <commit>
# optional branch-sha is the PR head commit for cache key
#
# Outputs:
# main: target/profiling-baseline/reth
# branch: target/profiling/reth, reth-bench installed to cargo bin
# baseline: target/profiling-baseline/reth
# feature: target/profiling/reth, reth-bench installed to cargo bin
#
# Required: mc (MinIO client) configured at /home/ubuntu/.mc
set -euo pipefail
@@ -20,16 +20,16 @@ MODE="$1"
COMMIT="$2"
case "$MODE" in
main)
baseline|main)
BUCKET="minio/reth-binaries/${COMMIT}"
mkdir -p target/profiling-baseline
if $MC stat "${BUCKET}/reth" &>/dev/null; then
echo "Cache hit for main (${COMMIT}), downloading binary..."
echo "Cache hit for baseline (${COMMIT}), downloading binary..."
$MC cp "${BUCKET}/reth" target/profiling-baseline/reth
chmod +x target/profiling-baseline/reth
else
echo "Cache miss for main (${COMMIT}), building from source..."
echo "Cache miss for baseline (${COMMIT}), building from source..."
CURRENT_REF=$(git rev-parse HEAD)
git checkout "${COMMIT}"
cargo build --profile profiling --bin reth
@@ -39,7 +39,7 @@ case "$MODE" in
fi
;;
branch)
feature|branch)
BRANCH_SHA="${3:-$COMMIT}"
BUCKET="minio/reth-binaries/${BRANCH_SHA}"
@@ -60,7 +60,7 @@ case "$MODE" in
;;
*)
echo "Usage: $0 <main|branch> <commit> [branch-sha]"
echo "Usage: $0 <baseline|feature> <commit> [branch-sha]"
exit 1
;;
esac
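
For context (not part of the diff): the two modes differ only in which SHA keys the MinIO cache and where the binary lands. A minimal Python sketch of that mapping, with a hypothetical function name, assuming only the paths and cache keys stated in the script header:

    # Illustrative only: restates the mode -> cache-key / output mapping above.
    def bench_build_layout(mode: str, commit: str, branch_sha: str | None = None) -> dict:
        if mode in ("baseline", "main"):
            # Baseline binary is keyed by the merge-base commit.
            return {"bucket": f"minio/reth-binaries/{commit}",
                    "output": "target/profiling-baseline/reth"}
        if mode in ("feature", "branch"):
            # Feature binary is keyed by the PR head SHA when given, else the commit.
            key = branch_sha or commit
            return {"bucket": f"minio/reth-binaries/{key}",
                    "output": "target/profiling/reth"}
        raise ValueError(f"unknown mode: {mode}")
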

View File

@@ -53,7 +53,7 @@ def parse_combined_csv(path: str) -> list[dict]:
def plot_latency_and_throughput(
feature: list[dict], baseline: list[dict] | None, out: Path,
baseline_name: str = "main", branch_name: str = "branch",
baseline_name: str = "baseline", feature_name: str = "feature",
):
num_plots = 3 if baseline else 2
fig, axes = plt.subplots(num_plots, 1, figsize=(12, 4 * num_plots), sharex=True)
@@ -76,14 +76,14 @@ def plot_latency_and_throughput(
ax1.plot(base_x, base_lat, linewidth=0.8, label=baseline_name, alpha=0.7)
ax2.plot(base_x, base_ggas, linewidth=0.8, label=baseline_name, alpha=0.7)
ax1.plot(feat_x, feat_lat, linewidth=0.8, label=branch_name)
ax1.plot(feat_x, feat_lat, linewidth=0.8, label=feature_name)
ax1.set_ylabel("Latency (ms)")
ax1.set_title("newPayload Latency per Block")
ax1.grid(True, alpha=0.3)
if baseline:
ax1.legend()
ax2.plot(feat_x, feat_ggas, linewidth=0.8, label=branch_name)
ax2.plot(feat_x, feat_ggas, linewidth=0.8, label=feature_name)
ax2.set_ylabel("Ggas/s")
ax2.set_title("Execution Throughput per Block")
ax2.grid(True, alpha=0.3)
@@ -105,7 +105,7 @@ def plot_latency_and_throughput(
ax3.bar(blocks, diffs, width=1.0, color=colors, alpha=0.7, edgecolor="none")
ax3.axhline(0, color="black", linewidth=0.5)
ax3.set_ylabel("Δ Latency (%)")
ax3.set_title("Per-Block newPayload Latency Change (branch vs main)")
ax3.set_title("Per-Block newPayload Latency Change (feature vs baseline)")
ax3.grid(True, alpha=0.3, axis="y")
axes[-1].set_xlabel("Block Number")
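
The third panel plots a per-block relative latency change. Its inputs (`diffs`, `colors`) are computed outside this hunk; a hedged sketch of how such a series could be derived from the two runs, with the formula and colour choice as assumptions:

    def latency_change_series(baseline: list[dict], feature: list[dict]):
        """Per-block Δ latency (%) of feature vs baseline; positive = slower."""
        base_by_block = {r["block_number"]: r for r in baseline}
        blocks, diffs, colors = [], [], []
        for r in feature:
            b = base_by_block.get(r["block_number"])
            if not b or not b["new_payload_latency_us"]:
                continue
            pct = (r["new_payload_latency_us"] - b["new_payload_latency_us"]) \
                / b["new_payload_latency_us"] * 100.0
            blocks.append(r["block_number"])
            diffs.append(pct)
            colors.append("tab:red" if pct > 0 else "tab:green")  # regressions in red
        return blocks, diffs, colors
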
@@ -116,7 +116,7 @@ def plot_latency_and_throughput(
def plot_wait_breakdown(
feature: list[dict], baseline: list[dict] | None, out: Path,
baseline_name: str = "main", branch_name: str = "branch",
baseline_name: str = "baseline", feature_name: str = "feature",
):
series = [
("Persistence Wait", "persistence_wait_us"),
@@ -135,7 +135,7 @@ def plot_wait_breakdown(
fx = [r["block_number"] for r in feature if r[key] is not None]
fy = [r[key] / 1_000 for r in feature if r[key] is not None]
if fx:
ax.plot(fx, fy, linewidth=0.8, label=branch_name)
ax.plot(fx, fy, linewidth=0.8, label=feature_name)
ax.set_ylabel("ms")
ax.set_title(label)
@@ -163,7 +163,7 @@ def _add_regression(ax, x, y, color, label):
def plot_gas_vs_latency(
feature: list[dict], baseline: list[dict] | None, out: Path,
baseline_name: str = "main", branch_name: str = "branch",
baseline_name: str = "baseline", feature_name: str = "feature",
):
fig, ax = plt.subplots(figsize=(8, 6))
@@ -176,7 +176,7 @@ def plot_gas_vs_latency(
fgas = [r["gas_used"] / 1_000_000 for r in feature]
flat = [r["new_payload_latency_us"] / 1_000 for r in feature]
ax.scatter(fgas, flat, s=8, alpha=0.6)
_add_regression(ax, fgas, flat, "tab:orange", branch_name)
_add_regression(ax, fgas, flat, "tab:orange", feature_name)
ax.set_xlabel("Gas Used (Mgas)")
ax.set_ylabel("newPayload Latency (ms)")
@@ -195,10 +195,10 @@ def main():
"--output-dir", required=True, help="Output directory for PNG charts"
)
parser.add_argument(
"--baseline", help="Path to baseline (main) combined_latency.csv"
"--baseline", help="Path to baseline combined_latency.csv"
)
parser.add_argument("--baseline-name", default="main", help="Label for baseline")
parser.add_argument("--branch-name", default="branch", help="Label for branch")
parser.add_argument("--baseline-name", default="baseline", help="Label for baseline")
parser.add_argument("--feature-name", "--branch-name", default="feature", help="Label for feature")
args = parser.parse_args()
feature = parse_combined_csv(args.combined_csv)
@@ -220,7 +220,7 @@ def main():
out_dir.mkdir(parents=True, exist_ok=True)
bname = args.baseline_name
fname = args.branch_name
fname = args.feature_name
plot_latency_and_throughput(feature, baseline, out_dir / "latency_throughput.png", bname, fname)
plot_wait_breakdown(feature, baseline, out_dir / "wait_breakdown.png", bname, fname)
plot_gas_vs_latency(feature, baseline, out_dir / "gas_vs_latency.png", bname, fname)
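
The renamed flag keeps the old spelling as an alias: argparse derives the destination from the first long option string, so `--feature-name` and `--branch-name` both populate `args.feature_name`. A self-contained check of that behaviour:

    import argparse

    p = argparse.ArgumentParser()
    p.add_argument("--feature-name", "--branch-name", default="feature")
    print(p.parse_args(["--branch-name", "my-pr"]).feature_name)  # my-pr
    print(p.parse_args([]).feature_name)                          # feature

This is what lets existing workflow invocations that still pass `--branch-name` keep working after the rename.
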

View File

@@ -8,12 +8,12 @@ Usage:
--baseline-csv <baseline_combined.csv> \
[--repo <owner/repo>] \
[--baseline-ref <sha>] \
[--branch-name <name>] \
[--branch-sha <sha>]
[--feature-name <name>] \
[--feature-sha <sha>]
Generates a paired statistical comparison between baseline (main) and branch.
Generates a paired statistical comparison between baseline and feature.
Matches blocks by number and computes per-block diffs to cancel out gas
variance. Fails if baseline or branch CSV is missing or empty.
variance. Fails if baseline or feature CSV is missing or empty.
"""
import argparse
@@ -113,25 +113,25 @@ def compute_stats(combined: list[dict]) -> dict:
def _paired_data(
baseline: list[dict], branch: list[dict]
baseline: list[dict], feature: list[dict]
) -> tuple[list[tuple[float, float]], list[float], list[float]]:
"""Match blocks and return paired latencies and per-block diffs.
Returns:
pairs: list of (baseline_ms, branch_ms) tuples
lat_diffs_ms: list of branch − baseline latency diffs in ms
mgas_diffs: list of branch − baseline Mgas/s diffs
pairs: list of (baseline_ms, feature_ms) tuples
lat_diffs_ms: list of feature − baseline latency diffs in ms
mgas_diffs: list of feature − baseline Mgas/s diffs
"""
baseline_by_block = {r["block_number"]: r for r in baseline}
branch_by_block = {r["block_number"]: r for r in branch}
common_blocks = sorted(set(baseline_by_block) & set(branch_by_block))
feature_by_block = {r["block_number"]: r for r in feature}
common_blocks = sorted(set(baseline_by_block) & set(feature_by_block))
pairs = []
lat_diffs_ms = []
mgas_diffs = []
for bn in common_blocks:
b = baseline_by_block[bn]
f = branch_by_block[bn]
f = feature_by_block[bn]
b_ms = b["new_payload_latency_us"] / 1_000
f_ms = f["new_payload_latency_us"] / 1_000
pairs.append((b_ms, f_ms))
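
A tiny worked example (made-up rows, assuming the feature − baseline convention stated in the docstring) of what the pairing yields:

    baseline = [
        {"block_number": 100, "new_payload_latency_us": 12_000, "gas_used": 15_000_000},
        {"block_number": 101, "new_payload_latency_us": 30_000, "gas_used": 28_000_000},
    ]
    feature = [
        {"block_number": 101, "new_payload_latency_us": 27_000, "gas_used": 28_000_000},
        {"block_number": 102, "new_payload_latency_us":  9_000, "gas_used": 10_000_000},
    ]
    # Only block 101 is common to both runs, so one pair is produced:
    #   pairs        -> [(30.0, 27.0)]   (baseline_ms, feature_ms)
    #   lat_diffs_ms -> [-3.0]           (feature is 3 ms faster on that block)
    # mgas_diffs gets the matching Mgas/s difference (its computation is elided here).

Diffing the same block across both runs is what cancels per-block gas variance out of the comparison.
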
@@ -148,18 +148,18 @@ def _paired_data(
def compute_paired_stats(
baseline_runs: list[list[dict]],
branch_runs: list[list[dict]],
feature_runs: list[list[dict]],
) -> dict:
"""Compute paired statistics between baseline and branch runs.
"""Compute paired statistics between baseline and feature runs.
Each pair (baseline_runs[i], branch_runs[i]) produces per-block diffs.
Each pair (baseline_runs[i], feature_runs[i]) produces per-block diffs.
All diffs are pooled for the final CI.
"""
all_pairs = []
all_lat_diffs = []
all_mgas_diffs = []
for baseline, branch in zip(baseline_runs, branch_runs):
pairs, lat_diffs, mgas_diffs = _paired_data(baseline, branch)
for baseline, feature in zip(baseline_runs, feature_runs):
pairs, lat_diffs, mgas_diffs = _paired_data(baseline, feature)
all_pairs.extend(pairs)
all_lat_diffs.extend(lat_diffs)
all_mgas_diffs.extend(mgas_diffs)
@@ -175,10 +175,10 @@ def compute_paired_stats(
# Bootstrap CI on difference-of-percentiles (resample paired blocks)
base_lats = sorted([p[0] for p in all_pairs])
branch_lats = sorted([p[1] for p in all_pairs])
p50_diff = percentile(branch_lats, 50) - percentile(base_lats, 50)
p90_diff = percentile(branch_lats, 90) - percentile(base_lats, 90)
p99_diff = percentile(branch_lats, 99) - percentile(base_lats, 99)
feature_lats = sorted([p[1] for p in all_pairs])
p50_diff = percentile(feature_lats, 50) - percentile(base_lats, 50)
p90_diff = percentile(feature_lats, 90) - percentile(base_lats, 90)
p99_diff = percentile(feature_lats, 99) - percentile(base_lats, 99)
rng = random.Random(42)
p50_boot, p90_boot, p99_boot = [], [], []
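
The resampling loop that fills these lists is outside this hunk. A hedged, self-contained sketch of one way a bootstrap CI on the difference-of-percentiles can be formed (iteration count and wiring are assumptions; `percentile` stands in for the module's own helper):

    import random

    def bootstrap_percentile_diffs(all_pairs, percentile, n_boot=1_000, seed=42):
        """Resample paired blocks with replacement and collect
        feature-minus-baseline percentile differences."""
        rng = random.Random(seed)
        p50_boot, p90_boot, p99_boot = [], [], []
        for _ in range(n_boot):
            sample = [all_pairs[rng.randrange(len(all_pairs))] for _ in range(len(all_pairs))]
            b = sorted(p[0] for p in sample)
            f = sorted(p[1] for p in sample)
            p50_boot.append(percentile(f, 50) - percentile(b, 50))
            p90_boot.append(percentile(f, 90) - percentile(b, 90))
            p99_boot.append(percentile(f, 99) - percentile(b, 99))
        return p50_boot, p90_boot, p99_boot

A 95% interval then comes from the 2.5th and 97.5th percentiles of each bootstrap list.
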
@@ -268,10 +268,11 @@ def generate_comparison_table(
paired: dict,
repo: str,
baseline_ref: str,
branch_name: str,
branch_sha: str,
baseline_name: str,
feature_name: str,
feature_sha: str,
) -> str:
"""Generate a markdown comparison table between baseline (main) and branch."""
"""Generate a markdown comparison table between baseline and feature."""
n = paired["n"]
def pct(base: float, feat: float) -> float:
@@ -294,11 +295,11 @@ def generate_comparison_table(
mgas_ci_pct = paired["mgas_ci"] / run1["mean_mgas_s"] * 100.0 if run1["mean_mgas_s"] > 0 else 0.0
base_url = f"https://github.com/{repo}/commit"
baseline_label = f"[`main`]({base_url}/{baseline_ref})"
branch_label = f"[`{branch_name}`]({base_url}/{branch_sha})"
baseline_label = f"[`{baseline_name}`]({base_url}/{baseline_ref})"
feature_label = f"[`{feature_name}`]({base_url}/{feature_sha})"
lines = [
f"| Metric | {baseline_label} | {branch_label} | Change |",
f"| Metric | {baseline_label} | {feature_label} | Change |",
"|--------|------|--------|--------|",
f"| Mean | {fmt_ms(run1['mean_ms'])} | {fmt_ms(run2['mean_ms'])} | {change_str(mean_pct, lat_ci_pct, lower_is_better=True)} |",
f"| StdDev | {fmt_ms(run1['stddev_ms'])} | {fmt_ms(run2['stddev_ms'])} | |",
@@ -314,15 +315,15 @@ def generate_comparison_table(
def generate_markdown(
summary: dict, comparison_table: str,
behind_main: int = 0, repo: str = "", baseline_ref: str = "",
behind_baseline: int = 0, repo: str = "", baseline_ref: str = "", baseline_name: str = "",
) -> str:
"""Generate a markdown comment body."""
lines = ["## Benchmark Results", "", comparison_table]
if behind_main > 0:
s = "s" if behind_main > 1 else ""
diff_link = f"https://github.com/{repo}/compare/{baseline_ref[:12]}...main"
if behind_baseline > 0:
s = "s" if behind_baseline > 1 else ""
diff_link = f"https://github.com/{repo}/compare/{baseline_ref[:12]}...{baseline_name}"
lines.append("")
lines.append(f"> ⚠️ Branch is [**{behind_main} commit{s} behind `main`**]({diff_link}). Consider rebasing for accurate results.")
lines.append(f"> ⚠️ Feature is [**{behind_baseline} commit{s} behind `{baseline_name}`**]({diff_link}). Consider rebasing for accurate results.")
return "\n".join(lines)
@@ -333,8 +334,8 @@ def main():
help="Baseline combined_latency.csv files (A1, A2)",
)
parser.add_argument(
"--branch-csv", nargs="+", required=True,
help="Branch combined_latency.csv files (B1, B2)",
"--feature-csv", "--branch-csv", nargs="+", required=True,
help="Feature combined_latency.csv files (B1, B2)",
)
parser.add_argument("--gas-csv", required=True, help="Path to total_gas.csv")
parser.add_argument(
@@ -345,65 +346,68 @@ def main():
"--repo", default="paradigmxyz/reth", help="GitHub repo (owner/name)"
)
parser.add_argument("--baseline-ref", default=None, help="Baseline commit SHA")
parser.add_argument("--branch-name", default=None, help="Branch name")
parser.add_argument("--branch-sha", default=None, help="Branch commit SHA")
parser.add_argument("--behind-main", type=int, default=0, help="Commits behind main")
parser.add_argument("--baseline-name", default=None, help="Baseline display name")
parser.add_argument("--feature-name", "--branch-name", default=None, help="Feature branch name")
parser.add_argument("--feature-ref", "--branch-sha", "--feature-sha", default=None, help="Feature commit SHA")
parser.add_argument("--behind-baseline", "--behind-main", type=int, default=0, help="Commits behind baseline")
args = parser.parse_args()
if len(args.baseline_csv) != len(args.branch_csv):
print("Must provide equal number of baseline and branch CSVs", file=sys.stderr)
if len(args.baseline_csv) != len(args.feature_csv):
print("Must provide equal number of baseline and feature CSVs", file=sys.stderr)
sys.exit(1)
baseline_runs = []
branch_runs = []
feature_runs = []
for path in args.baseline_csv:
data = parse_combined_csv(path)
if not data:
print(f"No results in {path}", file=sys.stderr)
sys.exit(1)
baseline_runs.append(data)
for path in args.branch_csv:
for path in args.feature_csv:
data = parse_combined_csv(path)
if not data:
print(f"No results in {path}", file=sys.stderr)
sys.exit(1)
branch_runs.append(data)
feature_runs.append(data)
gas = parse_gas_csv(args.gas_csv)
all_baseline = [r for run in baseline_runs for r in run]
all_branch = [r for run in branch_runs for r in run]
all_feature = [r for run in feature_runs for r in run]
summary = compute_summary(all_branch, gas)
summary = compute_summary(all_feature, gas)
with open(args.output_summary, "w") as f:
json.dump(summary, f, indent=2)
print(f"Summary written to {args.output_summary}")
baseline_stats = compute_stats(all_baseline)
branch_stats = compute_stats(all_branch)
paired_stats = compute_paired_stats(baseline_runs, branch_runs)
feature_stats = compute_stats(all_feature)
paired_stats = compute_paired_stats(baseline_runs, feature_runs)
if not paired_stats:
print("No common blocks between baseline and branch runs", file=sys.stderr)
print("No common blocks between baseline and feature runs", file=sys.stderr)
sys.exit(1)
comparison_table = generate_comparison_table(
baseline_stats,
branch_stats,
feature_stats,
paired_stats,
repo=args.repo,
baseline_ref=args.baseline_ref or "main",
branch_name=args.branch_name or "branch",
branch_sha=args.branch_sha or "unknown",
baseline_name=args.baseline_name or "baseline",
feature_name=args.feature_name or "feature",
feature_sha=args.feature_ref or "unknown",
)
print(f"Generated comparison ({paired_stats['n']} paired blocks, "
f"mean diff {paired_stats['mean_diff_ms']:+.3f}ms ± {paired_stats['ci_ms']:.3f}ms)")
markdown = generate_markdown(
summary, comparison_table,
behind_main=args.behind_main,
behind_baseline=args.behind_baseline,
repo=args.repo,
baseline_ref=args.baseline_ref or "",
baseline_name=args.baseline_name or "main",
)
with open(args.output_markdown, "w") as f: