ci: reth-bench (#22134)

Alexey Shekhirin authored 2026-02-17 16:47:47 +00:00, committed by GitHub
parent a9a6044bc5
commit 719bbc2543
5 changed files with 1176 additions and 2 deletions

.github/scripts/bench-reth-build.sh vendored Executable file

@@ -0,0 +1,66 @@
#!/usr/bin/env bash
#
# Builds (or fetches from cache) reth binaries for benchmarking.
#
# Usage: bench-reth-build.sh <main|branch> <commit> [branch-sha]
#
# main — build/fetch the baseline binary at <commit> (merge-base)
# branch — build/fetch the candidate binary + reth-bench at <commit>
# optional branch-sha is the PR head commit used as the cache key
#
# Outputs:
# main: target/profiling-baseline/reth
# branch: target/profiling/reth, reth-bench installed to cargo bin
#
# Required: mc (MinIO client) configured at /home/ubuntu/.mc
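#
# Example (as invoked from the bench workflow; SHAs are placeholders):
#   bench-reth-build.sh main   <merge-base-sha>
#   bench-reth-build.sh branch <pr-head-sha>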
set -euo pipefail
MC="mc --config-dir /home/ubuntu/.mc"
MODE="$1"
COMMIT="$2"
case "$MODE" in
main)
BUCKET="minio/reth-binaries/${COMMIT}"
mkdir -p target/profiling-baseline
if $MC stat "${BUCKET}/reth" &>/dev/null; then
echo "Cache hit for main (${COMMIT}), downloading binary..."
$MC cp "${BUCKET}/reth" target/profiling-baseline/reth
chmod +x target/profiling-baseline/reth
else
echo "Cache miss for main (${COMMIT}), building from source..."
CURRENT_REF=$(git rev-parse HEAD)
git checkout "${COMMIT}"
cargo build --profile profiling --bin reth
cp target/profiling/reth target/profiling-baseline/reth
$MC cp target/profiling-baseline/reth "${BUCKET}/reth"
git checkout "${CURRENT_REF}"
fi
;;
branch)
BRANCH_SHA="${3:-$COMMIT}"
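# Cache is keyed on the PR head SHA (falls back to the benchmarked commit).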
BUCKET="minio/reth-binaries/${BRANCH_SHA}"
if $MC stat "${BUCKET}/reth" &>/dev/null && $MC stat "${BUCKET}/reth-bench" &>/dev/null; then
echo "Cache hit for ${BRANCH_SHA}, downloading binaries..."
mkdir -p target/profiling
$MC cp "${BUCKET}/reth" target/profiling/reth
$MC cp "${BUCKET}/reth-bench" /home/ubuntu/.cargo/bin/reth-bench
chmod +x target/profiling/reth /home/ubuntu/.cargo/bin/reth-bench
else
echo "Cache miss for ${BRANCH_SHA}, building from source..."
rustup show active-toolchain || rustup default stable
make profiling
make install-reth-bench
$MC cp target/profiling/reth "${BUCKET}/reth"
$MC cp "$(which reth-bench)" "${BUCKET}/reth-bench"
fi
;;
*)
echo "Usage: $0 <main|branch> <commit> [branch-sha]"
exit 1
;;
esac

.github/scripts/bench-reth-charts.py vendored Normal file

@@ -0,0 +1,232 @@
#!/usr/bin/env python3
"""Generate benchmark charts from reth-bench CSV output.
Usage:
    bench-reth-charts.py <combined_csv> --output-dir <dir> [--baseline <baseline_csv>] [--baseline-name <name>] [--branch-name <name>]
Generates three PNG charts:
1. newPayload latency + Ggas/s per block (+ latency diff when baseline present)
2. Wait breakdown (persistence, execution cache, sparse trie) per block
3. Scatter plot of gas used vs latency
When --baseline is provided, charts overlay both datasets for comparison.
"""
import argparse
import csv
import sys
from pathlib import Path
import numpy as np
try:
import matplotlib
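    # Non-interactive backend so charts render on headless CI runners.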
matplotlib.use("Agg")
import matplotlib.pyplot as plt
except ImportError:
print("matplotlib is required: pip install matplotlib", file=sys.stderr)
sys.exit(1)
GIGAGAS = 1_000_000_000
def parse_combined_csv(path: str) -> list[dict]:
rows = []
with open(path) as f:
reader = csv.DictReader(f)
for row in reader:
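            # Latency and wait columns are in microseconds; persistence_wait may be blank.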
rows.append(
{
"block_number": int(row["block_number"]),
"gas_used": int(row["gas_used"]),
"new_payload_latency_us": int(row["new_payload_latency"]),
"persistence_wait_us": int(row["persistence_wait"])
if row.get("persistence_wait")
else None,
"execution_cache_wait_us": int(row.get("execution_cache_wait", 0)),
"sparse_trie_wait_us": int(row.get("sparse_trie_wait", 0)),
}
)
return rows
def plot_latency_and_throughput(
feature: list[dict], baseline: list[dict] | None, out: Path,
baseline_name: str = "main", branch_name: str = "branch",
):
num_plots = 3 if baseline else 2
fig, axes = plt.subplots(num_plots, 1, figsize=(12, 4 * num_plots), sharex=True)
ax1, ax2 = axes[0], axes[1]
feat_x = [r["block_number"] for r in feature]
feat_lat = [r["new_payload_latency_us"] / 1_000 for r in feature]
feat_ggas = []
for r in feature:
lat_s = r["new_payload_latency_us"] / 1_000_000
feat_ggas.append(r["gas_used"] / lat_s / GIGAGAS if lat_s > 0 else 0)
if baseline:
base_x = [r["block_number"] for r in baseline]
base_lat = [r["new_payload_latency_us"] / 1_000 for r in baseline]
base_ggas = []
for r in baseline:
lat_s = r["new_payload_latency_us"] / 1_000_000
base_ggas.append(r["gas_used"] / lat_s / GIGAGAS if lat_s > 0 else 0)
ax1.plot(base_x, base_lat, linewidth=0.8, label=baseline_name, alpha=0.7)
ax2.plot(base_x, base_ggas, linewidth=0.8, label=baseline_name, alpha=0.7)
ax1.plot(feat_x, feat_lat, linewidth=0.8, label=branch_name)
ax1.set_ylabel("Latency (ms)")
ax1.set_title("newPayload Latency per Block")
ax1.grid(True, alpha=0.3)
if baseline:
ax1.legend()
ax2.plot(feat_x, feat_ggas, linewidth=0.8, label=branch_name)
ax2.set_ylabel("Ggas/s")
ax2.set_title("Execution Throughput per Block")
ax2.grid(True, alpha=0.3)
if baseline:
ax2.legend()
if baseline:
ax3 = axes[2]
base_by_block = {r["block_number"]: r["new_payload_latency_us"] for r in baseline}
blocks, diffs = [], []
for r in feature:
bn = r["block_number"]
if bn in base_by_block and base_by_block[bn] > 0:
pct = (r["new_payload_latency_us"] - base_by_block[bn]) / base_by_block[bn] * 100
blocks.append(bn)
diffs.append(pct)
if blocks:
colors = ["green" if d <= 0 else "red" for d in diffs]
ax3.bar(blocks, diffs, width=1.0, color=colors, alpha=0.7, edgecolor="none")
ax3.axhline(0, color="black", linewidth=0.5)
ax3.set_ylabel("Δ Latency (%)")
ax3.set_title("Per-Block newPayload Latency Change (branch vs main)")
ax3.grid(True, alpha=0.3, axis="y")
axes[-1].set_xlabel("Block Number")
fig.tight_layout()
fig.savefig(out, dpi=150)
plt.close(fig)
def plot_wait_breakdown(
feature: list[dict], baseline: list[dict] | None, out: Path,
baseline_name: str = "main", branch_name: str = "branch",
):
series = [
("Persistence Wait", "persistence_wait_us"),
("State Cache Wait", "execution_cache_wait_us"),
("Trie Cache Wait", "sparse_trie_wait_us"),
]
fig, axes = plt.subplots(len(series), 1, figsize=(12, 3 * len(series)), sharex=True)
for ax, (label, key) in zip(axes, series):
if baseline:
bx = [r["block_number"] for r in baseline if r[key] is not None]
by = [r[key] / 1_000 for r in baseline if r[key] is not None]
if bx:
ax.plot(bx, by, linewidth=0.8, label=baseline_name, alpha=0.7)
fx = [r["block_number"] for r in feature if r[key] is not None]
fy = [r[key] / 1_000 for r in feature if r[key] is not None]
if fx:
ax.plot(fx, fy, linewidth=0.8, label=branch_name)
ax.set_ylabel("ms")
ax.set_title(label)
ax.grid(True, alpha=0.3)
if baseline:
ax.legend()
axes[-1].set_xlabel("Block Number")
fig.suptitle("Wait Time Breakdown per Block", fontsize=14, y=1.01)
fig.tight_layout()
fig.savefig(out, dpi=150, bbox_inches="tight")
plt.close(fig)
def _add_regression(ax, x, y, color, label):
"""Add a linear regression line to the axes."""
if len(x) < 2:
return
xa, ya = np.array(x), np.array(y)
m, b = np.polyfit(xa, ya, 1)
x_range = np.linspace(xa.min(), xa.max(), 100)
ax.plot(x_range, m * x_range + b, color=color, linewidth=1.5, alpha=0.8,
label=label)
def plot_gas_vs_latency(
feature: list[dict], baseline: list[dict] | None, out: Path,
baseline_name: str = "main", branch_name: str = "branch",
):
fig, ax = plt.subplots(figsize=(8, 6))
if baseline:
bgas = [r["gas_used"] / 1_000_000 for r in baseline]
blat = [r["new_payload_latency_us"] / 1_000 for r in baseline]
        ax.scatter(bgas, blat, s=8, alpha=0.5, color="tab:blue")
_add_regression(ax, bgas, blat, "tab:blue", baseline_name)
fgas = [r["gas_used"] / 1_000_000 for r in feature]
flat = [r["new_payload_latency_us"] / 1_000 for r in feature]
    ax.scatter(fgas, flat, s=8, alpha=0.6, color="tab:orange")
_add_regression(ax, fgas, flat, "tab:orange", branch_name)
ax.set_xlabel("Gas Used (Mgas)")
ax.set_ylabel("newPayload Latency (ms)")
ax.set_title("Gas Used vs Latency")
ax.grid(True, alpha=0.3)
ax.legend()
fig.tight_layout()
fig.savefig(out, dpi=150)
plt.close(fig)
def main():
parser = argparse.ArgumentParser(description="Generate benchmark charts")
parser.add_argument("combined_csv", help="Path to combined_latency.csv (feature)")
parser.add_argument(
"--output-dir", required=True, help="Output directory for PNG charts"
)
parser.add_argument(
"--baseline", help="Path to baseline (main) combined_latency.csv"
)
parser.add_argument("--baseline-name", default="main", help="Label for baseline")
parser.add_argument("--branch-name", default="branch", help="Label for branch")
args = parser.parse_args()
feature = parse_combined_csv(args.combined_csv)
if not feature:
print("No results found in combined CSV", file=sys.stderr)
sys.exit(1)
baseline = None
if args.baseline:
baseline = parse_combined_csv(args.baseline)
if not baseline:
print(
"Warning: no results in baseline CSV, skipping comparison",
file=sys.stderr,
)
baseline = None
out_dir = Path(args.output_dir)
out_dir.mkdir(parents=True, exist_ok=True)
bname = args.baseline_name
fname = args.branch_name
plot_latency_and_throughput(feature, baseline, out_dir / "latency_throughput.png", bname, fname)
plot_wait_breakdown(feature, baseline, out_dir / "wait_breakdown.png", bname, fname)
plot_gas_vs_latency(feature, baseline, out_dir / "gas_vs_latency.png", bname, fname)
print(f"Charts written to {out_dir}")
if __name__ == "__main__":
main()

.github/scripts/bench-reth-run.sh vendored Executable file

@@ -0,0 +1,94 @@
#!/usr/bin/env bash
#
# Runs a single reth-bench cycle: mount snapshot → start node → warmup →
# benchmark → stop node → recover snapshot.
#
# Usage: bench-reth-run.sh <label> <binary> <output-dir>
#
# Required env: SCHELK_MOUNT, BENCH_RPC_URL, BENCH_BLOCKS, BENCH_WARMUP_BLOCKS
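#
# Example (as invoked from the bench workflow):
#   bench-reth-run.sh baseline target/profiling-baseline/reth /tmp/bench-results-baseline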
set -euo pipefail
LABEL="$1"
BINARY="$2"
OUTPUT_DIR="$3"
DATADIR="$SCHELK_MOUNT/datadir"
LOG="/tmp/reth-bench-node-${LABEL}.log"
cleanup() {
kill "$TAIL_PID" 2>/dev/null || true
if [ -n "${RETH_PID:-}" ] && sudo kill -0 "$RETH_PID" 2>/dev/null; then
sudo kill "$RETH_PID"
for i in $(seq 1 30); do
sudo kill -0 "$RETH_PID" 2>/dev/null || break
sleep 1
done
sudo kill -9 "$RETH_PID" 2>/dev/null || true
fi
mountpoint -q "$SCHELK_MOUNT" && sudo schelk recover -y || true
}
TAIL_PID=
trap cleanup EXIT
# Mount
sudo schelk mount -y
sync
sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'
echo "=== Cache state after drop ==="
free -h
grep Cached /proc/meminfo
# Start reth
# CPU layout: core 0 = OS/IRQs/reth-bench/aux, cores 1+ = reth node
RETH_BENCH="$(which reth-bench)"
ONLINE=$(nproc --all)
RETH_CPUS="1-$(( ONLINE - 1 ))"
sudo taskset -c "$RETH_CPUS" nice -n -20 "$BINARY" node \
--datadir "$DATADIR" \
--engine.accept-execution-requests-hash \
--http \
--http.port 8545 \
--ws \
--ws.api all \
--authrpc.port 8551 \
--disable-discovery \
--no-persist-peers \
> "$LOG" 2>&1 &
RETH_PID=$!
stdbuf -oL tail -f "$LOG" | sed -u "s/^/[reth] /" &
TAIL_PID=$!
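# Wait for the node to answer JSON-RPC (eth_blockNumber) before benchmarking, up to 60s.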
for i in $(seq 1 60); do
if curl -sf http://127.0.0.1:8545 -X POST \
-H 'Content-Type: application/json' \
-d '{"jsonrpc":"2.0","method":"eth_blockNumber","params":[],"id":1}' \
> /dev/null 2>&1; then
echo "reth (${LABEL}) is ready after ${i}s"
break
fi
if [ "$i" -eq 60 ]; then
echo "::error::reth (${LABEL}) failed to start within 60s"
cat "$LOG"
exit 1
fi
sleep 1
done
# Warmup
sudo nice -n -20 "$RETH_BENCH" new-payload-fcu \
--rpc-url "$BENCH_RPC_URL" \
--engine-rpc-url http://127.0.0.1:8551 \
--jwt-secret "$DATADIR/jwt.hex" \
--advance "${BENCH_WARMUP_BLOCKS:-50}" \
--reth-new-payload 2>&1 | sed -u "s/^/[bench] /"
# Benchmark
sudo nice -n -20 "$RETH_BENCH" new-payload-fcu \
--rpc-url "$BENCH_RPC_URL" \
--engine-rpc-url http://127.0.0.1:8551 \
--jwt-secret "$DATADIR/jwt.hex" \
--advance "$BENCH_BLOCKS" \
--reth-new-payload \
--output "$OUTPUT_DIR" 2>&1 | sed -u "s/^/[bench] /"
# cleanup runs via trap

.github/scripts/bench-reth-summary.py vendored Executable file

@@ -0,0 +1,415 @@
#!/usr/bin/env python3
"""Parse reth-bench CSV output and generate a summary JSON + markdown comparison.
Usage:
    bench-reth-summary.py \
        --baseline-csv <baseline_combined.csv>... \
        --branch-csv <branch_combined.csv>... \
        --gas-csv <total_gas.csv> \
        --output-summary <summary.json> \
        --output-markdown <comment.md> \
        [--repo <owner/repo>] \
        [--baseline-ref <sha>] \
        [--branch-name <name>] \
        [--branch-sha <sha>] \
        [--behind-main <n>]
Generates a paired statistical comparison between baseline (main) and branch.
Matches blocks by number and computes per-block diffs to cancel out gas
variance. Fails if baseline or branch CSV is missing or empty.
"""
import argparse
import csv
import json
import math
import random
import sys
GIGAGAS = 1_000_000_000
T_CRITICAL = 1.96 # two-tailed 95% confidence
BOOTSTRAP_ITERATIONS = 10_000
def parse_combined_csv(path: str) -> list[dict]:
"""Parse combined_latency.csv into a list of per-block dicts."""
rows = []
with open(path) as f:
reader = csv.DictReader(f)
for row in reader:
rows.append(
{
"block_number": int(row["block_number"]),
"gas_used": int(row["gas_used"]),
"gas_limit": int(row["gas_limit"]),
"transaction_count": int(row["transaction_count"]),
"new_payload_latency_us": int(row["new_payload_latency"]),
"fcu_latency_us": int(row["fcu_latency"]),
"total_latency_us": int(row["total_latency"]),
"persistence_wait_us": int(row["persistence_wait"])
if row.get("persistence_wait")
else None,
"execution_cache_wait_us": int(row.get("execution_cache_wait", 0)),
"sparse_trie_wait_us": int(row.get("sparse_trie_wait", 0)),
}
)
return rows
def parse_gas_csv(path: str) -> list[dict]:
"""Parse total_gas.csv into a list of per-block dicts."""
rows = []
with open(path) as f:
reader = csv.DictReader(f)
for row in reader:
rows.append(
{
"block_number": int(row["block_number"]),
"gas_used": int(row["gas_used"]),
"time_us": int(row["time"]),
}
)
return rows
def stddev(values: list[float], mean: float) -> float:
if len(values) < 2:
return 0.0
return math.sqrt(sum((v - mean) ** 2 for v in values) / (len(values) - 1))
def percentile(sorted_vals: list[float], pct: int) -> float:
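    # Simple index-based percentile (floor of pct% of n); expects sorted_vals in ascending order.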
if not sorted_vals:
return 0.0
idx = int(len(sorted_vals) * pct / 100)
idx = min(idx, len(sorted_vals) - 1)
return sorted_vals[idx]
def compute_stats(combined: list[dict]) -> dict:
"""Compute per-run statistics from parsed CSV data."""
n = len(combined)
if n == 0:
return {}
latencies_ms = [r["new_payload_latency_us"] / 1_000 for r in combined]
sorted_lat = sorted(latencies_ms)
mean_lat = sum(latencies_ms) / n
std_lat = stddev(latencies_ms, mean_lat)
mgas_s_values = []
for r in combined:
lat_s = r["new_payload_latency_us"] / 1_000_000
if lat_s > 0:
mgas_s_values.append(r["gas_used"] / lat_s / 1_000_000)
mean_mgas_s = sum(mgas_s_values) / len(mgas_s_values) if mgas_s_values else 0
return {
"n": n,
"mean_ms": mean_lat,
"stddev_ms": std_lat,
"p50_ms": percentile(sorted_lat, 50),
"p90_ms": percentile(sorted_lat, 90),
"p99_ms": percentile(sorted_lat, 99),
"mean_mgas_s": mean_mgas_s,
}
def _paired_data(
baseline: list[dict], branch: list[dict]
) -> tuple[list[tuple[float, float]], list[float], list[float]]:
"""Match blocks and return paired latencies and per-block diffs.
Returns:
pairs: list of (baseline_ms, branch_ms) tuples
        lat_diffs_ms: list of branch - baseline latency diffs in ms
        mgas_diffs: list of branch - baseline Mgas/s diffs
"""
baseline_by_block = {r["block_number"]: r for r in baseline}
branch_by_block = {r["block_number"]: r for r in branch}
common_blocks = sorted(set(baseline_by_block) & set(branch_by_block))
pairs = []
lat_diffs_ms = []
mgas_diffs = []
for bn in common_blocks:
b = baseline_by_block[bn]
f = branch_by_block[bn]
b_ms = b["new_payload_latency_us"] / 1_000
f_ms = f["new_payload_latency_us"] / 1_000
pairs.append((b_ms, f_ms))
lat_diffs_ms.append(f_ms - b_ms)
b_lat_s = b["new_payload_latency_us"] / 1_000_000
f_lat_s = f["new_payload_latency_us"] / 1_000_000
if b_lat_s > 0 and f_lat_s > 0:
mgas_diffs.append(
f["gas_used"] / f_lat_s / 1_000_000
- b["gas_used"] / b_lat_s / 1_000_000
)
return pairs, lat_diffs_ms, mgas_diffs
def compute_paired_stats(
baseline_runs: list[list[dict]],
branch_runs: list[list[dict]],
) -> dict:
"""Compute paired statistics between baseline and branch runs.
Each pair (baseline_runs[i], branch_runs[i]) produces per-block diffs.
All diffs are pooled for the final CI.
"""
all_pairs = []
all_lat_diffs = []
all_mgas_diffs = []
for baseline, branch in zip(baseline_runs, branch_runs):
pairs, lat_diffs, mgas_diffs = _paired_data(baseline, branch)
all_pairs.extend(pairs)
all_lat_diffs.extend(lat_diffs)
all_mgas_diffs.extend(mgas_diffs)
if not all_lat_diffs:
return {}
n = len(all_lat_diffs)
mean_diff = sum(all_lat_diffs) / n
std_diff = stddev(all_lat_diffs, mean_diff)
se = std_diff / math.sqrt(n) if n > 0 else 0.0
ci = T_CRITICAL * se
# Bootstrap CI on difference-of-percentiles (resample paired blocks)
base_lats = sorted([p[0] for p in all_pairs])
branch_lats = sorted([p[1] for p in all_pairs])
p50_diff = percentile(branch_lats, 50) - percentile(base_lats, 50)
p90_diff = percentile(branch_lats, 90) - percentile(base_lats, 90)
p99_diff = percentile(branch_lats, 99) - percentile(base_lats, 99)
rng = random.Random(42)
p50_boot, p90_boot, p99_boot = [], [], []
for _ in range(BOOTSTRAP_ITERATIONS):
sample = rng.choices(all_pairs, k=n)
b_sorted = sorted(p[0] for p in sample)
f_sorted = sorted(p[1] for p in sample)
p50_boot.append(percentile(f_sorted, 50) - percentile(b_sorted, 50))
p90_boot.append(percentile(f_sorted, 90) - percentile(b_sorted, 90))
p99_boot.append(percentile(f_sorted, 99) - percentile(b_sorted, 99))
p50_boot.sort()
p90_boot.sort()
p99_boot.sort()
lo = int(BOOTSTRAP_ITERATIONS * 0.025)
hi = int(BOOTSTRAP_ITERATIONS * 0.975)
mean_mgas_diff = sum(all_mgas_diffs) / len(all_mgas_diffs) if all_mgas_diffs else 0.0
std_mgas_diff = stddev(all_mgas_diffs, mean_mgas_diff) if len(all_mgas_diffs) > 1 else 0.0
mgas_se = std_mgas_diff / math.sqrt(len(all_mgas_diffs)) if all_mgas_diffs else 0.0
mgas_ci = T_CRITICAL * mgas_se
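    # Percentile CIs below are half the width of the 2.5%-97.5% bootstrap interval (a symmetric ± margin).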
return {
"n": n,
"mean_diff_ms": mean_diff,
"ci_ms": ci,
"p50_diff_ms": p50_diff,
"p50_ci_ms": (p50_boot[hi] - p50_boot[lo]) / 2,
"p90_diff_ms": p90_diff,
"p90_ci_ms": (p90_boot[hi] - p90_boot[lo]) / 2,
"p99_diff_ms": p99_diff,
"p99_ci_ms": (p99_boot[hi] - p99_boot[lo]) / 2,
"mean_mgas_diff": mean_mgas_diff,
"mgas_ci": mgas_ci,
}
def compute_summary(combined: list[dict], gas: list[dict]) -> dict:
"""Compute aggregate metrics from parsed CSV data."""
blocks = len(combined)
return {
"blocks": blocks,
}
def format_duration(seconds: float) -> str:
if seconds >= 60:
return f"{seconds / 60:.1f}min"
return f"{seconds}s"
def format_gas(gas: int) -> str:
if gas >= GIGAGAS:
return f"{gas / GIGAGAS:.1f}G"
if gas >= 1_000_000:
return f"{gas / 1_000_000:.1f}M"
return f"{gas:,}"
def fmt_ms(v: float) -> str:
return f"{v:.2f}ms"
def fmt_mgas(v: float) -> str:
return f"{v:.2f}"
def change_str(pct: float, ci_pct: float, lower_is_better: bool) -> str:
    """Format change% with paired CI significance.
    Significant if the CI doesn't cross zero (i.e. |pct| > ci_pct).
    """
    significant = abs(pct) > ci_pct
    if not significant:
        emoji = ""
    elif (pct < 0) == lower_is_better:
        emoji = "🟢 "
    else:
        emoji = "🔴 "
    return f"{pct:+.2f}% {emoji}(±{ci_pct:.2f}%)"
def generate_comparison_table(
run1: dict,
run2: dict,
paired: dict,
repo: str,
baseline_ref: str,
branch_name: str,
branch_sha: str,
) -> str:
"""Generate a markdown comparison table between baseline (main) and branch."""
n = paired["n"]
def pct(base: float, feat: float) -> float:
return (feat - base) / base * 100.0 if base > 0 else 0.0
mean_pct = pct(run1["mean_ms"], run2["mean_ms"])
gas_pct = pct(run1["mean_mgas_s"], run2["mean_mgas_s"])
p50_pct = pct(run1["p50_ms"], run2["p50_ms"])
p90_pct = pct(run1["p90_ms"], run2["p90_ms"])
p99_pct = pct(run1["p99_ms"], run2["p99_ms"])
# Bootstrap CIs as % of baseline percentile
p50_ci_pct = paired["p50_ci_ms"] / run1["p50_ms"] * 100.0 if run1["p50_ms"] > 0 else 0.0
p90_ci_pct = paired["p90_ci_ms"] / run1["p90_ms"] * 100.0 if run1["p90_ms"] > 0 else 0.0
p99_ci_pct = paired["p99_ci_ms"] / run1["p99_ms"] * 100.0 if run1["p99_ms"] > 0 else 0.0
# CI as a percentage of baseline mean
lat_ci_pct = paired["ci_ms"] / run1["mean_ms"] * 100.0 if run1["mean_ms"] > 0 else 0.0
mgas_ci_pct = paired["mgas_ci"] / run1["mean_mgas_s"] * 100.0 if run1["mean_mgas_s"] > 0 else 0.0
base_url = f"https://github.com/{repo}/commit"
baseline_label = f"[`main`]({base_url}/{baseline_ref})"
branch_label = f"[`{branch_name}`]({base_url}/{branch_sha})"
lines = [
f"| Metric | {baseline_label} | {branch_label} | Change |",
"|--------|------|--------|--------|",
f"| Mean | {fmt_ms(run1['mean_ms'])} | {fmt_ms(run2['mean_ms'])} | {change_str(mean_pct, lat_ci_pct, lower_is_better=True)} |",
f"| StdDev | {fmt_ms(run1['stddev_ms'])} | {fmt_ms(run2['stddev_ms'])} | |",
f"| P50 | {fmt_ms(run1['p50_ms'])} | {fmt_ms(run2['p50_ms'])} | {change_str(p50_pct, p50_ci_pct, lower_is_better=True)} |",
f"| P90 | {fmt_ms(run1['p90_ms'])} | {fmt_ms(run2['p90_ms'])} | {change_str(p90_pct, p90_ci_pct, lower_is_better=True)} |",
f"| P99 | {fmt_ms(run1['p99_ms'])} | {fmt_ms(run2['p99_ms'])} | {change_str(p99_pct, p99_ci_pct, lower_is_better=True)} |",
f"| Mgas/s | {fmt_mgas(run1['mean_mgas_s'])} | {fmt_mgas(run2['mean_mgas_s'])} | {change_str(gas_pct, mgas_ci_pct, lower_is_better=False)} |",
"",
f"*{n} blocks*",
]
return "\n".join(lines)
def generate_markdown(
summary: dict, comparison_table: str,
behind_main: int = 0, repo: str = "", baseline_ref: str = "",
) -> str:
"""Generate a markdown comment body."""
lines = ["## Benchmark Results", "", comparison_table]
if behind_main > 0:
s = "s" if behind_main > 1 else ""
diff_link = f"https://github.com/{repo}/compare/{baseline_ref[:12]}...main"
lines.append("")
lines.append(f"> ⚠️ Branch is [**{behind_main} commit{s} behind `main`**]({diff_link}). Consider rebasing for accurate results.")
return "\n".join(lines)
def main():
parser = argparse.ArgumentParser(description="Parse reth-bench ABBA results")
parser.add_argument(
"--baseline-csv", nargs="+", required=True,
help="Baseline combined_latency.csv files (A1, A2)",
)
parser.add_argument(
"--branch-csv", nargs="+", required=True,
help="Branch combined_latency.csv files (B1, B2)",
)
parser.add_argument("--gas-csv", required=True, help="Path to total_gas.csv")
parser.add_argument(
"--output-summary", required=True, help="Output JSON summary path"
)
parser.add_argument("--output-markdown", required=True, help="Output markdown path")
parser.add_argument(
"--repo", default="paradigmxyz/reth", help="GitHub repo (owner/name)"
)
parser.add_argument("--baseline-ref", default=None, help="Baseline commit SHA")
parser.add_argument("--branch-name", default=None, help="Branch name")
parser.add_argument("--branch-sha", default=None, help="Branch commit SHA")
parser.add_argument("--behind-main", type=int, default=0, help="Commits behind main")
args = parser.parse_args()
if len(args.baseline_csv) != len(args.branch_csv):
print("Must provide equal number of baseline and branch CSVs", file=sys.stderr)
sys.exit(1)
baseline_runs = []
branch_runs = []
for path in args.baseline_csv:
data = parse_combined_csv(path)
if not data:
print(f"No results in {path}", file=sys.stderr)
sys.exit(1)
baseline_runs.append(data)
for path in args.branch_csv:
data = parse_combined_csv(path)
if not data:
print(f"No results in {path}", file=sys.stderr)
sys.exit(1)
branch_runs.append(data)
gas = parse_gas_csv(args.gas_csv)
all_baseline = [r for run in baseline_runs for r in run]
all_branch = [r for run in branch_runs for r in run]
summary = compute_summary(all_branch, gas)
with open(args.output_summary, "w") as f:
json.dump(summary, f, indent=2)
print(f"Summary written to {args.output_summary}")
baseline_stats = compute_stats(all_baseline)
branch_stats = compute_stats(all_branch)
paired_stats = compute_paired_stats(baseline_runs, branch_runs)
if not paired_stats:
print("No common blocks between baseline and branch runs", file=sys.stderr)
sys.exit(1)
comparison_table = generate_comparison_table(
baseline_stats,
branch_stats,
paired_stats,
repo=args.repo,
baseline_ref=args.baseline_ref or "main",
branch_name=args.branch_name or "branch",
branch_sha=args.branch_sha or "unknown",
)
print(f"Generated comparison ({paired_stats['n']} paired blocks, "
f"mean diff {paired_stats['mean_diff_ms']:+.3f}ms ± {paired_stats['ci_ms']:.3f}ms)")
markdown = generate_markdown(
summary, comparison_table,
behind_main=args.behind_main,
repo=args.repo,
baseline_ref=args.baseline_ref or "",
)
with open(args.output_markdown, "w") as f:
f.write(markdown)
print(f"Markdown written to {args.output_markdown}")
if __name__ == "__main__":
main()


@@ -1,11 +1,25 @@
# Runs benchmarks.
#
# The reth-bench job replays real blocks via the Engine API against a reth node
# backed by a local snapshot managed with schelk.
#
# It runs the main (baseline) binary and the branch (candidate) binary on the
# same block range (snapshot recovered between runs) to compare performance.
on:
pull_request:
# TODO: Disabled temporarily for https://github.com/CodSpeedHQ/runner/issues/55
# merge_group:
push:
branches: [main]
issue_comment:
types: [created, edited]
workflow_dispatch:
inputs:
blocks:
description: "Number of blocks to benchmark"
required: false
default: "50"
type: string
env:
CARGO_TERM_COLOR: always
@@ -14,9 +28,19 @@ env:
RUSTC_WRAPPER: "sccache"
name: bench
permissions:
contents: write
pull-requests: write
concurrency:
group: bench-${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
codspeed:
runs-on: ${{ github.repository == 'paradigmxyz/reth' && 'depot-ubuntu-latest' || 'ubuntu-latest' }}
if: github.event_name != 'issue_comment'
runs-on: depot-ubuntu-latest
strategy:
matrix:
partition: [1, 2]
@@ -31,6 +55,7 @@ jobs:
- uses: actions/checkout@v6
with:
submodules: true
ref: ${{ github.event_name == 'issue_comment' && format('refs/pull/{0}/merge', github.event.issue.number) || '' }}
- uses: rui314/setup-mold@v1
- uses: dtolnay/rust-toolchain@stable
- uses: mozilla-actions/sccache-action@v0.0.9
@@ -49,3 +74,345 @@ jobs:
run: cargo codspeed run ${{ matrix.crates }}
mode: instrumentation
token: ${{ secrets.CODSPEED_TOKEN }}
reth-bench:
if: github.event_name == 'issue_comment' && github.event.issue.pull_request && startsWith(github.event.comment.body, 'derek bench')
name: reth-bench
runs-on: [self-hosted, Linux, X64]
timeout-minutes: 120
env:
BENCH_RPC_URL: https://ethereum.reth.rs/rpc
SCHELK_MOUNT: /reth-bench
steps:
- name: Check org membership
uses: actions/github-script@v7
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const user = context.payload.comment.user.login;
try {
const { status } = await github.rest.orgs.checkMembershipForUser({
org: 'paradigmxyz',
username: user,
});
if (status !== 204 && status !== 302) {
core.setFailed(`@${user} is not a member of paradigmxyz`);
}
} catch (e) {
core.setFailed(`@${user} is not a member of paradigmxyz`);
}
- name: Parse arguments
id: args
uses: actions/github-script@v7
with:
script: |
const body = context.payload.comment.body.trim();
const known = new Set(['blocks', 'warmup']);
const defaults = { blocks: '500', warmup: '100' };
const unknown = [];
const invalid = [];
const args = body.replace(/^derek bench\s*/, '');
for (const part of args.split(/\s+/).filter(Boolean)) {
const eq = part.indexOf('=');
if (eq === -1) {
unknown.push(part);
continue;
}
const key = part.slice(0, eq);
const value = part.slice(eq + 1);
if (!known.has(key)) {
unknown.push(key);
} else if (!/^\d+$/.test(value)) {
invalid.push(`\`${key}=${value}\` (must be a positive integer)`);
} else {
defaults[key] = value;
}
}
const errors = [];
if (unknown.length) errors.push(`Unknown argument(s): \`${unknown.join('`, `')}\``);
if (invalid.length) errors.push(`Invalid value(s): ${invalid.join(', ')}`);
if (errors.length) {
const msg = `❌ **Invalid bench command**\n\n${errors.join('\n')}\n\n**Usage:** \`derek bench [blocks=N] [warmup=N]\``;
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: msg,
});
core.setFailed(msg);
return;
}
core.setOutput('blocks', defaults.blocks);
core.setOutput('warmup', defaults.warmup);
core.exportVariable('BENCH_BLOCKS', defaults.blocks);
core.exportVariable('BENCH_WARMUP_BLOCKS', defaults.warmup);
- name: Acknowledge request
id: ack
uses: actions/github-script@v7
with:
script: |
await github.rest.reactions.createForIssueComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: context.payload.comment.id,
content: 'eyes',
});
const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
const blocks = '${{ steps.args.outputs.blocks }}';
const warmup = '${{ steps.args.outputs.warmup }}';
const { data: comment } = await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: `🚀 Benchmark started! [View run](${runUrl})\n\n⏳ **Status:** Building binaries...\n\n**Config:** ${blocks} blocks, ${warmup} warmup blocks`,
});
core.setOutput('comment-id', comment.id);
- uses: actions/checkout@v6
with:
submodules: true
fetch-depth: 0
ref: ${{ format('refs/pull/{0}/merge', github.event.issue.number) }}
- uses: dtolnay/rust-toolchain@stable
- uses: mozilla-actions/sccache-action@v0.0.9
continue-on-error: true
# Verify all required tools are available
- name: Check dependencies
run: |
export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
echo "$HOME/.local/bin" >> "$GITHUB_PATH"
echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
missing=()
for cmd in mc schelk cpupower taskset stdbuf python3 curl make uv; do
command -v "$cmd" &>/dev/null || missing+=("$cmd")
done
if [ ${#missing[@]} -gt 0 ]; then
echo "::error::Missing required tools: ${missing[*]}"
exit 1
fi
echo "All dependencies found"
# Build binaries
- name: Fetch or build main binaries
run: |
MERGE_BASE=$(git merge-base HEAD origin/main 2>/dev/null || echo "${{ github.sha }}")
.github/scripts/bench-reth-build.sh main "$MERGE_BASE"
- name: Fetch or build branch binaries
run: |
BRANCH_SHA="${{ github.sha }}"
.github/scripts/bench-reth-build.sh branch "$BRANCH_SHA"
# System tuning for reproducible benchmarks
- name: System setup
run: |
sudo cpupower frequency-set -g performance || true
# Disable turbo boost (Intel and AMD paths)
echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo 2>/dev/null || true
echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost 2>/dev/null || true
sudo swapoff -a || true
echo 0 | sudo tee /proc/sys/kernel/randomize_va_space || true
# Disable SMT (hyperthreading)
for cpu in /sys/devices/system/cpu/cpu*/topology/thread_siblings_list; do
first=$(cut -d, -f1 < "$cpu" | cut -d- -f1)
current=$(echo "$cpu" | grep -o 'cpu[0-9]*' | grep -o '[0-9]*')
if [ "$current" != "$first" ]; then
echo 0 | sudo tee "/sys/devices/system/cpu/cpu${current}/online" || true
fi
done
echo "Online CPUs: $(nproc)"
# Disable transparent huge pages (compaction causes latency spikes)
for p in /sys/kernel/mm/transparent_hugepage /sys/kernel/mm/transparent_hugepages; do
[ -d "$p" ] && echo never | sudo tee "$p/enabled" && echo never | sudo tee "$p/defrag" && break
done || true
# Prevent deep C-states (avoids wake-up latency jitter)
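# Holding /dev/cpu_dma_latency open with a value of 0 keeps CPUs in shallow C-states until the background sleep exits.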
sudo sh -c 'exec 3<>/dev/cpu_dma_latency; echo -ne "\x00\x00\x00\x00" >&3; sleep infinity' &
# Move all IRQs to core 0 (housekeeping core)
for irq in /proc/irq/*/smp_affinity_list; do
echo 0 | sudo tee "$irq" 2>/dev/null || true
done
# Stop noisy background services
sudo systemctl stop irqbalance cron atd unattended-upgrades snapd 2>/dev/null || true
# Log environment for reproducibility
echo "=== Benchmark environment ==="
uname -r
lscpu | grep -E 'Model name|CPU\(s\)|MHz|NUMA'
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq
cat /sys/kernel/mm/transparent_hugepage/enabled 2>/dev/null || cat /sys/kernel/mm/transparent_hugepages/enabled 2>/dev/null || echo "THP: unknown"
free -h
# Clean up any leftover state
- name: Pre-flight cleanup
run: |
pkill -9 reth || true
mountpoint -q "$SCHELK_MOUNT" && sudo schelk recover -y || true
- name: Update status (running benchmarks)
if: steps.ack.outputs.comment-id
uses: actions/github-script@v7
with:
script: |
const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
await github.rest.issues.updateComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: ${{ steps.ack.outputs.comment-id || 0 }},
body: `🚀 Benchmark started! [View run](${runUrl})\n\n⏳ **Status:** Running benchmarks (2 runs)...`,
});
- name: "Run benchmark: baseline"
run: taskset -c 0 .github/scripts/bench-reth-run.sh baseline target/profiling-baseline/reth /tmp/bench-results-baseline
- name: "Run benchmark: branch"
run: taskset -c 0 .github/scripts/bench-reth-run.sh branch target/profiling/reth /tmp/bench-results-branch
# Results & charts
- name: Parse results
id: results
if: success()
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
BRANCH_SHA: ${{ github.sha }}
run: |
git fetch origin main --quiet
# Use the actual PR head commit, not HEAD (which is the merge commit
# refs/pull/N/merge and always has origin/main as a parent).
MERGE_BASE=$(git merge-base "${BRANCH_SHA}" origin/main 2>/dev/null || echo "${{ github.sha }}")
MAIN_HEAD=$(git rev-parse origin/main 2>/dev/null || echo "")
BEHIND_MAIN=0
if [ -n "$MAIN_HEAD" ] && [ "$MERGE_BASE" != "$MAIN_HEAD" ]; then
BEHIND_MAIN=$(git rev-list --count "${MERGE_BASE}..${MAIN_HEAD}" 2>/dev/null || echo "0")
fi
SUMMARY_ARGS="--output-summary /tmp/bench-summary.json"
SUMMARY_ARGS="$SUMMARY_ARGS --output-markdown /tmp/bench-comment.md"
SUMMARY_ARGS="$SUMMARY_ARGS --repo ${{ github.repository }}"
SUMMARY_ARGS="$SUMMARY_ARGS --baseline-ref ${MERGE_BASE}"
SUMMARY_ARGS="$SUMMARY_ARGS --branch-name ${BRANCH_NAME}"
SUMMARY_ARGS="$SUMMARY_ARGS --branch-sha ${BRANCH_SHA}"
SUMMARY_ARGS="$SUMMARY_ARGS --baseline-csv /tmp/bench-results-baseline/combined_latency.csv"
SUMMARY_ARGS="$SUMMARY_ARGS --branch-csv /tmp/bench-results-branch/combined_latency.csv"
SUMMARY_ARGS="$SUMMARY_ARGS --gas-csv /tmp/bench-results-branch/total_gas.csv"
if [ "$BEHIND_MAIN" -gt 0 ]; then
SUMMARY_ARGS="$SUMMARY_ARGS --behind-main $BEHIND_MAIN"
fi
# shellcheck disable=SC2086
python3 .github/scripts/bench-reth-summary.py $SUMMARY_ARGS
- name: Generate charts
if: success()
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
run: |
CHART_ARGS="/tmp/bench-results-branch/combined_latency.csv --output-dir /tmp/bench-charts"
CHART_ARGS="$CHART_ARGS --baseline /tmp/bench-results-baseline/combined_latency.csv"
CHART_ARGS="$CHART_ARGS --branch-name ${BRANCH_NAME}"
# shellcheck disable=SC2086
uv run --with matplotlib python3 .github/scripts/bench-reth-charts.py $CHART_ARGS
- name: Upload results
if: success()
uses: actions/upload-artifact@v4
with:
name: bench-reth-results
path: |
/tmp/bench-results-baseline/
/tmp/bench-results-branch/
/tmp/bench-summary.json
/tmp/bench-charts/
- name: Push charts
id: push-charts
if: success()
run: |
PR_NUMBER=${{ github.event.issue.number }}
RUN_ID=${{ github.run_id }}
CHART_DIR="pr/${PR_NUMBER}/${RUN_ID}"
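# Charts go to the orphan bench-charts branch so the PR comment can embed them via raw.githubusercontent.com.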
if git fetch origin bench-charts 2>/dev/null; then
git checkout bench-charts
else
git checkout --orphan bench-charts
git rm -rf . 2>/dev/null || true
fi
mkdir -p "${CHART_DIR}"
cp /tmp/bench-charts/*.png "${CHART_DIR}/"
git add "${CHART_DIR}"
git -c user.name="github-actions" -c user.email="github-actions@github.com" \
commit -m "bench charts for PR #${PR_NUMBER} run ${RUN_ID}"
git push origin bench-charts
echo "sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
- name: Compare & comment
if: success()
uses: actions/github-script@v7
with:
script: |
const fs = require('fs');
let comment = '';
try {
comment = fs.readFileSync('/tmp/bench-comment.md', 'utf8');
} catch (e) {
comment = '⚠️ Engine benchmark completed but failed to generate comparison.';
}
const sha = '${{ steps.push-charts.outputs.sha }}';
const prNumber = context.issue.number;
const runId = '${{ github.run_id }}';
const baseUrl = `https://raw.githubusercontent.com/${context.repo.owner}/${context.repo.repo}/${sha}/pr/${prNumber}/${runId}`;
const charts = [
{ file: 'latency_throughput.png', label: 'Latency, Throughput & Diff' },
{ file: 'wait_breakdown.png', label: 'Wait Time Breakdown' },
{ file: 'gas_vs_latency.png', label: 'Gas vs Latency' },
];
let chartMarkdown = '\n\n### Charts\n\n';
for (const chart of charts) {
chartMarkdown += `<details><summary>${chart.label}</summary>\n\n`;
chartMarkdown += `![${chart.label}](${baseUrl}/${chart.file})\n\n`;
chartMarkdown += `</details>\n\n`;
}
comment += chartMarkdown;
const requestedBy = '${{ github.event.comment.user.login }}';
const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
const body = `cc @${requestedBy}\n\n✅ Benchmark complete! [View run](${runUrl})\n\n${comment}`;
const ackCommentId = '${{ steps.ack.outputs.comment-id }}';
if (ackCommentId) {
await github.rest.issues.updateComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: parseInt(ackCommentId),
body,
});
} else {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body,
});
}
- name: Upload node log
if: failure()
uses: actions/upload-artifact@v4
with:
name: reth-node-log
path: |
/tmp/reth-bench-node-baseline.log
/tmp/reth-bench-node-branch.log
- name: Restore system settings
if: always()
run: |
sudo systemctl start irqbalance cron atd 2>/dev/null || true