feat(metrics): add MerkleChangeSets stage performance metrics

- Add comprehensive metrics to track MerkleChangeSets stage execution:
  * execution_duration: Time to execute stage per run
  * blocks_per_execution: Number of blocks processed per run
  * per_block_duration: Average processing time per block
  * checkpoint_block: Current checkpoint block number
  * checkpoint_lag: Distance between checkpoint and finalized tip
  * execution_count: Total number of stage executions

- Add Grafana dashboard panels:
  * MerkleChangeSets - Execution Duration panel
  * MerkleChangeSets - Blocks Per Execution panel
  * New row for MerkleChangeSets Stage Performance metrics

These metrics will provide visibility into why the overlay state provider
checkpoint lags 45-74 blocks behind, causing 600-700ms spikes when reverts
are required. Expected baseline: ~64 blocks per execution (default retention).

Part of the implementation plan to reduce overlay provider revert spikes.
This commit is contained in:
Yong Kang
2025-10-31 16:59:04 +08:00
parent c2973f8a60
commit f3d0858f18
2 changed files with 293 additions and 2 deletions

View File

@@ -17,6 +17,35 @@ use reth_trie_db::{DatabaseHashedPostState, DatabaseStateRoot};
use std::ops::Range;
use tracing::{debug, error};
#[cfg(feature = "metrics")]
use reth_metrics::{
metrics::{Counter, Gauge, Histogram},
Metrics,
};
/// Metrics recorded by the `MerkleChangeSets` stage under the `stages.merkle_changesets` scope.
///
/// Only compiled when the `metrics` feature is enabled.
// NOTE(review): the `Metrics` derive presumably exports each field's doc comment as the metric
// description — confirm before rewording any of the field docs below.
#[cfg(feature = "metrics")]
#[derive(Clone, Metrics)]
#[metrics(scope = "stages.merkle_changesets")]
struct MerkleChangeSetsMetrics {
/// Total execution duration per stage run
// Recorded in seconds (`Duration::as_secs_f64`) at the end of `execute`.
execution_duration: Histogram,
/// Number of blocks processed per execution
// Recorded as the length of the target range (`end - start`, saturating).
blocks_per_execution: Histogram,
/// Time to process one block (average)
// Execution duration divided by the block count; not recorded when the block count is zero.
per_block_duration: Histogram,
/// Current checkpoint block number
checkpoint_block: Gauge,
/// Checkpoint lag in blocks (tip - checkpoint)
// NOTE(review): the recording site uses `last_finalized_block_number()` as the "tip" and a
// saturating subtraction, so this gauge never goes negative.
checkpoint_lag: Gauge,
/// Number of stage executions
// Incremented once at the start of every `execute` call.
execution_count: Counter,
}
/// The `MerkleChangeSets` stage.
///
/// This stage processes and maintains trie changesets from the finalized block to the latest block.
@@ -25,17 +54,35 @@ pub struct MerkleChangeSets {
/// The number of blocks to retain changesets for, used as a fallback when the finalized block
/// is not found. Defaults to 64 (2 epochs in beacon chain).
retention_blocks: u64,
#[cfg(feature = "metrics")]
metrics: MerkleChangeSetsMetrics,
}
impl MerkleChangeSets {
/// Creates a new `MerkleChangeSets` stage with default retention blocks of 64.
///
/// This variant is compiled when the `metrics` feature is disabled and stays `const`.
// The two variants are separate cfg-gated items rather than cfg-gated blocks inside one body:
// `#[cfg]` cannot be applied to a tail expression, and `Default::default()` cannot be called
// from a `const fn`.
#[cfg(not(feature = "metrics"))]
pub const fn new() -> Self {
    Self { retention_blocks: 64 }
}

/// Creates a new `MerkleChangeSets` stage with default retention blocks of 64.
///
/// This variant is compiled when the `metrics` feature is enabled. It is not `const` because
/// the metrics handle is built through `Default::default()`.
#[cfg(feature = "metrics")]
pub fn new() -> Self {
    Self { retention_blocks: 64, metrics: MerkleChangeSetsMetrics::default() }
}
/// Creates a new `MerkleChangeSets` stage with a custom number of retention blocks.
///
/// `retention_blocks` is the number of blocks to retain changesets for when the finalized
/// block is not found.
///
/// This variant is compiled when the `metrics` feature is disabled and stays `const`.
// The two variants are separate cfg-gated items rather than cfg-gated blocks inside one body:
// `#[cfg]` cannot be applied to a tail expression, and `Default::default()` cannot be called
// from a `const fn`.
#[cfg(not(feature = "metrics"))]
pub const fn with_retention_blocks(retention_blocks: u64) -> Self {
    Self { retention_blocks }
}

/// Creates a new `MerkleChangeSets` stage with a custom number of retention blocks.
///
/// `retention_blocks` is the number of blocks to retain changesets for when the finalized
/// block is not found.
///
/// This variant is compiled when the `metrics` feature is enabled. It is not `const` because
/// the metrics handle is built through `Default::default()`.
#[cfg(feature = "metrics")]
pub fn with_retention_blocks(retention_blocks: u64) -> Self {
    Self { retention_blocks, metrics: MerkleChangeSetsMetrics::default() }
}
/// Returns the range of blocks which are already computed. Will return an empty range if none
@@ -297,6 +344,12 @@ where
}
fn execute(&mut self, provider: &Provider, input: ExecInput) -> Result<ExecOutput, StageError> {
#[cfg(feature = "metrics")]
let execution_start = std::time::Instant::now();
#[cfg(feature = "metrics")]
self.metrics.execution_count.increment(1);
// Get merkle checkpoint and assert that the target is the same.
let merkle_checkpoint = provider
.get_stage_checkpoint(StageId::MerkleExecute)?
@@ -353,6 +406,11 @@ where
computed_range = target_range.clone();
}
let blocks_count = target_range.end.saturating_sub(target_range.start);
#[cfg(feature = "metrics")]
self.metrics.blocks_per_execution.record(blocks_count as f64);
// Populate the target range with changesets
Self::populate_range(provider, target_range)?;
@@ -370,6 +428,26 @@ where
// `computed_range.end` is exclusive.
let checkpoint = StageCheckpoint::new(computed_range.end.saturating_sub(1));
#[cfg(feature = "metrics")]
{
    // Time elapsed since `execution_start`, in seconds.
    let elapsed_secs = execution_start.elapsed().as_secs_f64();
    self.metrics.execution_duration.record(elapsed_secs);

    // Skip the per-block average for empty runs so we never divide by zero.
    if blocks_count > 0 {
        self.metrics.per_block_duration.record(elapsed_secs / blocks_count as f64);
    }

    // Publish the checkpoint this run is about to return.
    self.metrics.checkpoint_block.set(checkpoint.block_number as f64);

    // Best effort: if the finalized block number is unavailable (error or `None`), the lag
    // gauge is left untouched.
    // NOTE(review): if the checkpoint is normally at or ahead of the finalized block, this
    // saturating subtraction will report 0 — confirm the intended direction of the lag.
    if let Ok(Some(finalized)) = provider.last_finalized_block_number() {
        let blocks_behind = finalized.saturating_sub(checkpoint.block_number);
        self.metrics.checkpoint_lag.set(blocks_behind as f64);
    }
}
Ok(ExecOutput::done(checkpoint))
}

View File

@@ -6290,6 +6290,219 @@
"title": "State root latency",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 177
},
"id": 400,
"panels": [],
"title": "MerkleChangeSets Stage Performance",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Time to execute MerkleChangeSets stage. High values indicate stage is slow.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "red",
"value": 5
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 178
},
"id": 401,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "reth_stages_merkle_changesets_execution_duration{$instance_label=\"$instance\",quantile=~\"(0.5|0.9|0.95|1)\"}",
"legendFormat": "{{quantile}}",
"refId": "A"
}
],
"title": "MerkleChangeSets - Execution Duration",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Number of blocks processed per stage execution. High values mean infrequent updates causing checkpoint lag.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "red",
"value": 100
}
]
},
"unit": "blocks"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 178
},
"id": 402,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "reth_stages_merkle_changesets_blocks_per_execution{$instance_label=\"$instance\",quantile=~\"(0.5|0.9|0.95|1)\"}",
"legendFormat": "{{quantile}}",
"refId": "A"
}
],
"title": "MerkleChangeSets - Blocks Per Execution",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",