feat(metrics): track revert collection counts in MerkleChangeSets stage

Add metrics to measure the workload size of revert collection:

Metrics Added:
- state_reverts_collected: Total state items (accounts + storage slots)
- trie_reverts_collected: Total trie items (account + storage nodes)
- state_reverts_per_block: Average state reverts per block
- trie_reverts_per_block: Average trie reverts per block

Implementation:
- Created RevertCounts struct to return counts from populate_range()
- Track state revert counts during HashedPostState::from_reverts() loop
- Track trie revert counts using TrieUpdatesSorted::total_len()
- Record all metrics in execute() method

Dashboard:
- Added 'State Reverts Collected' panel (id: 403)
- Added 'Trie Reverts Collected' panel (id: 404)
- Both panels at y:186, side-by-side below existing MerkleChangeSets metrics

These metrics will help understand:
1. Correlation between revert count and execution time
2. Whether workload scales linearly with block count
3. If optimization is needed for high revert scenarios

Part of overlay provider performance analysis to reduce 600-700ms spikes.
This commit is contained in:
Yong Kang
2025-10-31 17:34:43 +08:00
parent f3d0858f18
commit 364910acb4
4 changed files with 1085 additions and 5 deletions

568
IMPLEMENTATION_PLAN.md Normal file
View File

@@ -0,0 +1,568 @@
# Implementation Plan: Reduce Overlay State Provider Spikes
## 🎯 Goal
Reduce 600-700ms overlay provider spikes to <200ms by addressing MerkleChangeSets checkpoint lag.
## 📋 Step-by-Step Implementation
### Step 1: Add MerkleChangeSets Metrics (Priority: CRITICAL)
This gives us visibility into WHY the checkpoint lags.
#### **File: `crates/stages/stages/src/stages/merkle_changesets.rs`**
**A. Add metrics struct at top of file:**
```rust
#[cfg(feature = "metrics")]
use reth_metrics::{
metrics::{Counter, Gauge, Histogram},
Metrics,
};
#[cfg(feature = "metrics")]
#[derive(Clone, Metrics)]
#[metrics(scope = "stages.merkle_changesets")]
struct MerkleChangeSetsMetrics {
/// Total execution duration per stage run
execution_duration: Histogram,
/// Number of blocks processed per execution
blocks_per_execution: Histogram,
/// Time to process one block (average)
per_block_duration: Histogram,
/// Current checkpoint block number
checkpoint_block: Gauge,
/// Checkpoint lag in blocks (tip - checkpoint)
checkpoint_lag: Gauge,
/// Number of stage executions
execution_count: Counter,
}
```
**B. Add metrics field to MerkleChangeSets struct:**
```rust
#[derive(Debug, Clone)]
pub struct MerkleChangeSets {
retention_blocks: u64,
#[cfg(feature = "metrics")]
metrics: MerkleChangeSetsMetrics,
}
```
**C. Update constructors:**
```rust
/// Creates a new `MerkleChangeSets` stage with the default retention of 64 blocks.
#[cfg(not(feature = "metrics"))]
pub const fn new() -> Self {
    Self { retention_blocks: 64 }
}

/// Creates a new `MerkleChangeSets` stage with the default retention of 64 blocks.
///
/// Not `const` when the `metrics` feature is enabled: the metrics handle is built through
/// `Default::default()`, a trait method that cannot be called from a `const fn` on stable Rust.
#[cfg(feature = "metrics")]
pub fn new() -> Self {
    Self { retention_blocks: 64, metrics: MerkleChangeSetsMetrics::default() }
}
/// Creates a new `MerkleChangeSets` stage that uses `retention_blocks` as its retention window.
#[cfg(not(feature = "metrics"))]
pub const fn with_retention_blocks(retention_blocks: u64) -> Self {
    Self { retention_blocks }
}

/// Creates a new `MerkleChangeSets` stage that uses `retention_blocks` as its retention window.
///
/// Not `const` when the `metrics` feature is enabled: the metrics handle is built through
/// `Default::default()`, a trait method that cannot be called from a `const fn` on stable Rust.
#[cfg(feature = "metrics")]
pub fn with_retention_blocks(retention_blocks: u64) -> Self {
    Self { retention_blocks, metrics: MerkleChangeSetsMetrics::default() }
}
```
**D. Instrument execute() method:**
Find the `fn execute()` method around line 299 and add:
```rust
fn execute(&mut self, provider: &Provider, input: ExecInput) -> Result<ExecOutput, StageError> {
#[cfg(feature = "metrics")]
let execution_start = std::time::Instant::now();
#[cfg(feature = "metrics")]
self.metrics.execution_count.increment(1);
// ... existing code ...
let target_range = self.determine_target_range(provider)?;
let blocks_count = target_range.end.saturating_sub(target_range.start);
#[cfg(feature = "metrics")]
self.metrics.blocks_per_execution.record(blocks_count as f64);
// ... existing population code ...
// At the end, before returning:
#[cfg(feature = "metrics")]
{
let execution_duration = execution_start.elapsed();
self.metrics.execution_duration.record(execution_duration.as_secs_f64());
if blocks_count > 0 {
let per_block = execution_duration.as_secs_f64() / blocks_count as f64;
self.metrics.per_block_duration.record(per_block);
}
// Record checkpoint state
if let Some(checkpoint) = input.checkpoint {
self.metrics.checkpoint_block.set(checkpoint.block_number as f64);
// Calculate lag
if let Ok(Some(tip)) = provider.last_finalized_block_number() {
let lag = tip.saturating_sub(checkpoint.block_number);
self.metrics.checkpoint_lag.set(lag as f64);
}
}
}
Ok(output)
}
```
---
### Step 2: Add Grafana Panels for New Metrics
#### **File: `dashboard.json`**
Add these panels after the existing "Overlay State Provider - Checkpoint Delta" panel:
```json
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 177
},
"id": 400,
"panels": [],
"title": "MerkleChangeSets Stage Performance",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Time to execute MerkleChangeSets stage. High values indicate stage is slow.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 1
},
{
"color": "red",
"value": 5
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 178
},
"id": 401,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "reth_stages_merkle_changesets_execution_duration{$instance_label=\"$instance\",quantile=~\"(0.5|0.9|0.95|1)\"}",
"legendFormat": "{{quantile}}",
"refId": "A"
}
],
"title": "MerkleChangeSets - Execution Duration",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Number of blocks processed per stage execution. High values mean infrequent updates causing checkpoint lag.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 50
},
{
"color": "red",
"value": 100
}
]
},
"unit": "blocks"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 178
},
"id": 402,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "reth_stages_merkle_changesets_blocks_per_execution{$instance_label=\"$instance\",quantile=~\"(0.5|0.9|0.95|1)\"}",
"legendFormat": "{{quantile}}",
"refId": "A"
}
],
"title": "MerkleChangeSets - Blocks Per Execution",
"type": "timeseries"
}
```
---
### Step 3: Test and Baseline
**Commands to run:**
```bash
# 1. Format code
cargo +nightly fmt --all
# 2. Build with metrics
cargo build --release --features metrics
# 3. Run and collect metrics for 10 minutes
# Query Prometheus:
rate(reth_stages_merkle_changesets_execution_count[5m])
reth_stages_merkle_changesets_blocks_per_execution{quantile="0.5"}
reth_stages_merkle_changesets_checkpoint_lag
```
**Baseline expectations:**
- Execution frequency: Should see ~1-2 per minute
- Blocks per execution: Probably 50-70 (explains 45-74 block lag!)
- Checkpoint lag: Should match earlier observations (45-74 blocks)
---
### Step 4: Implement Fix (Reduce Batch Size)
Once you confirm batch size is too large:
#### **Find where MerkleChangeSets is constructed**
```bash
# Search for where stage is created
rg "MerkleChangeSets::new|MerkleChangeSets::with_retention_blocks" --type rust
```
Likely in `crates/node/builder/src/launch/` or similar pipeline setup.
#### **Change from:**
```rust
MerkleChangeSets::new() // Default 64 blocks
```
#### **To:**
```rust
MerkleChangeSets::with_retention_blocks(20) // Update every 20 blocks
```
**Or** add time-based triggering (more complex):
```rust
// In stage execution loop
if last_merkle_update.elapsed() > Duration::from_secs(10) {
stage.execute(provider, input)?;
last_merkle_update = Instant::now();
}
```
---
### Step 5: Validate Fix
**After deploying fix:**
```bash
# Check new metrics
reth_stages_merkle_changesets_blocks_per_execution{quantile="0.5"}
# Should be ~20 now (was 50-70)
reth_stages_merkle_changesets_checkpoint_lag
# Should be 0-20 blocks (was 45-74)
# Check overlay metrics
rate(reth_storage_overlay_state_provider_reverts_required[5m])
# Should stay ~1.7 req/s but...
# Check overlay duration
reth_storage_overlay_state_provider_trie_reverts_duration{quantile="0.9"}
# Should drop to ~100-200ms (was 500-600ms)
reth_storage_overlay_state_provider_total_database_provider_ro_duration{quantile="0.9"}
# Should drop to ~200-300ms (was 600-700ms)
```
**Success criteria:**
- ✅ Checkpoint lag: <25 blocks (was 45-74)
- ✅ Trie revert duration: <250ms (was 500-600ms)
- ✅ Total overlay duration: <350ms (was 600-700ms)
- ✅ No regression in overall throughput
---
### Step 6: Add Alerting
#### **File: `alerting_rules.yml` (or similar)**
```yaml
groups:
- name: merkle_changesets
interval: 30s
rules:
- alert: MerkleChangeSetsCheckpointLagging
expr: |
reth_stages_merkle_changesets_checkpoint_lag > 50
for: 5m
labels:
severity: warning
annotations:
summary: "MerkleChangeSets checkpoint is {{ $value }} blocks behind"
description: "Checkpoint lag >50 blocks. Target: <25 blocks. Check stage execution frequency."
- alert: MerkleChangeSetsExecutionSlow
expr: |
reth_stages_merkle_changesets_execution_duration{quantile="0.9"} > 5
for: 5m
labels:
severity: warning
annotations:
summary: "MerkleChangeSets taking {{ $value }}s to execute (p90)"
description: "Stage execution >5s. Check DB performance or reduce batch size."
- alert: OverlayRevertsSpiking
expr: |
reth_storage_overlay_state_provider_trie_reverts_duration{quantile="0.9"} > 0.5
for: 5m
labels:
severity: warning
annotations:
summary: "Overlay trie reverts taking {{ $value }}s (p90)"
description: "Revert duration >500ms. Check MerkleChangeSets checkpoint lag."
```
---
## 🧪 Testing Checklist
### Before Fix
- [ ] Baseline checkpoint lag metric
- [ ] Baseline blocks per execution
- [ ] Baseline overlay revert duration
- [ ] Baseline Engine API throughput
### After Fix
- [ ] Checkpoint lag reduced to <25 blocks
- [ ] Overlay revert duration reduced to <250ms
- [ ] No throughput regression
- [ ] Alerts configured and tested
### Production Validation
- [ ] Deploy to staging first
- [ ] Run for 24 hours
- [ ] Compare metrics before/after
- [ ] Validate under peak load
- [ ] Deploy to production
---
## 📊 Expected Results
| Metric | Before | After | Improvement |
|--------|--------|-------|-------------|
| Checkpoint Lag | 45-74 blocks | 10-20 blocks | 60-70% ↓ |
| Trie Reverts | 500-600ms | 100-200ms | 65-80% ↓ |
| Total Overlay | 600-700ms | 200-300ms | 60-70% ↓ |
| Impact Rate | 1.72 req/s | 1.72 req/s | Same |
| State Root | 5-10μs | 5-10μs | Unchanged |
---
## 🚨 Rollback Plan
If fix causes issues:
1. **Revert retention_blocks change**:
```rust
MerkleChangeSets::new() // Back to default 64
```
2. **Monitor for recovery**:
```bash
# Checkpoint lag should return to baseline
reth_stages_merkle_changesets_checkpoint_lag
```
3. **Alternative approach**: Implement async revert fetching instead
---
## 📝 Next Steps
1. ✅ Add MerkleChangeSets metrics (Step 1)
2. ✅ Add Grafana panels (Step 2)
3. ✅ Collect baseline data (Step 3)
4. 🎯 Implement fix based on baseline (Step 4)
5. 📊 Validate results (Step 5)
6. 🚨 Configure alerts (Step 6)
**Estimated timeline**: 1-2 weeks
**Suggested starting point**: Step 1 (add the MerkleChangeSets metrics).

286
OVERLAY_ANALYSIS.md Normal file
View File

@@ -0,0 +1,286 @@
# Overlay State Provider Performance Analysis
## 🚨 Critical Finding
**600-700ms spikes in Overlay State Provider are NOT reth-bench specific** - they occur in production during Engine API block validation!
## 📊 Root Cause
### Where block_hash is Set (Production Code)
Found in `crates/engine/tree/src/tree/payload_validator.rs:655` and `:785`:
```rust
let factory = OverlayStateProviderFactory::new(self.provider.clone())
.with_block_hash(Some(block_hash)) // ← Sets block_hash to parent block
.with_trie_overlay(Some(multiproof_config.nodes_sorted))
.with_hashed_state_overlay(Some(multiproof_config.state_sorted));
```
**Context**: This is in the **payload validator** that validates blocks received from the consensus layer via Engine API.
**Why block_hash is set**: The validator needs state at the parent block to validate the new block's state transitions.
### Why Checkpoint Lags
**MerkleChangeSets stage** (`crates/stages/stages/src/stages/merkle_changesets.rs`):
- Runs as part of the **staged sync pipeline**
- Only updates when **MerkleExecute stage** completes
- Processes blocks in **batches** based on finalized blocks or retention window (default 64 blocks)
- Not real-time - runs periodically as part of sync
**The Problem**:
1. New blocks are processed by Engine API → Execution stage
2. MerkleChangeSets stage lags behind MerkleExecute
3. Checkpoint can be 45-74 blocks behind current tip
4. Overlay provider needs reverts when `requested_block > checkpoint`
5. **600ms spent fetching reverts** for 45-74 blocks of history
## 🎯 Impact Analysis
### Metrics Summary
- **Frequency**: 1.72 req/s (0.4% of overlay provider calls)
- **Cost per call**: 600-700ms
- Trie Reverts: 500-600ms
- State Reverts: 60-120ms
- **Total overhead**: ~1 second of DB queries per second
- **Throughput correlation**: One observed 57% drop (14:40), but not consistent
### What's NOT Affected
**State root computation**: Stays 5-10μs (completely unaffected)
**99%+ of overlay calls**: Use fast path (no reverts); only ~0.4% of calls hit the revert path
**Block processing**: No direct correlation with spikes
### What IS Affected
⚠️ **Engine API validation**: When checkpoint lags, validation requires expensive reverts
⚠️ **Database load**: 500-600ms of read queries per affected call
⚠️ **Tail latency**: p99 throughput can drop during revert fetches
## 🔍 Why This Happens
```
Timeline of Events:
14:40:00 ──────────────────────────────────────────────────────► 14:46:00
│ │
├─ Engine API receives new block (parent = block N) │
│ - Sets block_hash = N (parent block) │
│ - Checkpoint at block N-60 (lagging!) │
│ - Needs reverts: N-60 → N (60 blocks!) │
│ - Trie fetch: 500ms for 60 blocks │
│ - State fetch: 60ms for 60 blocks │
│ - Total: 600ms spike │
│ │
├─ MerkleChangeSets runs (periodic) │
│ - Updates checkpoint to block N │
│ - Next validation: no reverts needed! │
│ │
└─ Process repeats every ~70 blocks │
```
## 🛠️ Solution Options
### Option 1: Run MerkleChangeSets More Frequently ⭐ RECOMMENDED
**Change**: Reduce batch size from current (probably 64+ blocks) to smaller batches (10-20 blocks)
**How**:
1. Find MerkleChangeSets configuration in pipeline setup
2. Reduce `retention_blocks` or add time-based trigger
3. Make stage run every 10-20 blocks instead of 64+
**Pros**:
- ✅ Reduces checkpoint lag to 10-20 blocks
- ✅ Reduces revert fetch time to ~100-200ms (vs 600ms)
- ✅ More consistent performance
**Cons**:
- ⚠️ More frequent checkpoint writes (increased DB I/O)
- ⚠️ Need to measure impact on overall throughput
**Implementation**:
```rust
// In stage pipeline configuration
MerkleChangeSets::with_retention_blocks(20) // Was: 64
```
### Option 2: Optimize Revert Fetching
**Change**: Cache recent reverts or make fetching async
**A. LRU Cache**:
```rust
struct OverlayStateProviderFactory<F> {
factory: F,
revert_cache: Arc<Mutex<LruCache<(BlockNumber, BlockNumber), CachedReverts>>>,
}
```
**B. Async Fetching**:
```rust
// Don't block overlay creation - fetch reverts in background.
// `trie_reverts` is invoked synchronously (no `.await`), so run it on the blocking thread
// pool instead of tying up an async worker for the duration of the query.
let revert_future = tokio::task::spawn_blocking(move || {
    provider.trie_reverts(from_block + 1)
});
```
**Pros**:
- ✅ Reduces blocking time
- ✅ Can help with repeated queries
**Cons**:
- ⚠️ Adds complexity
- ⚠️ Cache may not help much (queries are for different ranges)
- ⚠️ Async doesn't reduce actual DB query time
### Option 3: Accept Current Behavior ⚠️
**If**:
- 0.4% of calls taking 600ms is acceptable
- Throughput impact is minimal
- State root (critical path) is unaffected
**Then**: Document as expected behavior, add monitoring/alerting
### Option 4: Optimize MerkleChangeSets Stage Itself
**Change**: Make the stage itself faster so it can keep up
**How**:
- Profile `HashedPostState::from_reverts()` (line 195-198)
- Optimize trie update calculations (line 234-253)
- Parallelize block processing if possible
**Pros**:
- ✅ Benefits all operations, not just overlay
- ✅ Reduces overall sync time
**Cons**:
- ⚠️ Most complex solution
- ⚠️ May have limited optimization potential
## 📈 Additional Metrics Needed
### 1. MerkleChangeSets Performance
Add to `crates/stages/stages/src/stages/merkle_changesets.rs`:
```rust
#[cfg(feature = "metrics")]
use reth_metrics::{metrics::{Counter, Histogram, Gauge}, Metrics};

/// Metrics recorded by the `MerkleChangeSets` stage.
// Gated on the same feature as the import above; otherwise `Metrics`, `Histogram` and `Gauge`
// are unresolved when the crate is built without `metrics`.
#[cfg(feature = "metrics")]
#[derive(Clone, Metrics)]
#[metrics(scope = "stages.merkle_changesets")]
struct MerkleChangeSetsMetrics {
    /// Time to execute stage
    execution_duration: Histogram,
    /// Blocks processed per execution
    blocks_per_execution: Histogram,
    /// Current checkpoint block
    checkpoint_block: Gauge,
    /// Checkpoint lag (tip - checkpoint)
    checkpoint_lag: Gauge,
}
**Add instrumentation**:
```rust
/// Executes the stage and records duration, batch size, checkpoint position and checkpoint lag
/// when the `metrics` feature is enabled.
fn execute(&mut self, provider: &Provider, input: ExecInput) -> Result<ExecOutput, StageError> {
    // NOTE(review): `start_timer` is presumably a guard that records into the histogram when
    // dropped — confirm against its definition.
    #[cfg(feature = "metrics")]
    let _timer = start_timer(&self.metrics.execution_duration);

    let target_range = self.determine_target_range(provider)?;
    // Saturating so that an inverted range yields 0 instead of underflowing.
    let blocks_count = target_range.end.saturating_sub(target_range.start);

    #[cfg(feature = "metrics")]
    self.metrics.blocks_per_execution.record(blocks_count as f64);

    // ... rest of execution

    #[cfg(feature = "metrics")]
    {
        self.metrics.checkpoint_block.set(checkpoint as f64);
        let tip = provider.best_block_number()?;
        // Saturating so that a checkpoint ahead of the reported tip reads as zero lag
        // instead of underflowing.
        self.metrics.checkpoint_lag.set(tip.saturating_sub(checkpoint) as f64);
    }
}
### 2. Overlay Usage Tracking
Add to `crates/engine/tree/src/tree/payload_validator.rs`:
```rust
#[cfg(feature = "metrics")]
use reth_metrics::metrics::Counter;
// Track when overlay is created with block_hash
#[cfg(feature = "metrics")]
static OVERLAY_WITH_BLOCK_HASH: Counter =
Counter::new("engine_payload_validator_overlay_with_block_hash");
// Before creating factory:
#[cfg(feature = "metrics")]
OVERLAY_WITH_BLOCK_HASH.increment(1);
```
### 3. Grafana Alerts
```yaml
# Checkpoint lag alert: fires when the stage checkpoint trails the best block by >100 blocks.
# Lag is tip minus checkpoint. `ignoring(stage)` lets the two series match, assuming
# `reth_best_block_number` carries no `stage` label — verify against the exported labels.
- alert: MerkleChangeSetsLagging
  expr: |
    (reth_best_block_number
      - ignoring(stage) reth_sync_checkpoint{stage="MerkleChangeSets"}) > 100
  for: 5m
  annotations:
    summary: "Checkpoint >100 blocks behind"
# High revert rate alert
- alert: OverlayRevertsFrequent
  expr: |
    rate(reth_storage_overlay_state_provider_reverts_required[5m]) > 5
  for: 5m
  annotations:
    summary: "Overlay reverts at {{ $value }} req/s"
```
## 🎯 Recommended Action Plan
### Phase 1: Add Metrics (Week 1)
1. ✅ Add MerkleChangeSets stage metrics
2. ✅ Add checkpoint lag gauge
3. ✅ Add overlay usage tracking
4. ✅ Deploy and collect baseline data
### Phase 2: Quick Win (Week 2)
5. 🎯 **Reduce MerkleChangeSets batch size** from 64 to 20 blocks
6. 📊 Measure impact:
- Checkpoint lag should drop to 0-20 blocks
- Revert duration should drop to ~100-200ms
- Monitor overall throughput for regression
### Phase 3: Optimize (Week 3-4, if needed)
7. If Phase 2 insufficient:
- Profile MerkleChangeSets execution
- Consider async revert fetching
- Consider LRU cache for recent ranges
### Phase 4: Production Validation (Week 4)
8. Compare metrics in production vs test
9. Validate solution works under real load
10. Document final performance characteristics
## 📝 Key Takeaways
1. ✅ **Root cause identified**: MerkleChangeSets checkpoint lags 45-74 blocks
2. ⚠️ **Not reth-bench specific**: Happens in production Engine API validation
3. ✅ **Impact is measurable but limited**: 0.4% of calls, doesn't affect state root
4. 🎯 **Solution is clear**: Run MerkleChangeSets more frequently
5. 📊 **Need better observability**: Add stage performance metrics
## 🔗 Related Code Locations
- Overlay provider: `crates/storage/provider/src/providers/state/overlay.rs`
- Overlay metrics: `crates/storage/provider/src/providers/state/overlay_metrics.rs`
- Payload validator: `crates/engine/tree/src/tree/payload_validator.rs:655,785`
- MerkleChangeSets stage: `crates/stages/stages/src/stages/merkle_changesets.rs`
- Dashboard: `dashboard.json` (panels 303-311)

View File

@@ -44,6 +44,18 @@ struct MerkleChangeSetsMetrics {
/// Number of stage executions
execution_count: Counter,
/// Total state revert items collected per execution
state_reverts_collected: Histogram,
/// Total trie revert items collected per execution
trie_reverts_collected: Histogram,
/// Average state reverts per block
state_reverts_per_block: Histogram,
/// Average trie reverts per block
trie_reverts_per_block: Histogram,
}
/// The `MerkleChangeSets` stage.
@@ -58,6 +70,13 @@ pub struct MerkleChangeSets {
metrics: MerkleChangeSetsMetrics,
}
/// Counts of reverts collected during `populate_range` execution.
#[derive(Debug, Default)]
struct RevertCounts {
/// State revert items: for each block in the target range, the number of accounts plus the
/// number of storage slots in the `HashedPostState` built by `from_reverts`, summed.
total_state_reverts: usize,
/// Trie revert items: the accumulated `total_len()` of the sorted trie updates.
total_trie_reverts: usize,
}
impl MerkleChangeSets {
/// Creates a new `MerkleChangeSets` stage with default retention blocks of 64.
pub const fn new() -> Self {
@@ -200,7 +219,7 @@ impl MerkleChangeSets {
fn populate_range<Provider>(
provider: &Provider,
target_range: Range<BlockNumber>,
) -> Result<(), StageError>
) -> Result<RevertCounts, StageError>
where
Provider: StageCheckpointReader
+ TrieWriter
@@ -238,11 +257,19 @@ impl MerkleChangeSets {
"Computing per-block state reverts",
);
let mut per_block_state_reverts = Vec::new();
let mut revert_counts = RevertCounts::default();
for block_number in target_range.clone() {
per_block_state_reverts.push(HashedPostState::from_reverts::<KeccakKeyHasher>(
let state_revert = HashedPostState::from_reverts::<KeccakKeyHasher>(
provider.tx_ref(),
block_number..=block_number,
)?);
)?;
// Count state reverts (accounts + storage slots)
revert_counts.total_state_reverts += state_revert.accounts.len() +
state_revert.storages.values().map(|s| s.storage.len()).sum::<usize>();
per_block_state_reverts.push(state_revert);
}
// Helper to retrieve state revert data for a specific block from the pre-computed array
@@ -305,6 +332,9 @@ impl MerkleChangeSets {
input.nodes.extend_ref(&this_trie_updates);
let this_trie_updates = this_trie_updates.into_sorted();
// Count trie reverts
revert_counts.total_trie_reverts += this_trie_updates.total_len();
// Write the changesets to the DB using the trie updates produced by the block, and the
// trie reverts as the overlay.
debug!(
@@ -319,7 +349,7 @@ impl MerkleChangeSets {
)?;
}
Ok(())
Ok(revert_counts)
}
}
@@ -412,7 +442,7 @@ where
self.metrics.blocks_per_execution.record(blocks_count as f64);
// Populate the target range with changesets
Self::populate_range(provider, target_range)?;
let revert_counts = Self::populate_range(provider, target_range)?;
// Update the prune checkpoint to reflect that all data before `computed_range.start`
// is not available.
@@ -446,6 +476,18 @@ where
let lag = tip.saturating_sub(checkpoint.block_number);
self.metrics.checkpoint_lag.set(lag as f64);
}
// Record revert collection metrics
self.metrics.state_reverts_collected.record(revert_counts.total_state_reverts as f64);
self.metrics.trie_reverts_collected.record(revert_counts.total_trie_reverts as f64);
if blocks_count > 0 {
let state_per_block =
revert_counts.total_state_reverts as f64 / blocks_count as f64;
let trie_per_block = revert_counts.total_trie_reverts as f64 / blocks_count as f64;
self.metrics.state_reverts_per_block.record(state_per_block);
self.metrics.trie_reverts_per_block.record(trie_per_block);
}
}
Ok(ExecOutput::done(checkpoint))

View File

@@ -6503,6 +6503,190 @@
"title": "MerkleChangeSets - Blocks Per Execution",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Total number of state revert items (accounts + storage slots) collected per execution. Shows workload size.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "items"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 186
},
"id": 403,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "reth_stages_merkle_changesets_state_reverts_collected{$instance_label=\"$instance\",quantile=~\"(0.5|0.9|0.95|1)\"}",
"legendFormat": "state {{quantile}}",
"refId": "A"
}
],
"title": "MerkleChangeSets - State Reverts Collected",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Total number of trie revert items (account nodes + storage nodes) collected per execution. Shows trie workload size.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "items"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 186
},
"id": 404,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "reth_stages_merkle_changesets_trie_reverts_collected{$instance_label=\"$instance\",quantile=~\"(0.5|0.9|0.95|1)\"}",
"legendFormat": "trie {{quantile}}",
"refId": "A"
}
],
"title": "MerkleChangeSets - Trie Reverts Collected",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",