diff --git a/crates/trie/sparse-parallel/src/trie.rs b/crates/trie/sparse-parallel/src/trie.rs index 5f710f4bb4..42dc77c86c 100644 --- a/crates/trie/sparse-parallel/src/trie.rs +++ b/crates/trie/sparse-parallel/src/trie.rs @@ -1872,7 +1872,9 @@ impl SparseSubtrie { // Memoize the hash of a previously blinded node in a new branch // node. hash: Some(*hash), - store_in_db_trie: Some(masks.is_some()), + store_in_db_trie: Some(masks.is_some_and(|m| { + !m.hash_mask.is_empty() || !m.tree_mask.is_empty() + })), }); } // Branch node already exists, or an extension node was placed where a diff --git a/crates/trie/trie/src/proof_v2/mod.rs b/crates/trie/trie/src/proof_v2/mod.rs index a4967f0421..f2402d0591 100644 --- a/crates/trie/trie/src/proof_v2/mod.rs +++ b/crates/trie/trie/src/proof_v2/mod.rs @@ -1648,15 +1648,8 @@ mod tests { // though we never store the root node so the masks for it aren't really valid. let masks = if path.is_empty() { None - } else if let Some(branch_masks) = - proof_legacy_result.branch_node_masks.get(path) - { - TrieMasks { - hash_mask: Some(branch_masks.hash_mask), - tree_mask: Some(branch_masks.tree_mask), - } } else { - None + proof_legacy_result.branch_node_masks.get(path).copied() }; ProofTrieNode { path: *path, node, masks } diff --git a/prototype-option-branch-node-masks.md b/prototype-option-branch-node-masks.md new file mode 100644 index 0000000000..97f0243f55 --- /dev/null +++ b/prototype-option-branch-node-masks.md @@ -0,0 +1,145 @@ +# Prototype: Replace `TrieMasks` with `Option` in `ProofTrieNode` + +## Summary + +This prototype explores replacing `TrieMasks` with `Option` in `ProofTrieNode` and related APIs, as suggested after PR #20664. + +## Current State + +```rust +// TrieMasks - each field is independently optional +pub struct TrieMasks { + pub hash_mask: Option, // 4 bytes + pub tree_mask: Option, // 4 bytes +} // Total: 8 bytes + +// BranchNodeMasks - both fields always present together +pub struct BranchNodeMasks { + pub hash_mask: TrieMask, // 2 bytes + pub tree_mask: TrieMask, // 2 bytes +} // Total: 4 bytes + +// Option = 6 bytes +``` + +## Key Findings + +### 1. Size Reduction +- `TrieMasks`: 8 bytes +- `Option`: 6 bytes +- **Savings: 25% reduction (2 bytes per instance)** + +### 2. Code Simplification + +The prototype shows **-27 net lines** across 5 files: + +| File | Insertions | Deletions | Net | +|------|------------|-----------|-----| +| common/src/trie.rs | +2 | -1 | +1 | +| sparse/src/provider.rs | +4 | -4 | 0 | +| sparse/src/state.rs | +16 | -23 | -7 | +| sparse/src/traits.rs | +3 | -3 | 0 | +| sparse/src/trie.rs | +18 | -33 | -15 | +| **Total** | **+40** | **-67** | **-27** | + +### 3. Semantic Correctness + +Analysis confirms that **both masks always come together**: +- When read from database, both masks are fetched together +- Production code never sets one mask without the other +- Independent mask setting (`hash_mask: Some(..), tree_mask: None`) only appears in **tests** + +### 4. Simplification Examples + +**Before:** +```rust +if masks.tree_mask.is_some() || masks.hash_mask.is_some() { + self.branch_node_masks.insert( + path, + BranchNodeMasks { + tree_mask: masks.tree_mask.unwrap_or_default(), + hash_mask: masks.hash_mask.unwrap_or_default(), + }, + ); +} +``` + +**After:** +```rust +if let Some(branch_masks) = masks { + self.branch_node_masks.insert(path, branch_masks); +} +``` + +**Before:** +```rust +store_in_db_trie: Some( + masks.hash_mask.is_some_and(|mask| !mask.is_empty()) || + masks.tree_mask.is_some_and(|mask| !mask.is_empty()), +), +``` + +**After:** +```rust +store_in_db_trie: Some(masks.is_some_and(|m| { + !m.hash_mask.is_empty() || !m.tree_mask.is_empty() +})), +``` + +### 5. Full Scope of Changes + +| Location | Count | Notes | +|----------|-------|-------| +| sparse-parallel/src/trie.rs | 70 | Mostly tests | +| sparse/src/trie.rs | 16 | All tests after migration | +| proof_v2/mod.rs | 8 | Production code | +| proof_v2/node.rs | 5 | Production code | +| configured_sparse_trie.rs | 3 | Production code | +| common/src/trie.rs | 2 | Struct definition | +| common/src/lib.rs | 1 | Export | + +### 6. Additional Changes Required + +- `RevealedNode` in provider.rs should also change from two `Option` fields to single `Option` (done in prototype) +- Tests need updates to use `Some(BranchNodeMasks { .. })` instead of `TrieMasks { hash_mask: Some(..), tree_mask: None }` +- `proof_v2` module needs similar updates +- Parallel sparse trie needs same changes as serial version + +## Recommendation + +**Proceed with this refactor** because: + +1. **Memory reduction**: 25% smaller per instance +2. **Code simplification**: -27 lines, cleaner patterns +3. **Semantic correctness**: Reflects actual usage (masks come together) +4. **Type safety**: Impossible to create invalid state with one mask missing + +## Test Migration Strategy + +For tests that currently set masks independently: +```rust +// Before +TrieMasks { hash_mask: Some(TrieMask::new(0b01)), tree_mask: None } + +// After - use default for the other mask +Some(BranchNodeMasks { hash_mask: TrieMask::new(0b01), tree_mask: TrieMask::default() }) +``` + +## Files Changed in Prototype + +``` +crates/trie/common/src/trie.rs (ProofTrieNode definition) +crates/trie/sparse/src/provider.rs (RevealedNode definition) +crates/trie/sparse/src/state.rs (State trie operations) +crates/trie/sparse/src/traits.rs (Trait signatures) +crates/trie/sparse/src/trie.rs (Serial sparse trie impl) +``` + +## Outstanding Work + +1. Update tests in `sparse/src/trie.rs` +2. Update `sparse-parallel/src/trie.rs` (same changes) +3. Update `proof_v2` module +4. Update `configured_sparse_trie.rs` +5. Run full test suite and benchmarks +6. Consider removing `TrieMasks` struct entirely if no longer needed diff --git a/reth-memory-layout-analysis.md b/reth-memory-layout-analysis.md new file mode 100644 index 0000000000..f240a89e48 --- /dev/null +++ b/reth-memory-layout-analysis.md @@ -0,0 +1,504 @@ +# Reth Memory Layout Optimization Analysis +## Engine, Trie, Multiproof, and Prewarming Components + +This document provides a comprehensive analysis of memory layout optimization opportunities across the reth codebase, focusing on engine, trie, multiproof, and prewarming components. + +--- + +## Executive Summary + +After thorough exploration of the codebase, I've identified **30+ data structures** with potential memory layout improvements. The highest priority optimizations are in: + +1. **Trie node representations** - `SparseNode` enum (40+ bytes per node) +2. **Proof structures** - `MultiProof`, `StorageMultiProof` with duplicate hash mask maps +3. **Execution cache** - `ExecutionCache`, `PrewarmContext` with scattered boolean fields +4. **State tracking** - `TreeState`, `BlockState` with suboptimal field ordering + +Estimated memory savings: **5-15% reduction** in hot path allocations with proper field reordering and consolidation. + +--- + +## Table of Contents + +1. [Critical Priority - Trie Node Structures](#1-critical-priority---trie-node-structures) +2. [Critical Priority - Proof Structures](#2-critical-priority---proof-structures) +3. [High Priority - Engine Execution Structures](#3-high-priority---engine-execution-structures) +4. [High Priority - State Management](#4-high-priority---state-management) +5. [Medium Priority - Supporting Structures](#5-medium-priority---supporting-structures) +6. [Alloy Type Considerations](#6-alloy-type-considerations) +7. [Recommended Actions](#7-recommended-actions) + +--- + +## 1. Critical Priority - Trie Node Structures + +### 1.1 `SparseNode` Enum + +**File**: `crates/trie/sparse/src/trie.rs:1835` + +```rust +pub enum SparseNode { + Empty, + Hash(B256), // 32 bytes + Leaf { key: Nibbles, hash: Option }, // Nibbles + 40 bytes + Extension { + key: Nibbles, + hash: Option, + store_in_db_trie: Option + }, + Branch { + state_mask: TrieMask, // u16 = 2 bytes + hash: Option, // 33 bytes (Option) + store_in_db_trie: Option // 2 bytes (Option) + }, +} +``` + +**Issues**: +- Enum discriminant + largest variant = 40+ bytes per node +- `Option` adds 33 bytes (1 byte tag + 32 bytes value) +- `Option` wastes 2 bytes when 1 bit suffices +- `TrieMask` (u16) followed by `Option` causes 6 bytes padding + +**Recommendations**: +1. Pack `store_in_db_trie` and `hash.is_some()` into a single `u8` flags field +2. Store hash externally in a separate HashMap keyed by node path +3. Consider smaller representation for common patterns + +### 1.2 `SerialSparseTrie` + +**File**: `crates/trie/sparse/src/trie.rs:297-315` + +```rust +pub struct SerialSparseTrie { + nodes: HashMap, + branch_node_tree_masks: HashMap, // TrieMask = u16 + branch_node_hash_masks: HashMap, // Duplicate structure + values: HashMap>, + prefix_set: PrefixSetMut, + updates: Option, + rlp_buf: Vec, +} +``` + +**Issues**: +- **Two identical HashMaps** (`tree_masks` and `hash_masks`) with same key type +- 4 HashMap allocations with `Nibbles` keys (variable-length) +- `rlp_buf: Vec` at end is a reusable buffer that could be `Arc` shared + +**Recommendations**: +1. **Consolidate mask maps**: `HashMap` or `HashMap` +2. Reorder fields: `rlp_buf` before `updates` (often accessed after updates) +3. Consider `Arc>>` for `rlp_buf` if shared across threads + +### 1.3 `ParallelSparseTrie` + +**File**: `crates/trie/sparse-parallel/src/trie.rs:105-127` + +```rust +pub struct ParallelSparseTrie { + upper_subtrie: Box, + lower_subtries: [LowerSparseSubtrie; 16], // Fixed array of 16 + prefix_set: PrefixSetMut, + updates: Option, + branch_node_tree_masks: HashMap, + branch_node_hash_masks: HashMap, + update_actions_buffers: Vec>, + parallelism_thresholds: ParallelismThresholds, + #[cfg(feature = "metrics")] + metrics: ParallelSparseTrieMetrics, +} +``` + +**Issues**: +- Array of 16 subtries after `Box` - severe padding potential +- Same duplicate mask map pattern as `SerialSparseTrie` +- `Option` before two HashMaps + +**Recommendations**: +1. Move `lower_subtries` array to end (largest field) +2. Consolidate mask maps +3. Group `Option` and small scalar fields together + +--- + +## 2. Critical Priority - Proof Structures + +### 2.1 `MultiProof` + +**File**: `crates/trie/common/src/proofs.rs:173-182` + +```rust +pub struct MultiProof { + pub account_subtree: ProofNodes, + pub branch_node_hash_masks: HashMap, + pub branch_node_tree_masks: HashMap, + pub storages: B256Map, +} +``` + +**Issues**: +- Two separate HashMaps with identical structure +- `ProofNodes` (large) followed by two maps, then another map + +**Recommendations**: +1. **Consolidate mask maps**: Create `TrieMasks { hash: TrieMask, tree: TrieMask }` struct +2. `HashMap` saves one HashMap allocation + +### 2.2 `StorageMultiProof` + +**File**: `crates/trie/common/src/proofs.rs:450-460` + +```rust +pub struct StorageMultiProof { + pub root: B256, // 32 bytes + pub subtree: ProofNodes, + pub branch_node_hash_masks: HashMap, + pub branch_node_tree_masks: HashMap, +} +``` + +**Issues**: +- Same duplicate HashMap pattern +- `root` (32 bytes) at start is optimal for alignment +- Two HashMaps at end should be consolidated + +### 2.3 `ProofResultMessage` + +**File**: `crates/trie/parallel/src/proof_task.rs:598-607` + +```rust +pub struct ProofResultMessage { + pub sequence_number: u64, + pub result: Result, + pub elapsed: Duration, + pub state: HashedPostState, +} +``` + +**Issues**: +- `u64` (8 bytes) at start, `Duration` (16 bytes) later +- `Result` enum size varies significantly by variant + +**Recommendations**: +1. Reorder: `elapsed` (16 bytes) → `sequence_number` (8 bytes) → `result` → `state` +2. Consider boxing the error variant in `Result` + +--- + +## 3. High Priority - Engine Execution Structures + +### 3.1 `PayloadProcessor` + +**File**: `crates/engine/tree/src/tree/payload_processor/mod.rs:104` + +```rust +pub struct PayloadProcessor { + executor: WorkloadExecutor, + execution_cache: ExecutionCache, + trie_metrics: MultiProofTaskMetrics, + cross_block_cache_size: u64, + disable_transaction_prewarming: bool, // scattered bools + disable_state_cache: bool, + evm_config: Evm, + precompile_cache_disabled: bool, + precompile_cache_map: PrecompileCacheMap>, + sparse_state_trie: Arc>>, + disable_parallel_sparse_trie: bool, + prewarm_max_concurrency: usize, +} +``` + +**Issues**: +- **4 boolean fields scattered** throughout structure +- Booleans between large fields cause padding waste +- Generic `Evm` type size varies + +**Recommendations**: +1. Create `ProcessorFlags` bitfield: + ```rust + struct ProcessorFlags { + bits: u8, // 4 bools = 4 bits + } + impl ProcessorFlags { + const DISABLE_TX_PREWARMING: u8 = 0b0001; + const DISABLE_STATE_CACHE: u8 = 0b0010; + const DISABLE_PRECOMPILE_CACHE: u8 = 0b0100; + const DISABLE_PARALLEL_SPARSE_TRIE: u8 = 0b1000; + } + ``` +2. Move all config to struct end +3. Group related fields (executor, evm_config) + +### 3.2 `PrewarmContext` + +**File**: `crates/engine/tree/src/tree/payload_processor/prewarm.rs:457` + +```rust +pub(super) struct PrewarmContext { + pub(super) env: ExecutionEnv, + pub(super) evm_config: Evm, + pub(super) saved_cache: Option, + pub(super) provider: StateProviderBuilder, + pub(super) metrics: PrewarmMetrics, + pub(super) terminate_execution: Arc, + pub(super) precompile_cache_disabled: bool, + pub(super) precompile_cache_map: PrecompileCacheMap>, +} +``` + +**Issues**: +- **Frequently cloned** for worker distribution +- `bool` after `Arc` causes padding +- Generic types make layout unpredictable + +**Recommendations**: +1. Move `terminate_execution` and `precompile_cache_disabled` together +2. Consider `Arc` for thread-shared parts +3. Separate mutable worker state from immutable config + +### 3.3 `ExecutionCache` + +**File**: `crates/engine/tree/src/tree/cached_state.rs:342` + +```rust +pub(crate) struct ExecutionCache { + code_cache: Cache>, + storage_cache: Cache>, + account_cache: Cache>, +} +``` + +**Issues**: +- 3 separate Moka caches = 3 allocations +- Accessed on every state read (hot path) + +**Recommendations**: +1. Consider a unified cache with tagged keys if access patterns permit +2. Pre-size caches based on expected workload +3. Evaluate if `Arc` sharing is necessary + +--- + +## 4. High Priority - State Management + +### 4.1 `TreeState` + +**File**: `crates/engine/tree/src/tree/state.rs:24` + +```rust +pub struct TreeState { + pub(crate) blocks_by_hash: HashMap>, + pub(crate) blocks_by_number: BTreeMap>>, + pub(crate) parent_to_child: HashMap>, + pub(crate) current_canonical_head: BlockNumHash, + pub(crate) engine_kind: EngineApiKind, // 1-2 bytes +} +``` + +**Issues**: +- `engine_kind` (1-2 bytes) at end after large collections +- 6-7 bytes padding before `engine_kind` + +**Recommendations**: +1. Move `engine_kind` and `current_canonical_head` (16 bytes) to struct start +2. Order: `engine_kind` → `current_canonical_head` → collections + +### 4.2 `BlockState` + +**File**: `crates/chain-state/src/in_memory.rs:575` + +```rust +pub struct BlockState { + block: ExecutedBlock, + parent: Option>, +} +``` + +**Issues**: +- `Option>` is 16 bytes (Option tag + pointer) +- Small struct, but frequently allocated + +### 4.3 `HashedStorage` + +**File**: `crates/trie/common/src/hashed_state.rs:404-409` + +```rust +pub struct HashedStorage { + pub wiped: bool, // 1 byte + pub storage: B256Map, // HashMap +} +``` + +**Issues**: +- `bool` before HashMap causes 7 bytes padding (HashMap aligned to 8) + +**Recommendations**: +1. Move `wiped` after `storage`, or +2. Pack with other metadata if available + +--- + +## 5. Medium Priority - Supporting Structures + +### 5.1 `InvalidHeaderCache` + +**File**: `crates/engine/tree/src/tree/invalid_headers.rs:19` + +```rust +pub struct InvalidHeaderCache { + headers: LruMap, + metrics: InvalidHeaderCacheMetrics, +} + +struct HeaderEntry { + hit_count: u8, // 1 byte + header: BlockWithParent, // Large +} +``` + +**Issues**: +- `hit_count: u8` before large struct = 7 bytes padding + +### 5.2 `BlockBuffer` + +**File**: `crates/engine/tree/src/tree/block_buffer.rs:19` + +```rust +pub struct BlockBuffer { + pub(crate) blocks: HashMap>, + pub(crate) parent_to_child: HashMap>, + pub(crate) earliest_blocks: BTreeMap>, + pub(crate) block_queue: VecDeque, + pub(crate) max_blocks: usize, + pub(crate) metrics: BlockBufferMetrics, +} +``` + +**Issues**: +- 5 collection types in sequence +- `max_blocks` (8 bytes) between collections + +### 5.3 `AccountProof` + +**File**: `crates/trie/common/src/proofs.rs:573-585` + +```rust +pub struct AccountProof { + pub address: Address, // 20 bytes + pub info: Option, + pub proof: Vec, + pub storage_root: B256, // 32 bytes + pub storage_proofs: Vec, +} +``` + +**Issues**: +- `Address` (20 bytes) at start, misaligned for 32-byte B256 + +**Recommendations**: +1. Reorder: `storage_root` (32 bytes) → `address` (20 bytes) → Vecs/Options + +--- + +## 6. Alloy Type Considerations + +### 6.1 Core Types + +| Type | Size | Alignment | Notes | +|------|------|-----------|-------| +| `B256` | 32 bytes | 1 (byte array) | Optimal for hashing | +| `Address` | 20 bytes | 1 (byte array) | Often padded to 24 | +| `U256` | 32 bytes | 8 | Little-endian limbs | +| `TrieMask` | 2 bytes | 2 (u16) | Branch child bitmap | +| `Nibbles` | 8-72 bytes | 8 | SmallVec-backed | + +### 6.2 Map Types + +- `B256Map` - Uses `foldhash` hasher optimized for 32-byte keys +- `B256Set` - Same optimization for set operations +- Both avoid unnecessary hashing computation on fixed-size keys + +### 6.3 Potential Alloy Improvements + +1. **`Address` padding**: Consider `#[repr(align(8))]` for Address to reduce struct padding +2. **`TrieMask` packing**: Could be combined with flags in same u16 +3. **`Nibbles` variants**: Consider fixed-size array for common path lengths (≤ 64) + +--- + +## 7. Recommended Actions + +### Immediate Wins (Low Risk, High Impact) + +| Priority | Structure | Change | Est. Savings | +|----------|-----------|--------|--------------| +| 1 | `HashedStorage` | Move `wiped` after `storage` | 7 bytes/instance | +| 2 | `TreeState` | Reorder `engine_kind` to start | 6 bytes/instance | +| 3 | `PayloadProcessor` | Consolidate bools to bitfield | 24 bytes/instance | +| 4 | `HeaderEntry` | Move `hit_count` to end | 7 bytes/entry | + +### Medium Effort (Good ROI) + +| Priority | Structure | Change | Est. Savings | +|----------|-----------|--------|--------------| +| 5 | `MultiProof` | Consolidate mask maps | 1 HashMap alloc | +| 6 | `StorageMultiProof` | Consolidate mask maps | 1 HashMap alloc | +| 7 | `SerialSparseTrie` | Consolidate mask maps | 1 HashMap alloc | +| 8 | `PrewarmContext` | Group related fields | Better cache locality | +| 9 | `AccountProof` | Reorder by size | 4-8 bytes/instance | + +### Strategic (Requires Deeper Changes) + +| Priority | Structure | Change | Impact | +|----------|-----------|--------|--------| +| 10 | `SparseNode` | External hash storage | 8-16 bytes/node | +| 11 | `SparseNode` | Pack flags into u8 | 2 bytes/node | +| 12 | `ExecutionCache` | Unified cache design | Reduced allocations | +| 13 | `PrewarmContext` | Arc-shared immutable data | Reduced cloning | + +### Mask Map Consolidation Pattern + +```rust +// Before: Two separate HashMaps +pub branch_node_hash_masks: HashMap, +pub branch_node_tree_masks: HashMap, + +// After: Single HashMap with tuple value +#[derive(Clone, Copy, Default)] +pub struct TrieMasks { + pub hash: TrieMask, // 2 bytes + pub tree: TrieMask, // 2 bytes +} // Total: 4 bytes, no padding + +pub branch_node_masks: HashMap, +``` + +This pattern applies to: +- `MultiProof` +- `DecodedMultiProof` +- `StorageMultiProof` +- `DecodedStorageMultiProof` +- `SerialSparseTrie` +- `ParallelSparseTrie` + +--- + +## Appendix: Files to Review + +### Critical Files +- `crates/trie/sparse/src/trie.rs` - SparseNode, SerialSparseTrie +- `crates/trie/common/src/proofs.rs` - MultiProof, StorageMultiProof +- `crates/engine/tree/src/tree/payload_processor/mod.rs` - PayloadProcessor +- `crates/engine/tree/src/tree/payload_processor/prewarm.rs` - PrewarmContext + +### High Priority Files +- `crates/engine/tree/src/tree/cached_state.rs` - ExecutionCache +- `crates/engine/tree/src/tree/state.rs` - TreeState +- `crates/trie/common/src/hashed_state.rs` - HashedStorage +- `crates/trie/parallel/src/proof_task.rs` - ProofResultMessage + +### Supporting Files +- `crates/engine/tree/src/tree/invalid_headers.rs` - InvalidHeaderCache +- `crates/engine/tree/src/tree/block_buffer.rs` - BlockBuffer +- `crates/chain-state/src/in_memory.rs` - BlockState diff --git a/reth-persistence-deep-dive.md b/reth-persistence-deep-dive.md new file mode 100644 index 0000000000..d0eaaa93d6 --- /dev/null +++ b/reth-persistence-deep-dive.md @@ -0,0 +1,2138 @@ +# Reth Persistence Layer Deep Dive + +A comprehensive analysis of Reth's storage architecture, database implementation, and data flow patterns. + +## Table of Contents + +1. [Overview](#overview) +2. [High-Level Architecture](#high-level-architecture) +3. [Storage Layer Organization](#storage-layer-organization) +4. [Core Abstractions](#core-abstractions) +5. [Database Tables Schema](#database-tables-schema) +6. [MDBX Implementation](#mdbx-implementation) +7. [Static Files (NippyJar)](#static-files-nippyjar) +8. [Provider Layer](#provider-layer) +9. [Data Flow Examples](#data-flow-examples) +10. [Key Design Patterns](#key-design-patterns) +11. [The Pipeline: Why We Need Persistence](#the-pipeline-why-we-need-persistence) +12. [Engine API: The Bridge Between Consensus and Execution](#engine-api-the-bridge-between-consensus-and-execution) +13. [Reorgs and Unwinds: Detailed Mechanics](#reorgs-and-unwinds-detailed-mechanics) +14. [Cursor Deep Dive: The Intuitive Guide](#cursor-deep-dive-the-intuitive-guide) + +--- + +## Overview + +Reth implements a **hybrid storage architecture** combining: +- **MDBX**: A high-performance key-value store for dynamic, frequently updated data +- **Static Files (NippyJar)**: Immutable columnar storage for historical, append-only data + +This dual approach optimizes for both: +- **Write performance**: MDBX handles state updates efficiently +- **Read performance**: Static files provide fast historical data access via memory-mapped I/O + +--- + +## High-Level Architecture + +``` + +------------------------------------------+ + | Reth Node | + +------------------------------------------+ + | + v ++----------------------------------------------------------------------------------------------------------+ +| PROVIDER LAYER | +| | +| +-------------------+ +---------------------+ +------------------------+ +-----------------+ | +| | BlockchainProvider| | DatabaseProvider | | StaticFileProvider | | StateProvider | | +| | | | | | | | | | +| | - Orchestrates | | - Read/Write ops | | - Immutable data | | - Current state | | +| | - Combines sources| | - Transaction mgmt | | - Historical blocks | | - Historical | | +| +-------------------+ +---------------------+ +------------------------+ +-----------------+ | +| | | | | | ++------------|------------------------|---------------------------|--------------------------|-------------+ + | | | | + v v v v ++----------------------------------------------------------------------------------------------------------+ +| STORAGE API LAYER | +| | +| Traits: BlockReader, BlockWriter, AccountReader, StateReader, TrieReader, HeaderProvider... | +| | ++----------------------------------------------------------------------------------------------------------+ + | | + v v ++---------------------------+ +----------------------------------+ +| DB-API LAYER | | NIPPY-JAR LAYER | +| | | | +| Traits: | | NippyJar | +| - Database | | - Columnar storage | +| - DbTx / DbTxMut | | - Compression support | +| - DbCursorRO / DbCursorRW | | - Memory-mapped reads | +| - Table / DupSort | | | ++---------------------------+ +----------------------------------+ + | | + v v ++---------------------------+ +----------------------------------+ +| MDBX IMPLEMENTATION | | STATIC FILES | +| | | | +| DatabaseEnv | | *.idx - Index files | +| - Tx / Tx | | *.off - Offset files | +| - Cursor | | *.conf - Configuration | +| - libmdbx FFI bindings | | data - Compressed data | ++---------------------------+ +----------------------------------+ + | | + v v ++---------------------------+ +----------------------------------+ +| mdbx.dat (LMDB file) | | headers/, transactions/, | +| | | receipts/ directories | ++---------------------------+ +----------------------------------+ +``` + +--- + +## Storage Layer Organization + +The storage layer is organized into **11 interconnected crates** under `crates/storage/`: + +``` +crates/storage/ +| ++-- db-api/ # Core database abstraction traits (database-agnostic) +| +-- database.rs # Database trait (entry point) +| +-- transaction.rs # DbTx, DbTxMut traits +| +-- cursor.rs # Cursor traits (DbCursorRO, DbCursorRW, DupCursors) +| +-- table.rs # Table trait, Encode/Decode, Compress/Decompress +| +-- tables/mod.rs # All 25+ table definitions +| ++-- db/ # MDBX implementation +| +-- implementation/ +| | +-- mdbx/ +| | +-- mod.rs # DatabaseEnv (main database wrapper) +| | +-- tx.rs # Tx transaction implementation +| | +-- cursor.rs# Cursor implementation +| +-- metrics.rs # Performance metrics +| +-- lockfile.rs # Concurrent access control +| ++-- storage-api/ # High-level provider traits +| +-- block.rs # BlockReader, BlockWriter +| +-- state.rs # StateReader, StateWriter +| +-- trie.rs # TrieReader, TrieWriter +| ++-- provider/ # Provider implementations +| +-- providers/ +| | +-- database/ +| | | +-- provider.rs # DatabaseProvider +| | | +-- blockchain_provider.rs +| | +-- static_file/ +| | +-- manager.rs # Static file management +| +-- traits/ +| +-- full.rs # FullProvider composite trait +| ++-- codecs/ # Serialization (Compact codec) ++-- nippy-jar/ # Immutable columnar storage format ++-- db-models/ # Data model wrapper types ++-- libmdbx-rs/ # Rust bindings for libmdbx C library ++-- errors/ # Error types ++-- zstd-compressors/ # Zstd compression +``` + +--- + +## Core Abstractions + +### 1. Database Trait (`db-api/src/database.rs`) + +The main entry point for database operations: + +```rust +pub trait Database: Send + Sync + Debug { + /// Read-only transaction type + type TX: DbTx + Send + Sync + Debug + 'static; + /// Read-write transaction type + type TXMut: DbTxMut + DbTx + TableImporter + Send + Sync + Debug + 'static; + + /// Create read-only transaction + fn tx(&self) -> Result; + + /// Create read-write transaction + fn tx_mut(&self) -> Result; + + /// Execute closure with read-only transaction + fn view(&self, f: F) -> Result; + + /// Execute closure with read-write transaction + fn update(&self, f: F) -> Result; +} +``` + +### 2. Transaction Traits (`db-api/src/transaction.rs`) + +```rust +/// Read-only transaction +pub trait DbTx: Debug + Send { + type Cursor: DbCursorRO; + type DupCursor: DbDupCursorRO; + + fn get(&self, key: T::Key) -> Result, DatabaseError>; + fn commit(self) -> Result; + fn abort(self); + fn cursor_read(&self) -> Result, DatabaseError>; + fn cursor_dup_read(&self) -> Result, DatabaseError>; + fn entries(&self) -> Result; +} + +/// Read-write transaction +pub trait DbTxMut: Send { + type CursorMut: DbCursorRW; + type DupCursorMut: DbDupCursorRW; + + fn put(&self, key: T::Key, value: T::Value) -> Result<(), DatabaseError>; + fn append(&self, key: T::Key, value: T::Value) -> Result<(), DatabaseError>; + fn delete(&self, key: T::Key, value: Option) -> Result; + fn clear(&self) -> Result<(), DatabaseError>; + fn cursor_write(&self) -> Result, DatabaseError>; +} +``` + +### 3. Cursor Traits (`db-api/src/cursor.rs`) + +```rust +/// Read-only cursor for iteration +pub trait DbCursorRO { + fn first(&mut self) -> PairResult; + fn seek_exact(&mut self, key: T::Key) -> PairResult; + fn seek(&mut self, key: T::Key) -> PairResult; // >= key + fn next(&mut self) -> PairResult; + fn prev(&mut self) -> PairResult; + fn last(&mut self) -> PairResult; + fn current(&mut self) -> PairResult; + fn walk(&mut self, start_key: Option) -> Result, DatabaseError>; + fn walk_range(&mut self, range: impl RangeBounds) -> Result, DatabaseError>; + fn walk_back(&mut self, start_key: Option) -> Result, DatabaseError>; +} + +/// Read-only cursor for duplicate-sorted tables +pub trait DbDupCursorRO { + fn next_dup(&mut self) -> PairResult; + fn next_no_dup(&mut self) -> PairResult; + fn next_dup_val(&mut self) -> ValueOnlyResult; + fn seek_by_key_subkey(&mut self, key: T::Key, subkey: T::SubKey) -> ValueOnlyResult; + fn walk_dup(&mut self, key: Option, subkey: Option) -> Result, DatabaseError>; +} + +/// Read-write cursor +pub trait DbCursorRW { + fn upsert(&mut self, key: T::Key, value: &T::Value) -> Result<(), DatabaseError>; + fn insert(&mut self, key: T::Key, value: &T::Value) -> Result<(), DatabaseError>; + fn append(&mut self, key: T::Key, value: &T::Value) -> Result<(), DatabaseError>; // O(1) for sorted inserts + fn delete_current(&mut self) -> Result<(), DatabaseError>; +} +``` + +### 4. Table Trait (`db-api/src/table.rs`) + +```rust +pub trait Table: Send + Sync + Debug + 'static { + const NAME: &'static str; + const DUPSORT: bool; + + type Key: Encode + Decode + Ord; + type Value: Compress + Decompress; +} + +pub trait DupSort: Table { + type SubKey: Encode + Decode; +} + +/// For encoding keys (must be sortable) +pub trait Encode { + type Encoded: AsRef<[u8]> + Into>; + fn encode(self) -> Self::Encoded; +} + +/// For compressing values +pub trait Compress { + type Compressed: AsRef<[u8]>; + fn compress(self) -> Self::Compressed; +} +``` + +--- + +## Database Tables Schema + +### Block Data Tables + +``` ++-------------------------+-------------------+---------------------------+ +| Table | Key | Value | ++-------------------------+-------------------+---------------------------+ +| CanonicalHeaders | BlockNumber | HeaderHash (B256) | +| Headers | BlockNumber | Header | +| HeaderNumbers | BlockHash (B256) | BlockNumber | +| BlockBodyIndices | BlockNumber | StoredBlockBodyIndices | +| BlockOmmers | BlockNumber | StoredBlockOmmers | +| BlockWithdrawals | BlockNumber | StoredBlockWithdrawals | ++-------------------------+-------------------+---------------------------+ +``` + +### Transaction Tables + +``` ++-------------------------+-------------------+---------------------------+ +| Table | Key | Value | ++-------------------------+-------------------+---------------------------+ +| Transactions | TxNumber | TransactionSigned | +| TransactionHashNumbers | TxHash (B256) | TxNumber | +| TransactionBlocks | TxNumber | BlockNumber | +| TransactionSenders | TxNumber | Address | +| Receipts | TxNumber | Receipt | ++-------------------------+-------------------+---------------------------+ +``` + +### State Tables (Current State) + +``` ++-------------------------+-------------------+---------------------------+-----------+ +| Table | Key | Value | DupSort | ++-------------------------+-------------------+---------------------------+-----------+ +| PlainAccountState | Address | Account | No | +| PlainStorageState | Address | StorageEntry | Yes (B256)| +| Bytecodes | B256 (code hash) | Bytecode | No | ++-------------------------+-------------------+---------------------------+-----------+ +``` + +### Historical State Tables + +``` ++-------------------------+------------------------+---------------------------+-----------+ +| Table | Key | Value | DupSort | ++-------------------------+------------------------+---------------------------+-----------+ +| AccountsHistory | ShardedKey
| BlockNumberList | No | +| StoragesHistory | StorageShardedKey | BlockNumberList | No | +| AccountChangeSets | BlockNumber | AccountBeforeTx | Yes (Addr)| +| StorageChangeSets | BlockNumberAddress | StorageEntry | Yes (B256)| ++-------------------------+------------------------+---------------------------+-----------+ +``` + +### Trie Tables (Merkle Patricia Trie) + +``` ++-------------------------+------------------------+---------------------------+-----------+ +| Table | Key | Value | DupSort | ++-------------------------+------------------------+---------------------------+-----------+ +| HashedAccounts | B256 (keccak addr) | Account | No | +| HashedStorages | B256 (keccak addr) | StorageEntry | Yes (B256)| +| AccountsTrie | StoredNibbles | BranchNodeCompact | No | +| StoragesTrie | B256 | StorageTrieEntry | Yes | ++-------------------------+------------------------+---------------------------+-----------+ +``` + +### Metadata Tables + +``` ++-------------------------+-------------------+---------------------------+ +| Table | Key | Value | ++-------------------------+-------------------+---------------------------+ +| StageCheckpoints | StageId (String) | StageCheckpoint | +| PruneCheckpoints | PruneSegment | PruneCheckpoint | +| VersionHistory | u64 (timestamp) | ClientVersion | +| ChainState | ChainStateKey | BlockNumber | +| Metadata | String | Vec | ++-------------------------+-------------------+---------------------------+ +``` + +--- + +## MDBX Implementation + +### DatabaseEnv (`db/src/implementation/mdbx/mod.rs`) + +```rust +pub struct DatabaseEnv { + /// Underlying libmdbx environment + inner: Environment, + + /// Cached database handles (DBI) for performance + dbis: Arc>, + + /// Optional metrics collection + metrics: Option>, + + /// File lock for read-write access + _lock_file: Option, +} + +impl Database for DatabaseEnv { + type TX = Tx; // Read-only transaction + type TXMut = Tx; // Read-write transaction + + fn tx(&self) -> Result { + Tx::new( + self.inner.begin_ro_txn()?, + self.dbis.clone(), + self.metrics.clone(), + ) + } + + fn tx_mut(&self) -> Result { + Tx::new( + self.inner.begin_rw_txn()?, + self.dbis.clone(), + self.metrics.clone(), + ) + } +} +``` + +### Transaction Flow + +``` ++-------------------+ +| DatabaseEnv::open | ++-------------------+ + | + v ++-------------------+ +-------------------+ +| db.tx() | | db.tx_mut() | ++-------------------+ +-------------------+ + | | + v v ++-------------------+ +-------------------+ +| Tx | | Tx | +| (read-only) | | (read-write) | ++-------------------+ +-------------------+ + | | + v v ++---------------------------------------------------+ +| Operations | +| | +| tx.get::(key) | +| tx.cursor_read::
() | +| tx.put::
(key, value) (RW only) | +| tx.delete::
(key, value) (RW only) | +| tx.cursor_write::
() (RW only) | ++---------------------------------------------------+ + | + v ++-------------------+ +| tx.commit() | <-- Atomic commit ++-------------------+ +``` + +### Key Configuration + +```rust +pub struct DatabaseArguments { + geometry: Geometry>, // Default: 0..8TB, 4GB growth steps + log_level: Option, + max_read_transaction_duration: Option, + exclusive: Option, + max_readers: Option, // Default: 32,000 (max: 32,767) + sync_mode: SyncMode, // Durable or SafeNoSync +} +``` + +--- + +## Static Files (NippyJar) + +### Overview + +NippyJar provides **immutable, columnar storage** for historical data that never changes (finalized blocks, transactions, receipts). + +``` ++----------------------------------+ +| NippyJar | ++----------------------------------+ +| version: usize | +| user_header: H | +| columns: usize | +| rows: usize | +| compressor: Option | +| max_row_size: usize | +| path: PathBuf | ++----------------------------------+ + | + | Files on disk: + | + +---> data.jar (compressed columnar data) + +---> data.jar.idx (index file) + +---> data.jar.off (offsets file) + +---> data.jar.conf (configuration) +``` + +### Data Organization + +``` + Column 0 Column 1 Column 2 + (Headers) (Transactions) (Receipts) + +---------+ +---------+ +---------+ + Row 0 | Data | | Data | | Data | + +---------+ +---------+ +---------+ + Row 1 | Data | | Data | | Data | + +---------+ +---------+ +---------+ + Row 2 | Data | | Data | | Data | + +---------+ +---------+ +---------+ + . . . . + . . . . +``` + +### Reading Process + +``` +1. Load offset file (memory-mapped) +2. Calculate offset for (row, column) +3. Read compressed data from data file +4. Decompress and return +``` + +--- + +## Provider Layer + +### DatabaseProvider + +The main provider combining MDBX and static files: + +```rust +pub struct DatabaseProvider { + /// Database transaction (RO or RW) + tx: TX, + + /// Chain specification + chain_spec: Arc, + + /// Static file provider for historical data + static_file_provider: StaticFileProvider, + + /// Prune modes configuration + prune_modes: PruneModes, +} +``` + +### Provider Trait Hierarchy + +``` + FullProvider + | + +--------------------+--------------------+ + | | | | | +BlockReader State Header Trie Receipts + | Provider Provider Provider Provider + | | | | | + +--------------------+--------------------+ + | + DatabaseProvider + | + +------------+------------+ + | | + MDBX StaticFiles + (current state) (historical data) +``` + +--- + +## Data Flow Examples + +### Reading a Block + +``` +Application + | + v ++-----------------+ +| block(number)? | ++-----------------+ + | + v ++-----------------+ +| Check if block | +| is in static |---> Yes ---> Read from NippyJar +| files | (memory-mapped) ++-----------------+ + | + No + v ++-----------------+ +| Open DB read TX | +| tx = db.tx() | ++-----------------+ + | + v ++-----------------+ +| Create cursors | +| for each table | ++-----------------+ + | + +---> Headers: cursor.seek_exact(number) + +---> BlockBodyIndices: cursor.seek_exact(number) + +---> Transactions: cursor.walk_range(tx_start..tx_end) + +---> Receipts: cursor.walk_range(tx_start..tx_end) + | + v ++-----------------+ +| Decompress data | +| Assemble Block | ++-----------------+ + | + v ++-----------------+ +| tx.commit() | +| (free pages) | ++-----------------+ + | + v + Return Block +``` + +### Writing a Block + +``` +Application + | + v ++-----------------+ +| write_block() | ++-----------------+ + | + v ++-----------------+ +| Open write TX | +| tx = db.tx_mut()| ++-----------------+ + | + v ++-----------------+ +| Create write | +| cursors | ++-----------------+ + | + +---> Headers: cursor.append(number, header.compress()) + +---> BlockBodyIndices: cursor.append(number, indices.compress()) + +---> Transactions: for each tx -> cursor.append(tx_num, tx.compress()) + +---> Receipts: for each receipt -> cursor.append(tx_num, receipt.compress()) + +---> PlainAccountState: cursor.upsert(addr, account.compress()) + +---> PlainStorageState: cursor.upsert(addr, entry.compress()) + +---> AccountChangeSets: cursor.append_dup(block_num, change) + +---> StorageChangeSets: cursor.append_dup(key, change) + | + v ++-----------------+ +| tx.commit() | +| (atomic write) | ++-----------------+ +``` + +### Historical State Lookup + +``` +Query: account_at(address, block_number) + | + v ++---------------------------+ +| Check current state | +| PlainAccountState[address]| ++---------------------------+ + | + v ++---------------------------+ +| Find relevant shard | +| AccountsHistory.seek( | +| ShardedKey(addr, block) | +| ) | ++---------------------------+ + | + v ++---------------------------+ +| Get block list where | +| account changed | +| BlockNumberList | ++---------------------------+ + | + v ++---------------------------+ +| Find latest change | +| before target block | ++---------------------------+ + | + v ++---------------------------+ +| Read from changeset | +| AccountChangeSets[block] | +| .seek_by_key_subkey( | +| block, address | +| ) | ++---------------------------+ + | + v + Return historical Account +``` + +--- + +## Key Design Patterns + +### 1. Trait-Based Abstraction + +All database operations are defined through traits, enabling: +- Database-agnostic code +- Easy testing with mock implementations +- Potential for alternative backends (RocksDB, etc.) + +### 2. Transaction Semantics + +- All operations occur within transactions +- Transactions are atomic (all-or-nothing commit) +- RAII-based cleanup (drop aborts uncommitted transactions) +- Two types: `DbTx` (read-only) and `DbTxMut` (read-write) + +### 3. Cursor Pattern + +Efficient iteration and range queries: +- Pre-sorted key access +- O(1) append for sequential inserts +- Walker/ReverseWalker for convenient iteration +- DupSort support for multi-value keys + +### 4. Serialization Pipeline + +``` +Value --> Compress --> Encode --> Store +Store --> Decode --> Decompress --> Value +``` + +- Keys use `Encode` trait (must be sortable bytes) +- Values use `Compress` trait (Compact codec by default) + +### 5. Sharded Keys for History + +Historical data uses sharded keys to enable efficient range queries: + +``` +ShardedKey
= (Address, max_block_in_shard) + +Example shards for address 0x1234: + (0x1234, 100) -> blocks 0-100 + (0x1234, 200) -> blocks 101-200 + (0x1234, u64::MAX) -> blocks 201-current +``` + +### 6. DupSort Tables + +Multiple values per key, sorted by subkey: +- `PlainStorageState`: Address -> [StorageEntry] (sorted by storage key) +- `AccountChangeSets`: BlockNumber -> [AccountBeforeTx] (sorted by address) + +--- + +## Performance Considerations + +### MDBX Optimizations + +1. **Write-ahead logging disabled**: Uses writemap mode +2. **No readahead**: Better for random access patterns +3. **Page coalescing**: Reduces fragmentation +4. **Large max readers**: 32,000 concurrent readers +5. **Large geometry**: 8TB max size, 4GB growth steps + +### Static File Optimizations + +1. **Memory-mapped I/O**: Zero-copy reads +2. **Columnar storage**: Efficient for specific column access +3. **Compression**: Zstd or LZ4 per column +4. **Immutable**: No locking needed for reads + +### Access Patterns + +| Operation | MDBX | Static Files | +|-----------|------|--------------| +| Random read | Good | Excellent | +| Sequential read | Good | Excellent | +| Write | Excellent | N/A (immutable) | +| Historical query | Good | Excellent | +| Current state | Excellent | N/A | + +--- + +## File Locations + +### MDBX Database +``` +/db/mdbx.dat +/db/mdbx.lck +``` + +### Static Files +``` +/static_files/ + headers/ + segment_0_1000000.jar + segment_0_1000000.jar.idx + segment_0_1000000.jar.off + transactions/ + ... + receipts/ + ... +``` + +--- + +## Quick Reference + +### Essential Types + +| Type | Location | Purpose | +|------|----------|---------| +| `Database` | db-api/src/database.rs | Main entry point trait | +| `DbTx` | db-api/src/transaction.rs | Read-only transaction | +| `DbTxMut` | db-api/src/transaction.rs | Read-write transaction | +| `DbCursorRO` | db-api/src/cursor.rs | Read cursor | +| `DbCursorRW` | db-api/src/cursor.rs | Write cursor | +| `Table` | db-api/src/table.rs | Table definition | +| `DatabaseEnv` | db/src/implementation/mdbx/mod.rs | MDBX wrapper | +| `Tx` | db/src/implementation/mdbx/tx.rs | Transaction impl | +| `DatabaseProvider` | provider/src/providers/database/provider.rs | Main provider | +| `NippyJar` | nippy-jar/src/lib.rs | Static file format | + +### Common Operations + +```rust +// Open database +let db = DatabaseEnv::open(path, DatabaseEnvKind::RW, args)?; +db.create_tables()?; + +// Read operation +let tx = db.tx()?; +let header = tx.get::(block_number)?; +tx.commit()?; + +// Write operation +let tx = db.tx_mut()?; +tx.put::(block_number, header)?; +tx.commit()?; + +// Cursor iteration +let tx = db.tx()?; +let mut cursor = tx.cursor_read::()?; +for result in cursor.walk(Some(start_block))? { + let (block_num, header) = result?; + // process... +} +``` + +--- + +## The Pipeline: Why We Need Persistence + +The pipeline is the heart of reth's synchronization system. Understanding **why persistence is critical** requires understanding how the pipeline works. + +### What is the Pipeline? + +The pipeline is a **staged synchronization architecture** that downloads and processes blockchain data in sequential, well-defined stages. Each stage handles a specific responsibility: + +``` ++--------------------------------------------------------------------------------+ +| RETH PIPELINE | ++--------------------------------------------------------------------------------+ +| | +| +--------+ +--------+ +-----------+ +--------+ +--------+ | +| |Headers |===>| Bodies |===>| Execution |===>| Hashing|===>| Merkle |===>... | +| | Stage | | Stage | | Stage | | Stages | | Stage | | +| +--------+ +--------+ +-----------+ +--------+ +--------+ | +| | | | | | | +| v v v v v | +| +--------+ +--------+ +-----------+ +--------+ +--------+ | +| | Save | | Save | | Save | | Save | | Save | | +| |Checkpoint |Checkpoint | Checkpoint| |Checkpoint |Checkpoint | +| +--------+ +--------+ +-----------+ +--------+ +--------+ | +| | ++--------------------------------------------------------------------------------+ + | + v + +------------------------+ + | MDBX Database | + | (Atomic Persistence) | + +------------------------+ +``` + +### The Pipeline Stages (In Order) + +```rust +pub enum StageId { + Headers, // 1. Download block headers from network + Bodies, // 2. Download block bodies (transactions) + SenderRecovery, // 3. Recover sender addresses from signatures + Execution, // 4. Execute transactions, update state + AccountHashing, // 5. Hash accounts for state root calculation + StorageHashing, // 6. Hash storage for state root calculation + MerkleExecute, // 7. Calculate Merkle state root + TransactionLookup, // 8. Build tx hash -> block number index + IndexAccountHistory, // 9. Index account change history + IndexStorageHistory, // 10. Index storage change history + Prune, // 11. Prune old data per configuration + Finish, // 12. Mark sync complete +} +``` + +### Why Persistence is Critical + +#### 1. **Crash Recovery & Resumability** + +``` +Scenario: Node crashes during Execution stage at block 15,000,000 + +WITHOUT Persistence: + - Restart from block 0 + - Re-download 15M headers (days of work) + - Re-download 15M bodies + - Re-execute 15M blocks (weeks of work) + - Total: WEEKS of wasted work + +WITH Persistence (Checkpoints): + - Read checkpoint: Execution stage at block 14,999,500 + - Resume execution from block 14,999,501 + - Total: Minutes of work lost +``` + +#### 2. **Atomic Stage Commits** + +Each stage commits atomically to the database: + +```rust +// Inside execute_stage_to_completion() +match self.stage(stage_index).execute(&provider_rw, exec_input) { + Ok(out @ ExecOutput { checkpoint, done }) => { + // 1. Save checkpoint to database + provider_rw.save_stage_checkpoint(stage_id, checkpoint)?; + + // 2. Commit ALL changes atomically + provider_rw.commit()?; // <-- ATOMIC: All or nothing + + // 3. Post-commit hooks (e.g., notify ExEx) + self.stage(stage_index).post_execute_commit()?; + } + Err(err) => { + // Transaction automatically rolled back (RAII) + // No partial state corruption + } +} +``` + +#### 3. **Unwind Support (Reorg Handling)** + +When a chain reorganization is detected: + +``` +Block 100 (canonical) --> Block 101A (our chain) + \--> Block 101B (new canonical, more work) + +Pipeline must: +1. UNWIND stages in REVERSE order back to block 100 +2. RE-EXECUTE forward on new canonical chain + +This requires: +- ChangeSets to reverse state changes +- Atomic unwinding (all or nothing) +- Checkpoint updates at each step +``` + +```rust +// Unwind process (stages/api/src/pipeline/mod.rs:300-411) +pub fn unwind(&mut self, to: BlockNumber, bad_block: Option) { + // Unwind stages in REVERSE order + let unwind_pipeline = self.stages.iter_mut().rev(); + + for stage in unwind_pipeline { + while checkpoint.block_number > to { + let output = stage.unwind(&provider_rw, input); + + // Save new (lower) checkpoint + provider_rw.save_stage_checkpoint(stage_id, checkpoint)?; + provider_rw.commit()?; + } + } +} +``` + +### The Stage Trait: Execute and Unwind + +Every stage must implement both forward execution AND backward unwinding: + +```rust +pub trait Stage: Send { + fn id(&self) -> StageId; + + /// Execute stage forward (sync new blocks) + fn execute( + &mut self, + provider: &Provider, + input: ExecInput, + ) -> Result; + + /// Unwind stage backward (handle reorg) + fn unwind( + &mut self, + provider: &Provider, + input: UnwindInput, + ) -> Result; +} +``` + +### Execution Stage Deep Dive + +The Execution stage is the most complex - it runs the EVM: + +```rust +// stages/stages/src/stages/execution.rs + +fn execute(&mut self, provider: &Provider, input: ExecInput) -> Result { + let start_block = input.next_block(); + let max_block = input.target(); + + // Create EVM executor with current state + let db = StateProviderDatabase(LatestStateProviderRef::new(provider)); + let mut executor = self.evm_config.batch_executor(db); + + for block_number in start_block..=max_block { + // 1. Fetch block from database + let block = provider.recovered_block(block_number)?; + + // 2. Execute all transactions in EVM + let result = executor.execute_one(&block)?; + + // 3. Validate post-execution (consensus rules) + self.consensus.validate_block_post_execution(&block, &result)?; + + // 4. Check batch thresholds + if self.thresholds.is_end_of_batch(...) { + break; // Commit now, continue in next iteration + } + } + + // 5. Write state changes to database + let state = ExecutionOutcome::from_blocks(start_block, executor.into_state()); + provider.write_state(&state, OriginalValuesKnown::Yes)?; + + Ok(ExecOutput { checkpoint, done }) +} +``` + +### Tables Written by Each Stage + +| Stage | Tables Written | +|-------|----------------| +| Headers | `CanonicalHeaders`, `Headers`, `HeaderNumbers` (+ Static Files) | +| Bodies | `BlockBodyIndices`, `Transactions` (+ Static Files) | +| SenderRecovery | `TransactionSenders` | +| Execution | `PlainAccountState`, `PlainStorageState`, `Bytecodes`, `AccountChangeSets`, `StorageChangeSets`, `Receipts` | +| AccountHashing | `HashedAccounts` | +| StorageHashing | `HashedStorages` | +| MerkleExecute | `AccountsTrie`, `StoragesTrie` | +| TransactionLookup | `TransactionHashNumbers`, `TransactionBlocks` | +| IndexAccountHistory | `AccountsHistory` | +| IndexStorageHistory | `StoragesHistory` | + +### Batching and Thresholds + +The Execution stage uses thresholds to batch commits: + +```rust +pub struct ExecutionStageThresholds { + pub max_blocks: Option, // Max blocks before commit + pub max_changes: Option, // Max state changes before commit + pub max_cumulative_gas: Option, // Max gas before commit + pub max_duration: Option, // Max time before commit +} +``` + +**Why batch?** +- Fewer commits = less I/O overhead +- But larger batches = more work lost on crash +- Thresholds balance these trade-offs + +### Data Flow: Pipeline to Persistence + +``` + PIPELINE EXECUTION + | + +---------------------|---------------------+ + | v | + | +----------------------------------------+ + | | Stage Execution | + | | | + | | 1. Read from DB (current state) | + | | 2. Process data (download/execute) | + | | 3. Write to DB (new state + changes) | + | | 4. Save checkpoint | + | +----------------------------------------+ + | | + | v + | +----------------------------------------+ + | | provider_rw.commit() | + | | | + | | - MDBX transaction commit | + | | - All table writes become durable | + | | - Checkpoint saved atomically | + | +----------------------------------------+ + | | + +---------------------|---------------------+ + v + +------------------------+ + | move_to_static_ | + | files() | + +------------------------+ + | + +------------+------------+ + | | + v v + +----------------+ +----------------+ + | StaticFile | | Pruner | + | Producer | | | + | (copy to jars) | | (delete from | + +----------------+ | MDBX) | + +----------------+ +``` + +### Things to Be Aware Of + +#### 1. **Checkpoint Consistency** + +The checkpoint represents the **last fully completed block**. A stage's checkpoint means: +- All blocks up to and including that number are processed +- The stage should resume from checkpoint + 1 + +```rust +impl ExecInput { + pub fn next_block(&self) -> BlockNumber { + self.checkpoint().block_number + 1 // Resume from next block + } +} +``` + +#### 2. **Static File Consistency** + +Data can exist in both MDBX and static files temporarily. The pipeline must: +1. Copy data from MDBX to static files +2. Prune data from MDBX +3. Handle crashes between these steps + +```rust +// Consistency check in Execution stage +fn ensure_consistency(&self, provider: &Provider, checkpoint: u64) { + // Compare database state vs static file state + let db_receipt_num = provider.block_body_indices(checkpoint)?.next_tx_num(); + let static_file_receipt_num = static_file_provider + .get_highest_static_file_tx(StaticFileSegment::Receipts); + + // If mismatch, fix it before proceeding + match static_file_block_num.cmp(&checkpoint) { + Ordering::Greater => { + // Static file ahead: prune static file + static_file_producer.prune_receipts(...)?; + } + Ordering::Less => { + // Database ahead: error, need to unwind + return Err(StageError::MissingStaticFileData { ... }); + } + } +} +``` + +#### 3. **Unwind Limitations** + +You cannot unwind past pruned data: + +```rust +// Before unwinding, verify target is not pruned +prune_modes.ensure_unwind_target_unpruned(latest_block, to, &checkpoints)?; +``` + +#### 4. **Stage Dependencies** + +Stages have implicit dependencies: +- Bodies stage needs Headers stage complete +- Execution needs Bodies complete +- Hashing needs Execution complete +- Merkle needs Hashing complete + +The pipeline enforces this by running stages in order. + +#### 5. **Detached Head Handling** + +If our chain tip doesn't connect to the network's chain: + +```rust +StageError::DetachedHead { local_head, header, error } +``` + +The pipeline will attempt to unwind and find a common ancestor. Multiple attempts are tracked to prevent infinite loops. + +### Summary: Why Persistence Matters + +| Requirement | How Persistence Helps | +|-------------|----------------------| +| **Crash Recovery** | Checkpoints allow resuming from last completed work | +| **Atomicity** | MDBX transactions ensure all-or-nothing commits | +| **Reorg Handling** | ChangeSets enable reversing state changes | +| **Consistency** | Static file consistency checks prevent corruption | +| **Progress Tracking** | Stage checkpoints show sync progress | +| **Historical Queries** | History tables enable state at any block | + +--- + +## Engine API: The Bridge Between Consensus and Execution + +The Engine API is how the **Consensus Layer (CL)** communicates with the **Execution Layer (EL)**. Post-merge Ethereum separates block proposal (CL) from block execution (EL), and the Engine API is the bridge. + +### Architecture Overview + +``` ++-------------------+ +-------------------+ +| Consensus | | Execution | +| Layer | | Layer | +| (Beacon Node) | | (Reth) | ++-------------------+ +-------------------+ + | | + | engine_newPayloadV3(payload) | + |-------------------------------------------------->| + | PayloadStatus | + |<--------------------------------------------------| + | | + | engine_forkchoiceUpdatedV3(state, attrs?) | + |-------------------------------------------------->| + | PayloadStatus + PayloadId? | + |<--------------------------------------------------| + | | + | engine_getPayloadV3(payload_id) | + |-------------------------------------------------->| + | ExecutionPayload | + |<--------------------------------------------------| + | | ++-------------------+ +-------------------+ +``` + +### Key Engine API Methods + +| Method | Purpose | +|--------|---------| +| `engine_newPayloadVX` | Submit a new block payload for validation & execution | +| `engine_forkchoiceUpdatedVX` | Update the canonical chain head, trigger payload building | +| `engine_getPayloadVX` | Retrieve a built payload for proposal | + +### Core Data Structures + +```rust +// Forkchoice state from CL +pub struct ForkchoiceState { + pub head_block_hash: B256, // Current chain head + pub safe_block_hash: B256, // Safe block (2/3 attestations) + pub finalized_block_hash: B256, // Finalized block (irreversible) +} + +// Response statuses +pub enum PayloadStatusEnum { + Valid, // Block is valid + Invalid { latest_valid_hash }, // Block or ancestor invalid + Syncing, // Still syncing, can't validate yet + Accepted, // Payload accepted (newPayload only) +} +``` + +### Engine Tree: In-Memory Block Forest + +The `EngineApiTreeHandler` manages an in-memory tree of executed blocks: + +``` + Persisted Chain (on disk) + | + Block 100 (canonical) + | + +-----------------+-----------------+ + | | + Block 101A Block 101B + (in-memory) (in-memory) + | | + Block 102A Block 102B + (in-memory) (current head) + | + Block 103A + (in-memory, fork) +``` + +```rust +// crates/engine/tree/src/tree/state.rs +pub struct TreeState { + /// All executed blocks by hash (includes all forks) + blocks_by_hash: HashMap>, + + /// Blocks grouped by number (multiple per number due to forks) + blocks_by_number: BTreeMap>>, + + /// Parent-to-children mapping for traversal + parent_to_child: HashMap>, + + /// Currently tracked canonical head + current_canonical_head: BlockNumHash, +} +``` + +### Forkchoice State Tracker + +Tracks the three types of forkchoice states: + +```rust +// crates/engine/primitives/src/forkchoice.rs +pub struct ForkchoiceStateTracker { + /// Most recent forkchoice (can be invalid!) + latest: Option, + + /// Last forkchoice requiring sync + last_syncing: Option, + + /// Last confirmed valid forkchoice + last_valid: Option, +} + +pub enum ForkchoiceStatus { + Valid, // Forkchoice applied successfully + Invalid, // Head or ancestor is invalid + Syncing, // Missing blocks, need to sync +} +``` + +### Engine API Request Flow + +#### 1. newPayload Processing + +``` +engine_newPayloadV3(ExecutionPayload) + | + v ++------------------------+ +| Convert to SealedBlock | +| Validate block format | ++------------------------+ + | + v ++------------------------+ +| Check if already | +| processed → VALID | ++------------------------+ + | + v ++------------------------+ +------------------------+ +| Parent available? |--No→| Buffer block | +| (in memory or disk) | | Return SYNCING | ++------------------------+ +------------------------+ + | + Yes + v ++------------------------+ +| Get parent state | +| Execute block in EVM | ++------------------------+ + | + +------------+------------+ + | | + Success Failure + | | + v v ++------------------------+ +------------------------+ +| Insert into TreeState | | Mark as invalid | +| Emit event | | Return INVALID | +| Try connect buffered | | + latestValidHash | +| Return VALID | +------------------------+ ++------------------------+ +``` + +#### 2. forkchoiceUpdated Processing + +``` +engine_forkchoiceUpdatedV3(ForkchoiceState, PayloadAttributes?) + | + v ++------------------------+ +| Validate forkchoice | +| - head != 0x00 | +| - No invalid ancestor | +| - Backfill not active | ++------------------------+ + | + v ++------------------------+ +------------------------+ +| Head already canonical?|--Yes→| Process payload attrs | +| (current_head == head) | | Return VALID | ++------------------------+ +------------------------+ + | + No + v ++------------------------+ +| Apply chain update | +| (may trigger reorg) | ++------------------------+ + | + +------------+------------+ + | | + Head found Head missing + | | + v v ++------------------------+ +------------------------+ +| on_new_head() | | Request block download | +| Compute new chain | | Return SYNCING | ++------------------------+ +------------------------+ + | + v ++------------------------+ +| Update canonical state | +| Queue persistence | +| Process payload attrs | +| Return VALID | ++------------------------+ +``` + +--- + +## Reorgs and Unwinds: Detailed Mechanics + +### What Triggers a Reorg? + +A reorg occurs when `forkchoiceUpdated` specifies a head that: +1. Is NOT a direct descendant of the current canonical head +2. Shares a common ancestor with the current chain + +``` +Before reorg: After reorg: + +Block 100 (canonical) Block 100 (canonical) + | | +Block 101 (canonical) Block 101 (orphaned) + | | +Block 102 (canonical, head) Block 102 (orphaned) + | +Block 101B (sidechain) ------> Block 101B (canonical) + | | +Block 102B (sidechain) Block 102B (canonical, new head) +``` + +### Computing the Reorg: on_new_head() + +The `on_new_head` function computes whether we have a simple extension or a reorg: + +```rust +// crates/engine/tree/src/tree/mod.rs:662-751 +fn on_new_head(&self, new_head: B256) -> ProviderResult>> { + let new_head_block = self.state.tree_state.blocks_by_hash.get(&new_head)?; + let mut new_chain = vec![new_head_block.clone()]; + let mut current_hash = new_head_block.parent_hash(); + + // Walk back the new chain collecting blocks + while current_number > current_canonical_number { + let block = self.state.tree_state.executed_block_by_hash(current_hash)?; + current_hash = block.parent_hash(); + new_chain.push(block); + } + + // Check if this is a simple extension + if current_hash == self.current_canonical_head.hash { + new_chain.reverse(); + return Ok(Some(NewCanonicalChain::Commit { new: new_chain })) + } + + // We have a reorg - find the fork point + let mut old_chain = Vec::new(); + let mut old_hash = self.current_canonical_head.hash; + + // Walk back old chain to same height + while current_canonical_number > current_number { + let block = self.canonical_block_by_hash(old_hash)?; + old_hash = block.parent_hash(); + old_chain.push(block); + } + + // Walk both chains until we find common ancestor + while old_hash != current_hash { + // Add block from old chain + old_chain.push(self.canonical_block_by_hash(old_hash)?); + old_hash = old_chain.last().unwrap().parent_hash(); + + // Add block from new chain + new_chain.push(self.state.tree_state.executed_block_by_hash(current_hash)?); + current_hash = new_chain.last().unwrap().parent_hash(); + } + + new_chain.reverse(); + old_chain.reverse(); + + Ok(Some(NewCanonicalChain::Reorg { new: new_chain, old: old_chain })) +} +``` + +### NewCanonicalChain Types + +```rust +// crates/chain-state/src/in_memory.rs:893-907 +pub enum NewCanonicalChain { + /// Simple extension - new blocks append to canonical head + Commit { + new: Vec>, // Blocks to add + }, + + /// Reorg - need to replace old chain with new chain + Reorg { + new: Vec>, // New canonical blocks + old: Vec>, // Orphaned blocks to remove + }, +} +``` + +### Reorg Detection Algorithm (Visual) + +``` + Current State New Head Request + +Canonical: 100 → 101 → 102 FCU: head = 102B + | +Sidechain: 101B → 102B + +Step 1: Walk back new_chain from 102B + new_chain = [102B, 101B] + current_hash = 100 (parent of 101B) + +Step 2: current_hash (100) == canonical_head.hash (102)? NO + +Step 3: Walk back old_chain to same height as new_chain tip + old_chain = [102] + old_hash = 101 + +Step 4: Walk both until hashes match (fork point) + Iteration 1: + old_chain += [101], old_hash = 100 + new_chain already at 101B's parent = 100 + current_hash = 100, old_hash = 100 → MATCH! + +Result: Reorg { + new: [101B, 102B], // Blocks to apply + old: [101, 102], // Blocks to orphan +} +``` + +### Pipeline Unwind Process + +When reorg affects persisted blocks, the pipeline must unwind: + +```rust +// crates/stages/api/src/pipeline/mod.rs +pub fn unwind(&mut self, to: BlockNumber, bad_block: Option) { + // Unwind stages in REVERSE order + let unwind_pipeline = self.stages.iter_mut().rev(); + + for stage in unwind_pipeline { + let checkpoint = provider.get_stage_checkpoint(stage.id())?; + + // Skip if already at or below target + if checkpoint.block_number <= to { + continue; + } + + // Unwind in batches + while checkpoint.block_number > to { + let input = UnwindInput { + checkpoint, + unwind_to: to, + bad_block, + }; + + let output = stage.unwind(&provider_rw, input)?; + + // Save new (lower) checkpoint + provider_rw.save_stage_checkpoint(stage.id(), output.checkpoint)?; + provider_rw.commit()?; // Atomic + + checkpoint = output.checkpoint; + } + } +} +``` + +### Execution Stage Unwind + +The most complex unwind - must reverse state changes: + +```rust +// Simplified from stages/stages/src/stages/execution.rs +fn unwind(&mut self, provider: &Provider, input: UnwindInput) -> Result { + let target = input.unwind_to; + + // 1. Get range of blocks to unwind + let range = (target + 1)..=input.checkpoint.block_number; + + // 2. Read change sets for these blocks + let account_changes = provider.account_changesets(range.clone())?; + let storage_changes = provider.storage_changesets(range)?; + + // 3. Reverse the changes (restore old values) + for (block_num, changes) in account_changes.into_iter().rev() { + for AccountBeforeTx { address, info } in changes { + match info { + Some(old_account) => { + // Restore previous account state + provider.put::(address, old_account)?; + } + None => { + // Account didn't exist before - delete it + provider.delete::(address, None)?; + } + } + } + } + + // 4. Similar for storage changes... + + // 5. Delete receipts for unwound blocks + provider.unwind_table_by_num::(target)?; + + // 6. Update checkpoint + Ok(UnwindOutput { checkpoint: StageCheckpoint::new(target) }) +} +``` + +### Why ChangeSets Are Critical + +ChangeSets record the **before** values for every state change: + +``` +Block 102 Execution: + - Account 0x1234: balance 100 → 50 + - Storage 0x1234[slot5]: 0 → 42 + +ChangeSet Stored: + AccountChangeSet[102] = [(0x1234, Account{balance: 100})] // Old value + StorageChangeSet[102] = [(0x1234, slot5, 0)] // Old value + +To Unwind Block 102: + 1. Read ChangeSet[102] + 2. Restore 0x1234.balance = 100 + 3. Restore 0x1234[slot5] = 0 + 4. Delete receipt for block 102 +``` + +### Dual Sync Modes + +The Engine API uses two sync strategies: + +``` ++-------------------+ +-------------------+ +| Live Sync | | Backfill Sync | +| (BlockDownloader) | | (Pipeline) | ++-------------------+ +-------------------+ +| | | | +| For small gaps | | For large gaps | +| (< EPOCH_SLOTS) | | (≥ EPOCH_SLOTS) | +| | | | +| Download + execute| | Full staged sync | +| blocks one by one | | (parallel stages) | +| | | | +| Quick response to | | Efficient for | +| CL forkchoice | | catching up | ++-------------------+ +-------------------+ + +Threshold: MIN_BLOCKS_FOR_PIPELINE_RUN = EPOCH_SLOTS (32 blocks) +``` + +### Persistence Service: Background Writing + +The Engine API doesn't block on persistence: + +```rust +// crates/engine/tree/src/persistence.rs +pub struct PersistenceService { + provider: ProviderFactory, + incoming: Receiver>, + pruner: PrunerWithFactory<...>, +} + +pub enum PersistenceAction { + SaveBlocks(Vec>, oneshot::Sender<...>), + RemoveBlocksAbove(u64, oneshot::Sender<...>), // For reorgs + SaveFinalizedBlock(u64), + SaveSafeBlock(u64), +} + +// Flow: +// 1. Engine handler executes blocks in-memory +// 2. Sends PersistenceAction::SaveBlocks to background service +// 3. Handler continues processing new requests +// 4. Background service writes to DB, triggers pruner +``` + +``` ++------------------+ +------------------+ +| Engine Handler | | Persistence Svc | +| (main thread) | | (background) | ++------------------+ +------------------+ + | | + | SaveBlocks([101, 102]) | + |----------------------------->| + | | + | (continues processing) | Write to static files + | newPayload(103) | Write to MDBX + | | Run pruner + | | + |<----- Done notification -----| + | | +``` + +### Block Buffer: Handling Orphans + +When a block's parent is missing, it's buffered: + +```rust +// crates/engine/tree/src/tree/block_buffer.rs +pub struct BlockBuffer { + blocks: HashMap>, + parent_to_children: HashMap>, + limit: u32, // Max buffered blocks +} + +// When parent arrives: +fn try_connect_buffered_blocks(&mut self, parent_hash: B256) { + if let Some(children) = self.parent_to_children.get(&parent_hash) { + for child_hash in children.clone() { + let child = self.blocks.remove(&child_hash)?; + // Execute child block + self.insert_payload(child)?; + // Recursively connect grandchildren + self.try_connect_buffered_blocks(child_hash); + } + } +} +``` + +### State Provider Building + +Creating state for block execution with in-memory overlay: + +```rust +// crates/engine/tree/src/tree/mod.rs +pub struct StateProviderBuilder { + provider_factory: P, + historical: B256, // Base block on disk + overlay: Option>>, // In-memory blocks +} + +impl StateProviderBuilder { + pub fn build(&self) -> ProviderResult { + // 1. Get historical state from disk + let provider = self.provider_factory.state_by_block_hash(self.historical)?; + + // 2. Apply in-memory block changes on top + if let Some(overlay) = self.overlay.clone() { + Ok(Box::new(MemoryOverlayStateProvider::new(provider, overlay))) + } else { + Ok(provider) + } + } +} +``` + +### Complete Data Flow: forkchoiceUpdated with Reorg + +``` +[Consensus Layer] + | + v +engine_forkchoiceUpdatedV3(head=102B, safe=100, finalized=99) + | + v ++-------------------------------------------+ +| EngineApiTreeHandler::on_forkchoice_updated| ++-------------------------------------------+ + | + v ++-------------------------------------------+ +| validate_forkchoice_state() | +| - head != 0x00 ✓ | +| - No invalid ancestors ✓ | +| - Backfill idle ✓ | ++-------------------------------------------+ + | + v ++-------------------------------------------+ +| head (102B) != current_head (102) → reorg | +| apply_chain_update() | ++-------------------------------------------+ + | + v ++-------------------------------------------+ +| on_new_head(102B) | +| Returns: Reorg { | +| new: [101B, 102B], | +| old: [101, 102] | +| } | ++-------------------------------------------+ + | + v ++-------------------------------------------+ +| on_canonical_chain_update() | +| 1. Update TreeState.canonical_head | +| 2. Update CanonicalInMemoryState | +| 3. Emit CanonicalChainCommitted event | +| 4. Send to PersistenceService | ++-------------------------------------------+ + | + +----------------+ + | | + v v ++----------------+ +-------------------+ +| Return VALID | | Background: | +| to CL | | - Save 101B, 102B | ++----------------+ | - Prune 101, 102 | + +-------------------+ + | + v +[Notification to RPC/ExEx listeners] +CanonStateNotification::Reorg { new, old } +``` + +### Things to Be Aware Of (Engine API) + +#### 1. **In-Memory vs Persisted Reorgs** + +- **In-memory reorg**: Just update `TreeState.canonical_head` +- **Persisted reorg**: Requires pipeline unwind + re-execute + +```rust +// Determine if persistence is needed +if reorg_affects_persisted_blocks { + // Must unwind persisted state + pipeline.unwind(fork_point)?; + // Then re-execute on new chain + for block in new_chain { + execute_and_persist(block)?; + } +} else { + // Just update in-memory canonical pointer + tree_state.set_canonical_head(new_head); +} +``` + +#### 2. **Finalization Cleans Sidechains** + +When a block is finalized, all competing chains below it are pruned: + +```rust +// crates/engine/tree/src/tree/state.rs:197-247 +fn prune_finalized_sidechains(&mut self, finalized: BlockNumHash) { + // Remove all blocks below finalized number + // Keep only the finalized block at its height + // BFS remove all children of non-finalized blocks +} +``` + +#### 3. **Invalid Block Caching** + +Invalid blocks are cached to prevent reprocessing: + +```rust +pub struct InvalidHeaderCache { + invalid: HashMap, +} + +// Check before processing +if invalid_cache.contains(block_hash) { + return PayloadStatus::Invalid { + latest_valid_hash: invalid_cache.get_latest_valid(block_hash) + } +} +``` + +#### 4. **Sync Target Tracking** + +The forkchoice tracker remembers where we need to sync to: + +```rust +// If we return SYNCING, we track the target +if status.is_syncing() { + self.last_syncing = Some(forkchoice_state); +} + +// Later, download can query sync target +let target = forkchoice_tracker.sync_target(); +``` + +--- + +## Cursor Deep Dive: The Intuitive Guide + +Cursors are fundamental to how reth interacts with MDBX. This section explains cursors at both intuitive and technical levels. + +### The Phonebook Analogy + +Think of a cursor like **your finger on a sorted phonebook**: + +``` +Database Table (sorted by key): +┌─────────────────────────────────┐ +│ Adams, John → 555-0101 │ +│ Brown, Alice → 555-0102 │ ← cursor position (your finger) +│ Clark, Bob → 555-0103 │ +│ Davis, Eve → 555-0104 │ +│ ... │ +└─────────────────────────────────┘ +``` + +**Basic cursor operations map to finger movements:** + +| Operation | What it does | Phonebook analogy | +|-----------|--------------|-------------------| +| `seek("Clark")` | Jump to >= "Clark" | Flip to the C section | +| `seek_exact("Clark")` | Jump to exactly "Clark" | Find "Clark, Bob" exactly | +| `next()` | Move to next entry | Slide finger down one row | +| `prev()` | Move to previous entry | Slide finger up one row | +| `first()` | Jump to beginning | Go to page 1 | +| `last()` | Jump to end | Go to last page | +| `current()` | Read without moving | Read where finger is | + +### Why Cursors Instead of get()? + +You might wonder: why not just use `tx.get(key)` for everything? + +``` +Scenario: Read blocks 1000 to 1010 + +Using get() - 11 separate lookups: + get(1000) → B+ tree traversal → find leaf → read + get(1001) → B+ tree traversal → find leaf → read (same leaf!) + get(1002) → B+ tree traversal → find leaf → read (same leaf!) + ... + Total: 11 × O(log n) tree traversals + +Using cursor - 1 lookup + 10 nexts: + seek(1000) → B+ tree traversal → find leaf → read + next() → already on leaf, move to next slot → read + next() → already on leaf, move to next slot → read + ... + Total: 1 × O(log n) + 10 × O(1) +``` + +**Cursors exploit data locality.** Once you find a leaf node, sequential reads are nearly free. + +### DupSort Tables: Multiple Values Per Key + +Some tables store **multiple values per key**. Think of it as: + +``` +Regular Table (PlainAccountState): + Address A → Account{balance: 100} + Address B → Account{balance: 200} + +DupSort Table (PlainStorageState): + Address A → [ + StorageEntry{slot: 0, value: 42}, + StorageEntry{slot: 1, value: 100}, + StorageEntry{slot: 5, value: 0} + ] + Address B → [ + StorageEntry{slot: 0, value: 7} + ] +``` + +**Phonebook analogy for DupSort:** +Like a person with multiple phone numbers, sorted: +``` +Adams, John: + - Home: 555-0101 + - Mobile: 555-0102 + - Work: 555-0103 +Brown, Alice: + - Mobile: 555-0201 +``` + +**DupSort cursor operations:** + +| Operation | What it does | +|-----------|--------------| +| `next_dup()` | Next value for same key (John's next number) | +| `next_no_dup()` | Skip to next key entirely (jump to Alice) | +| `seek_by_key_subkey(addr, slot)` | Find specific storage slot for address | +| `walk_dup(key, subkey)` | Iterate all slots for one address | + +### Trie Cursors: Walking a Flattened Tree + +The Merkle Patricia Trie is stored **flattened** in the database. Trie cursors help traverse it: + +``` +Conceptual Tree: Flattened in DB (sorted by path): + root ┌──────────────────────────────────┐ + / \ │ "" → BranchNode{...} │ + 0x1 0x2 │ "1" → BranchNode{...} │ + / \ │ "1a" → BranchNode{hash: 0x..} │ + 0x1a 0x2b │ "2" → BranchNode{...} │ + │ "2b" → BranchNode{hash: 0x..} │ + └──────────────────────────────────┘ +``` + +**Trie cursor operations:** + +```rust +// Find node at exact path +cursor.seek_exact(Nibbles::from_hex("1a"))?; + +// Find node at or after path +cursor.seek(Nibbles::from_hex("1"))?; // Finds "1" or "1a" or "2"... + +// Iterate through trie in order +while let Some((path, node)) = cursor.next()? { + // Process node +} +``` + +### The DupSort Upsert Gotcha + +**CRITICAL:** For DupSort tables, `upsert()` doesn't replace - it appends! + +```rust +// WRONG - Creates duplicates! +cursor.upsert(address, StorageEntry { slot: 5, value: 100 })?; +cursor.upsert(address, StorageEntry { slot: 5, value: 200 })?; + +// Result: TWO entries for slot 5! +// Address A → [slot:5→100, slot:5→200] // CORRUPTED + +// CORRECT - Delete first, then insert +cursor.seek_by_key_subkey(address, slot)?; +cursor.delete_current()?; +cursor.upsert(address, StorageEntry { slot: 5, value: 200 })?; + +// Result: Single correct entry +// Address A → [slot:5→200] +``` + +Why? Because DupSort keys aren't unique - the database can't know which duplicate you want to update. + +### Walker Patterns + +Walkers wrap cursors in Rust iterators: + +```rust +// Manual cursor loop +let mut cursor = tx.cursor_read::()?; +cursor.seek(1000)?; +loop { + match cursor.next()? { + Some((num, header)) if num <= 2000 => { /* process */ } + _ => break, + } +} + +// Using Walker (cleaner) +let mut cursor = tx.cursor_read::()?; +for result in cursor.walk_range(1000..=2000)? { + let (num, header) = result?; + // process +} + +// Using DupWalker (for DupSort tables) +let mut cursor = tx.cursor_dup_read::()?; +for result in cursor.walk_dup(Some(address), None)? { + let (addr, entry) = result?; + // process all storage slots for address +} +``` + +### Performance: append() vs insert() + +When writing sorted data, `append()` is O(1) vs `insert()`'s O(log n): + +```rust +// SLOW - Each insert searches the B+ tree +for i in 0..1_000_000 { + cursor.insert(i, value)?; // O(log n) each +} + +// FAST - Append knows data is sorted, skips search +for i in 0..1_000_000 { + cursor.append(i, value)?; // O(1) each +} +``` + +**append() requires:** Data must be inserted in sorted order. If you append out of order, you'll corrupt the database! + +### Real-World Usage Patterns + +#### Pattern 1: Reading a Range (Execution Stage) + +```rust +// Read all transactions in a block +let indices = provider.block_body_indices(block_num)?; +let mut cursor = tx.cursor_read::()?; + +for result in cursor.walk_range(indices.first_tx_num..indices.first_tx_num + indices.tx_count)? { + let (tx_num, transaction) = result?; + // Execute transaction +} +``` + +#### Pattern 2: Writing State Changes + +```rust +// Write account updates from execution +let mut cursor = tx.cursor_write::()?; + +for (address, account) in state_changes { + cursor.upsert(address, account)?; +} +``` + +#### Pattern 3: Reversing State (Unwind) + +```rust +// Read change sets to undo state +let mut changeset_cursor = tx.cursor_read::()?; +let mut state_cursor = tx.cursor_write::()?; + +for result in changeset_cursor.walk_dup(Some(block_num), None)?.rev() { + let (_, AccountBeforeTx { address, info }) = result?; + + match info { + Some(old_account) => state_cursor.upsert(address, old_account)?, + None => { state_cursor.seek_exact(address)?; state_cursor.delete_current()?; } + } +} +``` + +#### Pattern 4: Trie Traversal (State Root Calculation) + +```rust +// Iterate account trie nodes +let mut cursor = tx.cursor_read::()?; + +for result in cursor.walk(None)? { + let (nibbles, node) = result?; + + if node.state_mask.is_empty() && node.hash_mask.is_empty() { + // Leaf node + } else { + // Branch node + } +} +``` + +### Summary: When to Use What + +| Scenario | Use This | +|----------|----------| +| Single key lookup | `tx.get::
(key)` | +| Range of keys | `cursor.walk_range(start..end)` | +| All entries | `cursor.walk(None)` | +| Multiple values per key | `cursor.walk_dup(key, subkey)` | +| Sequential writes (sorted) | `cursor.append(key, value)` | +| Random writes | `cursor.upsert(key, value)` | +| Update DupSort entry | `delete_current()` then `upsert()` | +| Reverse iteration | `cursor.walk_back(start)` | + +--- + +*Generated from reth source code analysis - December 2024*