Compare commits

...

2 Commits

Author SHA1 Message Date
Manu NALEPA
329cc39f60 Never reconstruct 2026-03-18 17:34:34 +01:00
Manu NALEPA
5fed2218ae Periodically evict finalized states in checkpoint states cache (#16458)
**What type of PR is this?**
Bug fix

**What does this PR do? Why is it needed?**
This PR evicts finalized states in checkpoint states cache.

States are efficiently stored in caches, especially thanks to multi
value slices. If 1 state takes 300 MB and 2 states that are really
similar are stored in a cache, then these 2 states could only need let's
say 310MB of cache memory (instead of 300 MB x 2 = 600 MB).

**Before the commit creating the memory issue**, new states were
regularly stored in the `CheckpointStateCache`. This cache has 10 slots.
After this cache is full, oldest values (not quite exactly because it's
a LRU cache) are pruned.

**After the commit creating the memory issue**, new states are quite
rarely inserted into this cache. For example, on a run, almost 5H (!)
were needed before the first value was evicted from this cache. This
mean this cache contains multiple states that do not share a lot of
values with other states in all other caches/head.
==> A lot of fields stay in memory that are exclusively needed for the
(old) states only present in this cache.

The beacon node now evicts finalized states from the cache.

**Which issues(s) does this PR fix?**
- https://github.com/OffchainLabs/prysm/issues/16376

<img width="1022" height="914" alt="image"
src="https://github.com/user-attachments/assets/98886364-001a-48fc-a952-5c6a7e80bf88"
/>

**Acknowledgements**
- [x] I have read
[CONTRIBUTING.md](https://github.com/prysmaticlabs/prysm/blob/develop/CONTRIBUTING.md).
- [x] I have included a uniquely named [changelog fragment
file](https://github.com/prysmaticlabs/prysm/blob/develop/CONTRIBUTING.md#maintaining-changelogmd).
- [x] I have added a description with sufficient context for reviewers
to understand this PR.
- [x] I have tested that my changes work as expected and I added a
testing plan to the PR description (if applicable).
2026-03-18 17:33:38 +01:00
6 changed files with 125 additions and 13 deletions

View File

@@ -327,6 +327,8 @@ func (s *Service) executePostFinalizationTasks(ctx context.Context, finalizedSta
}
}()
}
go s.checkpointStateCache.EvictUpTo(finalized.Epoch)
}
// ReceiveBlockBatch processes the whole block batch at once, assuming the block batch is linear ,transitioning

View File

@@ -54,6 +54,7 @@ go_library(
"//proto/prysm/v1alpha1:go_default_library",
"//proto/prysm/v1alpha1/attestation:go_default_library",
"//runtime/version:go_default_library",
"//time/slots:go_default_library",
"@com_github_ethereum_go_ethereum//common:go_default_library",
"@com_github_hashicorp_golang_lru//:go_default_library",
"@com_github_patrickmn_go_cache//:go_default_library",

View File

@@ -3,8 +3,10 @@ package cache
import (
"github.com/OffchainLabs/prysm/v7/beacon-chain/state"
lruwrpr "github.com/OffchainLabs/prysm/v7/cache/lru"
"github.com/OffchainLabs/prysm/v7/consensus-types/primitives"
"github.com/OffchainLabs/prysm/v7/crypto/hash"
ethpb "github.com/OffchainLabs/prysm/v7/proto/prysm/v1alpha1"
"github.com/OffchainLabs/prysm/v7/time/slots"
lru "github.com/hashicorp/golang-lru"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
@@ -25,6 +27,14 @@ var (
Name: "check_point_state_cache_hit",
Help: "The number of check point state requests that are present in the cache.",
})
checkpointStateSize = promauto.NewGauge(prometheus.GaugeOpts{
Name: "check_point_state_cache_size",
Help: "The number of entries in the check point state cache.",
})
checkpointStateEvicted = promauto.NewCounter(prometheus.CounterOpts{
Name: "check_point_state_cache_evicted_total",
Help: "The number of entries evicted from the check point state cache.",
})
)
// CheckpointStateCache is a struct with 1 queue for looking up state by checkpoint.
@@ -49,14 +59,14 @@ func (c *CheckpointStateCache) StateByCheckpoint(cp *ethpb.Checkpoint) (state.Be
item, exists := c.cache.Get(h)
if exists && item != nil {
checkpointStateHit.Inc()
// Copy here is unnecessary since the return will only be used to verify attestation signature.
return item.(state.BeaconState), nil
if !exists || item == nil {
checkpointStateMiss.Inc()
return nil, nil
}
checkpointStateMiss.Inc()
return nil, nil
checkpointStateHit.Inc()
// Copy here is unnecessary since the return will only be used to verify attestation signature.
return item.(state.BeaconState), nil
}
// AddCheckpointState adds CheckpointState object to the cache. This method also trims the least
@@ -66,6 +76,35 @@ func (c *CheckpointStateCache) AddCheckpointState(cp *ethpb.Checkpoint, s state.
if err != nil {
return err
}
c.cache.Add(h, s)
checkpointStateSize.Set(float64(c.cache.Len()))
return nil
}
// EvictUpTo removes all entries from the cache whose state epoch is at
// or before the given epoch. Returns the number of evicted entries.
func (c *CheckpointStateCache) EvictUpTo(epoch primitives.Epoch) int {
evicted := 0
for _, key := range c.cache.Keys() {
// Peek is used here to avoid updating the recency of the entry,
// as we are only checking for eviction.
v, ok := c.cache.Peek(key)
if !ok {
continue
}
st := v.(state.ReadOnlyBeaconState)
if slots.ToEpoch(st.Slot()) <= epoch {
c.cache.Remove(key)
evicted++
}
}
if evicted > 0 {
checkpointStateSize.Set(float64(c.cache.Len()))
checkpointStateEvicted.Add(float64(evicted))
}
return evicted
}

View File

@@ -72,3 +72,76 @@ func TestCheckpointStateCache_MaxSize(t *testing.T) {
assert.Equal(t, cache.MaxCheckpointStateSize(), len(c.Cache().Keys()))
}
func TestCheckpointStateCache_EvictFinalized_FinalizedEntry(t *testing.T) {
c := cache.NewCheckpointStateCache()
cp := &ethpb.Checkpoint{Epoch: 1, Root: bytesutil.PadTo([]byte{'A'}, 32)}
st, err := state_native.InitializeFromProtoPhase0(&ethpb.BeaconState{Slot: 32})
require.NoError(t, err)
require.NoError(t, c.AddCheckpointState(cp, st))
evicted := c.EvictUpTo(1)
assert.Equal(t, 1, evicted, "expected finalized entry to be evicted")
s, err := c.StateByCheckpoint(cp)
require.NoError(t, err)
assert.Equal(t, state.BeaconState(nil), s, "expected cache to be empty after eviction")
}
func TestCheckpointStateCache_EvictFinalized_NotFinalizedEntry(t *testing.T) {
c := cache.NewCheckpointStateCache()
cp := &ethpb.Checkpoint{Epoch: 5, Root: bytesutil.PadTo([]byte{'A'}, 32)}
st, err := state_native.InitializeFromProtoPhase0(&ethpb.BeaconState{Slot: 160})
require.NoError(t, err)
require.NoError(t, c.AddCheckpointState(cp, st))
evicted := c.EvictUpTo(3)
assert.Equal(t, 0, evicted, "expected non-finalized entry NOT to be evicted")
s, err := c.StateByCheckpoint(cp)
require.NoError(t, err)
assert.NotNil(t, s, "expected entry to still be in cache")
}
func TestCheckpointStateCache_EvictFinalized_Mixed(t *testing.T) {
c := cache.NewCheckpointStateCache()
cp1 := &ethpb.Checkpoint{Epoch: 1, Root: bytesutil.PadTo([]byte{'A'}, 32)}
st1, err := state_native.InitializeFromProtoPhase0(&ethpb.BeaconState{Slot: 32})
require.NoError(t, err)
cp2 := &ethpb.Checkpoint{Epoch: 2, Root: bytesutil.PadTo([]byte{'B'}, 32)}
st2, err := state_native.InitializeFromProtoPhase0(&ethpb.BeaconState{Slot: 64})
require.NoError(t, err)
cp5 := &ethpb.Checkpoint{Epoch: 5, Root: bytesutil.PadTo([]byte{'C'}, 32)}
st5, err := state_native.InitializeFromProtoPhase0(&ethpb.BeaconState{Slot: 160})
require.NoError(t, err)
require.NoError(t, c.AddCheckpointState(cp1, st1))
require.NoError(t, c.AddCheckpointState(cp2, st2))
require.NoError(t, c.AddCheckpointState(cp5, st5))
evicted := c.EvictUpTo(3)
assert.Equal(t, 2, evicted, "expected epochs 1 and 2 to be evicted")
s, err := c.StateByCheckpoint(cp1)
require.NoError(t, err)
assert.Equal(t, state.BeaconState(nil), s, "expected cp1 to be evicted")
s, err = c.StateByCheckpoint(cp2)
require.NoError(t, err)
assert.Equal(t, state.BeaconState(nil), s, "expected cp2 to be evicted")
s, err = c.StateByCheckpoint(cp5)
require.NoError(t, err)
assert.NotNil(t, s, "expected cp5 to still be in cache")
}
func TestCheckpointStateCache_EvictFinalized_EmptyCache(t *testing.T) {
c := cache.NewCheckpointStateCache()
evicted := c.EvictUpTo(0)
assert.Equal(t, 0, evicted, "expected no eviction from empty cache")
}

View File

@@ -112,13 +112,7 @@ func (s *Service) processDataColumnSidecarsFromReconstruction(ctx context.Contex
// shouldReconstruct returns true if we should attempt to reconstruct the data columns for the given block root.
func (s *Service) shouldReconstruct(root [fieldparams.RootLength]byte) bool {
// Get the columns we store.
storedDataColumns := s.cfg.dataColumnStorage.Summary(root)
storedColumnsCount := storedDataColumns.Count()
// Reconstruct only if we have at least the minimum number of columns to reconstruct, but not all the columns.
// (If we have not enough columns, reconstruction is impossible. If we have all the columns, reconstruction is unnecessary.)
return storedColumnsCount >= peerdas.MinimumColumnCountToReconstruct() && storedColumnsCount != fieldparams.NumberOfColumns
return false
}
// computeRandomDelay computes a random delay duration to wait before reconstructing data column sidecars.

View File

@@ -0,0 +1,3 @@
### Added
- Implement finalization-based eviction for `CheckpointStateCache`.