From fe9dd255c7054b8bff32fbbcced68db34b75ff15 Mon Sep 17 00:00:00 2001 From: Preston Van Loon Date: Thu, 25 Sep 2025 10:46:28 -0500 Subject: [PATCH] slasherkv: Set a 1 minute timeout on PruneAttestationOnEpoch operations (#15746) * slasherkv: Set a 1 minute timeout on PruneAttestationOnEpoch operations to prevent very large bolt transactions. * Fix CI --------- Co-authored-by: Manu NALEPA --- beacon-chain/db/slasherkv/pruning.go | 33 +++++++++++++++++++++++----- changelog/pvl-slasherkv-timeout.md | 2 ++ 2 files changed, 30 insertions(+), 5 deletions(-) create mode 100644 changelog/pvl-slasherkv-timeout.md diff --git a/beacon-chain/db/slasherkv/pruning.go b/beacon-chain/db/slasherkv/pruning.go index 1334d375c8..8f4e2efbe5 100644 --- a/beacon-chain/db/slasherkv/pruning.go +++ b/beacon-chain/db/slasherkv/pruning.go @@ -4,17 +4,26 @@ import ( "bytes" "context" "encoding/binary" + "time" "github.com/OffchainLabs/prysm/v6/consensus-types/primitives" "github.com/OffchainLabs/prysm/v6/time/slots" + "github.com/pkg/errors" bolt "go.etcd.io/bbolt" ) +var errTimeOut = errors.New("operation timed out") + // PruneAttestationsAtEpoch deletes all attestations from the slasher DB with target epoch // less than or equal to the specified epoch. func (s *Store) PruneAttestationsAtEpoch( - _ context.Context, maxEpoch primitives.Epoch, + ctx context.Context, maxEpoch primitives.Epoch, ) (numPruned uint, err error) { + // In some cases, pruning may take a very long time and consume significant memory in the + // open Update transaction. Therefore, we impose a 1 minute timeout on this operation. + ctx, cancel := context.WithTimeout(ctx, 1*time.Minute) + defer cancel() + // We can prune everything less than the current epoch - history length. encodedEndPruneEpoch := make([]byte, 8) binary.BigEndian.PutUint64(encodedEndPruneEpoch, uint64(maxEpoch)) @@ -48,13 +57,18 @@ func (s *Store) PruneAttestationsAtEpoch( return } - if err = s.db.Update(func(tx *bolt.Tx) error { + err = s.db.Update(func(tx *bolt.Tx) error { signingRootsBkt := tx.Bucket(attestationDataRootsBucket) attRecordsBkt := tx.Bucket(attestationRecordsBucket) c := signingRootsBkt.Cursor() // We begin a pruning iteration starting from the first item in the bucket. for k, v := c.First(); k != nil; k, v = c.Next() { + if ctx.Err() != nil { + // Exit the routine if the context has expired. + return errTimeOut + } + // We check the epoch from the current key in the database. // If we have hit an epoch that is greater than the end epoch of the pruning process, // we then completely exit the process as we are done. @@ -67,18 +81,27 @@ func (s *Store) PruneAttestationsAtEpoch( // so it is possible we have a few adjacent objects that have the same slot, such as // (target_epoch = 3 ++ _) => encode(attestation) if err := signingRootsBkt.Delete(k); err != nil { - return err + return errors.Wrap(err, "delete attestation signing root") } if err := attRecordsBkt.Delete(v); err != nil { - return err + return errors.Wrap(err, "delete attestation record") } slasherAttestationsPrunedTotal.Inc() numPruned++ } return nil - }); err != nil { + }) + + if errors.Is(err, errTimeOut) { + log.Warning("Aborting pruning routine") return } + + if err != nil { + log.WithError(err).Error("Failed to prune attestations") + return + } + return } diff --git a/changelog/pvl-slasherkv-timeout.md b/changelog/pvl-slasherkv-timeout.md new file mode 100644 index 0000000000..7f632f8f4f --- /dev/null +++ b/changelog/pvl-slasherkv-timeout.md @@ -0,0 +1,2 @@ +### Added +- Added a 1 minute timeout on PruneAttestationOnEpoch operations to prevent very large bolt transactions