Backfill data columns (#15580)
**What type of PR is this?**

Feature

**What does this PR do? Why is it needed?**

Adds data column support to backfill.

**Acknowledgements**

- [x] I have read [CONTRIBUTING.md](https://github.com/prysmaticlabs/prysm/blob/develop/CONTRIBUTING.md).
- [x] I have included a uniquely named [changelog fragment file](https://github.com/prysmaticlabs/prysm/blob/develop/CONTRIBUTING.md#maintaining-changelogmd).
- [x] I have added a description to this PR with sufficient context for reviewers to understand this PR.

---------

Co-authored-by: Kasey <kasey@users.noreply.github.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Preston Van Loon <preston@pvl.dev>
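The change replaces the single blob-oriented `batchBlobSync` state with distinct `batchSyncBlobs` and `batchSyncColumns` states, and routes each batch through `transitionToNext` (see the diff below). A minimal runnable sketch of that state progression, using simplified stand-in types rather than the real `batch`, `blobSync`, and `columnSync` from this PR:

```go
// Sketch of the backfill batch state flow introduced by this PR
// (stand-in types; not the actual prysm implementation).
package main

import "fmt"

type batchState int

const (
	batchSequenced batchState = iota
	batchSyncColumns
	batchSyncBlobs
	batchImportable
)

func (s batchState) String() string {
	return [...]string{"sequenced", "sync_columns", "sync_blobs", "importable"}[s]
}

// transition mirrors the precedence in transitionToNext: an empty batch is
// re-sequenced; missing columns are fetched before missing blobs; only a
// fully available batch becomes importable.
func transition(blocks, columnsNeeded, blobsNeeded int) batchState {
	switch {
	case blocks == 0:
		return batchSequenced
	case columnsNeeded > 0:
		return batchSyncColumns
	case blobsNeeded > 0:
		return batchSyncBlobs
	default:
		return batchImportable
	}
}

func main() {
	fmt.Println(transition(32, 4, 0)) // sync_columns: columns are fetched first
	fmt.Println(transition(32, 0, 2)) // sync_blobs
	fmt.Println(transition(32, 0, 0)) // importable
}
```

Columns are checked before blobs, so a batch only advances to `batchSyncBlobs` once no columns are outstanding.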
@@ -8,7 +8,6 @@ import (
 	"github.com/OffchainLabs/prysm/v7/beacon-chain/das"
 	"github.com/OffchainLabs/prysm/v7/beacon-chain/sync"
 	"github.com/OffchainLabs/prysm/v7/consensus-types/blocks"
 	"github.com/OffchainLabs/prysm/v7/consensus-types/primitives"
 	eth "github.com/OffchainLabs/prysm/v7/proto/prysm/v1alpha1"
 	"github.com/libp2p/go-libp2p/core/peer"
@@ -16,9 +15,13 @@ import (
 	"github.com/sirupsen/logrus"
 )
 
-// ErrChainBroken indicates a backfill batch can't be imported to the db because it is not known to be the ancestor
+// errChainBroken indicates a backfill batch can't be imported to the db because it is not known to be the ancestor
 // of the canonical chain.
-var ErrChainBroken = errors.New("batch is not the ancestor of a known finalized root")
+var errChainBroken = errors.New("batch is not the ancestor of a known finalized root")
+
+// retryLogMod defines how often retryable errors are logged at debug level instead of trace.
+const retryLogMod = 5
+
+// retryDelay defines the delay between retry attempts for a batch.
+const retryDelay = time.Second
 
 type batchState int
 
@@ -30,16 +33,20 @@ func (s batchState) String() string {
 		return "init"
 	case batchSequenced:
 		return "sequenced"
-	case batchErrRetryable:
-		return "error_retryable"
+	case batchSyncBlobs:
+		return "sync_blobs"
+	case batchSyncColumns:
+		return "sync_columns"
 	case batchImportable:
 		return "importable"
 	case batchImportComplete:
 		return "import_complete"
 	case batchEndSequence:
 		return "end_sequence"
-	case batchBlobSync:
-		return "blob_sync"
+	case batchErrRetryable:
+		return "error_retryable"
+	case batchErrFatal:
+		return "error_fatal"
 	default:
 		return "unknown"
 	}
@@ -49,15 +56,15 @@ const (
 	batchNil batchState = iota
 	batchInit
 	batchSequenced
-	batchErrRetryable
-	batchBlobSync
+	batchSyncBlobs
+	batchSyncColumns
 	batchImportable
 	batchImportComplete
+	batchErrRetryable
 	batchErrFatal // if this is received in the main loop, the worker pool will be shut down.
 	batchEndSequence
 )
 
-var retryDelay = time.Second
-
 type batchId string
 
 type batch struct {
@@ -67,35 +74,52 @@ type batch struct {
 	retries    int
 	retryAfter time.Time
 	begin      primitives.Slot
-	end        primitives.Slot // half-open interval, [begin, end), ie >= begin, < end.
-	results    verifiedROBlocks
+	end        primitives.Slot // half-open interval, [begin, end), ie >= begin, < end.
+	blocks     verifiedROBlocks
 	err        error
 	state      batchState
-	busy       peer.ID
-	blockPid   peer.ID
-	blobPid    peer.ID
-	bs         *blobSync
+	// `assignedPeer` is used by the worker pool to assign and unassign peer.IDs to serve requests for the current batch state.
+	// Depending on the state it will be copied to blockPeer, columns.peer, or blobs.peer.
+	assignedPeer peer.ID
+	blockPeer    peer.ID
+	nextReqCols  []uint64
+	blobs        *blobSync
+	columns      *columnSync
 }
 
 func (b batch) logFields() logrus.Fields {
 	f := map[string]any{
-		"batchId":   b.id(),
-		"state":     b.state.String(),
-		"scheduled": b.scheduled.String(),
-		"seq":       b.seq,
-		"retries":   b.retries,
-		"begin":     b.begin,
-		"end":       b.end,
-		"busyPid":   b.busy,
-		"blockPid":  b.blockPid,
-		"blobPid":   b.blobPid,
+		"batchId":    b.id(),
+		"state":      b.state.String(),
+		"scheduled":  b.scheduled.String(),
+		"seq":        b.seq,
+		"retries":    b.retries,
+		"retryAfter": b.retryAfter.String(),
+		"begin":      b.begin,
+		"end":        b.end,
+		"busyPid":    b.assignedPeer,
+		"blockPid":   b.blockPeer,
 	}
+	if b.blobs != nil {
+		f["blobPid"] = b.blobs.peer
+	}
+	if b.columns != nil {
+		f["colPid"] = b.columns.peer
+	}
+	if b.retries > 0 {
+		f["retryAfter"] = b.retryAfter.String()
+	}
+	if b.state == batchSyncColumns {
+		f["nextColumns"] = fmt.Sprintf("%v", b.nextReqCols)
+	}
+	if b.state == batchErrRetryable && b.blobs != nil {
+		f["blobsMissing"] = b.blobs.needed()
+	}
 	return f
 }
 
 // replaces returns true if `r` is a version of `b` that has been updated by a worker,
 // meaning it should replace `b` in the batch sequencing queue.
 func (b batch) replaces(r batch) bool {
 	if r.state == batchImportComplete {
 		return false
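The corrected field comment spells out the half-open convention: a batch covers `[begin, end)`, so its last contained slot is `end - 1` (which is what the `expired` check later in this diff examines). A small worked example of the arithmetic this implies, with illustrative values only:

```go
// Half-open interval demo: a batch [begin, end) covers end-begin slots.
package main

import "fmt"

func main() {
	begin, end := uint64(100), uint64(164)
	fmt.Println(end - begin) // 64 slots: 100, 101, ..., 163
	fmt.Println(end - 1)     // 163, the last slot actually in the batch
}
```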
@@ -114,9 +138,9 @@ func (b batch) id() batchId {
 }
 
 func (b batch) ensureParent(expected [32]byte) error {
-	tail := b.results[len(b.results)-1]
+	tail := b.blocks[len(b.blocks)-1]
 	if tail.Root() != expected {
-		return errors.Wrapf(ErrChainBroken, "last parent_root=%#x, tail root=%#x", expected, tail.Root())
+		return errors.Wrapf(errChainBroken, "last parent_root=%#x, tail root=%#x", expected, tail.Root())
 	}
 	return nil
 }
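Because backfill extends the chain backwards, `ensureParent` checks that the root of the batch's tail block equals the parent root expected by the already-verified segment; otherwise the batch cannot link into the known chain. A self-contained sketch with stand-in types (not prysm's actual `ROBlock`):

```go
// Sketch of the ensureParent linkage check; names here are illustrative.
package main

import (
	"bytes"
	"errors"
	"fmt"
)

var errChainBroken = errors.New("batch is not the ancestor of a known finalized root")

type roBlock struct{ root [32]byte }

// ensureParent verifies the tail block's root matches the expected parent root.
func ensureParent(batch []roBlock, expected [32]byte) error {
	tail := batch[len(batch)-1]
	if !bytes.Equal(tail.root[:], expected[:]) {
		return fmt.Errorf("%w: last parent_root=%#x, tail root=%#x", errChainBroken, expected, tail.root)
	}
	return nil
}

func main() {
	var root [32]byte
	root[0] = 0xab
	b := []roBlock{{root: root}}
	fmt.Println(ensureParent(b, root))       // <nil>: batch links to the chain
	fmt.Println(ensureParent(b, [32]byte{})) // wrapped errChainBroken
}
```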
@@ -136,21 +160,15 @@ func (b batch) blobRequest() *eth.BlobSidecarsByRangeRequest {
 	}
 }
 
-func (b batch) withResults(results verifiedROBlocks, bs *blobSync) batch {
-	b.results = results
-	b.bs = bs
-	if bs.blobsNeeded() > 0 {
-		return b.withState(batchBlobSync)
+func (b batch) transitionToNext() batch {
+	if len(b.blocks) == 0 {
+		return b.withState(batchSequenced)
 	}
-	return b.withState(batchImportable)
-}
-
-func (b batch) postBlobSync() batch {
-	if b.blobsNeeded() > 0 {
-		log.WithFields(b.logFields()).WithField("blobsMissing", b.blobsNeeded()).Error("Batch still missing blobs after downloading from peer")
-		b.bs = nil
-		b.results = []blocks.ROBlock{}
-		return b.withState(batchErrRetryable)
+	if len(b.columns.columnsNeeded()) > 0 {
+		return b.withState(batchSyncColumns)
 	}
+	if b.blobs != nil && b.blobs.needed() > 0 {
+		return b.withState(batchSyncBlobs)
+	}
 	return b.withState(batchImportable)
 }
@@ -159,44 +177,89 @@ func (b batch) withState(s batchState) batch {
 	if s == batchSequenced {
 		b.scheduled = time.Now()
 		switch b.state {
 		case batchErrRetryable:
 			b.retries += 1
 			b.retryAfter = time.Now().Add(retryDelay)
 			log.WithFields(b.logFields()).Info("Sequencing batch for retry after delay")
 		case batchInit, batchNil:
 			b.firstScheduled = b.scheduled
 		}
 	}
 	if s == batchImportComplete {
 		backfillBatchTimeRoundtrip.Observe(float64(time.Since(b.firstScheduled).Milliseconds()))
 		log.WithFields(b.logFields()).Debug("Backfill batch imported")
 	}
 	b.state = s
 	b.seq += 1
 	return b
 }
 
-func (b batch) withPeer(p peer.ID) batch {
-	b.blockPid = p
-	backfillBatchTimeWaiting.Observe(float64(time.Since(b.scheduled).Milliseconds()))
-	return b
-}
-
 func (b batch) withRetryableError(err error) batch {
 	b.err = err
+	b.retries += 1
+	b.retryAfter = time.Now().Add(retryDelay)
+
+	msg := "Could not proceed with batch processing due to error"
+	logBase := log.WithFields(b.logFields()).WithError(err)
+	// Log at trace level to limit log noise,
+	// but escalate to debug level every nth attempt for batches that have some persistent issue.
+	if b.retries%retryLogMod != 0 {
+		logBase.Trace(msg)
+	} else {
+		logBase.Debug(msg)
+	}
 	return b.withState(batchErrRetryable)
 }
 
-func (b batch) blobsNeeded() int {
-	return b.bs.blobsNeeded()
+func (b batch) withFatalError(err error) batch {
+	log.WithFields(b.logFields()).WithError(err).Error("Fatal batch processing error")
+	b.err = err
+	return b.withState(batchErrFatal)
 }
 
-func (b batch) blobResponseValidator() sync.BlobResponseValidation {
-	return b.bs.validateNext
+func (b batch) withError(err error) batch {
+	if isRetryable(err) {
+		return b.withRetryableError(err)
+	}
+	return b.withFatalError(err)
 }
 
-func (b batch) availabilityStore() das.AvailabilityStore {
-	return b.bs.store
+func (b batch) validatingColumnRequest(cb *columnBisector) (*validatingColumnRequest, error) {
+	req, err := b.columns.request(b.nextReqCols, columnRequestLimit)
+	if err != nil {
+		return nil, errors.Wrap(err, "columns request")
+	}
+	if req == nil {
+		return nil, nil
+	}
+	return &validatingColumnRequest{
+		req:        req,
+		columnSync: b.columns,
+		bisector:   cb,
+	}, nil
 }
+
+// resetToRetryColumns is called after a partial batch failure. It adds column indices back
+// to the toDownload structure for any blocks where those columns failed, and resets the bisector state.
+// Note that this method will also prune any columns that have expired, meaning we no longer need them
+// per spec and/or our backfill & retention settings.
+func resetToRetryColumns(b batch, needs das.CurrentNeeds) batch {
+	// return the given batch as-is if it isn't in a state that this func should handle.
+	if b.columns == nil || b.columns.bisector == nil || len(b.columns.bisector.errs) == 0 {
+		return b.transitionToNext()
+	}
+	pruned := make(map[[32]byte]struct{})
+	b.columns.pruneExpired(needs, pruned)
+
+	// clear out failed column state in the bisector and add the failed columns back to toDownload.
+	bisector := b.columns.bisector
+	roots := bisector.failingRoots()
+	// Add all the failed columns back to the toDownload structure and reset the bisector state.
+	for _, root := range roots {
+		if _, rm := pruned[root]; rm {
+			continue
+		}
+		bc := b.columns.toDownload[root]
+		bc.remaining.Merge(bisector.failuresFor(root))
+	}
+	b.columns.bisector.reset()
+
+	return b.transitionToNext()
+}
 
 var batchBlockUntil = func(ctx context.Context, untilRetry time.Duration, b batch) error {
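The retry path in `withRetryableError` throttles log noise with `retryLogMod`: attempts log at trace except every nth one, which escalates to debug. A standalone sketch of that modulo gating, assuming the constant's value of 5 from this diff:

```go
// Demo of the retry log escalation: with retryLogMod = 5, retries log at
// trace except every 5th, which escalates to debug. This is a sketch, not
// the prysm logger wiring.
package main

import "fmt"

const retryLogMod = 5

// levelFor returns the log level the retry path would use for a given count.
func levelFor(retries int) string {
	if retries%retryLogMod != 0 {
		return "trace"
	}
	return "debug"
}

func main() {
	for r := 1; r <= 10; r++ {
		fmt.Println(r, levelFor(r)) // debug at r=5 and r=10, trace otherwise
	}
}
```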
@@ -223,6 +286,26 @@ func (b batch) waitUntilReady(ctx context.Context) error {
 	return nil
 }
 
+func (b batch) workComplete() bool {
+	return b.state == batchImportable
+}
+
+func (b batch) expired(needs das.CurrentNeeds) bool {
+	if !needs.Block.At(b.end - 1) {
+		log.WithFields(b.logFields()).WithField("retentionStartSlot", needs.Block.Begin).Debug("Batch outside retention window")
+		return true
+	}
+	return false
+}
+
+func (b batch) selectPeer(picker *sync.PeerPicker, busy map[peer.ID]bool) (peer.ID, []uint64, error) {
+	if b.state == batchSyncColumns {
+		return picker.ForColumns(b.columns.columnsNeeded(), busy)
+	}
+	peer, err := picker.ForBlocks(busy)
+	return peer, nil, err
+}
+
 func sortBatchDesc(bb []batch) {
 	sort.Slice(bb, func(i, j int) bool {
 		return bb[i].end > bb[j].end
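`sortBatchDesc` orders batches by descending `end` slot, consistent with backfill working from the newest needed batch toward older ones. A minimal demo with a stand-in `batch` type:

```go
// sortBatchDesc demo: highest end slot sorts first.
package main

import (
	"fmt"
	"sort"
)

type batch struct{ end uint64 }

func sortBatchDesc(bb []batch) {
	sort.Slice(bb, func(i, j int) bool {
		return bb[i].end > bb[j].end
	})
}

func main() {
	bb := []batch{{end: 64}, {end: 192}, {end: 128}}
	sortBatchDesc(bb)
	fmt.Println(bb) // [{192} {128} {64}]
}
```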