Backfill data columns (#15580)

**What type of PR is this?**

Feature

**What does this PR do? Why is it needed?**

Adds data column support to backfill.
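
For reviewers, a minimal sketch of the batch state flow this change introduces. The state names are taken from the diff below; the standalone package, the `nextState` helper, and its integer arguments are illustrative stand-ins rather than the actual `transitionToNext` implementation.

```go
// Sketch of the backfill batch state transitions added in this PR, for reviewer
// orientation only. Names mirror the diff; the helper below paraphrases
// transitionToNext rather than reproducing it.
package main

import "fmt"

type batchState int

const (
	batchSequenced batchState = iota
	batchSyncColumns
	batchSyncBlobs
	batchImportable
)

func (s batchState) String() string {
	switch s {
	case batchSequenced:
		return "sequenced"
	case batchSyncColumns:
		return "sync_columns"
	case batchSyncBlobs:
		return "sync_blobs"
	case batchImportable:
		return "importable"
	default:
		return "unknown"
	}
}

// nextState paraphrases transitionToNext: a batch with no blocks is re-sequenced,
// a batch still missing data columns moves to column sync, a batch still missing
// blobs moves to blob sync, and otherwise it is ready to import.
func nextState(nBlocks, columnsNeeded, blobsNeeded int) batchState {
	if nBlocks == 0 {
		return batchSequenced
	}
	if columnsNeeded > 0 {
		return batchSyncColumns
	}
	if blobsNeeded > 0 {
		return batchSyncBlobs
	}
	return batchImportable
}

func main() {
	fmt.Println(nextState(32, 4, 0)) // sync_columns
	fmt.Println(nextState(32, 0, 0)) // importable
}
```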

**Acknowledgements**

- [x] I have read
[CONTRIBUTING.md](https://github.com/prysmaticlabs/prysm/blob/develop/CONTRIBUTING.md).
- [x] I have included a uniquely named [changelog fragment
file](https://github.com/prysmaticlabs/prysm/blob/develop/CONTRIBUTING.md#maintaining-changelogmd).
- [x] I have added a description to this PR with sufficient context for
reviewers to understand this PR.

---------

Co-authored-by: Kasey <kasey@users.noreply.github.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Preston Van Loon <preston@pvl.dev>
Author: kasey
Date: 2025-12-02 10:19:32 -05:00 (committed by GitHub)
Parent: 2773bdef89
Commit: 61de11e2c4
84 changed files with 10944 additions and 964 deletions


@@ -8,7 +8,6 @@ import (
"github.com/OffchainLabs/prysm/v7/beacon-chain/das"
"github.com/OffchainLabs/prysm/v7/beacon-chain/sync"
"github.com/OffchainLabs/prysm/v7/consensus-types/blocks"
"github.com/OffchainLabs/prysm/v7/consensus-types/primitives"
eth "github.com/OffchainLabs/prysm/v7/proto/prysm/v1alpha1"
"github.com/libp2p/go-libp2p/core/peer"
@@ -16,9 +15,13 @@ import (
"github.com/sirupsen/logrus"
)
// ErrChainBroken indicates a backfill batch can't be imported to the db because it is not known to be the ancestor
// of the canonical chain.
var ErrChainBroken = errors.New("batch is not the ancestor of a known finalized root")
var errChainBroken = errors.New("batch is not the ancestor of a known finalized root")
// retryLogMod defines how often retryable errors are logged at debug level instead of trace.
const retryLogMod = 5
// retryDelay defines the delay between retry attempts for a batch.
const retryDelay = time.Second
type batchState int
@@ -30,16 +33,20 @@ func (s batchState) String() string {
return "init"
case batchSequenced:
return "sequenced"
case batchErrRetryable:
return "error_retryable"
case batchSyncBlobs:
return "sync_blobs"
case batchSyncColumns:
return "sync_columns"
case batchImportable:
return "importable"
case batchImportComplete:
return "import_complete"
case batchEndSequence:
return "end_sequence"
case batchBlobSync:
return "blob_sync"
case batchErrRetryable:
return "error_retryable"
case batchErrFatal:
return "error_fatal"
default:
return "unknown"
}
@@ -49,15 +56,15 @@ const (
batchNil batchState = iota
batchInit
batchSequenced
batchErrRetryable
batchBlobSync
batchSyncBlobs
batchSyncColumns
batchImportable
batchImportComplete
batchErrRetryable
batchErrFatal // if this is received in the main loop, the worker pool will be shut down.
batchEndSequence
)
var retryDelay = time.Second
type batchId string
type batch struct {
@@ -67,35 +74,52 @@ type batch struct {
retries int
retryAfter time.Time
begin primitives.Slot
end primitives.Slot // half-open interval, [begin, end), ie >= start, < end.
results verifiedROBlocks
end primitives.Slot // half-open interval, [begin, end), ie >= begin, < end.
blocks verifiedROBlocks
err error
state batchState
busy peer.ID
blockPid peer.ID
blobPid peer.ID
bs *blobSync
// assignedPeer is used by the worker pool to assign and unassign a peer.ID to serve requests for the current batch state.
// Depending on the state it is copied to blockPeer, columns.peer, or blobs.peer.
assignedPeer peer.ID
blockPeer peer.ID
nextReqCols []uint64
blobs *blobSync
columns *columnSync
}
func (b batch) logFields() logrus.Fields {
f := map[string]any{
"batchId": b.id(),
"state": b.state.String(),
"scheduled": b.scheduled.String(),
"seq": b.seq,
"retries": b.retries,
"begin": b.begin,
"end": b.end,
"busyPid": b.busy,
"blockPid": b.blockPid,
"blobPid": b.blobPid,
"batchId": b.id(),
"state": b.state.String(),
"scheduled": b.scheduled.String(),
"seq": b.seq,
"retries": b.retries,
"retryAfter": b.retryAfter.String(),
"begin": b.begin,
"end": b.end,
"busyPid": b.assignedPeer,
"blockPid": b.blockPeer,
}
if b.blobs != nil {
f["blobPid"] = b.blobs.peer
}
if b.columns != nil {
f["colPid"] = b.columns.peer
}
if b.retries > 0 {
f["retryAfter"] = b.retryAfter.String()
}
if b.state == batchSyncColumns {
f["nextColumns"] = fmt.Sprintf("%v", b.nextReqCols)
}
if b.state == batchErrRetryable && b.blobs != nil {
f["blobsMissing"] = b.blobs.needed()
}
return f
}
// replaces returns true if `r` is a version of `b` that has been updated by a worker,
// meaning it should replace `b` in the batch sequencing queue.
func (b batch) replaces(r batch) bool {
if r.state == batchImportComplete {
return false
@@ -114,9 +138,9 @@ func (b batch) id() batchId {
}
func (b batch) ensureParent(expected [32]byte) error {
tail := b.results[len(b.results)-1]
tail := b.blocks[len(b.blocks)-1]
if tail.Root() != expected {
return errors.Wrapf(ErrChainBroken, "last parent_root=%#x, tail root=%#x", expected, tail.Root())
return errors.Wrapf(errChainBroken, "last parent_root=%#x, tail root=%#x", expected, tail.Root())
}
return nil
}
@@ -136,21 +160,15 @@ func (b batch) blobRequest() *eth.BlobSidecarsByRangeRequest {
}
}
func (b batch) withResults(results verifiedROBlocks, bs *blobSync) batch {
b.results = results
b.bs = bs
if bs.blobsNeeded() > 0 {
return b.withState(batchBlobSync)
func (b batch) transitionToNext() batch {
if len(b.blocks) == 0 {
return b.withState(batchSequenced)
}
return b.withState(batchImportable)
}
func (b batch) postBlobSync() batch {
if b.blobsNeeded() > 0 {
log.WithFields(b.logFields()).WithField("blobsMissing", b.blobsNeeded()).Error("Batch still missing blobs after downloading from peer")
b.bs = nil
b.results = []blocks.ROBlock{}
return b.withState(batchErrRetryable)
if len(b.columns.columnsNeeded()) > 0 {
return b.withState(batchSyncColumns)
}
if b.blobs != nil && b.blobs.needed() > 0 {
return b.withState(batchSyncBlobs)
}
return b.withState(batchImportable)
}
@@ -159,44 +177,89 @@ func (b batch) withState(s batchState) batch {
if s == batchSequenced {
b.scheduled = time.Now()
switch b.state {
case batchErrRetryable:
b.retries += 1
b.retryAfter = time.Now().Add(retryDelay)
log.WithFields(b.logFields()).Info("Sequencing batch for retry after delay")
case batchInit, batchNil:
b.firstScheduled = b.scheduled
}
}
if s == batchImportComplete {
backfillBatchTimeRoundtrip.Observe(float64(time.Since(b.firstScheduled).Milliseconds()))
log.WithFields(b.logFields()).Debug("Backfill batch imported")
}
b.state = s
b.seq += 1
return b
}
func (b batch) withPeer(p peer.ID) batch {
b.blockPid = p
backfillBatchTimeWaiting.Observe(float64(time.Since(b.scheduled).Milliseconds()))
return b
}
func (b batch) withRetryableError(err error) batch {
b.err = err
b.retries += 1
b.retryAfter = time.Now().Add(retryDelay)
msg := "Could not proceed with batch processing due to error"
logBase := log.WithFields(b.logFields()).WithError(err)
// Log at trace level to limit log noise,
// but escalate to debug level every nth attempt for batches that have some persistent issue.
if b.retries&retryLogMod != 0 {
logBase.Trace(msg)
} else {
logBase.Debug(msg)
}
return b.withState(batchErrRetryable)
}
func (b batch) blobsNeeded() int {
return b.bs.blobsNeeded()
func (b batch) withFatalError(err error) batch {
log.WithFields(b.logFields()).WithError(err).Error("Fatal batch processing error")
b.err = err
return b.withState(batchErrFatal)
}
func (b batch) blobResponseValidator() sync.BlobResponseValidation {
return b.bs.validateNext
func (b batch) withError(err error) batch {
if isRetryable(err) {
return b.withRetryableError(err)
}
return b.withFatalError(err)
}
func (b batch) availabilityStore() das.AvailabilityStore {
return b.bs.store
func (b batch) validatingColumnRequest(cb *columnBisector) (*validatingColumnRequest, error) {
req, err := b.columns.request(b.nextReqCols, columnRequestLimit)
if err != nil {
return nil, errors.Wrap(err, "columns request")
}
if req == nil {
return nil, nil
}
return &validatingColumnRequest{
req: req,
columnSync: b.columns,
bisector: cb,
}, nil
}
// resetToRetryColumns is called after a partial batch failure. It adds column indices back
// to the toDownload structure for any blocks where those columns failed, and resets the bisector state.
// Note that this method will also prune any columns that have expired, meaning we no longer need them
// per spec and/or our backfill & retention settings.
func resetToRetryColumns(b batch, needs das.CurrentNeeds) batch {
// return the given batch as-is if it isn't in a state that this func should handle.
if b.columns == nil || b.columns.bisector == nil || len(b.columns.bisector.errs) == 0 {
return b.transitionToNext()
}
pruned := make(map[[32]byte]struct{})
b.columns.pruneExpired(needs, pruned)
// clear out failed column state in the bisector and add the failed columns back to toDownload
bisector := b.columns.bisector
roots := bisector.failingRoots()
// Add all the failed columns back to the toDownload structure and reset the bisector state.
for _, root := range roots {
if _, rm := pruned[root]; rm {
continue
}
bc := b.columns.toDownload[root]
bc.remaining.Merge(bisector.failuresFor(root))
}
b.columns.bisector.reset()
return b.transitionToNext()
}
var batchBlockUntil = func(ctx context.Context, untilRetry time.Duration, b batch) error {
@@ -223,6 +286,26 @@ func (b batch) waitUntilReady(ctx context.Context) error {
return nil
}
func (b batch) workComplete() bool {
return b.state == batchImportable
}
func (b batch) expired(needs das.CurrentNeeds) bool {
if !needs.Block.At(b.end - 1) {
log.WithFields(b.logFields()).WithField("retentionStartSlot", needs.Block.Begin).Debug("Batch outside retention window")
return true
}
return false
}
func (b batch) selectPeer(picker *sync.PeerPicker, busy map[peer.ID]bool) (peer.ID, []uint64, error) {
if b.state == batchSyncColumns {
return picker.ForColumns(b.columns.columnsNeeded(), busy)
}
peer, err := picker.ForBlocks(busy)
return peer, nil, err
}
func sortBatchDesc(bb []batch) {
sort.Slice(bb, func(i, j int) bool {
return bb[i].end > bb[j].end