**What type of PR is this?**

Feature

**What does this PR do? Why is it needed?**

Adds data column support to backfill.

**Acknowledgements**

- [x] I have read [CONTRIBUTING.md](https://github.com/prysmaticlabs/prysm/blob/develop/CONTRIBUTING.md).
- [x] I have included a uniquely named [changelog fragment file](https://github.com/prysmaticlabs/prysm/blob/develop/CONTRIBUTING.md#maintaining-changelogmd).
- [x] I have added a description to this PR with sufficient context for reviewers to understand this PR.

---------

Co-authored-by: Kasey <kasey@users.noreply.github.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Preston Van Loon <preston@pvl.dev>
package backfill

import (
	"context"
	"time"

	"github.com/OffchainLabs/prysm/v7/beacon-chain/das"
	"github.com/OffchainLabs/prysm/v7/beacon-chain/db/filesystem"
	"github.com/OffchainLabs/prysm/v7/beacon-chain/p2p"
	"github.com/OffchainLabs/prysm/v7/beacon-chain/startup"
	"github.com/OffchainLabs/prysm/v7/beacon-chain/sync"
	"github.com/OffchainLabs/prysm/v7/beacon-chain/verification"
	"github.com/OffchainLabs/prysm/v7/consensus-types/blocks"
	"github.com/OffchainLabs/prysm/v7/encoding/bytesutil"
	"github.com/libp2p/go-libp2p/core/peer"
	"github.com/pkg/errors"
)
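
// errInvalidBatchState is returned when a worker receives a batch whose state it does not know how to handle.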
var errInvalidBatchState = errors.New("invalid batch state")
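
// peerDownscorer is a callback used to penalize a peer's score, given the peer id, a reason string, and the triggering error.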
type peerDownscorer func(peer.ID, string, error)
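
// workerCfg bundles the dependencies shared by backfill workers: the clock, block/blob/column verifiers,
// the context byte version map, sidecar storage, peer downscoring, and the current data availability needs.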
type workerCfg struct {
	clock        *startup.Clock
	verifier     *verifier
	ctxMap       sync.ContextByteVersions
	newVB        verification.NewBlobVerifier
	newVC        verification.NewDataColumnsVerifier
	blobStore    *filesystem.BlobStorage
	colStore     *filesystem.DataColumnStorage
	downscore    peerDownscorer
	currentNeeds func() das.CurrentNeeds
}
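
// initWorkerCfg populates the verifier-related fields of a workerCfg. It waits for the verification
// initializer, loads the origin state to obtain validator public keys and the genesis validators root,
// and derives the context byte version map along with the block, blob, and data column verifiers.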
func initWorkerCfg(ctx context.Context, cfg *workerCfg, vw InitializerWaiter, store *Store) error {
	vi, err := vw.WaitForInitializer(ctx)
	if err != nil {
		return errors.Wrap(err, "WaitForInitializer")
	}
	cps, err := store.originState(ctx)
	if err != nil {
		return errors.Wrap(err, "originState")
	}
	keys, err := cps.PublicKeys()
	if err != nil {
		return errors.Wrap(err, "unable to retrieve public keys for all validators in the origin state")
	}
	vr := cps.GenesisValidatorsRoot()
	cm, err := sync.ContextByteVersionsForValRoot(bytesutil.ToBytes32(vr))
	if err != nil {
		return errors.Wrapf(err, "unable to initialize context version map using genesis validator root %#x", vr)
	}
	v, err := newBackfillVerifier(vr, keys)
	if err != nil {
		return errors.Wrap(err, "newBackfillVerifier failed")
	}
	cfg.verifier = v
	cfg.ctxMap = cm
	cfg.newVB = newBlobVerifierFromInitializer(vi)
	cfg.newVC = newDataColumnVerifierFromInitializer(vi)
	return nil
}
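
// workerId identifies a single worker in the backfill worker pool; it is included in log fields.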
type workerId int
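
// p2pWorker consumes batches from the todo channel, advances them through the block, blob, and
// data column download steps, and writes the results back to the done channel.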
type p2pWorker struct {
	id   workerId
	todo chan batch
	done chan batch
	p2p  p2p.P2P
	cfg  *workerCfg
}
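
// newP2pWorker constructs a p2pWorker with the given id, p2p host, work channels, and shared configuration.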
func newP2pWorker(id workerId, p p2p.P2P, todo, done chan batch, cfg *workerCfg) *p2pWorker {
	return &p2pWorker{
		id:   id,
		todo: todo,
		done: done,
		p2p:  p,
		cfg:  cfg,
	}
}
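
// run is the worker loop. It blocks on the todo channel, dispatches each batch to the handler matching
// its state, and sends the updated batch on the done channel, until the context is canceled.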
func (w *p2pWorker) run(ctx context.Context) {
	for {
		select {
		case b := <-w.todo:
			if err := b.waitUntilReady(ctx); err != nil {
				log.WithField("batchId", b.id()).WithError(ctx.Err()).Info("Worker context canceled while waiting to retry")
				continue
			}
			log.WithFields(b.logFields()).WithField("backfillWorker", w.id).Trace("Worker received batch")
			switch b.state {
			case batchSequenced:
				b = w.handleBlocks(ctx, b)
			case batchSyncBlobs:
				b = w.handleBlobs(ctx, b)
			case batchSyncColumns:
				b = w.handleColumns(ctx, b)
			case batchImportable:
				// This state indicates the batch got all the way to being imported and failed,
				// so we need to clear out the blocks and go all the way back to the start of the process.
				b.blocks = nil
				b = w.handleBlocks(ctx, b)
			default:
				// A batch in an unknown state represents an implementation error,
				// so we treat it as a fatal error, meaning the worker pool should shut down.
				b = b.withFatalError(errors.Wrap(errInvalidBatchState, b.state.String()))
			}
			w.done <- b
		case <-ctx.Done():
			log.WithField("backfillWorker", w.id).Info("Worker exiting after context canceled")
			return
		}
	}
}
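
// handleBlocks downloads the batch's blocks from the assigned peer via a blocks-by-range request,
// verifies them, records download metrics, and initializes the blob and column sync state used by
// the subsequent batch states.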
func (w *p2pWorker) handleBlocks(ctx context.Context, b batch) batch {
	current := w.cfg.clock.CurrentSlot()
	b.blockPeer = b.assignedPeer
	start := time.Now()
	results, err := sync.SendBeaconBlocksByRangeRequest(ctx, w.cfg.clock, w.p2p, b.blockPeer, b.blockRequest(), blockValidationMetrics)
	if err != nil {
		log.WithError(err).WithFields(b.logFields()).Debug("Failed to request SignedBeaconBlocks by range")
		return b.withRetryableError(err)
	}
	dlt := time.Now()
	blockDownloadMs.Observe(float64(dlt.Sub(start).Milliseconds()))

	toVerify, err := blocks.NewROBlockSlice(results)
	if err != nil {
		log.WithError(err).WithFields(b.logFields()).Debug("Failed to convert blocks to ROBlock slice")
		return b.withRetryableError(err)
	}
	verified, err := w.cfg.verifier.verify(toVerify)
	blockVerifyMs.Observe(float64(time.Since(dlt).Milliseconds()))
	if err != nil {
		if shouldDownscore(err) {
			w.cfg.downscore(b.blockPeer, "invalid SignedBeaconBlock batch rpc response", err)
		}
		log.WithError(err).WithFields(b.logFields()).Debug("Validation failed")
		return b.withRetryableError(err)
	}

	// This is a hack to get the rough size of the batch. It helps us approximate the amount of memory needed
	// to hold batches and the relative sizes of different batches, but it is inaccurate as a measure of actual
	// bytes downloaded from peers, mainly because the p2p messages are snappy compressed.
	bdl := 0
	for i := range verified {
		bdl += verified[i].SizeSSZ()
	}
	blockDownloadBytesApprox.Add(float64(bdl))
	log.WithFields(b.logFields()).WithField("bytesDownloaded", bdl).Trace("Blocks downloaded")
	b.blocks = verified

	bscfg := &blobSyncConfig{currentNeeds: w.cfg.currentNeeds, nbv: w.cfg.newVB, store: w.cfg.blobStore}
	bs, err := newBlobSync(current, verified, bscfg)
	if err != nil {
		return b.withRetryableError(err)
	}
	cs, err := newColumnSync(ctx, b, verified, current, w.p2p, w.cfg)
	if err != nil {
		return b.withRetryableError(err)
	}
	b.blobs = bs
	b.columns = cs
	return b.transitionToNext()
}
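
// handleBlobs downloads the blob sidecars needed by the batch from the assigned peer via a
// blobs-by-range request, validating each sidecar as it streams in. If blobs are still missing
// afterwards, the batch is returned with a retryable error.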
func (w *p2pWorker) handleBlobs(ctx context.Context, b batch) batch {
	b.blobs.peer = b.assignedPeer
	start := time.Now()
	// We don't need to use the response for anything other than metrics, because blobResponseValidation
	// adds each sidecar to the batch's AvailabilityStore once it has been checked.
	blobs, err := sync.SendBlobsByRangeRequest(ctx, w.cfg.clock, w.p2p, b.blobs.peer, w.cfg.ctxMap, b.blobRequest(), b.blobs.validateNext, blobValidationMetrics)
	if err != nil {
		b.blobs = nil
		return b.withRetryableError(err)
	}
	dlt := time.Now()
	blobSidecarDownloadMs.Observe(float64(dlt.Sub(start).Milliseconds()))
	if len(blobs) > 0 {
		// All blobs are the same size, so we can compute the size of one and use it for every blob in the batch.
		sz := blobs[0].SizeSSZ() * len(blobs)
		blobSidecarDownloadBytesApprox.Add(float64(sz))
		log.WithFields(b.logFields()).WithField("bytesDownloaded", sz).Debug("Blobs downloaded")
	}
	if b.blobs.needed() > 0 {
		// If we are still missing blobs after processing the blob step, this is an error and we need to scrap the batch and start over.
		b.blobs = nil
		// Periodically wipe the downloaded blocks as well, to avoid getting stuck retrying a bad block batch.
		if b.retries%3 == 0 {
			b.blocks = []blocks.ROBlock{}
		}
		return b.withRetryableError(errors.New("missing blobs after blob download"))
	}
	return b.transitionToNext()
}
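
// handleColumns downloads the data column sidecars needed by the batch from the assigned peer via a
// data-column-sidecars-by-range request, validating columns as they arrive so they are added to the
// batch's availability store.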
func (w *p2pWorker) handleColumns(ctx context.Context, b batch) batch {
	start := time.Now()
	b.columns.peer = b.assignedPeer

	// The bisector keeps track of which peer provided each column, for scoring purposes.
	// When verification of a batch of columns fails, the bisector is used to retry verification with the
	// columns grouped by peer, to figure out whether the failure is due to a specific peer.
	vr, err := b.validatingColumnRequest(b.columns.bisector)
	if err != nil {
		return b.withRetryableError(errors.Wrap(err, "creating validating column request"))
	}
	p := sync.DataColumnSidecarsParams{
		Ctx:    ctx,
		Tor:    w.cfg.clock,
		P2P:    w.p2p,
		CtxMap: w.cfg.ctxMap,
		// DownscorePeerOnRPCFault is very aggressive and is only used for fetching origin blobs during startup.
		DownscorePeerOnRPCFault: false,
		// SendDataColumnSidecarsByRangeRequest uses the DataColumnSidecarsParams struct to cover
		// multiple different use cases, some of which have different required fields. The following fields
		// are not used by the methods that backfill invokes. SendDataColumnSidecarsByRangeRequest should be
		// refactored to only require the minimum set of parameters.
		//RateLimiter *leakybucket.Collector
		//Storage: w.cfg.cfs,
		//NewVerifier: vr.validate,
	}
	// The return value is dropped because the validation code adds the columns
	// to the columnSync AvailabilityStore under the hood.
	_, err = sync.SendDataColumnSidecarsByRangeRequest(p, b.columns.peer, vr.req, vr.validate)
	if err != nil {
		if shouldDownscore(err) {
			w.cfg.downscore(b.columns.peer, "invalid DataColumnSidecar rpc response", err)
		}
		return b.withRetryableError(errors.Wrap(err, "failed to request data column sidecars"))
	}
	dataColumnSidecarDownloadMs.Observe(float64(time.Since(start).Milliseconds()))
	return b.transitionToNext()
}
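
// shouldDownscore reports whether err indicates an invalid response from the peer, as opposed to a
// transient failure, and therefore warrants downscoring.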
func shouldDownscore(err error) bool {
	return errors.Is(err, errInvalidDataColumnResponse) ||
		errors.Is(err, sync.ErrInvalidFetchedData) ||
		errors.Is(err, errInvalidBlocks)
}