Fix race on ReceibeBlock (#15565)

* Fix race on ReceibeBlock

In the event two routines for `ReceiveBlock` are triggered with the same
block (it may happen if one routine is triggered over gossip and the
other in init-sync) it may happen that the second routine believes it's
syncing the block for the first time. This is because the cache on
`blocksBeingSynced` is not checked to be set and the block may still not
be put in forkchoice by the first routine.

In the normal case this would not cause any trouble as the second
forkchoice insertion is a noop by design. However, if the second routine
times out or has any error in processing (for example the engine will
return an error if we try to send FCU to an older head) then the second
routine will attempt to remove the inserted block from forkchoice and
this bricks the node since forkchoice refuses to remove a valid node,
the root is removed inconditionally from db and the node ends up with a
root that is not in db and remains in forkchoice.

This PR just prevents the race.

As a followup perhaps we can gate the rollback function from db to nodes
that are effectively not in forkchoice, alternatively, force removal
from forkchoice when rolling back from db (although this version is
complicated due to possible accounting issues on forkchoice).

* Fix lint
This commit is contained in:
Potuz
2025-08-06 14:18:13 -03:00
committed by GitHub
parent 2a09c9f681
commit 0f6070a866
5 changed files with 20 additions and 3 deletions

View File

@@ -7,10 +7,15 @@ type currentlySyncingBlock struct {
roots map[[32]byte]struct{}
}
func (b *currentlySyncingBlock) set(root [32]byte) {
func (b *currentlySyncingBlock) set(root [32]byte) error {
b.Lock()
defer b.Unlock()
_, ok := b.roots[root]
if ok {
return errBlockBeingSynced
}
b.roots[root] = struct{}{}
return nil
}
func (b *currentlySyncingBlock) unset(root [32]byte) {

View File

@@ -44,6 +44,8 @@ var (
errMaxBlobsExceeded = verification.AsVerificationFailure(errors.New("expected commitments in block exceeds MAX_BLOBS_PER_BLOCK"))
// errMaxDataColumnsExceeded is returned when the number of data columns exceeds the maximum allowed.
errMaxDataColumnsExceeded = verification.AsVerificationFailure(errors.New("expected data columns for node exceeds NUMBER_OF_COLUMNS"))
// errBlockBeingSynced is returned when a block is being synced.
errBlockBeingSynced = errors.New("block is being synced")
)
// An invalid block is the block that fails state transition based on the core protocol rules.

View File

@@ -84,7 +84,11 @@ func (s *Service) ReceiveBlock(ctx context.Context, block interfaces.ReadOnlySig
}
receivedTime := time.Now()
s.blockBeingSynced.set(blockRoot)
err := s.blockBeingSynced.set(blockRoot)
if errors.Is(err, errBlockBeingSynced) {
log.WithField("blockRoot", fmt.Sprintf("%#x", blockRoot)).Debug("Ignoring block currently being synced")
return nil
}
defer s.blockBeingSynced.unset(blockRoot)
blockCopy, err := block.Copy()

View File

@@ -311,7 +311,10 @@ func TestService_HasBlock(t *testing.T) {
r, err = b.Block.HashTreeRoot()
require.NoError(t, err)
require.Equal(t, true, s.HasBlock(t.Context(), r))
s.blockBeingSynced.set(r)
err = s.blockBeingSynced.set(r)
require.NoError(t, err)
err = s.blockBeingSynced.set(r)
require.ErrorIs(t, err, errBlockBeingSynced)
require.Equal(t, false, s.HasBlock(t.Context(), r))
}

View File

@@ -0,0 +1,3 @@
### Fixed
- Prevent a race on double `ReceiveBlock`.