Compare commits

...

3 Commits

Author   SHA1         Message                                                    Date
satushh  94e5b3e805   corrected logic                                            2025-11-20 20:28:49 -03:00
satushh  c0b8d9ca52   changelog                                                  2025-11-20 19:27:28 -03:00
satushh  b1eeb1b1f1   simple grace period (untested in kurtosis) + cgc metrics   2025-11-20 19:25:50 -03:00
4 changed files with 112 additions and 1 deletion

View File

@@ -28,6 +28,7 @@ func (s *Service) maintainCustodyInfo() {
func (s *Service) updateCustodyInfoIfNeeded() error {
	const minimumPeerCount = 1
	const gracePeriodSeconds = 300 // Grace period applied before CGC increases take effect, in seconds.

	// Get our actual custody group count.
	actualCustodyGroupCount, err := s.cfg.p2p.CustodyGroupCount(s.ctx)
@@ -35,12 +36,101 @@ func (s *Service) updateCustodyInfoIfNeeded() error {
		return errors.Wrap(err, "p2p custody group count")
	}

	// Update the P2P custody group count metric.
	custodyGroupCountP2P.Set(float64(actualCustodyGroupCount))

	// Get our target custody group count.
	targetCustodyGroupCount, err := s.custodyGroupCount(s.ctx)
	if err != nil {
		return errors.Wrap(err, "custody group count")
	}

	// Handle pending CGC changes with the grace period.
	s.pendingCGCLock.Lock()
	now := time.Now()

	switch {
	case s.pendingCGC > 0 && now.After(s.pendingCGCDeadline):
		// Grace period expired - check whether the pending change is still valid.
		targetToApply := s.pendingCGC
		s.pendingCGC = 0 // Clear the pending change.
		s.pendingCGCDeadline = time.Time{}
		s.pendingCGCLock.Unlock()

		// Only apply the pending change if the current target still justifies it.
		// This prevents applying stale increases when validators have been removed
		// or the configuration has changed during the grace period.
		if targetToApply <= targetCustodyGroupCount {
			// The pending value is still valid (at or below the current target):
			// the network still wants at least that many groups.
			// Use the current target to allow for any increases that happened during the grace period.
			if targetCustodyGroupCount > actualCustodyGroupCount {
				log.WithFields(logrus.Fields{
					"previousCGC": actualCustodyGroupCount,
					"newCGC":      targetCustodyGroupCount,
					"pendingCGC":  targetToApply,
				}).Info("Applying custody group count increase after grace period")
			}
		} else {
			// The pending value is higher than the current target - drop it as stale.
			log.WithFields(logrus.Fields{
				"currentCGC":      actualCustodyGroupCount,
				"targetCGC":       targetCustodyGroupCount,
				"stalePendingCGC": targetToApply,
			}).Info("Dropping stale pending CGC increase as target has decreased")

			// Still check whether the current target needs an increase (with a new grace period).
			if targetCustodyGroupCount > actualCustodyGroupCount {
				// Re-schedule with the current target and a new grace period.
				s.pendingCGCLock.Lock()
				s.pendingCGC = targetCustodyGroupCount
				s.pendingCGCDeadline = now.Add(time.Duration(gracePeriodSeconds) * time.Second)
				s.pendingCGCLock.Unlock()

				log.WithFields(logrus.Fields{
					"currentCGC":  actualCustodyGroupCount,
					"targetCGC":   targetCustodyGroupCount,
					"gracePeriod": gracePeriodSeconds,
				}).Info("Re-scheduling CGC increase with updated target")

				return nil
			}
		}
	case s.pendingCGC > 0 && !now.After(s.pendingCGCDeadline):
		// A pending change exists but the grace period has not expired - do nothing.
		pending := s.pendingCGC
		timeRemaining := s.pendingCGCDeadline.Sub(now).Seconds()
		s.pendingCGCLock.Unlock()

		log.WithFields(logrus.Fields{
			"pendingCGC":    pending,
			"timeRemaining": timeRemaining,
		}).Debug("Grace period still active, skipping CGC update")

		return nil
	default:
		// No pending change: check whether we need to schedule one.
		if targetCustodyGroupCount > actualCustodyGroupCount {
			// Schedule the increase with a grace period.
			s.pendingCGC = targetCustodyGroupCount
			s.pendingCGCDeadline = now.Add(time.Duration(gracePeriodSeconds) * time.Second)
			s.pendingCGCLock.Unlock()

			log.WithFields(logrus.Fields{
				"currentCGC":    actualCustodyGroupCount,
				"targetCGC":     targetCustodyGroupCount,
				"gracePeriod":   gracePeriodSeconds,
				"effectiveTime": s.pendingCGCDeadline.Format(time.RFC3339),
			}).Info("Scheduling custody group count increase with grace period")

			return nil
		}

		// No change needed.
		s.pendingCGCLock.Unlock()
	}

	// If the actual custody group count is already at or above the target, skip the update.
	if actualCustodyGroupCount >= targetCustodyGroupCount {
		return nil
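To make the three branches above easier to follow outside the surrounding service plumbing, here is a minimal, self-contained sketch of the same decision as a pure function. Every name in it (`pendingState`, `decideCGC`, the `cgcAction` constants) is hypothetical; the PR keeps this logic inline in `updateCustodyInfoIfNeeded` and guards the state with `pendingCGCLock`.

```go
package main

import (
	"fmt"
	"time"
)

// pendingState mirrors the pendingCGC / pendingCGCDeadline pair from the diff.
type pendingState struct {
	cgc      uint64
	deadline time.Time
}

type cgcAction int

const (
	applyNow cgcAction = iota // grace period elapsed and the increase is still wanted
	waiting                   // a pending increase exists but its deadline has not passed
	schedule                  // an increase was (re-)armed with a fresh grace period
	nothing                   // nothing to do
)

// decideCGC reproduces the three-way switch from the diff as a pure function.
func decideCGC(p *pendingState, actual, target uint64, now time.Time, grace time.Duration) cgcAction {
	switch {
	case p.cgc > 0 && now.After(p.deadline):
		stillValid := p.cgc <= target // the pending value is stale if the target shrank below it
		p.cgc, p.deadline = 0, time.Time{}
		if stillValid && target > actual {
			return applyNow
		}
		if !stillValid && target > actual {
			// Stale pending value, but an increase is still needed: restart the grace period.
			p.cgc, p.deadline = target, now.Add(grace)
			return schedule
		}
		return nothing
	case p.cgc > 0:
		return waiting
	default:
		if target > actual {
			p.cgc, p.deadline = target, now.Add(grace)
			return schedule
		}
		return nothing
	}
}

func main() {
	p := &pendingState{}
	fmt.Println(decideCGC(p, 4, 8, time.Now(), 5*time.Minute) == schedule) // true: the increase is deferred
}
```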
@@ -83,7 +173,7 @@ func (s *Service) updateCustodyInfoIfNeeded() error {
	// Update the p2p earliest available slot metric.
	earliestAvailableSlotP2P.Set(float64(storedEarliestSlot))

	dbEarliestSlot, _, err := s.cfg.beaconDB.UpdateCustodyInfo(s.ctx, storedEarliestSlot, storedGroupCount)
	dbEarliestSlot, dbStoredGroupCount, err := s.cfg.beaconDB.UpdateCustodyInfo(s.ctx, storedEarliestSlot, storedGroupCount)
	if err != nil {
		return errors.Wrap(err, "beacon db update custody info")
	}
@@ -91,6 +181,10 @@ func (s *Service) updateCustodyInfoIfNeeded() error {
	// Update the DB earliest available slot metric.
	earliestAvailableSlotDB.Set(float64(dbEarliestSlot))

	// Update both custody group count metrics with their respective values.
	custodyGroupCountP2P.Set(float64(storedGroupCount))
	custodyGroupCountDB.Set(float64(dbStoredGroupCount))

	return nil
}
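If the grace-period logic were factored out as in the sketch after the previous hunk, its corner cases (fresh scheduling, waiting, expiry, and a stale pending value) could be covered directly. The table-driven check below assumes that hypothetical `decideCGC` helper and its types live in the same package; none of it is part of the PR.

```go
package main

import (
	"testing"
	"time"
)

// TestDecideCGC exercises the hypothetical decideCGC helper sketched above.
func TestDecideCGC(t *testing.T) {
	now := time.Now()
	grace := 5 * time.Minute
	p := &pendingState{}

	if got := decideCGC(p, 4, 8, now, grace); got != schedule {
		t.Fatalf("first sight of a higher target: got %v, want schedule", got)
	}
	if got := decideCGC(p, 4, 8, now.Add(time.Minute), grace); got != waiting {
		t.Fatalf("inside the grace window: got %v, want waiting", got)
	}
	if got := decideCGC(p, 4, 8, now.Add(6*time.Minute), grace); got != applyNow {
		t.Fatalf("after the grace window: got %v, want applyNow", got)
	}

	// A pending value above the current target is dropped as stale and re-armed.
	p = &pendingState{cgc: 16, deadline: now}
	if got := decideCGC(p, 4, 8, now.Add(time.Second), grace); got != schedule {
		t.Fatalf("stale pending value: got %v, want schedule (re-armed with the new target)", got)
	}
}
```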

View File

@@ -241,6 +241,16 @@ var (
Name: "custody_earliest_available_slot_db",
Help: "The earliest available slot tracked by the database for custody purposes",
})
// Custody group count metrics - separate for P2P and DB views
custodyGroupCountP2P = promauto.NewGauge(prometheus.GaugeOpts{
Name: "beacon_custody_group_count_p2p",
Help: "Current custody group count (CGC) from P2P layer",
})
custodyGroupCountDB = promauto.NewGauge(prometheus.GaugeOpts{
Name: "beacon_custody_group_count_db",
Help: "Current custody group count (CGC) stored in database",
})
)
func (s *Service) updateMetrics() {
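Both new gauges are registered through promauto, so they attach to the default Prometheus registry and only need a /metrics handler to be scraped. A minimal, self-contained sketch of that pattern follows; the `example_` metric names and the port are placeholders, not values used by the PR.

```go
package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

var (
	cgcP2P = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "example_custody_group_count_p2p",
		Help: "CGC as seen by the P2P layer (illustrative stand-in).",
	})
	cgcDB = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "example_custody_group_count_db",
		Help: "CGC persisted in the database (illustrative stand-in).",
	})
)

func main() {
	// During the grace period the two views can legitimately diverge:
	// the P2P target has risen, but nothing has been persisted yet.
	cgcP2P.Set(8)
	cgcDB.Set(4)

	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":2112", nil))
}
```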

View File

@@ -182,6 +182,10 @@ type Service struct {
	dataColumnLogCh     chan dataColumnLogEntry
	digestActions       perDigestSet
	subscriptionSpawner func(func()) // see Service.spawn for details

	// Grace period fields for CGC changes.
	pendingCGC         uint64
	pendingCGCDeadline time.Time
	pendingCGCLock     sync.RWMutex
}
// NewService initializes new regular sync service.
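As a standalone illustration of the new fields, the sketch below wraps the same pending state in its own type; the type and method names are hypothetical. Note that the diff only ever takes the write lock, so a plain sync.Mutex would also suffice; sync.RWMutex simply leaves room for read-only callers such as metrics or logging.

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// pendingCGCState is an illustrative standalone version of the three fields
// added to Service; the real code keeps them inline on the sync Service.
type pendingCGCState struct {
	mu       sync.RWMutex
	cgc      uint64
	deadline time.Time
}

// schedule arms a pending increase with a fresh grace period.
func (p *pendingCGCState) schedule(target uint64, grace time.Duration) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.cgc = target
	p.deadline = time.Now().Add(grace)
}

// snapshot is a read-only view, suitable for logging or metrics.
func (p *pendingCGCState) snapshot() (uint64, time.Time) {
	p.mu.RLock()
	defer p.mu.RUnlock()
	return p.cgc, p.deadline
}

func main() {
	var p pendingCGCState
	p.schedule(8, 5*time.Minute)
	cgc, deadline := p.snapshot()
	fmt.Printf("pending CGC %d until %s\n", cgc, deadline.Format(time.RFC3339))
}
```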

View File

@@ -0,0 +1,3 @@
### Added

- Grace period for custody group count (CGC) increases before they are applied.