Add automatic heap dump monitoring based on memory threshold

This commit is contained in:
Manu NALEPA
2026-01-08 00:32:45 +01:00
parent 0fcb922702
commit cc523c370a
4 changed files with 124 additions and 0 deletions

View File

@@ -130,6 +130,7 @@ var appFlags = []cli.Flag{
debug.MemProfileRateFlag,
debug.BlockProfileRateFlag,
debug.MutexProfileFractionFlag,
debug.HeapDumpThresholdFlag,
cmd.LogFileName,
cmd.EnableUPnPFlag,
cmd.ConfigFileFlag,

View File

@@ -229,6 +229,7 @@ var appHelpFlagGroups = []flagGroup{
debug.PProfAddrFlag,
debug.PProfFlag,
debug.PProfPortFlag,
debug.HeapDumpThresholdFlag,
flags.SetGCPercent,
},
},

View File

@@ -18,6 +18,7 @@ go_library(
importpath = "github.com/OffchainLabs/prysm/v7/runtime/debug",
visibility = ["//visibility:public"],
deps = [
"//cmd:go_default_library",
"@com_github_prometheus_client_golang//prometheus:go_default_library",
"@com_github_prometheus_client_golang//prometheus/promauto:go_default_library",
"@com_github_sirupsen_logrus//:go_default_library",

View File

@@ -18,6 +18,7 @@ package debug
import (
"bytes"
"context"
"errors"
"fmt"
"io"
@@ -37,6 +38,7 @@ import (
"sync"
"time"
"github.com/OffchainLabs/prysm/v7/cmd"
log "github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
)
@@ -78,6 +80,11 @@ var (
Name: "blockprofilerate",
Usage: "Turns on block profiling with the given rate.",
}
// HeapDumpThresholdFlag sets the memory threshold in MB for automatic heap dumps.
HeapDumpThresholdFlag = &cli.Uint64Flag{
Name: "heap-dump-threshold-mb",
Usage: "If set to a value > 0, automatically dump heap profile when heap memory exceeds this threshold in MB.",
}
)
// HandlerT implements the debugging API.
@@ -91,6 +98,20 @@ type HandlerT struct {
traceFile string
}
// memoryMonitor tracks heap memory and triggers dumps when threshold exceeded.
type memoryMonitor struct {
ctx context.Context
thresholdBytes uint64
dumpDir string
latestDumpTime time.Time
mu sync.Mutex
}
const (
heapDumpCooldownPeriod = 5 * time.Minute
heapCheckIntervalPeriod = 5 * time.Second
)
// MemStats returns detailed runtime memory statistics.
func (*HandlerT) MemStats() *runtime.MemStats {
s := new(runtime.MemStats)
@@ -105,6 +126,97 @@ func (*HandlerT) GcStats() *debug.GCStats {
return s
}
// startMemoryMonitoring initializes and starts the memory monitoring goroutine.
func startMemoryMonitoring(ctx context.Context, thresholdMB uint64, dumpDir string) error {
// Ensure dump directory exists
if err := os.MkdirAll(dumpDir, 0755); err != nil {
return fmt.Errorf("failed to create heap dump directory: %w", err)
}
monitor := &memoryMonitor{
ctx: ctx,
thresholdBytes: thresholdMB * 1024 * 1024,
dumpDir: dumpDir,
latestDumpTime: time.Time{}, // zero time means never dumped
}
go monitor.monitorLoop()
log.WithFields(log.Fields{
"thresholdMB": thresholdMB,
"cooldownPeriod": heapDumpCooldownPeriod,
"dumpDirectory": dumpDir,
"checkIntervalPeriod": heapCheckIntervalPeriod,
}).Info("Started heap dump monitoring")
return nil
}
// monitorLoop periodically checks heap memory and triggers dumps.
func (m *memoryMonitor) monitorLoop() {
ticker := time.NewTicker(heapCheckIntervalPeriod)
for {
select {
case <-m.ctx.Done():
log.Debug("Context done, stopping heap memory monitoring")
return
case <-ticker.C:
m.checkAndDumpIfNeeded()
}
}
}
// checkAndDumpIfNeeded checks current heap size and dumps if threshold exceeded.
func (m *memoryMonitor) checkAndDumpIfNeeded() {
var memStats runtime.MemStats
runtime.ReadMemStats(&memStats)
heapInUseMB := memStats.HeapInuse / (1024 * 1024)
heapAllocMB := memStats.HeapAlloc / (1024 * 1024)
// Use HeapInuse (currently allocated from OS) as the threshold metric
// This is more stable than HeapAlloc which fluctuates with GC
if memStats.HeapInuse < m.thresholdBytes {
return
}
m.mu.Lock()
defer m.mu.Unlock()
// Check cooldown period
if time.Since(m.latestDumpTime) < heapDumpCooldownPeriod {
log.WithFields(log.Fields{
"heapInUseMB": heapInUseMB,
"heapAllocMB": heapAllocMB,
"thresholdMB": m.thresholdBytes / (1024 * 1024),
"timeSinceLatestDump": time.Since(m.latestDumpTime).String(),
"cooldown": heapDumpCooldownPeriod,
}).Debug("Heap threshold exceeded but in cooldown period")
return
}
// Create timestamped filename
timestamp := time.Now().Format("20060102-150405")
filename := fmt.Sprintf("heap-%s-%dmb.prof", timestamp, heapInUseMB)
filepath := filepath.Join(m.dumpDir, filename)
log.WithFields(log.Fields{
"heapInUseMB": heapInUseMB,
"heapAllocMB": heapAllocMB,
"thresholdMB": m.thresholdBytes / (1024 * 1024),
"file": filepath,
}).Warning("Heap memory threshold exceeded, dumping heap profile")
// Use existing WriteMemProfile infrastructure
if err := Handler.WriteMemProfile(filepath); err != nil {
log.WithError(err).Error("Failed to write automatic heap dump")
return
}
m.latestDumpTime = time.Now()
}
// CPUProfile turns on CPU profiling for nsec seconds and writes
// profile data to file.
func (h *HandlerT) CPUProfile(file string, nsec uint) error {
@@ -323,6 +435,15 @@ func Setup(ctx *cli.Context) error {
address := fmt.Sprintf("%s:%d", ctx.String(PProfAddrFlag.Name), ctx.Int(PProfPortFlag.Name))
startPProf(address)
}
// automatic heap dump monitoring
thresholdMB := ctx.Uint64(HeapDumpThresholdFlag.Name)
if thresholdMB > 0 {
dataDir := ctx.String(cmd.DataDirFlag.Name)
dumpDir := filepath.Join(dataDir, "heapdumps")
if err := startMemoryMonitoring(ctx.Context, thresholdMB, dumpDir); err != nil {
return fmt.Errorf("failed to start memory monitoring: %w", err)
}
}
return nil
}