mirror of
https://github.com/OffchainLabs/prysm.git
synced 2026-01-10 07:58:22 -05:00
Add automatic heap dump monitoring based on memory threshold
This commit is contained in:
@@ -130,6 +130,7 @@ var appFlags = []cli.Flag{
|
||||
debug.MemProfileRateFlag,
|
||||
debug.BlockProfileRateFlag,
|
||||
debug.MutexProfileFractionFlag,
|
||||
debug.HeapDumpThresholdFlag,
|
||||
cmd.LogFileName,
|
||||
cmd.EnableUPnPFlag,
|
||||
cmd.ConfigFileFlag,
|
||||
|
||||
@@ -229,6 +229,7 @@ var appHelpFlagGroups = []flagGroup{
|
||||
debug.PProfAddrFlag,
|
||||
debug.PProfFlag,
|
||||
debug.PProfPortFlag,
|
||||
debug.HeapDumpThresholdFlag,
|
||||
flags.SetGCPercent,
|
||||
},
|
||||
},
|
||||
|
||||
@@ -18,6 +18,7 @@ go_library(
|
||||
importpath = "github.com/OffchainLabs/prysm/v7/runtime/debug",
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
"//cmd:go_default_library",
|
||||
"@com_github_prometheus_client_golang//prometheus:go_default_library",
|
||||
"@com_github_prometheus_client_golang//prometheus/promauto:go_default_library",
|
||||
"@com_github_sirupsen_logrus//:go_default_library",
|
||||
|
||||
@@ -18,6 +18,7 @@ package debug
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -37,6 +38,7 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/OffchainLabs/prysm/v7/cmd"
|
||||
log "github.com/sirupsen/logrus"
|
||||
"github.com/urfave/cli/v2"
|
||||
)
|
||||
@@ -78,6 +80,11 @@ var (
|
||||
Name: "blockprofilerate",
|
||||
Usage: "Turns on block profiling with the given rate.",
|
||||
}
|
||||
// HeapDumpThresholdFlag sets the memory threshold in MB for automatic heap dumps.
|
||||
HeapDumpThresholdFlag = &cli.Uint64Flag{
|
||||
Name: "heap-dump-threshold-mb",
|
||||
Usage: "If set to a value > 0, automatically dump heap profile when heap memory exceeds this threshold in MB.",
|
||||
}
|
||||
)
|
||||
|
||||
// HandlerT implements the debugging API.
|
||||
@@ -91,6 +98,20 @@ type HandlerT struct {
|
||||
traceFile string
|
||||
}
|
||||
|
||||
// memoryMonitor tracks heap memory and triggers dumps when threshold exceeded.
|
||||
type memoryMonitor struct {
|
||||
ctx context.Context
|
||||
thresholdBytes uint64
|
||||
dumpDir string
|
||||
latestDumpTime time.Time
|
||||
mu sync.Mutex
|
||||
}
|
||||
|
||||
const (
|
||||
heapDumpCooldownPeriod = 5 * time.Minute
|
||||
heapCheckIntervalPeriod = 5 * time.Second
|
||||
)
|
||||
|
||||
// MemStats returns detailed runtime memory statistics.
|
||||
func (*HandlerT) MemStats() *runtime.MemStats {
|
||||
s := new(runtime.MemStats)
|
||||
@@ -105,6 +126,97 @@ func (*HandlerT) GcStats() *debug.GCStats {
|
||||
return s
|
||||
}
|
||||
|
||||
// startMemoryMonitoring initializes and starts the memory monitoring goroutine.
|
||||
func startMemoryMonitoring(ctx context.Context, thresholdMB uint64, dumpDir string) error {
|
||||
// Ensure dump directory exists
|
||||
if err := os.MkdirAll(dumpDir, 0755); err != nil {
|
||||
return fmt.Errorf("failed to create heap dump directory: %w", err)
|
||||
}
|
||||
|
||||
monitor := &memoryMonitor{
|
||||
ctx: ctx,
|
||||
thresholdBytes: thresholdMB * 1024 * 1024,
|
||||
dumpDir: dumpDir,
|
||||
latestDumpTime: time.Time{}, // zero time means never dumped
|
||||
}
|
||||
|
||||
go monitor.monitorLoop()
|
||||
|
||||
log.WithFields(log.Fields{
|
||||
"thresholdMB": thresholdMB,
|
||||
"cooldownPeriod": heapDumpCooldownPeriod,
|
||||
"dumpDirectory": dumpDir,
|
||||
"checkIntervalPeriod": heapCheckIntervalPeriod,
|
||||
}).Info("Started heap dump monitoring")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// monitorLoop periodically checks heap memory and triggers dumps.
|
||||
func (m *memoryMonitor) monitorLoop() {
|
||||
ticker := time.NewTicker(heapCheckIntervalPeriod)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-m.ctx.Done():
|
||||
log.Debug("Context done, stopping heap memory monitoring")
|
||||
return
|
||||
case <-ticker.C:
|
||||
m.checkAndDumpIfNeeded()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// checkAndDumpIfNeeded checks current heap size and dumps if threshold exceeded.
|
||||
func (m *memoryMonitor) checkAndDumpIfNeeded() {
|
||||
var memStats runtime.MemStats
|
||||
runtime.ReadMemStats(&memStats)
|
||||
|
||||
heapInUseMB := memStats.HeapInuse / (1024 * 1024)
|
||||
heapAllocMB := memStats.HeapAlloc / (1024 * 1024)
|
||||
|
||||
// Use HeapInuse (currently allocated from OS) as the threshold metric
|
||||
// This is more stable than HeapAlloc which fluctuates with GC
|
||||
if memStats.HeapInuse < m.thresholdBytes {
|
||||
return
|
||||
}
|
||||
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
// Check cooldown period
|
||||
if time.Since(m.latestDumpTime) < heapDumpCooldownPeriod {
|
||||
log.WithFields(log.Fields{
|
||||
"heapInUseMB": heapInUseMB,
|
||||
"heapAllocMB": heapAllocMB,
|
||||
"thresholdMB": m.thresholdBytes / (1024 * 1024),
|
||||
"timeSinceLatestDump": time.Since(m.latestDumpTime).String(),
|
||||
"cooldown": heapDumpCooldownPeriod,
|
||||
}).Debug("Heap threshold exceeded but in cooldown period")
|
||||
return
|
||||
}
|
||||
|
||||
// Create timestamped filename
|
||||
timestamp := time.Now().Format("20060102-150405")
|
||||
filename := fmt.Sprintf("heap-%s-%dmb.prof", timestamp, heapInUseMB)
|
||||
filepath := filepath.Join(m.dumpDir, filename)
|
||||
|
||||
log.WithFields(log.Fields{
|
||||
"heapInUseMB": heapInUseMB,
|
||||
"heapAllocMB": heapAllocMB,
|
||||
"thresholdMB": m.thresholdBytes / (1024 * 1024),
|
||||
"file": filepath,
|
||||
}).Warning("Heap memory threshold exceeded, dumping heap profile")
|
||||
|
||||
// Use existing WriteMemProfile infrastructure
|
||||
if err := Handler.WriteMemProfile(filepath); err != nil {
|
||||
log.WithError(err).Error("Failed to write automatic heap dump")
|
||||
return
|
||||
}
|
||||
|
||||
m.latestDumpTime = time.Now()
|
||||
}
|
||||
|
||||
// CPUProfile turns on CPU profiling for nsec seconds and writes
|
||||
// profile data to file.
|
||||
func (h *HandlerT) CPUProfile(file string, nsec uint) error {
|
||||
@@ -323,6 +435,15 @@ func Setup(ctx *cli.Context) error {
|
||||
address := fmt.Sprintf("%s:%d", ctx.String(PProfAddrFlag.Name), ctx.Int(PProfPortFlag.Name))
|
||||
startPProf(address)
|
||||
}
|
||||
// automatic heap dump monitoring
|
||||
thresholdMB := ctx.Uint64(HeapDumpThresholdFlag.Name)
|
||||
if thresholdMB > 0 {
|
||||
dataDir := ctx.String(cmd.DataDirFlag.Name)
|
||||
dumpDir := filepath.Join(dataDir, "heapdumps")
|
||||
if err := startMemoryMonitoring(ctx.Context, thresholdMB, dumpDir); err != nil {
|
||||
return fmt.Errorf("failed to start memory monitoring: %w", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user