mirror of
https://github.com/vacp2p/linea-monorepo.git
synced 2026-01-09 15:38:06 -05:00
Prover: the controller skips waiting for the sub-process to yield on a SIGTERM in spot-instance mode (#1049)
* fix(controller): immediately exits when receiving the SIGTERM without waiting for the kill to complete * feat(tmp-response): also remove the tmp response file in case it exists * minor(execution): adds a comment explaining why we can safely close
This commit is contained in:
@@ -50,7 +50,12 @@ func runController(ctx context.Context, cfg *config.Config) {
|
|||||||
// SIGTERM is received, there would be no log entry about the signal
|
// SIGTERM is received, there would be no log entry about the signal
|
||||||
// until the proof completes.
|
// until the proof completes.
|
||||||
<-ctx.Done()
|
<-ctx.Done()
|
||||||
cLog.Infoln("Received cancellation request, will exit as soon as possible or once current proof task is complete.")
|
|
||||||
|
if cfg.Controller.SpotInstanceMode {
|
||||||
|
cLog.Infoln("Received cancellation request. Killing the ongoing process and exiting immediately after.")
|
||||||
|
} else {
|
||||||
|
cLog.Infoln("Received cancellation request, will exit as soon as possible or once current proof task is complete.")
|
||||||
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
for {
|
for {
|
||||||
@@ -194,6 +199,13 @@ func runController(ctx context.Context, cfg *config.Config) {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// As an edge-case, it's possible (in theory) that the process
|
||||||
|
// completes exactly when we receive the kill signal. So we
|
||||||
|
// could end up in a situation where the tmp-response file
|
||||||
|
// exists. In that case, we simply delete it before exiting to
|
||||||
|
// keep the FS clean.
|
||||||
|
os.Remove(job.TmpResponseFile(cfg))
|
||||||
|
|
||||||
// Failure case
|
// Failure case
|
||||||
default:
|
default:
|
||||||
// Move the inprogress to the done directory
|
// Move the inprogress to the done directory
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strings"
|
"strings"
|
||||||
"sync/atomic"
|
|
||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -231,55 +230,65 @@ func runCmd(ctx context.Context, cmd string, job *Job, retry bool) Status {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// The motivation for this goroutine is to early-exit the process via a
|
done := make(chan Status)
|
||||||
// kill if the context is cancelled. KilledByUs is set if the goroutine
|
|
||||||
// detects that the context has been cancelled. And the done channel
|
|
||||||
// indicates that the process has finished.
|
|
||||||
var (
|
|
||||||
killedByUs = &atomic.Bool{}
|
|
||||||
done = make(chan struct{})
|
|
||||||
)
|
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
// Since the channel is used for sending only once and only in this
|
||||||
_ = curProcess.Kill()
|
// goroutine, we can safely close it.
|
||||||
killedByUs.Store(true)
|
defer close(done)
|
||||||
case <-done:
|
|
||||||
|
// Lock on the process until it finishes
|
||||||
|
pstate, err := curProcess.Wait()
|
||||||
|
if err != nil {
|
||||||
|
// Here it means, the "os" package could not "wait" the process. It
|
||||||
|
// can happen for many different reasons essentially pertaining to
|
||||||
|
// the initialization of the process. It may be that some of theses
|
||||||
|
// errors are retryables but it remains to see which one. Until then
|
||||||
|
// we exited with a fatal code and the files will need to be
|
||||||
|
// manually reprocessed.
|
||||||
|
logrus.Errorf("unexpected : got an error trying to lock on %v : %v", pname, err)
|
||||||
|
done <- Status{
|
||||||
|
ExitCode: CodeFatal,
|
||||||
|
What: "got an error waiting for the process",
|
||||||
|
Err: err,
|
||||||
|
}
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
processingTime := time.Since(startTime)
|
||||||
|
exitCode, err := unixExitCode(pstate)
|
||||||
|
if err != nil {
|
||||||
|
// NB: this would be only possible if we did not wait for the process
|
||||||
|
// to finish trying to read the exit code.
|
||||||
|
utils.Panic("unexpectedly got an error while trying to read the exit code of the process: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
logrus.Infof(
|
||||||
|
"The processing of file `%s` (process=%v) took %v seconds to complete and returned exit code %v",
|
||||||
|
job.OriginalFile, pname, processingTime.Seconds(), exitCode,
|
||||||
|
)
|
||||||
|
|
||||||
|
// Build the response status
|
||||||
|
status := Status{ExitCode: exitCode}
|
||||||
|
switch status.ExitCode {
|
||||||
|
case CodeSuccess:
|
||||||
|
status.What = "success"
|
||||||
|
case CodeOom:
|
||||||
|
status.What = "out of memory error"
|
||||||
|
case CodeTraceLimit:
|
||||||
|
status.What = "trace limit overflow"
|
||||||
|
}
|
||||||
|
|
||||||
|
metrics.CollectPostProcess(job.Def.Name, status.ExitCode, processingTime, retry)
|
||||||
|
|
||||||
|
done <- status
|
||||||
}()
|
}()
|
||||||
|
|
||||||
// Lock on the process until it finishes
|
select {
|
||||||
pstate, err := curProcess.Wait()
|
case <-ctx.Done():
|
||||||
if err != nil {
|
logrus.Infof("The process %v is being killed by the controller", pname)
|
||||||
// Here it means, the "os" package could not "wait" the process. It
|
_ = curProcess.Kill()
|
||||||
// can happen for many different reasons essentially pertaining to
|
|
||||||
// the initialization of the process. It may be that some of theses
|
|
||||||
// errors are retryables but it remains to see which one. Until then
|
|
||||||
// we exited with a fatal code and the files will need to be
|
|
||||||
// manually reprocessed.
|
|
||||||
logrus.Errorf("unexpected : got an error trying to lock on %v : %v", pname, err)
|
|
||||||
return Status{
|
|
||||||
ExitCode: CodeFatal,
|
|
||||||
What: "got an error waiting for the process",
|
|
||||||
Err: err,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
processingTime := time.Since(startTime)
|
|
||||||
exitCode, err := unixExitCode(pstate)
|
|
||||||
if err != nil {
|
|
||||||
// NB: this would be only possible if we did not wait for the process
|
|
||||||
// to finish trying to read the exit code.
|
|
||||||
utils.Panic("unexpectedly got an error while trying to read the exit code of the process: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Closing the done channel tells the above goroutine to stop waiting for
|
|
||||||
// a context cancellation.
|
|
||||||
close(done)
|
|
||||||
|
|
||||||
if killedByUs.Load() {
|
|
||||||
logrus.Infof("The process %v was killed by the controller", pname)
|
|
||||||
|
|
||||||
// Not that we exit without providing post-execution metrics to
|
// Not that we exit without providing post-execution metrics to
|
||||||
// prometheus as these would not be relevant anyway.
|
// prometheus as these would not be relevant anyway.
|
||||||
@@ -287,27 +296,9 @@ func runCmd(ctx context.Context, cmd string, job *Job, retry bool) Status {
|
|||||||
ExitCode: CodeKilledByUs,
|
ExitCode: CodeKilledByUs,
|
||||||
What: "the process was killed by the controller",
|
What: "the process was killed by the controller",
|
||||||
}
|
}
|
||||||
|
case status := <-done:
|
||||||
|
return status
|
||||||
}
|
}
|
||||||
|
|
||||||
logrus.Infof(
|
|
||||||
"The processing of file `%s` (process=%v) took %v seconds to complete and returned exit code %v",
|
|
||||||
job.OriginalFile, pname, processingTime.Seconds(), exitCode,
|
|
||||||
)
|
|
||||||
|
|
||||||
// Build the response status
|
|
||||||
status := Status{ExitCode: exitCode}
|
|
||||||
switch status.ExitCode {
|
|
||||||
case CodeSuccess:
|
|
||||||
status.What = "success"
|
|
||||||
case CodeOom:
|
|
||||||
status.What = "out of memory error"
|
|
||||||
case CodeTraceLimit:
|
|
||||||
status.What = "trace limit overflow"
|
|
||||||
}
|
|
||||||
|
|
||||||
metrics.CollectPostProcess(job.Def.Name, status.ExitCode, processingTime, retry)
|
|
||||||
|
|
||||||
return status
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns a human-readable process name. The process name is formatted as in
|
// Returns a human-readable process name. The process name is formatted as in
|
||||||
|
|||||||
Reference in New Issue
Block a user