Compare commits

..

1 Commits

Author SHA1 Message Date
georgehao
c3ae79bff4 feat(coordinator): fix prover task timeout bug (#774) 2023-08-11 15:29:16 +08:00
3 changed files with 40 additions and 40 deletions

View File

@@ -6,7 +6,7 @@ import (
"strings"
)
var tag = "v4.1.31"
var tag = "v4.1.32"
var commit = func() string {
if info, ok := debug.ReadBuildInfo(); ok {

View File

@@ -66,50 +66,48 @@ func (c *Collector) timeoutProofTask() {
for {
select {
case <-ticker.C:
assignedProverTasks, err := c.proverTaskOrm.GetAssignedProverTasks(c.ctx, 10)
timeout := time.Duration(c.cfg.ProverManager.CollectionTimeSec) * time.Second
assignedProverTasks, err := c.proverTaskOrm.GetTimeoutAssignedProverTasks(c.ctx, 10, timeout)
if err != nil {
log.Error("get unassigned session info failure", "error", err)
break
}
// here not update the block batch proving status failed, because the collector loop will check
// the attempt times. if reach the times, the collector will set the block batch proving status.
for _, assignedProverTask := range assignedProverTasks {
timeoutDuration := time.Duration(c.cfg.ProverManager.CollectionTimeSec) * time.Second
// here not update the block batch proving status failed, because the collector loop will check
// the attempt times. if reach the times, the collector will set the block batch proving status.
if time.Since(assignedProverTask.AssignedAt) >= timeoutDuration {
log.Warn("proof task have reach the timeout", "task id", assignedProverTask.TaskID,
"prover public key", assignedProverTask.ProverPublicKey, "prover name", assignedProverTask.ProverName, "task type", assignedProverTask.TaskType)
err = c.db.Transaction(func(tx *gorm.DB) error {
// update prover task proving status as ProverProofInvalid
if err = c.proverTaskOrm.UpdateProverTaskProvingStatus(c.ctx, message.ProofType(assignedProverTask.TaskType),
assignedProverTask.TaskID, assignedProverTask.ProverPublicKey, types.ProverProofInvalid, tx); err != nil {
log.Error("update prover task proving status failure", "hash", assignedProverTask.TaskID, "pubKey", assignedProverTask.ProverPublicKey, "err", err)
return err
}
// update prover task failure type
if err = c.proverTaskOrm.UpdateProverTaskFailureType(c.ctx, message.ProofType(assignedProverTask.TaskType),
assignedProverTask.TaskID, assignedProverTask.ProverPublicKey, types.ProverTaskFailureTypeTimeout, tx); err != nil {
log.Error("update prover task failure type failure", "hash", assignedProverTask.TaskID, "pubKey", assignedProverTask.ProverPublicKey, "err", err)
return err
}
// update the task to unassigned, let collector restart it
if message.ProofType(assignedProverTask.TaskType) == message.ProofTypeChunk {
if err = c.chunkOrm.UpdateProvingStatus(c.ctx, assignedProverTask.TaskID, types.ProvingTaskUnassigned, tx); err != nil {
log.Error("update chunk proving status to unassigned to restart it failure", "hash", assignedProverTask.TaskID, "err", err)
}
}
if message.ProofType(assignedProverTask.TaskType) == message.ProofTypeBatch {
if err = c.batchOrm.UpdateProvingStatus(c.ctx, assignedProverTask.TaskID, types.ProvingTaskUnassigned, tx); err != nil {
log.Error("update batch proving status to unassigned to restart it failure", "hash", assignedProverTask.TaskID, "err", err)
}
}
return nil
})
if err != nil {
log.Error("check task proof is timeout failure", "error", err)
log.Warn("proof task have reach the timeout", "task id", assignedProverTask.TaskID,
"prover public key", assignedProverTask.ProverPublicKey, "prover name", assignedProverTask.ProverName, "task type", assignedProverTask.TaskType)
err = c.db.Transaction(func(tx *gorm.DB) error {
// update prover task proving status as ProverProofInvalid
if err = c.proverTaskOrm.UpdateProverTaskProvingStatus(c.ctx, message.ProofType(assignedProverTask.TaskType),
assignedProverTask.TaskID, assignedProverTask.ProverPublicKey, types.ProverProofInvalid, tx); err != nil {
log.Error("update prover task proving status failure", "hash", assignedProverTask.TaskID, "pubKey", assignedProverTask.ProverPublicKey, "err", err)
return err
}
// update prover task failure type
if err = c.proverTaskOrm.UpdateProverTaskFailureType(c.ctx, message.ProofType(assignedProverTask.TaskType),
assignedProverTask.TaskID, assignedProverTask.ProverPublicKey, types.ProverTaskFailureTypeTimeout, tx); err != nil {
log.Error("update prover task failure type failure", "hash", assignedProverTask.TaskID, "pubKey", assignedProverTask.ProverPublicKey, "err", err)
return err
}
// update the task to unassigned, let collector restart it
if message.ProofType(assignedProverTask.TaskType) == message.ProofTypeChunk {
if err = c.chunkOrm.UpdateProvingStatus(c.ctx, assignedProverTask.TaskID, types.ProvingTaskUnassigned, tx); err != nil {
log.Error("update chunk proving status to unassigned to restart it failure", "hash", assignedProverTask.TaskID, "err", err)
}
}
if message.ProofType(assignedProverTask.TaskType) == message.ProofTypeBatch {
if err = c.batchOrm.UpdateProvingStatus(c.ctx, assignedProverTask.TaskID, types.ProvingTaskUnassigned, tx); err != nil {
log.Error("update batch proving status to unassigned to restart it failure", "hash", assignedProverTask.TaskID, "err", err)
}
}
return nil
})
if err != nil {
log.Error("check task proof is timeout failure", "error", err)
}
}
case <-c.ctx.Done():

View File

@@ -11,6 +11,7 @@ import (
"scroll-tech/common/types"
"scroll-tech/common/types/message"
"scroll-tech/common/utils"
)
// ProverTask is assigned provers info of chunk/batch proof prover task
@@ -126,11 +127,12 @@ func (o *ProverTask) GetProvingStatusByTaskID(ctx context.Context, taskID string
return types.ProverProveStatus(proverTask.ProvingStatus), nil
}
// GetAssignedProverTasks get the assigned prover task
func (o *ProverTask) GetAssignedProverTasks(ctx context.Context, limit int) ([]ProverTask, error) {
// GetTimeoutAssignedProverTasks get the timeout and assigned proving_status prover task
func (o *ProverTask) GetTimeoutAssignedProverTasks(ctx context.Context, limit int, timeout time.Duration) ([]ProverTask, error) {
db := o.db.WithContext(ctx)
db = db.Model(&ProverTask{})
db = db.Where("proving_status", int(types.ProverAssigned))
db = db.Where("assigned_at < ?", utils.NowUTC().Add(-timeout))
db = db.Limit(limit)
var proverTasks []ProverTask