diff --git a/common/version/version.go b/common/version/version.go index 886e0ea98..346a4436a 100644 --- a/common/version/version.go +++ b/common/version/version.go @@ -6,7 +6,7 @@ import ( "strings" ) -var tag = "v4.1.83" +var tag = "v4.1.84" var commit = func() string { if info, ok := debug.ReadBuildInfo(); ok { diff --git a/coordinator/internal/logic/submitproof/proof_receiver.go b/coordinator/internal/logic/submitproof/proof_receiver.go index 590aa3ba4..0ab2000d1 100644 --- a/coordinator/internal/logic/submitproof/proof_receiver.go +++ b/coordinator/internal/logic/submitproof/proof_receiver.go @@ -5,6 +5,7 @@ import ( "encoding/json" "errors" "fmt" + "strings" "time" "github.com/gin-gonic/gin" @@ -231,12 +232,19 @@ func (m *ProofReceiverLogic) validator(ctx context.Context, proverTask *orm.Prov proofTimeSec := uint64(proofTime.Seconds()) if proofMsg.Status != message.StatusOk { - m.proofRecover(ctx, proofMsg.ID, pk, proofMsg) + // Temporarily replace "panic" with "pa-nic" to prevent triggering the alert based on logs. + failureMsg := strings.Replace(proofParameter.FailureMsg, "panic", "pa-nic", -1) + // Verify if the proving task has already been assigned to another prover. + // Upon receiving an error message, it's possible the proving status has been reset by another prover + // and the task has been reassigned. In this case, the coordinator should avoid resetting the proving status. + m.processProverErr(ctx, proofMsg.ID, pk, proofMsg.Type) + m.validateFailureProverTaskStatusNotOk.Inc() + log.Info("proof generated by prover failed", - "taskType", proofMsg.Type, "hash", proofMsg.ID, - "proverName", proverTask.ProverName, "proverVersion", proverTask.ProverVersion, - "proverPublicKey", pk, "failureType", proofParameter.FailureType, "failureMessage", proofParameter.FailureMsg) + "taskType", proofMsg.Type, "hash", proofMsg.ID, "proverName", proverTask.ProverName, + "proverVersion", proverTask.ProverVersion, "proverPublicKey", pk, "failureType", proofParameter.FailureType, + "failureMessage", "failureMessage", failureMsg) return ErrValidatorFailureProofMsgStatusNotOk } @@ -264,15 +272,6 @@ func (m *ProofReceiverLogic) validator(ctx context.Context, proverTask *orm.Prov return nil } -//func (m *ProofReceiverLogic) proofFailure(ctx context.Context, hash string, pubKey string, proofMsg *message.ProofMsg) { -// log.Info("proof failure update proof status", "hash", hash, "public key", pubKey, -// "proof type", proofMsg.Type.String(), "status", types.ProvingTaskFailed.String()) -// -// if err := m.updateProofStatus(ctx, hash, pubKey, proofMsg, types.ProvingTaskFailed, 0); err != nil { -// log.Error("failed to updated proof status ProvingTaskFailed", "hash", hash, "pubKey", pubKey, "error", err) -// } -//} - func (m *ProofReceiverLogic) proofRecover(ctx context.Context, hash string, pubKey string, proofMsg *message.ProofMsg) { log.Info("proof recover update proof status", "hash", hash, "proverPublicKey", pubKey, "taskType", proofMsg.Type.String(), "status", types.ProvingTaskUnassigned.String()) @@ -377,6 +376,33 @@ func (m *ProofReceiverLogic) checkIsTaskSuccess(ctx context.Context, hash string return provingStatus == types.ProvingTaskVerified } +func (m *ProofReceiverLogic) processProverErr(ctx context.Context, taskID, pk string, taskType message.ProofType) { + if updateErr := m.proverTaskOrm.UpdateProverTaskProvingStatus(ctx, taskType, taskID, pk, types.ProverProofInvalid); updateErr != nil { + log.Error("update prover task proving status failure", "taskID", taskID, "proverPublicKey", pk, "taskType", taskType, "error", updateErr) + } + + proverTasks, err := m.proverTaskOrm.GetValidOrAssignedTaskOfOtherProvers(ctx, taskID, pk, taskType) + if err != nil { + log.Warn("checkIsAssignedToOtherProver failure", "taskID", taskID, "proverPublicKey", pk, "taskType", taskType, "error", err) + return + } + + if len(proverTasks) > 0 { + return + } + + switch taskType { + case message.ProofTypeChunk: + if err := m.chunkOrm.UpdateProvingStatusFromProverError(ctx, taskID, types.ProvingTaskUnassigned); err != nil { + log.Error("failed to update chunk proving_status as failed", taskID, "proverPublicKey", pk, "taskType", taskType, "error", err) + } + case message.ProofTypeBatch: + if err := m.batchOrm.UpdateProvingStatusFromProverError(ctx, taskID, types.ProvingTaskUnassigned); err != nil { + log.Error("failed to update batch proving_status as failed", taskID, "proverPublicKey", pk, "taskType", taskType, "error", err) + } + } +} + func (m *ProofReceiverLogic) updateProverTaskProof(ctx context.Context, pk string, proofMsg *message.ProofMsg) error { // store the proof to prover task var proofBytes []byte diff --git a/coordinator/internal/orm/batch.go b/coordinator/internal/orm/batch.go index f41b3accc..00d37fa95 100644 --- a/coordinator/internal/orm/batch.go +++ b/coordinator/internal/orm/batch.go @@ -250,6 +250,30 @@ func (o *Batch) UpdateProvingStatus(ctx context.Context, hash string, status typ return nil } +// UpdateProvingStatusFromProverError updates batch proving status when prover prove failed +func (o *Batch) UpdateProvingStatusFromProverError(ctx context.Context, hash string, status types.ProvingStatus) error { + updateFields := make(map[string]interface{}) + updateFields["proving_status"] = int(status) + + switch status { + case types.ProvingTaskAssigned: + updateFields["prover_assigned_at"] = time.Now() + case types.ProvingTaskUnassigned: + updateFields["prover_assigned_at"] = nil + case types.ProvingTaskProved, types.ProvingTaskVerified: + updateFields["proved_at"] = time.Now() + } + + db := o.db.WithContext(ctx) + db = db.Model(&Batch{}) + db = db.Where("hash", hash).Where("proving_status", types.ProvingTaskAssigned) + + if err := db.Updates(updateFields).Error; err != nil { + return fmt.Errorf("Batch.UpdateProvingStatusOptimistic error: %w, batch hash: %v, status: %v", err, hash, status.String()) + } + return nil +} + // UpdateProofByHash updates the batch proof by hash. func (o *Batch) UpdateProofByHash(ctx context.Context, hash string, proof *message.BatchProof, proofTimeSec uint64, dbTX ...*gorm.DB) error { db := o.db diff --git a/coordinator/internal/orm/chunk.go b/coordinator/internal/orm/chunk.go index 701e85c23..e991260c0 100644 --- a/coordinator/internal/orm/chunk.go +++ b/coordinator/internal/orm/chunk.go @@ -302,6 +302,29 @@ func (o *Chunk) UpdateProvingStatus(ctx context.Context, hash string, status typ return nil } +// UpdateProvingStatusFromProverError updates chunk proving status when prover prove failed +func (o *Chunk) UpdateProvingStatusFromProverError(ctx context.Context, hash string, status types.ProvingStatus) error { + updateFields := make(map[string]interface{}) + updateFields["proving_status"] = int(status) + + switch status { + case types.ProvingTaskAssigned: + updateFields["prover_assigned_at"] = time.Now() + case types.ProvingTaskUnassigned: + updateFields["prover_assigned_at"] = nil + case types.ProvingTaskProved, types.ProvingTaskVerified: + updateFields["proved_at"] = time.Now() + } + db := o.db.WithContext(ctx) + db = db.Model(&Chunk{}) + db = db.Where("hash", hash).Where("proving_status", types.ProvingTaskAssigned) + + if err := db.Updates(updateFields).Error; err != nil { + return fmt.Errorf("Chunk.UpdateProvingStatusOptimistic error: %w, chunk hash: %v, status: %v", err, hash, status.String()) + } + return nil +} + // UpdateProofByHash updates the chunk proof by hash. func (o *Chunk) UpdateProofByHash(ctx context.Context, hash string, proof *message.ChunkProof, proofTimeSec uint64, dbTX ...*gorm.DB) error { db := o.db diff --git a/coordinator/internal/orm/prover_task.go b/coordinator/internal/orm/prover_task.go index b1fb8bf53..e09dbfa04 100644 --- a/coordinator/internal/orm/prover_task.go +++ b/coordinator/internal/orm/prover_task.go @@ -129,6 +129,22 @@ func (o *ProverTask) GetProverTaskByTaskIDAndProver(ctx context.Context, taskID, return &proverTask, nil } +// GetValidOrAssignedTaskOfOtherProvers get the chunk/batch task assigned other provers +func (o *ProverTask) GetValidOrAssignedTaskOfOtherProvers(ctx context.Context, taskID, proverPublicKey string, taskType message.ProofType) ([]ProverTask, error) { + db := o.db.WithContext(ctx) + db = db.Model(&ProverTask{}) + db = db.Where("task_id", taskID) + db = db.Where("task_type", int(taskType)) + db = db.Where("prover_public_key != ?", proverPublicKey) + db = db.Where("proving_status in (?)", []int{int(types.ProverAssigned), int(types.ProverProofValid)}) + + var proverTasks []ProverTask + if err := db.Find(&proverTasks).Error; err != nil { + return nil, fmt.Errorf("ProverTask.GetAssignedProverTask error: %w, taskID: %v", err, taskID) + } + return proverTasks, nil +} + // GetProvingStatusByTaskID retrieves the proving status of a prover task func (o *ProverTask) GetProvingStatusByTaskID(ctx context.Context, taskID string) (types.ProverProveStatus, error) { db := o.db.WithContext(ctx)