fix(roller): avoid discarding task on connection failure (#203)

Co-authored-by: colinlyguo <colinlyguo@gmail.com>
Co-authored-by: Lawliet-Chan <1576710154@qq.com>
Co-authored-by: HAOYUatHZ <37070449+HAOYUatHZ@users.noreply.github.com>
Author: colin
Date: 2023-01-09 10:52:50 +08:00
Committed by: GitHub
Parent: fff2517a76
Commit: a78160ddad
7 changed files with 132 additions and 57 deletions
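The core of the change: the roller used to Pop a task from its persisted stack before proving it, so a mid-proof panic or a dropped coordinator connection discarded the task permanently. The commit reorders the lifecycle to peek, persist an incremented retry counter, prove, submit, and only then pop, so a task can only leave the stack once its result has been handed to the coordinator. A minimal in-memory sketch of the pattern follows; every name in it is a hypothetical stand-in, not the roller's actual API:

package main

import (
	"errors"
	"fmt"
)

// Hypothetical in-memory stand-in for the roller's persisted bbolt stack.
type task struct {
	id    string
	times int // proving attempts so far; persisted across restarts in the real code
}

type stack struct{ items []*task }

func (s *stack) peek() (*task, error) {
	if len(s.items) == 0 {
		return nil, errors.New("empty stack")
	}
	return s.items[len(s.items)-1], nil
}

func (s *stack) pop() { s.items = s.items[:len(s.items)-1] }

// processTop mirrors the commit's flow: peek, persist the bumped attempt
// counter, prove, submit, and only then pop. A crash before submission
// leaves the task (and its attempt count) on the stack for the next run.
func (s *stack) processTop(prove func(*task) error) error {
	t, err := s.peek()
	if err != nil {
		return err
	}
	if t.times <= 2 {
		t.times++ // the real code persists this via UpdateTimes before proving
		if err := prove(t); err != nil {
			fmt.Println("proof failed:", t.id, err)
		}
	} else {
		fmt.Println("giving up after repeated panics:", t.id)
	}
	s.pop() // discard only after the result has been reported
	return nil
}

func main() {
	s := &stack{items: []*task{{id: "task-1"}}}
	_ = s.processTop(func(t *task) error { return nil })
}

The real implementation keeps the stack in bbolt (see the stack.go hunks below), which is what lets the attempt counter survive a process restart.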


@@ -195,7 +195,12 @@ func (m *Manager) restorePrevSessions() {
finishChan: make(chan rollerProofStatus, proofAndPkBufferSize),
}
m.sessions[sess.info.ID] = sess
log.Info("Coordinator restart reload sessions", "ID", sess.info.ID, "sess", sess.info)
log.Info("Coordinator restart reload sessions", "session start time", time.Unix(sess.info.StartTimestamp, 0))
for _, roller := range sess.info.Rollers {
log.Info("restore roller info for session", "session id", sess.info.ID, "roller name", roller.Name, "public key", roller.PublicKey, "proof status", roller.Status)
}
go m.CollectProofs(sess.info.ID, sess)
}
}


@@ -40,7 +40,7 @@ var (
dbImg docker.ImgInstance
)
-func randomUrl() string {
+func randomURL() string {
id, _ := rand.Int(rand.Reader, big.NewInt(2000-1))
return fmt.Sprintf("localhost:%d", 10000+2000+id.Int64())
}
@@ -83,7 +83,7 @@ func testHandshake(t *testing.T) {
defer l2db.Close()
// Setup coordinator and ws server.
wsURL := "ws://" + randomUrl()
wsURL := "ws://" + randomURL()
rollerManager, handler := setupCoordinator(t, cfg.DBConfig, wsURL)
defer func() {
handler.Shutdown(context.Background())
@@ -104,7 +104,7 @@ func testFailedHandshake(t *testing.T) {
defer l2db.Close()
// Setup coordinator and ws server.
wsURL := "ws://" + randomUrl()
wsURL := "ws://" + randomURL()
rollerManager, handler := setupCoordinator(t, cfg.DBConfig, wsURL)
defer func() {
handler.Shutdown(context.Background())
@@ -170,7 +170,7 @@ func testSeveralConnections(t *testing.T) {
defer l2db.Close()
// Setup coordinator and ws server.
wsURL := "ws://" + randomUrl()
wsURL := "ws://" + randomURL()
rollerManager, handler := setupCoordinator(t, cfg.DBConfig, wsURL)
defer func() {
handler.Shutdown(context.Background())
@@ -224,7 +224,7 @@ func testIdleRollerSelection(t *testing.T) {
defer l2db.Close()
// Setup coordinator and ws server.
wsURL := "ws://" + randomUrl()
wsURL := "ws://" + randomURL()
rollerManager, handler := setupCoordinator(t, cfg.DBConfig, wsURL)
defer func() {
handler.Shutdown(context.Background())
@@ -293,7 +293,7 @@ func testGracefulRestart(t *testing.T) {
assert.NoError(t, dbTx.Commit())
// Setup coordinator and ws server.
wsURL := "ws://" + randomUrl()
wsURL := "ws://" + randomURL()
rollerManager, handler := setupCoordinator(t, cfg.DBConfig, wsURL)
// create mock roller


@@ -3,6 +3,7 @@ package orm
import (
"context"
"database/sql"
"fmt"
"github.com/jmoiron/sqlx"
"github.com/scroll-tech/go-ethereum/common"
@@ -84,6 +85,19 @@ const (
RollerProofInvalid
)
+func (s RollerProveStatus) String() string {
+	switch s {
+	case RollerAssigned:
+		return "RollerAssigned"
+	case RollerProofValid:
+		return "RollerProofValid"
+	case RollerProofInvalid:
+		return "RollerProofInvalid"
+	default:
+		return fmt.Sprintf("Bad Value: %d", int32(s))
+	}
+}
// RollerStatus is the roller name and roller prove status
type RollerStatus struct {
PublicKey string `json:"public_key"`
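With String() defined, RollerProveStatus satisfies fmt.Stringer, so the per-roller restore logs added in the coordinator hunk above print names such as "RollerAssigned" instead of bare integers. A standalone illustration; the constant values are assumed to mirror the package's iota block, only the tail of which is visible in this diff:

package main

import "fmt"

// Assumed to mirror the orm package's iota block; values are illustrative.
type RollerProveStatus int32

const (
	RollerAssigned RollerProveStatus = iota
	RollerProofValid
	RollerProofInvalid
)

func (s RollerProveStatus) String() string {
	switch s {
	case RollerAssigned:
		return "RollerAssigned"
	case RollerProofValid:
		return "RollerProofValid"
	case RollerProofInvalid:
		return "RollerProofInvalid"
	default:
		return fmt.Sprintf("Bad Value: %d", int32(s))
	}
}

func main() {
	fmt.Println(RollerAssigned) // fmt calls String(): prints "RollerAssigned"
}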


@@ -42,8 +42,9 @@ type Roller struct {
taskChan chan *message.TaskMsg
sub ethereum.Subscription
-isClosed int64
-stopChan chan struct{}
+isDisconnected int64
+isClosed       int64
+stopChan       chan struct{}
priv *ecdsa.PrivateKey
}
@@ -122,12 +123,11 @@ func (r *Roller) Register() error {
token, err := r.client.RequestToken(context.Background(), authMsg)
if err != nil {
return fmt.Errorf("request token failed %v", err)
-} else {
-	authMsg.Identity.Token = token
}
+authMsg.Identity.Token = token
// Sign auth message
-if err := authMsg.Sign(r.priv); err != nil {
+if err = authMsg.Sign(r.priv); err != nil {
return fmt.Errorf("sign auth message failed %v", err)
}
@@ -159,6 +159,8 @@ func (r *Roller) HandleCoordinator() {
}
func (r *Roller) mustRetryCoordinator() {
+atomic.StoreInt64(&r.isDisconnected, 1)
+defer atomic.StoreInt64(&r.isDisconnected, 0)
for {
log.Info("retry to connect to coordinator...")
err := r.Register()
@@ -196,64 +198,69 @@ func (r *Roller) ProveLoop() {
}

func (r *Roller) prove() error {
-	task, err := r.stack.Pop()
+	task, err := r.stack.Peek()
	if err != nil {
		return err
	}

	var proofMsg *message.ProofDetail
-	if task.Times > 2 {
-		proofMsg = &message.ProofDetail{
-			Status: message.StatusProofError,
-			Error:  "prover has retried several times due to FFI panic",
-			ID:     task.Task.ID,
-			Proof:  &message.AggProof{},
-		}
-		_, err = r.signAndSubmitProof(proofMsg)
-		return err
-	}
-
-	// Sort BlockTraces by header number.
-	traces := task.Task.Traces
-	sort.Slice(traces, func(i, j int) bool {
-		return traces[i].Header.Number.Int64() < traces[j].Header.Number.Int64()
-	})
-
-	err = r.stack.Push(task)
-	if err != nil {
-		return err
-	}
-
-	log.Info("start to prove block", "task-id", task.Task.ID)
-
-	proof, err := r.prover.Prove(traces)
-	if err != nil {
-		proofMsg = &message.ProofDetail{
-			Status: message.StatusProofError,
-			Error:  err.Error(),
-			ID:     task.Task.ID,
-			Proof:  &message.AggProof{},
-		}
-		log.Error("prove block failed!", "task-id", task.Task.ID)
-	} else {
-		proofMsg = &message.ProofDetail{
-			Status: message.StatusOk,
-			ID:     task.Task.ID,
-			Proof:  proof,
-		}
-		log.Info("prove block successfully!", "task-id", task.Task.ID)
-	}
-
-	_, err = r.stack.Pop()
-	if err != nil {
-		return err
-	}
-
-	ok, err := r.signAndSubmitProof(proofMsg)
+	if task.Times <= 2 {
+		// If panic times <= 2, try to prove the task.
+		if err = r.stack.UpdateTimes(task, task.Times+1); err != nil {
+			return err
+		}
+
+		log.Info("start to prove block", "task-id", task.Task.ID)
+
+		// Sort BlockTraces by header number.
+		traces := task.Task.Traces
+		sort.Slice(traces, func(i, j int) bool {
+			return traces[i].Header.Number.Int64() < traces[j].Header.Number.Int64()
+		})
+
+		// If the FFI panics during Prove, the roller restarts and re-enters
+		// prove(); the proof will not be submitted.
+		var proof *message.AggProof
+		proof, err = r.prover.Prove(traces)
+		if err != nil {
+			proofMsg = &message.ProofDetail{
+				Status: message.StatusProofError,
+				Error:  err.Error(),
+				ID:     task.Task.ID,
+				Proof:  &message.AggProof{},
+			}
+			log.Error("prove block failed!", "task-id", task.Task.ID)
+		} else {
+			proofMsg = &message.ProofDetail{
+				Status: message.StatusOk,
+				ID:     task.Task.ID,
+				Proof:  proof,
+			}
+			log.Info("prove block successfully!", "task-id", task.Task.ID)
+		}
+	} else {
+		// When the roller has already panicked 3 times on this task, skip
+		// proving it, submit StatusProofError, and then Pop the task.
+		proofMsg = &message.ProofDetail{
+			Status: message.StatusProofError,
+			Error:  "zk proving panic",
+			ID:     task.Task.ID,
+			Proof:  &message.AggProof{},
+		}
+	}
+
+	defer func() {
+		_, err = r.stack.Pop()
+		if err != nil {
+			log.Error("roller stack pop failed!", "err", err)
+		}
+	}()
+
+	ok, serr := r.signAndSubmitProof(proofMsg)
	if !ok {
		log.Error("submit proof to coordinator failed", "task ID", proofMsg.ID)
	}
-	return err
+	return serr
}
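The persisted counter is what makes the FFI-panic loop terminate. UpdateTimes writes Times+1 to the bbolt stack before Prove runs, so the count survives a crash; across restarts the sequence looks like this (an illustrative walk-through inferred from the diff, not log output):

run 1: Peek -> Times=0, UpdateTimes(1), Prove panics, roller restarts
run 2: Peek -> Times=1, UpdateTimes(2), Prove panics, roller restarts
run 3: Peek -> Times=2, UpdateTimes(3), Prove panics, roller restarts
run 4: Peek -> Times=3 > 2, submit "zk proving panic", Pop

A proof that fails only because the coordinator is unreachable is no longer lost either: the deferred Pop runs after signAndSubmitProof returns, and (per the next hunk) submission blocks until the connection is restored.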
func (r *Roller) signAndSubmitProof(msg *message.ProofDetail) (bool, error) {
@@ -262,6 +269,11 @@ func (r *Roller) signAndSubmitProof(msg *message.ProofDetail) (bool, error) {
return false, err
}
+// When the roller is disconnected from the coordinator,
+// wait until the roller reconnects to the coordinator.
+for atomic.LoadInt64(&r.isDisconnected) == 1 {
+	time.Sleep(retryWait)
+}
return r.client.SubmitProof(context.Background(), authZkProof)
}
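The wait loop above pairs with the flag set in mustRetryCoordinator: while the reconnect loop holds the flag, a signed proof is parked in memory rather than sent into a dead socket, and prove()'s deferred Pop cannot discard its task until submission has actually returned. A self-contained sketch of that handshake, using hypothetical names (conn, reconnect, submit) rather than the roller's real API:

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// Hypothetical, condensed version of the roller's disconnect gate.
type conn struct{ disconnected int64 }

// reconnect marks the connection as down for its whole duration, the way
// mustRetryCoordinator does, so that submitters wait instead of sending
// into a dead socket.
func (c *conn) reconnect() {
	atomic.StoreInt64(&c.disconnected, 1)
	defer atomic.StoreInt64(&c.disconnected, 0)
	time.Sleep(50 * time.Millisecond) // stand-in for the register/retry loop
}

// submit spins until any reconnect in progress has finished, mirroring the
// loop added to signAndSubmitProof.
func (c *conn) submit(msg string) {
	for atomic.LoadInt64(&c.disconnected) == 1 {
		time.Sleep(10 * time.Millisecond) // retryWait in the real code
	}
	fmt.Println("submitted:", msg)
}

func main() {
	c := &conn{}
	go c.reconnect()
	time.Sleep(time.Millisecond) // let the reconnect begin (demo only)
	c.submit("proof for task-1")
}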


@@ -58,6 +58,29 @@ func (s *Stack) Push(task *ProvingTask) error {
})
}
+// Peek returns the top element of the Stack.
+func (s *Stack) Peek() (*ProvingTask, error) {
+	var value []byte
+	if err := s.View(func(tx *bbolt.Tx) error {
+		bu := tx.Bucket(bucket)
+		c := bu.Cursor()
+		_, value = c.Last()
+		return nil
+	}); err != nil {
+		return nil, err
+	}
+	if len(value) == 0 {
+		return nil, ErrEmpty
+	}
+	traces := &ProvingTask{}
+	err := json.Unmarshal(value, traces)
+	if err != nil {
+		return nil, err
+	}
+	return traces, nil
+}
// Pop pops the proving-task on the top of Stack.
func (s *Stack) Pop() (*ProvingTask, error) {
var value []byte
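Peek reuses Pop's cursor logic without the Delete. Both lean on one bbolt guarantee: a bucket iterates its keys in byte-sorted order, so Cursor.Last() returns the entry with the greatest key, which this store treats as the top of the stack. A minimal standalone illustration (the file and bucket names here are made up):

package main

import (
	"fmt"
	"log"

	"go.etcd.io/bbolt"
)

func main() {
	// Throwaway database; path and bucket name are invented for this demo.
	db, err := bbolt.Open("demo.db", 0600, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	if err := db.Update(func(tx *bbolt.Tx) error {
		bu, err := tx.CreateBucketIfNotExists([]byte("stack"))
		if err != nil {
			return err
		}
		if err := bu.Put([]byte("task-1"), []byte("a")); err != nil {
			return err
		}
		if err := bu.Put([]byte("task-2"), []byte("b")); err != nil {
			return err
		}
		// Keys iterate in byte order, so Last() yields the greatest key.
		k, v := bu.Cursor().Last()
		fmt.Printf("top of stack: %s=%s\n", k, v) // prints task-2=b
		return nil
	}); err != nil {
		log.Fatal(err)
	}
}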
@@ -79,6 +102,21 @@ func (s *Stack) Pop() (*ProvingTask, error) {
if err != nil {
return nil, err
}
-task.Times++
return task, nil
}
+// UpdateTimes updates the prove times of the proving task.
+func (s *Stack) UpdateTimes(task *ProvingTask, updateTimes int) error {
+	task.Times = updateTimes
+	byt, err := json.Marshal(task)
+	if err != nil {
+		return err
+	}
+	key := []byte(task.Task.ID)
+	return s.Update(func(tx *bbolt.Tx) error {
+		bu := tx.Bucket(bucket)
+		c := bu.Cursor()
+		key, _ = c.Last()
+		return bu.Put(key, byt)
+	})
+}
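One quirk worth noting: UpdateTimes computes key from task.Task.ID but then shadows it with whatever c.Last() returns, so it always rewrites the entry currently on top of the stack, not necessarily the task named in its argument. That is safe at its only call site, which runs immediately after Peek on the same top entry.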


@@ -57,7 +57,13 @@ func TestStack(t *testing.T) {
err = s.Push(pop)
assert.NoError(t, err)
+peek, err := s.Peek()
+assert.NoError(t, err)
pop2, err := s.Pop()
assert.NoError(t, err)
assert.Equal(t, 2, pop2.Times)
+assert.Equal(t, peek, pop2)
+err = s.UpdateTimes(pop2, 1)
+assert.NoError(t, err)
+assert.Equal(t, 1, pop2.Times)
}


@@ -28,18 +28,18 @@ func testStartProcess(t *testing.T) {
// Start bridge process.
bridgeCmd := runBridgeApp(t)
-bridgeCmd.ExpectWithTimeout(true, time.Second*10, "Start bridge successfully")
+bridgeCmd.ExpectWithTimeout(true, time.Second*20, "Start bridge successfully")
bridgeCmd.RunApp(true)
// Start coordinator process.
coordinatorCmd := runCoordinatorApp(t, "--ws", "--ws.port", "8391")
-coordinatorCmd.ExpectWithTimeout(true, time.Second*10, "Start coordinator successfully")
+coordinatorCmd.ExpectWithTimeout(true, time.Second*20, "Start coordinator successfully")
coordinatorCmd.RunApp(true)
// Start roller process.
rollerCmd := runRollerApp(t)
-rollerCmd.ExpectWithTimeout(true, time.Second*20, "roller start successfully")
-rollerCmd.ExpectWithTimeout(true, time.Second*30, "register to coordinator successfully!")
+rollerCmd.ExpectWithTimeout(true, time.Second*40, "roller start successfully")
+rollerCmd.ExpectWithTimeout(true, time.Second*60, "register to coordinator successfully!")
rollerCmd.RunApp(true)
rollerCmd.WaitExit()