mirror of
https://github.com/scroll-tech/scroll.git
synced 2026-01-09 14:08:03 -05:00
fix(roller): avoid discarding task when connection failure (#203)
Co-authored-by: colinlyguo <colinlyguo@gmail.com> Co-authored-by: Lawliet-Chan <1576710154@qq.com> Co-authored-by: HAOYUatHZ <37070449+HAOYUatHZ@users.noreply.github.com>
This commit is contained in:
@@ -195,7 +195,12 @@ func (m *Manager) restorePrevSessions() {
|
||||
finishChan: make(chan rollerProofStatus, proofAndPkBufferSize),
|
||||
}
|
||||
m.sessions[sess.info.ID] = sess
|
||||
log.Info("Coordinator restart reload sessions", "ID", sess.info.ID, "sess", sess.info)
|
||||
|
||||
log.Info("Coordinator restart reload sessions", "session start time", time.Unix(sess.info.StartTimestamp, 0))
|
||||
for _, roller := range sess.info.Rollers {
|
||||
log.Info("restore roller info for session", "session id", sess.info.ID, "roller name", roller.Name, "public key", roller.PublicKey, "proof status", roller.Status)
|
||||
}
|
||||
|
||||
go m.CollectProofs(sess.info.ID, sess)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -40,7 +40,7 @@ var (
|
||||
dbImg docker.ImgInstance
|
||||
)
|
||||
|
||||
func randomUrl() string {
|
||||
func randomURL() string {
|
||||
id, _ := rand.Int(rand.Reader, big.NewInt(2000-1))
|
||||
return fmt.Sprintf("localhost:%d", 10000+2000+id.Int64())
|
||||
}
|
||||
@@ -83,7 +83,7 @@ func testHandshake(t *testing.T) {
|
||||
defer l2db.Close()
|
||||
|
||||
// Setup coordinator and ws server.
|
||||
wsURL := "ws://" + randomUrl()
|
||||
wsURL := "ws://" + randomURL()
|
||||
rollerManager, handler := setupCoordinator(t, cfg.DBConfig, wsURL)
|
||||
defer func() {
|
||||
handler.Shutdown(context.Background())
|
||||
@@ -104,7 +104,7 @@ func testFailedHandshake(t *testing.T) {
|
||||
defer l2db.Close()
|
||||
|
||||
// Setup coordinator and ws server.
|
||||
wsURL := "ws://" + randomUrl()
|
||||
wsURL := "ws://" + randomURL()
|
||||
rollerManager, handler := setupCoordinator(t, cfg.DBConfig, wsURL)
|
||||
defer func() {
|
||||
handler.Shutdown(context.Background())
|
||||
@@ -170,7 +170,7 @@ func testSeveralConnections(t *testing.T) {
|
||||
defer l2db.Close()
|
||||
|
||||
// Setup coordinator and ws server.
|
||||
wsURL := "ws://" + randomUrl()
|
||||
wsURL := "ws://" + randomURL()
|
||||
rollerManager, handler := setupCoordinator(t, cfg.DBConfig, wsURL)
|
||||
defer func() {
|
||||
handler.Shutdown(context.Background())
|
||||
@@ -224,7 +224,7 @@ func testIdleRollerSelection(t *testing.T) {
|
||||
defer l2db.Close()
|
||||
|
||||
// Setup coordinator and ws server.
|
||||
wsURL := "ws://" + randomUrl()
|
||||
wsURL := "ws://" + randomURL()
|
||||
rollerManager, handler := setupCoordinator(t, cfg.DBConfig, wsURL)
|
||||
defer func() {
|
||||
handler.Shutdown(context.Background())
|
||||
@@ -293,7 +293,7 @@ func testGracefulRestart(t *testing.T) {
|
||||
assert.NoError(t, dbTx.Commit())
|
||||
|
||||
// Setup coordinator and ws server.
|
||||
wsURL := "ws://" + randomUrl()
|
||||
wsURL := "ws://" + randomURL()
|
||||
rollerManager, handler := setupCoordinator(t, cfg.DBConfig, wsURL)
|
||||
|
||||
// create mock roller
|
||||
|
||||
@@ -3,6 +3,7 @@ package orm
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
|
||||
"github.com/jmoiron/sqlx"
|
||||
"github.com/scroll-tech/go-ethereum/common"
|
||||
@@ -84,6 +85,19 @@ const (
|
||||
RollerProofInvalid
|
||||
)
|
||||
|
||||
func (s RollerProveStatus) String() string {
|
||||
switch s {
|
||||
case RollerAssigned:
|
||||
return "RollerAssigned"
|
||||
case RollerProofValid:
|
||||
return "RollerProofValid"
|
||||
case RollerProofInvalid:
|
||||
return "RollerProofInvalid"
|
||||
default:
|
||||
return fmt.Sprintf("Bad Value: %d", int32(s))
|
||||
}
|
||||
}
|
||||
|
||||
// RollerStatus is the roller name and roller prove status
|
||||
type RollerStatus struct {
|
||||
PublicKey string `json:"public_key"`
|
||||
|
||||
100
roller/roller.go
100
roller/roller.go
@@ -42,8 +42,9 @@ type Roller struct {
|
||||
taskChan chan *message.TaskMsg
|
||||
sub ethereum.Subscription
|
||||
|
||||
isClosed int64
|
||||
stopChan chan struct{}
|
||||
isDisconnected int64
|
||||
isClosed int64
|
||||
stopChan chan struct{}
|
||||
|
||||
priv *ecdsa.PrivateKey
|
||||
}
|
||||
@@ -122,12 +123,11 @@ func (r *Roller) Register() error {
|
||||
token, err := r.client.RequestToken(context.Background(), authMsg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("request token failed %v", err)
|
||||
} else {
|
||||
authMsg.Identity.Token = token
|
||||
}
|
||||
authMsg.Identity.Token = token
|
||||
|
||||
// Sign auth message
|
||||
if err := authMsg.Sign(r.priv); err != nil {
|
||||
if err = authMsg.Sign(r.priv); err != nil {
|
||||
return fmt.Errorf("sign auth message failed %v", err)
|
||||
}
|
||||
|
||||
@@ -159,6 +159,8 @@ func (r *Roller) HandleCoordinator() {
|
||||
}
|
||||
|
||||
func (r *Roller) mustRetryCoordinator() {
|
||||
atomic.StoreInt64(&r.isDisconnected, 1)
|
||||
defer atomic.StoreInt64(&r.isDisconnected, 0)
|
||||
for {
|
||||
log.Info("retry to connect to coordinator...")
|
||||
err := r.Register()
|
||||
@@ -196,64 +198,69 @@ func (r *Roller) ProveLoop() {
|
||||
}
|
||||
|
||||
func (r *Roller) prove() error {
|
||||
task, err := r.stack.Pop()
|
||||
task, err := r.stack.Peek()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var proofMsg *message.ProofDetail
|
||||
if task.Times > 2 {
|
||||
proofMsg = &message.ProofDetail{
|
||||
Status: message.StatusProofError,
|
||||
Error: "prover has retried several times due to FFI panic",
|
||||
ID: task.Task.ID,
|
||||
Proof: &message.AggProof{},
|
||||
if task.Times <= 2 {
|
||||
// If panic times <= 2, try to proof the task.
|
||||
if err = r.stack.UpdateTimes(task, task.Times+1); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = r.signAndSubmitProof(proofMsg)
|
||||
return err
|
||||
}
|
||||
// Sort BlockTraces by header number.
|
||||
traces := task.Task.Traces
|
||||
sort.Slice(traces, func(i, j int) bool {
|
||||
return traces[i].Header.Number.Int64() < traces[j].Header.Number.Int64()
|
||||
})
|
||||
|
||||
err = r.stack.Push(task)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
log.Info("start to prove block", "task-id", task.Task.ID)
|
||||
|
||||
log.Info("start to prove block", "task-id", task.Task.ID)
|
||||
|
||||
// sort BlockTrace
|
||||
traces := task.Task.Traces
|
||||
sort.Slice(traces, func(i, j int) bool {
|
||||
return traces[i].Header.Number.Int64() < traces[j].Header.Number.Int64()
|
||||
})
|
||||
proof, err := r.prover.Prove(traces)
|
||||
if err != nil {
|
||||
proofMsg = &message.ProofDetail{
|
||||
Status: message.StatusProofError,
|
||||
Error: err.Error(),
|
||||
ID: task.Task.ID,
|
||||
Proof: &message.AggProof{},
|
||||
// If FFI panic during Prove, the roller will restart and re-enter prove() function,
|
||||
// the proof will not be submitted.
|
||||
var proof *message.AggProof
|
||||
proof, err = r.prover.Prove(traces)
|
||||
if err != nil {
|
||||
proofMsg = &message.ProofDetail{
|
||||
Status: message.StatusProofError,
|
||||
Error: err.Error(),
|
||||
ID: task.Task.ID,
|
||||
Proof: &message.AggProof{},
|
||||
}
|
||||
log.Error("prove block failed!", "task-id", task.Task.ID)
|
||||
} else {
|
||||
proofMsg = &message.ProofDetail{
|
||||
Status: message.StatusOk,
|
||||
ID: task.Task.ID,
|
||||
Proof: proof,
|
||||
}
|
||||
log.Info("prove block successfully!", "task-id", task.Task.ID)
|
||||
}
|
||||
log.Error("prove block failed!", "task-id", task.Task.ID)
|
||||
} else {
|
||||
|
||||
// when the roller has more than 3 times panic,
|
||||
// it will omit to prove the task, submit StatusProofError and then Pop the task.
|
||||
proofMsg = &message.ProofDetail{
|
||||
Status: message.StatusOk,
|
||||
Status: message.StatusProofError,
|
||||
Error: "zk proving panic",
|
||||
ID: task.Task.ID,
|
||||
Proof: proof,
|
||||
Proof: &message.AggProof{},
|
||||
}
|
||||
log.Info("prove block successfully!", "task-id", task.Task.ID)
|
||||
}
|
||||
_, err = r.stack.Pop()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
ok, err := r.signAndSubmitProof(proofMsg)
|
||||
defer func() {
|
||||
_, err = r.stack.Pop()
|
||||
if err != nil {
|
||||
log.Error("roller stack pop failed!", "err", err)
|
||||
}
|
||||
}()
|
||||
|
||||
ok, serr := r.signAndSubmitProof(proofMsg)
|
||||
if !ok {
|
||||
log.Error("submit proof to coordinator failed", "task ID", proofMsg.ID)
|
||||
}
|
||||
return err
|
||||
return serr
|
||||
}
|
||||
|
||||
func (r *Roller) signAndSubmitProof(msg *message.ProofDetail) (bool, error) {
|
||||
@@ -262,6 +269,11 @@ func (r *Roller) signAndSubmitProof(msg *message.ProofDetail) (bool, error) {
|
||||
return false, err
|
||||
}
|
||||
|
||||
// When the roller is disconnected from the coordinator,
|
||||
// wait until the roller reconnects to the coordinator.
|
||||
for atomic.LoadInt64(&r.isDisconnected) == 1 {
|
||||
time.Sleep(retryWait)
|
||||
}
|
||||
return r.client.SubmitProof(context.Background(), authZkProof)
|
||||
}
|
||||
|
||||
|
||||
@@ -58,6 +58,29 @@ func (s *Stack) Push(task *ProvingTask) error {
|
||||
})
|
||||
}
|
||||
|
||||
// Peek return the top element of the Stack.
|
||||
func (s *Stack) Peek() (*ProvingTask, error) {
|
||||
var value []byte
|
||||
if err := s.View(func(tx *bbolt.Tx) error {
|
||||
bu := tx.Bucket(bucket)
|
||||
c := bu.Cursor()
|
||||
_, value = c.Last()
|
||||
return nil
|
||||
}); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(value) == 0 {
|
||||
return nil, ErrEmpty
|
||||
}
|
||||
|
||||
traces := &ProvingTask{}
|
||||
err := json.Unmarshal(value, traces)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return traces, nil
|
||||
}
|
||||
|
||||
// Pop pops the proving-task on the top of Stack.
|
||||
func (s *Stack) Pop() (*ProvingTask, error) {
|
||||
var value []byte
|
||||
@@ -79,6 +102,21 @@ func (s *Stack) Pop() (*ProvingTask, error) {
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
task.Times++
|
||||
return task, nil
|
||||
}
|
||||
|
||||
// UpdateTimes udpates the roller prove times of the proving task.
|
||||
func (s *Stack) UpdateTimes(task *ProvingTask, udpateTimes int) error {
|
||||
task.Times = udpateTimes
|
||||
byt, err := json.Marshal(task)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
key := []byte(task.Task.ID)
|
||||
return s.Update(func(tx *bbolt.Tx) error {
|
||||
bu := tx.Bucket(bucket)
|
||||
c := bu.Cursor()
|
||||
key, _ = c.Last()
|
||||
return bu.Put(key, byt)
|
||||
})
|
||||
}
|
||||
|
||||
@@ -57,7 +57,13 @@ func TestStack(t *testing.T) {
|
||||
err = s.Push(pop)
|
||||
assert.NoError(t, err)
|
||||
|
||||
peek, err := s.Peek()
|
||||
assert.NoError(t, err)
|
||||
pop2, err := s.Pop()
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, 2, pop2.Times)
|
||||
assert.Equal(t, peek, pop2)
|
||||
|
||||
s.UpdateTimes(pop2, 1)
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, 1, pop2.Times)
|
||||
}
|
||||
|
||||
@@ -28,18 +28,18 @@ func testStartProcess(t *testing.T) {
|
||||
|
||||
// Start bridge process.
|
||||
bridgeCmd := runBridgeApp(t)
|
||||
bridgeCmd.ExpectWithTimeout(true, time.Second*10, "Start bridge successfully")
|
||||
bridgeCmd.ExpectWithTimeout(true, time.Second*20, "Start bridge successfully")
|
||||
bridgeCmd.RunApp(true)
|
||||
|
||||
// Start coordinator process.
|
||||
coordinatorCmd := runCoordinatorApp(t, "--ws", "--ws.port", "8391")
|
||||
coordinatorCmd.ExpectWithTimeout(true, time.Second*10, "Start coordinator successfully")
|
||||
coordinatorCmd.ExpectWithTimeout(true, time.Second*20, "Start coordinator successfully")
|
||||
coordinatorCmd.RunApp(true)
|
||||
|
||||
// Start roller process.
|
||||
rollerCmd := runRollerApp(t)
|
||||
rollerCmd.ExpectWithTimeout(true, time.Second*20, "roller start successfully")
|
||||
rollerCmd.ExpectWithTimeout(true, time.Second*30, "register to coordinator successfully!")
|
||||
rollerCmd.ExpectWithTimeout(true, time.Second*40, "roller start successfully")
|
||||
rollerCmd.ExpectWithTimeout(true, time.Second*60, "register to coordinator successfully!")
|
||||
rollerCmd.RunApp(true)
|
||||
|
||||
rollerCmd.WaitExit()
|
||||
|
||||
Reference in New Issue
Block a user