Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions staker/block_validator.go
Original file line number Diff line number Diff line change
Expand Up @@ -912,8 +912,9 @@ func (v *BlockValidator) advanceValidations(ctx context.Context) (*arbutil.Messa
}
if !validationStatus.DoneEntry.Success {
if validationStatus.DoneEntry.Timeout {
// Timeout is transient — log a warning and retry without crashing.
log.Warn("validation timed out, scheduling retry", "pos", pos, "start", validationStatus.DoneEntry.Start)
// Timeout is transient — retry without crashing.
// Detailed warning already logged in sendValidations.
log.Trace("advanceValidations: retrying timed-out validation", "pos", pos)
} else {
v.possiblyFatal(fmt.Errorf("validation: failed entry pos %d, start %v", pos, validationStatus.DoneEntry.Start))
}
Expand Down
5 changes: 5 additions & 0 deletions system_tests/block_validator_timeout_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,4 +155,9 @@ func TestBlockValidatorTimeoutRetry(t *testing.T) {
if !testClientB.ConsensusNode.BlockValidator.WaitForPos(t, ctx, arbutil.MessageIndex(block.Uint64()), timeout) {
Fatal(t, "did not validate the block - timeout errors should have been retried")
}

// Verify the proxy actually injected timeout errors (remainingTimeouts should be negative).
if remaining := proxy.remainingTimeouts.Load(); remaining >= 0 {
Fatal(t, "proxy did not inject all expected timeouts, remaining:", remaining)
}
}
6 changes: 6 additions & 0 deletions validator/retry_wrapper/retry_wrapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ func (v *ValidationSpawnerRetryWrapper) LaunchWithNAllowedAttempts(entry *valida
if nonTimeoutAttempts >= allowedAttempts {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should have a strong comparison here - if we allow 3 attempts, then the 3rd failure shouldn't cause an error

return validator.GoGlobalState{}, err
}
log.Warn("validation attempt failed, retrying",
"err", err,
"moduleRoot", moduleRoot,
"attempt", nonTimeoutAttempts,
"allowedAttempts", allowedAttempts,
)
}
})
return server_common.NewValRun(promise, moduleRoot)
Expand Down
211 changes: 211 additions & 0 deletions validator/retry_wrapper/retry_wrapper_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
// Copyright 2025-2026, Offchain Labs, Inc.
// For license information, see https://github.com/OffchainLabs/nitro/blob/master/LICENSE.md

package retry_wrapper

import (
"context"
"errors"
"sync/atomic"
"testing"

"github.com/ethereum/go-ethereum/common"
"github.com/ethereum/go-ethereum/core/rawdb"

"github.com/offchainlabs/nitro/util/containers"
"github.com/offchainlabs/nitro/validator"
"github.com/offchainlabs/nitro/validator/server_common"
)

// testSpawner is a mock ValidationSpawner that returns a sequence of
// predetermined results. Each call to Launch pops the next result.
type testSpawner struct {
results []launchResult
idx atomic.Int64
}

type launchResult struct {
state validator.GoGlobalState
err error
}

func (s *testSpawner) Launch(_ *validator.ValidationInput, moduleRoot common.Hash) validator.ValidationRun {
i := int(s.idx.Add(1) - 1)
if i >= len(s.results) {
panic("testSpawner: too many Launch calls")
}
r := s.results[i]
promise := containers.NewPromise[validator.GoGlobalState](nil)
if r.err != nil {
promise.ProduceError(r.err)
} else {
promise.Produce(r.state)
}
Comment on lines +39 to +43
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if r.err != nil {
promise.ProduceError(r.err)
} else {
promise.Produce(r.state)
}
promise.ProduceResult(r.state, r.err)

return server_common.NewValRun(&promise, moduleRoot)
}

func (s *testSpawner) WasmModuleRoots() ([]common.Hash, error) { return nil, nil }
func (s *testSpawner) Start(context.Context) error { return nil }

Check failure on line 48 in validator/retry_wrapper/retry_wrapper_test.go

View workflow job for this annotation

GitHub Actions / fast / Lint and Build

File is not properly formatted (gci)
func (s *testSpawner) Stop() {}
func (s *testSpawner) Name() string { return "test" }
func (s *testSpawner) StylusArchs() []rawdb.WasmTarget { return nil }
func (s *testSpawner) Capacity() int { return 1 }
Comment on lines +47 to +52
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
func (s *testSpawner) WasmModuleRoots() ([]common.Hash, error) { return nil, nil }
func (s *testSpawner) Start(context.Context) error { return nil }
func (s *testSpawner) Stop() {}
func (s *testSpawner) Name() string { return "test" }
func (s *testSpawner) StylusArchs() []rawdb.WasmTarget { return nil }
func (s *testSpawner) Capacity() int { return 1 }
func (s *testSpawner) WasmModuleRoots() ([]common.Hash, error) { return nil, nil }
func (s *testSpawner) Start(context.Context) error { return nil }
func (s *testSpawner) Stop() {}
func (s *testSpawner) Name() string { return "test" }
func (s *testSpawner) StylusArchs() []rawdb.WasmTarget { return nil }
func (s *testSpawner) Capacity() int { return 1 }


var (
successState = validator.GoGlobalState{Batch: 1, PosInBatch: 1}
testRoot = common.Hash{1}
errTimeout = context.DeadlineExceeded
errGeneric = errors.New("validation failed")
)

func launchAndAwait(t *testing.T, wrapper *ValidationSpawnerRetryWrapper, allowedAttempts, allowedTimeouts uint64) (validator.GoGlobalState, error) {
t.Helper()
ctx := context.Background()
run := wrapper.LaunchWithNAllowedAttempts(nil, testRoot, allowedAttempts, allowedTimeouts)
return run.Await(ctx)
}

func setupWrapper(t *testing.T, results []launchResult) *ValidationSpawnerRetryWrapper {
t.Helper()
spawner := &testSpawner{results: results}
wrapper := NewValidationSpawnerRetryWrapper(spawner)
wrapper.StopWaiter.Start(context.Background(), wrapper)
Comment on lines +63 to +72
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering - shouldn't we use a common parent context and derive from it?

t.Cleanup(func() { wrapper.StopWaiter.StopAndWait() })
return wrapper
}

func TestRetryWrapper_SuccessOnFirstAttempt(t *testing.T) {
wrapper := setupWrapper(t, []launchResult{
{state: successState},
})
state, err := launchAndAwait(t, wrapper, 1, 1)
if err != nil {
t.Fatal("unexpected error:", err)
}
if state != successState {
t.Fatalf("unexpected state: got %v, want %v", state, successState)
}
}

func TestRetryWrapper_TimeoutThenSuccess(t *testing.T) {
wrapper := setupWrapper(t, []launchResult{
{err: errTimeout},
{err: errTimeout},
{state: successState},
})
state, err := launchAndAwait(t, wrapper, 1, 3)
if err != nil {
t.Fatal("unexpected error:", err)
}
if state != successState {
t.Fatalf("unexpected state: got %v, want %v", state, successState)
}
}

func TestRetryWrapper_TimeoutExhausted(t *testing.T) {
wrapper := setupWrapper(t, []launchResult{
{err: errTimeout},
{err: errTimeout},
})
_, err := launchAndAwait(t, wrapper, 1, 2)
if !errors.Is(err, errTimeout) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if !errors.Is(err, errTimeout) {
if !validator.IsTimeoutError(err) {

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe we want to use this utility here and in the other assertions

t.Fatalf("expected timeout error, got: %v", err)
}
}

func TestRetryWrapper_NonTimeoutExhausted(t *testing.T) {
wrapper := setupWrapper(t, []launchResult{
{err: errGeneric},
{err: errGeneric},
})
_, err := launchAndAwait(t, wrapper, 2, 3)
if err == nil || err.Error() != errGeneric.Error() {
t.Fatalf("expected generic error, got: %v", err)
}
}

func TestRetryWrapper_CountersAreIndependent(t *testing.T) {
// Mix of timeout and non-timeout errors: each counter should increment independently.
wrapper := setupWrapper(t, []launchResult{
{err: errTimeout}, // timeoutAttempts=1
{err: errGeneric}, // nonTimeoutAttempts=1
{err: errTimeout}, // timeoutAttempts=2
{state: successState},
})
state, err := launchAndAwait(t, wrapper, 2, 3)
if err != nil {
t.Fatal("unexpected error:", err)
}
if state != successState {
t.Fatalf("unexpected state: got %v, want %v", state, successState)
}
}

func TestRetryWrapper_MixedErrorsTimeoutExhausted(t *testing.T) {
// Non-timeout errors should not count toward timeout budget.
wrapper := setupWrapper(t, []launchResult{
{err: errGeneric}, // nonTimeoutAttempts=1
{err: errTimeout}, // timeoutAttempts=1
{err: errTimeout}, // timeoutAttempts=2, exhausted
})
_, err := launchAndAwait(t, wrapper, 3, 2)
if !errors.Is(err, errTimeout) {
t.Fatalf("expected timeout error, got: %v", err)
}
}

func TestRetryWrapper_MixedErrorsNonTimeoutExhausted(t *testing.T) {
// Timeout errors should not count toward non-timeout budget.
wrapper := setupWrapper(t, []launchResult{
{err: errTimeout}, // timeoutAttempts=1
{err: errGeneric}, // nonTimeoutAttempts=1
{err: errGeneric}, // nonTimeoutAttempts=2, exhausted
})
_, err := launchAndAwait(t, wrapper, 2, 3)
if err == nil || err.Error() != errGeneric.Error() {
t.Fatalf("expected generic error, got: %v", err)
}
}

func TestRetryWrapper_ZeroAllowedTimeouts(t *testing.T) {
// With 0 allowed timeouts, the first timeout should be immediately fatal.
wrapper := setupWrapper(t, []launchResult{
{err: errTimeout},
})
_, err := launchAndAwait(t, wrapper, 1, 0)
if !errors.Is(err, errTimeout) {
t.Fatalf("expected timeout error, got: %v", err)
}
}

func TestRetryWrapper_ZeroAllowedAttempts(t *testing.T) {
// With 0 allowed attempts, the first non-timeout error should be immediately fatal.
wrapper := setupWrapper(t, []launchResult{
{err: errGeneric},
})
_, err := launchAndAwait(t, wrapper, 0, 3)
if err == nil || err.Error() != errGeneric.Error() {
t.Fatalf("expected generic error, got: %v", err)
}
}

func TestRetryWrapper_ContextCanceled(t *testing.T) {
// If the context is canceled, the wrapper should return ctx.Err() rather
// than continuing to retry.
ctx, cancel := context.WithCancel(context.Background())
cancel() // cancel immediately

spawner := &testSpawner{results: []launchResult{
// The spawner's result won't matter — ctx is already canceled.
{err: errTimeout},
}}
wrapper := NewValidationSpawnerRetryWrapper(spawner)
wrapper.StopWaiter.Start(ctx, wrapper)
defer wrapper.StopWaiter.StopAndWait()

run := wrapper.LaunchWithNAllowedAttempts(nil, testRoot, 3, 3)
_, err := run.Await(context.Background())
if err == nil {
t.Fatal("expected error from canceled context")
}
}
6 changes: 5 additions & 1 deletion validator/timeout.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,11 @@ func IsTimeoutError(err error) bool {
return true
}

// String-based detection
// String-based detection for errors serialized through Redis or RPC,
// which lose Go type information. This may produce false positives for
// unrelated errors whose messages contain these substrings, but the
// consequence is only delayed failure detection (extra retries up to the
// configured limit), not silent suppression.

errMsg := err.Error()
if strings.Contains(errMsg, pubsub.TimeoutErrorMessage) ||
Expand Down
Loading