diff --git a/staker/block_validator.go b/staker/block_validator.go index fc4b14dc4f..e6006b877f 100644 --- a/staker/block_validator.go +++ b/staker/block_validator.go @@ -912,8 +912,9 @@ func (v *BlockValidator) advanceValidations(ctx context.Context) (*arbutil.Messa } if !validationStatus.DoneEntry.Success { if validationStatus.DoneEntry.Timeout { - // Timeout is transient — log a warning and retry without crashing. - log.Warn("validation timed out, scheduling retry", "pos", pos, "start", validationStatus.DoneEntry.Start) + // Timeout is transient — retry without crashing. + // Detailed warning already logged in sendValidations. + log.Trace("advanceValidations: retrying timed-out validation", "pos", pos) } else { v.possiblyFatal(fmt.Errorf("validation: failed entry pos %d, start %v", pos, validationStatus.DoneEntry.Start)) } diff --git a/system_tests/block_validator_timeout_test.go b/system_tests/block_validator_timeout_test.go index 59861d4203..3856ec848f 100644 --- a/system_tests/block_validator_timeout_test.go +++ b/system_tests/block_validator_timeout_test.go @@ -155,4 +155,9 @@ func TestBlockValidatorTimeoutRetry(t *testing.T) { if !testClientB.ConsensusNode.BlockValidator.WaitForPos(t, ctx, arbutil.MessageIndex(block.Uint64()), timeout) { Fatal(t, "did not validate the block - timeout errors should have been retried") } + + // Verify the proxy actually injected timeout errors (remainingTimeouts should be negative). + if remaining := proxy.remainingTimeouts.Load(); remaining >= 0 { + Fatal(t, "proxy did not inject all expected timeouts, remaining:", remaining) + } } diff --git a/validator/retry_wrapper/retry_wrapper.go b/validator/retry_wrapper/retry_wrapper.go index 1ca66c0171..8e580fb777 100644 --- a/validator/retry_wrapper/retry_wrapper.go +++ b/validator/retry_wrapper/retry_wrapper.go @@ -59,6 +59,12 @@ func (v *ValidationSpawnerRetryWrapper) LaunchWithNAllowedAttempts(entry *valida if nonTimeoutAttempts >= allowedAttempts { return validator.GoGlobalState{}, err } + log.Warn("validation attempt failed, retrying", + "err", err, + "moduleRoot", moduleRoot, + "attempt", nonTimeoutAttempts, + "allowedAttempts", allowedAttempts, + ) } }) return server_common.NewValRun(promise, moduleRoot) diff --git a/validator/retry_wrapper/retry_wrapper_test.go b/validator/retry_wrapper/retry_wrapper_test.go new file mode 100644 index 0000000000..fcf84d7758 --- /dev/null +++ b/validator/retry_wrapper/retry_wrapper_test.go @@ -0,0 +1,211 @@ +// Copyright 2025-2026, Offchain Labs, Inc. +// For license information, see https://github.com/OffchainLabs/nitro/blob/master/LICENSE.md + +package retry_wrapper + +import ( + "context" + "errors" + "sync/atomic" + "testing" + + "github.com/ethereum/go-ethereum/common" + "github.com/ethereum/go-ethereum/core/rawdb" + + "github.com/offchainlabs/nitro/util/containers" + "github.com/offchainlabs/nitro/validator" + "github.com/offchainlabs/nitro/validator/server_common" +) + +// testSpawner is a mock ValidationSpawner that returns a sequence of +// predetermined results. Each call to Launch pops the next result. +type testSpawner struct { + results []launchResult + idx atomic.Int64 +} + +type launchResult struct { + state validator.GoGlobalState + err error +} + +func (s *testSpawner) Launch(_ *validator.ValidationInput, moduleRoot common.Hash) validator.ValidationRun { + i := int(s.idx.Add(1) - 1) + if i >= len(s.results) { + panic("testSpawner: too many Launch calls") + } + r := s.results[i] + promise := containers.NewPromise[validator.GoGlobalState](nil) + if r.err != nil { + promise.ProduceError(r.err) + } else { + promise.Produce(r.state) + } + return server_common.NewValRun(&promise, moduleRoot) +} + +func (s *testSpawner) WasmModuleRoots() ([]common.Hash, error) { return nil, nil } +func (s *testSpawner) Start(context.Context) error { return nil } +func (s *testSpawner) Stop() {} +func (s *testSpawner) Name() string { return "test" } +func (s *testSpawner) StylusArchs() []rawdb.WasmTarget { return nil } +func (s *testSpawner) Capacity() int { return 1 } + +var ( + successState = validator.GoGlobalState{Batch: 1, PosInBatch: 1} + testRoot = common.Hash{1} + errTimeout = context.DeadlineExceeded + errGeneric = errors.New("validation failed") +) + +func launchAndAwait(t *testing.T, wrapper *ValidationSpawnerRetryWrapper, allowedAttempts, allowedTimeouts uint64) (validator.GoGlobalState, error) { + t.Helper() + ctx := context.Background() + run := wrapper.LaunchWithNAllowedAttempts(nil, testRoot, allowedAttempts, allowedTimeouts) + return run.Await(ctx) +} + +func setupWrapper(t *testing.T, results []launchResult) *ValidationSpawnerRetryWrapper { + t.Helper() + spawner := &testSpawner{results: results} + wrapper := NewValidationSpawnerRetryWrapper(spawner) + wrapper.StopWaiter.Start(context.Background(), wrapper) + t.Cleanup(func() { wrapper.StopWaiter.StopAndWait() }) + return wrapper +} + +func TestRetryWrapper_SuccessOnFirstAttempt(t *testing.T) { + wrapper := setupWrapper(t, []launchResult{ + {state: successState}, + }) + state, err := launchAndAwait(t, wrapper, 1, 1) + if err != nil { + t.Fatal("unexpected error:", err) + } + if state != successState { + t.Fatalf("unexpected state: got %v, want %v", state, successState) + } +} + +func TestRetryWrapper_TimeoutThenSuccess(t *testing.T) { + wrapper := setupWrapper(t, []launchResult{ + {err: errTimeout}, + {err: errTimeout}, + {state: successState}, + }) + state, err := launchAndAwait(t, wrapper, 1, 3) + if err != nil { + t.Fatal("unexpected error:", err) + } + if state != successState { + t.Fatalf("unexpected state: got %v, want %v", state, successState) + } +} + +func TestRetryWrapper_TimeoutExhausted(t *testing.T) { + wrapper := setupWrapper(t, []launchResult{ + {err: errTimeout}, + {err: errTimeout}, + }) + _, err := launchAndAwait(t, wrapper, 1, 2) + if !errors.Is(err, errTimeout) { + t.Fatalf("expected timeout error, got: %v", err) + } +} + +func TestRetryWrapper_NonTimeoutExhausted(t *testing.T) { + wrapper := setupWrapper(t, []launchResult{ + {err: errGeneric}, + {err: errGeneric}, + }) + _, err := launchAndAwait(t, wrapper, 2, 3) + if err == nil || err.Error() != errGeneric.Error() { + t.Fatalf("expected generic error, got: %v", err) + } +} + +func TestRetryWrapper_CountersAreIndependent(t *testing.T) { + // Mix of timeout and non-timeout errors: each counter should increment independently. + wrapper := setupWrapper(t, []launchResult{ + {err: errTimeout}, // timeoutAttempts=1 + {err: errGeneric}, // nonTimeoutAttempts=1 + {err: errTimeout}, // timeoutAttempts=2 + {state: successState}, + }) + state, err := launchAndAwait(t, wrapper, 2, 3) + if err != nil { + t.Fatal("unexpected error:", err) + } + if state != successState { + t.Fatalf("unexpected state: got %v, want %v", state, successState) + } +} + +func TestRetryWrapper_MixedErrorsTimeoutExhausted(t *testing.T) { + // Non-timeout errors should not count toward timeout budget. + wrapper := setupWrapper(t, []launchResult{ + {err: errGeneric}, // nonTimeoutAttempts=1 + {err: errTimeout}, // timeoutAttempts=1 + {err: errTimeout}, // timeoutAttempts=2, exhausted + }) + _, err := launchAndAwait(t, wrapper, 3, 2) + if !errors.Is(err, errTimeout) { + t.Fatalf("expected timeout error, got: %v", err) + } +} + +func TestRetryWrapper_MixedErrorsNonTimeoutExhausted(t *testing.T) { + // Timeout errors should not count toward non-timeout budget. + wrapper := setupWrapper(t, []launchResult{ + {err: errTimeout}, // timeoutAttempts=1 + {err: errGeneric}, // nonTimeoutAttempts=1 + {err: errGeneric}, // nonTimeoutAttempts=2, exhausted + }) + _, err := launchAndAwait(t, wrapper, 2, 3) + if err == nil || err.Error() != errGeneric.Error() { + t.Fatalf("expected generic error, got: %v", err) + } +} + +func TestRetryWrapper_ZeroAllowedTimeouts(t *testing.T) { + // With 0 allowed timeouts, the first timeout should be immediately fatal. + wrapper := setupWrapper(t, []launchResult{ + {err: errTimeout}, + }) + _, err := launchAndAwait(t, wrapper, 1, 0) + if !errors.Is(err, errTimeout) { + t.Fatalf("expected timeout error, got: %v", err) + } +} + +func TestRetryWrapper_ZeroAllowedAttempts(t *testing.T) { + // With 0 allowed attempts, the first non-timeout error should be immediately fatal. + wrapper := setupWrapper(t, []launchResult{ + {err: errGeneric}, + }) + _, err := launchAndAwait(t, wrapper, 0, 3) + if err == nil || err.Error() != errGeneric.Error() { + t.Fatalf("expected generic error, got: %v", err) + } +} + +func TestRetryWrapper_ContextCanceled(t *testing.T) { + // If the context is canceled, the wrapper should return ctx.Err() rather + // than continuing to retry. + ctx, cancel := context.WithCancel(context.Background()) + cancel() // cancel immediately + + spawner := &testSpawner{results: []launchResult{ + // The spawner's result won't matter — ctx is already canceled. + {err: errTimeout}, + }} + wrapper := NewValidationSpawnerRetryWrapper(spawner) + wrapper.StopWaiter.Start(ctx, wrapper) + defer wrapper.StopWaiter.StopAndWait() + + run := wrapper.LaunchWithNAllowedAttempts(nil, testRoot, 3, 3) + _, err := run.Await(context.Background()) + if err == nil { + t.Fatal("expected error from canceled context") + } +} diff --git a/validator/timeout.go b/validator/timeout.go index 6f09eb3e1b..de86e51fa8 100644 --- a/validator/timeout.go +++ b/validator/timeout.go @@ -45,7 +45,11 @@ func IsTimeoutError(err error) bool { return true } - // String-based detection + // String-based detection for errors serialized through Redis or RPC, + // which lose Go type information. This may produce false positives for + // unrelated errors whose messages contain these substrings, but the + // consequence is only delayed failure detection (extra retries up to the + // configured limit), not silent suppression. errMsg := err.Error() if strings.Contains(errMsg, pubsub.TimeoutErrorMessage) ||