OffchainLabs · joshuacolvin0 · Mar 5, 2026 · pmikolajczyk41 · Mar 5, 2026 · pmikolajczyk41
diff --git a/staker/block_validator.go b/staker/block_validator.go
@@ -912,8 +912,9 @@ func (v *BlockValidator) advanceValidations(ctx context.Context) (*arbutil.Messa
 		}
 		if !validationStatus.DoneEntry.Success {
 			if validationStatus.DoneEntry.Timeout {
-				// Timeout is transient — log a warning and retry without crashing.
-				log.Warn("validation timed out, scheduling retry", "pos", pos, "start", validationStatus.DoneEntry.Start)
+				// Timeout is transient — retry without crashing.
+				// Detailed warning already logged in sendValidations.
+				log.Trace("advanceValidations: retrying timed-out validation", "pos", pos)
 			} else {
 				v.possiblyFatal(fmt.Errorf("validation: failed entry pos %d, start %v", pos, validationStatus.DoneEntry.Start))
 			}

diff --git a/system_tests/block_validator_timeout_test.go b/system_tests/block_validator_timeout_test.go
@@ -155,4 +155,9 @@ func TestBlockValidatorTimeoutRetry(t *testing.T) {
 	if !testClientB.ConsensusNode.BlockValidator.WaitForPos(t, ctx, arbutil.MessageIndex(block.Uint64()), timeout) {
 		Fatal(t, "did not validate the block - timeout errors should have been retried")
 	}
+
+	// Verify the proxy actually injected timeout errors (remainingTimeouts should be negative).
+	if remaining := proxy.remainingTimeouts.Load(); remaining >= 0 {
+		Fatal(t, "proxy did not inject all expected timeouts, remaining:", remaining)
+	}
 }
diff --git a/validator/retry_wrapper/retry_wrapper.go b/validator/retry_wrapper/retry_wrapper.go
@@ -59,6 +59,12 @@ func (v *ValidationSpawnerRetryWrapper) LaunchWithNAllowedAttempts(entry *valida
 			if nonTimeoutAttempts >= allowedAttempts {
 				return validator.GoGlobalState{}, err
 			}
+			log.Warn("validation attempt failed, retrying",
+				"err", err,
+				"moduleRoot", moduleRoot,
+				"attempt", nonTimeoutAttempts,
+				"allowedAttempts", allowedAttempts,
+			)
 		}
 	})
 	return server_common.NewValRun(promise, moduleRoot)

diff --git a/validator/retry_wrapper/retry_wrapper_test.go b/validator/retry_wrapper/retry_wrapper_test.go
@@ -0,0 +1,211 @@
+// Copyright 2025-2026, Offchain Labs, Inc.
+// For license information, see https://github.com/OffchainLabs/nitro/blob/master/LICENSE.md
+
+package retry_wrapper
+
+import (
+	"context"
+	"errors"
+	"sync/atomic"
+	"testing"
+
+	"github.com/ethereum/go-ethereum/common"
+	"github.com/ethereum/go-ethereum/core/rawdb"
+
+	"github.com/offchainlabs/nitro/util/containers"
+	"github.com/offchainlabs/nitro/validator"
+	"github.com/offchainlabs/nitro/validator/server_common"
+)
+
+// testSpawner is a mock ValidationSpawner that returns a sequence of
+// predetermined results. Each call to Launch pops the next result.
+type testSpawner struct {
+	results []launchResult
+	idx     atomic.Int64
+}
+
+type launchResult struct {
+	state validator.GoGlobalState
+	err   error
+}
+
+func (s *testSpawner) Launch(_ *validator.ValidationInput, moduleRoot common.Hash) validator.ValidationRun {
+	i := int(s.idx.Add(1) - 1)
+	if i >= len(s.results) {
+		panic("testSpawner: too many Launch calls")
+	}
+	r := s.results[i]
+	promise := containers.NewPromise[validator.GoGlobalState](nil)
+	if r.err != nil {
+		promise.ProduceError(r.err)
+	} else {
+		promise.Produce(r.state)
+	}
-	if r.err != nil {
-		promise.ProduceError(r.err)
-	} else {
-		promise.Produce(r.state)
-	}
+	promise.ProduceResult(r.state, r.err)
-	if r.err != nil {
-		promise.ProduceError(r.err)
-	} else {
-		promise.Produce(r.state)
-	}
+	promise.ProduceResult(r.state, r.err)
+	return server_common.NewValRun(&promise, moduleRoot)
+}
+
+func (s *testSpawner) WasmModuleRoots() ([]common.Hash, error) { return nil, nil }
+func (s *testSpawner) Start(context.Context) error              { return nil }
+func (s *testSpawner) Stop()                                    {}
+func (s *testSpawner) Name() string                             { return "test" }
+func (s *testSpawner) StylusArchs() []rawdb.WasmTarget          { return nil }
+func (s *testSpawner) Capacity() int                            { return 1 }
-func (s *testSpawner) WasmModuleRoots() ([]common.Hash, error) { return nil, nil }
-func (s *testSpawner) Start(context.Context) error              { return nil }
-func (s *testSpawner) Stop()                                    {}
-func (s *testSpawner) Name() string                             { return "test" }
-func (s *testSpawner) StylusArchs() []rawdb.WasmTarget          { return nil }
-func (s *testSpawner) Capacity() int                            { return 1 }
+func (s *testSpawner) WasmModuleRoots() ([]common.Hash, error) { return nil, nil }
+func (s *testSpawner) Start(context.Context) error             { return nil }
+func (s *testSpawner) Stop()                                   {}
+func (s *testSpawner) Name() string                            { return "test" }
+func (s *testSpawner) StylusArchs() []rawdb.WasmTarget         { return nil }
+func (s *testSpawner) Capacity() int                           { return 1 }
-func (s *testSpawner) WasmModuleRoots() ([]common.Hash, error) { return nil, nil }
-func (s *testSpawner) Start(context.Context) error              { return nil }
-func (s *testSpawner) Stop()                                    {}
-func (s *testSpawner) Name() string                             { return "test" }
-func (s *testSpawner) StylusArchs() []rawdb.WasmTarget          { return nil }
-func (s *testSpawner) Capacity() int                            { return 1 }
+func (s *testSpawner) WasmModuleRoots() ([]common.Hash, error) { return nil, nil }
+func (s *testSpawner) Start(context.Context) error             { return nil }
+func (s *testSpawner) Stop()                                   {}
+func (s *testSpawner) Name() string                            { return "test" }
+func (s *testSpawner) StylusArchs() []rawdb.WasmTarget         { return nil }
+func (s *testSpawner) Capacity() int                           { return 1 }
+
+var (
+	successState = validator.GoGlobalState{Batch: 1, PosInBatch: 1}
+	testRoot     = common.Hash{1}
+	errTimeout   = context.DeadlineExceeded
+	errGeneric   = errors.New("validation failed")
+)
+
+func launchAndAwait(t *testing.T, wrapper *ValidationSpawnerRetryWrapper, allowedAttempts, allowedTimeouts uint64) (validator.GoGlobalState, error) {
+	t.Helper()
+	ctx := context.Background()
+	run := wrapper.LaunchWithNAllowedAttempts(nil, testRoot, allowedAttempts, allowedTimeouts)
+	return run.Await(ctx)
+}
+
+func setupWrapper(t *testing.T, results []launchResult) *ValidationSpawnerRetryWrapper {
+	t.Helper()
+	spawner := &testSpawner{results: results}
+	wrapper := NewValidationSpawnerRetryWrapper(spawner)
+	wrapper.StopWaiter.Start(context.Background(), wrapper)
+	t.Cleanup(func() { wrapper.StopWaiter.StopAndWait() })
+	return wrapper
+}
+
+func TestRetryWrapper_SuccessOnFirstAttempt(t *testing.T) {
+	wrapper := setupWrapper(t, []launchResult{
+		{state: successState},
+	})
+	state, err := launchAndAwait(t, wrapper, 1, 1)
+	if err != nil {
+		t.Fatal("unexpected error:", err)
+	}
+	if state != successState {
+		t.Fatalf("unexpected state: got %v, want %v", state, successState)
+	}
+}
+
+func TestRetryWrapper_TimeoutThenSuccess(t *testing.T) {
+	wrapper := setupWrapper(t, []launchResult{
+		{err: errTimeout},
+		{err: errTimeout},
+		{state: successState},
+	})
+	state, err := launchAndAwait(t, wrapper, 1, 3)
+	if err != nil {
+		t.Fatal("unexpected error:", err)
+	}
+	if state != successState {
+		t.Fatalf("unexpected state: got %v, want %v", state, successState)
+	}
+}
+
+func TestRetryWrapper_TimeoutExhausted(t *testing.T) {
+	wrapper := setupWrapper(t, []launchResult{
+		{err: errTimeout},
+		{err: errTimeout},
+	})
+	_, err := launchAndAwait(t, wrapper, 1, 2)
+	if !errors.Is(err, errTimeout) {
-	if !errors.Is(err, errTimeout) {
+	if !validator.IsTimeoutError(err) {
-	if !errors.Is(err, errTimeout) {
+	if !validator.IsTimeoutError(err) {
+		t.Fatalf("expected timeout error, got: %v", err)
+	}
+}
+
+func TestRetryWrapper_NonTimeoutExhausted(t *testing.T) {
+	wrapper := setupWrapper(t, []launchResult{
+		{err: errGeneric},
+		{err: errGeneric},
+	})
+	_, err := launchAndAwait(t, wrapper, 2, 3)
+	if err == nil || err.Error() != errGeneric.Error() {
+		t.Fatalf("expected generic error, got: %v", err)
+	}
+}
+
+func TestRetryWrapper_CountersAreIndependent(t *testing.T) {
+	// Mix of timeout and non-timeout errors: each counter should increment independently.
+	wrapper := setupWrapper(t, []launchResult{
+		{err: errTimeout},  // timeoutAttempts=1
+		{err: errGeneric},  // nonTimeoutAttempts=1
+		{err: errTimeout},  // timeoutAttempts=2
+		{state: successState},
+	})
+	state, err := launchAndAwait(t, wrapper, 2, 3)
+	if err != nil {
+		t.Fatal("unexpected error:", err)
+	}
+	if state != successState {
+		t.Fatalf("unexpected state: got %v, want %v", state, successState)
+	}
+}
+
+func TestRetryWrapper_MixedErrorsTimeoutExhausted(t *testing.T) {
+	// Non-timeout errors should not count toward timeout budget.
+	wrapper := setupWrapper(t, []launchResult{
+		{err: errGeneric},  // nonTimeoutAttempts=1
+		{err: errTimeout},  // timeoutAttempts=1
+		{err: errTimeout},  // timeoutAttempts=2, exhausted
+	})
+	_, err := launchAndAwait(t, wrapper, 3, 2)
+	if !errors.Is(err, errTimeout) {
+		t.Fatalf("expected timeout error, got: %v", err)
+	}
+}
+
+func TestRetryWrapper_MixedErrorsNonTimeoutExhausted(t *testing.T) {
+	// Timeout errors should not count toward non-timeout budget.
+	wrapper := setupWrapper(t, []launchResult{
+		{err: errTimeout},  // timeoutAttempts=1
+		{err: errGeneric},  // nonTimeoutAttempts=1
+		{err: errGeneric},  // nonTimeoutAttempts=2, exhausted
+	})
+	_, err := launchAndAwait(t, wrapper, 2, 3)
+	if err == nil || err.Error() != errGeneric.Error() {
+		t.Fatalf("expected generic error, got: %v", err)
+	}
+}
+
+func TestRetryWrapper_ZeroAllowedTimeouts(t *testing.T) {
+	// With 0 allowed timeouts, the first timeout should be immediately fatal.
+	wrapper := setupWrapper(t, []launchResult{
+		{err: errTimeout},
+	})
+	_, err := launchAndAwait(t, wrapper, 1, 0)
+	if !errors.Is(err, errTimeout) {
+		t.Fatalf("expected timeout error, got: %v", err)
+	}
+}
+
+func TestRetryWrapper_ZeroAllowedAttempts(t *testing.T) {
+	// With 0 allowed attempts, the first non-timeout error should be immediately fatal.
+	wrapper := setupWrapper(t, []launchResult{
+		{err: errGeneric},
+	})
+	_, err := launchAndAwait(t, wrapper, 0, 3)
+	if err == nil || err.Error() != errGeneric.Error() {
+		t.Fatalf("expected generic error, got: %v", err)
+	}
+}
+
+func TestRetryWrapper_ContextCanceled(t *testing.T) {
+	// If the context is canceled, the wrapper should return ctx.Err() rather
+	// than continuing to retry.
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel() // cancel immediately
+
+	spawner := &testSpawner{results: []launchResult{
+		// The spawner's result won't matter — ctx is already canceled.
+		{err: errTimeout},
+	}}
+	wrapper := NewValidationSpawnerRetryWrapper(spawner)
+	wrapper.StopWaiter.Start(ctx, wrapper)
+	defer wrapper.StopWaiter.StopAndWait()
+
+	run := wrapper.LaunchWithNAllowedAttempts(nil, testRoot, 3, 3)
+	_, err := run.Await(context.Background())
+	if err == nil {
+		t.Fatal("expected error from canceled context")
+	}
+}
diff --git a/validator/timeout.go b/validator/timeout.go
@@ -45,7 +45,11 @@ func IsTimeoutError(err error) bool {
 		return true
 	}
 
-	// String-based detection
+	// String-based detection for errors serialized through Redis or RPC,
+	// which lose Go type information. This may produce false positives for
+	// unrelated errors whose messages contain these substrings, but the
+	// consequence is only delayed failure detection (extra retries up to the
+	// configured limit), not silent suppression.
 
 	errMsg := err.Error()
 	if strings.Contains(errMsg, pubsub.TimeoutErrorMessage) ||