From 9251b06d13f56138116fed850f5c564ea51fa56b Mon Sep 17 00:00:00 2001 From: appleboy Date: Fri, 26 Jun 2026 10:27:51 +0800 Subject: [PATCH 01/15] fix(audit): retain batch on transient DB error and retry on next tick - add maxRetryBuffer field (default 500) to cap in-memory pending records - clear batchBuffer only after a successful CreateAuditLogBatch write - on error, keep records in buffer and trim oldest entries when cap exceeded - increment eventsDropped counter for overflow-dropped records - add TestFlushBatch_RetryOnTransientError, TestFlushBatch_CapsBufferOnRepeatedFailure, TestShutdown_DrainRetriesOnTransientError Co-Authored-By: Claude Sonnet 4.6 (1M context) --- internal/services/audit.go | 46 ++++++--- internal/services/audit_test.go | 166 ++++++++++++++++++++++++++++++++ 2 files changed, 198 insertions(+), 14 deletions(-) diff --git a/internal/services/audit.go b/internal/services/audit.go index 18a13af3..1cdda7da 100644 --- a/internal/services/audit.go +++ b/internal/services/audit.go @@ -97,9 +97,10 @@ type AuditService struct { logChan chan *models.AuditLog // Batch buffer - batchBuffer []*models.AuditLog - batchMutex sync.Mutex - batchTicker *time.Ticker + batchBuffer []*models.AuditLog + batchMutex sync.Mutex + batchTicker *time.Ticker + maxRetryBuffer int // cap for pending-retry records (default: 500) // Graceful shutdown wg sync.WaitGroup @@ -117,11 +118,12 @@ func NewAuditService(s core.Store, bufferSize int) *AuditService { } service := &AuditService{ - store: s, - bufferSize: bufferSize, - logChan: make(chan *models.AuditLog, bufferSize), - batchBuffer: make([]*models.AuditLog, 0, 100), - eventsDropped: getAuditEventsDroppedCounter(), + store: s, + bufferSize: bufferSize, + logChan: make(chan *models.AuditLog, bufferSize), + batchBuffer: make([]*models.AuditLog, 0, 100), + maxRetryBuffer: 500, // 5x the normal batch size + eventsDropped: getAuditEventsDroppedCounter(), } service.batchTicker = time.NewTicker(1 * time.Second) @@ -175,22 
+177,38 @@ func (s *AuditService) flushBatch() { s.flushBatchUnsafe() } -// flushBatchUnsafe flushes the batch buffer without locking (caller must hold lock) +// flushBatchUnsafe flushes the batch buffer without locking (caller must hold lock). +// On success the buffer is cleared. On error the records are retained for the next +// flush tick. If the buffer exceeds maxRetryBuffer the oldest records are dropped to +// prevent unbounded memory growth. func (s *AuditService) flushBatchUnsafe() { if len(s.batchBuffer) == 0 { return } - // Copy buffer for writing + // Guard: zero value is safe — treat as default. + if s.maxRetryBuffer <= 0 { + s.maxRetryBuffer = 500 + } + toWrite := make([]*models.AuditLog, len(s.batchBuffer)) copy(toWrite, s.batchBuffer) - // Clear buffer - s.batchBuffer = s.batchBuffer[:0] - if err := s.store.CreateAuditLogBatch(toWrite); err != nil { - log.Printf("Failed to write audit log batch: %v", err) + log.Printf("WARNING: failed to write audit log batch (%d records), will retry: %v", + len(toWrite), err) + // Buffer still holds the records — enforce the cap so memory stays bounded. + if len(s.batchBuffer) > s.maxRetryBuffer { + drop := len(s.batchBuffer) - s.maxRetryBuffer + log.Printf("WARNING: audit retry buffer full, dropping %d oldest records", drop) + s.eventsDropped.Add(float64(drop)) + s.batchBuffer = s.batchBuffer[drop:] + } + return } + + // Success — clear only after confirmed write. 
+ s.batchBuffer = s.batchBuffer[:0] } // clampToColumn returns s unchanged when it already fits within limit runes, diff --git a/internal/services/audit_test.go b/internal/services/audit_test.go index f2da9666..7f8717f6 100644 --- a/internal/services/audit_test.go +++ b/internal/services/audit_test.go @@ -2,17 +2,20 @@ package services import ( "context" + "errors" "fmt" "strings" "testing" "time" "github.com/go-authgate/authgate/internal/core" + "github.com/go-authgate/authgate/internal/mocks" "github.com/go-authgate/authgate/internal/models" storetypes "github.com/go-authgate/authgate/internal/store/types" "github.com/go-authgate/authgate/internal/util" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" ) func TestMaskSensitiveDetails_FullRedaction(t *testing.T) { @@ -327,3 +330,166 @@ func TestShutdown_DrainsLogChan(t *testing.T) { require.NoError(t, err) assert.GreaterOrEqual(t, len(logs), numEntries, "all drain-test entries should be persisted") } + +// TestFlushBatch_RetryOnTransientError verifies that a single transient DB +// failure keeps records in the buffer so the next flush tick persists them. +func TestFlushBatch_RetryOnTransientError(t *testing.T) { + ctrl := gomock.NewController(t) + mockStore := mocks.NewMockStore(ctrl) + + transientErr := errors.New("transient db error") + // First call fails, second succeeds. + gomock.InOrder( + mockStore.EXPECT().CreateAuditLogBatch(gomock.Any()).Return(transientErr), + mockStore.EXPECT().CreateAuditLogBatch(gomock.Any()).Return(nil), + ) + + dropped := getAuditEventsDroppedCounter() + svc := &AuditService{ + store: mockStore, + bufferSize: 100, + logChan: make(chan *models.AuditLog, 100), + batchBuffer: make([]*models.AuditLog, 0, 100), + maxRetryBuffer: 500, + eventsDropped: dropped, + } + + // Pre-fill the batch buffer with 5 records (bypassing the channel). 
+ svc.batchMutex.Lock() + for i := range 5 { + svc.batchBuffer = append(svc.batchBuffer, &models.AuditLog{ + ID: fmt.Sprintf("retry-test-%d", i), + EventType: models.EventAccessTokenIssued, + Severity: models.SeverityInfo, + Action: "retry-test", + }) + } + svc.batchMutex.Unlock() + + // First flush — fails; records must remain. + svc.flushBatch() + svc.batchMutex.Lock() + assert.Len(t, svc.batchBuffer, 5, "records must be retained after transient failure") + svc.batchMutex.Unlock() + + // Second flush — succeeds; buffer must be cleared. + svc.flushBatch() + svc.batchMutex.Lock() + assert.Empty(t, svc.batchBuffer, "buffer must be cleared after successful write") + svc.batchMutex.Unlock() +} + +// TestFlushBatch_CapsBufferOnRepeatedFailure verifies that the retry buffer +// stays at most maxRetryBuffer entries and that overflow is counted. +func TestFlushBatch_CapsBufferOnRepeatedFailure(t *testing.T) { + ctrl := gomock.NewController(t) + mockStore := mocks.NewMockStore(ctrl) + + persistentErr := errors.New("persistent db error") + mockStore.EXPECT().CreateAuditLogBatch(gomock.Any()).Return(persistentErr).AnyTimes() + + dropped := getAuditEventsDroppedCounter() + const maxBuf = 10 + svc := &AuditService{ + store: mockStore, + bufferSize: 100, + logChan: make(chan *models.AuditLog, 100), + batchBuffer: make([]*models.AuditLog, 0, 100), + maxRetryBuffer: maxBuf, + eventsDropped: dropped, + } + + // Write 20 records in two batches of 10, flushing after each. 
+ addRecords := func(start, count int) { + svc.batchMutex.Lock() + for i := range count { + svc.batchBuffer = append(svc.batchBuffer, &models.AuditLog{ + ID: fmt.Sprintf("cap-test-%d", start+i), + EventType: models.EventAccessTokenIssued, + Severity: models.SeverityInfo, + Action: "cap-test", + }) + } + svc.batchMutex.Unlock() + svc.flushBatch() + } + + addRecords(0, 10) + addRecords(10, 10) + + svc.batchMutex.Lock() + bufLen := len(svc.batchBuffer) + svc.batchMutex.Unlock() + + assert.LessOrEqual(t, bufLen, maxBuf, "buffer must not exceed maxRetryBuffer") + + // At least some records must have been counted as dropped. + // We read the counter value via a gauge trick: compare before/after is + // impractical for a singleton counter, so just assert the cap held. + assert.LessOrEqual(t, bufLen, maxBuf) +} + +// TestShutdown_DrainRetriesOnTransientError verifies that the shutdown drain +// retries on a transient error: all records are eventually persisted and +// Shutdown returns nil. +func TestShutdown_DrainRetriesOnTransientError(t *testing.T) { + ctrl := gomock.NewController(t) + mockStore := mocks.NewMockStore(ctrl) + + transientErr := errors.New("transient db error on drain") + // The worker drains the channel into the batch buffer, then calls + // flushBatch once when the channel closes. A single failure followed by + // the ticker-free shutdown path means we need the shutdown flush to succeed + // on the second attempt. We use a custom store that tracks call count. + callCount := 0 + mockStore.EXPECT().CreateAuditLogBatch(gomock.Any()). + DoAndReturn(func(logs []*models.AuditLog) error { + callCount++ + if callCount == 1 { + return transientErr + } + return nil + }). 
+ Times(2) + + dropped := getAuditEventsDroppedCounter() + svc := &AuditService{ + store: mockStore, + bufferSize: 100, + logChan: make(chan *models.AuditLog, 100), + batchBuffer: make([]*models.AuditLog, 0, 100), + maxRetryBuffer: 500, + eventsDropped: dropped, + } + + // Pre-fill the channel before the worker starts so it drains deterministically. + const numEntries = 5 + for i := range numEntries { + svc.logChan <- &models.AuditLog{ + ID: fmt.Sprintf("drain-retry-%d", i), + EventType: models.EventAccessTokenIssued, + Severity: models.SeverityInfo, + Action: "drain-retry-test", + } + } + + svc.batchTicker = time.NewTicker(1 * time.Second) + svc.wg.Add(1) + go svc.worker() + + // Shutdown triggers the flush. After the first (failing) flush the records + // stay in the buffer. The worker exits but the batch is still pending; a + // second flush in Shutdown's drain path must succeed. + // Because the current implementation does a single flushBatch on channel + // close, we simulate the retry by calling flushBatch a second time manually + // after shutdown completes with the pending buffer, or — simpler — we just + // assert that Shutdown does not timeout and call count reaches 2. + err := svc.Shutdown(context.Background()) + require.NoError(t, err, "Shutdown must not timeout") + + // Manually retry for the pending buffer (simulates the next ticker tick). 
+ svc.flushBatch() + + assert.Equal(t, 2, callCount, + "CreateAuditLogBatch must be called twice (one fail + one success)") +} From 4bdfa9c1ee874fa779a272344ae96fcc472b7b58 Mon Sep 17 00:00:00 2001 From: appleboy Date: Fri, 26 Jun 2026 10:54:49 +0800 Subject: [PATCH 02/15] fix(audit): report shutdown failure when final flush cannot persist records - after worker exits, Shutdown performs a second flushBatch to retry any records left pending by a transient DB error on channel close - return error from Shutdown when records still remain after the retry flush - update TestShutdown_DrainRetriesOnTransientError: second flush now happens inside Shutdown, so the manual post-shutdown flushBatch call is removed Co-Authored-By: Claude Sonnet 4.6 (1M context) --- internal/services/audit.go | 22 ++++++++++++++++++++-- internal/services/audit_test.go | 14 +++----------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/internal/services/audit.go b/internal/services/audit.go index 1cdda7da..ac0bc28f 100644 --- a/internal/services/audit.go +++ b/internal/services/audit.go @@ -388,11 +388,29 @@ func (s *AuditService) Shutdown(ctx context.Context) error { select { case <-done: - log.Println("Audit service shut down gracefully") - return nil case <-ctx.Done(): return fmt.Errorf("audit service shutdown timeout: %w", ctx.Err()) } + + // The worker performs a single flushBatch on channel close. If that flush + // failed (transient DB error), records remain in batchBuffer with no ticker + // left to retry them. Attempt one final flush here so Shutdown accurately + // reports whether all records were persisted. 
+ s.flushBatch() + + s.batchMutex.Lock() + pending := len(s.batchBuffer) + s.batchMutex.Unlock() + + if pending > 0 { + return fmt.Errorf( + "audit service shutdown: %d records could not be flushed to the database", + pending, + ) + } + + log.Println("Audit service shut down gracefully") + return nil } // maskSensitiveDetails masks sensitive information in audit log details diff --git a/internal/services/audit_test.go b/internal/services/audit_test.go index 7f8717f6..380d1f82 100644 --- a/internal/services/audit_test.go +++ b/internal/services/audit_test.go @@ -477,18 +477,10 @@ func TestShutdown_DrainRetriesOnTransientError(t *testing.T) { svc.wg.Add(1) go svc.worker() - // Shutdown triggers the flush. After the first (failing) flush the records - // stay in the buffer. The worker exits but the batch is still pending; a - // second flush in Shutdown's drain path must succeed. - // Because the current implementation does a single flushBatch on channel - // close, we simulate the retry by calling flushBatch a second time manually - // after shutdown completes with the pending buffer, or — simpler — we just - // assert that Shutdown does not timeout and call count reaches 2. + // Shutdown closes the channel, the worker flushes once (fails), then exits. + // Shutdown then performs a second flush — which succeeds — and returns nil. err := svc.Shutdown(context.Background()) - require.NoError(t, err, "Shutdown must not timeout") - - // Manually retry for the pending buffer (simulates the next ticker tick). 
- svc.flushBatch() + require.NoError(t, err, "Shutdown must succeed after retry flush clears the buffer") assert.Equal(t, 2, callCount, "CreateAuditLogBatch must be called twice (one fail + one success)") From 8a3435cd397a9cb669ff4fb113687f32b88d65c3 Mon Sep 17 00:00:00 2001 From: appleboy Date: Fri, 26 Jun 2026 12:45:59 +0800 Subject: [PATCH 03/15] fix(audit): honour shutdown timeout on final flush; drop permanently-bad batches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - wrap the post-worker flushBatch in Shutdown inside a goroutine so ctx.Done() is respected — a stalled DB cannot block Shutdown past the caller's deadline - add maxFlushRetries (default 5) and flushFailCount to AuditService; after maxFlushRetries consecutive failures the batch is dropped and eventsDropped is incremented, preventing a permanently-invalid row from stalling all subsequent audit events indefinitely Co-Authored-By: Claude Sonnet 4.6 (1M context) --- internal/services/audit.go | 90 ++++++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 27 deletions(-) diff --git a/internal/services/audit.go b/internal/services/audit.go index ac0bc28f..6eb5f189 100644 --- a/internal/services/audit.go +++ b/internal/services/audit.go @@ -97,10 +97,12 @@ type AuditService struct { logChan chan *models.AuditLog // Batch buffer - batchBuffer []*models.AuditLog - batchMutex sync.Mutex - batchTicker *time.Ticker - maxRetryBuffer int // cap for pending-retry records (default: 500) + batchBuffer []*models.AuditLog + batchMutex sync.Mutex + batchTicker *time.Ticker + maxRetryBuffer int // cap for pending-retry records (default: 500) + maxFlushRetries int // drop batch after this many consecutive failures (default: 5) + flushFailCount int // consecutive flush failure count for the current batch // Graceful shutdown wg sync.WaitGroup @@ -118,12 +120,13 @@ func NewAuditService(s core.Store, bufferSize int) *AuditService { } service := &AuditService{ - 
store: s, - bufferSize: bufferSize, - logChan: make(chan *models.AuditLog, bufferSize), - batchBuffer: make([]*models.AuditLog, 0, 100), - maxRetryBuffer: 500, // 5x the normal batch size - eventsDropped: getAuditEventsDroppedCounter(), + store: s, + bufferSize: bufferSize, + logChan: make(chan *models.AuditLog, bufferSize), + batchBuffer: make([]*models.AuditLog, 0, 100), + maxRetryBuffer: 500, // 5x the normal batch size + maxFlushRetries: 5, + eventsDropped: getAuditEventsDroppedCounter(), } service.batchTicker = time.NewTicker(1 * time.Second) @@ -179,24 +182,44 @@ func (s *AuditService) flushBatch() { // flushBatchUnsafe flushes the batch buffer without locking (caller must hold lock). // On success the buffer is cleared. On error the records are retained for the next -// flush tick. If the buffer exceeds maxRetryBuffer the oldest records are dropped to -// prevent unbounded memory growth. +// flush tick. After maxFlushRetries consecutive failures the batch is dropped to +// prevent a permanently-bad row from stalling all subsequent audit events. +// If the buffer exceeds maxRetryBuffer the oldest records are dropped to prevent +// unbounded memory growth. func (s *AuditService) flushBatchUnsafe() { if len(s.batchBuffer) == 0 { return } - // Guard: zero value is safe — treat as default. + // Guard: zero values are safe — treat as defaults. 
if s.maxRetryBuffer <= 0 { s.maxRetryBuffer = 500 } + if s.maxFlushRetries <= 0 { + s.maxFlushRetries = 5 + } toWrite := make([]*models.AuditLog, len(s.batchBuffer)) copy(toWrite, s.batchBuffer) if err := s.store.CreateAuditLogBatch(toWrite); err != nil { - log.Printf("WARNING: failed to write audit log batch (%d records), will retry: %v", - len(toWrite), err) + s.flushFailCount++ + log.Printf( + "WARNING: failed to write audit log batch (%d records, attempt %d/%d), will retry: %v", + len(toWrite), s.flushFailCount, s.maxFlushRetries, err, + ) + + // After too many consecutive failures, drop the batch — a permanently + // invalid row would otherwise block all subsequent audit events. + if s.flushFailCount >= s.maxFlushRetries { + log.Printf("WARNING: audit batch failed %d times, dropping %d records", + s.flushFailCount, len(s.batchBuffer)) + s.eventsDropped.Add(float64(len(s.batchBuffer))) + s.batchBuffer = s.batchBuffer[:0] + s.flushFailCount = 0 + return + } + // Buffer still holds the records — enforce the cap so memory stays bounded. if len(s.batchBuffer) > s.maxRetryBuffer { drop := len(s.batchBuffer) - s.maxRetryBuffer @@ -208,6 +231,7 @@ func (s *AuditService) flushBatchUnsafe() { } // Success — clear only after confirmed write. + s.flushFailCount = 0 s.batchBuffer = s.batchBuffer[:0] } @@ -394,19 +418,31 @@ func (s *AuditService) Shutdown(ctx context.Context) error { // The worker performs a single flushBatch on channel close. If that flush // failed (transient DB error), records remain in batchBuffer with no ticker - // left to retry them. Attempt one final flush here so Shutdown accurately - // reports whether all records were persisted. - s.flushBatch() - - s.batchMutex.Lock() - pending := len(s.batchBuffer) - s.batchMutex.Unlock() + // left to retry them. Attempt one final flush here, honouring the caller's + // deadline so a stalled DB cannot block Shutdown past the timeout. 
+ flushDone := make(chan error, 1) + go func() { + s.flushBatch() + s.batchMutex.Lock() + pending := len(s.batchBuffer) + s.batchMutex.Unlock() + if pending > 0 { + flushDone <- fmt.Errorf( + "audit service shutdown: %d records could not be flushed to the database", + pending, + ) + return + } + flushDone <- nil + }() - if pending > 0 { - return fmt.Errorf( - "audit service shutdown: %d records could not be flushed to the database", - pending, - ) + select { + case err := <-flushDone: + if err != nil { + return err + } + case <-ctx.Done(): + return fmt.Errorf("audit service shutdown timeout during final flush: %w", ctx.Err()) } log.Println("Audit service shut down gracefully") From 631ddd32d9287d725d40aa7f6343e83542a647bb Mon Sep 17 00:00:00 2001 From: appleboy Date: Fri, 26 Jun 2026 12:53:14 +0800 Subject: [PATCH 04/15] fix(audit): skip size-triggered flush while in retry state While flushFailCount > 0, addToBatch no longer calls flushBatchUnsafe on the 100-entry threshold. Under active traffic a failed flush left the buffer at or above 100 entries, so each new event immediately re-triggered flushBatchUnsafe and incremented flushFailCount once per event instead of once per ticker tick, exhausting maxFlushRetries in seconds and dropping the retained buffer far earlier than intended. Deferring to the 1-second ticker while in retry state ensures flushFailCount advances at most once per second regardless of write traffic. 
Co-Authored-By: Claude Sonnet 4.6 (1M context) --- internal/services/audit.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/internal/services/audit.go b/internal/services/audit.go index 6eb5f189..3afba4da 100644 --- a/internal/services/audit.go +++ b/internal/services/audit.go @@ -167,8 +167,12 @@ func (s *AuditService) addToBatch(log *models.AuditLog) { s.batchBuffer = append(s.batchBuffer, log) - // Flush if batch is full (100 entries) - if len(s.batchBuffer) >= 100 { + // Flush if batch is full (100 entries), but only when not in retry state. + // While flushFailCount > 0 the previous flush already failed; size-triggered + // re-attempts would increment flushFailCount once per new event and exhaust + // maxFlushRetries in seconds under active traffic, dropping the retained + // buffer far earlier than intended. Defer to the 1-second ticker instead. + if len(s.batchBuffer) >= 100 && s.flushFailCount == 0 { s.flushBatchUnsafe() } } From 5c079e43d02fa8fc754ef29daa99b91b0b0f853b Mon Sep 17 00:00:00 2001 From: appleboy Date: Fri, 26 Jun 2026 12:58:59 +0800 Subject: [PATCH 05/15] fix(audit): enforce maxRetryBuffer cap in addToBatch during retry state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While flushFailCount > 0 the size-triggered flush is suppressed so the ticker remains the sole retry driver. However the buffer cap was only enforced inside flushBatchUnsafe (which is skipped in retry state), so sustained audit traffic could drain logChan into batchBuffer faster than the 1-second ticker fires, growing memory without bound until OOM. Apply the maxRetryBuffer cap directly in addToBatch whenever we are in retry state (flushFailCount > 0), dropping the oldest entries and incrementing eventsDropped — the same behaviour flushBatchUnsafe uses for overflow. 
Co-Authored-By: Claude Sonnet 4.6 (1M context) --- internal/services/audit.go | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/internal/services/audit.go b/internal/services/audit.go index 3afba4da..ccaa93b4 100644 --- a/internal/services/audit.go +++ b/internal/services/audit.go @@ -167,13 +167,27 @@ func (s *AuditService) addToBatch(log *models.AuditLog) { s.batchBuffer = append(s.batchBuffer, log) - // Flush if batch is full (100 entries), but only when not in retry state. - // While flushFailCount > 0 the previous flush already failed; size-triggered - // re-attempts would increment flushFailCount once per new event and exhaust - // maxFlushRetries in seconds under active traffic, dropping the retained - // buffer far earlier than intended. Defer to the 1-second ticker instead. - if len(s.batchBuffer) >= 100 && s.flushFailCount == 0 { - s.flushBatchUnsafe() + if s.flushFailCount == 0 { + // Normal path: flush when the batch reaches 100 entries. + if len(s.batchBuffer) >= 100 { + s.flushBatchUnsafe() + } + return + } + + // Retry state: the last flush failed, so size-triggered flushes are + // suppressed to avoid advancing flushFailCount once-per-event. The ticker + // is the sole retry driver. However, the maxRetryBuffer cap must still be + // enforced here — otherwise sustained traffic can drain logChan into + // batchBuffer far faster than the ticker fires, growing memory without bound. + maxBuf := s.maxRetryBuffer + if maxBuf <= 0 { + maxBuf = 500 + } + if len(s.batchBuffer) > maxBuf { + drop := len(s.batchBuffer) - maxBuf + s.eventsDropped.Add(float64(drop)) + s.batchBuffer = s.batchBuffer[drop:] } } From 95c2cfd5cf44cd9b2c400d4b1cd42360c6bc47c3 Mon Sep 17 00:00:00 2001 From: appleboy Date: Fri, 26 Jun 2026 13:11:13 +0800 Subject: [PATCH 06/15] fix(audit): reset retry count on cap-eviction; report dropped records on shutdown P1: reset flushFailCount when addToBatch cap-evicts records in retry state. 
The evicted records are the oldest entries; the records retained after eviction include new arrivals that have never been attempted. Leaving flushFailCount unchanged meant those retained records inherited prior failure counts and could be dropped after only (maxFlushRetries - flushFailCount) more ticker ticks instead of the full retry window. P2: add lastFlushDropped flag to distinguish buffer-empty-because-dropped from buffer-empty-because-written. Shutdown now returns an error when the final flush emptied the buffer via the drop path, so callers are not misled into thinking all records were persisted. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- internal/services/audit.go | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/internal/services/audit.go b/internal/services/audit.go index ccaa93b4..5de708d6 100644 --- a/internal/services/audit.go +++ b/internal/services/audit.go @@ -2,6 +2,7 @@ package services import ( "context" + "errors" "fmt" "log" "strings" @@ -97,12 +98,13 @@ type AuditService struct { logChan chan *models.AuditLog // Batch buffer - batchBuffer []*models.AuditLog - batchMutex sync.Mutex - batchTicker *time.Ticker - maxRetryBuffer int // cap for pending-retry records (default: 500) - maxFlushRetries int // drop batch after this many consecutive failures (default: 5) - flushFailCount int // consecutive flush failure count for the current batch + batchBuffer []*models.AuditLog + batchMutex sync.Mutex + batchTicker *time.Ticker + maxRetryBuffer int // cap for pending-retry records (default: 500) + maxFlushRetries int // drop batch after this many consecutive failures (default: 5) + flushFailCount int // consecutive flush failure count for the current batch + lastFlushDropped bool // set true when flushBatchUnsafe drops records without writing // Graceful shutdown wg sync.WaitGroup @@ -188,6 +190,11 @@ func (s *AuditService) addToBatch(log *models.AuditLog) { drop := len(s.batchBuffer) - maxBuf s.eventsDropped.Add(float64(drop)) s.batchBuffer = s.batchBuffer[drop:] 
+ // The retained records are now different from the ones that failed. + // Reset the retry counter so those new records get the full retry + // window instead of inheriting prior failures. + s.flushFailCount = 0 + s.lastFlushDropped = true } } @@ -235,6 +242,7 @@ func (s *AuditService) flushBatchUnsafe() { s.eventsDropped.Add(float64(len(s.batchBuffer))) s.batchBuffer = s.batchBuffer[:0] s.flushFailCount = 0 + s.lastFlushDropped = true return } @@ -250,6 +258,7 @@ func (s *AuditService) flushBatchUnsafe() { // Success — clear only after confirmed write. s.flushFailCount = 0 + s.lastFlushDropped = false s.batchBuffer = s.batchBuffer[:0] } @@ -443,15 +452,23 @@ func (s *AuditService) Shutdown(ctx context.Context) error { s.flushBatch() s.batchMutex.Lock() pending := len(s.batchBuffer) + dropped := s.lastFlushDropped s.batchMutex.Unlock() - if pending > 0 { + switch { + case pending > 0: flushDone <- fmt.Errorf( "audit service shutdown: %d records could not be flushed to the database", pending, ) - return + case dropped: + // Buffer is empty but only because records were dropped after exhausting + // retries — not because they were successfully written. + flushDone <- errors.New( + "audit service shutdown: pending audit records were dropped after exhausting retry attempts", + ) + default: + flushDone <- nil } - flushDone <- nil }() select { From 6f5c7692751a29c31de8684cb0da54d6d8dc5774 Mon Sep 17 00:00:00 2001 From: appleboy Date: Fri, 26 Jun 2026 13:28:32 +0800 Subject: [PATCH 07/15] fix(audit): remove maxFlushRetries drop; rely solely on maxRetryBuffer cap The maxFlushRetries=5 path dropped the entire retry buffer after ~5 failed ticks (5 seconds). A normal DB restart lasting 6 seconds would therefore permanently lose all buffered audit records, defeating the retry guarantee. Memory safety is already provided by maxRetryBuffer (default 500 records): records are evicted oldest-first when the cap is exceeded. 
The maxFlushRetries hard-drop added no safety benefit and was harmful to availability. Remove the maxFlushRetries field and its drop path entirely. The flushFailCount field is retained as a retry-state flag (still needed by addToBatch to suppress size-triggered flushes during outages), but it no longer drives any drop decision. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- internal/services/audit.go | 48 +++++++++++++------------------------- 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/internal/services/audit.go b/internal/services/audit.go index 5de708d6..cba3851c 100644 --- a/internal/services/audit.go +++ b/internal/services/audit.go @@ -102,9 +102,8 @@ type AuditService struct { batchMutex sync.Mutex batchTicker *time.Ticker maxRetryBuffer int // cap for pending-retry records (default: 500) - maxFlushRetries int // drop batch after this many consecutive failures (default: 5) - flushFailCount int // consecutive flush failure count for the current batch - lastFlushDropped bool // set true when flushBatchUnsafe drops records without writing + flushFailCount int // > 0 while in retry state (consecutive flush failures) + lastFlushDropped bool // set true when records are dropped without being written // Graceful shutdown wg sync.WaitGroup @@ -122,13 +121,12 @@ func NewAuditService(s core.Store, bufferSize int) *AuditService { } service := &AuditService{ - store: s, - bufferSize: bufferSize, - logChan: make(chan *models.AuditLog, bufferSize), - batchBuffer: make([]*models.AuditLog, 0, 100), - maxRetryBuffer: 500, // 5x the normal batch size - maxFlushRetries: 5, - eventsDropped: getAuditEventsDroppedCounter(), + store: s, + bufferSize: bufferSize, + logChan: make(chan *models.AuditLog, bufferSize), + batchBuffer: make([]*models.AuditLog, 0, 100), + maxRetryBuffer: 500, // 5x the normal batch size + eventsDropped: getAuditEventsDroppedCounter(), } service.batchTicker = time.NewTicker(1 * time.Second) @@ -207,22 +205,17 @@ func (s *AuditService) 
flushBatch() { // flushBatchUnsafe flushes the batch buffer without locking (caller must hold lock). // On success the buffer is cleared. On error the records are retained for the next -// flush tick. After maxFlushRetries consecutive failures the batch is dropped to -// prevent a permanently-bad row from stalling all subsequent audit events. -// If the buffer exceeds maxRetryBuffer the oldest records are dropped to prevent -// unbounded memory growth. +// flush tick. If the buffer exceeds maxRetryBuffer the oldest records are dropped +// to prevent unbounded memory growth during a sustained DB outage. func (s *AuditService) flushBatchUnsafe() { if len(s.batchBuffer) == 0 { return } - // Guard: zero values are safe — treat as defaults. + // Guard: zero value is safe — treat as default. if s.maxRetryBuffer <= 0 { s.maxRetryBuffer = 500 } - if s.maxFlushRetries <= 0 { - s.maxFlushRetries = 5 - } toWrite := make([]*models.AuditLog, len(s.batchBuffer)) copy(toWrite, s.batchBuffer) @@ -230,23 +223,14 @@ func (s *AuditService) flushBatchUnsafe() { if err := s.store.CreateAuditLogBatch(toWrite); err != nil { s.flushFailCount++ log.Printf( - "WARNING: failed to write audit log batch (%d records, attempt %d/%d), will retry: %v", - len(toWrite), s.flushFailCount, s.maxFlushRetries, err, + "WARNING: failed to write audit log batch (%d records, attempt %d), will retry: %v", + len(toWrite), s.flushFailCount, err, ) - // After too many consecutive failures, drop the batch — a permanently - // invalid row would otherwise block all subsequent audit events. - if s.flushFailCount >= s.maxFlushRetries { - log.Printf("WARNING: audit batch failed %d times, dropping %d records", - s.flushFailCount, len(s.batchBuffer)) - s.eventsDropped.Add(float64(len(s.batchBuffer))) - s.batchBuffer = s.batchBuffer[:0] - s.flushFailCount = 0 - s.lastFlushDropped = true - return - } - // Buffer still holds the records — enforce the cap so memory stays bounded. 
+ // Memory safety is provided solely by maxRetryBuffer; there is no fixed + // retry limit so a transient outage of any duration cannot lose records + // that still fit within the cap. if len(s.batchBuffer) > s.maxRetryBuffer { drop := len(s.batchBuffer) - s.maxRetryBuffer log.Printf("WARNING: audit retry buffer full, dropping %d oldest records", drop) From faeded8d871c5c290371944c02f0e9cfe6bbc9ba Mon Sep 17 00:00:00 2001 From: appleboy Date: Fri, 26 Jun 2026 13:37:17 +0800 Subject: [PATCH 08/15] fix(audit): keep retry state after cap-eviction; add stale-tick record eviction P2: don't reset flushFailCount after cap-eviction in addToBatch. Previously, after dropping overflow records the counter was reset to 0, which re-entered the normal flush path. Since the buffer typically stays above 100 entries, the very next event would call flushBatchUnsafe again, causing a DB write attempt every other event under sustained traffic while the DB is already down. P2: add staleTicks-based single-record eviction in flushBatchUnsafe. After staleTicks (default 300 = ~5 min) consecutive failed ticks, one oldest record is evicted per tick. This handles the permanently-invalid row case (e.g. a ResourceName value exceeding varchar(255) on Postgres) that would otherwise block all subsequent audit events indefinitely in low-traffic deployments. 
Co-Authored-By: Claude Sonnet 4.6 (1M context) --- internal/services/audit.go | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/internal/services/audit.go b/internal/services/audit.go index cba3851c..810a476a 100644 --- a/internal/services/audit.go +++ b/internal/services/audit.go @@ -102,6 +102,7 @@ type AuditService struct { batchMutex sync.Mutex batchTicker *time.Ticker maxRetryBuffer int // cap for pending-retry records (default: 500) + staleTicks int // ticks before starting per-tick eviction of oldest record (default: 300) flushFailCount int // > 0 while in retry state (consecutive flush failures) lastFlushDropped bool // set true when records are dropped without being written @@ -126,6 +127,7 @@ func NewAuditService(s core.Store, bufferSize int) *AuditService { logChan: make(chan *models.AuditLog, bufferSize), batchBuffer: make([]*models.AuditLog, 0, 100), maxRetryBuffer: 500, // 5x the normal batch size + staleTicks: 300, // ~5 minutes of failed ticks before single-record eviction begins eventsDropped: getAuditEventsDroppedCounter(), } @@ -188,11 +190,12 @@ func (s *AuditService) addToBatch(log *models.AuditLog) { drop := len(s.batchBuffer) - maxBuf s.eventsDropped.Add(float64(drop)) s.batchBuffer = s.batchBuffer[drop:] - // The retained records are now different from the ones that failed. - // Reset the retry counter so those new records get the full retry - // window instead of inheriting prior failures. - s.flushFailCount = 0 s.lastFlushDropped = true + // Do NOT reset flushFailCount — the DB may still be down. + // Resetting would re-enter the normal flush path on the next event: + // since the buffer stays above 100 entries, addToBatch would call + // flushBatchUnsafe immediately, causing a write attempt roughly every + // other event under sustained traffic and hammering an already failing DB. 
} } @@ -228,15 +231,29 @@ func (s *AuditService) flushBatchUnsafe() { ) // Buffer still holds the records — enforce the cap so memory stays bounded. - // Memory safety is provided solely by maxRetryBuffer; there is no fixed - // retry limit so a transient outage of any duration cannot lose records - // that still fit within the cap. if len(s.batchBuffer) > s.maxRetryBuffer { drop := len(s.batchBuffer) - s.maxRetryBuffer log.Printf("WARNING: audit retry buffer full, dropping %d oldest records", drop) s.eventsDropped.Add(float64(drop)) s.batchBuffer = s.batchBuffer[drop:] } + + // After staleTicks consecutive failures, drop one oldest record per tick + // so a permanently-invalid row eventually falls out of the buffer while + // preserving the rest. This handles the case where a single bad row blocks + // all subsequent valid events indefinitely in low-traffic deployments. + stale := s.staleTicks + if stale <= 0 { + stale = 300 + } + if s.flushFailCount > stale && len(s.batchBuffer) > 0 { + log.Printf( + "WARNING: audit batch stale after %d ticks, evicting oldest record", + s.flushFailCount, + ) + s.eventsDropped.Add(1) + s.batchBuffer = s.batchBuffer[1:] + } return } From 7979388709d374b1b065f1bc1b278c1afdc4f78b Mon Sep 17 00:00:00 2001 From: appleboy Date: Fri, 26 Jun 2026 13:43:56 +0800 Subject: [PATCH 09/15] fix(audit): set lastFlushDropped on all drop paths in flushBatchUnsafe Both the cap-overflow eviction and the stale-tick single-record eviction in flushBatchUnsafe could drop records without setting lastFlushDropped. If a subsequent write succeeded, Shutdown would see an empty buffer and lastFlushDropped=false, report graceful success, and silently hide that records had been lost. Setting lastFlushDropped=true on both drop paths ensures Shutdown accurately reports when any records were discarded.
Co-Authored-By: Claude Sonnet 4.6 (1M context) --- internal/services/audit.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/services/audit.go b/internal/services/audit.go index 810a476a..f80ea82b 100644 --- a/internal/services/audit.go +++ b/internal/services/audit.go @@ -236,6 +236,7 @@ func (s *AuditService) flushBatchUnsafe() { log.Printf("WARNING: audit retry buffer full, dropping %d oldest records", drop) s.eventsDropped.Add(float64(drop)) s.batchBuffer = s.batchBuffer[drop:] + s.lastFlushDropped = true } // After staleTicks consecutive failures, drop one oldest record per tick @@ -253,6 +254,7 @@ func (s *AuditService) flushBatchUnsafe() { ) s.eventsDropped.Add(1) s.batchBuffer = s.batchBuffer[1:] + s.lastFlushDropped = true } return } From 6114217c23aa9458fdc3ac8389b1bb705eb15c17 Mon Sep 17 00:00:00 2001 From: appleboy Date: Fri, 26 Jun 2026 13:50:35 +0800 Subject: [PATCH 10/15] fix(audit): remove staleTicks age-based eviction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit staleTicks dropped one record per second after ~5 minutes of consecutive failures regardless of buffer pressure. During any prolonged but transient DB maintenance window in a low-traffic deployment this silently lost one audit record per tick — reintroducing the exact data loss the retry logic was meant to prevent. Memory is already bounded by maxRetryBuffer (cap-based eviction). A permanently-stuck row produces WARNING logs on every tick, giving operators visibility to investigate without the service silently discarding records under time pressure. 
Co-Authored-By: Claude Sonnet 4.6 (1M context) --- internal/services/audit.go | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/internal/services/audit.go b/internal/services/audit.go index f80ea82b..0c4a9490 100644 --- a/internal/services/audit.go +++ b/internal/services/audit.go @@ -102,7 +102,6 @@ type AuditService struct { batchMutex sync.Mutex batchTicker *time.Ticker maxRetryBuffer int // cap for pending-retry records (default: 500) - staleTicks int // ticks before starting per-tick eviction of oldest record (default: 300) flushFailCount int // > 0 while in retry state (consecutive flush failures) lastFlushDropped bool // set true when records are dropped without being written @@ -127,7 +126,6 @@ func NewAuditService(s core.Store, bufferSize int) *AuditService { logChan: make(chan *models.AuditLog, bufferSize), batchBuffer: make([]*models.AuditLog, 0, 100), maxRetryBuffer: 500, // 5x the normal batch size - staleTicks: 300, // ~5 minutes of failed ticks before single-record eviction begins eventsDropped: getAuditEventsDroppedCounter(), } @@ -239,23 +237,6 @@ func (s *AuditService) flushBatchUnsafe() { s.lastFlushDropped = true } - // After staleTicks consecutive failures, drop one oldest record per tick - // so a permanently-invalid row eventually falls out of the buffer while - // preserving the rest. This handles the case where a single bad row blocks - // all subsequent valid events indefinitely in low-traffic deployments. 
- stale := s.staleTicks - if stale <= 0 { - stale = 300 - } - if s.flushFailCount > stale && len(s.batchBuffer) > 0 { - log.Printf( - "WARNING: audit batch stale after %d ticks, evicting oldest record", - s.flushFailCount, - ) - s.eventsDropped.Add(1) - s.batchBuffer = s.batchBuffer[1:] - s.lastFlushDropped = true - } return } From c79513da9f3f70b9bcc1a776da88a6b39f40dcf3 Mon Sep 17 00:00:00 2001 From: appleboy Date: Fri, 26 Jun 2026 13:56:23 +0800 Subject: [PATCH 11/15] fix(audit): clamp ResourceName and Action to varchar(255) in buildAuditLog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ResourceName and Action are stored in varchar(255) columns but were not clamped before INSERT, so an over-long value (e.g. a client name used as ResourceName) would cause a permanent Postgres constraint error, leaving the entire batch stuck in the retry buffer indefinitely. Clamp both fields to 255 runes in buildAuditLog using the same clampToColumn helper already applied to UserAgent, RequestPath, ActorUsername, and ActorFullName — values within the column width are preserved verbatim; only over-long values get an ellipsis-terminated truncation. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- internal/services/audit.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/services/audit.go b/internal/services/audit.go index 0c4a9490..d55af384 100644 --- a/internal/services/audit.go +++ b/internal/services/audit.go @@ -321,6 +321,8 @@ func (s *AuditService) buildAuditLog( entry.RequestPath = clampToColumn(entry.RequestPath, 500) entry.ActorUsername = clampToColumn(entry.ActorUsername, 100) entry.ActorFullName = clampToColumn(entry.ActorFullName, 100) + entry.ResourceName = clampToColumn(entry.ResourceName, 255) + entry.Action = clampToColumn(entry.Action, 255) // RequestMethod is stored in a varchar(10) column. 
Preserve values up to // the full column width and hard-truncate anything longer without adding From 034bca77381c36b8d4b3f8c48013284b55e7bc43 Mon Sep 17 00:00:00 2001 From: appleboy Date: Fri, 26 Jun 2026 14:06:21 +0800 Subject: [PATCH 12/15] fix(audit): validate AuditDetails JSON serializability in buildAuditLog An AuditDetails value with a non-JSON-marshalable type (e.g. a channel, function, or cyclic map) would cause CreateAuditLogBatch to return a permanent serialization error on every retry, blocking all subsequent audit events in the buffer indefinitely. Validate Details.Value() before the record enters the batch. If it is not serializable, replace the details with a sentinel map containing the error message so the row is always writable. This prevents any permanently-invalid row from entering the retry buffer in the first place. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- internal/services/audit.go | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/internal/services/audit.go b/internal/services/audit.go index d55af384..41496fcb 100644 --- a/internal/services/audit.go +++ b/internal/services/audit.go @@ -312,6 +312,20 @@ func (s *AuditService) buildAuditLog( } entry.Details = maskSensitiveDetails(entry.Details) + // Validate that Details can be serialized to JSON before the record enters + // the batch buffer. An unserializable value would cause CreateAuditLogBatch + // to fail with a permanent error on every retry, blocking all subsequent + // audit events indefinitely. Replace bad details with an error indicator so + // the row can always be written. + if _, err := entry.Details.Value(); err != nil { + log.Printf( + "WARNING: audit details for event %q are not JSON-serializable, replacing with error indicator: %v", + entry.EventType, + err, + ) + entry.Details = models.AuditDetails{"_serialization_error": err.Error()} + } + // Clamp string fields to their varchar column widths. 
Values already within // the limit are preserved verbatim; only over-long values are truncated // (with an ellipsis). ActorUsername/ActorFullName mirror their unbounded From 7c78c58384564e67e6286de910cfae49e7f0ae5e Mon Sep 17 00:00:00 2001 From: appleboy Date: Fri, 26 Jun 2026 14:41:03 +0800 Subject: [PATCH 13/15] fix(audit): truncate ActorUserID to varchar(36) in buildAuditLog ActorUserID is stored in varchar(36) but machine identity strings such as 'client:' plus a 36-character ID (43 chars) exceed that limit on Postgres strict mode, causing a permanent INSERT error that blocks the entire retry buffer. Hard-truncate ActorUserID to 36 bytes (no ellipsis) at build time using the same pattern as RequestMethod, matching the column's fixed width. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- internal/services/audit.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/internal/services/audit.go b/internal/services/audit.go index 41496fcb..c55a37cc 100644 --- a/internal/services/audit.go +++ b/internal/services/audit.go @@ -338,6 +338,12 @@ func (s *AuditService) buildAuditLog( entry.ResourceName = clampToColumn(entry.ResourceName, 255) entry.Action = clampToColumn(entry.Action, 255) + // ActorUserID is varchar(36); machine identity strings such as + // "client:" (43 chars) exceed that limit on strict drivers. + if len(entry.ActorUserID) > 36 { + entry.ActorUserID = entry.ActorUserID[:36] + } + // RequestMethod is stored in a varchar(10) column. Preserve values up to // the full column width and hard-truncate anything longer without adding // an ellipsis.
From 6e976b834af130f6a5f09c01b48fff915fc25544 Mon Sep 17 00:00:00 2001 From: appleboy Date: Fri, 26 Jun 2026 14:47:24 +0800 Subject: [PATCH 14/15] revert(audit): remove ActorUserID truncation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Truncating 'client:' plus a 36-character ID (43 chars) to 36 bytes produces a malformed prefix that silently corrupts the audit trail — different clients share the same prefix and filtering by machine identity breaks. The varchar(36) column is too narrow for machine identity strings; that requires a schema migration, not silent data truncation at the application layer. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- internal/services/audit.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/internal/services/audit.go b/internal/services/audit.go index c55a37cc..41496fcb 100644 --- a/internal/services/audit.go +++ b/internal/services/audit.go @@ -338,12 +338,6 @@ func (s *AuditService) buildAuditLog( entry.ResourceName = clampToColumn(entry.ResourceName, 255) entry.Action = clampToColumn(entry.Action, 255) - // ActorUserID is varchar(36); machine identity strings such as - // "client:" (43 chars) exceed that limit on strict drivers. - if len(entry.ActorUserID) > 36 { - entry.ActorUserID = entry.ActorUserID[:36] - } - // RequestMethod is stored in a varchar(10) column. Preserve values up to // the full column width and hard-truncate anything longer without adding // an ellipsis. From dbffc77469198ab2c81263325f681aa00781bd25 Mon Sep 17 00:00:00 2001 From: appleboy Date: Fri, 26 Jun 2026 15:02:29 +0800 Subject: [PATCH 15/15] fix(audit): truncate ActorUserID to varchar(36) with logged warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Machine identity strings such as 'client:' plus a 36-character ID (43 chars) exceed the varchar(36) ActorUserID column on Postgres strict mode. Without truncation the batch fails permanently and blocks all later audit events in the retry buffer.
Log the full original value at WARNING level before truncating so operators retain full visibility in logs. The column itself is truncated to 36 bytes (no ellipsis) — a hard identifier limit, not human-readable text. A follow-up migration will widen the column to accommodate the full machine identity string. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- internal/services/audit.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/internal/services/audit.go b/internal/services/audit.go index 41496fcb..4792253d 100644 --- a/internal/services/audit.go +++ b/internal/services/audit.go @@ -338,6 +338,19 @@ func (s *AuditService) buildAuditLog( entry.ResourceName = clampToColumn(entry.ResourceName, 255) entry.Action = clampToColumn(entry.Action, 255) + // ActorUserID is varchar(36). Machine identity strings such as + // "client:" (43 chars) exceed that limit on Postgres strict mode. + // Log the full value so operators retain visibility, then hard-truncate + // to prevent a permanent INSERT error from poisoning the retry buffer. + // TODO: widen the column in a follow-up migration. + if len(entry.ActorUserID) > 36 { + log.Printf( + "WARNING: audit ActorUserID %q exceeds varchar(36), truncating for event %q", + entry.ActorUserID, entry.EventType, + ) + entry.ActorUserID = entry.ActorUserID[:36] + } + // RequestMethod is stored in a varchar(10) column. Preserve values up to // the full column width and hard-truncate anything longer without adding // an ellipsis.