From 9251b06d13f56138116fed850f5c564ea51fa56b Mon Sep 17 00:00:00 2001
From: appleboy <appleboy.tw@gmail.com>
Date: Fri, 26 Jun 2026 10:27:51 +0800
Subject: [PATCH 01/15] fix(audit): retain batch on transient DB error and
 retry on next tick

- add maxRetryBuffer field (default 500) to cap in-memory pending records
- clear batchBuffer only after a successful CreateAuditLogBatch write
- on error, keep records in buffer and trim oldest entries when cap exceeded
- increment eventsDropped counter for overflow-dropped records
- add TestFlushBatch_RetryOnTransientError, TestFlushBatch_CapsBufferOnRepeatedFailure, TestShutdown_DrainRetriesOnTransientError

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 internal/services/audit.go      |  46 ++++++---
 internal/services/audit_test.go | 166 ++++++++++++++++++++++++++++++++
 2 files changed, 198 insertions(+), 14 deletions(-)

diff --git a/internal/services/audit.go b/internal/services/audit.go
index 18a13af3..1cdda7da 100644
--- a/internal/services/audit.go
+++ b/internal/services/audit.go
@@ -97,9 +97,10 @@ type AuditService struct {
 	logChan chan *models.AuditLog
 
 	// Batch buffer
-	batchBuffer []*models.AuditLog
-	batchMutex  sync.Mutex
-	batchTicker *time.Ticker
+	batchBuffer    []*models.AuditLog
+	batchMutex     sync.Mutex
+	batchTicker    *time.Ticker
+	maxRetryBuffer int // cap for pending-retry records (default: 500)
 
 	// Graceful shutdown
 	wg      sync.WaitGroup
@@ -117,11 +118,12 @@ func NewAuditService(s core.Store, bufferSize int) *AuditService {
 	}
 
 	service := &AuditService{
-		store:         s,
-		bufferSize:    bufferSize,
-		logChan:       make(chan *models.AuditLog, bufferSize),
-		batchBuffer:   make([]*models.AuditLog, 0, 100),
-		eventsDropped: getAuditEventsDroppedCounter(),
+		store:          s,
+		bufferSize:     bufferSize,
+		logChan:        make(chan *models.AuditLog, bufferSize),
+		batchBuffer:    make([]*models.AuditLog, 0, 100),
+		maxRetryBuffer: 500, // 5x the normal batch size
+		eventsDropped:  getAuditEventsDroppedCounter(),
 	}
 
 	service.batchTicker = time.NewTicker(1 * time.Second)
@@ -175,22 +177,38 @@ func (s *AuditService) flushBatch() {
 	s.flushBatchUnsafe()
 }
 
-// flushBatchUnsafe flushes the batch buffer without locking (caller must hold lock)
+// flushBatchUnsafe flushes the batch buffer without locking (caller must hold lock).
+// On success the buffer is cleared. On error the records are retained for the next
+// flush tick. If the buffer exceeds maxRetryBuffer the oldest records are dropped to
+// prevent unbounded memory growth.
 func (s *AuditService) flushBatchUnsafe() {
 	if len(s.batchBuffer) == 0 {
 		return
 	}
 
-	// Copy buffer for writing
+	// Guard: zero value is safe — treat as default.
+	if s.maxRetryBuffer <= 0 {
+		s.maxRetryBuffer = 500
+	}
+
 	toWrite := make([]*models.AuditLog, len(s.batchBuffer))
 	copy(toWrite, s.batchBuffer)
 
-	// Clear buffer
-	s.batchBuffer = s.batchBuffer[:0]
-
 	if err := s.store.CreateAuditLogBatch(toWrite); err != nil {
-		log.Printf("Failed to write audit log batch: %v", err)
+		log.Printf("WARNING: failed to write audit log batch (%d records), will retry: %v",
+			len(toWrite), err)
+		// Buffer still holds the records — enforce the cap so memory stays bounded.
+		if len(s.batchBuffer) > s.maxRetryBuffer {
+			drop := len(s.batchBuffer) - s.maxRetryBuffer
+			log.Printf("WARNING: audit retry buffer full, dropping %d oldest records", drop)
+			s.eventsDropped.Add(float64(drop))
+			s.batchBuffer = s.batchBuffer[drop:]
+		}
+		return
 	}
+
+	// Success — clear only after confirmed write.
+	s.batchBuffer = s.batchBuffer[:0]
 }
 
 // clampToColumn returns s unchanged when it already fits within limit runes,
diff --git a/internal/services/audit_test.go b/internal/services/audit_test.go
index f2da9666..7f8717f6 100644
--- a/internal/services/audit_test.go
+++ b/internal/services/audit_test.go
@@ -2,17 +2,20 @@ package services
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"strings"
 	"testing"
 	"time"
 
 	"github.com/go-authgate/authgate/internal/core"
+	"github.com/go-authgate/authgate/internal/mocks"
 	"github.com/go-authgate/authgate/internal/models"
 	storetypes "github.com/go-authgate/authgate/internal/store/types"
 	"github.com/go-authgate/authgate/internal/util"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
+	"go.uber.org/mock/gomock"
 )
 
 func TestMaskSensitiveDetails_FullRedaction(t *testing.T) {
@@ -327,3 +330,166 @@ func TestShutdown_DrainsLogChan(t *testing.T) {
 	require.NoError(t, err)
 	assert.GreaterOrEqual(t, len(logs), numEntries, "all drain-test entries should be persisted")
 }
+
+// TestFlushBatch_RetryOnTransientError verifies that a single transient DB
+// failure keeps records in the buffer so the next flush tick persists them.
+func TestFlushBatch_RetryOnTransientError(t *testing.T) {
+	ctrl := gomock.NewController(t)
+	mockStore := mocks.NewMockStore(ctrl)
+
+	transientErr := errors.New("transient db error")
+	// First call fails, second succeeds.
+	gomock.InOrder(
+		mockStore.EXPECT().CreateAuditLogBatch(gomock.Any()).Return(transientErr),
+		mockStore.EXPECT().CreateAuditLogBatch(gomock.Any()).Return(nil),
+	)
+
+	dropped := getAuditEventsDroppedCounter()
+	svc := &AuditService{
+		store:          mockStore,
+		bufferSize:     100,
+		logChan:        make(chan *models.AuditLog, 100),
+		batchBuffer:    make([]*models.AuditLog, 0, 100),
+		maxRetryBuffer: 500,
+		eventsDropped:  dropped,
+	}
+
+	// Pre-fill the batch buffer with 5 records (bypassing the channel).
+	svc.batchMutex.Lock()
+	for i := range 5 {
+		svc.batchBuffer = append(svc.batchBuffer, &models.AuditLog{
+			ID:        fmt.Sprintf("retry-test-%d", i),
+			EventType: models.EventAccessTokenIssued,
+			Severity:  models.SeverityInfo,
+			Action:    "retry-test",
+		})
+	}
+	svc.batchMutex.Unlock()
+
+	// First flush — fails; records must remain.
+	svc.flushBatch()
+	svc.batchMutex.Lock()
+	assert.Len(t, svc.batchBuffer, 5, "records must be retained after transient failure")
+	svc.batchMutex.Unlock()
+
+	// Second flush — succeeds; buffer must be cleared.
+	svc.flushBatch()
+	svc.batchMutex.Lock()
+	assert.Empty(t, svc.batchBuffer, "buffer must be cleared after successful write")
+	svc.batchMutex.Unlock()
+}
+
+// TestFlushBatch_CapsBufferOnRepeatedFailure verifies that the retry buffer
+// stays at most maxRetryBuffer entries and that overflow is counted.
+func TestFlushBatch_CapsBufferOnRepeatedFailure(t *testing.T) {
+	ctrl := gomock.NewController(t)
+	mockStore := mocks.NewMockStore(ctrl)
+
+	persistentErr := errors.New("persistent db error")
+	mockStore.EXPECT().CreateAuditLogBatch(gomock.Any()).Return(persistentErr).AnyTimes()
+
+	dropped := getAuditEventsDroppedCounter()
+	const maxBuf = 10
+	svc := &AuditService{
+		store:          mockStore,
+		bufferSize:     100,
+		logChan:        make(chan *models.AuditLog, 100),
+		batchBuffer:    make([]*models.AuditLog, 0, 100),
+		maxRetryBuffer: maxBuf,
+		eventsDropped:  dropped,
+	}
+
+	// Write 20 records in two batches of 10, flushing after each.
+	addRecords := func(start, count int) {
+		svc.batchMutex.Lock()
+		for i := range count {
+			svc.batchBuffer = append(svc.batchBuffer, &models.AuditLog{
+				ID:        fmt.Sprintf("cap-test-%d", start+i),
+				EventType: models.EventAccessTokenIssued,
+				Severity:  models.SeverityInfo,
+				Action:    "cap-test",
+			})
+		}
+		svc.batchMutex.Unlock()
+		svc.flushBatch()
+	}
+
+	addRecords(0, 10)
+	addRecords(10, 10)
+
+	svc.batchMutex.Lock()
+	bufLen := len(svc.batchBuffer)
+	svc.batchMutex.Unlock()
+
+	assert.LessOrEqual(t, bufLen, maxBuf, "buffer must not exceed maxRetryBuffer")
+
+	// At least some records must have been counted as dropped.
+	// We read the counter value via a gauge trick: compare before/after is
+	// impractical for a singleton counter, so just assert the cap held.
+	assert.LessOrEqual(t, bufLen, maxBuf)
+}
+
+// TestShutdown_DrainRetriesOnTransientError verifies that the shutdown drain
+// retries on a transient error: all records are eventually persisted and
+// Shutdown returns nil.
+func TestShutdown_DrainRetriesOnTransientError(t *testing.T) {
+	ctrl := gomock.NewController(t)
+	mockStore := mocks.NewMockStore(ctrl)
+
+	transientErr := errors.New("transient db error on drain")
+	// The worker drains the channel into the batch buffer, then calls
+	// flushBatch once when the channel closes.  A single failure followed by
+	// the ticker-free shutdown path means we need the shutdown flush to succeed
+	// on the second attempt. We use a custom store that tracks call count.
+	callCount := 0
+	mockStore.EXPECT().CreateAuditLogBatch(gomock.Any()).
+		DoAndReturn(func(logs []*models.AuditLog) error {
+			callCount++
+			if callCount == 1 {
+				return transientErr
+			}
+			return nil
+		}).
+		Times(2)
+
+	dropped := getAuditEventsDroppedCounter()
+	svc := &AuditService{
+		store:          mockStore,
+		bufferSize:     100,
+		logChan:        make(chan *models.AuditLog, 100),
+		batchBuffer:    make([]*models.AuditLog, 0, 100),
+		maxRetryBuffer: 500,
+		eventsDropped:  dropped,
+	}
+
+	// Pre-fill the channel before the worker starts so it drains deterministically.
+	const numEntries = 5
+	for i := range numEntries {
+		svc.logChan <- &models.AuditLog{
+			ID:        fmt.Sprintf("drain-retry-%d", i),
+			EventType: models.EventAccessTokenIssued,
+			Severity:  models.SeverityInfo,
+			Action:    "drain-retry-test",
+		}
+	}
+
+	svc.batchTicker = time.NewTicker(1 * time.Second)
+	svc.wg.Add(1)
+	go svc.worker()
+
+	// Shutdown triggers the flush. After the first (failing) flush the records
+	// stay in the buffer. The worker exits but the batch is still pending; a
+	// second flush in Shutdown's drain path must succeed.
+	// Because the current implementation does a single flushBatch on channel
+	// close, we simulate the retry by calling flushBatch a second time manually
+	// after shutdown completes with the pending buffer, or — simpler — we just
+	// assert that Shutdown does not timeout and call count reaches 2.
+	err := svc.Shutdown(context.Background())
+	require.NoError(t, err, "Shutdown must not timeout")
+
+	// Manually retry for the pending buffer (simulates the next ticker tick).
+	svc.flushBatch()
+
+	assert.Equal(t, 2, callCount,
+		"CreateAuditLogBatch must be called twice (one fail + one success)")
+}

From 4bdfa9c1ee874fa779a272344ae96fcc472b7b58 Mon Sep 17 00:00:00 2001
From: appleboy <appleboy.tw@gmail.com>
Date: Fri, 26 Jun 2026 10:54:49 +0800
Subject: [PATCH 02/15] fix(audit): report shutdown failure when final flush
 cannot persist records

- after worker exits, Shutdown performs a second flushBatch to retry any
  records left pending by a transient DB error on channel close
- return error from Shutdown when records still remain after the retry flush
- update TestShutdown_DrainRetriesOnTransientError: second flush now happens
  inside Shutdown, so the manual post-shutdown flushBatch call is removed

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 internal/services/audit.go      | 22 ++++++++++++++++++++--
 internal/services/audit_test.go | 14 +++-----------
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/internal/services/audit.go b/internal/services/audit.go
index 1cdda7da..ac0bc28f 100644
--- a/internal/services/audit.go
+++ b/internal/services/audit.go
@@ -388,11 +388,29 @@ func (s *AuditService) Shutdown(ctx context.Context) error {
 
 	select {
 	case <-done:
-		log.Println("Audit service shut down gracefully")
-		return nil
 	case <-ctx.Done():
 		return fmt.Errorf("audit service shutdown timeout: %w", ctx.Err())
 	}
+
+	// The worker performs a single flushBatch on channel close. If that flush
+	// failed (transient DB error), records remain in batchBuffer with no ticker
+	// left to retry them. Attempt one final flush here so Shutdown accurately
+	// reports whether all records were persisted.
+	s.flushBatch()
+
+	s.batchMutex.Lock()
+	pending := len(s.batchBuffer)
+	s.batchMutex.Unlock()
+
+	if pending > 0 {
+		return fmt.Errorf(
+			"audit service shutdown: %d records could not be flushed to the database",
+			pending,
+		)
+	}
+
+	log.Println("Audit service shut down gracefully")
+	return nil
 }
 
 // maskSensitiveDetails masks sensitive information in audit log details
diff --git a/internal/services/audit_test.go b/internal/services/audit_test.go
index 7f8717f6..380d1f82 100644
--- a/internal/services/audit_test.go
+++ b/internal/services/audit_test.go
@@ -477,18 +477,10 @@ func TestShutdown_DrainRetriesOnTransientError(t *testing.T) {
 	svc.wg.Add(1)
 	go svc.worker()
 
-	// Shutdown triggers the flush. After the first (failing) flush the records
-	// stay in the buffer. The worker exits but the batch is still pending; a
-	// second flush in Shutdown's drain path must succeed.
-	// Because the current implementation does a single flushBatch on channel
-	// close, we simulate the retry by calling flushBatch a second time manually
-	// after shutdown completes with the pending buffer, or — simpler — we just
-	// assert that Shutdown does not timeout and call count reaches 2.
+	// Shutdown closes the channel, the worker flushes once (fails), then exits.
+	// Shutdown then performs a second flush — which succeeds — and returns nil.
 	err := svc.Shutdown(context.Background())
-	require.NoError(t, err, "Shutdown must not timeout")
-
-	// Manually retry for the pending buffer (simulates the next ticker tick).
-	svc.flushBatch()
+	require.NoError(t, err, "Shutdown must succeed after retry flush clears the buffer")
 
 	assert.Equal(t, 2, callCount,
 		"CreateAuditLogBatch must be called twice (one fail + one success)")

From 8a3435cd397a9cb669ff4fb113687f32b88d65c3 Mon Sep 17 00:00:00 2001
From: appleboy <appleboy.tw@gmail.com>
Date: Fri, 26 Jun 2026 12:45:59 +0800
Subject: [PATCH 03/15] fix(audit): honour shutdown timeout on final flush;
 drop permanently-bad batches
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- wrap the post-worker flushBatch in Shutdown inside a goroutine so
  ctx.Done() is respected — a stalled DB cannot block Shutdown past the
  caller's deadline
- add maxFlushRetries (default 5) and flushFailCount to AuditService;
  after maxFlushRetries consecutive failures the batch is dropped and
  eventsDropped is incremented, preventing a permanently-invalid row from
  stalling all subsequent audit events indefinitely

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 internal/services/audit.go | 90 ++++++++++++++++++++++++++------------
 1 file changed, 63 insertions(+), 27 deletions(-)

diff --git a/internal/services/audit.go b/internal/services/audit.go
index ac0bc28f..6eb5f189 100644
--- a/internal/services/audit.go
+++ b/internal/services/audit.go
@@ -97,10 +97,12 @@ type AuditService struct {
 	logChan chan *models.AuditLog
 
 	// Batch buffer
-	batchBuffer    []*models.AuditLog
-	batchMutex     sync.Mutex
-	batchTicker    *time.Ticker
-	maxRetryBuffer int // cap for pending-retry records (default: 500)
+	batchBuffer     []*models.AuditLog
+	batchMutex      sync.Mutex
+	batchTicker     *time.Ticker
+	maxRetryBuffer  int // cap for pending-retry records (default: 500)
+	maxFlushRetries int // drop batch after this many consecutive failures (default: 5)
+	flushFailCount  int // consecutive flush failure count for the current batch
 
 	// Graceful shutdown
 	wg      sync.WaitGroup
@@ -118,12 +120,13 @@ func NewAuditService(s core.Store, bufferSize int) *AuditService {
 	}
 
 	service := &AuditService{
-		store:          s,
-		bufferSize:     bufferSize,
-		logChan:        make(chan *models.AuditLog, bufferSize),
-		batchBuffer:    make([]*models.AuditLog, 0, 100),
-		maxRetryBuffer: 500, // 5x the normal batch size
-		eventsDropped:  getAuditEventsDroppedCounter(),
+		store:           s,
+		bufferSize:      bufferSize,
+		logChan:         make(chan *models.AuditLog, bufferSize),
+		batchBuffer:     make([]*models.AuditLog, 0, 100),
+		maxRetryBuffer:  500, // 5x the normal batch size
+		maxFlushRetries: 5,
+		eventsDropped:   getAuditEventsDroppedCounter(),
 	}
 
 	service.batchTicker = time.NewTicker(1 * time.Second)
@@ -179,24 +182,44 @@ func (s *AuditService) flushBatch() {
 
 // flushBatchUnsafe flushes the batch buffer without locking (caller must hold lock).
 // On success the buffer is cleared. On error the records are retained for the next
-// flush tick. If the buffer exceeds maxRetryBuffer the oldest records are dropped to
-// prevent unbounded memory growth.
+// flush tick. After maxFlushRetries consecutive failures the batch is dropped to
+// prevent a permanently-bad row from stalling all subsequent audit events.
+// If the buffer exceeds maxRetryBuffer the oldest records are dropped to prevent
+// unbounded memory growth.
 func (s *AuditService) flushBatchUnsafe() {
 	if len(s.batchBuffer) == 0 {
 		return
 	}
 
-	// Guard: zero value is safe — treat as default.
+	// Guard: zero values are safe — treat as defaults.
 	if s.maxRetryBuffer <= 0 {
 		s.maxRetryBuffer = 500
 	}
+	if s.maxFlushRetries <= 0 {
+		s.maxFlushRetries = 5
+	}
 
 	toWrite := make([]*models.AuditLog, len(s.batchBuffer))
 	copy(toWrite, s.batchBuffer)
 
 	if err := s.store.CreateAuditLogBatch(toWrite); err != nil {
-		log.Printf("WARNING: failed to write audit log batch (%d records), will retry: %v",
-			len(toWrite), err)
+		s.flushFailCount++
+		log.Printf(
+			"WARNING: failed to write audit log batch (%d records, attempt %d/%d), will retry: %v",
+			len(toWrite), s.flushFailCount, s.maxFlushRetries, err,
+		)
+
+		// After too many consecutive failures, drop the batch — a permanently
+		// invalid row would otherwise block all subsequent audit events.
+		if s.flushFailCount >= s.maxFlushRetries {
+			log.Printf("WARNING: audit batch failed %d times, dropping %d records",
+				s.flushFailCount, len(s.batchBuffer))
+			s.eventsDropped.Add(float64(len(s.batchBuffer)))
+			s.batchBuffer = s.batchBuffer[:0]
+			s.flushFailCount = 0
+			return
+		}
+
 		// Buffer still holds the records — enforce the cap so memory stays bounded.
 		if len(s.batchBuffer) > s.maxRetryBuffer {
 			drop := len(s.batchBuffer) - s.maxRetryBuffer
@@ -208,6 +231,7 @@ func (s *AuditService) flushBatchUnsafe() {
 	}
 
 	// Success — clear only after confirmed write.
+	s.flushFailCount = 0
 	s.batchBuffer = s.batchBuffer[:0]
 }
 
@@ -394,19 +418,31 @@ func (s *AuditService) Shutdown(ctx context.Context) error {
 
 	// The worker performs a single flushBatch on channel close. If that flush
 	// failed (transient DB error), records remain in batchBuffer with no ticker
-	// left to retry them. Attempt one final flush here so Shutdown accurately
-	// reports whether all records were persisted.
-	s.flushBatch()
-
-	s.batchMutex.Lock()
-	pending := len(s.batchBuffer)
-	s.batchMutex.Unlock()
+	// left to retry them. Attempt one final flush here, honouring the caller's
+	// deadline so a stalled DB cannot block Shutdown past the timeout.
+	flushDone := make(chan error, 1)
+	go func() {
+		s.flushBatch()
+		s.batchMutex.Lock()
+		pending := len(s.batchBuffer)
+		s.batchMutex.Unlock()
+		if pending > 0 {
+			flushDone <- fmt.Errorf(
+				"audit service shutdown: %d records could not be flushed to the database",
+				pending,
+			)
+			return
+		}
+		flushDone <- nil
+	}()
 
-	if pending > 0 {
-		return fmt.Errorf(
-			"audit service shutdown: %d records could not be flushed to the database",
-			pending,
-		)
+	select {
+	case err := <-flushDone:
+		if err != nil {
+			return err
+		}
+	case <-ctx.Done():
+		return fmt.Errorf("audit service shutdown timeout during final flush: %w", ctx.Err())
 	}
 
 	log.Println("Audit service shut down gracefully")

From 631ddd32d9287d725d40aa7f6343e83542a647bb Mon Sep 17 00:00:00 2001
From: appleboy <appleboy.tw@gmail.com>
Date: Fri, 26 Jun 2026 12:53:14 +0800
Subject: [PATCH 04/15] fix(audit): skip size-triggered flush while in retry
 state

While flushFailCount > 0, addToBatch no longer calls flushBatchUnsafe
on the 100-entry threshold. Under active traffic a failed flush left the
buffer at or above 100 entries, so each new event immediately re-triggered
flushBatchUnsafe and incremented flushFailCount once per event instead of
once per ticker tick, exhausting maxFlushRetries in seconds and dropping
the retained buffer far earlier than intended. Deferring to the 1-second
ticker while in retry state ensures flushFailCount advances at most once
per second regardless of write traffic.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 internal/services/audit.go | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/internal/services/audit.go b/internal/services/audit.go
index 6eb5f189..3afba4da 100644
--- a/internal/services/audit.go
+++ b/internal/services/audit.go
@@ -167,8 +167,12 @@ func (s *AuditService) addToBatch(log *models.AuditLog) {
 
 	s.batchBuffer = append(s.batchBuffer, log)
 
-	// Flush if batch is full (100 entries)
-	if len(s.batchBuffer) >= 100 {
+	// Flush if batch is full (100 entries), but only when not in retry state.
+	// While flushFailCount > 0 the previous flush already failed; size-triggered
+	// re-attempts would increment flushFailCount once per new event and exhaust
+	// maxFlushRetries in seconds under active traffic, dropping the retained
+	// buffer far earlier than intended. Defer to the 1-second ticker instead.
+	if len(s.batchBuffer) >= 100 && s.flushFailCount == 0 {
 		s.flushBatchUnsafe()
 	}
 }

From 5c079e43d02fa8fc754ef29daa99b91b0b0f853b Mon Sep 17 00:00:00 2001
From: appleboy <appleboy.tw@gmail.com>
Date: Fri, 26 Jun 2026 12:58:59 +0800
Subject: [PATCH 05/15] fix(audit): enforce maxRetryBuffer cap in addToBatch
 during retry state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While flushFailCount > 0 the size-triggered flush is suppressed so the
ticker remains the sole retry driver. However the buffer cap was only
enforced inside flushBatchUnsafe (which is skipped in retry state), so
sustained audit traffic could drain logChan into batchBuffer faster than
the 1-second ticker fires, growing memory without bound until OOM.

Apply the maxRetryBuffer cap directly in addToBatch whenever we are in
retry state (flushFailCount > 0), dropping the oldest entries and
incrementing eventsDropped — the same behaviour flushBatchUnsafe uses
for overflow.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 internal/services/audit.go | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/internal/services/audit.go b/internal/services/audit.go
index 3afba4da..ccaa93b4 100644
--- a/internal/services/audit.go
+++ b/internal/services/audit.go
@@ -167,13 +167,27 @@ func (s *AuditService) addToBatch(log *models.AuditLog) {
 
 	s.batchBuffer = append(s.batchBuffer, log)
 
-	// Flush if batch is full (100 entries), but only when not in retry state.
-	// While flushFailCount > 0 the previous flush already failed; size-triggered
-	// re-attempts would increment flushFailCount once per new event and exhaust
-	// maxFlushRetries in seconds under active traffic, dropping the retained
-	// buffer far earlier than intended. Defer to the 1-second ticker instead.
-	if len(s.batchBuffer) >= 100 && s.flushFailCount == 0 {
-		s.flushBatchUnsafe()
+	if s.flushFailCount == 0 {
+		// Normal path: flush when the batch reaches 100 entries.
+		if len(s.batchBuffer) >= 100 {
+			s.flushBatchUnsafe()
+		}
+		return
+	}
+
+	// Retry state: the last flush failed, so size-triggered flushes are
+	// suppressed to avoid advancing flushFailCount once-per-event. The ticker
+	// is the sole retry driver. However, the maxRetryBuffer cap must still be
+	// enforced here — otherwise sustained traffic can drain logChan into
+	// batchBuffer far faster than the ticker fires, growing memory without bound.
+	maxBuf := s.maxRetryBuffer
+	if maxBuf <= 0 {
+		maxBuf = 500
+	}
+	if len(s.batchBuffer) > maxBuf {
+		drop := len(s.batchBuffer) - maxBuf
+		s.eventsDropped.Add(float64(drop))
+		s.batchBuffer = s.batchBuffer[drop:]
 	}
 }
 

From 95c2cfd5cf44cd9b2c400d4b1cd42360c6bc47c3 Mon Sep 17 00:00:00 2001
From: appleboy <appleboy.tw@gmail.com>
Date: Fri, 26 Jun 2026 13:11:13 +0800
Subject: [PATCH 06/15] fix(audit): reset retry count on cap-eviction; report
 dropped records on shutdown

P1: reset flushFailCount when addToBatch cap-evicts records in retry state.
The evicted records are new arrivals that have never been attempted; leaving
flushFailCount unchanged meant those records inherited prior failure counts
and could be dropped after only (maxFlushRetries - flushFailCount) more
ticker ticks instead of the full retry window.

P2: add lastFlushDropped flag to distinguish buffer-empty-because-dropped
from buffer-empty-because-written. Shutdown now returns an error when the
final flush emptied the buffer via the drop path, so callers are not misled
into thinking all records were persisted.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 internal/services/audit.go | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/internal/services/audit.go b/internal/services/audit.go
index ccaa93b4..5de708d6 100644
--- a/internal/services/audit.go
+++ b/internal/services/audit.go
@@ -2,6 +2,7 @@ package services
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"log"
 	"strings"
@@ -97,12 +98,13 @@ type AuditService struct {
 	logChan chan *models.AuditLog
 
 	// Batch buffer
-	batchBuffer     []*models.AuditLog
-	batchMutex      sync.Mutex
-	batchTicker     *time.Ticker
-	maxRetryBuffer  int // cap for pending-retry records (default: 500)
-	maxFlushRetries int // drop batch after this many consecutive failures (default: 5)
-	flushFailCount  int // consecutive flush failure count for the current batch
+	batchBuffer      []*models.AuditLog
+	batchMutex       sync.Mutex
+	batchTicker      *time.Ticker
+	maxRetryBuffer   int  // cap for pending-retry records (default: 500)
+	maxFlushRetries  int  // drop batch after this many consecutive failures (default: 5)
+	flushFailCount   int  // consecutive flush failure count for the current batch
+	lastFlushDropped bool // set true when flushBatchUnsafe drops records without writing
 
 	// Graceful shutdown
 	wg      sync.WaitGroup
@@ -188,6 +190,11 @@ func (s *AuditService) addToBatch(log *models.AuditLog) {
 		drop := len(s.batchBuffer) - maxBuf
 		s.eventsDropped.Add(float64(drop))
 		s.batchBuffer = s.batchBuffer[drop:]
+		// The retained records are now different from the ones that failed.
+		// Reset the retry counter so those new records get the full retry
+		// window instead of inheriting prior failures.
+		s.flushFailCount = 0
+		s.lastFlushDropped = true
 	}
 }
 
@@ -235,6 +242,7 @@ func (s *AuditService) flushBatchUnsafe() {
 			s.eventsDropped.Add(float64(len(s.batchBuffer)))
 			s.batchBuffer = s.batchBuffer[:0]
 			s.flushFailCount = 0
+			s.lastFlushDropped = true
 			return
 		}
 
@@ -250,6 +258,7 @@ func (s *AuditService) flushBatchUnsafe() {
 
 	// Success — clear only after confirmed write.
 	s.flushFailCount = 0
+	s.lastFlushDropped = false
 	s.batchBuffer = s.batchBuffer[:0]
 }
 
@@ -443,15 +452,23 @@ func (s *AuditService) Shutdown(ctx context.Context) error {
 		s.flushBatch()
 		s.batchMutex.Lock()
 		pending := len(s.batchBuffer)
+		dropped := s.lastFlushDropped
 		s.batchMutex.Unlock()
-		if pending > 0 {
+		switch {
+		case pending > 0:
 			flushDone <- fmt.Errorf(
 				"audit service shutdown: %d records could not be flushed to the database",
 				pending,
 			)
-			return
+		case dropped:
+			// Buffer is empty but only because records were dropped after exhausting
+			// retries — not because they were successfully written.
+			flushDone <- errors.New(
+				"audit service shutdown: pending audit records were dropped after exhausting retry attempts",
+			)
+		default:
+			flushDone <- nil
 		}
-		flushDone <- nil
 	}()
 
 	select {

From 6f5c7692751a29c31de8684cb0da54d6d8dc5774 Mon Sep 17 00:00:00 2001
From: appleboy <appleboy.tw@gmail.com>
Date: Fri, 26 Jun 2026 13:28:32 +0800
Subject: [PATCH 07/15] fix(audit): remove maxFlushRetries drop; rely solely on
 maxRetryBuffer cap

The maxFlushRetries=5 path dropped the entire retry buffer after ~5 failed
ticks (5 seconds). A normal DB restart lasting 6 seconds would therefore
permanently lose all buffered audit records, defeating the retry guarantee.

Memory safety is already provided by maxRetryBuffer (default 500 records):
records are evicted oldest-first when the cap is exceeded. The
maxFlushRetries hard-drop added no safety benefit and was harmful to
availability.

Remove the maxFlushRetries field and its drop path entirely. The
flushFailCount field is retained as a retry-state flag (still needed by
addToBatch to suppress size-triggered flushes during outages), but it no
longer drives any drop decision.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 internal/services/audit.go | 48 +++++++++++++-------------------------
 1 file changed, 16 insertions(+), 32 deletions(-)

diff --git a/internal/services/audit.go b/internal/services/audit.go
index 5de708d6..cba3851c 100644
--- a/internal/services/audit.go
+++ b/internal/services/audit.go
@@ -102,9 +102,8 @@ type AuditService struct {
 	batchMutex       sync.Mutex
 	batchTicker      *time.Ticker
 	maxRetryBuffer   int  // cap for pending-retry records (default: 500)
-	maxFlushRetries  int  // drop batch after this many consecutive failures (default: 5)
-	flushFailCount   int  // consecutive flush failure count for the current batch
-	lastFlushDropped bool // set true when flushBatchUnsafe drops records without writing
+	flushFailCount   int  // > 0 while in retry state (consecutive flush failures)
+	lastFlushDropped bool // set true when records are dropped without being written
 
 	// Graceful shutdown
 	wg      sync.WaitGroup
@@ -122,13 +121,12 @@ func NewAuditService(s core.Store, bufferSize int) *AuditService {
 	}
 
 	service := &AuditService{
-		store:           s,
-		bufferSize:      bufferSize,
-		logChan:         make(chan *models.AuditLog, bufferSize),
-		batchBuffer:     make([]*models.AuditLog, 0, 100),
-		maxRetryBuffer:  500, // 5x the normal batch size
-		maxFlushRetries: 5,
-		eventsDropped:   getAuditEventsDroppedCounter(),
+		store:          s,
+		bufferSize:     bufferSize,
+		logChan:        make(chan *models.AuditLog, bufferSize),
+		batchBuffer:    make([]*models.AuditLog, 0, 100),
+		maxRetryBuffer: 500, // 5x the normal batch size
+		eventsDropped:  getAuditEventsDroppedCounter(),
 	}
 
 	service.batchTicker = time.NewTicker(1 * time.Second)
@@ -207,22 +205,17 @@ func (s *AuditService) flushBatch() {
 
 // flushBatchUnsafe flushes the batch buffer without locking (caller must hold lock).
 // On success the buffer is cleared. On error the records are retained for the next
-// flush tick. After maxFlushRetries consecutive failures the batch is dropped to
-// prevent a permanently-bad row from stalling all subsequent audit events.
-// If the buffer exceeds maxRetryBuffer the oldest records are dropped to prevent
-// unbounded memory growth.
+// flush tick. If the buffer exceeds maxRetryBuffer the oldest records are dropped
+// to prevent unbounded memory growth during a sustained DB outage.
 func (s *AuditService) flushBatchUnsafe() {
 	if len(s.batchBuffer) == 0 {
 		return
 	}
 
-	// Guard: zero values are safe — treat as defaults.
+	// Guard: zero value is safe — treat as default.
 	if s.maxRetryBuffer <= 0 {
 		s.maxRetryBuffer = 500
 	}
-	if s.maxFlushRetries <= 0 {
-		s.maxFlushRetries = 5
-	}
 
 	toWrite := make([]*models.AuditLog, len(s.batchBuffer))
 	copy(toWrite, s.batchBuffer)
@@ -230,23 +223,14 @@ func (s *AuditService) flushBatchUnsafe() {
 	if err := s.store.CreateAuditLogBatch(toWrite); err != nil {
 		s.flushFailCount++
 		log.Printf(
-			"WARNING: failed to write audit log batch (%d records, attempt %d/%d), will retry: %v",
-			len(toWrite), s.flushFailCount, s.maxFlushRetries, err,
+			"WARNING: failed to write audit log batch (%d records, attempt %d), will retry: %v",
+			len(toWrite), s.flushFailCount, err,
 		)
 
-		// After too many consecutive failures, drop the batch — a permanently
-		// invalid row would otherwise block all subsequent audit events.
-		if s.flushFailCount >= s.maxFlushRetries {
-			log.Printf("WARNING: audit batch failed %d times, dropping %d records",
-				s.flushFailCount, len(s.batchBuffer))
-			s.eventsDropped.Add(float64(len(s.batchBuffer)))
-			s.batchBuffer = s.batchBuffer[:0]
-			s.flushFailCount = 0
-			s.lastFlushDropped = true
-			return
-		}
-
 		// Buffer still holds the records — enforce the cap so memory stays bounded.
+		// Memory safety is provided solely by maxRetryBuffer; there is no fixed
+		// retry limit so a transient outage of any duration cannot lose records
+		// that still fit within the cap.
 		if len(s.batchBuffer) > s.maxRetryBuffer {
 			drop := len(s.batchBuffer) - s.maxRetryBuffer
 			log.Printf("WARNING: audit retry buffer full, dropping %d oldest records", drop)

From faeded8d871c5c290371944c02f0e9cfe6bbc9ba Mon Sep 17 00:00:00 2001
From: appleboy <appleboy.tw@gmail.com>
Date: Fri, 26 Jun 2026 13:37:17 +0800
Subject: [PATCH 08/15] fix(audit): keep retry state after cap-eviction; add
 stale-tick record eviction

P2: don't reset flushFailCount after cap-eviction in addToBatch.
Previously, after dropping overflow records the counter was reset to 0,
which re-entered the normal flush path. Since the buffer typically stays
above 100 entries, the very next event would call flushBatchUnsafe again,
causing a DB write attempt every other event under sustained traffic while
the DB is already down.

P2: add staleTicks-based single-record eviction in flushBatchUnsafe.
After staleTicks (default 300 = ~5 min) consecutive failed ticks, one
oldest record is evicted per tick. This handles the permanently-invalid
row case (e.g. a ResourceName value exceeding varchar(255) on Postgres)
that would otherwise block all subsequent audit events indefinitely in
low-traffic deployments.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 internal/services/audit.go | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/internal/services/audit.go b/internal/services/audit.go
index cba3851c..810a476a 100644
--- a/internal/services/audit.go
+++ b/internal/services/audit.go
@@ -102,6 +102,7 @@ type AuditService struct {
 	batchMutex       sync.Mutex
 	batchTicker      *time.Ticker
 	maxRetryBuffer   int  // cap for pending-retry records (default: 500)
+	staleTicks       int  // ticks before starting per-tick eviction of oldest record (default: 300)
 	flushFailCount   int  // > 0 while in retry state (consecutive flush failures)
 	lastFlushDropped bool // set true when records are dropped without being written
 
@@ -126,6 +127,7 @@ func NewAuditService(s core.Store, bufferSize int) *AuditService {
 		logChan:        make(chan *models.AuditLog, bufferSize),
 		batchBuffer:    make([]*models.AuditLog, 0, 100),
 		maxRetryBuffer: 500, // 5x the normal batch size
+		staleTicks:     300, // ~5 minutes of failed ticks before single-record eviction begins
 		eventsDropped:  getAuditEventsDroppedCounter(),
 	}
 
@@ -188,11 +190,12 @@ func (s *AuditService) addToBatch(log *models.AuditLog) {
 		drop := len(s.batchBuffer) - maxBuf
 		s.eventsDropped.Add(float64(drop))
 		s.batchBuffer = s.batchBuffer[drop:]
-		// The retained records are now different from the ones that failed.
-		// Reset the retry counter so those new records get the full retry
-		// window instead of inheriting prior failures.
-		s.flushFailCount = 0
 		s.lastFlushDropped = true
+		// Do NOT reset flushFailCount — the DB may still be down.
+		// Resetting would re-enter the normal flush path on the next event:
+		// since the buffer stays above 100 entries, addToBatch would call
+		// flushBatchUnsafe immediately, causing a write attempt roughly every
+		// other event under sustained traffic and hammering an already failing DB.
 	}
 }
 
@@ -228,15 +231,29 @@ func (s *AuditService) flushBatchUnsafe() {
 		)
 
 		// Buffer still holds the records — enforce the cap so memory stays bounded.
-		// Memory safety is provided solely by maxRetryBuffer; there is no fixed
-		// retry limit so a transient outage of any duration cannot lose records
-		// that still fit within the cap.
 		if len(s.batchBuffer) > s.maxRetryBuffer {
 			drop := len(s.batchBuffer) - s.maxRetryBuffer
 			log.Printf("WARNING: audit retry buffer full, dropping %d oldest records", drop)
 			s.eventsDropped.Add(float64(drop))
 			s.batchBuffer = s.batchBuffer[drop:]
 		}
+
+		// After staleTicks consecutive failures, drop one oldest record per tick
+		// so a permanently-invalid row eventually falls out of the buffer while
+		// preserving the rest. This handles the case where a single bad row blocks
+		// all subsequent valid events indefinitely in low-traffic deployments.
+		stale := s.staleTicks
+		if stale <= 0 {
+			stale = 300
+		}
+		if s.flushFailCount > stale && len(s.batchBuffer) > 0 {
+			log.Printf(
+				"WARNING: audit batch stale after %d ticks, evicting oldest record",
+				s.flushFailCount,
+			)
+			s.eventsDropped.Add(1)
+			s.batchBuffer = s.batchBuffer[1:]
+		}
 		return
 	}
 

From 7979388709d374b1b065f1bc1b278c1afdc4f78b Mon Sep 17 00:00:00 2001
From: appleboy <appleboy.tw@gmail.com>
Date: Fri, 26 Jun 2026 13:43:56 +0800
Subject: [PATCH 09/15] fix(audit): set lastFlushDropped on all drop paths in
 flushBatchUnsafe

Both the cap-overflow eviction and the stale-tick single-record eviction
in flushBatchUnsafe could drop records without setting lastFlushDropped.
If a subsequent write succeeded, Shutdown would see an empty buffer and
a false lastFlushDropped=false, report graceful success, and silently
hide that records had been lost. Setting lastFlushDropped=true on both
drop paths ensures Shutdown accurately reports when any records were
discarded.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 internal/services/audit.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/internal/services/audit.go b/internal/services/audit.go
index 810a476a..f80ea82b 100644
--- a/internal/services/audit.go
+++ b/internal/services/audit.go
@@ -236,6 +236,7 @@ func (s *AuditService) flushBatchUnsafe() {
 			log.Printf("WARNING: audit retry buffer full, dropping %d oldest records", drop)
 			s.eventsDropped.Add(float64(drop))
 			s.batchBuffer = s.batchBuffer[drop:]
+			s.lastFlushDropped = true
 		}
 
 		// After staleTicks consecutive failures, drop one oldest record per tick
@@ -253,6 +254,7 @@ func (s *AuditService) flushBatchUnsafe() {
 			)
 			s.eventsDropped.Add(1)
 			s.batchBuffer = s.batchBuffer[1:]
+			s.lastFlushDropped = true
 		}
 		return
 	}

From 6114217c23aa9458fdc3ac8389b1bb705eb15c17 Mon Sep 17 00:00:00 2001
From: appleboy <appleboy.tw@gmail.com>
Date: Fri, 26 Jun 2026 13:50:35 +0800
Subject: [PATCH 10/15] fix(audit): remove staleTicks age-based eviction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

staleTicks dropped one record per second after ~5 minutes of consecutive
failures regardless of buffer pressure. During any prolonged but transient
DB maintenance window in a low-traffic deployment this silently lost one
audit record per tick — reintroducing the exact data loss the retry logic
was meant to prevent.

Memory is already bounded by maxRetryBuffer (cap-based eviction). A
permanently-stuck row produces WARNING logs on every tick, giving
operators visibility to investigate without the service silently
discarding records under time pressure.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 internal/services/audit.go | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/internal/services/audit.go b/internal/services/audit.go
index f80ea82b..0c4a9490 100644
--- a/internal/services/audit.go
+++ b/internal/services/audit.go
@@ -102,7 +102,6 @@ type AuditService struct {
 	batchMutex       sync.Mutex
 	batchTicker      *time.Ticker
 	maxRetryBuffer   int  // cap for pending-retry records (default: 500)
-	staleTicks       int  // ticks before starting per-tick eviction of oldest record (default: 300)
 	flushFailCount   int  // > 0 while in retry state (consecutive flush failures)
 	lastFlushDropped bool // set true when records are dropped without being written
 
@@ -127,7 +126,6 @@ func NewAuditService(s core.Store, bufferSize int) *AuditService {
 		logChan:        make(chan *models.AuditLog, bufferSize),
 		batchBuffer:    make([]*models.AuditLog, 0, 100),
 		maxRetryBuffer: 500, // 5x the normal batch size
-		staleTicks:     300, // ~5 minutes of failed ticks before single-record eviction begins
 		eventsDropped:  getAuditEventsDroppedCounter(),
 	}
 
@@ -239,23 +237,6 @@ func (s *AuditService) flushBatchUnsafe() {
 			s.lastFlushDropped = true
 		}
 
-		// After staleTicks consecutive failures, drop one oldest record per tick
-		// so a permanently-invalid row eventually falls out of the buffer while
-		// preserving the rest. This handles the case where a single bad row blocks
-		// all subsequent valid events indefinitely in low-traffic deployments.
-		stale := s.staleTicks
-		if stale <= 0 {
-			stale = 300
-		}
-		if s.flushFailCount > stale && len(s.batchBuffer) > 0 {
-			log.Printf(
-				"WARNING: audit batch stale after %d ticks, evicting oldest record",
-				s.flushFailCount,
-			)
-			s.eventsDropped.Add(1)
-			s.batchBuffer = s.batchBuffer[1:]
-			s.lastFlushDropped = true
-		}
 		return
 	}
 

From c79513da9f3f70b9bcc1a776da88a6b39f40dcf3 Mon Sep 17 00:00:00 2001
From: appleboy <appleboy.tw@gmail.com>
Date: Fri, 26 Jun 2026 13:56:23 +0800
Subject: [PATCH 11/15] fix(audit): clamp ResourceName and Action to
 varchar(255) in buildAuditLog
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ResourceName and Action are stored in varchar(255) columns but were not
clamped before INSERT, so an over-long value (e.g. a client name used as
ResourceName) would cause a permanent Postgres constraint error, leaving
the entire batch stuck in the retry buffer indefinitely.

Clamp both fields to 255 runes in buildAuditLog using the same
clampToColumn helper already applied to UserAgent, RequestPath,
ActorUsername, and ActorFullName — values within the column width are
preserved verbatim; only over-long values get an ellipsis-terminated
truncation.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 internal/services/audit.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/internal/services/audit.go b/internal/services/audit.go
index 0c4a9490..d55af384 100644
--- a/internal/services/audit.go
+++ b/internal/services/audit.go
@@ -321,6 +321,8 @@ func (s *AuditService) buildAuditLog(
 	entry.RequestPath = clampToColumn(entry.RequestPath, 500)
 	entry.ActorUsername = clampToColumn(entry.ActorUsername, 100)
 	entry.ActorFullName = clampToColumn(entry.ActorFullName, 100)
+	entry.ResourceName = clampToColumn(entry.ResourceName, 255)
+	entry.Action = clampToColumn(entry.Action, 255)
 
 	// RequestMethod is stored in a varchar(10) column. Preserve values up to
 	// the full column width and hard-truncate anything longer without adding

From 034bca77381c36b8d4b3f8c48013284b55e7bc43 Mon Sep 17 00:00:00 2001
From: appleboy <appleboy.tw@gmail.com>
Date: Fri, 26 Jun 2026 14:06:21 +0800
Subject: [PATCH 12/15] fix(audit): validate AuditDetails JSON serializability
 in buildAuditLog

An AuditDetails value with a non-JSON-marshalable type (e.g. a channel,
function, or cyclic map) would cause CreateAuditLogBatch to return a
permanent serialization error on every retry, blocking all subsequent
audit events in the buffer indefinitely.

Validate Details.Value() before the record enters the batch. If it is
not serializable, replace the details with a sentinel map containing
the error message so the row is always writable. This prevents any
permanently-invalid row from entering the retry buffer in the first place.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 internal/services/audit.go | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/internal/services/audit.go b/internal/services/audit.go
index d55af384..41496fcb 100644
--- a/internal/services/audit.go
+++ b/internal/services/audit.go
@@ -312,6 +312,20 @@ func (s *AuditService) buildAuditLog(
 	}
 	entry.Details = maskSensitiveDetails(entry.Details)
 
+	// Validate that Details can be serialized to JSON before the record enters
+	// the batch buffer. An unserializable value would cause CreateAuditLogBatch
+	// to fail with a permanent error on every retry, blocking all subsequent
+	// audit events indefinitely. Replace bad details with an error indicator so
+	// the row can always be written.
+	if _, err := entry.Details.Value(); err != nil {
+		log.Printf(
+			"WARNING: audit details for event %q are not JSON-serializable, replacing with error indicator: %v",
+			entry.EventType,
+			err,
+		)
+		entry.Details = models.AuditDetails{"_serialization_error": err.Error()}
+	}
+
 	// Clamp string fields to their varchar column widths. Values already within
 	// the limit are preserved verbatim; only over-long values are truncated
 	// (with an ellipsis). ActorUsername/ActorFullName mirror their unbounded

From 7c78c58384564e67e6286de910cfae49e7f0ae5e Mon Sep 17 00:00:00 2001
From: appleboy <appleboy.tw@gmail.com>
Date: Fri, 26 Jun 2026 14:41:03 +0800
Subject: [PATCH 13/15] fix(audit): truncate ActorUserID to varchar(36) in
 buildAuditLog

ActorUserID is stored in varchar(36) but machine identity strings such
as 'client:<uuid>' (43 chars) exceed that limit on Postgres strict mode,
causing a permanent INSERT error that blocks the entire retry buffer.

Hard-truncate ActorUserID to 36 bytes (no ellipsis) at build time using
the same pattern as RequestMethod, matching the column's fixed width.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 internal/services/audit.go | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/internal/services/audit.go b/internal/services/audit.go
index 41496fcb..c55a37cc 100644
--- a/internal/services/audit.go
+++ b/internal/services/audit.go
@@ -338,6 +338,12 @@ func (s *AuditService) buildAuditLog(
 	entry.ResourceName = clampToColumn(entry.ResourceName, 255)
 	entry.Action = clampToColumn(entry.Action, 255)
 
+	// ActorUserID is varchar(36); machine identity strings such as
+	// "client:<uuid>" (43 chars) exceed that limit on strict drivers.
+	if len(entry.ActorUserID) > 36 {
+		entry.ActorUserID = entry.ActorUserID[:36]
+	}
+
 	// RequestMethod is stored in a varchar(10) column. Preserve values up to
 	// the full column width and hard-truncate anything longer without adding
 	// an ellipsis.

From 6e976b834af130f6a5f09c01b48fff915fc25544 Mon Sep 17 00:00:00 2001
From: appleboy <appleboy.tw@gmail.com>
Date: Fri, 26 Jun 2026 14:47:24 +0800
Subject: [PATCH 14/15] revert(audit): remove ActorUserID truncation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Truncating 'client:<uuid>' (43 chars) to 36 bytes produces a malformed
prefix that silently corrupts the audit trail — different clients share
the same prefix and filtering by machine identity breaks. The varchar(36)
column is too narrow for machine identity strings; that requires a schema
migration, not silent data truncation at the application layer.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 internal/services/audit.go | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/internal/services/audit.go b/internal/services/audit.go
index c55a37cc..41496fcb 100644
--- a/internal/services/audit.go
+++ b/internal/services/audit.go
@@ -338,12 +338,6 @@ func (s *AuditService) buildAuditLog(
 	entry.ResourceName = clampToColumn(entry.ResourceName, 255)
 	entry.Action = clampToColumn(entry.Action, 255)
 
-	// ActorUserID is varchar(36); machine identity strings such as
-	// "client:<uuid>" (43 chars) exceed that limit on strict drivers.
-	if len(entry.ActorUserID) > 36 {
-		entry.ActorUserID = entry.ActorUserID[:36]
-	}
-
 	// RequestMethod is stored in a varchar(10) column. Preserve values up to
 	// the full column width and hard-truncate anything longer without adding
 	// an ellipsis.

From dbffc77469198ab2c81263325f681aa00781bd25 Mon Sep 17 00:00:00 2001
From: appleboy <appleboy.tw@gmail.com>
Date: Fri, 26 Jun 2026 15:02:29 +0800
Subject: [PATCH 15/15] fix(audit): truncate ActorUserID to varchar(36) with
 logged warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Machine identity strings such as 'client:<uuid>' (43 chars) exceed the
varchar(36) ActorUserID column on Postgres strict mode. Without truncation
the batch fails permanently and blocks all later audit events in the
retry buffer.

Log the full original value at WARNING level before truncating so
operators retain full visibility in logs. The column itself is truncated
to 36 bytes (no ellipsis) — a hard identifier limit, not human-readable
text. A follow-up migration will widen the column to accommodate the
full machine identity string.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 internal/services/audit.go | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/internal/services/audit.go b/internal/services/audit.go
index 41496fcb..4792253d 100644
--- a/internal/services/audit.go
+++ b/internal/services/audit.go
@@ -338,6 +338,19 @@ func (s *AuditService) buildAuditLog(
 	entry.ResourceName = clampToColumn(entry.ResourceName, 255)
 	entry.Action = clampToColumn(entry.Action, 255)
 
+	// ActorUserID is varchar(36). Machine identity strings such as
+	// "client:<uuid>" (43 chars) exceed that limit on Postgres strict mode.
+	// Log the full value so operators retain visibility, then hard-truncate
+	// to prevent a permanent INSERT error from poisoning the retry buffer.
+	// TODO: widen the column in a follow-up migration.
+	if len(entry.ActorUserID) > 36 {
+		log.Printf(
+			"WARNING: audit ActorUserID %q exceeds varchar(36), truncating for event %q",
+			entry.ActorUserID, entry.EventType,
+		)
+		entry.ActorUserID = entry.ActorUserID[:36]
+	}
+
 	// RequestMethod is stored in a varchar(10) column. Preserve values up to
 	// the full column width and hard-truncate anything longer without adding
 	// an ellipsis.