From 6163b6ad4582b5341e37907e9e7ecee32340c061 Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Fri, 27 Feb 2026 16:22:17 +0200 Subject: [PATCH 01/28] 1st commit --- pkg/beholder/chip_ingress_batch_emitter.go | 148 ++++++++ .../chip_ingress_batch_emitter_test.go | 351 ++++++++++++++++++ pkg/beholder/chip_ingress_emitter_worker.go | 118 ++++++ pkg/beholder/client.go | 18 +- pkg/beholder/config.go | 15 +- pkg/beholder/config_test.go | 2 +- pkg/beholder/dual_source_emitter.go | 19 +- 7 files changed, 649 insertions(+), 22 deletions(-) create mode 100644 pkg/beholder/chip_ingress_batch_emitter.go create mode 100644 pkg/beholder/chip_ingress_batch_emitter_test.go create mode 100644 pkg/beholder/chip_ingress_emitter_worker.go diff --git a/pkg/beholder/chip_ingress_batch_emitter.go b/pkg/beholder/chip_ingress_batch_emitter.go new file mode 100644 index 0000000000..37949069cf --- /dev/null +++ b/pkg/beholder/chip_ingress_batch_emitter.go @@ -0,0 +1,148 @@ +package beholder + +import ( + "context" + "fmt" + "sync" + "time" + + "github.com/smartcontractkit/chainlink-common/pkg/chipingress" + "github.com/smartcontractkit/chainlink-common/pkg/logger" + "github.com/smartcontractkit/chainlink-common/pkg/services" + "github.com/smartcontractkit/chainlink-common/pkg/timeutil" +) + +// ChipIngressBatchEmitter buffers events per (domain, entity) and flushes them +// via chipingress.Client.PublishBatch on a periodic interval. +// It satisfies the Emitter interface so it can be used as a drop-in replacement +// for ChipIngressEmitter. +type ChipIngressBatchEmitter struct { + services.Service + eng *services.Engine + + client chipingress.Client + + workers map[string]*chipIngressEmitterWorker + workersMutex sync.RWMutex + + bufferSize uint + maxBatchSize uint + sendInterval time.Duration + sendTimeout time.Duration +} + +// NewChipIngressBatchEmitter creates a batch emitter backed by the given chipingress client. +// Call Start() to begin health monitoring, and Close() to stop all workers. +func NewChipIngressBatchEmitter(client chipingress.Client, lggr logger.Logger, cfg Config) (*ChipIngressBatchEmitter, error) { + if client == nil { + return nil, fmt.Errorf("chip ingress client is nil") + } + + bufferSize := cfg.ChipIngressBufferSize + if bufferSize == 0 { + bufferSize = 100 + } + maxBatchSize := cfg.ChipIngressMaxBatchSize + if maxBatchSize == 0 { + maxBatchSize = 50 + } + sendInterval := cfg.ChipIngressSendInterval + if sendInterval == 0 { + sendInterval = 500 * time.Millisecond + } + sendTimeout := cfg.ChipIngressSendTimeout + if sendTimeout == 0 { + sendTimeout = 10 * time.Second + } + + e := &ChipIngressBatchEmitter{ + client: client, + workers: make(map[string]*chipIngressEmitterWorker), + bufferSize: bufferSize, + maxBatchSize: maxBatchSize, + sendInterval: sendInterval, + sendTimeout: sendTimeout, + } + + e.Service, e.eng = services.Config{ + Name: "ChipIngressBatchEmitter", + Start: e.start, + }.NewServiceEngine(lggr) + + return e, nil +} + +func (e *ChipIngressBatchEmitter) start(_ context.Context) error { + return nil +} + +// Emit extracts (domain, entity) from the attributes, routes the event to the +// appropriate per-(domain, entity) worker, and returns immediately. +// If the worker's buffer is full, the event is dropped and a warning is logged. +func (e *ChipIngressBatchEmitter) Emit(ctx context.Context, body []byte, attrKVs ...any) error { + domain, entity, err := ExtractSourceAndType(attrKVs...) + if err != nil { + return err + } + + attributes := newAttributes(attrKVs...) + + worker := e.findOrCreateWorker(domain, entity) + + payload := emitterPayload{ + body: body, + attributes: attributes, + domain: domain, + entity: entity, + } + + select { + case worker.ch <- payload: + worker.dropCount.Store(0) + case <-ctx.Done(): + return ctx.Err() + default: + worker.logBufferFullWithExpBackoff(payload) + } + + return nil +} + +// findOrCreateWorker returns the worker for the given (domain, entity) pair, +// creating one with a new buffered channel and GoTick flush loop if it doesn't exist. +func (e *ChipIngressBatchEmitter) findOrCreateWorker(domain, entity string) *chipIngressEmitterWorker { + workerKey := fmt.Sprintf("%s_%s", domain, entity) + + e.workersMutex.RLock() + worker, found := e.workers[workerKey] + e.workersMutex.RUnlock() + + if found { + return worker + } + + e.workersMutex.Lock() + defer e.workersMutex.Unlock() + + // Double-check after acquiring write lock + if worker, found = e.workers[workerKey]; found { + return worker + } + + worker = newChipIngressEmitterWorker( + e.client, + make(chan emitterPayload, e.bufferSize), + domain, + entity, + e.maxBatchSize, + e.sendTimeout, + e.eng, + ) + + e.eng.GoTick(timeutil.NewTicker(func() time.Duration { + return e.sendInterval + }), worker.Send) + + e.workers[workerKey] = worker + return worker +} diff --git a/pkg/beholder/chip_ingress_batch_emitter_test.go b/pkg/beholder/chip_ingress_batch_emitter_test.go new file mode 100644 index 0000000000..02a6a22789 --- /dev/null +++ b/pkg/beholder/chip_ingress_batch_emitter_test.go @@ -0,0 +1,351 @@ +package beholder_test + +import ( + "sync" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" + + "github.com/smartcontractkit/chainlink-common/pkg/beholder" + "github.com/smartcontractkit/chainlink-common/pkg/chipingress" + "github.com/smartcontractkit/chainlink-common/pkg/chipingress/mocks" + "github.com/smartcontractkit/chainlink-common/pkg/logger" +) + +func newTestConfig() beholder.Config { + return beholder.Config{ + ChipIngressBufferSize: 10, + ChipIngressMaxBatchSize: 5, + ChipIngressSendInterval: 50 * time.Millisecond, + ChipIngressSendTimeout: 5 * time.Second, + } +} + +func newTestLogger(t *testing.T) logger.Logger { + t.Helper() + lggr, err := logger.New() + require.NoError(t, err) + return lggr +} + +func TestNewChipIngressBatchEmitter(t *testing.T) { + t.Run("happy path", func(t *testing.T) { + clientMock := mocks.NewClient(t) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), newTestConfig()) + require.NoError(t, err) + assert.NotNil(t, emitter) + }) + + t.Run("returns error when client is nil", func(t *testing.T) { + emitter, err := beholder.NewChipIngressBatchEmitter(nil, newTestLogger(t), newTestConfig()) + assert.Error(t, err) + assert.Nil(t, emitter) + }) +} + +func TestChipIngressBatchEmitter_Emit(t *testing.T) { + t.Run("returns error when domain/entity missing", func(t *testing.T) { + clientMock := mocks.NewClient(t) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), newTestConfig()) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + defer emitter.Close() //nolint:errcheck + + err = emitter.Emit(t.Context(), []byte("test"), "bad_key", "bad_value") + assert.Error(t, err) + }) + + t.Run("enqueues and does not call PublishBatch immediately", func(t *testing.T) { + clientMock := mocks.NewClient(t) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), newTestConfig()) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + defer emitter.Close() //nolint:errcheck + + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "test-domain", + beholder.AttrKeyEntity, "test-entity", + ) + require.NoError(t, err) + + // PublishBatch should NOT have been called yet (event is just buffered) + clientMock.AssertNotCalled(t, "PublishBatch", mock.Anything, mock.Anything) + }) +} + +func TestChipIngressBatchEmitter_BatchAssembly(t *testing.T) { + t.Run("events are batched and sent via PublishBatch", func(t *testing.T) { + clientMock := mocks.NewClient(t) + + var mu sync.Mutex + var receivedBatches []*chipingress.CloudEventBatch + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(args mock.Arguments) { + mu.Lock() + defer mu.Unlock() + batch := args.Get(1).(*chipingress.CloudEventBatch) + receivedBatches = append(receivedBatches, batch) + }). + Return(nil, nil) + + cfg := newTestConfig() + cfg.ChipIngressSendInterval = 50 * time.Millisecond + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), cfg) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + + for i := 0; i < 3; i++ { + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + } + + // Wait for flush to occur + assert.Eventually(t, func() bool { + mu.Lock() + defer mu.Unlock() + return len(receivedBatches) > 0 + }, 2*time.Second, 10*time.Millisecond) + + require.NoError(t, emitter.Close()) + + mu.Lock() + defer mu.Unlock() + + totalEvents := 0 + for _, batch := range receivedBatches { + totalEvents += len(batch.Events) + } + assert.Equal(t, 3, totalEvents) + }) +} + +func TestChipIngressBatchEmitter_MaxBatchSize(t *testing.T) { + t.Run("batch is capped at maxBatchSize", func(t *testing.T) { + clientMock := mocks.NewClient(t) + + var mu sync.Mutex + var batchSizes []int + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(args mock.Arguments) { + mu.Lock() + defer mu.Unlock() + batch := args.Get(1).(*chipingress.CloudEventBatch) + batchSizes = append(batchSizes, len(batch.Events)) + }). + Return(nil, nil) + + cfg := newTestConfig() + cfg.ChipIngressBufferSize = 20 + cfg.ChipIngressMaxBatchSize = 3 + cfg.ChipIngressSendInterval = 50 * time.Millisecond + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), cfg) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + + for i := 0; i < 7; i++ { + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + } + + // Wait for all events to be flushed + assert.Eventually(t, func() bool { + mu.Lock() + defer mu.Unlock() + total := 0 + for _, s := range batchSizes { + total += s + } + return total >= 7 + }, 2*time.Second, 10*time.Millisecond) + + require.NoError(t, emitter.Close()) + + mu.Lock() + defer mu.Unlock() + for _, size := range batchSizes { + assert.LessOrEqual(t, size, 3, "batch size should not exceed maxBatchSize") + } + }) +} + +func TestChipIngressBatchEmitter_PerDomainEntityIsolation(t *testing.T) { + t.Run("separate workers for different domain/entity pairs", func(t *testing.T) { + clientMock := mocks.NewClient(t) + + var mu sync.Mutex + domainEntitySeen := make(map[string]int) + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(args mock.Arguments) { + mu.Lock() + defer mu.Unlock() + batch := args.Get(1).(*chipingress.CloudEventBatch) + for _, event := range batch.Events { + key := event.Source + "/" + event.Type + domainEntitySeen[key] += 1 + } + }). + Return(nil, nil) + + cfg := newTestConfig() + cfg.ChipIngressSendInterval = 50 * time.Millisecond + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), cfg) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + + // Emit events for two different domain/entity pairs + for i := 0; i < 3; i++ { + err = emitter.Emit(t.Context(), []byte("workflow"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "WorkflowEvent", + ) + require.NoError(t, err) + } + for i := 0; i < 2; i++ { + err = emitter.Emit(t.Context(), []byte("bridge"), + beholder.AttrKeyDomain, "data-feeds", + beholder.AttrKeyEntity, "BridgeStatus", + ) + require.NoError(t, err) + } + + // Wait for both to be flushed + assert.Eventually(t, func() bool { + mu.Lock() + defer mu.Unlock() + return domainEntitySeen["platform/WorkflowEvent"] >= 3 && + domainEntitySeen["data-feeds/BridgeStatus"] >= 2 + }, 2*time.Second, 10*time.Millisecond) + + require.NoError(t, emitter.Close()) + + mu.Lock() + defer mu.Unlock() + assert.Equal(t, 3, domainEntitySeen["platform/WorkflowEvent"]) + assert.Equal(t, 2, domainEntitySeen["data-feeds/BridgeStatus"]) + }) +} + +func TestChipIngressBatchEmitter_BufferFull(t *testing.T) { + t.Run("events are dropped when buffer is full", func(t *testing.T) { + clientMock := mocks.NewClient(t) + // Block PublishBatch so the buffer fills up + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Return(nil, nil). + Maybe() + + cfg := newTestConfig() + cfg.ChipIngressBufferSize = 3 + cfg.ChipIngressSendInterval = 10 * time.Second // very long interval to prevent flushing + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), cfg) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + defer emitter.Close() //nolint:errcheck + + // Fill the buffer (3 events) + for i := 0; i < 3; i++ { + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + } + + // This should not error (it drops silently), but the event won't be delivered + err = emitter.Emit(t.Context(), []byte("dropped"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + assert.NoError(t, err) + }) +} + +func TestChipIngressBatchEmitter_Lifecycle(t *testing.T) { + t.Run("start and close cleanly", func(t *testing.T) { + clientMock := mocks.NewClient(t) + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Return(nil, nil). + Maybe() + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), newTestConfig()) + require.NoError(t, err) + + require.NoError(t, emitter.Start(t.Context())) + + // Emit a few events to create workers + for i := 0; i < 3; i++ { + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + } + + // Close should not hang or error + require.NoError(t, emitter.Close()) + }) +} + +func TestChipIngressBatchEmitter_CloudEventFormat(t *testing.T) { + t.Run("CloudEvents have correct source, type, and data", func(t *testing.T) { + clientMock := mocks.NewClient(t) + + var mu sync.Mutex + var receivedBatch *chipingress.CloudEventBatch + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(args mock.Arguments) { + mu.Lock() + defer mu.Unlock() + receivedBatch = args.Get(1).(*chipingress.CloudEventBatch) + }). + Return(nil, nil) + + cfg := newTestConfig() + cfg.ChipIngressSendInterval = 50 * time.Millisecond + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), cfg) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + + err = emitter.Emit(t.Context(), []byte("test-payload"), + beholder.AttrKeyDomain, "my-domain", + beholder.AttrKeyEntity, "my-entity", + ) + require.NoError(t, err) + + assert.Eventually(t, func() bool { + mu.Lock() + defer mu.Unlock() + return receivedBatch != nil + }, 2*time.Second, 10*time.Millisecond) + + require.NoError(t, emitter.Close()) + + mu.Lock() + defer mu.Unlock() + require.Len(t, receivedBatch.Events, 1) + + event := receivedBatch.Events[0] + assert.Equal(t, "my-domain", event.Source) + assert.Equal(t, "my-entity", event.Type) + assert.NotEmpty(t, event.Id) + }) +} diff --git a/pkg/beholder/chip_ingress_emitter_worker.go b/pkg/beholder/chip_ingress_emitter_worker.go new file mode 100644 index 0000000000..6b9c61759e --- /dev/null +++ b/pkg/beholder/chip_ingress_emitter_worker.go @@ -0,0 +1,118 @@ +package beholder + +import ( + "context" + "fmt" + "sync/atomic" + "time" + + "github.com/smartcontractkit/chainlink-common/pkg/chipingress" + "github.com/smartcontractkit/chainlink-common/pkg/logger" +) + +// emitterPayload holds a single event to be batched and sent via chip ingress. +type emitterPayload struct { + body []byte + attributes map[string]any + domain string + entity string +} + +// chipIngressEmitterWorker buffers events for a single (domain, entity) pair +// and flushes them via PublishBatch on a periodic interval. +type chipIngressEmitterWorker struct { + client chipingress.Client + ch chan emitterPayload + domain string + entity string + maxBatchSize uint + sendTimeout time.Duration + lggr logger.Logger + dropCount atomic.Uint32 +} + +func newChipIngressEmitterWorker( + client chipingress.Client, + ch chan emitterPayload, + domain string, + entity string, + maxBatchSize uint, + sendTimeout time.Duration, + lggr logger.Logger, +) *chipIngressEmitterWorker { + return &chipIngressEmitterWorker{ + client: client, + ch: ch, + domain: domain, + entity: entity, + maxBatchSize: maxBatchSize, + sendTimeout: sendTimeout, + lggr: logger.Named(lggr, "ChipIngressEmitterWorker"), + } +} + +// Send drains the channel and sends a batch. Called periodically by GoTick. +func (w *chipIngressEmitterWorker) Send(ctx context.Context) { + if len(w.ch) == 0 { + return + } + + batch := w.buildBatch() + if batch == nil || len(batch.Events) == 0 { + return + } + + ctx, cancel := context.WithTimeout(ctx, w.sendTimeout) + defer cancel() + + _, err := w.client.PublishBatch(ctx, batch) + if err != nil { + w.lggr.Warnf("could not send batch via chip ingress: %v", err) + return + } +} + +// buildBatch drains the channel up to maxBatchSize and converts payloads to a CloudEventBatch. +func (w *chipIngressEmitterWorker) buildBatch() *chipingress.CloudEventBatch { + var events []chipingress.CloudEvent + + for len(w.ch) > 0 && len(events) < int(w.maxBatchSize) { // #nosec G115 + payload := <-w.ch + event, err := w.payloadToEvent(payload) + if err != nil { + w.lggr.Warnf("failed to build CloudEvent, dropping: %v", err) + continue + } + events = append(events, event) + } + + if len(events) == 0 { + return nil + } + + batch, err := chipingress.EventsToBatch(events) + if err != nil { + w.lggr.Warnf("failed to convert events to batch: %v", err) + return nil + } + + return batch +} + +func (w *chipIngressEmitterWorker) payloadToEvent(payload emitterPayload) (chipingress.CloudEvent, error) { + event, err := chipingress.NewEvent(payload.domain, payload.entity, payload.body, payload.attributes) + if err != nil { + return chipingress.CloudEvent{}, fmt.Errorf("failed to create CloudEvent: %w", err) + } + return event, nil +} + +// logBufferFullWithExpBackoff logs at 1, 2, 4, 8, 16, 32, 64, 100, 200, 300, ... +// to avoid flooding logs when the buffer is persistently full. +func (w *chipIngressEmitterWorker) logBufferFullWithExpBackoff(payload emitterPayload) { + count := w.dropCount.Add(1) + if count > 0 && (count%100 == 0 || count&(count-1) == 0) { + w.lggr.Warnf("chip ingress emitter buffer full, dropping event (domain=%s, entity=%s, droppedCount=%d)", + payload.domain, payload.entity, count) + } +} diff --git a/pkg/beholder/client.go b/pkg/beholder/client.go index 328d877c49..f10c01ae29 100644 --- a/pkg/beholder/client.go +++ b/pkg/beholder/client.go @@ -8,6 +8,7 @@ import ( "io" "time" + ccllogger "github.com/smartcontractkit/chainlink-common/pkg/logger" "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc" @@ -187,7 +188,7 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro // This will eventually be removed in favor of chip-ingress emitter // and logs will be sent via OTLP using the regular Logger instead of calling Emit emitter := NewMessageEmitter(messageLogger) - + var batchEmitter *ChipIngressBatchEmitter var chipIngressClient chipingress.Client = &chipingress.NoopClient{} // if chip ingress is enabled, create dual source emitter that sends to both otel collector and chip ingress // eventually we will remove the dual source emitter and just use chip ingress @@ -223,18 +224,27 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro return nil, err } - chipIngressEmitter, err := NewChipIngressEmitter(chipIngressClient) + lggr, lErr := ccllogger.New() + if lErr != nil { + return nil, fmt.Errorf("failed to create logger for chip ingress batch emitter: %w", lErr) + } + batchEmitter, err = NewChipIngressBatchEmitter(chipIngressClient, lggr, cfg) if err != nil { - return nil, fmt.Errorf("failed to create chip ingress emitter: %w", err) + return nil, fmt.Errorf("failed to create chip ingress batch emitter: %w", err) + } + if err = batchEmitter.Start(context.Background()); err != nil { + return nil, fmt.Errorf("failed to start chip ingress batch emitter: %w", err) } - emitter, err = NewDualSourceEmitter(chipIngressEmitter, emitter) + emitter, err = NewDualSourceEmitter(batchEmitter, emitter) if err != nil { return nil, fmt.Errorf("failed to create dual source emitter: %w", err) } } onClose := func() (err error) { + // batchEmitter is closed via DualSourceEmitter.Close() -> chipIngressEmitter.Close(), + // which is called by Client.Close() -> c.Emitter.Close() before OnClose runs. for _, provider := range []shutdowner{messageLoggerProvider, loggerProvider, tracerProvider, meterProvider, messageLoggerProvider} { err = errors.Join(err, provider.Shutdown(context.Background())) } diff --git a/pkg/beholder/config.go b/pkg/beholder/config.go index f3e4561b68..506dce9d0e 100644 --- a/pkg/beholder/config.go +++ b/pkg/beholder/config.go @@ -44,6 +44,12 @@ type Config struct { ChipIngressEmitterGRPCEndpoint string ChipIngressInsecureConnection bool // Disables TLS for Chip Ingress Emitter + // Chip Ingress Batch Emitter + ChipIngressBufferSize uint // Per-worker channel buffer size (default 100) + ChipIngressMaxBatchSize uint // Max events per PublishBatch call (default 50) + ChipIngressSendInterval time.Duration // Flush interval per worker (default 500ms). Zero disables batching. + ChipIngressSendTimeout time.Duration // Timeout per PublishBatch call (default 10s) + // OTel Log LogExportTimeout time.Duration LogExportInterval time.Duration @@ -131,8 +137,13 @@ func DefaultConfig() Config { LogMaxQueueSize: 2048, LogBatchProcessor: true, LogStreamingEnabled: true, // Enable logs streaming by default - LogLevel: zapcore.InfoLevel, - LogCompressor: "gzip", + LogLevel: zapcore.InfoLevel, + LogCompressor: "gzip", + // Chip Ingress Batch Emitter + ChipIngressBufferSize: 100, + ChipIngressMaxBatchSize: 50, + ChipIngressSendInterval: 500 * time.Millisecond, + ChipIngressSendTimeout: 10 * time.Second, // Auth (defaults to static auth mode with TTL=0) AuthHeadersTTL: 0, } diff --git a/pkg/beholder/config_test.go b/pkg/beholder/config_test.go index ae156db9c9..9ecdd2280e 100644 --- a/pkg/beholder/config_test.go +++ b/pkg/beholder/config_test.go @@ -67,6 +67,6 @@ func ExampleConfig() { } fmt.Printf("%+v\n", *config.LogRetryConfig) // Output: - // {InsecureConnection:true CACertFile: OtelExporterGRPCEndpoint:localhost:4317 OtelExporterHTTPEndpoint:localhost:4318 ResourceAttributes:[{Key:package_name Value:{vtype:4 numeric:0 stringly:beholder slice:}} {Key:sender Value:{vtype:4 numeric:0 stringly:beholderclient slice:}}] EmitterExportTimeout:1s EmitterExportInterval:1s EmitterExportMaxBatchSize:512 EmitterMaxQueueSize:2048 EmitterBatchProcessor:true TraceSampleRatio:1 TraceBatchTimeout:1s TraceSpanExporter: TraceRetryConfig: TraceCompressor:gzip MetricReaderInterval:1s MetricRetryConfig: MetricViews:[] MetricCompressor:gzip ChipIngressEmitterEnabled:false ChipIngressEmitterGRPCEndpoint: ChipIngressInsecureConnection:false LogExportTimeout:1s LogExportInterval:1s LogExportMaxBatchSize:512 LogMaxQueueSize:2048 LogBatchProcessor:true LogRetryConfig: LogStreamingEnabled:false LogLevel:info LogCompressor:gzip AuthHeaders:map[] AuthHeadersTTL:0s AuthKeySigner: AuthPublicKeyHex:} + // {InsecureConnection:true CACertFile: OtelExporterGRPCEndpoint:localhost:4317 OtelExporterHTTPEndpoint:localhost:4318 ResourceAttributes:[{Key:package_name Value:{vtype:4 numeric:0 stringly:beholder slice:}} {Key:sender Value:{vtype:4 numeric:0 stringly:beholderclient slice:}}] EmitterExportTimeout:1s EmitterExportInterval:1s EmitterExportMaxBatchSize:512 EmitterMaxQueueSize:2048 EmitterBatchProcessor:true TraceSampleRatio:1 TraceBatchTimeout:1s TraceSpanExporter: TraceRetryConfig: TraceCompressor:gzip MetricReaderInterval:1s MetricRetryConfig: MetricViews:[] MetricCompressor:gzip ChipIngressEmitterEnabled:false ChipIngressEmitterGRPCEndpoint: ChipIngressInsecureConnection:false ChipIngressBufferSize:0 ChipIngressMaxBatchSize:0 ChipIngressSendInterval:0s ChipIngressSendTimeout:0s LogExportTimeout:1s LogExportInterval:1s LogExportMaxBatchSize:512 LogMaxQueueSize:2048 LogBatchProcessor:true LogRetryConfig: LogStreamingEnabled:false LogLevel:info LogCompressor:gzip AuthHeaders:map[] AuthHeadersTTL:0s AuthKeySigner: AuthPublicKeyHex:} // {InitialInterval:5s MaxInterval:30s MaxElapsedTime:1m0s} } diff --git a/pkg/beholder/dual_source_emitter.go b/pkg/beholder/dual_source_emitter.go index 5167efaece..d34dbb614e 100644 --- a/pkg/beholder/dual_source_emitter.go +++ b/pkg/beholder/dual_source_emitter.go @@ -62,22 +62,11 @@ func (d *DualSourceEmitter) Emit(ctx context.Context, body []byte, attrKVs ...an return err } - // Emit via chip ingress async - if err := d.wg.TryAdd(1); err != nil { - return err + // Emit via chip ingress. When backed by ChipIngressBatchEmitter this is + // non-blocking (just a channel send), so no goroutine wrapper is needed. + if err := d.chipIngressEmitter.Emit(ctx, body, attrKVs...); err != nil { + d.log.Infof("failed to emit to chip ingress: %v", err) } - go func(ctx context.Context) { - defer d.wg.Done() - var cancel context.CancelFunc - ctx, cancel = d.stopCh.Ctx(ctx) - defer cancel() - - if err := d.chipIngressEmitter.Emit(ctx, body, attrKVs...); err != nil { - // If the chip ingress emitter fails, we ONLY log the error - // because we still want to send the data to the OTLP collector and not cause disruption - d.log.Infof("failed to emit to chip ingress: %v", err) - } - }(context.WithoutCancel(ctx)) return nil } From 5ec2dec06bf822abfa161fd01e2a77d499265bfb Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Fri, 27 Feb 2026 16:39:52 +0200 Subject: [PATCH 02/28] Amend chip_ingress_batch_emitter_test.go --- .../chip_ingress_batch_emitter_test.go | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/pkg/beholder/chip_ingress_batch_emitter_test.go b/pkg/beholder/chip_ingress_batch_emitter_test.go index 02a6a22789..483d55e86f 100644 --- a/pkg/beholder/chip_ingress_batch_emitter_test.go +++ b/pkg/beholder/chip_ingress_batch_emitter_test.go @@ -1,6 +1,7 @@ package beholder_test import ( + "context" "sync" "testing" "time" @@ -349,3 +350,114 @@ func TestChipIngressBatchEmitter_CloudEventFormat(t *testing.T) { assert.NotEmpty(t, event.Id) }) } + +func TestChipIngressBatchEmitter_PublishBatchError(t *testing.T) { + t.Run("PublishBatch error is handled gracefully", func(t *testing.T) { + clientMock := mocks.NewClient(t) + + var mu sync.Mutex + callCount := 0 + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(_ mock.Arguments) { + mu.Lock() + defer mu.Unlock() + callCount++ + }). + Return(nil, assert.AnError) + + cfg := newTestConfig() + cfg.ChipIngressSendInterval = 50 * time.Millisecond + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), cfg) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + + for i := 0; i < 3; i++ { + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + } + + assert.Eventually(t, func() bool { + mu.Lock() + defer mu.Unlock() + return callCount > 0 + }, 2*time.Second, 10*time.Millisecond) + + require.NoError(t, emitter.Close()) + }) +} + +func TestChipIngressBatchEmitter_ContextCancellation(t *testing.T) { + t.Run("Emit returns context error when context is cancelled", func(t *testing.T) { + clientMock := mocks.NewClient(t) + + cfg := newTestConfig() + cfg.ChipIngressBufferSize = 1 + cfg.ChipIngressSendInterval = 10 * time.Second + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), cfg) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + defer emitter.Close() //nolint:errcheck + + // Fill the buffer so the next Emit will block on channel send + err = emitter.Emit(t.Context(), []byte("fill"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + + ctx, cancel := context.WithCancel(t.Context()) + cancel() + + err = emitter.Emit(ctx, []byte("should-fail"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + assert.ErrorIs(t, err, context.Canceled) + }) +} + +func TestChipIngressBatchEmitter_DefaultConfig(t *testing.T) { + t.Run("zero config uses sane defaults", func(t *testing.T) { + clientMock := mocks.NewClient(t) + + var mu sync.Mutex + var receivedBatch *chipingress.CloudEventBatch + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(args mock.Arguments) { + mu.Lock() + defer mu.Unlock() + receivedBatch = args.Get(1).(*chipingress.CloudEventBatch) + }). + Return(nil, nil) + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), beholder.Config{}) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + + // Default send interval is 500ms; wait for flush + assert.Eventually(t, func() bool { + mu.Lock() + defer mu.Unlock() + return receivedBatch != nil + }, 3*time.Second, 50*time.Millisecond) + + require.NoError(t, emitter.Close()) + + mu.Lock() + defer mu.Unlock() + require.Len(t, receivedBatch.Events, 1) + }) +} From ab95437bc72bb8786a75897171bae02ebd062317 Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Fri, 27 Feb 2026 20:59:21 +0200 Subject: [PATCH 03/28] Fix comment --- pkg/beholder/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/beholder/config.go b/pkg/beholder/config.go index 506dce9d0e..a19b76ca1e 100644 --- a/pkg/beholder/config.go +++ b/pkg/beholder/config.go @@ -47,7 +47,7 @@ type Config struct { // Chip Ingress Batch Emitter ChipIngressBufferSize uint // Per-worker channel buffer size (default 100) ChipIngressMaxBatchSize uint // Max events per PublishBatch call (default 50) - ChipIngressSendInterval time.Duration // Flush interval per worker (default 500ms). Zero disables batching. + ChipIngressSendInterval time.Duration // Flush interval per worker (default 500ms when zero or unset) ChipIngressSendTimeout time.Duration // Timeout per PublishBatch call (default 10s) // OTel Log From f31d3e8a3f615f92402e7eb2de812e165eade2ec Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Mon, 2 Mar 2026 11:30:52 +0200 Subject: [PATCH 04/28] batchEmitter -> batchEmitterService --- pkg/beholder/client.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/beholder/client.go b/pkg/beholder/client.go index f10c01ae29..4cdb05c404 100644 --- a/pkg/beholder/client.go +++ b/pkg/beholder/client.go @@ -188,7 +188,7 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro // This will eventually be removed in favor of chip-ingress emitter // and logs will be sent via OTLP using the regular Logger instead of calling Emit emitter := NewMessageEmitter(messageLogger) - var batchEmitter *ChipIngressBatchEmitter + var batchEmitterService *ChipIngressBatchEmitter var chipIngressClient chipingress.Client = &chipingress.NoopClient{} // if chip ingress is enabled, create dual source emitter that sends to both otel collector and chip ingress // eventually we will remove the dual source emitter and just use chip ingress @@ -228,22 +228,22 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro if lErr != nil { return nil, fmt.Errorf("failed to create logger for chip ingress batch emitter: %w", lErr) } - batchEmitter, err = NewChipIngressBatchEmitter(chipIngressClient, lggr, cfg) + batchEmitterService, err = NewChipIngressBatchEmitter(chipIngressClient, lggr, cfg) if err != nil { return nil, fmt.Errorf("failed to create chip ingress batch emitter: %w", err) } - if err = batchEmitter.Start(context.Background()); err != nil { + if err = batchEmitterService.Start(context.Background()); err != nil { return nil, fmt.Errorf("failed to start chip ingress batch emitter: %w", err) } - emitter, err = NewDualSourceEmitter(batchEmitter, emitter) + emitter, err = NewDualSourceEmitter(batchEmitterService, emitter) if err != nil { return nil, fmt.Errorf("failed to create dual source emitter: %w", err) } } onClose := func() (err error) { - // batchEmitter is closed via DualSourceEmitter.Close() -> chipIngressEmitter.Close(), + // batchEmitterService is closed via DualSourceEmitter.Close() -> chipIngressEmitter.Close(), // which is called by Client.Close() -> c.Emitter.Close() before OnClose runs. for _, provider := range []shutdowner{messageLoggerProvider, loggerProvider, tracerProvider, meterProvider, messageLoggerProvider} { err = errors.Join(err, provider.Shutdown(context.Background())) From ba186b773de0cc5f91f209578ded9d9c2d44375e Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Mon, 2 Mar 2026 13:18:18 +0200 Subject: [PATCH 05/28] Add feature flag --- pkg/beholder/client.go | 31 ++++++++++++++++++++----------- pkg/beholder/config.go | 12 +++++++----- pkg/beholder/config_test.go | 2 +- pkg/loop/config.go | 15 +++++++++++---- pkg/loop/config_test.go | 11 +++++++---- pkg/loop/server.go | 1 + 6 files changed, 47 insertions(+), 25 deletions(-) diff --git a/pkg/beholder/client.go b/pkg/beholder/client.go index 4cdb05c404..e6e7c435e2 100644 --- a/pkg/beholder/client.go +++ b/pkg/beholder/client.go @@ -224,19 +224,28 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro return nil, err } - lggr, lErr := ccllogger.New() - if lErr != nil { - return nil, fmt.Errorf("failed to create logger for chip ingress batch emitter: %w", lErr) - } - batchEmitterService, err = NewChipIngressBatchEmitter(chipIngressClient, lggr, cfg) - if err != nil { - return nil, fmt.Errorf("failed to create chip ingress batch emitter: %w", err) - } - if err = batchEmitterService.Start(context.Background()); err != nil { - return nil, fmt.Errorf("failed to start chip ingress batch emitter: %w", err) + var chipIngressEmitter Emitter + if cfg.ChipIngressBatchEmitterEnabled { + lggr, lErr := ccllogger.New() + if lErr != nil { + return nil, fmt.Errorf("failed to create logger for chip ingress batch emitter: %w", lErr) + } + batchEmitterService, err = NewChipIngressBatchEmitter(chipIngressClient, lggr, cfg) + if err != nil { + return nil, fmt.Errorf("failed to create chip ingress batch emitter: %w", err) + } + if err = batchEmitterService.Start(context.Background()); err != nil { + return nil, fmt.Errorf("failed to start chip ingress batch emitter: %w", err) + } + chipIngressEmitter = batchEmitterService + } else { + chipIngressEmitter, err = NewChipIngressEmitter(chipIngressClient) + if err != nil { + return nil, fmt.Errorf("failed to create chip ingress emitter: %w", err) + } } - emitter, err = NewDualSourceEmitter(batchEmitterService, emitter) + emitter, err = NewDualSourceEmitter(chipIngressEmitter, emitter) if err != nil { return nil, fmt.Errorf("failed to create dual source emitter: %w", err) } diff --git a/pkg/beholder/config.go b/pkg/beholder/config.go index a19b76ca1e..06b877279e 100644 --- a/pkg/beholder/config.go +++ b/pkg/beholder/config.go @@ -45,10 +45,11 @@ type Config struct { ChipIngressInsecureConnection bool // Disables TLS for Chip Ingress Emitter // Chip Ingress Batch Emitter - ChipIngressBufferSize uint // Per-worker channel buffer size (default 100) - ChipIngressMaxBatchSize uint // Max events per PublishBatch call (default 50) - ChipIngressSendInterval time.Duration // Flush interval per worker (default 500ms when zero or unset) - ChipIngressSendTimeout time.Duration // Timeout per PublishBatch call (default 10s) + ChipIngressBatchEmitterEnabled bool // When true (default), use batch emitter; when false, use legacy per-event emitter + ChipIngressBufferSize uint // Per-worker channel buffer size (default 100) + ChipIngressMaxBatchSize uint // Max events per PublishBatch call (default 50) + ChipIngressSendInterval time.Duration // Flush interval per worker (default 500ms when zero or unset) + ChipIngressSendTimeout time.Duration // Timeout per PublishBatch call (default 10s) // OTel Log LogExportTimeout time.Duration @@ -140,7 +141,8 @@ func DefaultConfig() Config { LogLevel: zapcore.InfoLevel, LogCompressor: "gzip", // Chip Ingress Batch Emitter - ChipIngressBufferSize: 100, + ChipIngressBatchEmitterEnabled: false, + ChipIngressBufferSize: 100, ChipIngressMaxBatchSize: 50, ChipIngressSendInterval: 500 * time.Millisecond, ChipIngressSendTimeout: 10 * time.Second, diff --git a/pkg/beholder/config_test.go b/pkg/beholder/config_test.go index 9ecdd2280e..024ba805d2 100644 --- a/pkg/beholder/config_test.go +++ b/pkg/beholder/config_test.go @@ -67,6 +67,6 @@ func ExampleConfig() { } fmt.Printf("%+v\n", *config.LogRetryConfig) // Output: - // {InsecureConnection:true CACertFile: OtelExporterGRPCEndpoint:localhost:4317 OtelExporterHTTPEndpoint:localhost:4318 ResourceAttributes:[{Key:package_name Value:{vtype:4 numeric:0 stringly:beholder slice:}} {Key:sender Value:{vtype:4 numeric:0 stringly:beholderclient slice:}}] EmitterExportTimeout:1s EmitterExportInterval:1s EmitterExportMaxBatchSize:512 EmitterMaxQueueSize:2048 EmitterBatchProcessor:true TraceSampleRatio:1 TraceBatchTimeout:1s TraceSpanExporter: TraceRetryConfig: TraceCompressor:gzip MetricReaderInterval:1s MetricRetryConfig: MetricViews:[] MetricCompressor:gzip ChipIngressEmitterEnabled:false ChipIngressEmitterGRPCEndpoint: ChipIngressInsecureConnection:false ChipIngressBufferSize:0 ChipIngressMaxBatchSize:0 ChipIngressSendInterval:0s ChipIngressSendTimeout:0s LogExportTimeout:1s LogExportInterval:1s LogExportMaxBatchSize:512 LogMaxQueueSize:2048 LogBatchProcessor:true LogRetryConfig: LogStreamingEnabled:false LogLevel:info LogCompressor:gzip AuthHeaders:map[] AuthHeadersTTL:0s AuthKeySigner: AuthPublicKeyHex:} + // {InsecureConnection:true CACertFile: OtelExporterGRPCEndpoint:localhost:4317 OtelExporterHTTPEndpoint:localhost:4318 ResourceAttributes:[{Key:package_name Value:{vtype:4 numeric:0 stringly:beholder slice:}} {Key:sender Value:{vtype:4 numeric:0 stringly:beholderclient slice:}}] EmitterExportTimeout:1s EmitterExportInterval:1s EmitterExportMaxBatchSize:512 EmitterMaxQueueSize:2048 EmitterBatchProcessor:true TraceSampleRatio:1 TraceBatchTimeout:1s TraceSpanExporter: TraceRetryConfig: TraceCompressor:gzip MetricReaderInterval:1s MetricRetryConfig: MetricViews:[] MetricCompressor:gzip ChipIngressEmitterEnabled:false ChipIngressEmitterGRPCEndpoint: ChipIngressInsecureConnection:false ChipIngressBatchEmitterEnabled:false ChipIngressBufferSize:0 ChipIngressMaxBatchSize:0 ChipIngressSendInterval:0s ChipIngressSendTimeout:0s LogExportTimeout:1s LogExportInterval:1s LogExportMaxBatchSize:512 LogMaxQueueSize:2048 LogBatchProcessor:true LogRetryConfig: LogStreamingEnabled:false LogLevel:info LogCompressor:gzip AuthHeaders:map[] AuthHeadersTTL:0s AuthKeySigner: AuthPublicKeyHex:} // {InitialInterval:5s MaxInterval:30s MaxElapsedTime:1m0s} } diff --git a/pkg/loop/config.go b/pkg/loop/config.go index 19e1ab7f10..1a0f71e26d 100644 --- a/pkg/loop/config.go +++ b/pkg/loop/config.go @@ -77,8 +77,9 @@ const ( envTelemetryMetricCompressor = "CL_TELEMETRY_METRIC_COMPRESSOR" envTelemetryLogCompressor = "CL_TELEMETRY_LOG_COMPRESSOR" - envChipIngressEndpoint = "CL_CHIP_INGRESS_ENDPOINT" - envChipIngressInsecureConnection = "CL_CHIP_INGRESS_INSECURE_CONNECTION" + envChipIngressEndpoint = "CL_CHIP_INGRESS_ENDPOINT" + envChipIngressInsecureConnection = "CL_CHIP_INGRESS_INSECURE_CONNECTION" + envChipIngressBatchEmitterEnabled = "CL_CHIP_INGRESS_BATCH_EMITTER_ENABLED" envCRESettings = cresettings.EnvNameSettings envCRESettingsDefault = cresettings.EnvNameSettingsDefault @@ -148,8 +149,9 @@ type EnvConfig struct { TelemetryMetricCompressor string TelemetryLogCompressor string - ChipIngressEndpoint string - ChipIngressInsecureConnection bool + ChipIngressEndpoint string + ChipIngressInsecureConnection bool + ChipIngressBatchEmitterEnabled bool CRESettings string CRESettingsDefault string @@ -234,6 +236,7 @@ func (e *EnvConfig) AsCmdEnv() (env []string) { add(envChipIngressEndpoint, e.ChipIngressEndpoint) add(envChipIngressInsecureConnection, strconv.FormatBool(e.ChipIngressInsecureConnection)) + add(envChipIngressBatchEmitterEnabled, strconv.FormatBool(e.ChipIngressBatchEmitterEnabled)) if e.CRESettings != "" { add(envCRESettings, e.CRESettings) @@ -449,6 +452,10 @@ func (e *EnvConfig) parse() error { if err != nil { return fmt.Errorf("failed to parse %s: %w", envChipIngressInsecureConnection, err) } + e.ChipIngressBatchEmitterEnabled, err = getBool(envChipIngressBatchEmitterEnabled) + if err != nil { + return fmt.Errorf("failed to parse %s: %w", envChipIngressBatchEmitterEnabled, err) + } } e.CRESettings = os.Getenv(envCRESettings) diff --git a/pkg/loop/config_test.go b/pkg/loop/config_test.go index a0b5ef83d9..5207dbbeaa 100644 --- a/pkg/loop/config_test.go +++ b/pkg/loop/config_test.go @@ -78,8 +78,9 @@ func TestEnvConfig_parse(t *testing.T) { envTelemetryEmitterMaxQueueSize: "1000", envTelemetryLogStreamingEnabled: "false", - envChipIngressEndpoint: "chip-ingress.example.com:50051", - envChipIngressInsecureConnection: "true", + envChipIngressEndpoint: "chip-ingress.example.com:50051", + envChipIngressInsecureConnection: "true", + envChipIngressBatchEmitterEnabled: "false", envCRESettings: `{"global":{}}`, envCRESettingsDefault: `{"foo":"bar"}`, @@ -182,8 +183,9 @@ var envCfgFull = EnvConfig{ TelemetryEmitterMaxQueueSize: 1000, TelemetryLogStreamingEnabled: false, - ChipIngressEndpoint: "chip-ingress.example.com:50051", - ChipIngressInsecureConnection: true, + ChipIngressEndpoint: "chip-ingress.example.com:50051", + ChipIngressInsecureConnection: true, + ChipIngressBatchEmitterEnabled: false, CRESettings: `{"global":{}}`, CRESettingsDefault: `{"foo":"bar"}`, @@ -239,6 +241,7 @@ func TestEnvConfig_AsCmdEnv(t *testing.T) { // Assert ChipIngress environment variables assert.Equal(t, "chip-ingress.example.com:50051", got[envChipIngressEndpoint]) assert.Equal(t, "true", got[envChipIngressInsecureConnection]) + assert.Equal(t, "false", got[envChipIngressBatchEmitterEnabled]) assert.Equal(t, `{"global":{}}`, got[envCRESettings]) assert.Equal(t, `{"foo":"bar"}`, got[envCRESettingsDefault]) diff --git a/pkg/loop/server.go b/pkg/loop/server.go index ae61c9d6c4..b36bc30a04 100644 --- a/pkg/loop/server.go +++ b/pkg/loop/server.go @@ -173,6 +173,7 @@ func (s *Server) start(opts ...ServerOpt) error { ChipIngressEmitterEnabled: s.EnvConfig.ChipIngressEndpoint != "", ChipIngressEmitterGRPCEndpoint: s.EnvConfig.ChipIngressEndpoint, ChipIngressInsecureConnection: s.EnvConfig.ChipIngressInsecureConnection, + ChipIngressBatchEmitterEnabled: s.EnvConfig.ChipIngressBatchEmitterEnabled, MetricCompressor: s.EnvConfig.TelemetryMetricCompressor, } From dc9e23570efb7ad62de40c3018813ce7e57e22e9 Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Mon, 2 Mar 2026 13:47:23 +0200 Subject: [PATCH 06/28] Make logger last parameter --- pkg/beholder/chip_ingress_batch_emitter.go | 2 +- .../chip_ingress_batch_emitter_test.go | 26 +++++++++---------- pkg/beholder/client.go | 2 +- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/pkg/beholder/chip_ingress_batch_emitter.go b/pkg/beholder/chip_ingress_batch_emitter.go index 37949069cf..b8657d1b17 100644 --- a/pkg/beholder/chip_ingress_batch_emitter.go +++ b/pkg/beholder/chip_ingress_batch_emitter.go @@ -33,7 +33,7 @@ type ChipIngressBatchEmitter struct { // NewChipIngressBatchEmitter creates a batch emitter backed by the given chipingress client. // Call Start() to begin health monitoring, and Close() to stop all workers. -func NewChipIngressBatchEmitter(client chipingress.Client, lggr logger.Logger, cfg Config) (*ChipIngressBatchEmitter, error) { +func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logger.Logger) (*ChipIngressBatchEmitter, error) { if client == nil { return nil, fmt.Errorf("chip ingress client is nil") } diff --git a/pkg/beholder/chip_ingress_batch_emitter_test.go b/pkg/beholder/chip_ingress_batch_emitter_test.go index 483d55e86f..ec2ac01373 100644 --- a/pkg/beholder/chip_ingress_batch_emitter_test.go +++ b/pkg/beholder/chip_ingress_batch_emitter_test.go @@ -35,13 +35,13 @@ func newTestLogger(t *testing.T) logger.Logger { func TestNewChipIngressBatchEmitter(t *testing.T) { t.Run("happy path", func(t *testing.T) { clientMock := mocks.NewClient(t) - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), newTestConfig()) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestConfig(), newTestLogger(t)) require.NoError(t, err) assert.NotNil(t, emitter) }) t.Run("returns error when client is nil", func(t *testing.T) { - emitter, err := beholder.NewChipIngressBatchEmitter(nil, newTestLogger(t), newTestConfig()) + emitter, err := beholder.NewChipIngressBatchEmitter(nil, newTestConfig(), newTestLogger(t)) assert.Error(t, err) assert.Nil(t, emitter) }) @@ -50,7 +50,7 @@ func TestNewChipIngressBatchEmitter(t *testing.T) { func TestChipIngressBatchEmitter_Emit(t *testing.T) { t.Run("returns error when domain/entity missing", func(t *testing.T) { clientMock := mocks.NewClient(t) - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), newTestConfig()) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestConfig(), newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) defer emitter.Close() //nolint:errcheck @@ -61,7 +61,7 @@ func TestChipIngressBatchEmitter_Emit(t *testing.T) { t.Run("enqueues and does not call PublishBatch immediately", func(t *testing.T) { clientMock := mocks.NewClient(t) - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), newTestConfig()) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestConfig(), newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) defer emitter.Close() //nolint:errcheck @@ -96,7 +96,7 @@ func TestChipIngressBatchEmitter_BatchAssembly(t *testing.T) { cfg := newTestConfig() cfg.ChipIngressSendInterval = 50 * time.Millisecond - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), cfg) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) @@ -149,7 +149,7 @@ func TestChipIngressBatchEmitter_MaxBatchSize(t *testing.T) { cfg.ChipIngressMaxBatchSize = 3 cfg.ChipIngressSendInterval = 50 * time.Millisecond - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), cfg) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) @@ -204,7 +204,7 @@ func TestChipIngressBatchEmitter_PerDomainEntityIsolation(t *testing.T) { cfg := newTestConfig() cfg.ChipIngressSendInterval = 50 * time.Millisecond - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), cfg) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) @@ -254,7 +254,7 @@ func TestChipIngressBatchEmitter_BufferFull(t *testing.T) { cfg.ChipIngressBufferSize = 3 cfg.ChipIngressSendInterval = 10 * time.Second // very long interval to prevent flushing - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), cfg) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) defer emitter.Close() //nolint:errcheck @@ -285,7 +285,7 @@ func TestChipIngressBatchEmitter_Lifecycle(t *testing.T) { Return(nil, nil). Maybe() - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), newTestConfig()) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestConfig(), newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) @@ -322,7 +322,7 @@ func TestChipIngressBatchEmitter_CloudEventFormat(t *testing.T) { cfg := newTestConfig() cfg.ChipIngressSendInterval = 50 * time.Millisecond - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), cfg) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) @@ -369,7 +369,7 @@ func TestChipIngressBatchEmitter_PublishBatchError(t *testing.T) { cfg := newTestConfig() cfg.ChipIngressSendInterval = 50 * time.Millisecond - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), cfg) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) @@ -399,7 +399,7 @@ func TestChipIngressBatchEmitter_ContextCancellation(t *testing.T) { cfg.ChipIngressBufferSize = 1 cfg.ChipIngressSendInterval = 10 * time.Second - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), cfg) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) defer emitter.Close() //nolint:errcheck @@ -437,7 +437,7 @@ func TestChipIngressBatchEmitter_DefaultConfig(t *testing.T) { }). Return(nil, nil) - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestLogger(t), beholder.Config{}) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, beholder.Config{}, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) diff --git a/pkg/beholder/client.go b/pkg/beholder/client.go index e6e7c435e2..eaa268223f 100644 --- a/pkg/beholder/client.go +++ b/pkg/beholder/client.go @@ -230,7 +230,7 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro if lErr != nil { return nil, fmt.Errorf("failed to create logger for chip ingress batch emitter: %w", lErr) } - batchEmitterService, err = NewChipIngressBatchEmitter(chipIngressClient, lggr, cfg) + batchEmitterService, err = NewChipIngressBatchEmitter(chipIngressClient, cfg, lggr) if err != nil { return nil, fmt.Errorf("failed to create chip ingress batch emitter: %w", err) } From 38c7ca34dc8425b9c4cf1884b946279a8ee97519 Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Mon, 2 Mar 2026 14:06:11 +0200 Subject: [PATCH 07/28] Omit ChipIngressBatchEmitter.start --- pkg/beholder/chip_ingress_batch_emitter.go | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pkg/beholder/chip_ingress_batch_emitter.go b/pkg/beholder/chip_ingress_batch_emitter.go index b8657d1b17..5240f75099 100644 --- a/pkg/beholder/chip_ingress_batch_emitter.go +++ b/pkg/beholder/chip_ingress_batch_emitter.go @@ -65,16 +65,12 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg } e.Service, e.eng = services.Config{ - Name: "ChipIngressBatchEmitter", - Start: e.start, + Name: "ChipIngressBatchEmitter", }.NewServiceEngine(lggr) return e, nil } -func (e *ChipIngressBatchEmitter) start(_ context.Context) error { - return nil -} // Emit extracts (domain, entity) from the attributes, routes the event to the // appropriate per-(domain, entity) worker, and returns immediately. From 616ac1e4bffb0ad7a24bed9a4e600d873a92484d Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Wed, 4 Mar 2026 13:32:33 +0200 Subject: [PATCH 08/28] Add retries and drain --- pkg/beholder/chip_ingress_batch_emitter.go | 98 ++++- .../chip_ingress_batch_emitter_test.go | 404 ++++++++++++++++++ pkg/beholder/chip_ingress_emitter_worker.go | 144 ++++++- pkg/beholder/config.go | 27 +- pkg/beholder/config_test.go | 2 +- 5 files changed, 652 insertions(+), 23 deletions(-) diff --git a/pkg/beholder/chip_ingress_batch_emitter.go b/pkg/beholder/chip_ingress_batch_emitter.go index 5240f75099..de41f01b5f 100644 --- a/pkg/beholder/chip_ingress_batch_emitter.go +++ b/pkg/beholder/chip_ingress_batch_emitter.go @@ -6,10 +6,12 @@ import ( "sync" "time" + "go.opentelemetry.io/otel" + otelmetric "go.opentelemetry.io/otel/metric" + "github.com/smartcontractkit/chainlink-common/pkg/chipingress" "github.com/smartcontractkit/chainlink-common/pkg/logger" "github.com/smartcontractkit/chainlink-common/pkg/services" - "github.com/smartcontractkit/chainlink-common/pkg/timeutil" ) // ChipIngressBatchEmitter buffers events per (domain, entity) and flushes them @@ -29,6 +31,18 @@ type ChipIngressBatchEmitter struct { maxBatchSize uint sendInterval time.Duration sendTimeout time.Duration + retryCfg *RetryConfig + drainTimeout time.Duration + + metrics batchEmitterMetrics +} + +type batchEmitterMetrics struct { + eventsSent otelmetric.Int64Counter + eventsDropped otelmetric.Int64Counter + batchRetries otelmetric.Int64Counter + batchFailures otelmetric.Int64Counter + eventsDrained otelmetric.Int64Counter } // NewChipIngressBatchEmitter creates a batch emitter backed by the given chipingress client. @@ -54,6 +68,16 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg if sendTimeout == 0 { sendTimeout = 10 * time.Second } + drainTimeout := cfg.ChipIngressDrainTimeout + if drainTimeout == 0 { + drainTimeout = 5 * time.Second + } + + meter := otel.Meter("beholder/chip_ingress_batch_emitter") + metrics, err := newBatchEmitterMetrics(meter) + if err != nil { + return nil, fmt.Errorf("failed to create batch emitter metrics: %w", err) + } e := &ChipIngressBatchEmitter{ client: client, @@ -62,6 +86,9 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg maxBatchSize: maxBatchSize, sendInterval: sendInterval, sendTimeout: sendTimeout, + retryCfg: cfg.ChipIngressRetryConfig, + drainTimeout: drainTimeout, + metrics: metrics, } e.Service, e.eng = services.Config{ @@ -71,7 +98,6 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg return e, nil } - // Emit extracts (domain, entity) from the attributes, routes the event to the // appropriate per-(domain, entity) worker, and returns immediately. // If the worker's buffer is full, the event is dropped and a warning is logged. @@ -105,7 +131,7 @@ func (e *ChipIngressBatchEmitter) Emit(ctx context.Context, body []byte, attrKVs } // findOrCreateWorker returns the worker for the given (domain, entity) pair, -// creating one with a new buffered channel and GoTick flush loop if it doesn't exist. +// creating one with a new buffered channel and flush goroutine if it doesn't exist. func (e *ChipIngressBatchEmitter) findOrCreateWorker(domain, entity string) *chipIngressEmitterWorker { workerKey := fmt.Sprintf("%s_%s", domain, entity) @@ -132,13 +158,73 @@ func (e *ChipIngressBatchEmitter) findOrCreateWorker(domain, entity string) *chi entity, e.maxBatchSize, e.sendTimeout, + e.retryCfg, + e.metrics, e.eng, ) - e.eng.GoTick(timeutil.NewTicker(func() time.Duration { - return e.sendInterval - }), worker.Send) + sendInterval := e.sendInterval + drainTimeout := e.drainTimeout + e.eng.Go(func(ctx context.Context) { + ticker := time.NewTicker(sendInterval) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + worker.Send(ctx) + case <-ctx.Done(): + worker.drain(drainTimeout) + return + } + } + }) e.workers[workerKey] = worker return worker } + +func newBatchEmitterMetrics(meter otelmetric.Meter) (batchEmitterMetrics, error) { + eventsSent, err := meter.Int64Counter("chip_ingress.events_sent", + otelmetric.WithDescription("Total events successfully sent via PublishBatch"), + otelmetric.WithUnit("{event}")) + if err != nil { + return batchEmitterMetrics{}, err + } + + eventsDropped, err := meter.Int64Counter("chip_ingress.events_dropped", + otelmetric.WithDescription("Total events dropped (buffer full or retries exhausted)"), + otelmetric.WithUnit("{event}")) + if err != nil { + return batchEmitterMetrics{}, err + } + + batchRetries, err := meter.Int64Counter("chip_ingress.batch_retries", + otelmetric.WithDescription("Total PublishBatch retry attempts"), + otelmetric.WithUnit("{attempt}")) + if err != nil { + return batchEmitterMetrics{}, err + } + + batchFailures, err := meter.Int64Counter("chip_ingress.batch_failures", + otelmetric.WithDescription("Total batches that failed after all retries"), + otelmetric.WithUnit("{batch}")) + if err != nil { + return batchEmitterMetrics{}, err + } + + eventsDrained, err := meter.Int64Counter("chip_ingress.events_drained", + otelmetric.WithDescription("Total events flushed during graceful shutdown"), + otelmetric.WithUnit("{event}")) + if err != nil { + return batchEmitterMetrics{}, err + } + + return batchEmitterMetrics{ + eventsSent: eventsSent, + eventsDropped: eventsDropped, + batchRetries: batchRetries, + batchFailures: batchFailures, + eventsDrained: eventsDrained, + }, nil +} diff --git a/pkg/beholder/chip_ingress_batch_emitter_test.go b/pkg/beholder/chip_ingress_batch_emitter_test.go index ec2ac01373..98f6808752 100644 --- a/pkg/beholder/chip_ingress_batch_emitter_test.go +++ b/pkg/beholder/chip_ingress_batch_emitter_test.go @@ -61,6 +61,11 @@ func TestChipIngressBatchEmitter_Emit(t *testing.T) { t.Run("enqueues and does not call PublishBatch immediately", func(t *testing.T) { clientMock := mocks.NewClient(t) + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Return(nil, nil). + Maybe() + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestConfig(), newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) @@ -394,6 +399,10 @@ func TestChipIngressBatchEmitter_PublishBatchError(t *testing.T) { func TestChipIngressBatchEmitter_ContextCancellation(t *testing.T) { t.Run("Emit returns context error when context is cancelled", func(t *testing.T) { clientMock := mocks.NewClient(t) + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Return(nil, nil). + Maybe() cfg := newTestConfig() cfg.ChipIngressBufferSize = 1 @@ -461,3 +470,398 @@ func TestChipIngressBatchEmitter_DefaultConfig(t *testing.T) { require.Len(t, receivedBatch.Events, 1) }) } + +func newRetryTestConfig() beholder.Config { + return beholder.Config{ + ChipIngressBufferSize: 10, + ChipIngressMaxBatchSize: 5, + ChipIngressSendInterval: 50 * time.Millisecond, + ChipIngressSendTimeout: 1 * time.Second, + ChipIngressRetryConfig: &beholder.RetryConfig{ + InitialInterval: 10 * time.Millisecond, + MaxInterval: 50 * time.Millisecond, + MaxElapsedTime: 500 * time.Millisecond, + }, + ChipIngressDrainTimeout: 5 * time.Second, + } +} + +func TestChipIngressBatchEmitter_RetrySuccess(t *testing.T) { + t.Run("succeeds after transient failure", func(t *testing.T) { + clientMock := mocks.NewClient(t) + + var mu sync.Mutex + callCount := 0 + totalEventsSent := 0 + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(args mock.Arguments) { + mu.Lock() + defer mu.Unlock() + callCount++ + batch := args.Get(1).(*chipingress.CloudEventBatch) + if callCount <= 2 { + return // first 2 calls fail + } + totalEventsSent += len(batch.Events) + }). + Return(nil, assert.AnError).Times(2) + + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(args mock.Arguments) { + mu.Lock() + defer mu.Unlock() + callCount++ + batch := args.Get(1).(*chipingress.CloudEventBatch) + totalEventsSent += len(batch.Events) + }). + Return(nil, nil) + + cfg := newRetryTestConfig() + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + + for i := 0; i < 3; i++ { + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + } + + assert.Eventually(t, func() bool { + mu.Lock() + defer mu.Unlock() + return totalEventsSent >= 3 + }, 5*time.Second, 10*time.Millisecond) + + require.NoError(t, emitter.Close()) + + mu.Lock() + defer mu.Unlock() + assert.GreaterOrEqual(t, callCount, 3, "should have retried at least twice before succeeding") + assert.Equal(t, 3, totalEventsSent) + }) +} + +func TestChipIngressBatchEmitter_RetryExhaustion(t *testing.T) { + t.Run("drops events after retries exhausted", func(t *testing.T) { + clientMock := mocks.NewClient(t) + + var mu sync.Mutex + callCount := 0 + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(_ mock.Arguments) { + mu.Lock() + defer mu.Unlock() + callCount++ + }). + Return(nil, assert.AnError) + + cfg := newRetryTestConfig() + cfg.ChipIngressRetryConfig = &beholder.RetryConfig{ + InitialInterval: 10 * time.Millisecond, + MaxInterval: 20 * time.Millisecond, + MaxElapsedTime: 100 * time.Millisecond, + } + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + + for i := 0; i < 3; i++ { + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + } + + // Wait long enough for retries to exhaust + assert.Eventually(t, func() bool { + mu.Lock() + defer mu.Unlock() + return callCount >= 2 + }, 3*time.Second, 10*time.Millisecond) + + require.NoError(t, emitter.Close()) + + mu.Lock() + defer mu.Unlock() + assert.GreaterOrEqual(t, callCount, 2, "should have attempted at least 2 times before exhausting retries") + }) +} + +func TestChipIngressBatchEmitter_GracefulDrain(t *testing.T) { + t.Run("flushes buffered events on close", func(t *testing.T) { + clientMock := mocks.NewClient(t) + + var mu sync.Mutex + totalEventsSent := 0 + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(args mock.Arguments) { + mu.Lock() + defer mu.Unlock() + batch := args.Get(1).(*chipingress.CloudEventBatch) + totalEventsSent += len(batch.Events) + }). + Return(nil, nil) + + cfg := beholder.Config{ + ChipIngressBufferSize: 20, + ChipIngressMaxBatchSize: 10, + ChipIngressSendInterval: 1 * time.Hour, // very long interval — events won't flush via tick + ChipIngressSendTimeout: 5 * time.Second, + ChipIngressDrainTimeout: 5 * time.Second, + } + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + + for i := 0; i < 5; i++ { + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + } + + // Events are buffered but no tick has fired. Close should drain them. + require.NoError(t, emitter.Close()) + + mu.Lock() + defer mu.Unlock() + assert.Equal(t, 5, totalEventsSent, "all buffered events should be drained on close") + }) +} + +func TestChipIngressBatchEmitter_DrainMultipleDomains(t *testing.T) { + t.Run("drains events from all workers on close", func(t *testing.T) { + clientMock := mocks.NewClient(t) + + var mu sync.Mutex + domainEntitySent := make(map[string]int) + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(args mock.Arguments) { + mu.Lock() + defer mu.Unlock() + batch := args.Get(1).(*chipingress.CloudEventBatch) + for _, event := range batch.Events { + key := event.Source + "/" + event.Type + domainEntitySent[key] += 1 + } + }). + Return(nil, nil) + + cfg := beholder.Config{ + ChipIngressBufferSize: 20, + ChipIngressMaxBatchSize: 10, + ChipIngressSendInterval: 1 * time.Hour, + ChipIngressSendTimeout: 5 * time.Second, + ChipIngressDrainTimeout: 5 * time.Second, + } + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + + for i := 0; i < 3; i++ { + err = emitter.Emit(t.Context(), []byte("workflow"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "WorkflowEvent", + ) + require.NoError(t, err) + } + for i := 0; i < 2; i++ { + err = emitter.Emit(t.Context(), []byte("bridge"), + beholder.AttrKeyDomain, "data-feeds", + beholder.AttrKeyEntity, "BridgeStatus", + ) + require.NoError(t, err) + } + + require.NoError(t, emitter.Close()) + + mu.Lock() + defer mu.Unlock() + assert.Equal(t, 3, domainEntitySent["platform/WorkflowEvent"]) + assert.Equal(t, 2, domainEntitySent["data-feeds/BridgeStatus"]) + }) +} + +func TestChipIngressBatchEmitter_DrainPublishBatchFailure(t *testing.T) { + t.Run("drain continues attempting batches after failure", func(t *testing.T) { + clientMock := mocks.NewClient(t) + + var mu sync.Mutex + callCount := 0 + totalEventsSent := 0 + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(args mock.Arguments) { + mu.Lock() + defer mu.Unlock() + callCount++ + batch := args.Get(1).(*chipingress.CloudEventBatch) + if callCount == 1 { + return // first call fails + } + totalEventsSent += len(batch.Events) + }). + Return(nil, assert.AnError).Once() + + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(args mock.Arguments) { + mu.Lock() + defer mu.Unlock() + callCount++ + batch := args.Get(1).(*chipingress.CloudEventBatch) + totalEventsSent += len(batch.Events) + }). + Return(nil, nil) + + cfg := beholder.Config{ + ChipIngressBufferSize: 20, + ChipIngressMaxBatchSize: 3, + ChipIngressSendInterval: 1 * time.Hour, + ChipIngressSendTimeout: 5 * time.Second, + ChipIngressDrainTimeout: 5 * time.Second, + } + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + + // Emit 6 events with maxBatchSize=3 => 2 batches during drain + for i := 0; i < 6; i++ { + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + } + + require.NoError(t, emitter.Close()) + + mu.Lock() + defer mu.Unlock() + assert.GreaterOrEqual(t, callCount, 2, "drain should have attempted at least 2 batches") + assert.Equal(t, 3, totalEventsSent, "second batch should have succeeded despite first batch failure") + }) +} + +func TestChipIngressBatchEmitter_DrainTimeout(t *testing.T) { + t.Run("close returns promptly when drain timeout expires", func(t *testing.T) { + clientMock := mocks.NewClient(t) + + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(args mock.Arguments) { + ctx := args.Get(0).(context.Context) + <-ctx.Done() // simulate a slow server that only returns on context cancellation + }). + Return(nil, context.DeadlineExceeded). + Maybe() + + cfg := beholder.Config{ + ChipIngressBufferSize: 20, + ChipIngressMaxBatchSize: 10, + ChipIngressSendInterval: 1 * time.Hour, + ChipIngressSendTimeout: 10 * time.Second, + ChipIngressDrainTimeout: 200 * time.Millisecond, + } + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + + for i := 0; i < 5; i++ { + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + } + + closeDone := make(chan error, 1) + go func() { + closeDone <- emitter.Close() + }() + + select { + case err := <-closeDone: + assert.NoError(t, err, "close should not error") + case <-time.After(5 * time.Second): + t.Fatal("Close() did not return within 5s; drain timeout is not working") + } + }) +} + +func TestChipIngressBatchEmitter_RetryShutdownDuringBackoff(t *testing.T) { + t.Run("shutdown during retry backoff completes promptly", func(t *testing.T) { + clientMock := mocks.NewClient(t) + + var mu sync.Mutex + callCount := 0 + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(_ mock.Arguments) { + mu.Lock() + defer mu.Unlock() + callCount++ + }). + Return(nil, assert.AnError) + + cfg := beholder.Config{ + ChipIngressBufferSize: 10, + ChipIngressMaxBatchSize: 5, + ChipIngressSendInterval: 50 * time.Millisecond, + ChipIngressSendTimeout: 1 * time.Second, + ChipIngressRetryConfig: &beholder.RetryConfig{ + InitialInterval: 10 * time.Second, // very long backoff + MaxInterval: 30 * time.Second, + MaxElapsedTime: 1 * time.Minute, + }, + ChipIngressDrainTimeout: 5 * time.Second, + } + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + + for i := 0; i < 3; i++ { + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + } + + // Wait for at least one PublishBatch attempt (which will fail and enter 10s backoff) + assert.Eventually(t, func() bool { + mu.Lock() + defer mu.Unlock() + return callCount >= 1 + }, 2*time.Second, 10*time.Millisecond) + + // Close while the worker is sleeping in the 10s retry backoff + closeDone := make(chan error, 1) + go func() { + closeDone <- emitter.Close() + }() + + select { + case err := <-closeDone: + assert.NoError(t, err, "close should not error") + case <-time.After(3 * time.Second): + t.Fatal("Close() did not return within 3s; timer.Stop() during retry backoff is not working") + } + }) +} diff --git a/pkg/beholder/chip_ingress_emitter_worker.go b/pkg/beholder/chip_ingress_emitter_worker.go index 6b9c61759e..2df4ab130b 100644 --- a/pkg/beholder/chip_ingress_emitter_worker.go +++ b/pkg/beholder/chip_ingress_emitter_worker.go @@ -3,9 +3,13 @@ package beholder import ( "context" "fmt" + "math/rand/v2" "sync/atomic" "time" + "go.opentelemetry.io/otel/attribute" + otelmetric "go.opentelemetry.io/otel/metric" + "github.com/smartcontractkit/chainlink-common/pkg/chipingress" "github.com/smartcontractkit/chainlink-common/pkg/logger" ) @@ -29,6 +33,12 @@ type chipIngressEmitterWorker struct { sendTimeout time.Duration lggr logger.Logger dropCount atomic.Uint32 + + retryInitialInterval time.Duration + retryMaxInterval time.Duration + retryMaxElapsed time.Duration + + metrics batchEmitterMetrics } func newChipIngressEmitterWorker( @@ -38,9 +48,11 @@ func newChipIngressEmitterWorker( entity string, maxBatchSize uint, sendTimeout time.Duration, + retryCfg *RetryConfig, + metrics batchEmitterMetrics, lggr logger.Logger, ) *chipIngressEmitterWorker { - return &chipIngressEmitterWorker{ + w := &chipIngressEmitterWorker{ client: client, ch: ch, domain: domain, @@ -48,10 +60,18 @@ func newChipIngressEmitterWorker( maxBatchSize: maxBatchSize, sendTimeout: sendTimeout, lggr: logger.Named(lggr, "ChipIngressEmitterWorker"), + metrics: metrics, } + if retryCfg != nil && retryCfg.Enabled() { + w.retryInitialInterval = retryCfg.InitialInterval + w.retryMaxInterval = retryCfg.MaxInterval + w.retryMaxElapsed = retryCfg.MaxElapsedTime + } + return w } -// Send drains the channel and sends a batch. Called periodically by GoTick. +// Send drains the channel and sends a batch with retry on failure. +// Called periodically by the tick loop. func (w *chipIngressEmitterWorker) Send(ctx context.Context) { if len(w.ch) == 0 { return @@ -62,25 +82,84 @@ func (w *chipIngressEmitterWorker) Send(ctx context.Context) { return } - ctx, cancel := context.WithTimeout(ctx, w.sendTimeout) + if w.retryMaxElapsed == 0 { + w.publishOnce(ctx, batch) + return + } + + backoff := w.retryInitialInterval + deadline := time.Now().Add(w.retryMaxElapsed) + + batchSize := int64(len(batch.Events)) + metricAttrs := otelmetric.WithAttributeSet(w.metricAttributes()) + + for attempt := 0; ; attempt++ { + sendCtx, cancel := context.WithTimeout(ctx, w.sendTimeout) + _, err := w.client.PublishBatch(sendCtx, batch) + cancel() + + if err == nil { + w.metrics.eventsSent.Add(context.Background(), batchSize, metricAttrs) + return + } + + w.lggr.Warnf("PublishBatch failed (attempt %d, domain=%s, entity=%s): %v", + attempt+1, w.domain, w.entity, err) + w.metrics.batchRetries.Add(context.Background(), 1, metricAttrs) + + if time.Now().Add(backoff).After(deadline) { + w.lggr.Warnf("PublishBatch retries exhausted, dropping %d events (domain=%s, entity=%s)", + len(batch.Events), w.domain, w.entity) + w.metrics.batchFailures.Add(context.Background(), 1, metricAttrs) + w.metrics.eventsDropped.Add(context.Background(), batchSize, metricAttrs) + return + } + + timer := time.NewTimer(backoff) + select { + case <-ctx.Done(): + timer.Stop() + w.lggr.Warnf("context cancelled during retry, dropping %d events (domain=%s, entity=%s)", + len(batch.Events), w.domain, w.entity) + w.metrics.eventsDropped.Add(context.Background(), batchSize, metricAttrs) + return + case <-timer.C: + } + + backoff = min(backoff*2, w.retryMaxInterval) + jitter := time.Duration(rand.Int64N(int64(backoff) / 5)) //nolint:gosec + backoff += jitter + } +} + +func (w *chipIngressEmitterWorker) publishOnce(ctx context.Context, batch *chipingress.CloudEventBatch) { + sendCtx, cancel := context.WithTimeout(ctx, w.sendTimeout) defer cancel() - _, err := w.client.PublishBatch(ctx, batch) + batchSize := int64(len(batch.Events)) + metricAttrs := otelmetric.WithAttributeSet(w.metricAttributes()) + _, err := w.client.PublishBatch(sendCtx, batch) if err != nil { - w.lggr.Warnf("could not send batch via chip ingress: %v", err) + w.lggr.Warnf("could not send batch via chip ingress (domain=%s, entity=%s): %v", + w.domain, w.entity, err) + w.metrics.batchFailures.Add(context.Background(), 1, metricAttrs) + w.metrics.eventsDropped.Add(context.Background(), batchSize, metricAttrs) return } + w.metrics.eventsSent.Add(context.Background(), batchSize, metricAttrs) } // buildBatch drains the channel up to maxBatchSize and converts payloads to a CloudEventBatch. func (w *chipIngressEmitterWorker) buildBatch() *chipingress.CloudEventBatch { var events []chipingress.CloudEvent + metricAttrs := otelmetric.WithAttributeSet(w.metricAttributes()) for len(w.ch) > 0 && len(events) < int(w.maxBatchSize) { // #nosec G115 payload := <-w.ch event, err := w.payloadToEvent(payload) if err != nil { w.lggr.Warnf("failed to build CloudEvent, dropping: %v", err) + w.metrics.eventsDropped.Add(context.Background(), 1, metricAttrs) continue } events = append(events, event) @@ -93,6 +172,7 @@ func (w *chipIngressEmitterWorker) buildBatch() *chipingress.CloudEventBatch { batch, err := chipingress.EventsToBatch(events) if err != nil { w.lggr.Warnf("failed to convert events to batch: %v", err) + w.metrics.eventsDropped.Add(context.Background(), int64(len(events)), metricAttrs) return nil } @@ -107,12 +187,66 @@ func (w *chipIngressEmitterWorker) payloadToEvent(payload emitterPayload) (chipi return event, nil } +// drain flushes all remaining buffered events before shutdown. +// Uses a fresh context with the given timeout (independent of the cancelled parent). +// Continues attempting subsequent batches even if one fails. +func (w *chipIngressEmitterWorker) drain(timeout time.Duration) { + if len(w.ch) == 0 { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + metricAttrs := otelmetric.WithAttributeSet(w.metricAttributes()) + w.lggr.Infof("draining %d buffered events (domain=%s, entity=%s)", len(w.ch), w.domain, w.entity) + + for len(w.ch) > 0 { + if ctx.Err() != nil { + remaining := len(w.ch) + if remaining > 0 { + w.lggr.Warnf("drain timeout exceeded, dropping %d remaining events (domain=%s, entity=%s)", + remaining, w.domain, w.entity) + w.metrics.eventsDropped.Add(context.Background(), int64(remaining), metricAttrs) + } + return + } + + batch := w.buildBatch() + if batch == nil || len(batch.Events) == 0 { + break + } + + batchSize := int64(len(batch.Events)) + sendCtx, sendCancel := context.WithTimeout(ctx, w.sendTimeout) + _, err := w.client.PublishBatch(sendCtx, batch) + sendCancel() + + if err != nil { + w.lggr.Warnf("drain PublishBatch failed, dropping %d events (domain=%s, entity=%s): %v", + len(batch.Events), w.domain, w.entity, err) + w.metrics.eventsDropped.Add(context.Background(), batchSize, metricAttrs) + continue + } + + w.metrics.eventsDrained.Add(context.Background(), batchSize, metricAttrs) + } +} + // logBufferFullWithExpBackoff logs at 1, 2, 4, 8, 16, 32, 64, 100, 200, 300, ... // to avoid flooding logs when the buffer is persistently full. func (w *chipIngressEmitterWorker) logBufferFullWithExpBackoff(payload emitterPayload) { + w.metrics.eventsDropped.Add(context.Background(), 1, otelmetric.WithAttributeSet(w.metricAttributes())) count := w.dropCount.Add(1) if count > 0 && (count%100 == 0 || count&(count-1) == 0) { w.lggr.Warnf("chip ingress emitter buffer full, dropping event (domain=%s, entity=%s, droppedCount=%d)", payload.domain, payload.entity, count) } } + +func (w *chipIngressEmitterWorker) metricAttributes() attribute.Set { + return attribute.NewSet( + attribute.String("domain", w.domain), + attribute.String("entity", w.entity), + ) +} diff --git a/pkg/beholder/config.go b/pkg/beholder/config.go index 06b877279e..27dd6e8f0c 100644 --- a/pkg/beholder/config.go +++ b/pkg/beholder/config.go @@ -45,11 +45,13 @@ type Config struct { ChipIngressInsecureConnection bool // Disables TLS for Chip Ingress Emitter // Chip Ingress Batch Emitter - ChipIngressBatchEmitterEnabled bool // When true (default), use batch emitter; when false, use legacy per-event emitter - ChipIngressBufferSize uint // Per-worker channel buffer size (default 100) - ChipIngressMaxBatchSize uint // Max events per PublishBatch call (default 50) - ChipIngressSendInterval time.Duration // Flush interval per worker (default 500ms when zero or unset) - ChipIngressSendTimeout time.Duration // Timeout per PublishBatch call (default 10s) + ChipIngressBatchEmitterEnabled bool // When true (default), use batch emitter; when false, use legacy per-event emitter + ChipIngressBufferSize uint // Per-worker channel buffer size (default 100) + ChipIngressMaxBatchSize uint // Max events per PublishBatch call (default 50) + ChipIngressSendInterval time.Duration // Flush interval per worker (default 500ms when zero or unset) + ChipIngressSendTimeout time.Duration // Timeout per PublishBatch call (default 10s) + ChipIngressRetryConfig *RetryConfig // Retry config for failed PublishBatch calls (defaults match OTel SDK: 5s/30s/1m) + ChipIngressDrainTimeout time.Duration // Max time to flush remaining events on shutdown (default 5s) // OTel Log LogExportTimeout time.Duration @@ -143,9 +145,11 @@ func DefaultConfig() Config { // Chip Ingress Batch Emitter ChipIngressBatchEmitterEnabled: false, ChipIngressBufferSize: 100, - ChipIngressMaxBatchSize: 50, - ChipIngressSendInterval: 500 * time.Millisecond, - ChipIngressSendTimeout: 10 * time.Second, + ChipIngressMaxBatchSize: 50, + ChipIngressSendInterval: 500 * time.Millisecond, + ChipIngressSendTimeout: 10 * time.Second, + ChipIngressRetryConfig: defaultRetryConfig.Copy(), + ChipIngressDrainTimeout: 5 * time.Second, // Auth (defaults to static auth mode with TTL=0) AuthHeadersTTL: 0, } @@ -157,9 +161,10 @@ func TestDefaultConfig() Config { config.EmitterBatchProcessor = false config.LogBatchProcessor = false // Retries are disabled for testing - config.LogRetryConfig.MaxElapsedTime = 0 // Retry is disabled - config.TraceRetryConfig.MaxElapsedTime = 0 // Retry is disabled - config.MetricRetryConfig.MaxElapsedTime = 0 // Retry is disabled + config.LogRetryConfig.MaxElapsedTime = 0 // Retry is disabled + config.TraceRetryConfig.MaxElapsedTime = 0 // Retry is disabled + config.MetricRetryConfig.MaxElapsedTime = 0 // Retry is disabled + config.ChipIngressRetryConfig.MaxElapsedTime = 0 // Retry is disabled // Auth disabled for testing (TTL=0 means static auth mode) config.AuthHeadersTTL = 0 return config diff --git a/pkg/beholder/config_test.go b/pkg/beholder/config_test.go index 024ba805d2..f98f77bc60 100644 --- a/pkg/beholder/config_test.go +++ b/pkg/beholder/config_test.go @@ -67,6 +67,6 @@ func ExampleConfig() { } fmt.Printf("%+v\n", *config.LogRetryConfig) // Output: - // {InsecureConnection:true CACertFile: OtelExporterGRPCEndpoint:localhost:4317 OtelExporterHTTPEndpoint:localhost:4318 ResourceAttributes:[{Key:package_name Value:{vtype:4 numeric:0 stringly:beholder slice:}} {Key:sender Value:{vtype:4 numeric:0 stringly:beholderclient slice:}}] EmitterExportTimeout:1s EmitterExportInterval:1s EmitterExportMaxBatchSize:512 EmitterMaxQueueSize:2048 EmitterBatchProcessor:true TraceSampleRatio:1 TraceBatchTimeout:1s TraceSpanExporter: TraceRetryConfig: TraceCompressor:gzip MetricReaderInterval:1s MetricRetryConfig: MetricViews:[] MetricCompressor:gzip ChipIngressEmitterEnabled:false ChipIngressEmitterGRPCEndpoint: ChipIngressInsecureConnection:false ChipIngressBatchEmitterEnabled:false ChipIngressBufferSize:0 ChipIngressMaxBatchSize:0 ChipIngressSendInterval:0s ChipIngressSendTimeout:0s LogExportTimeout:1s LogExportInterval:1s LogExportMaxBatchSize:512 LogMaxQueueSize:2048 LogBatchProcessor:true LogRetryConfig: LogStreamingEnabled:false LogLevel:info LogCompressor:gzip AuthHeaders:map[] AuthHeadersTTL:0s AuthKeySigner: AuthPublicKeyHex:} + // {InsecureConnection:true CACertFile: OtelExporterGRPCEndpoint:localhost:4317 OtelExporterHTTPEndpoint:localhost:4318 ResourceAttributes:[{Key:package_name Value:{vtype:4 numeric:0 stringly:beholder slice:}} {Key:sender Value:{vtype:4 numeric:0 stringly:beholderclient slice:}}] EmitterExportTimeout:1s EmitterExportInterval:1s EmitterExportMaxBatchSize:512 EmitterMaxQueueSize:2048 EmitterBatchProcessor:true TraceSampleRatio:1 TraceBatchTimeout:1s TraceSpanExporter: TraceRetryConfig: TraceCompressor:gzip MetricReaderInterval:1s MetricRetryConfig: MetricViews:[] MetricCompressor:gzip ChipIngressEmitterEnabled:false ChipIngressEmitterGRPCEndpoint: ChipIngressInsecureConnection:false ChipIngressBatchEmitterEnabled:false ChipIngressBufferSize:0 ChipIngressMaxBatchSize:0 ChipIngressSendInterval:0s ChipIngressSendTimeout:0s ChipIngressRetryConfig: ChipIngressDrainTimeout:0s LogExportTimeout:1s LogExportInterval:1s LogExportMaxBatchSize:512 LogMaxQueueSize:2048 LogBatchProcessor:true LogRetryConfig: LogStreamingEnabled:false LogLevel:info LogCompressor:gzip AuthHeaders:map[] AuthHeadersTTL:0s AuthKeySigner: AuthPublicKeyHex:} // {InitialInterval:5s MaxInterval:30s MaxElapsedTime:1m0s} } From c57a9b048ddb5e0dcb3af1c71091bf657c1241ec Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Wed, 4 Mar 2026 13:53:09 +0200 Subject: [PATCH 09/28] Use idiomatic select --- pkg/beholder/chip_ingress_emitter_worker.go | 22 +++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/pkg/beholder/chip_ingress_emitter_worker.go b/pkg/beholder/chip_ingress_emitter_worker.go index 2df4ab130b..f706de6fc7 100644 --- a/pkg/beholder/chip_ingress_emitter_worker.go +++ b/pkg/beholder/chip_ingress_emitter_worker.go @@ -154,15 +154,21 @@ func (w *chipIngressEmitterWorker) buildBatch() *chipingress.CloudEventBatch { var events []chipingress.CloudEvent metricAttrs := otelmetric.WithAttributeSet(w.metricAttributes()) - for len(w.ch) > 0 && len(events) < int(w.maxBatchSize) { // #nosec G115 - payload := <-w.ch - event, err := w.payloadToEvent(payload) - if err != nil { - w.lggr.Warnf("failed to build CloudEvent, dropping: %v", err) - w.metrics.eventsDropped.Add(context.Background(), 1, metricAttrs) - continue + max := int(w.maxBatchSize) // #nosec G115 +drain: + for len(events) < max { + select { + case payload := <-w.ch: + event, err := w.payloadToEvent(payload) + if err != nil { + w.lggr.Warnf("failed to build CloudEvent, dropping: %v", err) + w.metrics.eventsDropped.Add(context.Background(), 1, metricAttrs) + continue + } + events = append(events, event) + default: + break drain } - events = append(events, event) } if len(events) == 0 { From b55aafd2e5a2f3c3f0418573786d665424eb842b Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Wed, 4 Mar 2026 16:10:39 +0200 Subject: [PATCH 10/28] Various fixes --- pkg/beholder/chip_ingress_batch_emitter.go | 46 ++++++++-------- .../chip_ingress_batch_emitter_test.go | 22 ++++++++ pkg/beholder/chip_ingress_emitter_worker.go | 54 +++++++++---------- pkg/beholder/client.go | 10 +++- pkg/beholder/config.go | 2 +- 5 files changed, 80 insertions(+), 54 deletions(-) diff --git a/pkg/beholder/chip_ingress_batch_emitter.go b/pkg/beholder/chip_ingress_batch_emitter.go index de41f01b5f..f2e92c1ff8 100644 --- a/pkg/beholder/chip_ingress_batch_emitter.go +++ b/pkg/beholder/chip_ingress_batch_emitter.go @@ -101,33 +101,37 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg // Emit extracts (domain, entity) from the attributes, routes the event to the // appropriate per-(domain, entity) worker, and returns immediately. // If the worker's buffer is full, the event is dropped and a warning is logged. +// Returns an error if the emitter has been closed. func (e *ChipIngressBatchEmitter) Emit(ctx context.Context, body []byte, attrKVs ...any) error { - domain, entity, err := ExtractSourceAndType(attrKVs...) - if err != nil { - return err - } + return e.eng.IfNotStopped(func() error { + domain, entity, err := ExtractSourceAndType(attrKVs...) + if err != nil { + return err + } - attributes := newAttributes(attrKVs...) + attributes := newAttributes(attrKVs...) - worker := e.findOrCreateWorker(domain, entity) + worker := e.findOrCreateWorker(domain, entity) - payload := emitterPayload{ - body: body, - attributes: attributes, - domain: domain, - entity: entity, - } + payload := emitterPayload{ + body: body, + attributes: attributes, + domain: domain, + entity: entity, + } - select { - case worker.ch <- payload: - worker.dropCount.Store(0) - case <-ctx.Done(): - return ctx.Err() - default: - worker.logBufferFullWithExpBackoff(payload) - } + select { + case worker.ch <- payload: + // Intentionally racy with logBufferFullWithExpBackoff — only affects log frequency, not correctness. + worker.dropCount.Store(0) + case <-ctx.Done(): + return ctx.Err() + default: + worker.logBufferFullWithExpBackoff(payload) + } - return nil + return nil + }) } // findOrCreateWorker returns the worker for the given (domain, entity) pair, diff --git a/pkg/beholder/chip_ingress_batch_emitter_test.go b/pkg/beholder/chip_ingress_batch_emitter_test.go index 98f6808752..f827a04398 100644 --- a/pkg/beholder/chip_ingress_batch_emitter_test.go +++ b/pkg/beholder/chip_ingress_batch_emitter_test.go @@ -29,6 +29,7 @@ func newTestLogger(t *testing.T) logger.Logger { t.Helper() lggr, err := logger.New() require.NoError(t, err) + t.Cleanup(func() { _ = lggr.Sync() }) return lggr } @@ -865,3 +866,24 @@ func TestChipIngressBatchEmitter_RetryShutdownDuringBackoff(t *testing.T) { } }) } + +func TestChipIngressBatchEmitter_EmitAfterClose(t *testing.T) { + t.Run("Emit after Close returns error", func(t *testing.T) { + clientMock := mocks.NewClient(t) + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Return(nil, nil). + Maybe() + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestConfig(), newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + require.NoError(t, emitter.Close()) + + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + assert.Error(t, err, "Emit after Close should return an error") + }) +} diff --git a/pkg/beholder/chip_ingress_emitter_worker.go b/pkg/beholder/chip_ingress_emitter_worker.go index f706de6fc7..87eab18117 100644 --- a/pkg/beholder/chip_ingress_emitter_worker.go +++ b/pkg/beholder/chip_ingress_emitter_worker.go @@ -38,7 +38,8 @@ type chipIngressEmitterWorker struct { retryMaxInterval time.Duration retryMaxElapsed time.Duration - metrics batchEmitterMetrics + metrics batchEmitterMetrics + metricAttrs otelmetric.MeasurementOption } func newChipIngressEmitterWorker( @@ -61,6 +62,10 @@ func newChipIngressEmitterWorker( sendTimeout: sendTimeout, lggr: logger.Named(lggr, "ChipIngressEmitterWorker"), metrics: metrics, + metricAttrs: otelmetric.WithAttributeSet(attribute.NewSet( + attribute.String("domain", domain), + attribute.String("entity", entity), + )), } if retryCfg != nil && retryCfg.Enabled() { w.retryInitialInterval = retryCfg.InitialInterval @@ -91,27 +96,25 @@ func (w *chipIngressEmitterWorker) Send(ctx context.Context) { deadline := time.Now().Add(w.retryMaxElapsed) batchSize := int64(len(batch.Events)) - metricAttrs := otelmetric.WithAttributeSet(w.metricAttributes()) - for attempt := 0; ; attempt++ { sendCtx, cancel := context.WithTimeout(ctx, w.sendTimeout) _, err := w.client.PublishBatch(sendCtx, batch) cancel() if err == nil { - w.metrics.eventsSent.Add(context.Background(), batchSize, metricAttrs) + w.metrics.eventsSent.Add(context.Background(), batchSize, w.metricAttrs) return } w.lggr.Warnf("PublishBatch failed (attempt %d, domain=%s, entity=%s): %v", attempt+1, w.domain, w.entity, err) - w.metrics.batchRetries.Add(context.Background(), 1, metricAttrs) + w.metrics.batchRetries.Add(context.Background(), 1, w.metricAttrs) if time.Now().Add(backoff).After(deadline) { w.lggr.Warnf("PublishBatch retries exhausted, dropping %d events (domain=%s, entity=%s)", len(batch.Events), w.domain, w.entity) - w.metrics.batchFailures.Add(context.Background(), 1, metricAttrs) - w.metrics.eventsDropped.Add(context.Background(), batchSize, metricAttrs) + w.metrics.batchFailures.Add(context.Background(), 1, w.metricAttrs) + w.metrics.eventsDropped.Add(context.Background(), batchSize, w.metricAttrs) return } @@ -121,14 +124,14 @@ func (w *chipIngressEmitterWorker) Send(ctx context.Context) { timer.Stop() w.lggr.Warnf("context cancelled during retry, dropping %d events (domain=%s, entity=%s)", len(batch.Events), w.domain, w.entity) - w.metrics.eventsDropped.Add(context.Background(), batchSize, metricAttrs) + w.metrics.eventsDropped.Add(context.Background(), batchSize, w.metricAttrs) return case <-timer.C: } - backoff = min(backoff*2, w.retryMaxInterval) - jitter := time.Duration(rand.Int64N(int64(backoff) / 5)) //nolint:gosec - backoff += jitter + next := backoff * 2 + jitter := time.Duration(rand.Int64N(int64(next)/5 + 1)) //nolint:gosec + backoff = min(next+jitter, w.retryMaxInterval) } } @@ -137,22 +140,20 @@ func (w *chipIngressEmitterWorker) publishOnce(ctx context.Context, batch *chipi defer cancel() batchSize := int64(len(batch.Events)) - metricAttrs := otelmetric.WithAttributeSet(w.metricAttributes()) _, err := w.client.PublishBatch(sendCtx, batch) if err != nil { w.lggr.Warnf("could not send batch via chip ingress (domain=%s, entity=%s): %v", w.domain, w.entity, err) - w.metrics.batchFailures.Add(context.Background(), 1, metricAttrs) - w.metrics.eventsDropped.Add(context.Background(), batchSize, metricAttrs) + w.metrics.batchFailures.Add(context.Background(), 1, w.metricAttrs) + w.metrics.eventsDropped.Add(context.Background(), batchSize, w.metricAttrs) return } - w.metrics.eventsSent.Add(context.Background(), batchSize, metricAttrs) + w.metrics.eventsSent.Add(context.Background(), batchSize, w.metricAttrs) } // buildBatch drains the channel up to maxBatchSize and converts payloads to a CloudEventBatch. func (w *chipIngressEmitterWorker) buildBatch() *chipingress.CloudEventBatch { var events []chipingress.CloudEvent - metricAttrs := otelmetric.WithAttributeSet(w.metricAttributes()) max := int(w.maxBatchSize) // #nosec G115 drain: @@ -162,7 +163,7 @@ drain: event, err := w.payloadToEvent(payload) if err != nil { w.lggr.Warnf("failed to build CloudEvent, dropping: %v", err) - w.metrics.eventsDropped.Add(context.Background(), 1, metricAttrs) + w.metrics.eventsDropped.Add(context.Background(), 1, w.metricAttrs) continue } events = append(events, event) @@ -178,7 +179,7 @@ drain: batch, err := chipingress.EventsToBatch(events) if err != nil { w.lggr.Warnf("failed to convert events to batch: %v", err) - w.metrics.eventsDropped.Add(context.Background(), int64(len(events)), metricAttrs) + w.metrics.eventsDropped.Add(context.Background(), int64(len(events)), w.metricAttrs) return nil } @@ -204,7 +205,6 @@ func (w *chipIngressEmitterWorker) drain(timeout time.Duration) { ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() - metricAttrs := otelmetric.WithAttributeSet(w.metricAttributes()) w.lggr.Infof("draining %d buffered events (domain=%s, entity=%s)", len(w.ch), w.domain, w.entity) for len(w.ch) > 0 { @@ -213,7 +213,7 @@ func (w *chipIngressEmitterWorker) drain(timeout time.Duration) { if remaining > 0 { w.lggr.Warnf("drain timeout exceeded, dropping %d remaining events (domain=%s, entity=%s)", remaining, w.domain, w.entity) - w.metrics.eventsDropped.Add(context.Background(), int64(remaining), metricAttrs) + w.metrics.eventsDropped.Add(context.Background(), int64(remaining), w.metricAttrs) } return } @@ -231,28 +231,22 @@ func (w *chipIngressEmitterWorker) drain(timeout time.Duration) { if err != nil { w.lggr.Warnf("drain PublishBatch failed, dropping %d events (domain=%s, entity=%s): %v", len(batch.Events), w.domain, w.entity, err) - w.metrics.eventsDropped.Add(context.Background(), batchSize, metricAttrs) + w.metrics.eventsDropped.Add(context.Background(), batchSize, w.metricAttrs) continue } - w.metrics.eventsDrained.Add(context.Background(), batchSize, metricAttrs) + w.metrics.eventsDrained.Add(context.Background(), batchSize, w.metricAttrs) } } // logBufferFullWithExpBackoff logs at 1, 2, 4, 8, 16, 32, 64, 100, 200, 300, ... // to avoid flooding logs when the buffer is persistently full. +// dropCount is intentionally racy with Emit's Store(0) — this only affects log frequency, not correctness. func (w *chipIngressEmitterWorker) logBufferFullWithExpBackoff(payload emitterPayload) { - w.metrics.eventsDropped.Add(context.Background(), 1, otelmetric.WithAttributeSet(w.metricAttributes())) + w.metrics.eventsDropped.Add(context.Background(), 1, w.metricAttrs) count := w.dropCount.Add(1) if count > 0 && (count%100 == 0 || count&(count-1) == 0) { w.lggr.Warnf("chip ingress emitter buffer full, dropping event (domain=%s, entity=%s, droppedCount=%d)", payload.domain, payload.entity, count) } } - -func (w *chipIngressEmitterWorker) metricAttributes() attribute.Set { - return attribute.NewSet( - attribute.String("domain", w.domain), - attribute.String("entity", w.entity), - ) -} diff --git a/pkg/beholder/client.go b/pkg/beholder/client.go index eaa268223f..e042368f20 100644 --- a/pkg/beholder/client.go +++ b/pkg/beholder/client.go @@ -226,6 +226,8 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro var chipIngressEmitter Emitter if cfg.ChipIngressBatchEmitterEnabled { + // TODO: accept a logger from the caller instead of creating a new root logger, + // so batch emitter logs respect the node's logging configuration. lggr, lErr := ccllogger.New() if lErr != nil { return nil, fmt.Errorf("failed to create logger for chip ingress batch emitter: %w", lErr) @@ -247,13 +249,17 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro emitter, err = NewDualSourceEmitter(chipIngressEmitter, emitter) if err != nil { + if batchEmitterService != nil { + _ = batchEmitterService.Close() + } return nil, fmt.Errorf("failed to create dual source emitter: %w", err) } } onClose := func() (err error) { - // batchEmitterService is closed via DualSourceEmitter.Close() -> chipIngressEmitter.Close(), - // which is called by Client.Close() -> c.Emitter.Close() before OnClose runs. + if batchEmitterService != nil { + err = errors.Join(err, batchEmitterService.Close()) + } for _, provider := range []shutdowner{messageLoggerProvider, loggerProvider, tracerProvider, meterProvider, messageLoggerProvider} { err = errors.Join(err, provider.Shutdown(context.Background())) } diff --git a/pkg/beholder/config.go b/pkg/beholder/config.go index 27dd6e8f0c..6f9af30d91 100644 --- a/pkg/beholder/config.go +++ b/pkg/beholder/config.go @@ -45,7 +45,7 @@ type Config struct { ChipIngressInsecureConnection bool // Disables TLS for Chip Ingress Emitter // Chip Ingress Batch Emitter - ChipIngressBatchEmitterEnabled bool // When true (default), use batch emitter; when false, use legacy per-event emitter + ChipIngressBatchEmitterEnabled bool // When true, use batch emitter; when false (default), use legacy per-event emitter ChipIngressBufferSize uint // Per-worker channel buffer size (default 100) ChipIngressMaxBatchSize uint // Max events per PublishBatch call (default 50) ChipIngressSendInterval time.Duration // Flush interval per worker (default 500ms when zero or unset) From 58a8dd93d12bd8a93e6a701f624b2d6a3d39793b Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Wed, 4 Mar 2026 20:54:58 +0200 Subject: [PATCH 11/28] Update defaults --- pkg/beholder/config.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pkg/beholder/config.go b/pkg/beholder/config.go index 6f9af30d91..3d8758b677 100644 --- a/pkg/beholder/config.go +++ b/pkg/beholder/config.go @@ -50,7 +50,7 @@ type Config struct { ChipIngressMaxBatchSize uint // Max events per PublishBatch call (default 50) ChipIngressSendInterval time.Duration // Flush interval per worker (default 500ms when zero or unset) ChipIngressSendTimeout time.Duration // Timeout per PublishBatch call (default 10s) - ChipIngressRetryConfig *RetryConfig // Retry config for failed PublishBatch calls (defaults match OTel SDK: 5s/30s/1m) + ChipIngressRetryConfig *RetryConfig // Retry config for failed PublishBatch calls (defaults: 500ms/5s/30s) ChipIngressDrainTimeout time.Duration // Max time to flush remaining events on shutdown (default 5s) // OTel Log @@ -99,6 +99,14 @@ var defaultRetryConfig = RetryConfig{ MaxElapsedTime: 1 * time.Minute, // Retry is enabled } +// Retry defaults for the chip ingress batch emitter: faster backoff than OTel +// to avoid buffer pressure on the small per-worker channel. +var defaultChipIngressRetryConfig = RetryConfig{ + InitialInterval: 500 * time.Millisecond, + MaxInterval: 5 * time.Second, + MaxElapsedTime: 5 * time.Second, +} + const ( defaultPackageName = "beholder" ) @@ -148,7 +156,7 @@ func DefaultConfig() Config { ChipIngressMaxBatchSize: 50, ChipIngressSendInterval: 500 * time.Millisecond, ChipIngressSendTimeout: 10 * time.Second, - ChipIngressRetryConfig: defaultRetryConfig.Copy(), + ChipIngressRetryConfig: defaultChipIngressRetryConfig.Copy(), ChipIngressDrainTimeout: 5 * time.Second, // Auth (defaults to static auth mode with TTL=0) AuthHeadersTTL: 0, From 7871a2c844aba78269380f7dd8bdd5f2b3aac80c Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Wed, 4 Mar 2026 21:45:46 +0200 Subject: [PATCH 12/28] Change default timeout to 5s --- pkg/beholder/chip_ingress_batch_emitter.go | 2 +- pkg/beholder/config.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/beholder/chip_ingress_batch_emitter.go b/pkg/beholder/chip_ingress_batch_emitter.go index f2e92c1ff8..27b8d727e6 100644 --- a/pkg/beholder/chip_ingress_batch_emitter.go +++ b/pkg/beholder/chip_ingress_batch_emitter.go @@ -66,7 +66,7 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg } sendTimeout := cfg.ChipIngressSendTimeout if sendTimeout == 0 { - sendTimeout = 10 * time.Second + sendTimeout = 5 * time.Second } drainTimeout := cfg.ChipIngressDrainTimeout if drainTimeout == 0 { diff --git a/pkg/beholder/config.go b/pkg/beholder/config.go index 3d8758b677..0b8b6129a5 100644 --- a/pkg/beholder/config.go +++ b/pkg/beholder/config.go @@ -49,7 +49,7 @@ type Config struct { ChipIngressBufferSize uint // Per-worker channel buffer size (default 100) ChipIngressMaxBatchSize uint // Max events per PublishBatch call (default 50) ChipIngressSendInterval time.Duration // Flush interval per worker (default 500ms when zero or unset) - ChipIngressSendTimeout time.Duration // Timeout per PublishBatch call (default 10s) + ChipIngressSendTimeout time.Duration // Timeout per PublishBatch call (default 5s) ChipIngressRetryConfig *RetryConfig // Retry config for failed PublishBatch calls (defaults: 500ms/5s/30s) ChipIngressDrainTimeout time.Duration // Max time to flush remaining events on shutdown (default 5s) @@ -155,7 +155,7 @@ func DefaultConfig() Config { ChipIngressBufferSize: 100, ChipIngressMaxBatchSize: 50, ChipIngressSendInterval: 500 * time.Millisecond, - ChipIngressSendTimeout: 10 * time.Second, + ChipIngressSendTimeout: 5 * time.Second, ChipIngressRetryConfig: defaultChipIngressRetryConfig.Copy(), ChipIngressDrainTimeout: 5 * time.Second, // Auth (defaults to static auth mode with TTL=0) From d7f42dca74544375ba8329f7c3c459c67922fa42 Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Thu, 5 Mar 2026 13:36:06 +0200 Subject: [PATCH 13/28] Remove retries, add max workers --- pkg/beholder/chip_ingress_batch_emitter.go | 42 ++-- .../chip_ingress_batch_emitter_test.go | 202 +++--------------- pkg/beholder/chip_ingress_emitter_worker.go | 65 +----- pkg/beholder/config.go | 32 ++- pkg/beholder/config_test.go | 2 +- 5 files changed, 71 insertions(+), 272 deletions(-) diff --git a/pkg/beholder/chip_ingress_batch_emitter.go b/pkg/beholder/chip_ingress_batch_emitter.go index 27b8d727e6..8c0a3d4f93 100644 --- a/pkg/beholder/chip_ingress_batch_emitter.go +++ b/pkg/beholder/chip_ingress_batch_emitter.go @@ -29,20 +29,19 @@ type ChipIngressBatchEmitter struct { bufferSize uint maxBatchSize uint + maxWorkers int sendInterval time.Duration sendTimeout time.Duration - retryCfg *RetryConfig drainTimeout time.Duration metrics batchEmitterMetrics } type batchEmitterMetrics struct { - eventsSent otelmetric.Int64Counter - eventsDropped otelmetric.Int64Counter - batchRetries otelmetric.Int64Counter - batchFailures otelmetric.Int64Counter - eventsDrained otelmetric.Int64Counter + eventsSent otelmetric.Int64Counter + eventsDropped otelmetric.Int64Counter + batchFailures otelmetric.Int64Counter + eventsDrained otelmetric.Int64Counter } // NewChipIngressBatchEmitter creates a batch emitter backed by the given chipingress client. @@ -60,6 +59,10 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg if maxBatchSize == 0 { maxBatchSize = 50 } + maxWorkers := cfg.ChipIngressMaxWorkers + if maxWorkers == 0 { + maxWorkers = defaultMaxWorkers + } sendInterval := cfg.ChipIngressSendInterval if sendInterval == 0 { sendInterval = 500 * time.Millisecond @@ -84,9 +87,9 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg workers: make(map[string]*chipIngressEmitterWorker), bufferSize: bufferSize, maxBatchSize: maxBatchSize, + maxWorkers: maxWorkers, sendInterval: sendInterval, sendTimeout: sendTimeout, - retryCfg: cfg.ChipIngressRetryConfig, drainTimeout: drainTimeout, metrics: metrics, } @@ -112,6 +115,9 @@ func (e *ChipIngressBatchEmitter) Emit(ctx context.Context, body []byte, attrKVs attributes := newAttributes(attrKVs...) worker := e.findOrCreateWorker(domain, entity) + if worker == nil { + return nil + } payload := emitterPayload{ body: body, @@ -137,7 +143,7 @@ func (e *ChipIngressBatchEmitter) Emit(ctx context.Context, body []byte, attrKVs // findOrCreateWorker returns the worker for the given (domain, entity) pair, // creating one with a new buffered channel and flush goroutine if it doesn't exist. func (e *ChipIngressBatchEmitter) findOrCreateWorker(domain, entity string) *chipIngressEmitterWorker { - workerKey := fmt.Sprintf("%s_%s", domain, entity) + workerKey := domain + "/" + entity e.workersMutex.RLock() worker, found := e.workers[workerKey] @@ -150,11 +156,16 @@ func (e *ChipIngressBatchEmitter) findOrCreateWorker(domain, entity string) *chi e.workersMutex.Lock() defer e.workersMutex.Unlock() - // Double-check after acquiring write lock if worker, found = e.workers[workerKey]; found { return worker } + if len(e.workers) >= e.maxWorkers { + e.eng.Warnf("chip ingress batch emitter: max workers (%d) reached, dropping event for %s", e.maxWorkers, workerKey) + e.metrics.eventsDropped.Add(context.Background(), 1) + return nil + } + worker = newChipIngressEmitterWorker( e.client, make(chan emitterPayload, e.bufferSize), @@ -162,7 +173,6 @@ func (e *ChipIngressBatchEmitter) findOrCreateWorker(domain, entity string) *chi entity, e.maxBatchSize, e.sendTimeout, - e.retryCfg, e.metrics, e.eng, ) @@ -197,21 +207,14 @@ func newBatchEmitterMetrics(meter otelmetric.Meter) (batchEmitterMetrics, error) } eventsDropped, err := meter.Int64Counter("chip_ingress.events_dropped", - otelmetric.WithDescription("Total events dropped (buffer full or retries exhausted)"), + otelmetric.WithDescription("Total events dropped (buffer full or send failure)"), otelmetric.WithUnit("{event}")) if err != nil { return batchEmitterMetrics{}, err } - batchRetries, err := meter.Int64Counter("chip_ingress.batch_retries", - otelmetric.WithDescription("Total PublishBatch retry attempts"), - otelmetric.WithUnit("{attempt}")) - if err != nil { - return batchEmitterMetrics{}, err - } - batchFailures, err := meter.Int64Counter("chip_ingress.batch_failures", - otelmetric.WithDescription("Total batches that failed after all retries"), + otelmetric.WithDescription("Total PublishBatch calls that failed"), otelmetric.WithUnit("{batch}")) if err != nil { return batchEmitterMetrics{}, err @@ -227,7 +230,6 @@ func newBatchEmitterMetrics(meter otelmetric.Meter) (batchEmitterMetrics, error) return batchEmitterMetrics{ eventsSent: eventsSent, eventsDropped: eventsDropped, - batchRetries: batchRetries, batchFailures: batchFailures, eventsDrained: eventsDrained, }, nil diff --git a/pkg/beholder/chip_ingress_batch_emitter_test.go b/pkg/beholder/chip_ingress_batch_emitter_test.go index f827a04398..142a5edd82 100644 --- a/pkg/beholder/chip_ingress_batch_emitter_test.go +++ b/pkg/beholder/chip_ingress_batch_emitter_test.go @@ -472,130 +472,6 @@ func TestChipIngressBatchEmitter_DefaultConfig(t *testing.T) { }) } -func newRetryTestConfig() beholder.Config { - return beholder.Config{ - ChipIngressBufferSize: 10, - ChipIngressMaxBatchSize: 5, - ChipIngressSendInterval: 50 * time.Millisecond, - ChipIngressSendTimeout: 1 * time.Second, - ChipIngressRetryConfig: &beholder.RetryConfig{ - InitialInterval: 10 * time.Millisecond, - MaxInterval: 50 * time.Millisecond, - MaxElapsedTime: 500 * time.Millisecond, - }, - ChipIngressDrainTimeout: 5 * time.Second, - } -} - -func TestChipIngressBatchEmitter_RetrySuccess(t *testing.T) { - t.Run("succeeds after transient failure", func(t *testing.T) { - clientMock := mocks.NewClient(t) - - var mu sync.Mutex - callCount := 0 - totalEventsSent := 0 - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Run(func(args mock.Arguments) { - mu.Lock() - defer mu.Unlock() - callCount++ - batch := args.Get(1).(*chipingress.CloudEventBatch) - if callCount <= 2 { - return // first 2 calls fail - } - totalEventsSent += len(batch.Events) - }). - Return(nil, assert.AnError).Times(2) - - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Run(func(args mock.Arguments) { - mu.Lock() - defer mu.Unlock() - callCount++ - batch := args.Get(1).(*chipingress.CloudEventBatch) - totalEventsSent += len(batch.Events) - }). - Return(nil, nil) - - cfg := newRetryTestConfig() - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) - require.NoError(t, err) - require.NoError(t, emitter.Start(t.Context())) - - for i := 0; i < 3; i++ { - err = emitter.Emit(t.Context(), []byte("body"), - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "TestEvent", - ) - require.NoError(t, err) - } - - assert.Eventually(t, func() bool { - mu.Lock() - defer mu.Unlock() - return totalEventsSent >= 3 - }, 5*time.Second, 10*time.Millisecond) - - require.NoError(t, emitter.Close()) - - mu.Lock() - defer mu.Unlock() - assert.GreaterOrEqual(t, callCount, 3, "should have retried at least twice before succeeding") - assert.Equal(t, 3, totalEventsSent) - }) -} - -func TestChipIngressBatchEmitter_RetryExhaustion(t *testing.T) { - t.Run("drops events after retries exhausted", func(t *testing.T) { - clientMock := mocks.NewClient(t) - - var mu sync.Mutex - callCount := 0 - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Run(func(_ mock.Arguments) { - mu.Lock() - defer mu.Unlock() - callCount++ - }). - Return(nil, assert.AnError) - - cfg := newRetryTestConfig() - cfg.ChipIngressRetryConfig = &beholder.RetryConfig{ - InitialInterval: 10 * time.Millisecond, - MaxInterval: 20 * time.Millisecond, - MaxElapsedTime: 100 * time.Millisecond, - } - - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) - require.NoError(t, err) - require.NoError(t, emitter.Start(t.Context())) - - for i := 0; i < 3; i++ { - err = emitter.Emit(t.Context(), []byte("body"), - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "TestEvent", - ) - require.NoError(t, err) - } - - // Wait long enough for retries to exhaust - assert.Eventually(t, func() bool { - mu.Lock() - defer mu.Unlock() - return callCount >= 2 - }, 3*time.Second, 10*time.Millisecond) - - require.NoError(t, emitter.Close()) - - mu.Lock() - defer mu.Unlock() - assert.GreaterOrEqual(t, callCount, 2, "should have attempted at least 2 times before exhausting retries") - }) -} - func TestChipIngressBatchEmitter_GracefulDrain(t *testing.T) { t.Run("flushes buffered events on close", func(t *testing.T) { clientMock := mocks.NewClient(t) @@ -805,65 +681,49 @@ func TestChipIngressBatchEmitter_DrainTimeout(t *testing.T) { }) } -func TestChipIngressBatchEmitter_RetryShutdownDuringBackoff(t *testing.T) { - t.Run("shutdown during retry backoff completes promptly", func(t *testing.T) { +func TestChipIngressBatchEmitter_MaxWorkersCap(t *testing.T) { + t.Run("drops events when max workers reached", func(t *testing.T) { clientMock := mocks.NewClient(t) - - var mu sync.Mutex - callCount := 0 clientMock. On("PublishBatch", mock.Anything, mock.Anything). - Run(func(_ mock.Arguments) { - mu.Lock() - defer mu.Unlock() - callCount++ - }). - Return(nil, assert.AnError) + Return(nil, nil). + Maybe() - cfg := beholder.Config{ - ChipIngressBufferSize: 10, - ChipIngressMaxBatchSize: 5, - ChipIngressSendInterval: 50 * time.Millisecond, - ChipIngressSendTimeout: 1 * time.Second, - ChipIngressRetryConfig: &beholder.RetryConfig{ - InitialInterval: 10 * time.Second, // very long backoff - MaxInterval: 30 * time.Second, - MaxElapsedTime: 1 * time.Minute, - }, - ChipIngressDrainTimeout: 5 * time.Second, - } + cfg := newTestConfig() + cfg.ChipIngressMaxWorkers = 2 + cfg.ChipIngressSendInterval = 50 * time.Millisecond emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) + defer emitter.Close() //nolint:errcheck - for i := 0; i < 3; i++ { - err = emitter.Emit(t.Context(), []byte("body"), - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "TestEvent", - ) - require.NoError(t, err) - } + // Create 2 workers (at the cap) + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "domain-1", + beholder.AttrKeyEntity, "entity-1", + ) + require.NoError(t, err) - // Wait for at least one PublishBatch attempt (which will fail and enter 10s backoff) - assert.Eventually(t, func() bool { - mu.Lock() - defer mu.Unlock() - return callCount >= 1 - }, 2*time.Second, 10*time.Millisecond) + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "domain-2", + beholder.AttrKeyEntity, "entity-2", + ) + require.NoError(t, err) - // Close while the worker is sleeping in the 10s retry backoff - closeDone := make(chan error, 1) - go func() { - closeDone <- emitter.Close() - }() + // 3rd unique pair should be silently dropped (no error, no worker created) + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "domain-3", + beholder.AttrKeyEntity, "entity-3", + ) + assert.NoError(t, err, "Emit should not error when max workers is reached") - select { - case err := <-closeDone: - assert.NoError(t, err, "close should not error") - case <-time.After(3 * time.Second): - t.Fatal("Close() did not return within 3s; timer.Stop() during retry backoff is not working") - } + // Existing workers should still accept events + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "domain-1", + beholder.AttrKeyEntity, "entity-1", + ) + assert.NoError(t, err, "Emit to existing worker should still work") }) } diff --git a/pkg/beholder/chip_ingress_emitter_worker.go b/pkg/beholder/chip_ingress_emitter_worker.go index 87eab18117..b2bd7b3101 100644 --- a/pkg/beholder/chip_ingress_emitter_worker.go +++ b/pkg/beholder/chip_ingress_emitter_worker.go @@ -3,7 +3,6 @@ package beholder import ( "context" "fmt" - "math/rand/v2" "sync/atomic" "time" @@ -24,6 +23,8 @@ type emitterPayload struct { // chipIngressEmitterWorker buffers events for a single (domain, entity) pair // and flushes them via PublishBatch on a periodic interval. +// Transport-level retries (UNAVAILABLE, RESOURCE_EXHAUSTED) are handled by the +// gRPC client's built-in retry policy; no application-level retry is needed. type chipIngressEmitterWorker struct { client chipingress.Client ch chan emitterPayload @@ -34,10 +35,6 @@ type chipIngressEmitterWorker struct { lggr logger.Logger dropCount atomic.Uint32 - retryInitialInterval time.Duration - retryMaxInterval time.Duration - retryMaxElapsed time.Duration - metrics batchEmitterMetrics metricAttrs otelmetric.MeasurementOption } @@ -49,11 +46,10 @@ func newChipIngressEmitterWorker( entity string, maxBatchSize uint, sendTimeout time.Duration, - retryCfg *RetryConfig, metrics batchEmitterMetrics, lggr logger.Logger, ) *chipIngressEmitterWorker { - w := &chipIngressEmitterWorker{ + return &chipIngressEmitterWorker{ client: client, ch: ch, domain: domain, @@ -67,15 +63,9 @@ func newChipIngressEmitterWorker( attribute.String("entity", entity), )), } - if retryCfg != nil && retryCfg.Enabled() { - w.retryInitialInterval = retryCfg.InitialInterval - w.retryMaxInterval = retryCfg.MaxInterval - w.retryMaxElapsed = retryCfg.MaxElapsedTime - } - return w } -// Send drains the channel and sends a batch with retry on failure. +// Send drains the channel and sends a batch. // Called periodically by the tick loop. func (w *chipIngressEmitterWorker) Send(ctx context.Context) { if len(w.ch) == 0 { @@ -87,52 +77,7 @@ func (w *chipIngressEmitterWorker) Send(ctx context.Context) { return } - if w.retryMaxElapsed == 0 { - w.publishOnce(ctx, batch) - return - } - - backoff := w.retryInitialInterval - deadline := time.Now().Add(w.retryMaxElapsed) - - batchSize := int64(len(batch.Events)) - for attempt := 0; ; attempt++ { - sendCtx, cancel := context.WithTimeout(ctx, w.sendTimeout) - _, err := w.client.PublishBatch(sendCtx, batch) - cancel() - - if err == nil { - w.metrics.eventsSent.Add(context.Background(), batchSize, w.metricAttrs) - return - } - - w.lggr.Warnf("PublishBatch failed (attempt %d, domain=%s, entity=%s): %v", - attempt+1, w.domain, w.entity, err) - w.metrics.batchRetries.Add(context.Background(), 1, w.metricAttrs) - - if time.Now().Add(backoff).After(deadline) { - w.lggr.Warnf("PublishBatch retries exhausted, dropping %d events (domain=%s, entity=%s)", - len(batch.Events), w.domain, w.entity) - w.metrics.batchFailures.Add(context.Background(), 1, w.metricAttrs) - w.metrics.eventsDropped.Add(context.Background(), batchSize, w.metricAttrs) - return - } - - timer := time.NewTimer(backoff) - select { - case <-ctx.Done(): - timer.Stop() - w.lggr.Warnf("context cancelled during retry, dropping %d events (domain=%s, entity=%s)", - len(batch.Events), w.domain, w.entity) - w.metrics.eventsDropped.Add(context.Background(), batchSize, w.metricAttrs) - return - case <-timer.C: - } - - next := backoff * 2 - jitter := time.Duration(rand.Int64N(int64(next)/5 + 1)) //nolint:gosec - backoff = min(next+jitter, w.retryMaxInterval) - } + w.publishOnce(ctx, batch) } func (w *chipIngressEmitterWorker) publishOnce(ctx context.Context, batch *chipingress.CloudEventBatch) { diff --git a/pkg/beholder/config.go b/pkg/beholder/config.go index 0b8b6129a5..9d80177560 100644 --- a/pkg/beholder/config.go +++ b/pkg/beholder/config.go @@ -45,13 +45,13 @@ type Config struct { ChipIngressInsecureConnection bool // Disables TLS for Chip Ingress Emitter // Chip Ingress Batch Emitter - ChipIngressBatchEmitterEnabled bool // When true, use batch emitter; when false (default), use legacy per-event emitter - ChipIngressBufferSize uint // Per-worker channel buffer size (default 100) - ChipIngressMaxBatchSize uint // Max events per PublishBatch call (default 50) - ChipIngressSendInterval time.Duration // Flush interval per worker (default 500ms when zero or unset) - ChipIngressSendTimeout time.Duration // Timeout per PublishBatch call (default 5s) - ChipIngressRetryConfig *RetryConfig // Retry config for failed PublishBatch calls (defaults: 500ms/5s/30s) - ChipIngressDrainTimeout time.Duration // Max time to flush remaining events on shutdown (default 5s) + ChipIngressBatchEmitterEnabled bool // When true, use batch emitter; when false (default), use legacy per-event emitter + ChipIngressBufferSize uint // Per-worker channel buffer size (default 100) + ChipIngressMaxBatchSize uint // Max events per PublishBatch call (default 50) + ChipIngressMaxWorkers int // Max concurrent (domain, entity) workers (default 100) + ChipIngressSendInterval time.Duration // Flush interval per worker (default 500ms when zero or unset) + ChipIngressSendTimeout time.Duration // Timeout per PublishBatch call (default 5s) + ChipIngressDrainTimeout time.Duration // Max time to flush remaining events on shutdown (default 5s) // OTel Log LogExportTimeout time.Duration @@ -99,16 +99,9 @@ var defaultRetryConfig = RetryConfig{ MaxElapsedTime: 1 * time.Minute, // Retry is enabled } -// Retry defaults for the chip ingress batch emitter: faster backoff than OTel -// to avoid buffer pressure on the small per-worker channel. -var defaultChipIngressRetryConfig = RetryConfig{ - InitialInterval: 500 * time.Millisecond, - MaxInterval: 5 * time.Second, - MaxElapsedTime: 5 * time.Second, -} - const ( defaultPackageName = "beholder" + defaultMaxWorkers = 100 ) var defaultOtelAttributes = []attribute.KeyValue{ @@ -154,9 +147,9 @@ func DefaultConfig() Config { ChipIngressBatchEmitterEnabled: false, ChipIngressBufferSize: 100, ChipIngressMaxBatchSize: 50, + ChipIngressMaxWorkers: defaultMaxWorkers, ChipIngressSendInterval: 500 * time.Millisecond, ChipIngressSendTimeout: 5 * time.Second, - ChipIngressRetryConfig: defaultChipIngressRetryConfig.Copy(), ChipIngressDrainTimeout: 5 * time.Second, // Auth (defaults to static auth mode with TTL=0) AuthHeadersTTL: 0, @@ -169,10 +162,9 @@ func TestDefaultConfig() Config { config.EmitterBatchProcessor = false config.LogBatchProcessor = false // Retries are disabled for testing - config.LogRetryConfig.MaxElapsedTime = 0 // Retry is disabled - config.TraceRetryConfig.MaxElapsedTime = 0 // Retry is disabled - config.MetricRetryConfig.MaxElapsedTime = 0 // Retry is disabled - config.ChipIngressRetryConfig.MaxElapsedTime = 0 // Retry is disabled + config.LogRetryConfig.MaxElapsedTime = 0 // Retry is disabled + config.TraceRetryConfig.MaxElapsedTime = 0 // Retry is disabled + config.MetricRetryConfig.MaxElapsedTime = 0 // Retry is disabled // Auth disabled for testing (TTL=0 means static auth mode) config.AuthHeadersTTL = 0 return config diff --git a/pkg/beholder/config_test.go b/pkg/beholder/config_test.go index f98f77bc60..5a83c71178 100644 --- a/pkg/beholder/config_test.go +++ b/pkg/beholder/config_test.go @@ -67,6 +67,6 @@ func ExampleConfig() { } fmt.Printf("%+v\n", *config.LogRetryConfig) // Output: - // {InsecureConnection:true CACertFile: OtelExporterGRPCEndpoint:localhost:4317 OtelExporterHTTPEndpoint:localhost:4318 ResourceAttributes:[{Key:package_name Value:{vtype:4 numeric:0 stringly:beholder slice:}} {Key:sender Value:{vtype:4 numeric:0 stringly:beholderclient slice:}}] EmitterExportTimeout:1s EmitterExportInterval:1s EmitterExportMaxBatchSize:512 EmitterMaxQueueSize:2048 EmitterBatchProcessor:true TraceSampleRatio:1 TraceBatchTimeout:1s TraceSpanExporter: TraceRetryConfig: TraceCompressor:gzip MetricReaderInterval:1s MetricRetryConfig: MetricViews:[] MetricCompressor:gzip ChipIngressEmitterEnabled:false ChipIngressEmitterGRPCEndpoint: ChipIngressInsecureConnection:false ChipIngressBatchEmitterEnabled:false ChipIngressBufferSize:0 ChipIngressMaxBatchSize:0 ChipIngressSendInterval:0s ChipIngressSendTimeout:0s ChipIngressRetryConfig: ChipIngressDrainTimeout:0s LogExportTimeout:1s LogExportInterval:1s LogExportMaxBatchSize:512 LogMaxQueueSize:2048 LogBatchProcessor:true LogRetryConfig: LogStreamingEnabled:false LogLevel:info LogCompressor:gzip AuthHeaders:map[] AuthHeadersTTL:0s AuthKeySigner: AuthPublicKeyHex:} + // {InsecureConnection:true CACertFile: OtelExporterGRPCEndpoint:localhost:4317 OtelExporterHTTPEndpoint:localhost:4318 ResourceAttributes:[{Key:package_name Value:{vtype:4 numeric:0 stringly:beholder slice:}} {Key:sender Value:{vtype:4 numeric:0 stringly:beholderclient slice:}}] EmitterExportTimeout:1s EmitterExportInterval:1s EmitterExportMaxBatchSize:512 EmitterMaxQueueSize:2048 EmitterBatchProcessor:true TraceSampleRatio:1 TraceBatchTimeout:1s TraceSpanExporter: TraceRetryConfig: TraceCompressor:gzip MetricReaderInterval:1s MetricRetryConfig: MetricViews:[] MetricCompressor:gzip ChipIngressEmitterEnabled:false ChipIngressEmitterGRPCEndpoint: ChipIngressInsecureConnection:false ChipIngressBatchEmitterEnabled:false ChipIngressBufferSize:0 ChipIngressMaxBatchSize:0 ChipIngressMaxWorkers:0 ChipIngressSendInterval:0s ChipIngressSendTimeout:0s ChipIngressDrainTimeout:0s LogExportTimeout:1s LogExportInterval:1s LogExportMaxBatchSize:512 LogMaxQueueSize:2048 LogBatchProcessor:true LogRetryConfig: LogStreamingEnabled:false LogLevel:info LogCompressor:gzip AuthHeaders:map[] AuthHeadersTTL:0s AuthKeySigner: AuthPublicKeyHex:} // {InitialInterval:5s MaxInterval:30s MaxElapsedTime:1m0s} } From af757a12bb50afa02a812326b4e85695e373d466 Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Thu, 5 Mar 2026 13:36:33 +0200 Subject: [PATCH 14/28] Fix draining, remove unused --- pkg/beholder/client.go | 13 ++++++------- pkg/beholder/dual_source_emitter.go | 6 ------ 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/pkg/beholder/client.go b/pkg/beholder/client.go index e042368f20..9a5fdf9305 100644 --- a/pkg/beholder/client.go +++ b/pkg/beholder/client.go @@ -257,9 +257,6 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro } onClose := func() (err error) { - if batchEmitterService != nil { - err = errors.Join(err, batchEmitterService.Close()) - } for _, provider := range []shutdowner{messageLoggerProvider, loggerProvider, tracerProvider, meterProvider, messageLoggerProvider} { err = errors.Join(err, provider.Shutdown(context.Background())) } @@ -268,17 +265,19 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro return &Client{cfg, logger, tracer, meter, emitter, chipIngressClient, loggerProvider, tracerProvider, meterProvider, messageLoggerProvider, signer, onClose}, nil } -// Closes all providers, flushes all data and stops all background processes +// Closes all providers, flushes all data and stops all background processes. +// Order matters: emitter is closed first so the batch emitter can drain +// buffered events while the gRPC connection is still alive. func (c Client) Close() (err error) { - if c.Chip != nil { - err = errors.Join(err, c.Chip.Close()) - } if c.Emitter != nil { err = errors.Join(err, c.Emitter.Close()) } if c.OnClose != nil { err = errors.Join(err, c.OnClose()) } + if c.Chip != nil { + err = errors.Join(err, c.Chip.Close()) + } return } diff --git a/pkg/beholder/dual_source_emitter.go b/pkg/beholder/dual_source_emitter.go index d34dbb614e..0501e5c529 100644 --- a/pkg/beholder/dual_source_emitter.go +++ b/pkg/beholder/dual_source_emitter.go @@ -7,7 +7,6 @@ import ( "sync/atomic" "github.com/smartcontractkit/chainlink-common/pkg/logger" - "github.com/smartcontractkit/chainlink-common/pkg/services" ) // DualSourceEmitter emits both to chip ingress and to the otel collector @@ -18,8 +17,6 @@ type DualSourceEmitter struct { chipIngressEmitter Emitter otelCollectorEmitter Emitter log logger.Logger - stopCh services.StopChan - wg services.WaitGroup closed atomic.Bool } @@ -42,7 +39,6 @@ func NewDualSourceEmitter(chipIngressEmitter Emitter, otelCollectorEmitter Emitt chipIngressEmitter: chipIngressEmitter, otelCollectorEmitter: otelCollectorEmitter, log: logger, - stopCh: make(services.StopChan), }, nil } @@ -50,8 +46,6 @@ func (d *DualSourceEmitter) Close() error { if wasClosed := d.closed.Swap(true); wasClosed { return errors.New("already closed") } - close(d.stopCh) - d.wg.Wait() return errors.Join(d.chipIngressEmitter.Close(), d.otelCollectorEmitter.Close()) } From 32ebc25478fedfbda902f4f53baa61a67ae9b806 Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Thu, 5 Mar 2026 14:32:46 +0200 Subject: [PATCH 15/28] Change defaults --- pkg/beholder/chip_ingress_batch_emitter.go | 10 +++++----- pkg/beholder/chip_ingress_batch_emitter_test.go | 2 +- pkg/beholder/config.go | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pkg/beholder/chip_ingress_batch_emitter.go b/pkg/beholder/chip_ingress_batch_emitter.go index 8c0a3d4f93..d526d1a1d3 100644 --- a/pkg/beholder/chip_ingress_batch_emitter.go +++ b/pkg/beholder/chip_ingress_batch_emitter.go @@ -53,11 +53,11 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg bufferSize := cfg.ChipIngressBufferSize if bufferSize == 0 { - bufferSize = 100 + bufferSize = 1000 } maxBatchSize := cfg.ChipIngressMaxBatchSize if maxBatchSize == 0 { - maxBatchSize = 50 + maxBatchSize = 500 } maxWorkers := cfg.ChipIngressMaxWorkers if maxWorkers == 0 { @@ -65,15 +65,15 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg } sendInterval := cfg.ChipIngressSendInterval if sendInterval == 0 { - sendInterval = 500 * time.Millisecond + sendInterval = 100 * time.Millisecond } sendTimeout := cfg.ChipIngressSendTimeout if sendTimeout == 0 { - sendTimeout = 5 * time.Second + sendTimeout = 3 * time.Second } drainTimeout := cfg.ChipIngressDrainTimeout if drainTimeout == 0 { - drainTimeout = 5 * time.Second + drainTimeout = 10 * time.Second } meter := otel.Meter("beholder/chip_ingress_batch_emitter") diff --git a/pkg/beholder/chip_ingress_batch_emitter_test.go b/pkg/beholder/chip_ingress_batch_emitter_test.go index 142a5edd82..b2fe7211bf 100644 --- a/pkg/beholder/chip_ingress_batch_emitter_test.go +++ b/pkg/beholder/chip_ingress_batch_emitter_test.go @@ -457,7 +457,7 @@ func TestChipIngressBatchEmitter_DefaultConfig(t *testing.T) { ) require.NoError(t, err) - // Default send interval is 500ms; wait for flush + // Default send interval is 100ms; wait for flush assert.Eventually(t, func() bool { mu.Lock() defer mu.Unlock() diff --git a/pkg/beholder/config.go b/pkg/beholder/config.go index 9d80177560..3f371799df 100644 --- a/pkg/beholder/config.go +++ b/pkg/beholder/config.go @@ -145,12 +145,12 @@ func DefaultConfig() Config { LogCompressor: "gzip", // Chip Ingress Batch Emitter ChipIngressBatchEmitterEnabled: false, - ChipIngressBufferSize: 100, - ChipIngressMaxBatchSize: 50, + ChipIngressBufferSize: 1000, + ChipIngressMaxBatchSize: 500, ChipIngressMaxWorkers: defaultMaxWorkers, - ChipIngressSendInterval: 500 * time.Millisecond, - ChipIngressSendTimeout: 5 * time.Second, - ChipIngressDrainTimeout: 5 * time.Second, + ChipIngressSendInterval: 100 * time.Millisecond, + ChipIngressSendTimeout: 3 * time.Second, + ChipIngressDrainTimeout: 10 * time.Second, // Auth (defaults to static auth mode with TTL=0) AuthHeadersTTL: 0, } From d45812681bd5753d902e7abddb2070788e3d34a2 Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Fri, 6 Mar 2026 15:55:29 +0200 Subject: [PATCH 16/28] Use batch emitter client --- go.mod | 2 + go.sum | 2 - pkg/beholder/chip_ingress_batch_emitter.go | 179 ++++++---- .../chip_ingress_batch_emitter_test.go | 326 +++++++++++++++--- pkg/beholder/chip_ingress_emitter_worker.go | 180 +--------- pkg/beholder/config.go | 17 +- pkg/beholder/config_test.go | 2 +- pkg/chipingress/batch/client.go | 4 +- pkg/chipingress/batch/client_test.go | 7 +- 9 files changed, 407 insertions(+), 312 deletions(-) diff --git a/go.mod b/go.mod index eb9c45471f..85fb5f3969 100644 --- a/go.mod +++ b/go.mod @@ -157,3 +157,5 @@ require ( google.golang.org/genproto/googleapis/api v0.0.0-20251029180050-ab9386a59fda // indirect gopkg.in/yaml.v2 v2.4.0 // indirect ) + +replace github.com/smartcontractkit/chainlink-common/pkg/chipingress => ./pkg/chipingress diff --git a/go.sum b/go.sum index bfa8dbdd0a..58c5db722d 100644 --- a/go.sum +++ b/go.sum @@ -328,8 +328,6 @@ github.com/sirupsen/logrus v1.4.1/go.mod h1:ni0Sbl8bgC9z8RoU9G6nDWqqs/fq4eDPysMB github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/smartcontractkit/chain-selectors v1.0.89 h1:L9oWZGqQXWyTPnC6ODXgu3b0DFyLmJ9eHv+uJrE9IZY= github.com/smartcontractkit/chain-selectors v1.0.89/go.mod h1:qy7whtgG5g+7z0jt0nRyii9bLND9m15NZTzuQPkMZ5w= -github.com/smartcontractkit/chainlink-common/pkg/chipingress v0.0.10 h1:FJAFgXS9oqASnkS03RE1HQwYQQxrO4l46O5JSzxqLgg= -github.com/smartcontractkit/chainlink-common/pkg/chipingress v0.0.10/go.mod h1:oiDa54M0FwxevWwyAX773lwdWvFYYlYHHQV1LQ5HpWY= github.com/smartcontractkit/chainlink-protos/billing/go v0.0.0-20251024234028-0988426d98f4 h1:GCzrxDWn3b7jFfEA+WiYRi8CKoegsayiDoJBCjYkneE= github.com/smartcontractkit/chainlink-protos/billing/go v0.0.0-20251024234028-0988426d98f4/go.mod h1:HHGeDUpAsPa0pmOx7wrByCitjQ0mbUxf0R9v+g67uCA= github.com/smartcontractkit/chainlink-protos/cre/go v0.0.0-20260226130359-963f935e0396 h1:03tbcwjyIEjvHba1IWOj1sfThwebm2XNzyFHSuZtlWc= diff --git a/pkg/beholder/chip_ingress_batch_emitter.go b/pkg/beholder/chip_ingress_batch_emitter.go index d526d1a1d3..17b8479431 100644 --- a/pkg/beholder/chip_ingress_batch_emitter.go +++ b/pkg/beholder/chip_ingress_batch_emitter.go @@ -10,12 +10,15 @@ import ( otelmetric "go.opentelemetry.io/otel/metric" "github.com/smartcontractkit/chainlink-common/pkg/chipingress" + "github.com/smartcontractkit/chainlink-common/pkg/chipingress/batch" "github.com/smartcontractkit/chainlink-common/pkg/logger" "github.com/smartcontractkit/chainlink-common/pkg/services" ) // ChipIngressBatchEmitter buffers events per (domain, entity) and flushes them // via chipingress.Client.PublishBatch on a periodic interval. +// Each (domain, entity) pair gets its own batch.Client, providing per-entity +// isolation and independent concurrency scaling. // It satisfies the Emitter interface so it can be used as a drop-in replacement // for ChipIngressEmitter. type ChipIngressBatchEmitter struct { @@ -27,12 +30,13 @@ type ChipIngressBatchEmitter struct { workers map[string]*chipIngressEmitterWorker workersMutex sync.RWMutex - bufferSize uint - maxBatchSize uint - maxWorkers int - sendInterval time.Duration - sendTimeout time.Duration - drainTimeout time.Duration + bufferSize int + maxBatchSize int + maxWorkers int + maxConcurrentSends int + sendInterval time.Duration + sendTimeout time.Duration + drainTimeout time.Duration metrics batchEmitterMetrics } @@ -40,8 +44,6 @@ type ChipIngressBatchEmitter struct { type batchEmitterMetrics struct { eventsSent otelmetric.Int64Counter eventsDropped otelmetric.Int64Counter - batchFailures otelmetric.Int64Counter - eventsDrained otelmetric.Int64Counter } // NewChipIngressBatchEmitter creates a batch emitter backed by the given chipingress client. @@ -51,11 +53,11 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg return nil, fmt.Errorf("chip ingress client is nil") } - bufferSize := cfg.ChipIngressBufferSize + bufferSize := int(cfg.ChipIngressBufferSize) if bufferSize == 0 { bufferSize = 1000 } - maxBatchSize := cfg.ChipIngressMaxBatchSize + maxBatchSize := int(cfg.ChipIngressMaxBatchSize) if maxBatchSize == 0 { maxBatchSize = 500 } @@ -63,6 +65,10 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg if maxWorkers == 0 { maxWorkers = defaultMaxWorkers } + maxConcurrentSends := cfg.ChipIngressMaxConcurrentSends + if maxConcurrentSends == 0 { + maxConcurrentSends = defaultMaxConcurrentSends + } sendInterval := cfg.ChipIngressSendInterval if sendInterval == 0 { sendInterval = 100 * time.Millisecond @@ -83,15 +89,16 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg } e := &ChipIngressBatchEmitter{ - client: client, - workers: make(map[string]*chipIngressEmitterWorker), - bufferSize: bufferSize, - maxBatchSize: maxBatchSize, - maxWorkers: maxWorkers, - sendInterval: sendInterval, - sendTimeout: sendTimeout, - drainTimeout: drainTimeout, - metrics: metrics, + client: client, + workers: make(map[string]*chipIngressEmitterWorker), + bufferSize: bufferSize, + maxBatchSize: maxBatchSize, + maxWorkers: maxWorkers, + maxConcurrentSends: maxConcurrentSends, + sendInterval: sendInterval, + sendTimeout: sendTimeout, + drainTimeout: drainTimeout, + metrics: metrics, } e.Service, e.eng = services.Config{ @@ -102,10 +109,36 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg } // Emit extracts (domain, entity) from the attributes, routes the event to the -// appropriate per-(domain, entity) worker, and returns immediately. -// If the worker's buffer is full, the event is dropped and a warning is logged. -// Returns an error if the emitter has been closed. +// appropriate per-(domain, entity) worker's batch.Client, and returns immediately. +// If the worker's buffer is full, the event is dropped and a metric is bumped. +// Returns an error if the emitter has been closed or the caller's context is cancelled. func (e *ChipIngressBatchEmitter) Emit(ctx context.Context, body []byte, attrKVs ...any) error { + return e.emitInternal(ctx, body, nil, attrKVs...) +} + +// EmitWithCallback works like Emit but accepts a callback that is invoked once +// the event's fate is determined: nil on successful PublishBatch delivery, or a +// non-nil error on send failure or buffer-full drop. +// +// Contract: +// - If EmitWithCallback returns a non-nil error, the callback will NOT be invoked. +// The caller should handle the error from the return value. +// - If EmitWithCallback returns nil, the callback is guaranteed to be invoked +// exactly once — either asynchronously (after the batch is sent) or +// synchronously (if the event was dropped at enqueue time). +// +// Callers can use this for "synchronous" emission: +// +// done := make(chan error, 1) +// if err := emitter.EmitWithCallback(ctx, body, func(err error) { done <- err }, attrKVs...); err != nil { +// return err // callback will not fire +// } +// err := <-done // safe — callback will fire exactly once +func (e *ChipIngressBatchEmitter) EmitWithCallback(ctx context.Context, body []byte, callback func(error), attrKVs ...any) error { + return e.emitInternal(ctx, body, callback, attrKVs...) +} + +func (e *ChipIngressBatchEmitter) emitInternal(ctx context.Context, body []byte, callback func(error), attrKVs ...any) error { return e.eng.IfNotStopped(func() error { domain, entity, err := ExtractSourceAndType(attrKVs...) if err != nil { @@ -116,24 +149,40 @@ func (e *ChipIngressBatchEmitter) Emit(ctx context.Context, body []byte, attrKVs worker := e.findOrCreateWorker(domain, entity) if worker == nil { + if callback != nil { + callback(fmt.Errorf("max workers reached, event dropped")) + } return nil } - payload := emitterPayload{ - body: body, - attributes: attributes, - domain: domain, - entity: entity, + event, err := chipingress.NewEvent(domain, entity, body, attributes) + if err != nil { + return fmt.Errorf("failed to create CloudEvent: %w", err) + } + eventPb, err := chipingress.EventToProto(event) + if err != nil { + return fmt.Errorf("failed to convert to proto: %w", err) + } + + if err := ctx.Err(); err != nil { + return err } - select { - case worker.ch <- payload: - // Intentionally racy with logBufferFullWithExpBackoff — only affects log frequency, not correctness. - worker.dropCount.Store(0) - case <-ctx.Done(): - return ctx.Err() - default: - worker.logBufferFullWithExpBackoff(payload) + queueErr := worker.batchClient.QueueMessage(eventPb, func(sendErr error) { + if sendErr != nil { + e.metrics.eventsDropped.Add(context.Background(), 1, worker.metricAttrs) + } else { + e.metrics.eventsSent.Add(context.Background(), 1, worker.metricAttrs) + } + if callback != nil { + callback(sendErr) + } + }) + if queueErr != nil { + e.metrics.eventsDropped.Add(context.Background(), 1, worker.metricAttrs) + if callback != nil { + callback(queueErr) + } } return nil @@ -141,9 +190,9 @@ func (e *ChipIngressBatchEmitter) Emit(ctx context.Context, body []byte, attrKVs } // findOrCreateWorker returns the worker for the given (domain, entity) pair, -// creating one with a new buffered channel and flush goroutine if it doesn't exist. +// creating one backed by a new batch.Client if it doesn't exist. func (e *ChipIngressBatchEmitter) findOrCreateWorker(domain, entity string) *chipIngressEmitterWorker { - workerKey := domain + "/" + entity + workerKey := domain + "\x00" + entity e.workersMutex.RLock() worker, found := e.workers[workerKey] @@ -166,32 +215,26 @@ func (e *ChipIngressBatchEmitter) findOrCreateWorker(domain, entity string) *chi return nil } - worker = newChipIngressEmitterWorker( - e.client, - make(chan emitterPayload, e.bufferSize), - domain, - entity, - e.maxBatchSize, - e.sendTimeout, - e.metrics, - e.eng, + batchClient, err := batch.NewBatchClient(e.client, + batch.WithBatchSize(e.maxBatchSize), + batch.WithMessageBuffer(e.bufferSize), + batch.WithBatchInterval(e.sendInterval), + batch.WithMaxPublishTimeout(e.sendTimeout), + batch.WithShutdownTimeout(e.drainTimeout), + batch.WithMaxConcurrentSends(e.maxConcurrentSends), + batch.WithEventClone(false), ) + if err != nil { + e.eng.Errorf("chip ingress batch emitter: failed to create batch client for %s: %v", workerKey, err) + return nil + } + + worker = newChipIngressEmitterWorker(batchClient, domain, entity) - sendInterval := e.sendInterval - drainTimeout := e.drainTimeout e.eng.Go(func(ctx context.Context) { - ticker := time.NewTicker(sendInterval) - defer ticker.Stop() - - for { - select { - case <-ticker.C: - worker.Send(ctx) - case <-ctx.Done(): - worker.drain(drainTimeout) - return - } - } + worker.batchClient.Start(ctx) + <-ctx.Done() + worker.batchClient.Stop() }) e.workers[workerKey] = worker @@ -213,24 +256,8 @@ func newBatchEmitterMetrics(meter otelmetric.Meter) (batchEmitterMetrics, error) return batchEmitterMetrics{}, err } - batchFailures, err := meter.Int64Counter("chip_ingress.batch_failures", - otelmetric.WithDescription("Total PublishBatch calls that failed"), - otelmetric.WithUnit("{batch}")) - if err != nil { - return batchEmitterMetrics{}, err - } - - eventsDrained, err := meter.Int64Counter("chip_ingress.events_drained", - otelmetric.WithDescription("Total events flushed during graceful shutdown"), - otelmetric.WithUnit("{event}")) - if err != nil { - return batchEmitterMetrics{}, err - } - return batchEmitterMetrics{ eventsSent: eventsSent, eventsDropped: eventsDropped, - batchFailures: batchFailures, - eventsDrained: eventsDrained, }, nil } diff --git a/pkg/beholder/chip_ingress_batch_emitter_test.go b/pkg/beholder/chip_ingress_batch_emitter_test.go index b2fe7211bf..0e51b808e6 100644 --- a/pkg/beholder/chip_ingress_batch_emitter_test.go +++ b/pkg/beholder/chip_ingress_batch_emitter_test.go @@ -18,10 +18,12 @@ import ( func newTestConfig() beholder.Config { return beholder.Config{ - ChipIngressBufferSize: 10, - ChipIngressMaxBatchSize: 5, - ChipIngressSendInterval: 50 * time.Millisecond, - ChipIngressSendTimeout: 5 * time.Second, + ChipIngressBufferSize: 10, + ChipIngressMaxBatchSize: 5, + ChipIngressMaxConcurrentSends: 3, + ChipIngressSendInterval: 50 * time.Millisecond, + ChipIngressSendTimeout: 5 * time.Second, + ChipIngressDrainTimeout: 5 * time.Second, } } @@ -67,7 +69,11 @@ func TestChipIngressBatchEmitter_Emit(t *testing.T) { Return(nil, nil). Maybe() - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestConfig(), newTestLogger(t)) + cfg := newTestConfig() + cfg.ChipIngressSendInterval = 10 * time.Second + cfg.ChipIngressMaxBatchSize = 100 + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) defer emitter.Close() //nolint:errcheck @@ -78,7 +84,8 @@ func TestChipIngressBatchEmitter_Emit(t *testing.T) { ) require.NoError(t, err) - // PublishBatch should NOT have been called yet (event is just buffered) + // With a large batch size and long interval, PublishBatch should not fire yet + time.Sleep(100 * time.Millisecond) clientMock.AssertNotCalled(t, "PublishBatch", mock.Anything, mock.Anything) }) } @@ -250,36 +257,56 @@ func TestChipIngressBatchEmitter_PerDomainEntityIsolation(t *testing.T) { func TestChipIngressBatchEmitter_BufferFull(t *testing.T) { t.Run("events are dropped when buffer is full", func(t *testing.T) { clientMock := mocks.NewClient(t) - // Block PublishBatch so the buffer fills up + + // Block PublishBatch so the batcher's send pipeline backs up, + // eventually filling the message buffer channel. + sendBlocked := make(chan struct{}) + firstCallSignal := make(chan struct{}, 1) clientMock. On("PublishBatch", mock.Anything, mock.Anything). + Run(func(_ mock.Arguments) { + select { + case firstCallSignal <- struct{}{}: + default: + } + <-sendBlocked + }). Return(nil, nil). Maybe() cfg := newTestConfig() - cfg.ChipIngressBufferSize = 3 - cfg.ChipIngressSendInterval = 10 * time.Second // very long interval to prevent flushing + cfg.ChipIngressBufferSize = 2 + cfg.ChipIngressMaxBatchSize = 1 + cfg.ChipIngressMaxConcurrentSends = 1 + cfg.ChipIngressSendInterval = 50 * time.Millisecond + cfg.ChipIngressDrainTimeout = 200 * time.Millisecond emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) + defer close(sendBlocked) defer emitter.Close() //nolint:errcheck - // Fill the buffer (3 events) - for i := 0; i < 3; i++ { - err = emitter.Emit(t.Context(), []byte("body"), + // First event triggers a send that blocks, exhausting the semaphore + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + + // Wait for the batcher to process the event and block on PublishBatch + <-firstCallSignal + // Give batcher time to read the next event and block on the semaphore + time.Sleep(100 * time.Millisecond) + + // Flood the buffer — Emit should never return an error + for i := 0; i < 10; i++ { + err = emitter.Emit(t.Context(), []byte("overflow"), beholder.AttrKeyDomain, "platform", beholder.AttrKeyEntity, "TestEvent", ) - require.NoError(t, err) + assert.NoError(t, err, "Emit should not return error even when dropping events") } - - // This should not error (it drops silently), but the event won't be delivered - err = emitter.Emit(t.Context(), []byte("dropped"), - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "TestEvent", - ) - assert.NoError(t, err) }) } @@ -414,13 +441,6 @@ func TestChipIngressBatchEmitter_ContextCancellation(t *testing.T) { require.NoError(t, emitter.Start(t.Context())) defer emitter.Close() //nolint:errcheck - // Fill the buffer so the next Emit will block on channel send - err = emitter.Emit(t.Context(), []byte("fill"), - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "TestEvent", - ) - require.NoError(t, err) - ctx, cancel := context.WithCancel(t.Context()) cancel() @@ -489,11 +509,12 @@ func TestChipIngressBatchEmitter_GracefulDrain(t *testing.T) { Return(nil, nil) cfg := beholder.Config{ - ChipIngressBufferSize: 20, - ChipIngressMaxBatchSize: 10, - ChipIngressSendInterval: 1 * time.Hour, // very long interval — events won't flush via tick - ChipIngressSendTimeout: 5 * time.Second, - ChipIngressDrainTimeout: 5 * time.Second, + ChipIngressBufferSize: 20, + ChipIngressMaxBatchSize: 10, + ChipIngressMaxConcurrentSends: 3, + ChipIngressSendInterval: 1 * time.Hour, + ChipIngressSendTimeout: 5 * time.Second, + ChipIngressDrainTimeout: 5 * time.Second, } emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) @@ -508,6 +529,9 @@ func TestChipIngressBatchEmitter_GracefulDrain(t *testing.T) { require.NoError(t, err) } + // Give the batcher time to read events from the channel into its internal batch + time.Sleep(50 * time.Millisecond) + // Events are buffered but no tick has fired. Close should drain them. require.NoError(t, emitter.Close()) @@ -537,11 +561,12 @@ func TestChipIngressBatchEmitter_DrainMultipleDomains(t *testing.T) { Return(nil, nil) cfg := beholder.Config{ - ChipIngressBufferSize: 20, - ChipIngressMaxBatchSize: 10, - ChipIngressSendInterval: 1 * time.Hour, - ChipIngressSendTimeout: 5 * time.Second, - ChipIngressDrainTimeout: 5 * time.Second, + ChipIngressBufferSize: 20, + ChipIngressMaxBatchSize: 10, + ChipIngressMaxConcurrentSends: 3, + ChipIngressSendInterval: 1 * time.Hour, + ChipIngressSendTimeout: 5 * time.Second, + ChipIngressDrainTimeout: 5 * time.Second, } emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) @@ -563,6 +588,9 @@ func TestChipIngressBatchEmitter_DrainMultipleDomains(t *testing.T) { require.NoError(t, err) } + // Give batchers time to read events from their channels + time.Sleep(50 * time.Millisecond) + require.NoError(t, emitter.Close()) mu.Lock() @@ -573,7 +601,7 @@ func TestChipIngressBatchEmitter_DrainMultipleDomains(t *testing.T) { } func TestChipIngressBatchEmitter_DrainPublishBatchFailure(t *testing.T) { - t.Run("drain continues attempting batches after failure", func(t *testing.T) { + t.Run("handles batch failure and continues sending", func(t *testing.T) { clientMock := mocks.NewClient(t) var mu sync.Mutex @@ -605,18 +633,19 @@ func TestChipIngressBatchEmitter_DrainPublishBatchFailure(t *testing.T) { Return(nil, nil) cfg := beholder.Config{ - ChipIngressBufferSize: 20, - ChipIngressMaxBatchSize: 3, - ChipIngressSendInterval: 1 * time.Hour, - ChipIngressSendTimeout: 5 * time.Second, - ChipIngressDrainTimeout: 5 * time.Second, + ChipIngressBufferSize: 20, + ChipIngressMaxBatchSize: 3, + ChipIngressMaxConcurrentSends: 3, + ChipIngressSendInterval: 1 * time.Hour, + ChipIngressSendTimeout: 5 * time.Second, + ChipIngressDrainTimeout: 5 * time.Second, } emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) - // Emit 6 events with maxBatchSize=3 => 2 batches during drain + // Emit 6 events with maxBatchSize=3 => 2 batches via size trigger for i := 0; i < 6; i++ { err = emitter.Emit(t.Context(), []byte("body"), beholder.AttrKeyDomain, "platform", @@ -625,11 +654,14 @@ func TestChipIngressBatchEmitter_DrainPublishBatchFailure(t *testing.T) { require.NoError(t, err) } + // Give the batcher time to read events and trigger size-based sends + time.Sleep(100 * time.Millisecond) + require.NoError(t, emitter.Close()) mu.Lock() defer mu.Unlock() - assert.GreaterOrEqual(t, callCount, 2, "drain should have attempted at least 2 batches") + assert.GreaterOrEqual(t, callCount, 2, "should have attempted at least 2 batches") assert.Equal(t, 3, totalEventsSent, "second batch should have succeeded despite first batch failure") }) } @@ -648,11 +680,12 @@ func TestChipIngressBatchEmitter_DrainTimeout(t *testing.T) { Maybe() cfg := beholder.Config{ - ChipIngressBufferSize: 20, - ChipIngressMaxBatchSize: 10, - ChipIngressSendInterval: 1 * time.Hour, - ChipIngressSendTimeout: 10 * time.Second, - ChipIngressDrainTimeout: 200 * time.Millisecond, + ChipIngressBufferSize: 20, + ChipIngressMaxBatchSize: 10, + ChipIngressMaxConcurrentSends: 3, + ChipIngressSendInterval: 1 * time.Hour, + ChipIngressSendTimeout: 10 * time.Second, + ChipIngressDrainTimeout: 200 * time.Millisecond, } emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) @@ -667,6 +700,9 @@ func TestChipIngressBatchEmitter_DrainTimeout(t *testing.T) { require.NoError(t, err) } + // Give the batcher time to read events + time.Sleep(50 * time.Millisecond) + closeDone := make(chan error, 1) go func() { closeDone <- emitter.Close() @@ -747,3 +783,193 @@ func TestChipIngressBatchEmitter_EmitAfterClose(t *testing.T) { assert.Error(t, err, "Emit after Close should return an error") }) } + +func TestChipIngressBatchEmitter_EmitWithCallback(t *testing.T) { + t.Run("callback receives nil on successful send", func(t *testing.T) { + clientMock := mocks.NewClient(t) + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Return(nil, nil) + + cfg := newTestConfig() + cfg.ChipIngressSendInterval = 50 * time.Millisecond + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + + done := make(chan error, 1) + err = emitter.EmitWithCallback(t.Context(), []byte("body"), func(sendErr error) { + done <- sendErr + }, + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + + select { + case sendErr := <-done: + assert.NoError(t, sendErr, "callback should receive nil on success") + case <-time.After(3 * time.Second): + t.Fatal("callback was not invoked within timeout") + } + + require.NoError(t, emitter.Close()) + }) + + t.Run("callback receives error on PublishBatch failure", func(t *testing.T) { + clientMock := mocks.NewClient(t) + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Return(nil, assert.AnError) + + cfg := newTestConfig() + cfg.ChipIngressSendInterval = 50 * time.Millisecond + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + + done := make(chan error, 1) + err = emitter.EmitWithCallback(t.Context(), []byte("body"), func(sendErr error) { + done <- sendErr + }, + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + + select { + case sendErr := <-done: + assert.Error(t, sendErr, "callback should receive error on failure") + case <-time.After(3 * time.Second): + t.Fatal("callback was not invoked within timeout") + } + + require.NoError(t, emitter.Close()) + }) + + t.Run("callback receives error when buffer is full", func(t *testing.T) { + clientMock := mocks.NewClient(t) + + sendBlocked := make(chan struct{}) + firstCallSignal := make(chan struct{}, 1) + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(_ mock.Arguments) { + select { + case firstCallSignal <- struct{}{}: + default: + } + <-sendBlocked + }). + Return(nil, nil). + Maybe() + + cfg := newTestConfig() + cfg.ChipIngressBufferSize = 2 + cfg.ChipIngressMaxBatchSize = 1 + cfg.ChipIngressMaxConcurrentSends = 1 + cfg.ChipIngressSendInterval = 50 * time.Millisecond + cfg.ChipIngressDrainTimeout = 200 * time.Millisecond + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + defer close(sendBlocked) + defer emitter.Close() //nolint:errcheck + + // First event triggers a send that blocks, exhausting the semaphore + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + + // Wait for PublishBatch to be called and blocked + <-firstCallSignal + // Give batcher time to read the next event and block on the semaphore + time.Sleep(100 * time.Millisecond) + + // Flood the buffer (size 2) so it becomes full + for i := 0; i < 10; i++ { + _ = emitter.Emit(t.Context(), []byte("filler"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + } + + // Buffer is full — callback should be invoked synchronously with an error + dropped := make(chan error, 1) + err = emitter.EmitWithCallback(t.Context(), []byte("overflow"), func(sendErr error) { + dropped <- sendErr + }, + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + assert.NoError(t, err, "Emit should not return an error even when dropping") + + select { + case dropErr := <-dropped: + assert.Error(t, dropErr, "callback should receive an error when buffer is full") + case <-time.After(time.Second): + t.Fatal("callback was not invoked for dropped event") + } + }) + + t.Run("synchronous emit pattern works end-to-end", func(t *testing.T) { + clientMock := mocks.NewClient(t) + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Return(nil, nil) + + cfg := newTestConfig() + cfg.ChipIngressSendInterval = 50 * time.Millisecond + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + + done := make(chan error, 1) + err = emitter.EmitWithCallback(t.Context(), []byte("sync-body"), func(sendErr error) { + done <- sendErr + }, + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + + // Block until the event is actually sent — the "synchronous" pattern + select { + case sendErr := <-done: + assert.NoError(t, sendErr) + case <-time.After(3 * time.Second): + t.Fatal("synchronous emit did not complete within timeout") + } + + require.NoError(t, emitter.Close()) + }) + + t.Run("nil callback behaves like Emit", func(t *testing.T) { + clientMock := mocks.NewClient(t) + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Return(nil, nil). + Maybe() + + cfg := newTestConfig() + cfg.ChipIngressSendInterval = 50 * time.Millisecond + + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + + err = emitter.EmitWithCallback(t.Context(), []byte("body"), nil, + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + assert.NoError(t, err) + + require.NoError(t, emitter.Close()) + }) +} diff --git a/pkg/beholder/chip_ingress_emitter_worker.go b/pkg/beholder/chip_ingress_emitter_worker.go index b2bd7b3101..005b2c3aa5 100644 --- a/pkg/beholder/chip_ingress_emitter_worker.go +++ b/pkg/beholder/chip_ingress_emitter_worker.go @@ -1,197 +1,29 @@ package beholder import ( - "context" - "fmt" - "sync/atomic" - "time" - "go.opentelemetry.io/otel/attribute" otelmetric "go.opentelemetry.io/otel/metric" - "github.com/smartcontractkit/chainlink-common/pkg/chipingress" - "github.com/smartcontractkit/chainlink-common/pkg/logger" + "github.com/smartcontractkit/chainlink-common/pkg/chipingress/batch" ) -// emitterPayload holds a single event to be batched and sent via chip ingress. -type emitterPayload struct { - body []byte - attributes map[string]any - domain string - entity string -} - -// chipIngressEmitterWorker buffers events for a single (domain, entity) pair -// and flushes them via PublishBatch on a periodic interval. -// Transport-level retries (UNAVAILABLE, RESOURCE_EXHAUSTED) are handled by the -// gRPC client's built-in retry policy; no application-level retry is needed. +// chipIngressEmitterWorker wraps a batch.Client for a single (domain, entity) pair. +// The batch.Client handles buffering, periodic flushing, concurrent sends, and graceful shutdown. type chipIngressEmitterWorker struct { - client chipingress.Client - ch chan emitterPayload - domain string - entity string - maxBatchSize uint - sendTimeout time.Duration - lggr logger.Logger - dropCount atomic.Uint32 - - metrics batchEmitterMetrics + batchClient *batch.Client metricAttrs otelmetric.MeasurementOption } func newChipIngressEmitterWorker( - client chipingress.Client, - ch chan emitterPayload, + batchClient *batch.Client, domain string, entity string, - maxBatchSize uint, - sendTimeout time.Duration, - metrics batchEmitterMetrics, - lggr logger.Logger, ) *chipIngressEmitterWorker { return &chipIngressEmitterWorker{ - client: client, - ch: ch, - domain: domain, - entity: entity, - maxBatchSize: maxBatchSize, - sendTimeout: sendTimeout, - lggr: logger.Named(lggr, "ChipIngressEmitterWorker"), - metrics: metrics, + batchClient: batchClient, metricAttrs: otelmetric.WithAttributeSet(attribute.NewSet( attribute.String("domain", domain), attribute.String("entity", entity), )), } } - -// Send drains the channel and sends a batch. -// Called periodically by the tick loop. -func (w *chipIngressEmitterWorker) Send(ctx context.Context) { - if len(w.ch) == 0 { - return - } - - batch := w.buildBatch() - if batch == nil || len(batch.Events) == 0 { - return - } - - w.publishOnce(ctx, batch) -} - -func (w *chipIngressEmitterWorker) publishOnce(ctx context.Context, batch *chipingress.CloudEventBatch) { - sendCtx, cancel := context.WithTimeout(ctx, w.sendTimeout) - defer cancel() - - batchSize := int64(len(batch.Events)) - _, err := w.client.PublishBatch(sendCtx, batch) - if err != nil { - w.lggr.Warnf("could not send batch via chip ingress (domain=%s, entity=%s): %v", - w.domain, w.entity, err) - w.metrics.batchFailures.Add(context.Background(), 1, w.metricAttrs) - w.metrics.eventsDropped.Add(context.Background(), batchSize, w.metricAttrs) - return - } - w.metrics.eventsSent.Add(context.Background(), batchSize, w.metricAttrs) -} - -// buildBatch drains the channel up to maxBatchSize and converts payloads to a CloudEventBatch. -func (w *chipIngressEmitterWorker) buildBatch() *chipingress.CloudEventBatch { - var events []chipingress.CloudEvent - - max := int(w.maxBatchSize) // #nosec G115 -drain: - for len(events) < max { - select { - case payload := <-w.ch: - event, err := w.payloadToEvent(payload) - if err != nil { - w.lggr.Warnf("failed to build CloudEvent, dropping: %v", err) - w.metrics.eventsDropped.Add(context.Background(), 1, w.metricAttrs) - continue - } - events = append(events, event) - default: - break drain - } - } - - if len(events) == 0 { - return nil - } - - batch, err := chipingress.EventsToBatch(events) - if err != nil { - w.lggr.Warnf("failed to convert events to batch: %v", err) - w.metrics.eventsDropped.Add(context.Background(), int64(len(events)), w.metricAttrs) - return nil - } - - return batch -} - -func (w *chipIngressEmitterWorker) payloadToEvent(payload emitterPayload) (chipingress.CloudEvent, error) { - event, err := chipingress.NewEvent(payload.domain, payload.entity, payload.body, payload.attributes) - if err != nil { - return chipingress.CloudEvent{}, fmt.Errorf("failed to create CloudEvent: %w", err) - } - return event, nil -} - -// drain flushes all remaining buffered events before shutdown. -// Uses a fresh context with the given timeout (independent of the cancelled parent). -// Continues attempting subsequent batches even if one fails. -func (w *chipIngressEmitterWorker) drain(timeout time.Duration) { - if len(w.ch) == 0 { - return - } - - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() - - w.lggr.Infof("draining %d buffered events (domain=%s, entity=%s)", len(w.ch), w.domain, w.entity) - - for len(w.ch) > 0 { - if ctx.Err() != nil { - remaining := len(w.ch) - if remaining > 0 { - w.lggr.Warnf("drain timeout exceeded, dropping %d remaining events (domain=%s, entity=%s)", - remaining, w.domain, w.entity) - w.metrics.eventsDropped.Add(context.Background(), int64(remaining), w.metricAttrs) - } - return - } - - batch := w.buildBatch() - if batch == nil || len(batch.Events) == 0 { - break - } - - batchSize := int64(len(batch.Events)) - sendCtx, sendCancel := context.WithTimeout(ctx, w.sendTimeout) - _, err := w.client.PublishBatch(sendCtx, batch) - sendCancel() - - if err != nil { - w.lggr.Warnf("drain PublishBatch failed, dropping %d events (domain=%s, entity=%s): %v", - len(batch.Events), w.domain, w.entity, err) - w.metrics.eventsDropped.Add(context.Background(), batchSize, w.metricAttrs) - continue - } - - w.metrics.eventsDrained.Add(context.Background(), batchSize, w.metricAttrs) - } -} - -// logBufferFullWithExpBackoff logs at 1, 2, 4, 8, 16, 32, 64, 100, 200, 300, ... -// to avoid flooding logs when the buffer is persistently full. -// dropCount is intentionally racy with Emit's Store(0) — this only affects log frequency, not correctness. -func (w *chipIngressEmitterWorker) logBufferFullWithExpBackoff(payload emitterPayload) { - w.metrics.eventsDropped.Add(context.Background(), 1, w.metricAttrs) - count := w.dropCount.Add(1) - if count > 0 && (count%100 == 0 || count&(count-1) == 0) { - w.lggr.Warnf("chip ingress emitter buffer full, dropping event (domain=%s, entity=%s, droppedCount=%d)", - payload.domain, payload.entity, count) - } -} diff --git a/pkg/beholder/config.go b/pkg/beholder/config.go index 3f371799df..8bcf7c3a4b 100644 --- a/pkg/beholder/config.go +++ b/pkg/beholder/config.go @@ -46,12 +46,13 @@ type Config struct { // Chip Ingress Batch Emitter ChipIngressBatchEmitterEnabled bool // When true, use batch emitter; when false (default), use legacy per-event emitter - ChipIngressBufferSize uint // Per-worker channel buffer size (default 100) - ChipIngressMaxBatchSize uint // Max events per PublishBatch call (default 50) + ChipIngressBufferSize uint // Per-worker message buffer size (default 1000) + ChipIngressMaxBatchSize uint // Max events per PublishBatch call (default 500) ChipIngressMaxWorkers int // Max concurrent (domain, entity) workers (default 100) - ChipIngressSendInterval time.Duration // Flush interval per worker (default 500ms when zero or unset) - ChipIngressSendTimeout time.Duration // Timeout per PublishBatch call (default 5s) - ChipIngressDrainTimeout time.Duration // Max time to flush remaining events on shutdown (default 5s) + ChipIngressSendInterval time.Duration // Flush interval per worker (default 100ms) + ChipIngressSendTimeout time.Duration // Timeout per PublishBatch call (default 3s) + ChipIngressDrainTimeout time.Duration // Max time to flush remaining events on shutdown (default 10s) + ChipIngressMaxConcurrentSends int // Max concurrent PublishBatch calls per worker (default 3) // OTel Log LogExportTimeout time.Duration @@ -100,8 +101,9 @@ var defaultRetryConfig = RetryConfig{ } const ( - defaultPackageName = "beholder" - defaultMaxWorkers = 100 + defaultPackageName = "beholder" + defaultMaxWorkers = 100 + defaultMaxConcurrentSends = 3 ) var defaultOtelAttributes = []attribute.KeyValue{ @@ -151,6 +153,7 @@ func DefaultConfig() Config { ChipIngressSendInterval: 100 * time.Millisecond, ChipIngressSendTimeout: 3 * time.Second, ChipIngressDrainTimeout: 10 * time.Second, + ChipIngressMaxConcurrentSends: defaultMaxConcurrentSends, // Auth (defaults to static auth mode with TTL=0) AuthHeadersTTL: 0, } diff --git a/pkg/beholder/config_test.go b/pkg/beholder/config_test.go index 5a83c71178..c96b6187c2 100644 --- a/pkg/beholder/config_test.go +++ b/pkg/beholder/config_test.go @@ -67,6 +67,6 @@ func ExampleConfig() { } fmt.Printf("%+v\n", *config.LogRetryConfig) // Output: - // {InsecureConnection:true CACertFile: OtelExporterGRPCEndpoint:localhost:4317 OtelExporterHTTPEndpoint:localhost:4318 ResourceAttributes:[{Key:package_name Value:{vtype:4 numeric:0 stringly:beholder slice:}} {Key:sender Value:{vtype:4 numeric:0 stringly:beholderclient slice:}}] EmitterExportTimeout:1s EmitterExportInterval:1s EmitterExportMaxBatchSize:512 EmitterMaxQueueSize:2048 EmitterBatchProcessor:true TraceSampleRatio:1 TraceBatchTimeout:1s TraceSpanExporter: TraceRetryConfig: TraceCompressor:gzip MetricReaderInterval:1s MetricRetryConfig: MetricViews:[] MetricCompressor:gzip ChipIngressEmitterEnabled:false ChipIngressEmitterGRPCEndpoint: ChipIngressInsecureConnection:false ChipIngressBatchEmitterEnabled:false ChipIngressBufferSize:0 ChipIngressMaxBatchSize:0 ChipIngressMaxWorkers:0 ChipIngressSendInterval:0s ChipIngressSendTimeout:0s ChipIngressDrainTimeout:0s LogExportTimeout:1s LogExportInterval:1s LogExportMaxBatchSize:512 LogMaxQueueSize:2048 LogBatchProcessor:true LogRetryConfig: LogStreamingEnabled:false LogLevel:info LogCompressor:gzip AuthHeaders:map[] AuthHeadersTTL:0s AuthKeySigner: AuthPublicKeyHex:} + // {InsecureConnection:true CACertFile: OtelExporterGRPCEndpoint:localhost:4317 OtelExporterHTTPEndpoint:localhost:4318 ResourceAttributes:[{Key:package_name Value:{vtype:4 numeric:0 stringly:beholder slice:}} {Key:sender Value:{vtype:4 numeric:0 stringly:beholderclient slice:}}] EmitterExportTimeout:1s EmitterExportInterval:1s EmitterExportMaxBatchSize:512 EmitterMaxQueueSize:2048 EmitterBatchProcessor:true TraceSampleRatio:1 TraceBatchTimeout:1s TraceSpanExporter: TraceRetryConfig: TraceCompressor:gzip MetricReaderInterval:1s MetricRetryConfig: MetricViews:[] MetricCompressor:gzip ChipIngressEmitterEnabled:false ChipIngressEmitterGRPCEndpoint: ChipIngressInsecureConnection:false ChipIngressBatchEmitterEnabled:false ChipIngressBufferSize:0 ChipIngressMaxBatchSize:0 ChipIngressMaxWorkers:0 ChipIngressSendInterval:0s ChipIngressSendTimeout:0s ChipIngressDrainTimeout:0s ChipIngressMaxConcurrentSends:0 LogExportTimeout:1s LogExportInterval:1s LogExportMaxBatchSize:512 LogMaxQueueSize:2048 LogBatchProcessor:true LogRetryConfig: LogStreamingEnabled:false LogLevel:info LogCompressor:gzip AuthHeaders:map[] AuthHeadersTTL:0s AuthKeySigner: AuthPublicKeyHex:} // {InitialInterval:5s MaxInterval:30s MaxElapsedTime:1m0s} } diff --git a/pkg/chipingress/batch/client.go b/pkg/chipingress/batch/client.go index e02704b26b..af97c0684b 100644 --- a/pkg/chipingress/batch/client.go +++ b/pkg/chipingress/batch/client.go @@ -110,7 +110,9 @@ func (b *Client) Start(ctx context.Context) { // Forcibly shutdowns down after timeout if not completed. func (b *Client) Stop() { b.shutdownOnce.Do(func() { - ctx, cancel := b.stopCh.CtxWithTimeout(b.shutdownTimeout) + // Use a standalone timeout context so the shutdown wait isn't cancelled + // by close(b.stopCh) below. + ctx, cancel := context.WithTimeout(context.Background(), b.shutdownTimeout) defer cancel() if b.cancelBatcher != nil { diff --git a/pkg/chipingress/batch/client_test.go b/pkg/chipingress/batch/client_test.go index 7f8c356fb3..dab751e06c 100644 --- a/pkg/chipingress/batch/client_test.go +++ b/pkg/chipingress/batch/client_test.go @@ -837,6 +837,11 @@ func TestStop(t *testing.T) { t.Run("QueueMessage returns error after Stop", func(t *testing.T) { mockClient := mocks.NewClient(t) + mockClient. + On("PublishBatch", mock.Anything, mock.Anything). + Return(&chipingress.PublishResponse{}, nil). + Maybe() + client, err := NewBatchClient(mockClient, WithBatchSize(10)) require.NoError(t, err) @@ -853,7 +858,7 @@ func TestStop(t *testing.T) { }, nil) require.NoError(t, err) - // Stop the client + // Stop the client — drains any buffered messages client.Stop() // Queue message after stop - should fail From 314b0f55bae988269b6e1ebeb753645e74be32eb Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Fri, 6 Mar 2026 16:27:39 +0200 Subject: [PATCH 17/28] Increase defaultMaxConcurrentSends 3->10 --- pkg/beholder/chip_ingress_emitter_worker.go | 2 -- pkg/beholder/config.go | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pkg/beholder/chip_ingress_emitter_worker.go b/pkg/beholder/chip_ingress_emitter_worker.go index 005b2c3aa5..5b964bca3c 100644 --- a/pkg/beholder/chip_ingress_emitter_worker.go +++ b/pkg/beholder/chip_ingress_emitter_worker.go @@ -7,8 +7,6 @@ import ( "github.com/smartcontractkit/chainlink-common/pkg/chipingress/batch" ) -// chipIngressEmitterWorker wraps a batch.Client for a single (domain, entity) pair. -// The batch.Client handles buffering, periodic flushing, concurrent sends, and graceful shutdown. type chipIngressEmitterWorker struct { batchClient *batch.Client metricAttrs otelmetric.MeasurementOption diff --git a/pkg/beholder/config.go b/pkg/beholder/config.go index 8bcf7c3a4b..5f6dafe51c 100644 --- a/pkg/beholder/config.go +++ b/pkg/beholder/config.go @@ -103,7 +103,7 @@ var defaultRetryConfig = RetryConfig{ const ( defaultPackageName = "beholder" defaultMaxWorkers = 100 - defaultMaxConcurrentSends = 3 + defaultMaxConcurrentSends = 10 ) var defaultOtelAttributes = []attribute.KeyValue{ From 689ff493d78515d37c3c37cf4fa429df43ff88d6 Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Fri, 6 Mar 2026 16:55:04 +0200 Subject: [PATCH 18/28] Use application logger --- pkg/beholder/client.go | 11 ++++------- pkg/beholder/config.go | 5 ++++- pkg/beholder/config_test.go | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/beholder/client.go b/pkg/beholder/client.go index 9a5fdf9305..454bc89d54 100644 --- a/pkg/beholder/client.go +++ b/pkg/beholder/client.go @@ -8,7 +8,7 @@ import ( "io" "time" - ccllogger "github.com/smartcontractkit/chainlink-common/pkg/logger" + "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc" @@ -226,13 +226,10 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro var chipIngressEmitter Emitter if cfg.ChipIngressBatchEmitterEnabled { - // TODO: accept a logger from the caller instead of creating a new root logger, - // so batch emitter logs respect the node's logging configuration. - lggr, lErr := ccllogger.New() - if lErr != nil { - return nil, fmt.Errorf("failed to create logger for chip ingress batch emitter: %w", lErr) + if cfg.ChipIngressLogger == nil { + return nil, fmt.Errorf("ChipIngressLogger is required when ChipIngressBatchEmitterEnabled is true") } - batchEmitterService, err = NewChipIngressBatchEmitter(chipIngressClient, cfg, lggr) + batchEmitterService, err = NewChipIngressBatchEmitter(chipIngressClient, cfg, cfg.ChipIngressLogger) if err != nil { return nil, fmt.Errorf("failed to create chip ingress batch emitter: %w", err) } diff --git a/pkg/beholder/config.go b/pkg/beholder/config.go index 5f6dafe51c..bfce03c7cf 100644 --- a/pkg/beholder/config.go +++ b/pkg/beholder/config.go @@ -7,6 +7,8 @@ import ( "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/trace" "go.uber.org/zap/zapcore" + + "github.com/smartcontractkit/chainlink-common/pkg/logger" ) type Config struct { @@ -52,7 +54,8 @@ type Config struct { ChipIngressSendInterval time.Duration // Flush interval per worker (default 100ms) ChipIngressSendTimeout time.Duration // Timeout per PublishBatch call (default 3s) ChipIngressDrainTimeout time.Duration // Max time to flush remaining events on shutdown (default 10s) - ChipIngressMaxConcurrentSends int // Max concurrent PublishBatch calls per worker (default 3) + ChipIngressMaxConcurrentSends int // Max concurrent PublishBatch calls per worker (default 10) + ChipIngressLogger logger.Logger // Required when ChipIngressBatchEmitterEnabled is true // OTel Log LogExportTimeout time.Duration diff --git a/pkg/beholder/config_test.go b/pkg/beholder/config_test.go index c96b6187c2..50410d0a39 100644 --- a/pkg/beholder/config_test.go +++ b/pkg/beholder/config_test.go @@ -67,6 +67,6 @@ func ExampleConfig() { } fmt.Printf("%+v\n", *config.LogRetryConfig) // Output: - // {InsecureConnection:true CACertFile: OtelExporterGRPCEndpoint:localhost:4317 OtelExporterHTTPEndpoint:localhost:4318 ResourceAttributes:[{Key:package_name Value:{vtype:4 numeric:0 stringly:beholder slice:}} {Key:sender Value:{vtype:4 numeric:0 stringly:beholderclient slice:}}] EmitterExportTimeout:1s EmitterExportInterval:1s EmitterExportMaxBatchSize:512 EmitterMaxQueueSize:2048 EmitterBatchProcessor:true TraceSampleRatio:1 TraceBatchTimeout:1s TraceSpanExporter: TraceRetryConfig: TraceCompressor:gzip MetricReaderInterval:1s MetricRetryConfig: MetricViews:[] MetricCompressor:gzip ChipIngressEmitterEnabled:false ChipIngressEmitterGRPCEndpoint: ChipIngressInsecureConnection:false ChipIngressBatchEmitterEnabled:false ChipIngressBufferSize:0 ChipIngressMaxBatchSize:0 ChipIngressMaxWorkers:0 ChipIngressSendInterval:0s ChipIngressSendTimeout:0s ChipIngressDrainTimeout:0s ChipIngressMaxConcurrentSends:0 LogExportTimeout:1s LogExportInterval:1s LogExportMaxBatchSize:512 LogMaxQueueSize:2048 LogBatchProcessor:true LogRetryConfig: LogStreamingEnabled:false LogLevel:info LogCompressor:gzip AuthHeaders:map[] AuthHeadersTTL:0s AuthKeySigner: AuthPublicKeyHex:} + // {InsecureConnection:true CACertFile: OtelExporterGRPCEndpoint:localhost:4317 OtelExporterHTTPEndpoint:localhost:4318 ResourceAttributes:[{Key:package_name Value:{vtype:4 numeric:0 stringly:beholder slice:}} {Key:sender Value:{vtype:4 numeric:0 stringly:beholderclient slice:}}] EmitterExportTimeout:1s EmitterExportInterval:1s EmitterExportMaxBatchSize:512 EmitterMaxQueueSize:2048 EmitterBatchProcessor:true TraceSampleRatio:1 TraceBatchTimeout:1s TraceSpanExporter: TraceRetryConfig: TraceCompressor:gzip MetricReaderInterval:1s MetricRetryConfig: MetricViews:[] MetricCompressor:gzip ChipIngressEmitterEnabled:false ChipIngressEmitterGRPCEndpoint: ChipIngressInsecureConnection:false ChipIngressBatchEmitterEnabled:false ChipIngressBufferSize:0 ChipIngressMaxBatchSize:0 ChipIngressMaxWorkers:0 ChipIngressSendInterval:0s ChipIngressSendTimeout:0s ChipIngressDrainTimeout:0s ChipIngressMaxConcurrentSends:0 ChipIngressLogger: LogExportTimeout:1s LogExportInterval:1s LogExportMaxBatchSize:512 LogMaxQueueSize:2048 LogBatchProcessor:true LogRetryConfig: LogStreamingEnabled:false LogLevel:info LogCompressor:gzip AuthHeaders:map[] AuthHeadersTTL:0s AuthKeySigner: AuthPublicKeyHex:} // {InitialInterval:5s MaxInterval:30s MaxElapsedTime:1m0s} } From 86ca238054ad5b521abe2079d256fabd12b94faa Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Fri, 6 Mar 2026 16:58:04 +0200 Subject: [PATCH 19/28] Small fixes --- pkg/beholder/chip_ingress_batch_emitter.go | 8 ++++++-- pkg/beholder/client.go | 1 - 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pkg/beholder/chip_ingress_batch_emitter.go b/pkg/beholder/chip_ingress_batch_emitter.go index 17b8479431..b9e280120a 100644 --- a/pkg/beholder/chip_ingress_batch_emitter.go +++ b/pkg/beholder/chip_ingress_batch_emitter.go @@ -7,6 +7,7 @@ import ( "time" "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" otelmetric "go.opentelemetry.io/otel/metric" "github.com/smartcontractkit/chainlink-common/pkg/chipingress" @@ -210,8 +211,11 @@ func (e *ChipIngressBatchEmitter) findOrCreateWorker(domain, entity string) *chi } if len(e.workers) >= e.maxWorkers { - e.eng.Warnf("chip ingress batch emitter: max workers (%d) reached, dropping event for %s", e.maxWorkers, workerKey) - e.metrics.eventsDropped.Add(context.Background(), 1) + e.eng.Warnf("chip ingress batch emitter: max workers (%d) reached, dropping event for %s/%s", e.maxWorkers, domain, entity) + e.metrics.eventsDropped.Add(context.Background(), 1, otelmetric.WithAttributeSet(attribute.NewSet( + attribute.String("domain", domain), + attribute.String("entity", entity), + ))) return nil } diff --git a/pkg/beholder/client.go b/pkg/beholder/client.go index 454bc89d54..04ef19910e 100644 --- a/pkg/beholder/client.go +++ b/pkg/beholder/client.go @@ -8,7 +8,6 @@ import ( "io" "time" - "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc" From ac45d121befe7324db5a6f80f8523e90e75a011d Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Fri, 6 Mar 2026 23:24:58 +0200 Subject: [PATCH 20/28] Amend comment --- pkg/beholder/chip_ingress_batch_emitter.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pkg/beholder/chip_ingress_batch_emitter.go b/pkg/beholder/chip_ingress_batch_emitter.go index b9e280120a..68b30e9192 100644 --- a/pkg/beholder/chip_ingress_batch_emitter.go +++ b/pkg/beholder/chip_ingress_batch_emitter.go @@ -109,10 +109,9 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg return e, nil } -// Emit extracts (domain, entity) from the attributes, routes the event to the -// appropriate per-(domain, entity) worker's batch.Client, and returns immediately. -// If the worker's buffer is full, the event is dropped and a metric is bumped. -// Returns an error if the emitter has been closed or the caller's context is cancelled. +// Emit queues an event for batched delivery. It returns immediately without blocking. +// If the worker's buffer is full, the event is silently dropped (metric bumped). +// Returns an error only if the emitter is closed or the context is cancelled. func (e *ChipIngressBatchEmitter) Emit(ctx context.Context, body []byte, attrKVs ...any) error { return e.emitInternal(ctx, body, nil, attrKVs...) } From ffad1edfd7f83892b81fd8a8547a70bb21b9ce93 Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Mon, 9 Mar 2026 18:54:40 +0200 Subject: [PATCH 21/28] Add missing logger --- pkg/loop/server.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/loop/server.go b/pkg/loop/server.go index b36bc30a04..28472ac2a3 100644 --- a/pkg/loop/server.go +++ b/pkg/loop/server.go @@ -174,6 +174,7 @@ func (s *Server) start(opts ...ServerOpt) error { ChipIngressEmitterGRPCEndpoint: s.EnvConfig.ChipIngressEndpoint, ChipIngressInsecureConnection: s.EnvConfig.ChipIngressInsecureConnection, ChipIngressBatchEmitterEnabled: s.EnvConfig.ChipIngressBatchEmitterEnabled, + ChipIngressLogger: s.Logger, MetricCompressor: s.EnvConfig.TelemetryMetricCompressor, } From 71db03e805503655f63d69d2df3b2ad6073af166 Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Thu, 12 Mar 2026 16:03:00 +0200 Subject: [PATCH 22/28] Fix regression --- pkg/beholder/client.go | 2 +- pkg/beholder/dual_source_emitter.go | 36 ++++++++++--- pkg/beholder/dual_source_emitter_test.go | 64 ++++++++++++++++++++++-- 3 files changed, 89 insertions(+), 13 deletions(-) diff --git a/pkg/beholder/client.go b/pkg/beholder/client.go index 04ef19910e..83f61c3a0e 100644 --- a/pkg/beholder/client.go +++ b/pkg/beholder/client.go @@ -243,7 +243,7 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro } } - emitter, err = NewDualSourceEmitter(chipIngressEmitter, emitter) + emitter, err = NewDualSourceEmitter(chipIngressEmitter, emitter, cfg.ChipIngressBatchEmitterEnabled) if err != nil { if batchEmitterService != nil { _ = batchEmitterService.Close() diff --git a/pkg/beholder/dual_source_emitter.go b/pkg/beholder/dual_source_emitter.go index 0501e5c529..7d4cdc7748 100644 --- a/pkg/beholder/dual_source_emitter.go +++ b/pkg/beholder/dual_source_emitter.go @@ -7,6 +7,7 @@ import ( "sync/atomic" "github.com/smartcontractkit/chainlink-common/pkg/logger" + "github.com/smartcontractkit/chainlink-common/pkg/services" ) // DualSourceEmitter emits both to chip ingress and to the otel collector @@ -17,10 +18,13 @@ type DualSourceEmitter struct { chipIngressEmitter Emitter otelCollectorEmitter Emitter log logger.Logger + nonBlockingEmitter bool + stopCh services.StopChan + wg services.WaitGroup closed atomic.Bool } -func NewDualSourceEmitter(chipIngressEmitter Emitter, otelCollectorEmitter Emitter) (Emitter, error) { +func NewDualSourceEmitter(chipIngressEmitter Emitter, otelCollectorEmitter Emitter, nonBlockingChipIngress bool) (Emitter, error) { if chipIngressEmitter == nil { return nil, fmt.Errorf("chip ingress emitter is nil") @@ -39,6 +43,8 @@ func NewDualSourceEmitter(chipIngressEmitter Emitter, otelCollectorEmitter Emitt chipIngressEmitter: chipIngressEmitter, otelCollectorEmitter: otelCollectorEmitter, log: logger, + nonBlockingEmitter: nonBlockingChipIngress, + stopCh: make(services.StopChan), }, nil } @@ -46,20 +52,36 @@ func (d *DualSourceEmitter) Close() error { if wasClosed := d.closed.Swap(true); wasClosed { return errors.New("already closed") } + close(d.stopCh) + d.wg.Wait() return errors.Join(d.chipIngressEmitter.Close(), d.otelCollectorEmitter.Close()) } func (d *DualSourceEmitter) Emit(ctx context.Context, body []byte, attrKVs ...any) error { - - // Emit via OTLP first if err := d.otelCollectorEmitter.Emit(ctx, body, attrKVs...); err != nil { return err } - // Emit via chip ingress. When backed by ChipIngressBatchEmitter this is - // non-blocking (just a channel send), so no goroutine wrapper is needed. - if err := d.chipIngressEmitter.Emit(ctx, body, attrKVs...); err != nil { - d.log.Infof("failed to emit to chip ingress: %v", err) + if d.nonBlockingEmitter { + if err := d.chipIngressEmitter.Emit(ctx, body, attrKVs...); err != nil { + d.log.Infof("failed to emit to chip ingress: %v", err) + } + } else { + // Legacy ChipIngressEmitter.Emit is a synchronous gRPC call; + // fire-and-forget via goroutine to avoid blocking the caller. + if err := d.wg.TryAdd(1); err != nil { + return err + } + go func(ctx context.Context) { + defer d.wg.Done() + var cancel context.CancelFunc + ctx, cancel = d.stopCh.Ctx(ctx) + defer cancel() + + if err := d.chipIngressEmitter.Emit(ctx, body, attrKVs...); err != nil { + d.log.Infof("failed to emit to chip ingress: %v", err) + } + }(context.WithoutCancel(ctx)) } return nil diff --git a/pkg/beholder/dual_source_emitter_test.go b/pkg/beholder/dual_source_emitter_test.go index 3429ab1648..314d82348f 100644 --- a/pkg/beholder/dual_source_emitter_test.go +++ b/pkg/beholder/dual_source_emitter_test.go @@ -3,7 +3,9 @@ package beholder_test import ( "context" "fmt" + "sync/atomic" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -18,7 +20,7 @@ func TestNewDualSourceEmitter(t *testing.T) { chipEmitter := &mockEmitter{} otelEmitter := &mockEmitter{} - emitter, err := beholder.NewDualSourceEmitter(chipEmitter, otelEmitter) + emitter, err := beholder.NewDualSourceEmitter(chipEmitter, otelEmitter, false) require.NoError(t, err) assert.NotNil(t, emitter) @@ -29,7 +31,7 @@ func TestNewDualSourceEmitter(t *testing.T) { t.Run("nil chip ingress emitter", func(t *testing.T) { otelEmitter := &mockEmitter{} - emitter, err := beholder.NewDualSourceEmitter(nil, otelEmitter) + emitter, err := beholder.NewDualSourceEmitter(nil, otelEmitter, false) assert.Error(t, err) assert.Nil(t, emitter) @@ -39,7 +41,7 @@ func TestNewDualSourceEmitter(t *testing.T) { t.Run("nil otel collector emitter", func(t *testing.T) { chipEmitter := &mockEmitter{} - emitter, err := beholder.NewDualSourceEmitter(chipEmitter, nil) + emitter, err := beholder.NewDualSourceEmitter(chipEmitter, nil, false) assert.Error(t, err) assert.Nil(t, emitter) @@ -51,7 +53,7 @@ func TestDualSourceEmitterEmit(t *testing.T) { chipEmitter := &mockEmitter{} otelEmitter := &mockEmitter{} - emitter, err := beholder.NewDualSourceEmitter(chipEmitter, otelEmitter) + emitter, err := beholder.NewDualSourceEmitter(chipEmitter, otelEmitter, false) require.NoError(t, err) err = emitter.Emit(t.Context(), []byte("test message"), "key", "value") @@ -67,7 +69,7 @@ func TestDualSourceEmitterEmit(t *testing.T) { }, } - emitter, err := beholder.NewDualSourceEmitter(chipEmitter, otelEmitter) + emitter, err := beholder.NewDualSourceEmitter(chipEmitter, otelEmitter, false) require.NoError(t, err) err = emitter.Emit(t.Context(), []byte("test message")) @@ -76,6 +78,58 @@ func TestDualSourceEmitterEmit(t *testing.T) { }) } +func TestDualSourceEmitterBlockingBehavior(t *testing.T) { + t.Run("legacy mode does not block caller", func(t *testing.T) { + var chipCalled atomic.Bool + chipEmitter := &mockEmitter{ + emitFunc: func(ctx context.Context, body []byte, attrKVs ...any) error { + time.Sleep(200 * time.Millisecond) + chipCalled.Store(true) + return nil + }, + } + otelEmitter := &mockEmitter{} + + emitter, err := beholder.NewDualSourceEmitter(chipEmitter, otelEmitter, false) + require.NoError(t, err) + + start := time.Now() + err = emitter.Emit(t.Context(), []byte("test")) + elapsed := time.Since(start) + + assert.NoError(t, err) + assert.Less(t, elapsed, 100*time.Millisecond, + "Emit should return immediately; chip ingress runs in a goroutine") + assert.False(t, chipCalled.Load(), + "chip ingress emit should still be in-flight when Emit returns") + + require.NoError(t, emitter.Close()) + assert.True(t, chipCalled.Load(), + "Close should wait for the in-flight chip ingress emit to finish") + }) + + t.Run("non-blocking mode emits inline", func(t *testing.T) { + var chipCalled atomic.Bool + chipEmitter := &mockEmitter{ + emitFunc: func(ctx context.Context, body []byte, attrKVs ...any) error { + chipCalled.Store(true) + return nil + }, + } + otelEmitter := &mockEmitter{} + + emitter, err := beholder.NewDualSourceEmitter(chipEmitter, otelEmitter, true) + require.NoError(t, err) + + err = emitter.Emit(t.Context(), []byte("test")) + assert.NoError(t, err) + assert.True(t, chipCalled.Load(), + "chip ingress emit should complete before Emit returns") + + require.NoError(t, emitter.Close()) + }) +} + // Mock emitter for testing type mockEmitter struct { emitFunc func(ctx context.Context, body []byte, attrKVs ...any) error From a66147b8a5696885bab32a0d015034b0a6153a23 Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Fri, 13 Mar 2026 00:06:32 +0200 Subject: [PATCH 23/28] Switch to single client --- pkg/beholder/chip_ingress_batch_emitter.go | 169 ++-- .../chip_ingress_batch_emitter_test.go | 782 +++--------------- pkg/beholder/chip_ingress_emitter_worker.go | 27 - pkg/beholder/config.go | 9 +- pkg/beholder/config_test.go | 2 +- 5 files changed, 172 insertions(+), 817 deletions(-) delete mode 100644 pkg/beholder/chip_ingress_emitter_worker.go diff --git a/pkg/beholder/chip_ingress_batch_emitter.go b/pkg/beholder/chip_ingress_batch_emitter.go index 68b30e9192..2c28c11513 100644 --- a/pkg/beholder/chip_ingress_batch_emitter.go +++ b/pkg/beholder/chip_ingress_batch_emitter.go @@ -16,30 +16,16 @@ import ( "github.com/smartcontractkit/chainlink-common/pkg/services" ) -// ChipIngressBatchEmitter buffers events per (domain, entity) and flushes them -// via chipingress.Client.PublishBatch on a periodic interval. -// Each (domain, entity) pair gets its own batch.Client, providing per-entity -// isolation and independent concurrency scaling. -// It satisfies the Emitter interface so it can be used as a drop-in replacement -// for ChipIngressEmitter. +// ChipIngressBatchEmitter batches events and sends them via chipingress.Client.PublishBatch. +// It implements the Emitter interface. type ChipIngressBatchEmitter struct { services.Service eng *services.Engine - client chipingress.Client + batchClient *batch.Client - workers map[string]*chipIngressEmitterWorker - workersMutex sync.RWMutex - - bufferSize int - maxBatchSize int - maxWorkers int - maxConcurrentSends int - sendInterval time.Duration - sendTimeout time.Duration - drainTimeout time.Duration - - metrics batchEmitterMetrics + metricAttrsCache sync.Map // map[string]otelmetric.MeasurementOption + metrics batchEmitterMetrics } type batchEmitterMetrics struct { @@ -48,7 +34,6 @@ type batchEmitterMetrics struct { } // NewChipIngressBatchEmitter creates a batch emitter backed by the given chipingress client. -// Call Start() to begin health monitoring, and Close() to stop all workers. func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logger.Logger) (*ChipIngressBatchEmitter, error) { if client == nil { return nil, fmt.Errorf("chip ingress client is nil") @@ -62,10 +47,6 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg if maxBatchSize == 0 { maxBatchSize = 500 } - maxWorkers := cfg.ChipIngressMaxWorkers - if maxWorkers == 0 { - maxWorkers = defaultMaxWorkers - } maxConcurrentSends := cfg.ChipIngressMaxConcurrentSends if maxConcurrentSends == 0 { maxConcurrentSends = defaultMaxConcurrentSends @@ -89,51 +70,49 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg return nil, fmt.Errorf("failed to create batch emitter metrics: %w", err) } + batchClient, err := batch.NewBatchClient(client, + batch.WithBatchSize(maxBatchSize), + batch.WithMessageBuffer(bufferSize), + batch.WithBatchInterval(sendInterval), + batch.WithMaxPublishTimeout(sendTimeout), + batch.WithShutdownTimeout(drainTimeout), + batch.WithMaxConcurrentSends(maxConcurrentSends), + batch.WithEventClone(false), + ) + if err != nil { + return nil, fmt.Errorf("failed to create batch client: %w", err) + } + e := &ChipIngressBatchEmitter{ - client: client, - workers: make(map[string]*chipIngressEmitterWorker), - bufferSize: bufferSize, - maxBatchSize: maxBatchSize, - maxWorkers: maxWorkers, - maxConcurrentSends: maxConcurrentSends, - sendInterval: sendInterval, - sendTimeout: sendTimeout, - drainTimeout: drainTimeout, - metrics: metrics, + batchClient: batchClient, + metrics: metrics, } e.Service, e.eng = services.Config{ Name: "ChipIngressBatchEmitter", }.NewServiceEngine(lggr) + e.eng.Go(func(ctx context.Context) { + batchClient.Start(ctx) + <-ctx.Done() + batchClient.Stop() + }) + return e, nil } -// Emit queues an event for batched delivery. It returns immediately without blocking. -// If the worker's buffer is full, the event is silently dropped (metric bumped). -// Returns an error only if the emitter is closed or the context is cancelled. +// Emit queues an event for batched delivery without blocking. +// Returns an error if the emitter is stopped or the context is cancelled. +// If the buffer is full, the event is silently dropped. func (e *ChipIngressBatchEmitter) Emit(ctx context.Context, body []byte, attrKVs ...any) error { return e.emitInternal(ctx, body, nil, attrKVs...) } -// EmitWithCallback works like Emit but accepts a callback that is invoked once -// the event's fate is determined: nil on successful PublishBatch delivery, or a -// non-nil error on send failure or buffer-full drop. -// -// Contract: -// - If EmitWithCallback returns a non-nil error, the callback will NOT be invoked. -// The caller should handle the error from the return value. -// - If EmitWithCallback returns nil, the callback is guaranteed to be invoked -// exactly once — either asynchronously (after the batch is sent) or -// synchronously (if the event was dropped at enqueue time). +// EmitWithCallback works like Emit but invokes callback once the event's fate +// is determined (nil on success, non-nil on failure or buffer-full drop). // -// Callers can use this for "synchronous" emission: -// -// done := make(chan error, 1) -// if err := emitter.EmitWithCallback(ctx, body, func(err error) { done <- err }, attrKVs...); err != nil { -// return err // callback will not fire -// } -// err := <-done // safe — callback will fire exactly once +// If EmitWithCallback returns a non-nil error, the callback will NOT be invoked. +// If it returns nil, the callback is guaranteed to fire exactly once. func (e *ChipIngressBatchEmitter) EmitWithCallback(ctx context.Context, body []byte, callback func(error), attrKVs ...any) error { return e.emitInternal(ctx, body, callback, attrKVs...) } @@ -147,14 +126,6 @@ func (e *ChipIngressBatchEmitter) emitInternal(ctx context.Context, body []byte, attributes := newAttributes(attrKVs...) - worker := e.findOrCreateWorker(domain, entity) - if worker == nil { - if callback != nil { - callback(fmt.Errorf("max workers reached, event dropped")) - } - return nil - } - event, err := chipingress.NewEvent(domain, entity, body, attributes) if err != nil { return fmt.Errorf("failed to create CloudEvent: %w", err) @@ -168,18 +139,20 @@ func (e *ChipIngressBatchEmitter) emitInternal(ctx context.Context, body []byte, return err } - queueErr := worker.batchClient.QueueMessage(eventPb, func(sendErr error) { + metricAttrs := e.metricAttrsFor(domain, entity) + + queueErr := e.batchClient.QueueMessage(eventPb, func(sendErr error) { if sendErr != nil { - e.metrics.eventsDropped.Add(context.Background(), 1, worker.metricAttrs) + e.metrics.eventsDropped.Add(context.Background(), 1, metricAttrs) } else { - e.metrics.eventsSent.Add(context.Background(), 1, worker.metricAttrs) + e.metrics.eventsSent.Add(context.Background(), 1, metricAttrs) } if callback != nil { callback(sendErr) } }) if queueErr != nil { - e.metrics.eventsDropped.Add(context.Background(), 1, worker.metricAttrs) + e.metrics.eventsDropped.Add(context.Background(), 1, metricAttrs) if callback != nil { callback(queueErr) } @@ -189,59 +162,17 @@ func (e *ChipIngressBatchEmitter) emitInternal(ctx context.Context, body []byte, }) } -// findOrCreateWorker returns the worker for the given (domain, entity) pair, -// creating one backed by a new batch.Client if it doesn't exist. -func (e *ChipIngressBatchEmitter) findOrCreateWorker(domain, entity string) *chipIngressEmitterWorker { - workerKey := domain + "\x00" + entity - - e.workersMutex.RLock() - worker, found := e.workers[workerKey] - e.workersMutex.RUnlock() - - if found { - return worker - } - - e.workersMutex.Lock() - defer e.workersMutex.Unlock() - - if worker, found = e.workers[workerKey]; found { - return worker - } - - if len(e.workers) >= e.maxWorkers { - e.eng.Warnf("chip ingress batch emitter: max workers (%d) reached, dropping event for %s/%s", e.maxWorkers, domain, entity) - e.metrics.eventsDropped.Add(context.Background(), 1, otelmetric.WithAttributeSet(attribute.NewSet( - attribute.String("domain", domain), - attribute.String("entity", entity), - ))) - return nil - } - - batchClient, err := batch.NewBatchClient(e.client, - batch.WithBatchSize(e.maxBatchSize), - batch.WithMessageBuffer(e.bufferSize), - batch.WithBatchInterval(e.sendInterval), - batch.WithMaxPublishTimeout(e.sendTimeout), - batch.WithShutdownTimeout(e.drainTimeout), - batch.WithMaxConcurrentSends(e.maxConcurrentSends), - batch.WithEventClone(false), - ) - if err != nil { - e.eng.Errorf("chip ingress batch emitter: failed to create batch client for %s: %v", workerKey, err) - return nil - } - - worker = newChipIngressEmitterWorker(batchClient, domain, entity) - - e.eng.Go(func(ctx context.Context) { - worker.batchClient.Start(ctx) - <-ctx.Done() - worker.batchClient.Stop() - }) - - e.workers[workerKey] = worker - return worker +func (e *ChipIngressBatchEmitter) metricAttrsFor(domain, entity string) otelmetric.MeasurementOption { + key := domain + "\x00" + entity + if v, ok := e.metricAttrsCache.Load(key); ok { + return v.(otelmetric.MeasurementOption) + } + attrs := otelmetric.WithAttributeSet(attribute.NewSet( + attribute.String("domain", domain), + attribute.String("entity", entity), + )) + v, _ := e.metricAttrsCache.LoadOrStore(key, attrs) + return v.(otelmetric.MeasurementOption) } func newBatchEmitterMetrics(meter otelmetric.Meter) (batchEmitterMetrics, error) { diff --git a/pkg/beholder/chip_ingress_batch_emitter_test.go b/pkg/beholder/chip_ingress_batch_emitter_test.go index 0e51b808e6..45cd6a2dfe 100644 --- a/pkg/beholder/chip_ingress_batch_emitter_test.go +++ b/pkg/beholder/chip_ingress_batch_emitter_test.go @@ -62,35 +62,6 @@ func TestChipIngressBatchEmitter_Emit(t *testing.T) { assert.Error(t, err) }) - t.Run("enqueues and does not call PublishBatch immediately", func(t *testing.T) { - clientMock := mocks.NewClient(t) - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Return(nil, nil). - Maybe() - - cfg := newTestConfig() - cfg.ChipIngressSendInterval = 10 * time.Second - cfg.ChipIngressMaxBatchSize = 100 - - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) - require.NoError(t, err) - require.NoError(t, emitter.Start(t.Context())) - defer emitter.Close() //nolint:errcheck - - err = emitter.Emit(t.Context(), []byte("body"), - beholder.AttrKeyDomain, "test-domain", - beholder.AttrKeyEntity, "test-entity", - ) - require.NoError(t, err) - - // With a large batch size and long interval, PublishBatch should not fire yet - time.Sleep(100 * time.Millisecond) - clientMock.AssertNotCalled(t, "PublishBatch", mock.Anything, mock.Anything) - }) -} - -func TestChipIngressBatchEmitter_BatchAssembly(t *testing.T) { t.Run("events are batched and sent via PublishBatch", func(t *testing.T) { clientMock := mocks.NewClient(t) @@ -121,7 +92,6 @@ func TestChipIngressBatchEmitter_BatchAssembly(t *testing.T) { require.NoError(t, err) } - // Wait for flush to occur assert.Eventually(t, func() bool { mu.Lock() defer mu.Unlock() @@ -141,651 +111,173 @@ func TestChipIngressBatchEmitter_BatchAssembly(t *testing.T) { }) } -func TestChipIngressBatchEmitter_MaxBatchSize(t *testing.T) { - t.Run("batch is capped at maxBatchSize", func(t *testing.T) { - clientMock := mocks.NewClient(t) - - var mu sync.Mutex - var batchSizes []int - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Run(func(args mock.Arguments) { - mu.Lock() - defer mu.Unlock() - batch := args.Get(1).(*chipingress.CloudEventBatch) - batchSizes = append(batchSizes, len(batch.Events)) - }). - Return(nil, nil) - - cfg := newTestConfig() - cfg.ChipIngressBufferSize = 20 - cfg.ChipIngressMaxBatchSize = 3 - cfg.ChipIngressSendInterval = 50 * time.Millisecond - - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) - require.NoError(t, err) - require.NoError(t, emitter.Start(t.Context())) - - for i := 0; i < 7; i++ { - err = emitter.Emit(t.Context(), []byte("body"), - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "TestEvent", - ) - require.NoError(t, err) - } +func TestChipIngressBatchEmitter_CloudEventFormat(t *testing.T) { + clientMock := mocks.NewClient(t) - // Wait for all events to be flushed - assert.Eventually(t, func() bool { + var mu sync.Mutex + var receivedBatch *chipingress.CloudEventBatch + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(args mock.Arguments) { mu.Lock() defer mu.Unlock() - total := 0 - for _, s := range batchSizes { - total += s - } - return total >= 7 - }, 2*time.Second, 10*time.Millisecond) + receivedBatch = args.Get(1).(*chipingress.CloudEventBatch) + }). + Return(nil, nil) - require.NoError(t, emitter.Close()) + cfg := newTestConfig() + cfg.ChipIngressSendInterval = 50 * time.Millisecond - mu.Lock() - defer mu.Unlock() - for _, size := range batchSizes { - assert.LessOrEqual(t, size, 3, "batch size should not exceed maxBatchSize") - } - }) -} - -func TestChipIngressBatchEmitter_PerDomainEntityIsolation(t *testing.T) { - t.Run("separate workers for different domain/entity pairs", func(t *testing.T) { - clientMock := mocks.NewClient(t) - - var mu sync.Mutex - domainEntitySeen := make(map[string]int) - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Run(func(args mock.Arguments) { - mu.Lock() - defer mu.Unlock() - batch := args.Get(1).(*chipingress.CloudEventBatch) - for _, event := range batch.Events { - key := event.Source + "/" + event.Type - domainEntitySeen[key] += 1 - } - }). - Return(nil, nil) - - cfg := newTestConfig() - cfg.ChipIngressSendInterval = 50 * time.Millisecond - - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) - require.NoError(t, err) - require.NoError(t, emitter.Start(t.Context())) - - // Emit events for two different domain/entity pairs - for i := 0; i < 3; i++ { - err = emitter.Emit(t.Context(), []byte("workflow"), - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "WorkflowEvent", - ) - require.NoError(t, err) - } - for i := 0; i < 2; i++ { - err = emitter.Emit(t.Context(), []byte("bridge"), - beholder.AttrKeyDomain, "data-feeds", - beholder.AttrKeyEntity, "BridgeStatus", - ) - require.NoError(t, err) - } - - // Wait for both to be flushed - assert.Eventually(t, func() bool { - mu.Lock() - defer mu.Unlock() - return domainEntitySeen["platform/WorkflowEvent"] >= 3 && - domainEntitySeen["data-feeds/BridgeStatus"] >= 2 - }, 2*time.Second, 10*time.Millisecond) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) - require.NoError(t, emitter.Close()) + err = emitter.Emit(t.Context(), []byte("test-payload"), + beholder.AttrKeyDomain, "my-domain", + beholder.AttrKeyEntity, "my-entity", + ) + require.NoError(t, err) + assert.Eventually(t, func() bool { mu.Lock() defer mu.Unlock() - assert.Equal(t, 3, domainEntitySeen["platform/WorkflowEvent"]) - assert.Equal(t, 2, domainEntitySeen["data-feeds/BridgeStatus"]) - }) -} - -func TestChipIngressBatchEmitter_BufferFull(t *testing.T) { - t.Run("events are dropped when buffer is full", func(t *testing.T) { - clientMock := mocks.NewClient(t) - - // Block PublishBatch so the batcher's send pipeline backs up, - // eventually filling the message buffer channel. - sendBlocked := make(chan struct{}) - firstCallSignal := make(chan struct{}, 1) - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Run(func(_ mock.Arguments) { - select { - case firstCallSignal <- struct{}{}: - default: - } - <-sendBlocked - }). - Return(nil, nil). - Maybe() - - cfg := newTestConfig() - cfg.ChipIngressBufferSize = 2 - cfg.ChipIngressMaxBatchSize = 1 - cfg.ChipIngressMaxConcurrentSends = 1 - cfg.ChipIngressSendInterval = 50 * time.Millisecond - cfg.ChipIngressDrainTimeout = 200 * time.Millisecond - - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) - require.NoError(t, err) - require.NoError(t, emitter.Start(t.Context())) - defer close(sendBlocked) - defer emitter.Close() //nolint:errcheck - - // First event triggers a send that blocks, exhausting the semaphore - err = emitter.Emit(t.Context(), []byte("body"), - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "TestEvent", - ) - require.NoError(t, err) - - // Wait for the batcher to process the event and block on PublishBatch - <-firstCallSignal - // Give batcher time to read the next event and block on the semaphore - time.Sleep(100 * time.Millisecond) - - // Flood the buffer — Emit should never return an error - for i := 0; i < 10; i++ { - err = emitter.Emit(t.Context(), []byte("overflow"), - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "TestEvent", - ) - assert.NoError(t, err, "Emit should not return error even when dropping events") - } - }) -} - -func TestChipIngressBatchEmitter_Lifecycle(t *testing.T) { - t.Run("start and close cleanly", func(t *testing.T) { - clientMock := mocks.NewClient(t) - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Return(nil, nil). - Maybe() - - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestConfig(), newTestLogger(t)) - require.NoError(t, err) + return receivedBatch != nil + }, 2*time.Second, 10*time.Millisecond) - require.NoError(t, emitter.Start(t.Context())) + require.NoError(t, emitter.Close()) - // Emit a few events to create workers - for i := 0; i < 3; i++ { - err = emitter.Emit(t.Context(), []byte("body"), - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "TestEvent", - ) - require.NoError(t, err) - } + mu.Lock() + defer mu.Unlock() + require.Len(t, receivedBatch.Events, 1) - // Close should not hang or error - require.NoError(t, emitter.Close()) - }) -} - -func TestChipIngressBatchEmitter_CloudEventFormat(t *testing.T) { - t.Run("CloudEvents have correct source, type, and data", func(t *testing.T) { - clientMock := mocks.NewClient(t) - - var mu sync.Mutex - var receivedBatch *chipingress.CloudEventBatch - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Run(func(args mock.Arguments) { - mu.Lock() - defer mu.Unlock() - receivedBatch = args.Get(1).(*chipingress.CloudEventBatch) - }). - Return(nil, nil) - - cfg := newTestConfig() - cfg.ChipIngressSendInterval = 50 * time.Millisecond - - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) - require.NoError(t, err) - require.NoError(t, emitter.Start(t.Context())) - - err = emitter.Emit(t.Context(), []byte("test-payload"), - beholder.AttrKeyDomain, "my-domain", - beholder.AttrKeyEntity, "my-entity", - ) - require.NoError(t, err) - - assert.Eventually(t, func() bool { - mu.Lock() - defer mu.Unlock() - return receivedBatch != nil - }, 2*time.Second, 10*time.Millisecond) - - require.NoError(t, emitter.Close()) - - mu.Lock() - defer mu.Unlock() - require.Len(t, receivedBatch.Events, 1) - - event := receivedBatch.Events[0] - assert.Equal(t, "my-domain", event.Source) - assert.Equal(t, "my-entity", event.Type) - assert.NotEmpty(t, event.Id) - }) + event := receivedBatch.Events[0] + assert.Equal(t, "my-domain", event.Source) + assert.Equal(t, "my-entity", event.Type) + assert.NotEmpty(t, event.Id) } func TestChipIngressBatchEmitter_PublishBatchError(t *testing.T) { - t.Run("PublishBatch error is handled gracefully", func(t *testing.T) { - clientMock := mocks.NewClient(t) + clientMock := mocks.NewClient(t) - var mu sync.Mutex - callCount := 0 - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Run(func(_ mock.Arguments) { - mu.Lock() - defer mu.Unlock() - callCount++ - }). - Return(nil, assert.AnError) - - cfg := newTestConfig() - cfg.ChipIngressSendInterval = 50 * time.Millisecond - - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) - require.NoError(t, err) - require.NoError(t, emitter.Start(t.Context())) - - for i := 0; i < 3; i++ { - err = emitter.Emit(t.Context(), []byte("body"), - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "TestEvent", - ) - require.NoError(t, err) - } - - assert.Eventually(t, func() bool { + var mu sync.Mutex + callCount := 0 + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(_ mock.Arguments) { mu.Lock() defer mu.Unlock() - return callCount > 0 - }, 2*time.Second, 10*time.Millisecond) + callCount++ + }). + Return(nil, assert.AnError) - require.NoError(t, emitter.Close()) - }) -} - -func TestChipIngressBatchEmitter_ContextCancellation(t *testing.T) { - t.Run("Emit returns context error when context is cancelled", func(t *testing.T) { - clientMock := mocks.NewClient(t) - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Return(nil, nil). - Maybe() - - cfg := newTestConfig() - cfg.ChipIngressBufferSize = 1 - cfg.ChipIngressSendInterval = 10 * time.Second - - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) - require.NoError(t, err) - require.NoError(t, emitter.Start(t.Context())) - defer emitter.Close() //nolint:errcheck + cfg := newTestConfig() + cfg.ChipIngressSendInterval = 50 * time.Millisecond - ctx, cancel := context.WithCancel(t.Context()) - cancel() - - err = emitter.Emit(ctx, []byte("should-fail"), - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "TestEvent", - ) - assert.ErrorIs(t, err, context.Canceled) - }) -} - -func TestChipIngressBatchEmitter_DefaultConfig(t *testing.T) { - t.Run("zero config uses sane defaults", func(t *testing.T) { - clientMock := mocks.NewClient(t) - - var mu sync.Mutex - var receivedBatch *chipingress.CloudEventBatch - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Run(func(args mock.Arguments) { - mu.Lock() - defer mu.Unlock() - receivedBatch = args.Get(1).(*chipingress.CloudEventBatch) - }). - Return(nil, nil) - - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, beholder.Config{}, newTestLogger(t)) - require.NoError(t, err) - require.NoError(t, emitter.Start(t.Context())) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + for i := 0; i < 3; i++ { err = emitter.Emit(t.Context(), []byte("body"), beholder.AttrKeyDomain, "platform", beholder.AttrKeyEntity, "TestEvent", ) require.NoError(t, err) + } - // Default send interval is 100ms; wait for flush - assert.Eventually(t, func() bool { - mu.Lock() - defer mu.Unlock() - return receivedBatch != nil - }, 3*time.Second, 50*time.Millisecond) - - require.NoError(t, emitter.Close()) - + assert.Eventually(t, func() bool { mu.Lock() defer mu.Unlock() - require.Len(t, receivedBatch.Events, 1) - }) -} - -func TestChipIngressBatchEmitter_GracefulDrain(t *testing.T) { - t.Run("flushes buffered events on close", func(t *testing.T) { - clientMock := mocks.NewClient(t) - - var mu sync.Mutex - totalEventsSent := 0 - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Run(func(args mock.Arguments) { - mu.Lock() - defer mu.Unlock() - batch := args.Get(1).(*chipingress.CloudEventBatch) - totalEventsSent += len(batch.Events) - }). - Return(nil, nil) - - cfg := beholder.Config{ - ChipIngressBufferSize: 20, - ChipIngressMaxBatchSize: 10, - ChipIngressMaxConcurrentSends: 3, - ChipIngressSendInterval: 1 * time.Hour, - ChipIngressSendTimeout: 5 * time.Second, - ChipIngressDrainTimeout: 5 * time.Second, - } + return callCount > 0 + }, 2*time.Second, 10*time.Millisecond) - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) - require.NoError(t, err) - require.NoError(t, emitter.Start(t.Context())) - - for i := 0; i < 5; i++ { - err = emitter.Emit(t.Context(), []byte("body"), - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "TestEvent", - ) - require.NoError(t, err) - } - - // Give the batcher time to read events from the channel into its internal batch - time.Sleep(50 * time.Millisecond) - - // Events are buffered but no tick has fired. Close should drain them. - require.NoError(t, emitter.Close()) - - mu.Lock() - defer mu.Unlock() - assert.Equal(t, 5, totalEventsSent, "all buffered events should be drained on close") - }) + require.NoError(t, emitter.Close()) } -func TestChipIngressBatchEmitter_DrainMultipleDomains(t *testing.T) { - t.Run("drains events from all workers on close", func(t *testing.T) { - clientMock := mocks.NewClient(t) - - var mu sync.Mutex - domainEntitySent := make(map[string]int) - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Run(func(args mock.Arguments) { - mu.Lock() - defer mu.Unlock() - batch := args.Get(1).(*chipingress.CloudEventBatch) - for _, event := range batch.Events { - key := event.Source + "/" + event.Type - domainEntitySent[key] += 1 - } - }). - Return(nil, nil) - - cfg := beholder.Config{ - ChipIngressBufferSize: 20, - ChipIngressMaxBatchSize: 10, - ChipIngressMaxConcurrentSends: 3, - ChipIngressSendInterval: 1 * time.Hour, - ChipIngressSendTimeout: 5 * time.Second, - ChipIngressDrainTimeout: 5 * time.Second, - } +func TestChipIngressBatchEmitter_ContextCancellation(t *testing.T) { + clientMock := mocks.NewClient(t) + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Return(nil, nil). + Maybe() - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) - require.NoError(t, err) - require.NoError(t, emitter.Start(t.Context())) + cfg := newTestConfig() + cfg.ChipIngressBufferSize = 1 + cfg.ChipIngressSendInterval = 10 * time.Second - for i := 0; i < 3; i++ { - err = emitter.Emit(t.Context(), []byte("workflow"), - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "WorkflowEvent", - ) - require.NoError(t, err) - } - for i := 0; i < 2; i++ { - err = emitter.Emit(t.Context(), []byte("bridge"), - beholder.AttrKeyDomain, "data-feeds", - beholder.AttrKeyEntity, "BridgeStatus", - ) - require.NoError(t, err) - } + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + defer emitter.Close() //nolint:errcheck - // Give batchers time to read events from their channels - time.Sleep(50 * time.Millisecond) + ctx, cancel := context.WithCancel(t.Context()) + cancel() - require.NoError(t, emitter.Close()) - - mu.Lock() - defer mu.Unlock() - assert.Equal(t, 3, domainEntitySent["platform/WorkflowEvent"]) - assert.Equal(t, 2, domainEntitySent["data-feeds/BridgeStatus"]) - }) + err = emitter.Emit(ctx, []byte("should-fail"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + assert.ErrorIs(t, err, context.Canceled) } -func TestChipIngressBatchEmitter_DrainPublishBatchFailure(t *testing.T) { - t.Run("handles batch failure and continues sending", func(t *testing.T) { - clientMock := mocks.NewClient(t) - - var mu sync.Mutex - callCount := 0 - totalEventsSent := 0 - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Run(func(args mock.Arguments) { - mu.Lock() - defer mu.Unlock() - callCount++ - batch := args.Get(1).(*chipingress.CloudEventBatch) - if callCount == 1 { - return // first call fails - } - totalEventsSent += len(batch.Events) - }). - Return(nil, assert.AnError).Once() - - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Run(func(args mock.Arguments) { - mu.Lock() - defer mu.Unlock() - callCount++ - batch := args.Get(1).(*chipingress.CloudEventBatch) - totalEventsSent += len(batch.Events) - }). - Return(nil, nil) - - cfg := beholder.Config{ - ChipIngressBufferSize: 20, - ChipIngressMaxBatchSize: 3, - ChipIngressMaxConcurrentSends: 3, - ChipIngressSendInterval: 1 * time.Hour, - ChipIngressSendTimeout: 5 * time.Second, - ChipIngressDrainTimeout: 5 * time.Second, - } - - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) - require.NoError(t, err) - require.NoError(t, emitter.Start(t.Context())) +func TestChipIngressBatchEmitter_DefaultConfig(t *testing.T) { + clientMock := mocks.NewClient(t) - // Emit 6 events with maxBatchSize=3 => 2 batches via size trigger - for i := 0; i < 6; i++ { - err = emitter.Emit(t.Context(), []byte("body"), - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "TestEvent", - ) - require.NoError(t, err) - } + var mu sync.Mutex + var receivedBatch *chipingress.CloudEventBatch + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Run(func(args mock.Arguments) { + mu.Lock() + defer mu.Unlock() + receivedBatch = args.Get(1).(*chipingress.CloudEventBatch) + }). + Return(nil, nil) - // Give the batcher time to read events and trigger size-based sends - time.Sleep(100 * time.Millisecond) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, beholder.Config{}, newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) - require.NoError(t, emitter.Close()) + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + require.NoError(t, err) + assert.Eventually(t, func() bool { mu.Lock() defer mu.Unlock() - assert.GreaterOrEqual(t, callCount, 2, "should have attempted at least 2 batches") - assert.Equal(t, 3, totalEventsSent, "second batch should have succeeded despite first batch failure") - }) -} - -func TestChipIngressBatchEmitter_DrainTimeout(t *testing.T) { - t.Run("close returns promptly when drain timeout expires", func(t *testing.T) { - clientMock := mocks.NewClient(t) - - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Run(func(args mock.Arguments) { - ctx := args.Get(0).(context.Context) - <-ctx.Done() // simulate a slow server that only returns on context cancellation - }). - Return(nil, context.DeadlineExceeded). - Maybe() - - cfg := beholder.Config{ - ChipIngressBufferSize: 20, - ChipIngressMaxBatchSize: 10, - ChipIngressMaxConcurrentSends: 3, - ChipIngressSendInterval: 1 * time.Hour, - ChipIngressSendTimeout: 10 * time.Second, - ChipIngressDrainTimeout: 200 * time.Millisecond, - } - - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) - require.NoError(t, err) - require.NoError(t, emitter.Start(t.Context())) - - for i := 0; i < 5; i++ { - err = emitter.Emit(t.Context(), []byte("body"), - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "TestEvent", - ) - require.NoError(t, err) - } - - // Give the batcher time to read events - time.Sleep(50 * time.Millisecond) - - closeDone := make(chan error, 1) - go func() { - closeDone <- emitter.Close() - }() - - select { - case err := <-closeDone: - assert.NoError(t, err, "close should not error") - case <-time.After(5 * time.Second): - t.Fatal("Close() did not return within 5s; drain timeout is not working") - } - }) -} - -func TestChipIngressBatchEmitter_MaxWorkersCap(t *testing.T) { - t.Run("drops events when max workers reached", func(t *testing.T) { - clientMock := mocks.NewClient(t) - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Return(nil, nil). - Maybe() - - cfg := newTestConfig() - cfg.ChipIngressMaxWorkers = 2 - cfg.ChipIngressSendInterval = 50 * time.Millisecond - - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) - require.NoError(t, err) - require.NoError(t, emitter.Start(t.Context())) - defer emitter.Close() //nolint:errcheck - - // Create 2 workers (at the cap) - err = emitter.Emit(t.Context(), []byte("body"), - beholder.AttrKeyDomain, "domain-1", - beholder.AttrKeyEntity, "entity-1", - ) - require.NoError(t, err) + return receivedBatch != nil + }, 3*time.Second, 50*time.Millisecond) - err = emitter.Emit(t.Context(), []byte("body"), - beholder.AttrKeyDomain, "domain-2", - beholder.AttrKeyEntity, "entity-2", - ) - require.NoError(t, err) + require.NoError(t, emitter.Close()) - // 3rd unique pair should be silently dropped (no error, no worker created) - err = emitter.Emit(t.Context(), []byte("body"), - beholder.AttrKeyDomain, "domain-3", - beholder.AttrKeyEntity, "entity-3", - ) - assert.NoError(t, err, "Emit should not error when max workers is reached") - - // Existing workers should still accept events - err = emitter.Emit(t.Context(), []byte("body"), - beholder.AttrKeyDomain, "domain-1", - beholder.AttrKeyEntity, "entity-1", - ) - assert.NoError(t, err, "Emit to existing worker should still work") - }) + mu.Lock() + defer mu.Unlock() + require.Len(t, receivedBatch.Events, 1) } func TestChipIngressBatchEmitter_EmitAfterClose(t *testing.T) { - t.Run("Emit after Close returns error", func(t *testing.T) { - clientMock := mocks.NewClient(t) - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Return(nil, nil). - Maybe() - - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestConfig(), newTestLogger(t)) - require.NoError(t, err) - require.NoError(t, emitter.Start(t.Context())) - require.NoError(t, emitter.Close()) + clientMock := mocks.NewClient(t) + clientMock. + On("PublishBatch", mock.Anything, mock.Anything). + Return(nil, nil). + Maybe() - err = emitter.Emit(t.Context(), []byte("body"), - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "TestEvent", - ) - assert.Error(t, err, "Emit after Close should return an error") - }) + emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestConfig(), newTestLogger(t)) + require.NoError(t, err) + require.NoError(t, emitter.Start(t.Context())) + require.NoError(t, emitter.Close()) + + err = emitter.Emit(t.Context(), []byte("body"), + beholder.AttrKeyDomain, "platform", + beholder.AttrKeyEntity, "TestEvent", + ) + assert.Error(t, err) } func TestChipIngressBatchEmitter_EmitWithCallback(t *testing.T) { - t.Run("callback receives nil on successful send", func(t *testing.T) { + t.Run("callback receives nil on success", func(t *testing.T) { clientMock := mocks.NewClient(t) clientMock. On("PublishBatch", mock.Anything, mock.Anything). @@ -809,7 +301,7 @@ func TestChipIngressBatchEmitter_EmitWithCallback(t *testing.T) { select { case sendErr := <-done: - assert.NoError(t, sendErr, "callback should receive nil on success") + assert.NoError(t, sendErr) case <-time.After(3 * time.Second): t.Fatal("callback was not invoked within timeout") } @@ -841,7 +333,7 @@ func TestChipIngressBatchEmitter_EmitWithCallback(t *testing.T) { select { case sendErr := <-done: - assert.Error(t, sendErr, "callback should receive error on failure") + assert.Error(t, sendErr) case <-time.After(3 * time.Second): t.Fatal("callback was not invoked within timeout") } @@ -879,19 +371,15 @@ func TestChipIngressBatchEmitter_EmitWithCallback(t *testing.T) { defer close(sendBlocked) defer emitter.Close() //nolint:errcheck - // First event triggers a send that blocks, exhausting the semaphore err = emitter.Emit(t.Context(), []byte("body"), beholder.AttrKeyDomain, "platform", beholder.AttrKeyEntity, "TestEvent", ) require.NoError(t, err) - // Wait for PublishBatch to be called and blocked <-firstCallSignal - // Give batcher time to read the next event and block on the semaphore time.Sleep(100 * time.Millisecond) - // Flood the buffer (size 2) so it becomes full for i := 0; i < 10; i++ { _ = emitter.Emit(t.Context(), []byte("filler"), beholder.AttrKeyDomain, "platform", @@ -899,7 +387,6 @@ func TestChipIngressBatchEmitter_EmitWithCallback(t *testing.T) { ) } - // Buffer is full — callback should be invoked synchronously with an error dropped := make(chan error, 1) err = emitter.EmitWithCallback(t.Context(), []byte("overflow"), func(sendErr error) { dropped <- sendErr @@ -907,49 +394,16 @@ func TestChipIngressBatchEmitter_EmitWithCallback(t *testing.T) { beholder.AttrKeyDomain, "platform", beholder.AttrKeyEntity, "TestEvent", ) - assert.NoError(t, err, "Emit should not return an error even when dropping") + assert.NoError(t, err) select { case dropErr := <-dropped: - assert.Error(t, dropErr, "callback should receive an error when buffer is full") + assert.Error(t, dropErr) case <-time.After(time.Second): t.Fatal("callback was not invoked for dropped event") } }) - t.Run("synchronous emit pattern works end-to-end", func(t *testing.T) { - clientMock := mocks.NewClient(t) - clientMock. - On("PublishBatch", mock.Anything, mock.Anything). - Return(nil, nil) - - cfg := newTestConfig() - cfg.ChipIngressSendInterval = 50 * time.Millisecond - - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) - require.NoError(t, err) - require.NoError(t, emitter.Start(t.Context())) - - done := make(chan error, 1) - err = emitter.EmitWithCallback(t.Context(), []byte("sync-body"), func(sendErr error) { - done <- sendErr - }, - beholder.AttrKeyDomain, "platform", - beholder.AttrKeyEntity, "TestEvent", - ) - require.NoError(t, err) - - // Block until the event is actually sent — the "synchronous" pattern - select { - case sendErr := <-done: - assert.NoError(t, sendErr) - case <-time.After(3 * time.Second): - t.Fatal("synchronous emit did not complete within timeout") - } - - require.NoError(t, emitter.Close()) - }) - t.Run("nil callback behaves like Emit", func(t *testing.T) { clientMock := mocks.NewClient(t) clientMock. diff --git a/pkg/beholder/chip_ingress_emitter_worker.go b/pkg/beholder/chip_ingress_emitter_worker.go deleted file mode 100644 index 5b964bca3c..0000000000 --- a/pkg/beholder/chip_ingress_emitter_worker.go +++ /dev/null @@ -1,27 +0,0 @@ -package beholder - -import ( - "go.opentelemetry.io/otel/attribute" - otelmetric "go.opentelemetry.io/otel/metric" - - "github.com/smartcontractkit/chainlink-common/pkg/chipingress/batch" -) - -type chipIngressEmitterWorker struct { - batchClient *batch.Client - metricAttrs otelmetric.MeasurementOption -} - -func newChipIngressEmitterWorker( - batchClient *batch.Client, - domain string, - entity string, -) *chipIngressEmitterWorker { - return &chipIngressEmitterWorker{ - batchClient: batchClient, - metricAttrs: otelmetric.WithAttributeSet(attribute.NewSet( - attribute.String("domain", domain), - attribute.String("entity", entity), - )), - } -} diff --git a/pkg/beholder/config.go b/pkg/beholder/config.go index bfce03c7cf..deeed041fd 100644 --- a/pkg/beholder/config.go +++ b/pkg/beholder/config.go @@ -48,13 +48,12 @@ type Config struct { // Chip Ingress Batch Emitter ChipIngressBatchEmitterEnabled bool // When true, use batch emitter; when false (default), use legacy per-event emitter - ChipIngressBufferSize uint // Per-worker message buffer size (default 1000) + ChipIngressBufferSize uint // Message buffer size (default 1000) ChipIngressMaxBatchSize uint // Max events per PublishBatch call (default 500) - ChipIngressMaxWorkers int // Max concurrent (domain, entity) workers (default 100) - ChipIngressSendInterval time.Duration // Flush interval per worker (default 100ms) + ChipIngressSendInterval time.Duration // Flush interval (default 100ms) ChipIngressSendTimeout time.Duration // Timeout per PublishBatch call (default 3s) ChipIngressDrainTimeout time.Duration // Max time to flush remaining events on shutdown (default 10s) - ChipIngressMaxConcurrentSends int // Max concurrent PublishBatch calls per worker (default 10) + ChipIngressMaxConcurrentSends int // Max concurrent PublishBatch calls (default 10) ChipIngressLogger logger.Logger // Required when ChipIngressBatchEmitterEnabled is true // OTel Log @@ -105,7 +104,6 @@ var defaultRetryConfig = RetryConfig{ const ( defaultPackageName = "beholder" - defaultMaxWorkers = 100 defaultMaxConcurrentSends = 10 ) @@ -152,7 +150,6 @@ func DefaultConfig() Config { ChipIngressBatchEmitterEnabled: false, ChipIngressBufferSize: 1000, ChipIngressMaxBatchSize: 500, - ChipIngressMaxWorkers: defaultMaxWorkers, ChipIngressSendInterval: 100 * time.Millisecond, ChipIngressSendTimeout: 3 * time.Second, ChipIngressDrainTimeout: 10 * time.Second, diff --git a/pkg/beholder/config_test.go b/pkg/beholder/config_test.go index 50410d0a39..80bef78d3a 100644 --- a/pkg/beholder/config_test.go +++ b/pkg/beholder/config_test.go @@ -67,6 +67,6 @@ func ExampleConfig() { } fmt.Printf("%+v\n", *config.LogRetryConfig) // Output: - // {InsecureConnection:true CACertFile: OtelExporterGRPCEndpoint:localhost:4317 OtelExporterHTTPEndpoint:localhost:4318 ResourceAttributes:[{Key:package_name Value:{vtype:4 numeric:0 stringly:beholder slice:}} {Key:sender Value:{vtype:4 numeric:0 stringly:beholderclient slice:}}] EmitterExportTimeout:1s EmitterExportInterval:1s EmitterExportMaxBatchSize:512 EmitterMaxQueueSize:2048 EmitterBatchProcessor:true TraceSampleRatio:1 TraceBatchTimeout:1s TraceSpanExporter: TraceRetryConfig: TraceCompressor:gzip MetricReaderInterval:1s MetricRetryConfig: MetricViews:[] MetricCompressor:gzip ChipIngressEmitterEnabled:false ChipIngressEmitterGRPCEndpoint: ChipIngressInsecureConnection:false ChipIngressBatchEmitterEnabled:false ChipIngressBufferSize:0 ChipIngressMaxBatchSize:0 ChipIngressMaxWorkers:0 ChipIngressSendInterval:0s ChipIngressSendTimeout:0s ChipIngressDrainTimeout:0s ChipIngressMaxConcurrentSends:0 ChipIngressLogger: LogExportTimeout:1s LogExportInterval:1s LogExportMaxBatchSize:512 LogMaxQueueSize:2048 LogBatchProcessor:true LogRetryConfig: LogStreamingEnabled:false LogLevel:info LogCompressor:gzip AuthHeaders:map[] AuthHeadersTTL:0s AuthKeySigner: AuthPublicKeyHex:} + // {InsecureConnection:true CACertFile: OtelExporterGRPCEndpoint:localhost:4317 OtelExporterHTTPEndpoint:localhost:4318 ResourceAttributes:[{Key:package_name Value:{vtype:4 numeric:0 stringly:beholder slice:}} {Key:sender Value:{vtype:4 numeric:0 stringly:beholderclient slice:}}] EmitterExportTimeout:1s EmitterExportInterval:1s EmitterExportMaxBatchSize:512 EmitterMaxQueueSize:2048 EmitterBatchProcessor:true TraceSampleRatio:1 TraceBatchTimeout:1s TraceSpanExporter: TraceRetryConfig: TraceCompressor:gzip MetricReaderInterval:1s MetricRetryConfig: MetricViews:[] MetricCompressor:gzip ChipIngressEmitterEnabled:false ChipIngressEmitterGRPCEndpoint: ChipIngressInsecureConnection:false ChipIngressBatchEmitterEnabled:false ChipIngressBufferSize:0 ChipIngressMaxBatchSize:0 ChipIngressSendInterval:0s ChipIngressSendTimeout:0s ChipIngressDrainTimeout:0s ChipIngressMaxConcurrentSends:0 ChipIngressLogger: LogExportTimeout:1s LogExportInterval:1s LogExportMaxBatchSize:512 LogMaxQueueSize:2048 LogBatchProcessor:true LogRetryConfig: LogStreamingEnabled:false LogLevel:info LogCompressor:gzip AuthHeaders:map[] AuthHeadersTTL:0s AuthKeySigner: AuthPublicKeyHex:} // {InitialInterval:5s MaxInterval:30s MaxElapsedTime:1m0s} } From 16aa7091d0e4eb590e18bab89e95feb98b51142f Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Mon, 16 Mar 2026 19:49:03 +0200 Subject: [PATCH 24/28] Use caller ctx --- pkg/beholder/chip_ingress_batch_emitter.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/beholder/chip_ingress_batch_emitter.go b/pkg/beholder/chip_ingress_batch_emitter.go index 2c28c11513..c3f21173e0 100644 --- a/pkg/beholder/chip_ingress_batch_emitter.go +++ b/pkg/beholder/chip_ingress_batch_emitter.go @@ -143,16 +143,16 @@ func (e *ChipIngressBatchEmitter) emitInternal(ctx context.Context, body []byte, queueErr := e.batchClient.QueueMessage(eventPb, func(sendErr error) { if sendErr != nil { - e.metrics.eventsDropped.Add(context.Background(), 1, metricAttrs) + e.metrics.eventsDropped.Add(ctx, 1, metricAttrs) } else { - e.metrics.eventsSent.Add(context.Background(), 1, metricAttrs) + e.metrics.eventsSent.Add(ctx, 1, metricAttrs) } if callback != nil { callback(sendErr) } }) if queueErr != nil { - e.metrics.eventsDropped.Add(context.Background(), 1, metricAttrs) + e.metrics.eventsDropped.Add(ctx, 1, metricAttrs) if callback != nil { callback(queueErr) } From 3048466c40b88c7d351777f8023ff35bf2887bfc Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Wed, 18 Mar 2026 15:23:53 +0200 Subject: [PATCH 25/28] Add batch emitter in services --- pkg/beholder/client.go | 33 +++++--- pkg/beholder/emit_only_adapter.go | 18 +++++ pkg/beholder/emit_only_adapter_test.go | 54 +++++++++++++ pkg/beholder/httpclient.go | 2 +- pkg/beholder/managed_services_test.go | 102 +++++++++++++++++++++++++ pkg/beholder/noop.go | 2 +- 6 files changed, 198 insertions(+), 13 deletions(-) create mode 100644 pkg/beholder/emit_only_adapter.go create mode 100644 pkg/beholder/emit_only_adapter_test.go create mode 100644 pkg/beholder/managed_services_test.go diff --git a/pkg/beholder/client.go b/pkg/beholder/client.go index 83f61c3a0e..2f695f758d 100644 --- a/pkg/beholder/client.go +++ b/pkg/beholder/client.go @@ -26,6 +26,7 @@ import ( "google.golang.org/grpc/credentials/insecure" "github.com/smartcontractkit/chainlink-common/pkg/chipingress" + "github.com/smartcontractkit/chainlink-common/pkg/services" ) const defaultGRPCCompressor = "gzip" @@ -60,6 +61,9 @@ type Client struct { // OnClose OnClose func() error + + // managedServices holds services started/stopped by the application, not by the Client. + managedServices []services.Service } // NewClient creates a new Client with initialized OpenTelemetry components @@ -232,10 +236,7 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro if err != nil { return nil, fmt.Errorf("failed to create chip ingress batch emitter: %w", err) } - if err = batchEmitterService.Start(context.Background()); err != nil { - return nil, fmt.Errorf("failed to start chip ingress batch emitter: %w", err) - } - chipIngressEmitter = batchEmitterService + chipIngressEmitter = &emitOnlyAdapter{batchEmitterService} } else { chipIngressEmitter, err = NewChipIngressEmitter(chipIngressClient) if err != nil { @@ -245,9 +246,6 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro emitter, err = NewDualSourceEmitter(chipIngressEmitter, emitter, cfg.ChipIngressBatchEmitterEnabled) if err != nil { - if batchEmitterService != nil { - _ = batchEmitterService.Close() - } return nil, fmt.Errorf("failed to create dual source emitter: %w", err) } } @@ -258,12 +256,25 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro } return } - return &Client{cfg, logger, tracer, meter, emitter, chipIngressClient, loggerProvider, tracerProvider, meterProvider, messageLoggerProvider, signer, onClose}, nil + var managed []services.Service + if batchEmitterService != nil { + managed = append(managed, batchEmitterService) + } + + return &Client{cfg, logger, tracer, meter, emitter, chipIngressClient, loggerProvider, tracerProvider, meterProvider, messageLoggerProvider, signer, onClose, managed}, nil +} + +// ManagedServices returns services whose lifecycle is owned by the application +// (start, stop, health). Returns nil when the receiver is nil or has none. +func (c *Client) ManagedServices() []services.Service { + if c == nil { + return nil + } + return c.managedServices } -// Closes all providers, flushes all data and stops all background processes. -// Order matters: emitter is closed first so the batch emitter can drain -// buffered events while the gRPC connection is still alive. +// Close shuts down OTel providers and the chip ingress connection. +// It does NOT close managed services — those are closed by the application. func (c Client) Close() (err error) { if c.Emitter != nil { err = errors.Join(err, c.Emitter.Close()) diff --git a/pkg/beholder/emit_only_adapter.go b/pkg/beholder/emit_only_adapter.go new file mode 100644 index 0000000000..3b2ac3d736 --- /dev/null +++ b/pkg/beholder/emit_only_adapter.go @@ -0,0 +1,18 @@ +package beholder + +import "context" + +// emitOnlyAdapter wraps a ChipIngressBatchEmitter as an Emitter but with a +// no-op Close. This decouples emission (used by DualSourceEmitter) from +// lifecycle management (owned by the application's service list). +type emitOnlyAdapter struct { + e *ChipIngressBatchEmitter +} + +func (a *emitOnlyAdapter) Emit(ctx context.Context, body []byte, attrKVs ...any) error { + return a.e.Emit(ctx, body, attrKVs...) +} + +func (a *emitOnlyAdapter) Close() error { + return nil // lifecycle is managed externally +} diff --git a/pkg/beholder/emit_only_adapter_test.go b/pkg/beholder/emit_only_adapter_test.go new file mode 100644 index 0000000000..c81c168cc7 --- /dev/null +++ b/pkg/beholder/emit_only_adapter_test.go @@ -0,0 +1,54 @@ +package beholder_test + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/smartcontractkit/chainlink-common/pkg/beholder" +) + +func TestEmitOnlyAdapter_CloseIsNoOp(t *testing.T) { + // Create a client with batch enabled, get the emitter, and verify + // that closing the client (which closes the DualSourceEmitter which + // closes the emitOnlyAdapter) does not close the batch emitter service. + lggr := newTestLogger(t) + + client, err := beholder.NewClient(beholder.Config{ + OtelExporterGRPCEndpoint: "localhost:4317", + ChipIngressEmitterEnabled: true, + ChipIngressEmitterGRPCEndpoint: "localhost:9090", + ChipIngressInsecureConnection: true, + ChipIngressBatchEmitterEnabled: true, + ChipIngressLogger: lggr, + ChipIngressBufferSize: 10, + ChipIngressMaxBatchSize: 5, + ChipIngressSendInterval: 50 * time.Millisecond, + ChipIngressSendTimeout: 1 * time.Second, + ChipIngressDrainTimeout: 1 * time.Second, + }) + require.NoError(t, err) + + managed := client.ManagedServices() + require.Len(t, managed, 1) + svc := managed[0] + + // Close the client (which closes the DualSourceEmitter → emitOnlyAdapter.Close → no-op). + // OTel provider flush may fail (no real endpoint) — we only care that it doesn't + // close the batch emitter service. + _ = client.Close() + + // The batch emitter service should still be startable (not already closed) + err = svc.Start(t.Context()) + require.NoError(t, err) + + // And closable + err = svc.Close() + require.NoError(t, err) + + // Second close should error (StopOnce) + err = svc.Close() + assert.Error(t, err, "second close should fail") +} diff --git a/pkg/beholder/httpclient.go b/pkg/beholder/httpclient.go index 693f581452..7bea69a424 100644 --- a/pkg/beholder/httpclient.go +++ b/pkg/beholder/httpclient.go @@ -187,7 +187,7 @@ func NewHTTPClient(cfg Config, otlploghttpNew otlploghttpFactory) (*Client, erro return } // HTTP client doesn't currently support rotating auth, so lazySigner is always nil - return &Client{cfg, logger, tracer, meter, emitter, nil, loggerProvider, tracerProvider, meterProvider, messageLoggerProvider, nil, onClose}, nil + return &Client{cfg, logger, tracer, meter, emitter, nil, loggerProvider, tracerProvider, meterProvider, messageLoggerProvider, nil, onClose, nil}, nil } func newHTTPTracerProvider(config Config, resource *sdkresource.Resource, tlsConfig *tls.Config) (*sdktrace.TracerProvider, error) { diff --git a/pkg/beholder/managed_services_test.go b/pkg/beholder/managed_services_test.go new file mode 100644 index 0000000000..7feaff6e9b --- /dev/null +++ b/pkg/beholder/managed_services_test.go @@ -0,0 +1,102 @@ +package beholder_test + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/smartcontractkit/chainlink-common/pkg/beholder" +) + +func TestManagedServices_NilReceiver(t *testing.T) { + var c *beholder.Client + assert.Nil(t, c.ManagedServices()) +} + +func TestManagedServices_NoopClient(t *testing.T) { + c := beholder.NewNoopClient() + assert.Empty(t, c.ManagedServices()) +} + +func TestManagedServices_BatchDisabled(t *testing.T) { + client, err := beholder.NewClient(beholder.Config{ + OtelExporterGRPCEndpoint: "localhost:4317", + ChipIngressEmitterEnabled: true, + ChipIngressEmitterGRPCEndpoint: "localhost:9090", + ChipIngressInsecureConnection: true, + ChipIngressBatchEmitterEnabled: false, + }) + require.NoError(t, err) + defer func() { _ = client.Close() }() + + assert.Empty(t, client.ManagedServices()) +} + +func TestManagedServices_BatchEnabled(t *testing.T) { + lggr := newTestLogger(t) + + client, err := beholder.NewClient(beholder.Config{ + OtelExporterGRPCEndpoint: "localhost:4317", + ChipIngressEmitterEnabled: true, + ChipIngressEmitterGRPCEndpoint: "localhost:9090", + ChipIngressInsecureConnection: true, + ChipIngressBatchEmitterEnabled: true, + ChipIngressLogger: lggr, + ChipIngressBufferSize: 10, + ChipIngressMaxBatchSize: 5, + ChipIngressSendInterval: 50 * time.Millisecond, + ChipIngressSendTimeout: 1 * time.Second, + ChipIngressDrainTimeout: 1 * time.Second, + }) + require.NoError(t, err) + + managed := client.ManagedServices() + require.Len(t, managed, 1) + assert.Equal(t, "ChipIngressBatchEmitter", managed[0].Name()) + + // Service should be startable (not already started) + err = managed[0].Start(context.Background()) + require.NoError(t, err) + + err = managed[0].Close() + require.NoError(t, err) + + // Client.Close may return OTel flush errors (no real endpoint) — that's fine. + // The key assertion is that closing the client does NOT close the batch emitter + // (emitOnlyAdapter.Close is a no-op). + _ = client.Close() +} + +func TestManagedServices_BatchEmitterNotAutoStarted(t *testing.T) { + lggr := newTestLogger(t) + + client, err := beholder.NewClient(beholder.Config{ + OtelExporterGRPCEndpoint: "localhost:4317", + ChipIngressEmitterEnabled: true, + ChipIngressEmitterGRPCEndpoint: "localhost:9090", + ChipIngressInsecureConnection: true, + ChipIngressBatchEmitterEnabled: true, + ChipIngressLogger: lggr, + ChipIngressBufferSize: 10, + ChipIngressMaxBatchSize: 5, + ChipIngressSendInterval: 50 * time.Millisecond, + ChipIngressSendTimeout: 1 * time.Second, + ChipIngressDrainTimeout: 1 * time.Second, + }) + require.NoError(t, err) + + managed := client.ManagedServices() + require.Len(t, managed, 1) + + // The service should not be ready yet (not started) + err = managed[0].Ready() + assert.Error(t, err, "service should not be ready before Start()") + + // Start, verify ready, then close + require.NoError(t, managed[0].Start(context.Background())) + require.NoError(t, managed[0].Ready()) + require.NoError(t, managed[0].Close()) +} diff --git a/pkg/beholder/noop.go b/pkg/beholder/noop.go index 67580ec15c..1ed0a0144a 100644 --- a/pkg/beholder/noop.go +++ b/pkg/beholder/noop.go @@ -39,7 +39,7 @@ func NewNoopClient() *Client { // ChipIngress chipClient := &chipingress.NoopClient{} - return &Client{cfg, logger, tracer, meter, messageEmitter, chipClient, loggerProvider, tracerProvider, meterProvider, loggerProvider, nil, noopOnClose} + return &Client{cfg, logger, tracer, meter, messageEmitter, chipClient, loggerProvider, tracerProvider, meterProvider, loggerProvider, nil, noopOnClose, nil} } // NewStdoutClient creates a new Client with exporters which send telemetry data to standard output From 7257a19a5c701084f98558ac10a48a0d25c01d8e Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Wed, 18 Mar 2026 15:38:50 +0200 Subject: [PATCH 26/28] BatchEmitter + Service --- ... => chip_ingress_batch_emitter_service.go} | 20 ++++----- ...hip_ingress_batch_emitter_service_test.go} | 42 +++++++++---------- pkg/beholder/client.go | 4 +- pkg/beholder/emit_only_adapter.go | 4 +- pkg/beholder/managed_services_test.go | 2 +- 5 files changed, 36 insertions(+), 36 deletions(-) rename pkg/beholder/{chip_ingress_batch_emitter.go => chip_ingress_batch_emitter_service.go} (83%) rename pkg/beholder/{chip_ingress_batch_emitter_test.go => chip_ingress_batch_emitter_service_test.go} (84%) diff --git a/pkg/beholder/chip_ingress_batch_emitter.go b/pkg/beholder/chip_ingress_batch_emitter_service.go similarity index 83% rename from pkg/beholder/chip_ingress_batch_emitter.go rename to pkg/beholder/chip_ingress_batch_emitter_service.go index c3f21173e0..427941e3f5 100644 --- a/pkg/beholder/chip_ingress_batch_emitter.go +++ b/pkg/beholder/chip_ingress_batch_emitter_service.go @@ -16,9 +16,9 @@ import ( "github.com/smartcontractkit/chainlink-common/pkg/services" ) -// ChipIngressBatchEmitter batches events and sends them via chipingress.Client.PublishBatch. +// ChipIngressBatchEmitterService batches events and sends them via chipingress.Client.PublishBatch. // It implements the Emitter interface. -type ChipIngressBatchEmitter struct { +type ChipIngressBatchEmitterService struct { services.Service eng *services.Engine @@ -33,8 +33,8 @@ type batchEmitterMetrics struct { eventsDropped otelmetric.Int64Counter } -// NewChipIngressBatchEmitter creates a batch emitter backed by the given chipingress client. -func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logger.Logger) (*ChipIngressBatchEmitter, error) { +// NewChipIngressBatchEmitterService creates a batch emitter service backed by the given chipingress client. +func NewChipIngressBatchEmitterService(client chipingress.Client, cfg Config, lggr logger.Logger) (*ChipIngressBatchEmitterService, error) { if client == nil { return nil, fmt.Errorf("chip ingress client is nil") } @@ -83,13 +83,13 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg return nil, fmt.Errorf("failed to create batch client: %w", err) } - e := &ChipIngressBatchEmitter{ + e := &ChipIngressBatchEmitterService{ batchClient: batchClient, metrics: metrics, } e.Service, e.eng = services.Config{ - Name: "ChipIngressBatchEmitter", + Name: "ChipIngressBatchEmitterService", }.NewServiceEngine(lggr) e.eng.Go(func(ctx context.Context) { @@ -104,7 +104,7 @@ func NewChipIngressBatchEmitter(client chipingress.Client, cfg Config, lggr logg // Emit queues an event for batched delivery without blocking. // Returns an error if the emitter is stopped or the context is cancelled. // If the buffer is full, the event is silently dropped. -func (e *ChipIngressBatchEmitter) Emit(ctx context.Context, body []byte, attrKVs ...any) error { +func (e *ChipIngressBatchEmitterService) Emit(ctx context.Context, body []byte, attrKVs ...any) error { return e.emitInternal(ctx, body, nil, attrKVs...) } @@ -113,11 +113,11 @@ func (e *ChipIngressBatchEmitter) Emit(ctx context.Context, body []byte, attrKVs // // If EmitWithCallback returns a non-nil error, the callback will NOT be invoked. // If it returns nil, the callback is guaranteed to fire exactly once. -func (e *ChipIngressBatchEmitter) EmitWithCallback(ctx context.Context, body []byte, callback func(error), attrKVs ...any) error { +func (e *ChipIngressBatchEmitterService) EmitWithCallback(ctx context.Context, body []byte, callback func(error), attrKVs ...any) error { return e.emitInternal(ctx, body, callback, attrKVs...) } -func (e *ChipIngressBatchEmitter) emitInternal(ctx context.Context, body []byte, callback func(error), attrKVs ...any) error { +func (e *ChipIngressBatchEmitterService) emitInternal(ctx context.Context, body []byte, callback func(error), attrKVs ...any) error { return e.eng.IfNotStopped(func() error { domain, entity, err := ExtractSourceAndType(attrKVs...) if err != nil { @@ -162,7 +162,7 @@ func (e *ChipIngressBatchEmitter) emitInternal(ctx context.Context, body []byte, }) } -func (e *ChipIngressBatchEmitter) metricAttrsFor(domain, entity string) otelmetric.MeasurementOption { +func (e *ChipIngressBatchEmitterService) metricAttrsFor(domain, entity string) otelmetric.MeasurementOption { key := domain + "\x00" + entity if v, ok := e.metricAttrsCache.Load(key); ok { return v.(otelmetric.MeasurementOption) diff --git a/pkg/beholder/chip_ingress_batch_emitter_test.go b/pkg/beholder/chip_ingress_batch_emitter_service_test.go similarity index 84% rename from pkg/beholder/chip_ingress_batch_emitter_test.go rename to pkg/beholder/chip_ingress_batch_emitter_service_test.go index 45cd6a2dfe..988647a56d 100644 --- a/pkg/beholder/chip_ingress_batch_emitter_test.go +++ b/pkg/beholder/chip_ingress_batch_emitter_service_test.go @@ -35,25 +35,25 @@ func newTestLogger(t *testing.T) logger.Logger { return lggr } -func TestNewChipIngressBatchEmitter(t *testing.T) { +func TestNewChipIngressBatchEmitterService(t *testing.T) { t.Run("happy path", func(t *testing.T) { clientMock := mocks.NewClient(t) - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestConfig(), newTestLogger(t)) + emitter, err := beholder.NewChipIngressBatchEmitterService(clientMock, newTestConfig(), newTestLogger(t)) require.NoError(t, err) assert.NotNil(t, emitter) }) t.Run("returns error when client is nil", func(t *testing.T) { - emitter, err := beholder.NewChipIngressBatchEmitter(nil, newTestConfig(), newTestLogger(t)) + emitter, err := beholder.NewChipIngressBatchEmitterService(nil, newTestConfig(), newTestLogger(t)) assert.Error(t, err) assert.Nil(t, emitter) }) } -func TestChipIngressBatchEmitter_Emit(t *testing.T) { +func TestChipIngressBatchEmitterService_Emit(t *testing.T) { t.Run("returns error when domain/entity missing", func(t *testing.T) { clientMock := mocks.NewClient(t) - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestConfig(), newTestLogger(t)) + emitter, err := beholder.NewChipIngressBatchEmitterService(clientMock, newTestConfig(), newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) defer emitter.Close() //nolint:errcheck @@ -80,7 +80,7 @@ func TestChipIngressBatchEmitter_Emit(t *testing.T) { cfg := newTestConfig() cfg.ChipIngressSendInterval = 50 * time.Millisecond - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + emitter, err := beholder.NewChipIngressBatchEmitterService(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) @@ -111,7 +111,7 @@ func TestChipIngressBatchEmitter_Emit(t *testing.T) { }) } -func TestChipIngressBatchEmitter_CloudEventFormat(t *testing.T) { +func TestChipIngressBatchEmitterService_CloudEventFormat(t *testing.T) { clientMock := mocks.NewClient(t) var mu sync.Mutex @@ -128,7 +128,7 @@ func TestChipIngressBatchEmitter_CloudEventFormat(t *testing.T) { cfg := newTestConfig() cfg.ChipIngressSendInterval = 50 * time.Millisecond - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + emitter, err := beholder.NewChipIngressBatchEmitterService(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) @@ -156,7 +156,7 @@ func TestChipIngressBatchEmitter_CloudEventFormat(t *testing.T) { assert.NotEmpty(t, event.Id) } -func TestChipIngressBatchEmitter_PublishBatchError(t *testing.T) { +func TestChipIngressBatchEmitterService_PublishBatchError(t *testing.T) { clientMock := mocks.NewClient(t) var mu sync.Mutex @@ -173,7 +173,7 @@ func TestChipIngressBatchEmitter_PublishBatchError(t *testing.T) { cfg := newTestConfig() cfg.ChipIngressSendInterval = 50 * time.Millisecond - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + emitter, err := beholder.NewChipIngressBatchEmitterService(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) @@ -194,7 +194,7 @@ func TestChipIngressBatchEmitter_PublishBatchError(t *testing.T) { require.NoError(t, emitter.Close()) } -func TestChipIngressBatchEmitter_ContextCancellation(t *testing.T) { +func TestChipIngressBatchEmitterService_ContextCancellation(t *testing.T) { clientMock := mocks.NewClient(t) clientMock. On("PublishBatch", mock.Anything, mock.Anything). @@ -205,7 +205,7 @@ func TestChipIngressBatchEmitter_ContextCancellation(t *testing.T) { cfg.ChipIngressBufferSize = 1 cfg.ChipIngressSendInterval = 10 * time.Second - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + emitter, err := beholder.NewChipIngressBatchEmitterService(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) defer emitter.Close() //nolint:errcheck @@ -220,7 +220,7 @@ func TestChipIngressBatchEmitter_ContextCancellation(t *testing.T) { assert.ErrorIs(t, err, context.Canceled) } -func TestChipIngressBatchEmitter_DefaultConfig(t *testing.T) { +func TestChipIngressBatchEmitterService_DefaultConfig(t *testing.T) { clientMock := mocks.NewClient(t) var mu sync.Mutex @@ -234,7 +234,7 @@ func TestChipIngressBatchEmitter_DefaultConfig(t *testing.T) { }). Return(nil, nil) - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, beholder.Config{}, newTestLogger(t)) + emitter, err := beholder.NewChipIngressBatchEmitterService(clientMock, beholder.Config{}, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) @@ -257,14 +257,14 @@ func TestChipIngressBatchEmitter_DefaultConfig(t *testing.T) { require.Len(t, receivedBatch.Events, 1) } -func TestChipIngressBatchEmitter_EmitAfterClose(t *testing.T) { +func TestChipIngressBatchEmitterService_EmitAfterClose(t *testing.T) { clientMock := mocks.NewClient(t) clientMock. On("PublishBatch", mock.Anything, mock.Anything). Return(nil, nil). Maybe() - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, newTestConfig(), newTestLogger(t)) + emitter, err := beholder.NewChipIngressBatchEmitterService(clientMock, newTestConfig(), newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) require.NoError(t, emitter.Close()) @@ -276,7 +276,7 @@ func TestChipIngressBatchEmitter_EmitAfterClose(t *testing.T) { assert.Error(t, err) } -func TestChipIngressBatchEmitter_EmitWithCallback(t *testing.T) { +func TestChipIngressBatchEmitterService_EmitWithCallback(t *testing.T) { t.Run("callback receives nil on success", func(t *testing.T) { clientMock := mocks.NewClient(t) clientMock. @@ -286,7 +286,7 @@ func TestChipIngressBatchEmitter_EmitWithCallback(t *testing.T) { cfg := newTestConfig() cfg.ChipIngressSendInterval = 50 * time.Millisecond - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + emitter, err := beholder.NewChipIngressBatchEmitterService(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) @@ -318,7 +318,7 @@ func TestChipIngressBatchEmitter_EmitWithCallback(t *testing.T) { cfg := newTestConfig() cfg.ChipIngressSendInterval = 50 * time.Millisecond - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + emitter, err := beholder.NewChipIngressBatchEmitterService(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) @@ -365,7 +365,7 @@ func TestChipIngressBatchEmitter_EmitWithCallback(t *testing.T) { cfg.ChipIngressSendInterval = 50 * time.Millisecond cfg.ChipIngressDrainTimeout = 200 * time.Millisecond - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + emitter, err := beholder.NewChipIngressBatchEmitterService(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) defer close(sendBlocked) @@ -414,7 +414,7 @@ func TestChipIngressBatchEmitter_EmitWithCallback(t *testing.T) { cfg := newTestConfig() cfg.ChipIngressSendInterval = 50 * time.Millisecond - emitter, err := beholder.NewChipIngressBatchEmitter(clientMock, cfg, newTestLogger(t)) + emitter, err := beholder.NewChipIngressBatchEmitterService(clientMock, cfg, newTestLogger(t)) require.NoError(t, err) require.NoError(t, emitter.Start(t.Context())) diff --git a/pkg/beholder/client.go b/pkg/beholder/client.go index 2f695f758d..c254f7a097 100644 --- a/pkg/beholder/client.go +++ b/pkg/beholder/client.go @@ -191,7 +191,7 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro // This will eventually be removed in favor of chip-ingress emitter // and logs will be sent via OTLP using the regular Logger instead of calling Emit emitter := NewMessageEmitter(messageLogger) - var batchEmitterService *ChipIngressBatchEmitter + var batchEmitterService *ChipIngressBatchEmitterService var chipIngressClient chipingress.Client = &chipingress.NoopClient{} // if chip ingress is enabled, create dual source emitter that sends to both otel collector and chip ingress // eventually we will remove the dual source emitter and just use chip ingress @@ -232,7 +232,7 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro if cfg.ChipIngressLogger == nil { return nil, fmt.Errorf("ChipIngressLogger is required when ChipIngressBatchEmitterEnabled is true") } - batchEmitterService, err = NewChipIngressBatchEmitter(chipIngressClient, cfg, cfg.ChipIngressLogger) + batchEmitterService, err = NewChipIngressBatchEmitterService(chipIngressClient, cfg, cfg.ChipIngressLogger) if err != nil { return nil, fmt.Errorf("failed to create chip ingress batch emitter: %w", err) } diff --git a/pkg/beholder/emit_only_adapter.go b/pkg/beholder/emit_only_adapter.go index 3b2ac3d736..52da40b4e6 100644 --- a/pkg/beholder/emit_only_adapter.go +++ b/pkg/beholder/emit_only_adapter.go @@ -2,11 +2,11 @@ package beholder import "context" -// emitOnlyAdapter wraps a ChipIngressBatchEmitter as an Emitter but with a +// emitOnlyAdapter wraps a ChipIngressBatchEmitterService as an Emitter but with a // no-op Close. This decouples emission (used by DualSourceEmitter) from // lifecycle management (owned by the application's service list). type emitOnlyAdapter struct { - e *ChipIngressBatchEmitter + e *ChipIngressBatchEmitterService } func (a *emitOnlyAdapter) Emit(ctx context.Context, body []byte, attrKVs ...any) error { diff --git a/pkg/beholder/managed_services_test.go b/pkg/beholder/managed_services_test.go index 7feaff6e9b..38727382fe 100644 --- a/pkg/beholder/managed_services_test.go +++ b/pkg/beholder/managed_services_test.go @@ -55,7 +55,7 @@ func TestManagedServices_BatchEnabled(t *testing.T) { managed := client.ManagedServices() require.Len(t, managed, 1) - assert.Equal(t, "ChipIngressBatchEmitter", managed[0].Name()) + assert.Equal(t, "ChipIngressBatchEmitterService", managed[0].Name()) // Service should be startable (not already started) err = managed[0].Start(context.Background()) From e82a57edc6e041db5ec273883d94bd9b819ae401 Mon Sep 17 00:00:00 2001 From: Thomas Kaliakos Date: Wed, 18 Mar 2026 22:52:45 +0200 Subject: [PATCH 27/28] remove emit_only_adapter - amend loop server --- pkg/beholder/client.go | 5 ++- pkg/beholder/emit_only_adapter.go | 18 --------- pkg/beholder/emit_only_adapter_test.go | 54 -------------------------- pkg/beholder/managed_services_test.go | 11 ++++-- pkg/loop/server.go | 11 ++++++ 5 files changed, 21 insertions(+), 78 deletions(-) delete mode 100644 pkg/beholder/emit_only_adapter.go delete mode 100644 pkg/beholder/emit_only_adapter_test.go diff --git a/pkg/beholder/client.go b/pkg/beholder/client.go index c254f7a097..aec93bf046 100644 --- a/pkg/beholder/client.go +++ b/pkg/beholder/client.go @@ -236,7 +236,7 @@ func NewGRPCClient(cfg Config, otlploggrpcNew otlploggrpcFactory) (*Client, erro if err != nil { return nil, fmt.Errorf("failed to create chip ingress batch emitter: %w", err) } - chipIngressEmitter = &emitOnlyAdapter{batchEmitterService} + chipIngressEmitter = batchEmitterService } else { chipIngressEmitter, err = NewChipIngressEmitter(chipIngressClient) if err != nil { @@ -274,7 +274,8 @@ func (c *Client) ManagedServices() []services.Service { } // Close shuts down OTel providers and the chip ingress connection. -// It does NOT close managed services — those are closed by the application. +// The batch emitter service may already have been closed by the application; +// the duplicate Close returns an "already stopped" error which is harmless. func (c Client) Close() (err error) { if c.Emitter != nil { err = errors.Join(err, c.Emitter.Close()) diff --git a/pkg/beholder/emit_only_adapter.go b/pkg/beholder/emit_only_adapter.go deleted file mode 100644 index 52da40b4e6..0000000000 --- a/pkg/beholder/emit_only_adapter.go +++ /dev/null @@ -1,18 +0,0 @@ -package beholder - -import "context" - -// emitOnlyAdapter wraps a ChipIngressBatchEmitterService as an Emitter but with a -// no-op Close. This decouples emission (used by DualSourceEmitter) from -// lifecycle management (owned by the application's service list). -type emitOnlyAdapter struct { - e *ChipIngressBatchEmitterService -} - -func (a *emitOnlyAdapter) Emit(ctx context.Context, body []byte, attrKVs ...any) error { - return a.e.Emit(ctx, body, attrKVs...) -} - -func (a *emitOnlyAdapter) Close() error { - return nil // lifecycle is managed externally -} diff --git a/pkg/beholder/emit_only_adapter_test.go b/pkg/beholder/emit_only_adapter_test.go deleted file mode 100644 index c81c168cc7..0000000000 --- a/pkg/beholder/emit_only_adapter_test.go +++ /dev/null @@ -1,54 +0,0 @@ -package beholder_test - -import ( - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "github.com/smartcontractkit/chainlink-common/pkg/beholder" -) - -func TestEmitOnlyAdapter_CloseIsNoOp(t *testing.T) { - // Create a client with batch enabled, get the emitter, and verify - // that closing the client (which closes the DualSourceEmitter which - // closes the emitOnlyAdapter) does not close the batch emitter service. - lggr := newTestLogger(t) - - client, err := beholder.NewClient(beholder.Config{ - OtelExporterGRPCEndpoint: "localhost:4317", - ChipIngressEmitterEnabled: true, - ChipIngressEmitterGRPCEndpoint: "localhost:9090", - ChipIngressInsecureConnection: true, - ChipIngressBatchEmitterEnabled: true, - ChipIngressLogger: lggr, - ChipIngressBufferSize: 10, - ChipIngressMaxBatchSize: 5, - ChipIngressSendInterval: 50 * time.Millisecond, - ChipIngressSendTimeout: 1 * time.Second, - ChipIngressDrainTimeout: 1 * time.Second, - }) - require.NoError(t, err) - - managed := client.ManagedServices() - require.Len(t, managed, 1) - svc := managed[0] - - // Close the client (which closes the DualSourceEmitter → emitOnlyAdapter.Close → no-op). - // OTel provider flush may fail (no real endpoint) — we only care that it doesn't - // close the batch emitter service. - _ = client.Close() - - // The batch emitter service should still be startable (not already closed) - err = svc.Start(t.Context()) - require.NoError(t, err) - - // And closable - err = svc.Close() - require.NoError(t, err) - - // Second close should error (StopOnce) - err = svc.Close() - assert.Error(t, err, "second close should fail") -} diff --git a/pkg/beholder/managed_services_test.go b/pkg/beholder/managed_services_test.go index 38727382fe..476cd4dc44 100644 --- a/pkg/beholder/managed_services_test.go +++ b/pkg/beholder/managed_services_test.go @@ -8,6 +8,8 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/smartcontractkit/chainlink-common/pkg/services" + "github.com/smartcontractkit/chainlink-common/pkg/beholder" ) @@ -64,10 +66,11 @@ func TestManagedServices_BatchEnabled(t *testing.T) { err = managed[0].Close() require.NoError(t, err) - // Client.Close may return OTel flush errors (no real endpoint) — that's fine. - // The key assertion is that closing the client does NOT close the batch emitter - // (emitOnlyAdapter.Close is a no-op). - _ = client.Close() + // Client.Close calls DualSourceEmitter.Close which calls the batch emitter's + // Close a second time. StopOnce returns "already stopped" — harmless. + err = client.Close() + require.Error(t, err) + assert.ErrorIs(t, err, services.ErrAlreadyStopped) } func TestManagedServices_BatchEmitterNotAutoStarted(t *testing.T) { diff --git a/pkg/loop/server.go b/pkg/loop/server.go index 28472ac2a3..3fa83da4c9 100644 --- a/pkg/loop/server.go +++ b/pkg/loop/server.go @@ -94,6 +94,7 @@ type Server struct { webServer *webServer checker *services.HealthChecker LimitsFactory limits.Factory + managedServices []services.Service } func newServer(loggerName string) (*Server, error) { @@ -214,6 +215,13 @@ func (s *Server) start(opts ...ServerOpt) error { beholder.SetClient(beholderClient) beholder.SetGlobalOtelProviders() + for _, svc := range beholderClient.ManagedServices() { + if err := svc.Start(ctx); err != nil { + return fmt.Errorf("failed to start beholder managed service %s: %w", svc.Name(), err) + } + s.managedServices = append(s.managedServices, svc) + } + if beholderCfg.LogStreamingEnabled { otelLogger, err := NewOtelLogger(beholderClient.Logger, beholderCfg.LogLevel) if err != nil { @@ -285,6 +293,9 @@ func (s *Server) Register(c services.HealthReporter) error { return s.checker.Re // Stop closes resources and flushes logs. func (s *Server) Stop() { + for i := len(s.managedServices) - 1; i >= 0; i-- { + s.Logger.ErrorIfFn(s.managedServices[i].Close, "Failed to close managed service") + } if s.dbStatsReporter != nil { s.dbStatsReporter.Stop() } From 2c481d0ccf65a17433ed1f2c6328141e94f327ff Mon Sep 17 00:00:00 2001 From: Pavel <177363085+pkcll@users.noreply.github.com> Date: Wed, 25 Mar 2026 23:22:36 -0400 Subject: [PATCH 28/28] test(beholder): close writer client in global test --- pkg/beholder/global_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/beholder/global_test.go b/pkg/beholder/global_test.go index df57d0a41b..0867609f78 100644 --- a/pkg/beholder/global_test.go +++ b/pkg/beholder/global_test.go @@ -77,6 +77,9 @@ func TestClient_SetGlobalOtelProviders(t *testing.T) { var b strings.Builder client, err := beholder.NewWriterClient(&b) require.NoError(t, err) + defer func() { + require.NoError(t, client.Close()) + }() // Set global Otel Client beholder.SetClient(client)