diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index d4ebd5f32..99f703608 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -33,9 +33,9 @@ jobs: strategy: matrix: args: - - name: fluent-bit-to-vali + - name: fluent-bit-plugin target: fluent-bit-plugin - oci-repository: gardener/fluent-bit-to-vali + oci-repository: gardener/fluent-bit-plugin ocm-labels: name: gardener.cloud/cve-categorisation value: @@ -46,33 +46,6 @@ jobs: integrity_requirement: none availability_requirement: none comment: no data is stored of processed by the installer - - name: vali-curator - target: curator - oci-repository: gardener/vali-curator - ocm-labels: - name: gardener.cloud/cve-categorisation - value: - network_exposure: private - authentication_enforced: false - user_interaction: gardener-operator - confidentiality_requirement: none - integrity_requirement: high - availability_requirement: low - - name: telegraf-iptables - target: telegraf - oci-repository: gardener/telegraf-iptables - ocm-labels: - name: gardener.cloud/cve-categorisation - value: - network_exposure: private - authentication_enforced: false - user_interaction: gardener-operator - confidentiality_requirement: none - integrity_requirement: none - availability_requirement: none - comment: >- - telegraf is not accessible from outside the seed-cluster and does not - interact with confidential data - name: event-logger target: event-logger oci-repository: gardener/event-logger diff --git a/.gitignore b/.gitignore index 4c472a3b1..d446048c8 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,5 @@ bin kubeconfig *~ gosec-report.sarif +fluent-bit-output-plugin + diff --git a/.golangci.yaml b/.golangci.yaml index a481bb2d6..b4df07fcf 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -47,4 +47,4 @@ formatters: sections: - standard - default - - prefix(github.com/gardener/logging) + - prefix(github.com/gardener/logging/v1) diff --git a/.hyperspace/pull_request_bot.json 
b/.hyperspace/pull_request_bot.json index 813ed01be..59ef37f02 100644 --- a/.hyperspace/pull_request_bot.json +++ b/.hyperspace/pull_request_bot.json @@ -11,7 +11,7 @@ }, "review": { "auto_generate_review": false, - "use_custom_review_focus": false + "use_custom_review_focus": true } } } diff --git a/.hyperspace/pull_request_bot_review_focus.md b/.hyperspace/pull_request_bot_review_focus.md new file mode 100644 index 000000000..7910b6714 --- /dev/null +++ b/.hyperspace/pull_request_bot_review_focus.md @@ -0,0 +1,94 @@ +# Review Philosophy + +- Comment only when there is **high confidence (>80%)** that an issue, risk, or meaningful improvement exists. Avoid speculative or low-impact feedback. +- Prioritize **signal over noise**. If a comment does not clearly improve correctness, readability, performance, security, or maintainability, do not leave it. +- Be **concise and direct**. Prefer a single, well-phrased sentence per comment whenever possible. +- Focus on **actionable feedback**. Each comment should either explain *what* is wrong, *why* it matters, or *how* to improve it. +- Avoid restating what the code already does. Assume the author can read the code. +- When reviewing text or documentation: + - Comment only if the wording is **genuinely ambiguous, misleading, or likely to cause incorrect usage**. + - Do not suggest stylistic or subjective wording changes unless they materially improve clarity or prevent misunderstanding. +- Treat every review as if the code will be **maintained by someone else six months from now**. + +## Priority Areas (Review These First) + +Focus review effort on the areas below, in order of **risk and long-term impact**. +Deprioritize minor style or preference-based issues unless they materially affect maintainability. + +--- + +### Security & Safety + +- Unsafe code blocks **without clear justification, scope, or documented invariants**. +- Command injection risks involving shell execution, dynamic commands, or unsanitized user input. 
+- Path traversal vulnerabilities when handling file paths, URLs, or external input. +- Credential exposure, hardcoded secrets, tokens, API keys, or sensitive configuration values. +- Missing or insufficient input validation on **external or untrusted data sources**. +- Improper error handling that could **leak sensitive information** through logs, error messages, or responses. +- Security-sensitive behavior that is implicit, undocumented, or relies on assumptions not enforced in code. + +--- + +### Correctness Issues + +- Logic errors that could lead to panics, crashes, undefined behavior, or incorrect results. +- Race conditions, shared-state issues, or unsafe access patterns in concurrent or async code. +- Resource leaks involving files, network connections, locks, or memory. +- Boundary issues such as off-by-one errors, empty states, or unhandled edge cases. +- Incorrect error propagation +- Optional types used where a value is guaranteed or required, adding unnecessary complexity. +- Error context that does not meaningfully improve debuggability or understanding. +- Overly defensive code that adds checks without realistic failure modes. +- Comments that restate obvious behavior instead of explaining **why** something exists. + +--- + +### Architecture & Patterns + +- Code that violates established patterns, conventions, or architectural decisions in the codebase. +- Missing or inconsistent error handling where a standard approach is already used +- Misuse of async/await, including blocking operations inside async contexts. +- Improper or incomplete trait implementations that break expectations or contracts. +- Abstractions that increase complexity without reducing duplication or improving clarity. +- Public APIs that expose unnecessary surface area or leak internal implementation details. 
+ +## Skip These (Low Value) + +Do **not** leave review comments for the following, unless they directly impact +correctness, security, or long-term maintainability: + +- Style or formatting concerns handled by automated tools (`go fmt`, Prettier). +- Minor naming preferences that do not materially improve clarity or correctness. +- Suggestions to add comments when the code is already self-explanatory. +- Refactoring proposals unless they fix a real bug, remove duplicated logic, or significantly reduce complexity. +- Logging suggestions unless they are required for **security, auditing, or critical observability gaps**. +- Pedantic wording or text accuracy nitpicks unless misunderstanding could lead to incorrect usage or bugs. + +When in doubt, **err on the side of silence**. + +## Response Format + +Use the following structure for every review comment. +Do not deviate unless brevity clearly improves clarity. + +1. **State the problem** + - One clear sentence describing the concrete issue. + - Avoid speculation or vague phrasing. + +2. **Why it matters** (optional) + - One sentence explaining impact (correctness, safety, maintainability, or developer experience). + - Omit this step if the impact is obvious. + +3. **Suggested fix** + - Provide a specific action, code snippet, or alternative approach. + - Prefer minimal, localized changes over broad refactors. + +## When to Stay Silent + +- If you are **not confident** that something is an actual issue, do not comment. +- Do not speculate or ask hypothetical questions disguised as feedback. +- Silence is preferred over low-confidence, low-impact, or opinion-based comments. +- If an issue depends on missing context and cannot be verified from the diff, assume the author has context and stay silent. +- Only break silence when uncertainty itself creates a **real risk** (e.g., potential security, data loss, or correctness issues). + +Default to restraint. A good review is measured by **impact**, not comment count. 
diff --git a/Dockerfile b/Dockerfile index 70e06405d..a0e79e256 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ WORKDIR /go/src/github.com/gardener/logging COPY . . RUN go mod download -RUN make plugin copy curator event-logger +RUN make plugin copy event-logger ############# distroless-static FROM gcr.io/distroless/static-debian12:nonroot AS distroless-static @@ -29,16 +29,6 @@ WORKDIR / CMD ["-e", "/fluent-bit/plugins/output_plugin.so", "-c", "/fluent-bit/config/fluent-bit.conf"] -############# curator ############# -FROM distroless-static AS curator - -COPY --from=builder /go/src/github.com/gardener/logging/build/curator /curator - -WORKDIR / -EXPOSE 2718 - -ENTRYPOINT [ "/curator" ] - ############# eventlogger ############# FROM distroless-static AS event-logger @@ -48,58 +38,6 @@ WORKDIR / ENTRYPOINT [ "/event-logger" ] -############# telegraf-builder ############# -FROM golang:1.25.5 AS telegraf-builder -RUN git clone --depth 1 --branch v1.26.0 https://github.com/influxdata/telegraf.git -WORKDIR /go/telegraf -ARG TARGETOS -ARG TARGETARCH -RUN --mount=type=cache,target="/root/.cache/go-build" CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} make build - -############# iptables-builder ############# -FROM alpine:3.23.2 AS iptables-builder - -RUN apk add --update bash sudo iptables ncurses-libs libmnl && \ - rm -rf /var/cache/apk/* - -WORKDIR /volume - -RUN mkdir -p ./bin ./sbin ./lib ./usr/bin ./usr/sbin ./usr/lib ./usr/lib/xtables ./usr/lib/bash ./tmp ./run ./etc/bash ./etc/openvpn ./usr/lib/openvpn/plugins ./etc/iproute2 ./etc/terminfo ./etc/logrotate.d ./etc/network/if-up.d ./usr/share/udhcpc ./etc/ssl/misc ./usr/lib/engines-1.1 ./run ./usr/lib/sudo \ - && cp -d /lib/ld-musl-* ./lib && echo "package musl" \ - && cp -d /lib/libc.musl-* ./lib && echo "package musl" \ - && cp -d -r /etc/terminfo/* ./etc/terminfo && echo "package ncurses-terminfo-base" \ - && cp -d /usr/lib/libformw.so.* ./usr/lib && echo "package ncurses-libs" \ - && cp -d 
/usr/lib/libmenuw.so.* ./usr/lib && echo "package ncurses-libs" \ - && cp -d /usr/lib/libncursesw.so.* ./usr/lib && echo "package ncurses-libs" \ - && cp -d /usr/lib/libpanelw.so.* ./usr/lib && echo "package ncurses-libs" \ - && cp -d /usr/lib/libreadline.so.* ./usr/lib && echo "package readline" \ - && cp -d /etc/inputrc ./etc && echo "package readline" \ - && cp -d /bin/bash ./bin && echo "package bash" \ - && cp -d /etc/bash/bashrc ./etc/bash && echo "package bash" \ - && cp -d /usr/lib/bash/* ./usr/lib/bash && echo "package bash" \ - && cp -d /usr/lib/libz.* ./lib && echo "package zlib" \ - && cp -d /usr/lib/libmnl.* ./usr/lib && echo "package libmnl" \ - && cp -d /usr/lib/libnftnl* ./usr/lib && echo "package libnftnl" \ - && cp -d /etc/ethertypes ./etc && echo "package iptables" \ - && cp -d /usr/sbin/iptables* ./sbin && echo "package iptables" \ - && cp -d /usr/sbin/xtables* ./sbin && echo "package iptables" \ - && cp -d /usr/lib/libxtables* ./usr/lib && echo "package iptables" \ - && cp -d /usr/lib/xtables/* ./usr/lib/xtables && echo "package iptables" \ - && cp -d /usr/lib/sudo/* ./usr/lib/sudo && echo "package sudo" \ - && cp -d /etc/sudoers ./etc && echo "package sudo" \ - && cp -d /etc/passwd ./etc && echo "package sudo" \ - && cp -d /usr/bin/sudo ./usr/sbin && echo "package sudo" \ - && touch ./run/xtables.lock && echo "create /run/xtables.lock" - -############# telegraf ############# -FROM scratch AS telegraf - -COPY --from=iptables-builder /volume / - -COPY --from=telegraf-builder /go/telegraf/telegraf /usr/bin/telegraf - -CMD [ "/usr/bin/telegraf"] - ############# tune2fs-builder ############# FROM alpine:3.23.2 AS tune2fs-builder diff --git a/Makefile b/Makefile index a0afd802d..ec94fbb23 100644 --- a/Makefile +++ b/Makefile @@ -5,10 +5,8 @@ REPO_ROOT := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) VERSION := $(shell cat VERSION) REGISTRY ?= europe-docker.pkg.dev/gardener-project/snapshots/gardener -FLUENT_BIT_TO_VALI_IMAGE_REPOSITORY 
:= $(REGISTRY)/fluent-bit-to-vali -FLUENT_BIT_VALI_IMAGE_REPOSITORY := $(REGISTRY)/fluent-bit-vali -VALI_CURATOR_IMAGE_REPOSITORY := $(REGISTRY)/vali-curator -TELEGRAF_IMAGE_REPOSITORY := $(REGISTRY)/telegraf-iptables +FLUENT_BIT_PLUGIN_IMAGE_REPOSITORY := $(REGISTRY)/fluent-bit-plugin +FLUENT_BIT_OUTPUT_IMAGE_REPOSITORY := $(REGISTRY)/fluent-bit-output TUNE2FS_IMAGE_REPOSITORY := $(REGISTRY)/tune2fs EVENT_LOGGER_IMAGE_REPOSITORY := $(REGISTRY)/event-logger EFFECTIVE_VERSION := $(VERSION)-$(shell git rev-parse --short HEAD) @@ -30,7 +28,7 @@ include hack/tools.mk export PATH := $(abspath $(TOOLS_DIR)):$(PATH) .DEFAULT_GOAL := all -all: verify plugin curator event-logger +all: tidy fmt gci plugin event-logger lint ################################################################# # Build targets # @@ -46,18 +44,6 @@ plugin: tidy -ldflags="$(LD_FLAGS)" \ ./cmd/fluent-bit-output-plugin -.PHONY: curator -curator: tidy - @echo "building $@ for $(BUILD_PLATFORM)/$(BUILD_ARCH)" - @GOOS=$(BUILD_PLATFORM) \ - GOARCH=$(BUILD_ARCH) \ - CGO_ENABLED=0 \ - GO111MODULE=on \ - go build \ - -o $(REPO_ROOT)/build/curator \ - -ldflags="$(LD_FLAGS)" \ - ./cmd/vali-curator - .PHONY: event-logger event-logger: tidy @echo "building $@ for $(BUILD_PLATFORM)/$(BUILD_ARCH)" @@ -87,19 +73,11 @@ copy: tidy docker-images: @BUILD_ARCH=$(BUILD_ARCH) \ $(REPO_ROOT)/hack/docker-image-build.sh "fluent-bit-plugin" \ - $(FLUENT_BIT_TO_VALI_IMAGE_REPOSITORY) $(IMAGE_TAG) + $(FLUENT_BIT_PLUGIN_IMAGE_REPOSITORY) $(IMAGE_TAG) @BUILD_ARCH=$(BUILD_ARCH) \ $(REPO_ROOT)/hack/docker-image-build.sh "fluent-bit-output" \ - $(FLUENT_BIT_VALI_IMAGE_REPOSITORY) $(IMAGE_TAG) - - @BUILD_ARCH=$(BUILD_ARCH) \ - $(REPO_ROOT)/hack/docker-image-build.sh "curator" \ - $(VALI_CURATOR_IMAGE_REPOSITORY) $(IMAGE_TAG) - - @BUILD_ARCH=$(BUILD_ARCH) \ - $(REPO_ROOT)/hack/docker-image-build.sh "telegraf" \ - $(TELEGRAF_IMAGE_REPOSITORY) $(IMAGE_TAG) + $(FLUENT_BIT_OUTPUT_IMAGE_REPOSITORY) $(IMAGE_TAG) @BUILD_ARCH=$(BUILD_ARCH) \ 
$(REPO_ROOT)/hack/docker-image-build.sh "event-logger" \ @@ -112,13 +90,7 @@ docker-images: .PHONY: docker-push docker-push: @$(REPO_ROOT)/hack/docker-image-push.sh "fluent-bit-plugin" \ - $(FLUENT_BIT_TO_VALI_IMAGE_REPOSITORY) $(IMAGE_TAG) - - @$(REPO_ROOT)/hack/docker-image-push.sh "curator" \ - $(VALI_CURATOR_IMAGE_REPOSITORY) $(IMAGE_TAG) - - @$(REPO_ROOT)/hack/docker-image-push.sh "telegraf" \ - $(TELEGRAF_IMAGE_REPOSITORY) $(IMAGE_TAG) + $(FLUENT_BIT_PLUGIN_IMAGE_REPOSITORY) $(IMAGE_TAG) @$(REPO_ROOT)/hack/docker-image-push.sh "event-logger" \ $(EVENT_LOGGER_IMAGE_REPOSITORY) $(IMAGE_TAG) $(EFFECTIVE_VERSION) @@ -135,7 +107,7 @@ tidy: @go mod tidy .PHONY: check -check: tidy fmt gci lint +check: tidy fmt gci .PHONY: fmt fmt: tidy @@ -150,7 +122,7 @@ gci: tidy @go tool gci write $(GCI_OPT) $(SRC_DIRS) .PHONY: lint -check: tidy +lint: tidy @echo "Running lint..." @go tool golangci-lint run \ --config=$(REPO_ROOT)/.golangci.yaml \ @@ -184,24 +156,3 @@ add-license-headers: tidy .PHONY: clean clean: @rm -rf $(REPO_ROOT)/build - -######################################### -# Tools # -######################################### -.PHONY: kind-up -kind-up: tidy $(KUBECTL) - @$(REPO_ROOT)/hack/kind-up.sh - -######################################### -# skaffold pipeline scenarios # -######################################### -skaffold-%: export KUBECONFIG = $(REPO_ROOT)/example/kind/kubeconfig - -.PHONY: skaffold-run -skaffold-run: $(SKAFFOLD) - @$(SKAFFOLD) run --kubeconfig=$(KUBECONFIG) - -# skaffold-dev target requires that skaffold run has been run -.PHONY: skaffold-dev -skaffold-dev: $(SKAFFOLD) - @$(SKAFFOLD) dev --kubeconfig=$(KUBECONFIG) diff --git a/README.md b/README.md index 8a3bbbb6b..beb72d87b 100644 --- a/README.md +++ b/README.md @@ -1,215 +1,147 @@ -# Fluent Bit output plugin +# Gardener Fluent Bit OTLP Output Plugin ![Logging Logo](images/logo/logging.png) [![REUSE 
status](https://api.reuse.software/badge/github.com/gardener/logging)](https://api.reuse.software/info/github.com/gardener/logging) [![Build](https://github.com/gardener/logging/actions/workflows/non-release.yaml/badge.svg)](https://github.com/gardener/logging/actions/workflows/non-release.yaml) [![Go Report Card](https://goreportcard.com/badge/github.com/gardener/logging)](https://goreportcard.com/report/github.com/gardener/logging) -[![License: Apache-2.0](https://img.shields.io/badge/License-Apache--2.0-blue.svg)](LICENSE) [![Release](https://img.shields.io/github/v/release/gardener/logging.svg?style=flat)](https://github.com/gardener/logging) [![Go Reference](https://pkg.go.dev/badge/github.com/gardener/logging.svg)](https://pkg.go.dev/github.com/gardener/logging) - -This plugin extends [Fluent Bit output plugin](https://github.com/credativ/vali/tree/main/cmd/fluent-bit) which aims to forward log messages from fluent-bit to Vali backend. -Π’he plugin meets the needs of the [Gardener](https://gardener.cloud/) by implementing a logic for dynamically forwarding log messages from one Fluent-bit to multiple Vali instances. -It also adds additional configurations that aim to improve plugin's performance and user experience. - -## Configuration Options - -| Key | Description | Default | -| --------------|-----------------------------------------------|-------------------------------------| -| Url | Url of vali server API endpoint. | `http://localhost:3100/vali/api/v1/push` | -| ProxyURL | Optional Url for http proxy. | "" | -| BatchWait | Time to wait before send a log batch to Vali, full or not. (unit: sec) | 1 second | -| BatchSize | Log batch size to send a log batch to Vali (unit: Bytes). | 10 KiB (10 * 1024 Bytes) | -| MaxRetries | Number of times the vali client will try to send unsuccessful sent record to vali. | 10 | -| Timeout | The duration which vali client will wait for response. | 10 | -| MinBackoff | The first wait after unsuccessful sent log. 
| 0.5s | -| MaxBackoff | The maximum duration after unsuccessful sent log. | 5m | -| Labels | Additional labels in the log stream | {job="fluent-bit"} | -| LogLevel | LogLevel for plugin logger. | "info" | -| RemoveKeys | Fields to be removed from the log streams | none | -| AutoKubernetesLabels | If set to true, it will add all Kubernetes labels to Vali labels | false | -| LabelKeys | Comma separated list of keys to use as stream labels. All other keys will be placed into the log line. LabelKeys is deactivated when using `LabelMapPath` label mapping configuration. | none | -| LineFormat | Format to use when flattening the record to a log line. Valid values are "json" or "key_value". If set to "json" the log line sent to Vali will be the `fluent-bit` record (excluding any keys extracted out as labels) dumped as json. If set to "key_value", the log line will be each item in the record concatenated together (separated by a single space) in the format `=`. | json | -| DropSingleKey | When a record has only a single key after after extracting the label keys for the stream, the log line sent to Vali will just be the value of the remaining key.| true | -| LabelMapPath | Path to a json file defining how to transform nested records. | none | -| DynamicHostPath | Jsonpath in the log labels to the dynamic host. | none | -| DynamicHostPrefix | String to prepend to the dynamic host. | none | -| DynamicHostSuffix | String to append to the dynamic host. | none | -| DynamicHostRegex | Regex to check if the dynamic host is valid. | '*' | -| Buffer | If set to true, a buffered client will be used. | none | -| BufferType | The buffer type to use when using buffered client is unable. "Dque" is the only available. | "dque" | -| QueueDir | Path to a directory where the buffer will store its records. | '/tmp/flb-storage/vali' | -| QueueSegmentSize | The number of entries stored into the buffer. 
| 500 | -| QueueName | The name of the file where the log entries will be stored | `dque` | -| SortByTimestamp | Sort the logs by their timestamps. | `false` | -| FallbackToTagWhenMetadataIsMissing | If set the plugin will try to extract the `namespace`, `pod_name` and `container_name` from the tag when the metadata is missing | `false` | -| TagKey | The key of the record which holds the tag. The tag should not be nested | "tag" | -| TagPrefix | The prefix of the tag. In the prefix no metadata will be searched. The prefix must not contain group expression(`()`). | none | -| TagExpression | The regex expression which will be used for matching the metadata retrieved from the tag. It contains 3 group expressions (`()`): `pod name`, `namespace` and the `container name` | "\\.(.*)_(.*)_(.*)-.*\\.log" | -| DropLogEntryWithoutK8sMetadata | When metadata is missing for the log entry, it will be dropped | `false` | -| ControllerSyncTimeout | Time to wait for cluster object synchronization | 60 seconds | -| NumberOfBatchIDs | The number of id per batch. This increase the number of vali label streams | 10 | -| IdLabelName | The name of the batch ID label kye in the stream label set | `id` | -| DeletedClientTimeExpiration | The time duration after a client for deleted cluster will be considered for expired | 1 hour | -| HostnameKeyValue | \\\ key/value pair adding the hostname into the label stream. When value is omitted the hostname is deduced from os.Hostname() call | nil | -| Pprof | Activating the pprof packeg for debugging purpose | false | -| LabelSetInitCapacity | The initial size of the label set which will be extracted from the records. 
Reduce map reallocation | 10 | -| SendLogsToMainClusterWhenIsInCreationState | Send log to the dynamic cluster when it is in creation state | `true` | -| SendLogsToMainClusterWhenIsInReadyState | Send log to the dynamic cluster when it is in ready state | `true` | -| SendLogsToMainClusterWhenIsInHibernatingState | Send log to the dynamic cluster when it is in hibernating state | `false` | -| SendLogsToMainClusterWhenIsInHibernatedState | Send log to the dynamic cluster when it is in hibernated state | `false` | -| SendLogsToMainClusterWhenIsInDeletionState | Send log to the dynamic cluster when it is in deletion state | `true` | -| SendLogsToMainClusterWhenIsInRestoreState | Send log to the dynamic cluster when it is in restoration state | `true` | -| SendLogsToMainClusterWhenIsInMigrationState | Send log to the dynamic cluster when it is in migration state | `true` | -| SendLogsToDefaultClientWhenClusterIsInCreationState | Send log to the default URL when it is in creation state | `true` | -| SendLogsToDefaultClientWhenClusterIsInReadyState | Send log to the default URL when it is in ready state | `false` | -| SendLogsToDefaultClientWhenClusterIsInHibernatingState | Send log to the default URL when it is in hibernating state | `false` | -| SendLogsToDefaultClientWhenClusterIsInHibernatedState | Send log to the default URL when it is in hibernated state | `false` | -| SendLogsToDefaultClientWhenClusterIsInDeletionState | Send log to the default URL when it is in deletion state | `true` | -| SendLogsToDefaultClientWhenClusterIsInRestoreState | Send log to the default URL when it is in restoration state | `true` | -| SendLogsToDefaultClientWhenClusterIsInMigrationState | Send log to the default URL when it is in migration state | `true` | - -### Labels - -Labels are used to query logs. For example, `{container_name="nginx", cluster="us-west1"}`. 
Usually labels represent metadata about the workload producing the log stream such as (`instance`, `container_name`, `region`, `cluster`, `level`). In Vali labels are indexed consequently which may lead to log streams with high cardinality. The latter can influence the performance of the backend - -The `Labels`, `RemoveKeys` , `LabelKeys` and `LabelMapPath` configuration options determine how the output plugin will perform labels extraction. - -### AutoKubernetesLabels - -If set to true, it will add all Kubernetes labels to Vali labels automatically and ignore parameters `LabelKeys`, LabelMapPath. - -### LabelMaps - -While Vali labels are key value pairs, the fluent-bit records may contain nested structures. -The `LabelMap` or `LabelMapPath` determines how to extract `labels` from each record. Each key in the map will be matched with the log record to find the label values. Values from the configuration are used as label names. - -Considering the record below : - -```json -{ - "kubernetes": { - "container_name": "valitail", - "pod_name": "valitail-xxx", - "namespace_name": "prod", - "labels" : { - "team": "x-men", - }, - }, - "HOSTNAME": "docker-desktop", - "log" : "a log line", - "time": "20190926T152206Z", -} -``` +[![License: Apache-2.0](https://img.shields.io/badge/License-Apache--2.0-blue.svg)](LICENSE) +[![Release](https://img.shields.io/github/v/release/gardener/logging.svg?style=flat)](https://github.com/gardener/logging) +[![Go Reference](https://pkg.go.dev/badge/github.com/gardener/logging.svg)](https://pkg.go.dev/github.com/gardener/logging) -and a LabelMap file as follow : - -```json -{ - "kubernetes": { - "container_name": "container", - "pod_name": "pod", - "namespace_name": "namespace", - "labels" : { - "team": "team", - }, - }, -} -``` +A modern Fluent Bit output plugin for [Gardener](https://gardener.cloud/) that ships logs using the **OpenTelemetry Protocol (OTLP)**. 
This plugin enables standardized, vendor-neutral log forwarding from Fluent Bit to any OTLP-compatible backend (VictoriaLogs, Loki, ClickHouse, etc.) with support for dynamic routing to multiple destinations based on Kubernetes cluster metadata. -The labels extracted will be `{team="x-men", container="valitail", pod="valitail-xxx", namespace="prod"}`. +## Overview -If you don't want the `kubernetes` and `HOSTNAME` fields to appear in the log line you can use the `RemoveKeys` configuration field. (e.g. `RemoveKeys kubernetes,HOSTNAME`). +The Gardener OTLP plugin represents Gardener's evolution toward **Observability 2.0**, embracing OpenTelemetry standards for unified, interoperable observability. It provides: -### Configuration examples +- **OpenTelemetry Protocol Support**: Native OTLP over gRPC and HTTP +- **Dynamic Multi-Target Routing**: Automatically routes logs to different backends based on Kubernetes cluster state +- **Persistent Buffering**: Disk-backed queuing (dque) prevents log loss during backend outages +- **Enterprise Features**: TLS/mTLS, rate limiting, retries with exponential backoff, and batch processing +- **Kubernetes-Native**: Seamless integration with Gardener's Shoot and Seed cluster architecture +- **Production-Ready**: Prometheus metrics, health checks, and pprof profiling support -To configure the Vali output plugin add this section to fluent-bit.conf +![Gardener OTLP Plugin Architecture](docs/images/gardener-logging-otlp-plugin.png) -```properties -[Output] - Name gardenervali - Match kubernetes.* - Url http://vali.garden.svc:3100/vali/api/v1/push - LogLevel info - BatchWait 40 - BatchSize 30720 - Labels {test="fluent-bit-go"} - LineFormat json - SortByTimestamp true - DropSingleKey false - AutoKubernetesLabels false - LabelSelector gardener.cloud/role:shoot - RemoveKeys kubernetes,stream,time,tag - LabelMapPath /fluent-bit/etc/kubernetes_label_map.json - DynamicHostPath {"kubernetes": {"namespace_name": "namespace"}} - DynamicHostPrefix 
http://vali. - DynamicHostSuffix .svc:3100/vali/api/v1/push - DynamicHostRegex ^shoot- - DynamicTenant user gardener user - MaxRetries 3 - Timeout 10 - MinBackoff 30 - Buffer true - BufferType dque - QueueDir /fluent-bit/buffers/operator - QueueSegmentSize 300 - QueueSync normal - QueueName gardener-kubernetes-operator - FallbackToTagWhenMetadataIsMissing true - TagKey tag - DropLogEntryWithoutK8sMetadata true +## Key Features + +### OTLP Protocol Support +- **OTLP/gRPC**: High-performance binary protocol with bi-directional streaming +- **OTLP/HTTP**: Firewall-friendly HTTP/1.1 and HTTP/2 support with JSON or Protobuf encoding +- **OpenTelemetry Standards**: Full compliance with OTLP log data model and semantic conventions + +### Dynamic Multi-Cluster Routing +- **Intelligent Routing**: Automatically determines target backends based on Kubernetes namespace and cluster state +- **Cluster State Awareness**: Routes logs differently based on cluster lifecycle (Ready, Hibernating, Deletion, etc.) 
+- **Seed and Shoot Support**: Separate client configurations for Gardener Seed and Shoot clusters + +### Reliability & Performance +- **Persistent Buffering**: Disk-based queue (dque) with configurable segment size and sync modes +- **Batch Processing**: Efficient batching with configurable size and timeout parameters +- **Retry Logic**: Exponential backoff with configurable initial/max intervals and elapsed time +- **Rate Limiting**: Optional throttling to prevent overwhelming backends +- **Backpressure Handling**: Queue-based buffering prevents memory exhaustion under high load + +### Security & Compliance +- **TLS/mTLS**: Full TLS 1.2+ support with certificate authentication +- **Secure Defaults**: TLS enabled by default with configurable certificate validation +- **Header-based Auth**: Custom headers support for token-based authentication + +### Observability +- **Prometheus Metrics**: Comprehensive metrics for monitoring client behavior, queue depth, export latency +- **Health Checks**: `/healthz` endpoint for liveness/readiness probes +- **Debug Support**: pprof profiling endpoints for troubleshooting + +## Quick Start + +### Installation + +```bash +# Build the plugin +make plugin + +# Run with Fluent Bit +fluent-bit -e ./build/output_plugin.so -c fluent-bit.conf ``` -```properties +### Basic Configuration + +```ini [Output] - Name gardenervali - Match journald.* - Url http://vali.garden.svc:3100/vali/api/v1/push + Name gardener + Match kubernetes.* + SeedType OTLPGRPC + Endpoint victorialogs.logging.svc:4317 LogLevel info - BatchWait 60 - BatchSize 30720 - Labels {test="fluent-bit-go"} - LineFormat json - SortByTimestamp true - DropSingleKey false - RemoveKeys kubernetes,stream,hostname,unit - LabelMapPath /fluent-bit/etc/systemd_label_map.json - MaxRetries 3 - Timeout 10 - MinBackoff 30 - Buffer true - BufferType dque - QueueDir /fluent-bit/buffers - QueueSegmentSize 300 - QueueSync normal - QueueName gardener-journald ``` -### Running multiple plugin 
instances +See [Usage Guide](docs/usage.md) for detailed installation and usage instructions. -You can run multiple plugin instances in the same fluent-bit process, for example if you want to push to different Vali servers or route logs into different Vali tenant IDs. To do so, add additional `[Output]` sections. +## Observability 2.0 -## Building +This plugin is part of Gardener's evolution toward **Observability 2.0**, a modern approach to unified observability using OpenTelemetry standards. -```bash -make plugin -``` +📖 **[Gardener Observability 2.0: A Unified Approach](docs/observability-2.0/Observability%202.0.md)** -## Prerequisites +### Roadmap Status -* Go 1.23+ -* gcc (for cgo) +The OTLP plugin implementation represents **Step 5** in Gardener's Observability 2.0 roadmap: -## Local +1. ✅ Gardener OpenTelemetry Collector Distribution +2. ✅ OpenTelemetry Operator for Seeds +3. 🔄 Migrate Shoot log shippers from Valitail to OTel Collector +4. 🔄 Format Fluent Bit inputs as OTLP logs +5. ✅ **OTLP in Fluent Bit Output - Gardener Logging Plugin** (This project) +6. 🔄 VictoriaLogs in Shoot Control Plane namespaces +7. 🔄 OTel Collector extension for Shoots +8.
🔄 Unified visualization -If you have Fluent Bit installed in your `$PATH` you can run the plugin using: +## Documentation -```bash -fluent-bit -e /path/to/built/out_vali.so -c fluent-bit.conf -``` +### User Guides +- **[Configuration Guide](docs/configuration.md)** - Complete configuration reference with examples +- **[Usage Guide](docs/usage.md)** - Installation and usage instructions +- **[Troubleshooting Guide](docs/troubleshooting.md)** - Common issues and solutions -You can also adapt your plugins.conf, removing the need to change the command line options: +### Technical Documentation +- **[Architecture](docs/architecture.md)** - Design and component details +- **[Monitoring & Metrics](docs/monitoring.md)** - Metrics, alerts, and observability +- **[Client Package Documentation](pkg/client/README.md)** - Detailed client implementation docs + +## Supported Backends + +This plugin is compatible with any backend that supports OTLP log ingestion: + +- **[VictoriaLogs](https://docs.victoriametrics.com/victorialogs/)** - High-performance log storage +- **[Loki](https://grafana.com/oss/loki/)** - Horizontally-scalable log aggregation +- **[ClickHouse](https://clickhouse.com/)** - Fast OLAP database +- **[OpenTelemetry Collector](https://opentelemetry.io/docs/collector/)** - Vendor-agnostic data pipeline + + +## Contributing + +Contributions are welcome! Please: + +1. Read [CONTRIBUTING.md](CONTRIBUTING.md) +2. Follow the [Gardener coding conventions](.github/copilot-instructions.md) +3. Submit a pull request with tests + +## License + +This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.
+ +## Links + +- **Gardener Project**: [https://gardener.cloud/](https://gardener.cloud/) +- **OpenTelemetry**: [https://opentelemetry.io/](https://opentelemetry.io/) +- **Documentation**: [docs/](docs/) +- **GitHub Issues**: [https://github.com/gardener/logging/issues](https://github.com/gardener/logging/issues) + +## Support + +For questions or issues: + +- **GitHub Issues**: [https://github.com/gardener/logging/issues](https://github.com/gardener/logging/issues) +- **Gardener Slack**: [#gardener](https://kubernetes.slack.com/messages/gardener) on Kubernetes Slack -```config -[PLUGINS] - Path /path/to/built/out_vali.so -``` diff --git a/VERSION b/VERSION index 337f7f445..74a91a8f3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v0.72.0-dev +v1.0.0-dev diff --git a/cmd/copy/copy.go b/cmd/copy/copy.go index 624f227a3..04c759514 100644 --- a/cmd/copy/copy.go +++ b/cmd/copy/copy.go @@ -1,5 +1,4 @@ -// SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Gardener contributors -// +// Copyright 2025 SPDX-FileCopyrightText: SAP SE or an SAP affiliate company and Gardener contributors // SPDX-License-Identifier: Apache-2.0 package main diff --git a/cmd/event-logger/app/options.go b/cmd/event-logger/app/options.go index f11ea2152..df695ac5d 100644 --- a/cmd/event-logger/app/options.go +++ b/cmd/event-logger/app/options.go @@ -1,5 +1,4 @@ -// SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Gardener contributors -// +// Copyright 2025 SPDX-FileCopyrightText: SAP SE or an SAP affiliate company and Gardener contributors // SPDX-License-Identifier: Apache-2.0 package app @@ -21,7 +20,7 @@ import ( "k8s.io/component-base/version/verflag" "sigs.k8s.io/controller-runtime/pkg/manager/signals" - "github.com/gardener/logging/pkg/events" + "github.com/gardener/logging/v1/pkg/events" ) // NewCommandStartGardenerEventLogger creates a *cobra.Command object with default parameters. 
diff --git a/cmd/event-logger/main.go b/cmd/event-logger/main.go index 20ed85357..27e5874a0 100644 --- a/cmd/event-logger/main.go +++ b/cmd/event-logger/main.go @@ -1,5 +1,4 @@ -// SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Gardener contributors -// +// Copyright 2025 SPDX-FileCopyrightText: SAP SE or an SAP affiliate company and Gardener contributors // SPDX-License-Identifier: Apache-2.0 package main @@ -7,7 +6,7 @@ package main import ( "os" - "github.com/gardener/logging/cmd/event-logger/app" + "github.com/gardener/logging/v1/cmd/event-logger/app" ) func main() { diff --git a/cmd/fluent-bit-output-plugin/config_dump.go b/cmd/fluent-bit-output-plugin/config_dump.go new file mode 100644 index 000000000..75d14f921 --- /dev/null +++ b/cmd/fluent-bit-output-plugin/config_dump.go @@ -0,0 +1,99 @@ +// Copyright 2025 SPDX-FileCopyrightText: SAP SE or an SAP affiliate company and Gardener contributors +// SPDX-License-Identifier: Apache-2.0 + +package main + +import ( + "fmt" + + "github.com/gardener/logging/v1/pkg/config" +) + +// dumpConfiguration logs the complete plugin configuration at debug level (V(1)). +// This is useful for troubleshooting configuration issues and verifying that +// all configuration values are correctly parsed and applied. 
+func dumpConfiguration(conf *config.Config) { + logger.V(1).Info("[flb-go] ===== Plugin Config =====") + + logger.V(1).Info("[flb-go]", "DropLogEntryWithoutK8sMetadata", fmt.Sprintf("%+v", conf.PluginConfig.KubernetesMetadata.DropLogEntryWithoutK8sMetadata)) + logger.V(1).Info("[flb-go]", "FallbackToTagWhenMetadataIsMissing", fmt.Sprintf("%+v", conf.PluginConfig.KubernetesMetadata.FallbackToTagWhenMetadataIsMissing)) + if len(conf.PluginConfig.HostnameValue) > 0 { + logger.V(1).Info("[flb-go]", "HostnameValue", conf.PluginConfig.HostnameValue) + } + logger.V(1).Info("[flb-go]", "LogLevel", conf.PluginConfig.LogLevel) + logger.V(1).Info("[flb-go]", "Pprof", fmt.Sprintf("%+v", conf.PluginConfig.Pprof)) + logger.V(1).Info("[flb-go]", "SeedType", conf.PluginConfig.SeedType) + logger.V(1).Info("[flb-go]", "ShootType", conf.PluginConfig.ShootType) + logger.V(1).Info("[flb-go]", "TagExpression", fmt.Sprintf("%+v", conf.PluginConfig.KubernetesMetadata.TagExpression)) + logger.V(1).Info("[flb-go]", "TagKey", fmt.Sprintf("%+v", conf.PluginConfig.KubernetesMetadata.TagKey)) + logger.V(1).Info("[flb-go]", "TagPrefix", fmt.Sprintf("%+v", conf.PluginConfig.KubernetesMetadata.TagPrefix)) + logger.V(1).Info("[flb-go]", "Origin", fmt.Sprintf("%+v", conf.PluginConfig.Origin)) + logger.V(1).Info("") + logger.V(1).Info("[flb-go] ===== Controller Config =====") + logger.V(1).Info("[flb-go]", "ControllerSyncTimeout", fmt.Sprintf("%+v", conf.ControllerConfig.CtlSyncTimeout.String())) + logger.V(1).Info("[flb-go]", "DynamicHostPath", fmt.Sprintf("%+v", conf.ControllerConfig.DynamicHostPath)) + logger.V(1).Info("[flb-go]", "DynamicHostPrefix", fmt.Sprintf("%+v", conf.ControllerConfig.DynamicHostPrefix)) + logger.V(1).Info("[flb-go]", "DynamicHostSuffix", fmt.Sprintf("%+v", conf.ControllerConfig.DynamicHostSuffix)) + logger.V(1).Info("[flb-go]", "DynamicHostRegex", fmt.Sprintf("%+v", conf.ControllerConfig.DynamicHostRegex)) + logger.V(1).Info("[flb-go]", 
"SendLogsToShootWhenIsInCreationState", fmt.Sprintf("%+v", conf.ControllerConfig.ShootControllerClientConfig.SendLogsWhenIsInCreationState)) + logger.V(1).Info("[flb-go]", "SendLogsToShootWhenIsInReadyState", fmt.Sprintf("%+v", conf.ControllerConfig.ShootControllerClientConfig.SendLogsWhenIsInReadyState)) + logger.V(1).Info("[flb-go]", "SendLogsToShootWhenIsInHibernatingState", fmt.Sprintf("%+v", conf.ControllerConfig.ShootControllerClientConfig.SendLogsWhenIsInHibernatingState)) + logger.V(1).Info("[flb-go]", "SendLogsToShootWhenIsInHibernatedState", fmt.Sprintf("%+v", conf.ControllerConfig.ShootControllerClientConfig.SendLogsWhenIsInHibernatedState)) + logger.V(1).Info("[flb-go]", "SendLogsToShootWhenIsInDeletionState", fmt.Sprintf("%+v", conf.ControllerConfig.ShootControllerClientConfig.SendLogsWhenIsInDeletionState)) + logger.V(1).Info("[flb-go]", "SendLogsToShootWhenIsInRestoreState", fmt.Sprintf("%+v", conf.ControllerConfig.ShootControllerClientConfig.SendLogsWhenIsInRestoreState)) + logger.V(1).Info("[flb-go]", "SendLogsToShootWhenIsInMigrationState", fmt.Sprintf("%+v", conf.ControllerConfig.ShootControllerClientConfig.SendLogsWhenIsInMigrationState)) + logger.V(1).Info("[flb-go]", "SendLogsToSeedWhenShootIsInCreationState", fmt.Sprintf("%+v", conf.ControllerConfig.SeedControllerClientConfig.SendLogsWhenIsInCreationState)) + logger.V(1).Info("[flb-go]", "SendLogsToSeedWhenShootIsInReadyState", fmt.Sprintf("%+v", conf.ControllerConfig.SeedControllerClientConfig.SendLogsWhenIsInReadyState)) + logger.V(1).Info("[flb-go]", "SendLogsToSeedWhenShootIsInHibernatingState", fmt.Sprintf("%+v", conf.ControllerConfig.SeedControllerClientConfig.SendLogsWhenIsInHibernatingState)) + logger.V(1).Info("[flb-go]", "SendLogsToSeedWhenShootIsInHibernatedState", fmt.Sprintf("%+v", conf.ControllerConfig.SeedControllerClientConfig.SendLogsWhenIsInHibernatedState)) + logger.V(1).Info("[flb-go]", "SendLogsToSeedWhenShootIsInDeletionState", fmt.Sprintf("%+v", 
conf.ControllerConfig.SeedControllerClientConfig.SendLogsWhenIsInDeletionState)) + logger.V(1).Info("[flb-go]", "SendLogsToSeedWhenShootIsInRestoreState", fmt.Sprintf("%+v", conf.ControllerConfig.SeedControllerClientConfig.SendLogsWhenIsInRestoreState)) + logger.V(1).Info("[flb-go]", "SendLogsToSeedWhenShootIsInMigrationState", fmt.Sprintf("%+v", conf.ControllerConfig.SeedControllerClientConfig.SendLogsWhenIsInMigrationState)) + logger.V(1).Info("") + logger.V(1).Info("[flb-go] ===== OTLP Config =====") + logger.V(1).Info("[flb-go]", "DQueDir", fmt.Sprintf("%+v", conf.OTLPConfig.DQueConfig.DQueDir)) + logger.V(1).Info("[flb-go]", "DQueSegmentSize", fmt.Sprintf("%+v", conf.OTLPConfig.DQueConfig.DQueSegmentSize)) + logger.V(1).Info("[flb-go]", "DQueSync", fmt.Sprintf("%+v", conf.OTLPConfig.DQueConfig.DQueSync)) + logger.V(1).Info("[flb-go]", "DQueName", fmt.Sprintf("%+v", conf.OTLPConfig.DQueConfig.DQueName)) // DQue Batch Processor configuration + logger.V(1).Info("[flb-go]", "DQueBatchProcessorMaxQueueSize", fmt.Sprintf("%+v", conf.OTLPConfig.DQueBatchProcessorMaxQueueSize)) + logger.V(1).Info("[flb-go]", "DQueBatchProcessorMaxBatchSize", fmt.Sprintf("%+v", conf.OTLPConfig.DQueBatchProcessorMaxBatchSize)) + logger.V(1).Info("[flb-go]", "DQueBatchProcessorExportTimeout", fmt.Sprintf("%+v", conf.OTLPConfig.DQueBatchProcessorExportTimeout)) + logger.V(1).Info("[flb-go]", "DQueBatchProcessorExportInterval", fmt.Sprintf("%+v", conf.OTLPConfig.DQueBatchProcessorExportInterval)) + logger.V(1).Info("[flb-go]", "DQueBatchProcessorExportBufferSize", fmt.Sprintf("%+v", conf.OTLPConfig.DQueBatchProcessorExportBufferSize)) + // OTLP general configuration + logger.V(1).Info("[flb-go]", "Endpoint", fmt.Sprintf("%+v", conf.OTLPConfig.Endpoint)) + logger.V(1).Info("[flb-go]", "EndpointUrl", fmt.Sprintf("%+v", conf.OTLPConfig.EndpointURL)) + logger.V(1).Info("[flb-go]", "EndpointUrlPath", fmt.Sprintf("%+v", conf.OTLPConfig.EndpointURLPath)) + + logger.V(1).Info("[flb-go]", 
"Insecure", fmt.Sprintf("%+v", conf.OTLPConfig.Insecure)) + logger.V(1).Info("[flb-go]", "Compression", fmt.Sprintf("%+v", conf.OTLPConfig.Compression)) + logger.V(1).Info("[flb-go]", "Timeout", fmt.Sprintf("%+v", conf.OTLPConfig.Timeout)) + + if len(conf.OTLPConfig.Headers) > 0 { + logger.V(1).Info("[flb-go]", "Headers", fmt.Sprintf("%+v", conf.OTLPConfig.Headers)) + } + // OTLP Client Retry configuration + logger.V(1).Info("[flb-go]", "RetryEnabled", fmt.Sprintf("%+v", conf.OTLPConfig.RetryEnabled)) + logger.V(1).Info("[flb-go]", "RetryInitialInterval", fmt.Sprintf("%+v", conf.OTLPConfig.RetryInitialInterval)) + logger.V(1).Info("[flb-go]", "RetryMaxInterval", fmt.Sprintf("%+v", conf.OTLPConfig.RetryMaxInterval)) + logger.V(1).Info("[flb-go]", "RetryMaxElapsedTime", fmt.Sprintf("%+v", conf.OTLPConfig.RetryMaxElapsedTime)) + if conf.OTLPConfig.RetryConfig != nil { + logger.V(1).Info("[flb-go]", "RetryConfig", "configured") + } + + // Throttle configuration + logger.V(1).Info("[flb-go]", "ThrottleEnabled", fmt.Sprintf("%+v", conf.OTLPConfig.ThrottleEnabled)) + logger.V(1).Info("[flb-go]", "ThrottlePeriod", fmt.Sprintf("%+v", conf.OTLPConfig.ThrottleRequestsPerSec)) + + // OTLP TLS configuration + logger.V(1).Info("[flb-go]", "TLSCertFile", fmt.Sprintf("%+v", conf.OTLPConfig.TLSCertFile)) + logger.V(1).Info("[flb-go]", "TLSKeyFile", fmt.Sprintf("%+v", conf.OTLPConfig.TLSKeyFile)) + logger.V(1).Info("[flb-go]", "TLSCAFile", fmt.Sprintf("%+v", conf.OTLPConfig.TLSCAFile)) + logger.V(1).Info("[flb-go]", "TLSServerName", fmt.Sprintf("%+v", conf.OTLPConfig.TLSServerName)) + logger.V(1).Info("[flb-go]", "TLSInsecureSkipVerify", fmt.Sprintf("%+v", conf.OTLPConfig.TLSInsecureSkipVerify)) + logger.V(1).Info("[flb-go]", "TLSMinVersion", fmt.Sprintf("%+v", conf.OTLPConfig.TLSMinVersion)) + logger.V(1).Info("[flb-go]", "TLSMaxVersion", fmt.Sprintf("%+v", conf.OTLPConfig.TLSMaxVersion)) + if conf.OTLPConfig.TLSConfig != nil { + logger.V(1).Info("[flb-go]", "TLSConfig", 
"configured") + } +} diff --git a/cmd/fluent-bit-output-plugin/kubernetes_client.go b/cmd/fluent-bit-output-plugin/kubernetes_client.go new file mode 100644 index 000000000..071c7f6ee --- /dev/null +++ b/cmd/fluent-bit-output-plugin/kubernetes_client.go @@ -0,0 +1,65 @@ +// Copyright 2025 SPDX-FileCopyrightText: SAP SE or an SAP affiliate company and Gardener contributors +// SPDX-License-Identifier: Apache-2.0 + +package main + +import ( + "fmt" + "os" + "time" + + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + + gardenerclientsetversioned "github.com/gardener/logging/v1/pkg/cluster/clientset/versioned" + gardeninternalcoreinformers "github.com/gardener/logging/v1/pkg/cluster/informers/externalversions" +) + +// inClusterKubernetesClient creates a Kubernetes client using in-cluster configuration. +// It returns nil if the in-cluster config is not available (e.g., when running outside a cluster). +func inClusterKubernetesClient() (gardenerclientsetversioned.Interface, error) { + c, err := rest.InClusterConfig() + if err != nil { + return nil, fmt.Errorf("failed to get incluster config: %v", err) + } + + return gardenerclientsetversioned.NewForConfig(c) +} + +// envKubernetesClient creates a Kubernetes client using the KUBECONFIG environment variable. +// It returns an error if the KUBECONFIG env var is not set or the config file is invalid. +func envKubernetesClient() (gardenerclientsetversioned.Interface, error) { + fromFlags, err := clientcmd.BuildConfigFromFlags("", os.Getenv("KUBECONFIG")) + if err != nil { + return nil, fmt.Errorf("failed to get kubeconfig from env: %v", err) + } + + return gardenerclientsetversioned.NewForConfig(fromFlags) +} + +// initClusterInformer initializes and starts the shared informer instance for Cluster resources. +// It first attempts to use in-cluster configuration, falling back to KUBECONFIG if that fails. +// The informer is used to watch for changes to Cluster resources when dynamic host paths are configured. 
+// This function panics if it cannot obtain a valid Kubernetes client from either source. +func initClusterInformer() { + if informer != nil && !informer.IsStopped() { + return + } + + var ( + err error + kubernetesClient gardenerclientsetversioned.Interface + ) + if kubernetesClient, _ = inClusterKubernetesClient(); kubernetesClient == nil { + logger.Info("[flb-go] failed to get in-cluster kubernetes client, trying KUBECONFIG env variable") + kubernetesClient, err = envKubernetesClient() + if err != nil { + panic(fmt.Errorf("failed to get kubernetes client, give up: %v", err)) + } + } + + kubeInformerFactory := gardeninternalcoreinformers.NewSharedInformerFactory(kubernetesClient, time.Second*30) + informer = kubeInformerFactory.Extensions().V1alpha1().Clusters().Informer() + informerStopChan = make(chan struct{}) + kubeInformerFactory.Start(informerStopChan) +} diff --git a/cmd/fluent-bit-output-plugin/output_plugin.go b/cmd/fluent-bit-output-plugin/output_plugin.go index c53a3bc22..1700f06d9 100644 --- a/cmd/fluent-bit-output-plugin/output_plugin.go +++ b/cmd/fluent-bit-output-plugin/output_plugin.go @@ -1,9 +1,5 @@ -/* -This file was copied from the credativ/vali project -https://github.com/credativ/vali/blob/v2.2.4/cmd/fluent-bit/out_vali.go - -Modifications Copyright SAP SE or an SAP affiliate company and Gardener contributors -*/ +// Copyright 2025 SPDX-FileCopyrightText: SAP SE or an SAP affiliate company and Gardener contributors +// SPDX-License-Identifier: Apache-2.0 package main @@ -12,207 +8,116 @@ import ( ) import ( + "errors" "fmt" "net/http" _ "net/http/pprof" "os" - "runtime" "strings" "sync" "time" "unsafe" "github.com/fluent/fluent-bit-go/output" - "github.com/go-kit/log" - "github.com/go-kit/log/level" + "github.com/go-logr/logr" "github.com/prometheus/client_golang/prometheus/promhttp" - "github.com/weaveworks/common/logging" "k8s.io/apimachinery/pkg/util/uuid" - "k8s.io/client-go/rest" "k8s.io/client-go/tools/cache" - 
"k8s.io/client-go/tools/clientcmd" "k8s.io/component-base/version" - gardenerclientsetversioned "github.com/gardener/logging/pkg/cluster/clientset/versioned" - gardeninternalcoreinformers "github.com/gardener/logging/pkg/cluster/informers/externalversions" - "github.com/gardener/logging/pkg/config" - "github.com/gardener/logging/pkg/healthz" - "github.com/gardener/logging/pkg/metrics" - "github.com/gardener/logging/pkg/plugin" + "github.com/gardener/logging/v1/pkg/config" + "github.com/gardener/logging/v1/pkg/healthz" + "github.com/gardener/logging/v1/pkg/log" + "github.com/gardener/logging/v1/pkg/metrics" + "github.com/gardener/logging/v1/pkg/plugin" + "github.com/gardener/logging/v1/pkg/types" ) var ( - // registered vali plugin instances, required for disposal during shutdown - pluginsMap map[string]plugin.OutputPlugin - pluginsMutex sync.RWMutex - logger log.Logger + // registered plugin instances, required for disposal during shutdown + // Uses sync.Map for concurrent-safe access without explicit locking + plugins sync.Map // map[string]plugin.OutputPlugin + logger logr.Logger informer cache.SharedIndexInformer informerStopChan chan struct{} pprofOnce sync.Once ) func init() { - var logLevel logging.Level - _ = logLevel.Set("info") - - logger = log.With(newLogger(logLevel), "ts", log.DefaultTimestampUTC) - _ = level.Info(logger). 
- Log( - "version", version.Get().GitVersion, - "revision", version.Get().GitCommit, - "gitTreeState", version.Get().GitTreeState, - ) - pluginsMutex = sync.RWMutex{} - pluginsMap = make(map[string]plugin.OutputPlugin) + logger = log.NewLogger("info") + logger.Info("Starting fluent-bit-gardener-output-plugin", + "version", version.Get().GitVersion, + "revision", version.Get().GitCommit, + "gitTreeState", version.Get().GitTreeState, + ) // metrics and healthz go func() { http.Handle("/metrics", promhttp.Handler()) http.Handle("/healthz", healthz.Handler("", "")) if err := http.ListenAndServe(":2021", nil); err != nil { - _ = level.Error(logger).Log("Fluent-bit-gardener-output-plugin", err.Error()) + logger.Error(err, "Fluent-bit-gardener-output-plugin") } }() -} - -// Initializes and starts the shared informer instance -func initClusterInformer() { - if informer != nil && !informer.IsStopped() { - return - } - var ( - err error - kubernetesClient gardenerclientsetversioned.Interface - ) - if kubernetesClient, _ = inClusterKubernetesClient(); kubernetesClient == nil { - _ = level.Debug(logger).Log("[flb-go]", "failed to get in-cluster kubernetes client, trying KUBECONFIG env variable") - kubernetesClient, err = envKubernetesClient() - if err != nil { - panic(fmt.Errorf("failed to get kubernetes client, give up: %v", err)) - } - } - - kubeInformerFactory := gardeninternalcoreinformers.NewSharedInformerFactory(kubernetesClient, time.Second*30) - informer = kubeInformerFactory.Extensions().V1alpha1().Clusters().Informer() - informerStopChan = make(chan struct{}) - kubeInformerFactory.Start(informerStopChan) -} - -func setPprofProfile() { - pprofOnce.Do(func() { - runtime.SetMutexProfileFraction(5) - runtime.SetBlockProfileRate(1) - }) -} - -type pluginConfig struct { - ctx unsafe.Pointer -} - -func (c *pluginConfig) Get(key string) string { - return output.FLBPluginConfigKey(c.ctx, key) -} - -// toStringMap converts the pluginConfig to a map[string]string for 
configuration parsing. -// It extracts all configuration values from the fluent-bit plugin context and returns them -// as a string map that can be used by the config parser. This is necessary because there -// is no direct C interface to retrieve the complete plugin configuration at once. -// -// When adding new configuration options to the plugin, the corresponding keys must be -// added to the configKeys slice below to ensure they are properly extracted. -func (c *pluginConfig) toStringMap() map[string]string { - configMap := make(map[string]string) - - // Define all possible configuration keys based on the structs and documentation - configKeys := []string{ - // Client config - "Url", "ProxyUrl", "TenantID", "BatchWait", "BatchSize", "Labels", "Timeout", "MinBackoff", "MaxBackoff", - "MaxRetries", - "SortByTimestamp", "NumberOfBatchIDs", "IdLabelName", - - // Plugin config - "AutoKubernetesLabels", "LineFormat", "DropSingleKey", "LabelKeys", "RemoveKeys", "LabelMapPath", - "DynamicHostPath", "DynamicHostPrefix", "DynamicHostSuffix", "DynamicHostRegex", - "LabelSetInitCapacity", "HostnameKey", "HostnameKeyValue", "HostnameValue", "PreservedLabels", "EnableMultiTenancy", - - // Kubernetes metadata - "FallbackToTagWhenMetadataIsMissing", "DropLogEntryWithoutK8sMetadata", - "TagKey", "TagPrefix", "TagExpression", - - // Buffer config - "Buffer", "BufferType", "QueueDir", "QueueSegmentSize", "QueueSync", "QueueName", - - // Controller config - "DeletedClientTimeExpiration", "ControllerSyncTimeout", - "SendLogsToMainClusterWhenIsInCreationState", "SendLogsToMainClusterWhenIsInReadyState", - "SendLogsToMainClusterWhenIsInHibernatingState", "SendLogsToMainClusterWhenIsInHibernatedState", - "SendLogsToMainClusterWhenIsInDeletionState", "SendLogsToMainClusterWhenIsInRestoreState", - "SendLogsToMainClusterWhenIsInMigrationState", - "SendLogsToDefaultClientWhenClusterIsInCreationState", "SendLogsToDefaultClientWhenClusterIsInReadyState", - 
"SendLogsToDefaultClientWhenClusterIsInHibernatingState", "SendLogsToDefaultClientWhenClusterIsInHibernatedState", - - // General config - "LogLevel", "Pprof", - } - - // Extract values for all known keys - for _, key := range configKeys { - if value := c.Get(key); value != "" { - configMap[key] = value - } - } - - return configMap + _ = os.Setenv("OTEL_GO_X_OBSERVABILITY", "true") } // FLBPluginRegister registers the plugin with fluent-bit // //export FLBPluginRegister func FLBPluginRegister(ctx unsafe.Pointer) int { - return output.FLBPluginRegister(ctx, "gardenervali", "Ship fluent-bit logs to an Output") + return output.FLBPluginRegister(ctx, "gardener", "Ship fluent-bit logs to an Output") } // FLBPluginInit is called for each vali plugin instance // Since fluent-bit 3, the context is recreated upon hot-reload. // Any plugin instances created before are not present in the new context, which may lead to memory leaks. -// The fluent-bit shall invoke // //export FLBPluginInit func FLBPluginInit(ctx unsafe.Pointer) int { // shall create only if not found in the context and in plugins slice if id := output.FLBPluginGetContext(ctx); id != nil && pluginsContains(id.(string)) { - _ = level.Info(logger).Log("[flb-go]", "outputPlugin already present") + logger.Info("[flb-go]", "outputPlugin already present") return output.FLB_OK } pluginCfg := &pluginConfig{ctx: ctx} - conf, err := config.ParseConfigFromStringMap(pluginCfg.toStringMap()) + configurationMap := pluginCfg.toStringMap() + logger.Info(fmt.Sprintf("plugin configuration: %v", configurationMap)) + cfg, err := config.ParseConfigFromStringMap(configurationMap) + if err != nil { metrics.Errors.WithLabelValues(metrics.ErrorFLBPluginInit).Inc() - _ = level.Error(logger).Log("[flb-go]", "failed to launch", "error", err) + logger.Info("[flb-go] failed to launch", "error", err) return output.FLB_ERROR } - if conf.Pprof { + if cfg.PluginConfig.LogLevel != "info" { + logger = log.NewLogger(cfg.PluginConfig.LogLevel) + } 
+ + dumpConfiguration(cfg) + + if cfg.PluginConfig.Pprof { setPprofProfile() } - if len(conf.PluginConfig.DynamicHostPath) > 0 { + if len(cfg.ControllerConfig.DynamicHostPath) > 0 { initClusterInformer() } id, _, _ := strings.Cut(string(uuid.NewUUID()), "-") - _logger := log.With(newLogger(conf.LogLevel), "ts", log.DefaultTimestampUTC, "id", id) - dumpConfiguration(_logger, conf) + // dump the complete configuration at debug level + // dumpConfiguration(cfg) - outputPlugin, err := plugin.NewPlugin(informer, conf, _logger) + outputPlugin, err := plugin.NewPlugin(informer, cfg, log.NewLogger(cfg.PluginConfig.LogLevel)) if err != nil { metrics.Errors.WithLabelValues(metrics.ErrorNewPlugin).Inc() - _ = level.Error(_logger).Log("[flb-go]", "error creating outputPlugin", "err", err) + logger.Error(err, "[flb-go] error creating output plugin", "id", id) return output.FLB_ERROR } @@ -220,11 +125,9 @@ func FLBPluginInit(ctx unsafe.Pointer) int { // register outputPlugin instance, to be retrievable when sending logs output.FLBPluginSetContext(ctx, id) // remember outputPlugin instance, required to cleanly dispose when fluent-bit is shutting down - pluginsMutex.Lock() - pluginsMap[id] = outputPlugin - pluginsMutex.Unlock() + pluginsSet(id, outputPlugin) - _ = level.Info(_logger).Log("[flb-go]", "output plugin initialized", "id", id, "count", len(pluginsMap)) + logger.Info("[flb-go] output plugin initialized", "id", id, "count", pluginsLen()) return output.FLB_OK } @@ -232,20 +135,18 @@ func FLBPluginInit(ctx unsafe.Pointer) int { // FLBPluginFlushCtx is called when the plugin is invoked to flush data // //export FLBPluginFlushCtx -func FLBPluginFlushCtx(ctx, data unsafe.Pointer, length C.int, tag *C.char) int { +func FLBPluginFlushCtx(ctx, data unsafe.Pointer, length C.int, _ *C.char) int { var id string var ok bool if id, ok = output.FLBPluginGetContext(ctx).(string); !ok { - _ = level.Error(logger).Log("msg", "output plugin id not found in context") + logger.Info("output 
plugin id not found in context") return output.FLB_ERROR } - pluginsMutex.RLock() - outputPlugin, ok := pluginsMap[id] - pluginsMutex.RUnlock() + outputPlugin, ok := pluginsGet(id) if !ok { metrics.Errors.WithLabelValues(metrics.ErrorFLBPluginFlushCtx).Inc() - _ = level.Error(logger).Log("[flb-go]", "outputPlugin not initialized") + logger.Error(errors.New("not found"), "outputPlugin not found in plugins map", "id", id) return output.FLB_ERROR } @@ -269,19 +170,18 @@ func FLBPluginFlushCtx(ctx, data unsafe.Pointer, length C.int, tag *C.char) int case uint64: timestamp = time.Unix(int64(t), 0) default: - _ = level.Info(logger).Log("[flb-go]", fmt.Sprintf("unknown timestamp type: %T", ts)) + logger.Info(fmt.Sprintf("[flb-go] unknown timestamp type: %T", ts)) timestamp = time.Now() } - err := outputPlugin.SendRecord(record, timestamp) - if err != nil { - _ = level.Error(logger).Log( - "[flb-go]", "error sending record, retrying...", - "tag", C.GoString(tag), - "err", err.Error(), - ) - - return output.FLB_RETRY // max retry of the outputPlugin is set to 3, then it shall be discarded by fluent-bit + // TODO: it shall also handle logs groups when opentelemetry envelope is enabled + // https://docs.fluentbit.io/manual/data-pipeline/processors/opentelemetry-envelope + l := types.OutputEntry{ + Timestamp: timestamp, + Record: toOutputRecord(record), + } + if err := outputPlugin.SendRecord(l); err != nil { + return output.FLB_RETRY } } @@ -300,22 +200,18 @@ func FLBPluginExitCtx(ctx unsafe.Pointer) int { var id string var ok bool if id, ok = output.FLBPluginGetContext(ctx).(string); !ok { - _ = level.Error(logger).Log("[flb-go]", "output plugin id not found in context") + logger.Error(errors.New("not found"), "outputPlugin not found in context") return output.FLB_ERROR } - pluginsMutex.RLock() - outputPlugin, ok := pluginsMap[id] - pluginsMutex.RUnlock() + outputPlugin, ok := pluginsGet(id) if !ok { - _ = level.Error(logger).Log("[flb-go]", "output plugin not known", "id", 
id) - return output.FLB_ERROR } outputPlugin.Close() pluginsRemove(id) - _ = level.Info(logger).Log("[flb-go]", "output plugin removed", "id", id, "count", len(pluginsMap)) + logger.Info("[flb-go] output plugin removed", "id", id, "count", pluginsLen()) return output.FLB_OK } @@ -324,9 +220,11 @@ func FLBPluginExitCtx(ctx unsafe.Pointer) int { // //export FLBPluginExit func FLBPluginExit() int { - for _, outputPlugin := range pluginsMap { - outputPlugin.Close() - } + plugins.Range(func(_, value any) bool { + value.(plugin.OutputPlugin).Close() + + return true + }) if informerStopChan != nil { close(informerStopChan) } @@ -334,108 +232,4 @@ func FLBPluginExit() int { return output.FLB_OK } -func pluginsContains(id string) bool { - pluginsMutex.RLock() - defer pluginsMutex.Unlock() - - return pluginsMap[id] != nil -} - -func pluginsRemove(id string) { - pluginsMutex.Lock() - defer pluginsMutex.Unlock() - delete(pluginsMap, id) -} - -func newLogger(logLevel logging.Level) log.Logger { - _logger := log.NewLogfmtLogger(log.NewSyncWriter(os.Stderr)) - _logger = level.NewFilter(_logger, logLevel.Gokit) - _logger = log.With(_logger, "caller", log.Caller(3)) - - return _logger -} - -func inClusterKubernetesClient() (gardenerclientsetversioned.Interface, error) { - c, err := rest.InClusterConfig() - if err != nil { - return nil, fmt.Errorf("failed to get incluster config: %v", err) - } - - return gardenerclientsetversioned.NewForConfig(c) -} - -func envKubernetesClient() (gardenerclientsetversioned.Interface, error) { - fromFlags, err := clientcmd.BuildConfigFromFlags("", os.Getenv("KUBECONFIG")) - if err != nil { - return nil, fmt.Errorf("failed to get kubeconfig from env: %v", err) - } - - return gardenerclientsetversioned.NewForConfig(fromFlags) -} - func main() {} - -func dumpConfiguration(_logger log.Logger, conf *config.Config) { - paramLogger := log.With(_logger, "[flb-go]", "provided parameter") - _ = level.Debug(paramLogger).Log("URL", 
conf.ClientConfig.CredativValiConfig.URL) - _ = level.Debug(paramLogger).Log("ProxyURL", conf.ClientConfig.CredativValiConfig.Client.ProxyURL.URL) - _ = level.Debug(paramLogger).Log("TenantID", conf.ClientConfig.CredativValiConfig.TenantID) - _ = level.Debug(paramLogger).Log("BatchWait", conf.ClientConfig.CredativValiConfig.BatchWait) - _ = level.Debug(paramLogger).Log("BatchSize", conf.ClientConfig.CredativValiConfig.BatchSize) - _ = level.Debug(paramLogger).Log("Labels", conf.ClientConfig.CredativValiConfig.ExternalLabels) - _ = level.Debug(paramLogger).Log("LogLevel", conf.LogLevel.String()) - _ = level.Debug(paramLogger).Log("AutoKubernetesLabels", conf.PluginConfig.AutoKubernetesLabels) - _ = level.Debug(paramLogger).Log("RemoveKeys", fmt.Sprintf("%+v", conf.PluginConfig.RemoveKeys)) - _ = level.Debug(paramLogger).Log("LabelKeys", fmt.Sprintf("%+v", conf.PluginConfig.LabelKeys)) - _ = level.Debug(paramLogger).Log("LineFormat", conf.PluginConfig.LineFormat) - _ = level.Debug(paramLogger).Log("DropSingleKey", conf.PluginConfig.DropSingleKey) - _ = level.Debug(paramLogger).Log("LabelMapPath", fmt.Sprintf("%+v", conf.PluginConfig.LabelMap)) - _ = level.Debug(paramLogger).Log("SortByTimestamp", fmt.Sprintf("%+v", conf.ClientConfig.SortByTimestamp)) - _ = level.Debug(paramLogger).Log("DynamicHostPath", fmt.Sprintf("%+v", conf.PluginConfig.DynamicHostPath)) - _ = level.Debug(paramLogger).Log("DynamicHostPrefix", fmt.Sprintf("%+v", conf.ControllerConfig.DynamicHostPrefix)) - _ = level.Debug(paramLogger).Log("DynamicHostSuffix", fmt.Sprintf("%+v", conf.ControllerConfig.DynamicHostSuffix)) - _ = level.Debug(paramLogger).Log("DynamicHostRegex", fmt.Sprintf("%+v", conf.PluginConfig.DynamicHostRegex)) - _ = level.Debug(paramLogger).Log("Timeout", fmt.Sprintf("%+v", conf.ClientConfig.CredativValiConfig.Timeout)) - _ = level.Debug(paramLogger).Log("MinBackoff", fmt.Sprintf("%+v", conf.ClientConfig.CredativValiConfig.BackoffConfig.MinBackoff)) - _ = 
level.Debug(paramLogger).Log("MaxBackoff", fmt.Sprintf("%+v", conf.ClientConfig.CredativValiConfig.BackoffConfig.MaxBackoff)) - _ = level.Debug(paramLogger).Log("MaxRetries", fmt.Sprintf("%+v", conf.ClientConfig.CredativValiConfig.BackoffConfig.MaxRetries)) - _ = level.Debug(paramLogger).Log("Buffer", fmt.Sprintf("%+v", conf.ClientConfig.BufferConfig.Buffer)) - _ = level.Debug(paramLogger).Log("BufferType", fmt.Sprintf("%+v", conf.ClientConfig.BufferConfig.BufferType)) - _ = level.Debug(paramLogger).Log("QueueDir", fmt.Sprintf("%+v", conf.ClientConfig.BufferConfig.DqueConfig.QueueDir)) - _ = level.Debug(paramLogger).Log("QueueSegmentSize", fmt.Sprintf("%+v", conf.ClientConfig.BufferConfig.DqueConfig.QueueSegmentSize)) - _ = level.Debug(paramLogger).Log("QueueSync", fmt.Sprintf("%+v", conf.ClientConfig.BufferConfig.DqueConfig.QueueSync)) - _ = level.Debug(paramLogger).Log("QueueName", fmt.Sprintf("%+v", conf.ClientConfig.BufferConfig.DqueConfig.QueueName)) - _ = level.Debug(paramLogger).Log("FallbackToTagWhenMetadataIsMissing", fmt.Sprintf("%+v", conf.PluginConfig.KubernetesMetadata.FallbackToTagWhenMetadataIsMissing)) - _ = level.Debug(paramLogger).Log("TagKey", fmt.Sprintf("%+v", conf.PluginConfig.KubernetesMetadata.TagKey)) - _ = level.Debug(paramLogger).Log("TagPrefix", fmt.Sprintf("%+v", conf.PluginConfig.KubernetesMetadata.TagPrefix)) - _ = level.Debug(paramLogger).Log("TagExpression", fmt.Sprintf("%+v", conf.PluginConfig.KubernetesMetadata.TagExpression)) - _ = level.Debug(paramLogger).Log("DropLogEntryWithoutK8sMetadata", fmt.Sprintf("%+v", conf.PluginConfig.KubernetesMetadata.DropLogEntryWithoutK8sMetadata)) - _ = level.Debug(paramLogger).Log("NumberOfBatchIDs", fmt.Sprintf("%+v", conf.ClientConfig.NumberOfBatchIDs)) - _ = level.Debug(paramLogger).Log("IdLabelName", fmt.Sprintf("%+v", conf.ClientConfig.IDLabelName)) - _ = level.Debug(paramLogger).Log("DeletedClientTimeExpiration", fmt.Sprintf("%+v", conf.ControllerConfig.DeletedClientTimeExpiration)) - _ = 
level.Debug(paramLogger).Log("Pprof", fmt.Sprintf("%+v", conf.Pprof)) - if len(conf.PluginConfig.HostnameKey) > 0 { - _ = level.Debug(paramLogger).Log("HostnameKey", conf.PluginConfig.HostnameKey) - } - if len(conf.PluginConfig.HostnameValue) > 0 { - _ = level.Debug(paramLogger).Log("HostnameValue", conf.PluginConfig.HostnameValue) - } - if conf.PluginConfig.PreservedLabels != nil { - _ = level.Debug(paramLogger).Log("PreservedLabels", fmt.Sprintf("%+v", conf.PluginConfig.PreservedLabels)) - } - _ = level.Debug(paramLogger).Log("LabelSetInitCapacity", fmt.Sprintf("%+v", conf.PluginConfig.LabelSetInitCapacity)) - _ = level.Debug(paramLogger).Log("SendLogsToMainClusterWhenIsInCreationState", fmt.Sprintf("%+v", conf.ControllerConfig.ShootControllerClientConfig.SendLogsWhenIsInCreationState)) - _ = level.Debug(paramLogger).Log("SendLogsToMainClusterWhenIsInReadyState", fmt.Sprintf("%+v", conf.ControllerConfig.ShootControllerClientConfig.SendLogsWhenIsInReadyState)) - _ = level.Debug(paramLogger).Log("SendLogsToMainClusterWhenIsInHibernatingState", fmt.Sprintf("%+v", conf.ControllerConfig.ShootControllerClientConfig.SendLogsWhenIsInHibernatingState)) - _ = level.Debug(paramLogger).Log("SendLogsToMainClusterWhenIsInHibernatedState", fmt.Sprintf("%+v", conf.ControllerConfig.ShootControllerClientConfig.SendLogsWhenIsInHibernatedState)) - _ = level.Debug(paramLogger).Log("SendLogsToMainClusterWhenIsInDeletionState", fmt.Sprintf("%+v", conf.ControllerConfig.ShootControllerClientConfig.SendLogsWhenIsInDeletionState)) - _ = level.Debug(paramLogger).Log("SendLogsToMainClusterWhenIsInRestoreState", fmt.Sprintf("%+v", conf.ControllerConfig.ShootControllerClientConfig.SendLogsWhenIsInRestoreState)) - _ = level.Debug(paramLogger).Log("SendLogsToMainClusterWhenIsInMigrationState", fmt.Sprintf("%+v", conf.ControllerConfig.ShootControllerClientConfig.SendLogsWhenIsInMigrationState)) - _ = level.Debug(paramLogger).Log("SendLogsToDefaultClientWhenClusterIsInCreationState", 
fmt.Sprintf("%+v", conf.ControllerConfig.SeedControllerClientConfig.SendLogsWhenIsInCreationState)) - _ = level.Debug(paramLogger).Log("SendLogsToDefaultClientWhenClusterIsInReadyState", fmt.Sprintf("%+v", conf.ControllerConfig.SeedControllerClientConfig.SendLogsWhenIsInReadyState)) - _ = level.Debug(paramLogger).Log("SendLogsToDefaultClientWhenClusterIsInHibernatingState", fmt.Sprintf("%+v", conf.ControllerConfig.SeedControllerClientConfig.SendLogsWhenIsInHibernatingState)) - _ = level.Debug(paramLogger).Log("SendLogsToDefaultClientWhenClusterIsInHibernatedState", fmt.Sprintf("%+v", conf.ControllerConfig.SeedControllerClientConfig.SendLogsWhenIsInHibernatedState)) - _ = level.Debug(paramLogger).Log("SendLogsToDefaultClientWhenClusterIsInDeletionState", fmt.Sprintf("%+v", conf.ControllerConfig.SeedControllerClientConfig.SendLogsWhenIsInDeletionState)) - _ = level.Debug(paramLogger).Log("SendLogsToDefaultClientWhenClusterIsInRestoreState", fmt.Sprintf("%+v", conf.ControllerConfig.SeedControllerClientConfig.SendLogsWhenIsInRestoreState)) - _ = level.Debug(paramLogger).Log("SendLogsToDefaultClientWhenClusterIsInMigrationState", fmt.Sprintf("%+v", conf.ControllerConfig.SeedControllerClientConfig.SendLogsWhenIsInMigrationState)) -} diff --git a/cmd/fluent-bit-output-plugin/output_plugin.h b/cmd/fluent-bit-output-plugin/output_plugin.h index b7848912f..54864edc5 100644 --- a/cmd/fluent-bit-output-plugin/output_plugin.h +++ b/cmd/fluent-bit-output-plugin/output_plugin.h @@ -5,8 +5,6 @@ /* Code generated by cmd/cgo; DO NOT EDIT. 
*/ -/* package github.com/credativ/vali/cmd/fluent-bit */ - #line 1 "cgo-builtin-export-prolog" diff --git a/cmd/fluent-bit-output-plugin/plugin_config.go b/cmd/fluent-bit-output-plugin/plugin_config.go new file mode 100644 index 000000000..ea00ad78a --- /dev/null +++ b/cmd/fluent-bit-output-plugin/plugin_config.go @@ -0,0 +1,137 @@ +// Copyright 2025 SPDX-FileCopyrightText: SAP SE or an SAP affiliate company and Gardener contributors +// SPDX-License-Identifier: Apache-2.0 + +package main + +import ( + "strings" + "unsafe" + + "github.com/fluent/fluent-bit-go/output" +) + +type pluginConfig struct { + ctx unsafe.Pointer +} + +func (c *pluginConfig) Get(key string) string { + return output.FLBPluginConfigKey(c.ctx, key) +} + +// toStringMap converts the pluginConfig to a map[string]string for configuration parsing. +// It extracts all configuration values from the fluent-bit plugin context and returns them +// as a string map that can be used by the config parser. This is necessary because there +// is no direct C interface to retrieve the complete plugin configuration at once. +// +// When adding new configuration options to the plugin, the corresponding keys must be +// added to the configKeys slice below to ensure they are properly extracted. 
+func (c *pluginConfig) toStringMap() map[string]string { + configMap := make(map[string]string) + + // Define all possible configuration keys based on the structs and documentation + configKeys := []string{ + // Client types + "SeedType", "seedType", "seed_type", + "ShootType", "shootType", "shoot_type", + + // Plugin config + "DynamicHostPath", "dynamicHostPath", "dynamic_host_path", + "DynamicHostPrefix", "dynamicHostPrefix", "dynamic_host_prefix", + "DynamicHostSuffix", "dynamicHostSuffix", "dynamic_host_suffix", + "DynamicHostRegex", "dynamicHostRegex", "dynamic_host_regex", + + "HostnameValue", "hostnameValue", "hostname_value", + "Origin", "origin", + + // Kubernetes metadata - TODO: revisit how to handle kubernetes metadata. Simplify? + "FallbackToTagWhenMetadataIsMissing", "fallbackToTagWhenMetadataIsMissing", "fallback_to_tag_when_metadata_is_missing", + "DropLogEntryWithoutK8sMetadata", "dropLogEntryWithoutK8sMetadata", "drop_log_entry_without_k8s_metadata", + "TagKey", "tagKey", "tag_key", + "TagPrefix", "tagPrefix", "tag_prefix", + "TagExpression", "tagExpression", "tag_expression", + + // Dque config + "DQueDir", "dqueDir", "dque_dir", + "DQueSegmentSize", "dqueSegmentSize", "dque_segment_size", + "DQueSync", "dqueSync", "dque_sync", + "DQueName", " dqueName", "dque_name", + + // Controller config + "DeletedClientTimeExpiration", "deletedClientTimeExpiration", "deleted_client_time_expiration", + "ControllerSyncTimeout", "controllerSyncTimeout", "controller_sync_timeout", + + // Log flows depending on cluster state + // Shoot client config + "SendLogsToShootWhenIsInCreationState", "sendLogsToShootWhenIsInCreationState", "send_logs_to_shoot_when_is_in_creation_state", + "SendLogsToShootWhenIsInReadyState", "sendLogsToShootWhenIsInReadyState", "send_logs_to_shoot_when_is_in_ready_state", + "SendLogsToShootWhenIsInHibernatingState", "sendLogsToShootWhenIsInHibernatingState", "send_logs_to_shoot_when_is_in_hibernating_state", + 
"SendLogsToShootWhenIsInHibernatedState", "sendLogsToShootWhenIsInHibernatedState", "send_logs_to_shoot_when_is_in_hibernated_state", + "SendLogsToShootWhenIsInWakingState", "sendLogsToShootWhenIsInWakingState", "send_logs_to_shoot_when_is_in_waking_state", + "SendLogsToShootWhenIsInDeletionState", "sendLogsToShootWhenIsInDeletionState", "send_logs_to_shoot_when_is_in_deletion_state", + "SendLogsToShootWhenIsInDeletedState", "sendLogsToShootWhenIsInDeletedState", "send_logs_to_shoot_when_is_in_deleted_state", + "SendLogsToShootWhenIsInRestoreState", "sendLogsToShootWhenIsInRestoreState", "send_logs_to_shoot_when_is_in_restore_state", + "SendLogsToShootWhenIsInMigrationState", "sendLogsToShootWhenIsInMigrationState", "send_logs_to_shoot_when_is_in_migration_state", + + // Seed client config for shoots with dynamic hostnames + "SendLogsToSeedWhenShootIsInCreationState", "sendLogsToSeedWhenShootIsInCreationState", "send_logs_to_seed_when_shoot_is_in_creation_state", + "SendLogsToSeedWhenShootIsInReadyState", "sendLogsToSeedWhenShootIsInReadyState", "send_logs_to_seed_when_shoot_is_in_ready_state", + "SendLogsToSeedWhenShootIsInHibernatingState", "sendLogsToSeedWhenShootIsInHibernatingState", "send_logs_to_seed_when_shoot_is_in_hibernating_state", + "SendLogsToSeedWhenShootIsInHibernatedState", "sendLogsToSeedWhenShootIsInHibernatedState", "send_logs_to_seed_when_shoot_is_in_hibernated_state", + "SendLogsToSeedWhenShootIsInWakingState", "sendLogsToSeedWhenShootIsInWakingState", "send_logs_to_seed_when_shoot_is_in_waking_state", + "SendLogsToSeedWhenShootIsInDeletionState", "sendLogsToSeedWhenShootIsInDeletionState", "send_logs_to_seed_when_shoot_is_in_deletion_state", + "SendLogsToSeedWhenShootIsInDeletedState", "sendLogsToSeedWhenShootIsInDeletedState", "send_logs_to_seed_when_shoot_is_in_deleted_state", + "SendLogsToSeedWhenShootIsInRestoreState", "sendLogsToSeedWhenShootIsInRestoreState", "send_logs_to_seed_when_shoot_is_in_restore_state", + 
"SendLogsToSeedWhenShootIsInMigrationState", "sendLogsToSeedWhenShootIsInMigrationState", "send_logs_to_seed_when_shoot_is_in_migration_state", + + // Common OTLP configs + "Endpoint", "endpoint", + "EndpointUrl", "endpointUrl", "endpoint_url", + "EndpointUrlPath:", "endpointUrlPath", "endpoint_url_path", + "Insecure", "insecure", + "Compression", "compression", + "Timeout", "timeout", + "Headers", "headers", + + // OTLP Retry configs + "RetryEnabled", "retryEnabled", "retry_enabled", + "RetryInitialInterval", "retryInitialInterval", "retry_initial_interval", + "RetryMaxInterval", "retryMaxInterval", "retry_max_interval", + "RetryMaxElapsedTime", "retryMaxElapsedTime", "retry_max_elapsed_time", + + // OTLP HTTP specific configs + "HTTPPath", "httpPath", "http_path", + "HTTPProxy", "httpProxy", "http_proxy", + + // OTLP TLS configs + "TLSCertFile", "tlsCertFile", "tls_cert_file", + "TLSKeyFile", "tlsKeyFile", "tls_key_file", + "TLSCAFile", "tlsCAFile", "tls_ca_file", + "TLSServerName", "tlsServerName", "tls_server_name", + "TLSInsecureSkipVerify", "tlsInsecureSkipVerify", "tls_insecure_skip_verify", + "TLSMinVersion", "tlsMinVersion", "tls_min_version", + "TLSMaxVersion", "tlsMaxVersion", "tls_max_version", + + "ThrottleEnabled", "throttleEnabled", "throttle_enabled", + "ThrottleRequestsPerSec", "throttleRequestsPerSec", "throttle_requests_per_sec", + + // OTLP Batch Processor configs + "DQueBatchProcessorMaxQueueSize", "dqueBatchProcessorMaxQueueSize", "dque_batch_processor_max_queue_size", + "DQueBatchProcessorMaxBatchSize", "dqueBatchProcessorMaxBatchSize", "dque_batch_processor_max_batch_size", + "DQueBatchProcessorExportTimeout", "dqueBatchProcessorExportTimeout", "dque_batch_processor_export_timeout", + "DQueBatchProcessorExportInterval", "dqueBatchProcessorExportInterval", "dque_batch_processor_export_interval", + "DQueBatchProcessorExportBufferSize", "dqueBatchProcessorExportBufferSize", "dque_batch_processor_export_buffer_size", + + // General config + 
"LogLevel", "logLevel", "log_level", + "Pprof", "pprof", + } + + // Extract values for all known keys + for _, key := range configKeys { + if value := c.Get(key); value != "" { + configMap[strings.ToLower(strings.ReplaceAll(key, "_", ""))] = value + } + } + + return configMap +} diff --git a/cmd/fluent-bit-output-plugin/plugin_registry.go b/cmd/fluent-bit-output-plugin/plugin_registry.go new file mode 100644 index 000000000..e9b5f4396 --- /dev/null +++ b/cmd/fluent-bit-output-plugin/plugin_registry.go @@ -0,0 +1,58 @@ +// Copyright 2025 SPDX-FileCopyrightText: SAP SE or an SAP affiliate company and Gardener contributors +// SPDX-License-Identifier: Apache-2.0 + +package main + +import ( + "github.com/gardener/logging/v1/pkg/plugin" +) + +// pluginsContains checks if a plugin with the given id exists in the plugins map. +// Uses sync.Map's Load method which is concurrent-safe. +func pluginsContains(id string) bool { + _, ok := plugins.Load(id) + + return ok +} + +// pluginsGet retrieves a plugin with the given id from the plugins map. +// Returns the plugin and a boolean indicating whether it was found. +// Uses sync.Map's Load method which is concurrent-safe. +func pluginsGet(id string) (plugin.OutputPlugin, bool) { + val, ok := plugins.Load(id) + if !ok { + return nil, false + } + + p, ok := val.(plugin.OutputPlugin) + if !ok { + return nil, false + } + + return p, ok +} + +// pluginsSet stores a plugin with the given id in the plugins map. +// Uses sync.Map's Store method which is concurrent-safe. +func pluginsSet(id string, p plugin.OutputPlugin) { + plugins.Store(id, p) +} + +// pluginsRemove removes a plugin with the given id from the plugins map. +// Uses sync.Map's Delete method which is concurrent-safe. +func pluginsRemove(id string) { + plugins.Delete(id) +} + +// pluginsLen returns the number of plugins in the plugins map. +// Uses sync.Map's Range method to count entries. 
+func pluginsLen() int { + count := 0 + plugins.Range(func(_, _ any) bool { + count++ + + return true + }) + + return count +} diff --git a/cmd/fluent-bit-output-plugin/pprof.go b/cmd/fluent-bit-output-plugin/pprof.go new file mode 100644 index 000000000..3c6d39d13 --- /dev/null +++ b/cmd/fluent-bit-output-plugin/pprof.go @@ -0,0 +1,18 @@ +// Copyright 2025 SPDX-FileCopyrightText: SAP SE or an SAP affiliate company and Gardener contributors +// SPDX-License-Identifier: Apache-2.0 + +package main + +import ( + "runtime" +) + +// setPprofProfile configures pprof profiling settings. +// It uses sync.Once to ensure these settings are only configured once during the plugin's lifetime. +// This function enables mutex profiling at 1/5 fraction and block profiling for performance analysis. +func setPprofProfile() { + pprofOnce.Do(func() { + runtime.SetMutexProfileFraction(5) + runtime.SetBlockProfileRate(1) + }) +} diff --git a/cmd/fluent-bit-output-plugin/record_converter.go b/cmd/fluent-bit-output-plugin/record_converter.go new file mode 100644 index 000000000..cc2047093 --- /dev/null +++ b/cmd/fluent-bit-output-plugin/record_converter.go @@ -0,0 +1,63 @@ +// Copyright 2025 SPDX-FileCopyrightText: SAP SE or an SAP affiliate company and Gardener contributors +// SPDX-License-Identifier: Apache-2.0 + +package main + +import ( + "fmt" + + "github.com/gardener/logging/v1/pkg/metrics" +) + +// toOutputRecord converts fluent-bit's map[any]any to types.OutputRecord. +// It recursively processes nested structures and converts byte arrays to strings. +// Entries with non-string keys are dropped and logged as warnings with metrics. 
+func toOutputRecord(record map[any]any) map[string]any { + m := make(map[string]any, len(record)) + for k, v := range record { + key, ok := k.(string) + if !ok { + logger.V(2).Info("dropping record entry with non-string key", "keyType", fmt.Sprintf("%T", k)) + metrics.Errors.WithLabelValues(metrics.ErrorInvalidRecordKey).Inc() + + continue + } + + switch t := v.(type) { + case []byte: + m[key] = string(t) + case map[any]any: + m[key] = toOutputRecord(t) + case []any: + m[key] = toSlice(t) + default: + m[key] = v + } + } + + return m +} + +// toSlice recursively converts []any, handling nested structures and byte arrays. +// It maintains the same conversion logic as toOutputRecord for consistency. +func toSlice(slice []any) []any { + if len(slice) == 0 { + return slice + } + + s := make([]any, 0, len(slice)) + for _, v := range slice { + switch t := v.(type) { + case []byte: + s = append(s, string(t)) + case map[any]any: + s = append(s, toOutputRecord(t)) + case []any: + s = append(s, toSlice(t)) + default: + s = append(s, t) + } + } + + return s +} diff --git a/cmd/fluent-bit-output-plugin/record_converter_test.go b/cmd/fluent-bit-output-plugin/record_converter_test.go new file mode 100644 index 000000000..fe338b041 --- /dev/null +++ b/cmd/fluent-bit-output-plugin/record_converter_test.go @@ -0,0 +1,523 @@ +// Copyright 2025 SPDX-FileCopyrightText: SAP SE or an SAP affiliate company and Gardener contributors +// SPDX-License-Identifier: Apache-2.0 + +package main + +import ( + "testing" + + "github.com/go-logr/logr" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func TestRecordConverter(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "RecordConverter Suite") +} + +var _ = BeforeSuite(func() { + // Initialize logger for tests + logger = logr.Discard() +}) + +var _ = Describe("toOutputRecord", func() { + Context("when converting simple records", func() { + It("should convert string values", func() { + input := map[any]any{ + "message": "test message", + "level": "info", + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKeyWithValue("message", "test message")) + Expect(result).To(HaveKeyWithValue("level", "info")) + }) + + It("should convert numeric values", func() { + input := map[any]any{ + "count": 42, + "duration": 3.14, + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKeyWithValue("count", 42)) + Expect(result).To(HaveKeyWithValue("duration", 3.14)) + }) + + It("should convert boolean values", func() { + input := map[any]any{ + "enabled": true, + "debug": false, + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKeyWithValue("enabled", true)) + Expect(result).To(HaveKeyWithValue("debug", false)) + }) + + It("should convert nil values", func() { + input := map[any]any{ + "nullValue": nil, + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKey("nullValue")) + Expect(result["nullValue"]).To(BeNil()) + }) + }) + + Context("when converting byte arrays", func() { + It("should convert []byte to string", func() { + input := map[any]any{ + "data": []byte("binary data"), + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKeyWithValue("data", "binary data")) + }) + + It("should convert empty []byte to empty string", func() { + input := map[any]any{ + "empty": []byte{}, + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKeyWithValue("empty", "")) + }) + + It("should handle []byte with special characters", func() { + input := map[any]any{ + "special": []byte("line1\nline2\ttab"), + } + + result := 
toOutputRecord(input) + + Expect(result).To(HaveKeyWithValue("special", "line1\nline2\ttab")) + }) + }) + + Context("when converting nested maps", func() { + It("should recursively convert nested map[any]any", func() { + input := map[any]any{ + "outer": map[any]any{ + "inner": "value", + }, + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKey("outer")) + nested, ok := result["outer"].(map[string]any) + Expect(ok).To(BeTrue()) + Expect(nested).To(HaveKeyWithValue("inner", "value")) + }) + + It("should handle deeply nested maps", func() { + input := map[any]any{ + "level1": map[any]any{ + "level2": map[any]any{ + "level3": map[any]any{ + "deep": "value", + }, + }, + }, + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKey("level1")) + level1, ok := result["level1"].(map[string]any) + Expect(ok).To(BeTrue()) + Expect(level1).To(HaveKey("level2")) + level2, ok := level1["level2"].(map[string]any) + Expect(ok).To(BeTrue()) + Expect(level2).To(HaveKey("level3")) + level3, ok := level2["level3"].(map[string]any) + Expect(ok).To(BeTrue()) + Expect(level3).To(HaveKeyWithValue("deep", "value")) + }) + + It("should convert []byte in nested maps", func() { + input := map[any]any{ + "kubernetes": map[any]any{ + "pod_name": []byte("my-pod"), + }, + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKey("kubernetes")) + k8s, ok := result["kubernetes"].(map[string]any) + Expect(ok).To(BeTrue()) + Expect(k8s).To(HaveKeyWithValue("pod_name", "my-pod")) + }) + }) + + Context("when converting arrays", func() { + It("should convert []any with simple values", func() { + input := map[any]any{ + "items": []any{"item1", "item2", "item3"}, + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKey("items")) + items, ok := result["items"].([]any) + Expect(ok).To(BeTrue()) + Expect(items).To(Equal([]any{"item1", "item2", "item3"})) + }) + + It("should convert []byte within arrays", func() { + input := map[any]any{ + "data": 
[]any{[]byte("first"), []byte("second")}, + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKey("data")) + data, ok := result["data"].([]any) + Expect(ok).To(BeTrue()) + Expect(data).To(Equal([]any{"first", "second"})) + }) + + It("should recursively convert nested arrays", func() { + input := map[any]any{ + "matrix": []any{ + []any{1, 2, 3}, + []any{4, 5, 6}, + }, + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKey("matrix")) + matrix, ok := result["matrix"].([]any) + Expect(ok).To(BeTrue()) + Expect(matrix).To(HaveLen(2)) + row1, ok := matrix[0].([]any) + Expect(ok).To(BeTrue()) + Expect(row1).To(Equal([]any{1, 2, 3})) + }) + + It("should convert maps within arrays", func() { + input := map[any]any{ + "objects": []any{ + map[any]any{"name": "obj1"}, + map[any]any{"name": "obj2"}, + }, + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKey("objects")) + objects, ok := result["objects"].([]any) + Expect(ok).To(BeTrue()) + Expect(objects).To(HaveLen(2)) + obj1, ok := objects[0].(map[string]any) + Expect(ok).To(BeTrue()) + Expect(obj1).To(HaveKeyWithValue("name", "obj1")) + }) + }) + + Context("when handling non-string keys", func() { + It("should drop entries with integer keys", func() { + input := map[any]any{ + "valid": "keep", + 123: "drop", + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKeyWithValue("valid", "keep")) + Expect(result).ToNot(HaveKey("123")) + Expect(result).To(HaveLen(1)) + }) + + It("should drop entries with bool keys", func() { + input := map[any]any{ + "valid": "keep", + true: "drop", + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKeyWithValue("valid", "keep")) + Expect(result).To(HaveLen(1)) + }) + + It("should drop entries with struct keys", func() { + type customKey struct{ id int } + input := map[any]any{ + "valid": "keep", + customKey{id: 1}: "drop", + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKeyWithValue("valid", "keep")) + 
Expect(result).To(HaveLen(1)) + }) + }) + + Context("when handling empty inputs", func() { + It("should handle empty map", func() { + input := map[any]any{} + + result := toOutputRecord(input) + + Expect(result).To(BeEmpty()) + }) + + It("should handle nil values in map", func() { + input := map[any]any{ + "key": nil, + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKey("key")) + Expect(result["key"]).To(BeNil()) + }) + }) + + Context("when handling complex mixed structures", func() { + It("should handle Kubernetes-like metadata structure", func() { + input := map[any]any{ + "kubernetes": map[any]any{ + "namespace_name": []byte("default"), + "pod_name": []byte("test-pod-123"), + "labels": map[any]any{ + "app": "my-app", + "version": "v1.0", + }, + "annotations": []any{ + map[any]any{"key": "annotation1"}, + }, + }, + "log": []byte("Application started"), + "level": "info", + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKeyWithValue("log", "Application started")) + Expect(result).To(HaveKeyWithValue("level", "info")) + + k8s, ok := result["kubernetes"].(map[string]any) + Expect(ok).To(BeTrue()) + Expect(k8s).To(HaveKeyWithValue("namespace_name", "default")) + Expect(k8s).To(HaveKeyWithValue("pod_name", "test-pod-123")) + + labels, ok := k8s["labels"].(map[string]any) + Expect(ok).To(BeTrue()) + Expect(labels).To(HaveKeyWithValue("app", "my-app")) + }) + + It("should handle fluent-bit typical record structure", func() { + input := map[any]any{ + "log": []byte("2024-11-28T10:00:00Z INFO Sample log message"), + "stream": "stdout", + "time": "2024-11-28T10:00:00.123456789Z", + "kubernetes": map[any]any{ + "pod_name": []byte("app-deployment-abc123-xyz"), + "namespace_name": []byte("production"), + "container_name": []byte("main"), + "host": []byte("node-01"), + }, + } + + result := toOutputRecord(input) + + Expect(result).To(HaveKeyWithValue("log", "2024-11-28T10:00:00Z INFO Sample log message")) + 
Expect(result).To(HaveKeyWithValue("stream", "stdout")) + + k8s, ok := result["kubernetes"].(map[string]any) + Expect(ok).To(BeTrue()) + Expect(k8s).To(HaveKeyWithValue("pod_name", "app-deployment-abc123-xyz")) + Expect(k8s).To(HaveKeyWithValue("namespace_name", "production")) + }) + }) +}) + +var _ = Describe("toSlice", func() { + Context("when converting simple slices", func() { + It("should convert string slice", func() { + input := []any{"a", "b", "c"} + + result := toSlice(input) + + Expect(result).To(Equal([]any{"a", "b", "c"})) + }) + + It("should convert numeric slice", func() { + input := []any{1, 2, 3, 4, 5} + + result := toSlice(input) + + Expect(result).To(Equal([]any{1, 2, 3, 4, 5})) + }) + + It("should convert mixed type slice", func() { + input := []any{"text", 42, true, 3.14} + + result := toSlice(input) + + Expect(result).To(Equal([]any{"text", 42, true, 3.14})) + }) + }) + + Context("when converting byte arrays in slices", func() { + It("should convert []byte elements to strings", func() { + input := []any{ + []byte("first"), + []byte("second"), + []byte("third"), + } + + result := toSlice(input) + + Expect(result).To(Equal([]any{"first", "second", "third"})) + }) + + It("should convert mixed []byte and strings", func() { + input := []any{ + "regular string", + []byte("byte string"), + "another string", + } + + result := toSlice(input) + + Expect(result).To(Equal([]any{"regular string", "byte string", "another string"})) + }) + }) + + Context("when converting nested structures", func() { + It("should recursively convert nested slices", func() { + input := []any{ + []any{1, 2}, + []any{3, 4}, + } + + result := toSlice(input) + + Expect(result).To(HaveLen(2)) + nested1, ok := result[0].([]any) + Expect(ok).To(BeTrue()) + Expect(nested1).To(Equal([]any{1, 2})) + nested2, ok := result[1].([]any) + Expect(ok).To(BeTrue()) + Expect(nested2).To(Equal([]any{3, 4})) + }) + + It("should convert maps within slices", func() { + input := []any{ + 
map[any]any{"id": 1, "name": "first"}, + map[any]any{"id": 2, "name": "second"}, + } + + result := toSlice(input) + + Expect(result).To(HaveLen(2)) + obj1, ok := result[0].(map[string]any) + Expect(ok).To(BeTrue()) + Expect(obj1).To(HaveKeyWithValue("id", 1)) + Expect(obj1).To(HaveKeyWithValue("name", "first")) + }) + + It("should handle deeply nested slices", func() { + input := []any{ + []any{ + []any{1, 2}, + []any{3, 4}, + }, + } + + result := toSlice(input) + + Expect(result).To(HaveLen(1)) + level1, ok := result[0].([]any) + Expect(ok).To(BeTrue()) + Expect(level1).To(HaveLen(2)) + level2, ok := level1[0].([]any) + Expect(ok).To(BeTrue()) + Expect(level2).To(Equal([]any{1, 2})) + }) + + It("should convert []byte in nested maps within slices", func() { + input := []any{ + map[any]any{ + "data": []byte("binary"), + }, + } + + result := toSlice(input) + + obj, ok := result[0].(map[string]any) + Expect(ok).To(BeTrue()) + Expect(obj).To(HaveKeyWithValue("data", "binary")) + }) + }) + + Context("when handling empty slices", func() { + It("should return empty slice for empty input", func() { + input := []any{} + + result := toSlice(input) + + Expect(result).To(BeEmpty()) + }) + + It("should handle slice with nil elements", func() { + input := []any{nil, nil, nil} + + result := toSlice(input) + + Expect(result).To(Equal([]any{nil, nil, nil})) + }) + }) + + Context("when handling complex scenarios", func() { + It("should handle mixed nested structures", func() { + input := []any{ + map[any]any{ + "array": []any{1, 2, 3}, + "bytes": []byte("data"), + }, + []any{ + map[any]any{"nested": "value"}, + }, + "simple", + } + + result := toSlice(input) + + Expect(result).To(HaveLen(3)) + + // First element: map with array and bytes + obj1, ok := result[0].(map[string]any) + Expect(ok).To(BeTrue()) + arr, ok := obj1["array"].([]any) + Expect(ok).To(BeTrue()) + Expect(arr).To(Equal([]any{1, 2, 3})) + Expect(obj1).To(HaveKeyWithValue("bytes", "data")) + + // Second element: array 
with map + arr2, ok := result[1].([]any) + Expect(ok).To(BeTrue()) + nestedMap, ok := arr2[0].(map[string]any) + Expect(ok).To(BeTrue()) + Expect(nestedMap).To(HaveKeyWithValue("nested", "value")) + + // Third element: simple string + Expect(result[2]).To(Equal("simple")) + }) + }) +}) diff --git a/cmd/vali-curator/app/vali_curator.go b/cmd/vali-curator/app/vali_curator.go deleted file mode 100644 index 4cf50f37d..000000000 --- a/cmd/vali-curator/app/vali_curator.go +++ /dev/null @@ -1,64 +0,0 @@ -// SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Gardener contributors -// -// SPDX-License-Identifier: Apache-2.0 - -package app - -import ( - "flag" - "os" - - "github.com/go-kit/log" - "github.com/go-kit/log/level" - "github.com/weaveworks/common/logging" - - "github.com/gardener/logging/pkg/vali/curator/config" -) - -var ( - logger log.Logger -) - -// ParseConfiguration parses the Curator's inode and storage configurations. -func ParseConfiguration() (*config.CuratorConfig, log.Logger, error) { - curatorConfigPath := flag.String("config", "/etc/vali/curator.yaml", "A path to the curator's configuration file") - flag.Parse() - conf, err := config.ParseConfigurations(*curatorConfigPath) - if err != nil { - return nil, nil, err - } - - logger = newLogger(conf.LogLevel) - _ = level.Info(logger).Log("LogLevel", conf.LogLevel) - _ = level.Info(logger).Log("TriggerInterval", conf.TriggerInterval) - _ = level.Info(logger).Log("DiskPath", conf.DiskPath) - _ = level.Info(logger).Log("InodeConfig.MinFreePercentages", conf.InodeConfig.MinFreePercentages) - _ = level.Info(logger).Log("InodeConfig.TargetFreePercentages", conf.InodeConfig.TargetFreePercentages) - _ = level.Info(logger).Log("InodeConfig.PageSizeForDeletionPercentages", conf.InodeConfig.PageSizeForDeletionPercentages) - _ = level.Info(logger).Log("StorageConfig.MinFreePercentages", conf.StorageConfig.MinFreePercentages) - _ = level.Info(logger).Log("StorageConfig.TargetFreePercentages", 
conf.StorageConfig.TargetFreePercentages) - _ = level.Info(logger).Log("StorageConfig.PageSizeForDeletionPercentages", conf.StorageConfig.PageSizeForDeletionPercentages) - - return conf, logger, nil -} - -func newLogger(logLevelName string) log.Logger { - var logLevel logging.Level - switch logLevelName { - case "info": - fallthrough - case "debug": - fallthrough - case "warn": - fallthrough - case "error": - _ = logLevel.Set(logLevelName) - default: - _ = logLevel.Set("info") - } - - l := log.NewLogfmtLogger(log.NewSyncWriter(os.Stderr)) - l = level.NewFilter(l, logLevel.Gokit) - - return log.With(l, "caller", log.DefaultCaller, "ts", log.DefaultTimestampUTC) -} diff --git a/cmd/vali-curator/main.go b/cmd/vali-curator/main.go deleted file mode 100644 index 0fda4c5c6..000000000 --- a/cmd/vali-curator/main.go +++ /dev/null @@ -1,52 +0,0 @@ -// SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Gardener contributors -// -// SPDX-License-Identifier: Apache-2.0 - -package main - -import ( - "net/http" - _ "net/http/pprof" // #nosec: G108 - "os" - "os/signal" - "runtime" - "time" - - "github.com/go-kit/log/level" - "github.com/prometheus/client_golang/prometheus/promhttp" - - "github.com/gardener/logging/cmd/vali-curator/app" - "github.com/gardener/logging/pkg/vali/curator" -) - -func main() { - conf, logger, err := app.ParseConfiguration() - if err != nil { - _ = level.Error(logger).Log("msg", "error", err) - os.Exit(1) - } - - // metrics - go func() { - runtime.SetMutexProfileFraction(5) - runtime.SetBlockProfileRate(1) - mux := http.NewServeMux() - mux.Handle("/curator/metrics", promhttp.Handler()) - server := &http.Server{ - Addr: ":2718", - ReadHeaderTimeout: time.Second * 30, - Handler: mux, - } - if err := server.ListenAndServe(); err != nil { - _ = level.Error(logger).Log("Curator metric server error", err.Error()) - } - }() - - valiCurator := curator.NewCurator(*conf, logger) - c := make(chan os.Signal, 2) - signal.Notify(c, os.Interrupt) - go 
func() { valiCurator.Run() }() - sig := <-c - _ = level.Error(logger).Log("msg", "error", "Got %s signal. Aborting...", sig) - valiCurator.Stop() -} diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 000000000..e9f8eeaf1 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,214 @@ +# Architecture + +This document describes the architecture of the Gardener Fluent Bit OTLP Output Plugin. + +![Gardener OTLP Plugin Architecture](images/gardener-logging-otlp-plugin.png) + +## OTLP Clients + +The plugin supports multiple client implementations: + +- **OTLP gRPC Client**: High-performance binary protocol with bi-directional streaming +- **OTLP HTTP Client**: HTTP-based transport for environments where gRPC is not available +- **Stdout Client**: JSON output to stdout for debugging +- **Noop Client**: No-operation client for testing + +### OTLP gRPC Client + +High-performance OTLP over gRPC transport: +- Binary protocol (Protobuf) +- Bi-directional streaming +- Efficient compression +- Recommended for production + +**Use cases:** +- Production environments +- High-volume log shipping +- Low-latency requirements +- When backend supports gRPC + +### OTLP HTTP Client + +HTTP-based OTLP transport: +- Works through HTTP proxies +- Firewall-friendly +- HTTP/1.1 and HTTP/2 support +- JSON or Protobuf encoding + +**Use cases:** +- When gRPC is not available or blocked by firewalls +- HTTP proxy environments +- Debugging (easier to inspect with standard tools) +- When backend only supports HTTP + +### Stdout Client + +Debug client that writes logs to stdout: +- JSON formatted output +- Useful for local development +- No external dependencies + +### Noop Client + +No-operation client: +- Discards all logs +- Useful for testing and benchmarking +- Zero overhead + +## DQue Batch Processor + +The plugin uses a disk-based queue (dque) for persistent buffering: + +- **Persistent Storage**: Logs are stored on disk to survive process restarts +- **Batch 
Processing**: Logs are batched for efficient export +- **Backpressure Handling**: Queue prevents memory exhaustion under high load +- **Configurable Sync**: Optional fsync for data durability + +### How It Works + +1. **Enqueue**: Incoming logs are added to the disk-backed queue +2. **Batch Formation**: Logs are accumulated until batch size or timeout is reached +3. **Export**: Batches are exported to the OTLP backend +4. **Retry**: Failed exports are retried with exponential backoff +5. **Dequeue**: Successfully exported logs are removed from the queue + +### Performance Tuning + +- **Queue Size**: Controls maximum in-memory records before dropping +- **Batch Size**: Larger batches improve throughput but increase latency +- **Export Interval**: Shorter intervals reduce latency but increase overhead +- **Segment Size**: Affects disk I/O patterns and memory usage + +## Dynamic Routing Controller + +The controller watches Kubernetes cluster resources and manages client routing: + +- **Cluster State Monitoring**: Tracks Shoot cluster lifecycle states +- **Dynamic Client Creation**: Creates clients for new namespaces matching patterns +- **Client Cleanup**: Removes clients for deleted clusters after expiration +- **State-Based Routing**: Routes logs based on cluster state (Ready, Hibernating, etc.) + +### Routing Logic + +1. **Metadata Extraction**: Extract cluster information from log metadata (namespace, labels) +2. **Client Selection**: Determine appropriate client based on: + - Cluster namespace pattern (e.g., `shoot--*`) + - Cluster state (Ready, Hibernating, Hibernated, etc.) + - Configuration rules +3. **Dynamic Host Resolution**: Build endpoint URL from prefix, extracted value, and suffix +4. 
**Client Management**: Create, cache, and cleanup clients as needed + +### Cluster State Handling + +Different cluster states have different routing behaviors: + +- **Creation**: Logs can be sent to both Seed and Shoot (configurable) +- **Ready**: Typically only to Shoot cluster +- **Hibernating/Hibernated**: Typically only to Seed cluster (Shoot unavailable) +- **Deletion**: Logs sent to both for troubleshooting +- **Restore/Migration**: Logs sent to both for observability during transitions + +## Component Interaction + +``` +┌─────────────────┐ +│ Fluent Bit │ +│ (Input) │ +└────────┬────────┘ + │ + │ Log Records + ▼ +┌─────────────────────────────────────────────────────┐ +│ Gardener Output Plugin │ +│ │ +│ ┌──────────────────────────────────────────────┐ │ +│ │ Record Converter │ │ +│ │ - Extract Kubernetes metadata │ │ +│ │ - Convert to OTLP log records │ │ +│ └──────────────┬───────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────┐ │ +│ │ Dynamic Routing Controller │ │ +│ │ - Determine target client │ │ +│ │ - Create clients on-demand │ │ +│ └──────────────┬───────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│
┌──────────────────────────────────────────────┐ │ +│ │ Client (OTLP gRPC/HTTP/stdout/noop) │ │ +│ │ - DQue batch processor │ │ +│ │ - Persistent buffering │ │ +│ │ - Retry logic │ │ +│ │ - Rate limiting │ │ +│ └──────────────┬───────────────────────────────┘ │ +└─────────────────┼───────────────────────────────────┘ + │ + │ OTLP Protocol + ▼ + ┌────────────────┐ + │ Backend │ + │ (VictoriaLogs,│ + │ Loki, etc.) │ + └────────────────┘ +``` + +## Metrics & Observability + +The plugin exposes Prometheus metrics on port 2021: + +- **Queue Metrics**: `dque_queue_size`, `dque_enqueued_total`, `dque_dequeued_total` +- **Export Metrics**: `dque_export_duration_seconds`, `dque_export_errors_total` +- **Client Metrics**: OTLP SDK metrics for export operations +- **Health Checks**: `/healthz` endpoint for probes +- **Profiling**: Optional pprof endpoints on `/debug/pprof/` + +### Metrics Endpoints + +- `GET /metrics` - Prometheus metrics +- `GET /healthz` - Health check (returns 200 OK when healthy) +- `GET /debug/pprof/` - Profiling endpoints (when enabled) + +## Security + +### TLS/mTLS + +The plugin supports full TLS configuration: + +- **Client Certificates**: mTLS authentication with client certificates +- **CA Verification**: Server certificate validation with custom CA +- **SNI**: Server Name Indication support +- **Version Control**: Configurable minimum and maximum TLS versions + +### Authentication + +- **Header-based**: Custom headers for bearer tokens or API keys +- **Certificate-based**: mTLS with client certificates +- **No
authentication**: For testing or internal networks + +## Performance Considerations + +### Memory Usage + +- Queue size controls in-memory buffer +- Batch size affects memory per export operation +- Multiple clients multiply memory requirements + +### Disk Usage + +- Queue segments stored on disk +- Segment size affects file count and I/O patterns +- Cleanup happens after successful export + +### Network + +- Compression reduces bandwidth (at CPU cost) +- Batch size affects request frequency +- Rate limiting prevents backend overload + +### CPU + +- Compression increases CPU usage +- Batch processing reduces overhead +- Multiple clients increase context switching + diff --git a/docs/configuration.md b/docs/configuration.md new file mode 100644 index 000000000..3e045e733 --- /dev/null +++ b/docs/configuration.md @@ -0,0 +1,308 @@ +# Configuration Guide + +This document provides comprehensive configuration options for the Gardener Fluent Bit OTLP Output Plugin. + +## Configuration Options + +### OTLP Configuration + +| Key | Description | Default | Type | +|-----|-------------|---------|------| +| `Endpoint` | OTLP endpoint URL (with or without scheme) | `localhost:4317` | string | +| `Insecure` | Use insecure connection (skip TLS) | `false` | bool | +| `Compression` | Compression algorithm (0=none, 1=gzip) | `0` | int | +| `Timeout` | Request timeout duration | `30s` | duration | +| `Headers` | Custom HTTP headers (format: `key1 value1,key2 value2`) | `{}` | map[string]string | + +### Batch Processor Configuration + +| Key | Description | Default | Type | +|-----|-------------|---------|------| +| `DQueBatchProcessorMaxQueueSize` | Maximum records in queue before dropping | `512` | int | +| `DQueBatchProcessorMaxBatchSize` | Maximum records per export batch | `256` | int | +| `DQueBatchProcessorExportTimeout` | Timeout for single export operation | `30s` | duration | +| `DQueBatchProcessorExportInterval` | Flush interval | `1s` | duration | +| 
`DQueBatchProcessorExportBufferSize` | Export buffer size | `10` | int | + +### DQue (Disk Queue) Configuration + +| Key | Description | Default | Type | +|-----|-------------|---------|------| +| `DQueDir` | Directory path for queue storage | `/tmp/flb-storage` | string | +| `DQueName` | Queue name (subdirectory under DQueDir) | `dque` | string | +| `DQueSegmentSize` | Number of entries per segment file | `500` | int | +| `DQueSync` | Sync writes to disk (true/false) | `false` | bool | + +### Retry Configuration + +| Key | Description | Default | Type | +|-----|-------------|---------|------| +| `RetryEnabled` | Enable retry logic | `true` | bool | +| `RetryInitialInterval` | Initial retry wait time | `5s` | duration | +| `RetryMaxInterval` | Maximum retry wait time | `30s` | duration | +| `RetryMaxElapsedTime` | Total time to retry before giving up | `1m` | duration | + +### Throttle Configuration + +| Key | Description | Default | Type | +|-----|-------------|---------|------| +| `ThrottleEnabled` | Enable rate limiting | `false` | bool | +| `ThrottleRequestsPerSec` | Maximum requests per second (0=unlimited) | `0` | int | + +### TLS Configuration + +| Key | Description | Default | Type | +|-----|-------------|---------|------| +| `TLSCertFile` | Path to client certificate | `""` | string | +| `TLSKeyFile` | Path to client private key | `""` | string | +| `TLSCAFile` | Path to CA certificate for server verification | `""` | string | +| `TLSServerName` | Server name for SNI | `""` | string | +| `TLSInsecureSkipVerify` | Skip certificate verification (insecure) | `false` | bool | +| `TLSMinVersion` | Minimum TLS version (1.0, 1.1, 1.2, 1.3) | `1.2` | string | +| `TLSMaxVersion` | Maximum TLS version | `""` (Go default) | string | + +### Plugin Configuration + +| Key | Description | Default | Type | +|-----|-------------|---------|------| +| `SeedType` | Client type for Seed clusters (`OTLPGRPC`/`OTLPHTTP`/`stdout`/`noop`) | `""` | string | +| `ShootType` | Client 
type for Shoot clusters (`OTLPGRPC`/`OTLPHTTP`/`stdout`/`noop`) | `""` | string | +| `LogLevel` | Plugin log level (debug, info, warn, error) | `info` | string | +| `Pprof` | Enable pprof profiling endpoints | `false` | bool | +| `HostnameValue` | Custom hostname to include in logs | OS hostname | string | +| `Origin` | Origin label for logs (seed/shoot identification) | `""` | string | + +### Kubernetes Metadata Extraction + +| Key | Description | Default | Type | +|-----|-------------|---------|------| +| `FallbackToTagWhenMetadataIsMissing` | Extract metadata from tag if missing | `false` | bool | +| `DropLogEntryWithoutK8sMetadata` | Drop logs without Kubernetes metadata | `false` | bool | +| `TagKey` | Record key containing the tag | `tag` | string | +| `TagPrefix` | Tag prefix (metadata not searched here) | `kubernetes\\.var\\.log\\.containers` | string | +| `TagExpression` | Regex to extract pod/namespace/container from tag | `\\.([^_]+)_([^_]+)_(.+)-([a-z0-9]{64})\\.log$` | string | + +### Dynamic Routing Configuration + +| Key | Description | Default | Type | +|-----|-------------|---------|------| +| `DynamicHostPath` | JSONPath to extract dynamic host from log metadata | `""` | string | +| `DynamicHostPrefix` | Prefix for dynamic host URL | `""` | string | +| `DynamicHostSuffix` | Suffix for dynamic host URL | `""` | string | +| `DynamicHostRegex` | Regex to validate dynamic host | `*` | string | +| `ControllerSyncTimeout` | Time to wait for cluster object sync | `60s` | duration | +| `DeletedClientTimeExpiration` | Expiration time for deleted cluster clients | `1h` | duration | + +### Cluster State-Based Routing + +Control where logs are sent based on Shoot cluster state: + +| Cluster State | Send to Seed | Send to Shoot (Dynamic) | +|---------------|--------------|-------------------------| +| Creation | `SendLogsToDefaultClientWhenClusterIsInCreationState` (true) | `SendLogsToMainClusterWhenIsInCreationState` (true) | +| Ready | 
`SendLogsToDefaultClientWhenClusterIsInReadyState` (false) | `SendLogsToMainClusterWhenIsInReadyState` (true) | +| Hibernating | `SendLogsToDefaultClientWhenClusterIsInHibernatingState` (false) | `SendLogsToMainClusterWhenIsInHibernatingState` (false) | +| Hibernated | `SendLogsToDefaultClientWhenClusterIsInHibernatedState` (false) | `SendLogsToMainClusterWhenIsInHibernatedState` (false) | +| Deletion | `SendLogsToDefaultClientWhenClusterIsInDeletionState` (true) | `SendLogsToMainClusterWhenIsInDeletionState` (true) | +| Restore | `SendLogsToDefaultClientWhenClusterIsInRestoreState` (true) | `SendLogsToMainClusterWhenIsInRestoreState` (true) | +| Migration | `SendLogsToDefaultClientWhenClusterIsInMigrationState` (true) | `SendLogsToMainClusterWhenIsInMigrationState` (true) | + +## Configuration Examples + +### Basic OTLP gRPC Configuration + +Send logs to a VictoriaLogs backend using OTLP over gRPC: + +```ini +[Output] + Name gardener + Match kubernetes.* + + # Client type selection + SeedType OTLPGRPC + ShootType OTLPGRPC + + # OTLP endpoint + Endpoint victorialogs.logging.svc.cluster.local:4317 + Insecure false + Compression 1 + Timeout 30s + + # Batch processing + DQueBatchProcessorMaxQueueSize 512 + DQueBatchProcessorMaxBatchSize 256 + DQueBatchProcessorExportTimeout 30s + DQueBatchProcessorExportInterval 1s + + # Disk queue configuration + DQueDir /fluent-bit/buffers/otlp + DQueName gardener-otlp + DQueSegmentSize 500 + DQueSync false + + # Retry configuration + RetryEnabled true + RetryInitialInterval 5s + RetryMaxInterval 30s + RetryMaxElapsedTime 1m + + # TLS configuration + TLSCertFile /etc/ssl/certs/client.crt + TLSKeyFile /etc/ssl/private/client.key + TLSCAFile /etc/ssl/certs/ca.crt + TLSMinVersion 1.2 + + # Plugin settings + LogLevel info + HostnameValue seed-cluster-1 + Origin seed +``` + +### OTLP HTTP Configuration with Custom Headers + +Send logs using OTLP over HTTP with authentication headers: + +```ini +[Output] + Name gardener + Match 
kubernetes.* + + # Use HTTP client + SeedType OTLPHTTP + + # OTLP HTTP endpoint + Endpoint https://victorialogs.example.com/insert/opentelemetry/v1/logs + Insecure false + Timeout 30s + + # Custom headers for authentication + Headers Authorization Bearer YOUR_TOKEN,X-Scope-OrgID tenant-1 + + # Batch processing + DQueBatchProcessorMaxBatchSize 256 + DQueBatchProcessorExportInterval 5s + + # Disk queue + DQueDir /fluent-bit/buffers + DQueName otlp-http + + # Rate limiting + ThrottleEnabled true + ThrottleRequestsPerSec 100 +``` + +### Dynamic Multi-Cluster Routing + +Route logs to different backends based on Shoot cluster namespaces: + +```ini +[Output] + Name gardener + Match kubernetes.* + + # Default Seed cluster client + SeedType OTLPGRPC + Endpoint victorialogs-seed.logging.svc:4317 + + # Dynamic Shoot cluster routing + ShootType OTLPGRPC + DynamicHostPath {"kubernetes": {"namespace_name": "namespace"}} + DynamicHostPrefix victorialogs. + DynamicHostSuffix .svc.cluster.local:4317 + DynamicHostRegex ^shoot-- + + # Cluster state-based routing + SendLogsToMainClusterWhenIsInReadyState true + SendLogsToMainClusterWhenIsInHibernatingState false + SendLogsToMainClusterWhenIsInHibernatedState false + SendLogsToDefaultClientWhenClusterIsInReadyState false + SendLogsToDefaultClientWhenClusterIsInHibernatingState true + + # Kubernetes metadata extraction + FallbackToTagWhenMetadataIsMissing true + DropLogEntryWithoutK8sMetadata true + TagKey tag + TagPrefix kubernetes\\.var\\.log\\.containers + TagExpression \\.([^_]+)_([^_]+)_(.+)-([a-z0-9]{64})\\.log$ + + # Buffer configuration + DQueDir /fluent-bit/buffers/shoot + DQueName gardener-shoot + DQueSegmentSize 500 + + # Controller settings + ControllerSyncTimeout 60s + DeletedClientTimeExpiration 1h +``` + +### Development/Debug Configuration + +Use stdout client for local development: + +```ini +[Output] + Name gardener + Match * + + # Use stdout for debugging + SeedType stdout + ShootType stdout + + LogLevel debug + Pprof 
true +``` + +### Production Configuration with mTLS + +Full production setup with mutual TLS authentication: + +```ini +[Output] + Name gardener + Match kubernetes.* + + # Production settings + SeedType OTLPGRPC + Endpoint victorialogs-prod.logging.svc:4317 + LogLevel info + + # mTLS configuration + Insecure false + TLSCertFile /etc/ssl/fluent-bit/tls.crt + TLSKeyFile /etc/ssl/fluent-bit/tls.key + TLSCAFile /etc/ssl/fluent-bit/ca.crt + TLSServerName victorialogs-prod.logging.svc + TLSInsecureSkipVerify false + TLSMinVersion 1.3 + + # Optimized batch processing + DQueBatchProcessorMaxQueueSize 1024 + DQueBatchProcessorMaxBatchSize 512 + DQueBatchProcessorExportTimeout 30s + DQueBatchProcessorExportInterval 2s + DQueBatchProcessorExportBufferSize 10 + + # Persistent buffering + DQueDir /var/fluent-bit/buffers + DQueName gardener-prod + DQueSegmentSize 1000 + DQueSync true + + # Retry with exponential backoff + RetryEnabled true + RetryInitialInterval 10s + RetryMaxInterval 5m + RetryMaxElapsedTime 10m + + # Rate limiting + ThrottleEnabled true + ThrottleRequestsPerSec 500 + + # Compression + Compression 1 + + # Monitoring + HostnameValue ${HOSTNAME} + Origin seed +``` + diff --git a/docs/images/gardener-logging-otlp-plugin.png b/docs/images/gardener-logging-otlp-plugin.png new file mode 100644 index 000000000..966c8f3a6 Binary files /dev/null and b/docs/images/gardener-logging-otlp-plugin.png differ diff --git a/docs/monitoring.md b/docs/monitoring.md new file mode 100644 index 000000000..4f6c3c819 --- /dev/null +++ b/docs/monitoring.md @@ -0,0 +1,490 @@ +# Monitoring and Metrics + +This document describes the metrics, health checks, and observability features of the Gardener Fluent Bit OTLP Output Plugin. 
+ +## Metrics Endpoints + +The plugin exposes multiple HTTP endpoints for monitoring: + +| Endpoint | Port | Description | +|----------|------|-------------| +| `/metrics` | 2021 | Prometheus metrics | +| `/healthz` | 2021 | Health check endpoint | +| `/debug/pprof/*` | 2021 | Profiling endpoints (when enabled) | + +## Prometheus Metrics + +### Plugin Metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `flb_gardener_errors_total` | Counter | `error_type` | Total errors by type | +| `flb_gardener_records_total` | Counter | `client_type` | Total records processed | +| `flb_gardener_clients_total` | Gauge | - | Total active clients | + +**Example:** +```promql +# Error rate by type +rate(flb_gardener_errors_total[5m]) + +# Records per second +rate(flb_gardener_records_total[1m]) + +# Active clients +flb_gardener_clients_total +``` + +### DQue (Queue) Metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `dque_queue_size` | Gauge | `endpoint` | Current queue size (records) | +| `dque_enqueued_total` | Counter | `endpoint` | Total records enqueued | +| `dque_dequeued_total` | Counter | `endpoint` | Total records dequeued | +| `dque_dropped_total` | Counter | `endpoint` | Total records dropped (queue full) | +| `dque_export_duration_seconds` | Histogram | `endpoint` | Export operation duration | +| `dque_export_errors_total` | Counter | `endpoint`, `error_type` | Total export errors | + +**Example Queries:** + +```promql +# Queue depth +dque_queue_size + +# Enqueue rate +rate(dque_enqueued_total[5m]) + +# Drop rate (should be 0) +rate(dque_dropped_total[5m]) + +# Export latency (95th percentile) +histogram_quantile(0.95, rate(dque_export_duration_seconds_bucket[5m])) + +# Export error rate +rate(dque_export_errors_total[5m]) +``` + +### OTLP Client Metrics + +The OTLP SDK automatically exports additional metrics: + +| Metric | Description | +|--------|-------------| +| 
`otelcol_exporter_sent_log_records` | Number of log records successfully sent | +| `otelcol_exporter_send_failed_log_records` | Number of log records failed to send | +| `otelcol_exporter_queue_size` | Current queue size | +| `otelcol_exporter_queue_capacity` | Maximum queue capacity | + +## Health Checks + +### Health Endpoint + +```bash +curl http://localhost:2021/healthz +``` + +**Responses:** + +- `200 OK`: Plugin is healthy +- `503 Service Unavailable`: Plugin is unhealthy + +### Health Check Logic + +The plugin is considered healthy when: +- All required services are running +- No critical errors in recent history +- At least one client is operational + +## Alerting Rules + +### Recommended Prometheus Alerts + +```yaml +groups: +- name: fluent-bit-gardener-plugin + interval: 30s + rules: + + # Queue growing continuously + - alert: FluentBitQueueGrowing + expr: | + delta(dque_queue_size[5m]) > 100 + for: 10m + labels: + severity: warning + annotations: + summary: "Fluent Bit queue growing on {{ $labels.instance }}" + description: "Queue size increased by {{ $value }} in 5 minutes. Backend may be slow or unavailable." + + # Queue near capacity + - alert: FluentBitQueueNearCapacity + expr: | + dque_queue_size > 400 + for: 5m + labels: + severity: warning + annotations: + summary: "Fluent Bit queue near capacity on {{ $labels.instance }}" + description: "Queue size is {{ $value }} (max 512). Logs may be dropped soon." + + # Logs being dropped + - alert: FluentBitLogsDropped + expr: | + rate(dque_dropped_total[5m]) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Fluent Bit dropping logs on {{ $labels.instance }}" + description: "{{ $value }} logs/sec are being dropped. Queue is full."
+ + # High export error rate + - alert: FluentBitHighErrorRate + expr: | + rate(dque_export_errors_total[5m]) > 0.1 + for: 5m + labels: + severity: warning + annotations: + summary: "High export error rate on {{ $labels.instance }}" + description: "{{ $value }} export errors/sec. Check backend connectivity." + + # High export latency + - alert: FluentBitHighExportLatency + expr: | + histogram_quantile(0.95, + rate(dque_export_duration_seconds_bucket[5m]) + ) > 10 + for: 5m + labels: + severity: warning + annotations: + summary: "High export latency on {{ $labels.instance }}" + description: "95th percentile export latency is {{ $value }}s. Backend may be slow." + + # Plugin not healthy + - alert: FluentBitPluginUnhealthy + expr: | + up{job="fluent-bit"} == 0 + or + probe_success{job="fluent-bit-healthz"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Fluent Bit plugin unhealthy on {{ $labels.instance }}" + description: "Plugin is not responding to health checks." + + # No logs being sent + - alert: FluentBitNoLogsSent + expr: | + rate(dque_dequeued_total[5m]) == 0 + for: 10m + labels: + severity: warning + annotations: + summary: "Fluent Bit not sending logs on {{ $labels.instance }}" + description: "No logs exported in 10 minutes. Check configuration and backend." 
+``` + +## Grafana Dashboards + +### Overview Dashboard + +Example dashboard JSON snippet: + +```json +{ + "dashboard": { + "title": "Fluent Bit Gardener Plugin", + "panels": [ + { + "title": "Queue Size", + "targets": [ + { + "expr": "dque_queue_size", + "legendFormat": "{{ endpoint }}" + } + ] + }, + { + "title": "Enqueue Rate", + "targets": [ + { + "expr": "rate(dque_enqueued_total[1m])", + "legendFormat": "{{ endpoint }}" + } + ] + }, + { + "title": "Export Latency (p95)", + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(dque_export_duration_seconds_bucket[5m]))", + "legendFormat": "{{ endpoint }}" + } + ] + }, + { + "title": "Error Rate", + "targets": [ + { + "expr": "rate(dque_export_errors_total[5m])", + "legendFormat": "{{ endpoint }} - {{ error_type }}" + } + ] + } + ] + } +} +``` + +### Key Metrics to Monitor + +1. **Throughput**: + - Enqueue rate: `rate(dque_enqueued_total[1m])` + - Dequeue rate: `rate(dque_dequeued_total[1m])` + - Records processed: `rate(flb_gardener_records_total[1m])` + +2. **Latency**: + - Export duration p50: `histogram_quantile(0.5, rate(dque_export_duration_seconds_bucket[5m]))` + - Export duration p95: `histogram_quantile(0.95, rate(dque_export_duration_seconds_bucket[5m]))` + - Export duration p99: `histogram_quantile(0.99, rate(dque_export_duration_seconds_bucket[5m]))` + +3. **Errors**: + - Error rate: `rate(flb_gardener_errors_total[5m])` + - Export errors: `rate(dque_export_errors_total[5m])` + - Drop rate: `rate(dque_dropped_total[5m])` + +4. 
**Queue Health**: + - Queue size: `dque_queue_size` + - Queue utilization: `dque_queue_size / 512 * 100` + - Queue growth: `delta(dque_queue_size[5m])` + +## Profiling (Debug) + +### Enabling Profiling + +Enable pprof in configuration: + +```ini +[Output] + Name gardener + Match * + Pprof true +``` + +### Available Profiles + +| Endpoint | Description | +|----------|-------------| +| `/debug/pprof/` | Index of available profiles | +| `/debug/pprof/goroutine` | Stack traces of all goroutines | +| `/debug/pprof/heap` | Heap memory allocations | +| `/debug/pprof/allocs` | All past memory allocations | +| `/debug/pprof/threadcreate` | Stack traces of thread creation | +| `/debug/pprof/block` | Stack traces of blocking operations | +| `/debug/pprof/mutex` | Stack traces of mutex contention | +| `/debug/pprof/profile` | CPU profile (30s sample) | +| `/debug/pprof/trace` | Execution trace (1s sample) | + +### Using Profiles + +#### CPU Profile + +```bash +# Collect 30-second CPU profile +go tool pprof http://localhost:2021/debug/pprof/profile + +# In pprof interactive mode: +(pprof) top10 +(pprof) list functionName +(pprof) web +``` + +#### Heap Profile + +```bash +# Analyze heap usage +go tool pprof http://localhost:2021/debug/pprof/heap + +# Show top memory consumers +(pprof) top10 + +# Show allocations by function +(pprof) list functionName +``` + +#### Goroutine Profile + +```bash +# Check goroutine count +curl http://localhost:2021/debug/pprof/goroutine?debug=1 + +# Analyze with pprof +go tool pprof http://localhost:2021/debug/pprof/goroutine +``` + +#### Trace Analysis + +```bash +# Collect execution trace +curl http://localhost:2021/debug/pprof/trace?seconds=5 > trace.out + +# Analyze trace +go tool trace trace.out +``` + +## Log Analysis + +### Plugin Logs + +The plugin logs to Fluent Bit's output. 
Key log messages: + +**Startup:** +``` +[info] Starting fluent-bit-gardener-output-plugin version=v1.0.0 revision=abc123 +[info] [flb-go] output plugin initialized id=xyz count=1 +``` + +**Normal Operation:** +``` +[debug] [flb-go] sending batch records=256 endpoint=victorialogs:4317 +[debug] [flb-go] batch exported duration=123ms +``` + +**Errors:** +``` +[error] [flb-go] failed to export batch error="connection refused" +[error] [flb-go] queue full, dropping records count=10 +``` + +### Log Levels + +Set appropriate log level: + +```ini +# Production +LogLevel info + +# Troubleshooting +LogLevel debug + +# Silent (errors only) +LogLevel error +``` + +## Monitoring Best Practices + +### What to Monitor + +1. **Core Metrics**: + - Queue size (should be stable and low) + - Export latency (should be consistent) + - Error rate (should be near zero) + - Drop rate (should be zero) + +2. **Resource Usage**: + - Memory usage (pod/container metrics) + - CPU usage (should be consistent) + - Disk I/O (queue storage) + - Network bandwidth + +3. 
**Client Health**: + - Active client count + - Per-client queue sizes + - Per-client error rates + +### Alert Thresholds + +Recommended thresholds: + +```yaml +# Warning thresholds +queue_size_warning: 400 # 80% of default max (512) +export_latency_warning: 5s +error_rate_warning: 0.01 # 1% + +# Critical thresholds +logs_dropped: "> 0" # Any drops +export_latency_critical: 10s +error_rate_critical: 0.1 # 10% +health_check_failure: true +``` + +### SLOs (Service Level Objectives) + +Example SLOs: + +```yaml +# Availability +- metric: up + target: 99.9% + window: 30d + +# Error rate +- metric: rate(dque_export_errors_total[5m]) / rate(dque_enqueued_total[5m]) + target: < 0.1% # 99.9% success rate + window: 30d + +# Latency +- metric: histogram_quantile(0.95, rate(dque_export_duration_seconds_bucket[5m])) + target: < 5s + window: 30d + +# No data loss +- metric: rate(dque_dropped_total[5m]) + target: 0 + window: 30d +``` + +## Troubleshooting with Metrics + +### Queue Growing + +```promql +# Check queue growth +delta(dque_queue_size[5m]) > 100 + +# Check export vs enqueue rate +rate(dque_enqueued_total[1m]) - rate(dque_dequeued_total[1m]) +``` + +**Actions:** +- Increase export rate +- Check backend performance +- Enable compression +- Increase batch size + +### High Latency + +```promql +# Check p95 latency +histogram_quantile(0.95, rate(dque_export_duration_seconds_bucket[5m])) > 5 +``` + +**Actions:** +- Check network latency to backend +- Review backend performance +- Reduce batch size +- Check for throttling + +### Memory Issues + +```promql +# Check memory usage (from container metrics) +container_memory_usage_bytes{pod=~"fluent-bit.*"} / container_spec_memory_limit_bytes{pod=~"fluent-bit.*"} > 0.8 +``` + +**Actions:** +- Reduce queue size +- Reduce batch size +- Check for goroutine leaks (pprof) + +## Additional Resources + +- [Prometheus Documentation](https://prometheus.io/docs/) +- [Grafana Dashboards](https://grafana.com/docs/grafana/latest/dashboards/) +-
[Go pprof Guide](https://go.dev/blog/pprof) +- [Troubleshooting Guide](troubleshooting.md) + diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md new file mode 100644 index 000000000..d48a49c52 --- /dev/null +++ b/docs/troubleshooting.md @@ -0,0 +1,428 @@ +# Troubleshooting Guide + +This document provides solutions to common issues when using the Gardener Fluent Bit OTLP Output Plugin. + +## Common Issues + +### Logs Not Being Sent + +#### Symptoms +- Logs are not appearing in the backend +- Queue size keeps growing +- No export errors in metrics + +#### Troubleshooting Steps + +1. **Check client type configuration**: + ```ini + SeedType OTLPGRPC # or OTLPHTTP, stdout + ShootType OTLPGRPC + ``` + + Verify the client type is set correctly. If empty, the plugin won't send logs. + +2. **Verify endpoint connectivity**: + ```bash + # For gRPC + grpcurl -plaintext victorialogs.logging.svc:4317 list + + # For HTTP + curl -v https://victorialogs.logging.svc/insert/opentelemetry/v1/logs + ``` + +3. **Check TLS configuration**: + - Ensure certificate paths are correct + - Verify certificate validity: `openssl x509 -in /path/to/cert.crt -text -noout` + - Check server name matches certificate CN/SAN + - Verify CA certificate is correct + +4. **Review plugin logs**: + ```bash + # Increase log level in config + LogLevel debug + + # Check Fluent Bit logs + kubectl logs -n logging daemonset/fluent-bit + ``` + +5. **Check backend availability**: + ```bash + # Test network connectivity + nc -zv victorialogs.logging.svc 4317 + + # Check DNS resolution + nslookup victorialogs.logging.svc + ``` + +### Queue Growing Continuously + +#### Symptoms +- `dque_queue_size` metric keeps increasing +- Disk usage growing +- Logs being dropped (if queue full) + +#### Root Causes and Solutions + +1. **Backend performance issues**: + - Backend may be too slow or unavailable + - Check backend metrics and logs + - Scale backend if needed + +2. 
**Batch size too small**: + ```ini + # Increase batch size + DQueBatchProcessorMaxBatchSize 512 + ``` + +3. **Export interval too high**: + ```ini + # Decrease export interval + DQueBatchProcessorExportInterval 500ms + ``` + +4. **Network latency**: + - Check network connectivity to backend + - Enable compression to reduce bandwidth: + ```ini + Compression 1 + ``` + +5. **Backend throttling**: + - Check for rate limiting errors + - Adjust throttle configuration: + ```ini + ThrottleEnabled true + ThrottleRequestsPerSec 100 + ``` + +### High Memory Usage + +#### Symptoms +- Pod OOMKilled +- High memory metrics +- System slowdown + +#### Solutions + +1. **Queue size too large**: + ```ini + # Reduce in-memory queue + DQueBatchProcessorMaxQueueSize 256 + ``` + +2. **Batch size too large**: + ```ini + # Reduce batch size + DQueBatchProcessorMaxBatchSize 128 + ``` + +3. **Too many clients**: + - Check for client leaks with dynamic routing + - Review client cleanup: + ```ini + DeletedClientTimeExpiration 30m + ``` + +4. **Memory leak**: + - Enable profiling and analyze heap: + ```bash + go tool pprof http://localhost:2021/debug/pprof/heap + ``` + +### TLS/mTLS Errors + +#### Symptoms +- "certificate verify failed" errors +- "tls: bad certificate" errors +- Connection refused + +#### Solutions + +1. **Certificate not found**: + ```bash + # Check file exists and is readable + ls -la /etc/ssl/fluent-bit/tls.crt + + # Check permissions + kubectl exec -it fluent-bit-xxx -- ls -la /etc/ssl/fluent-bit/ + ``` + +2. **Certificate expired**: + ```bash + # Check certificate validity + openssl x509 -in /path/to/cert.crt -noout -dates + ``` + +3. **CA certificate mismatch**: + ```bash + # Verify certificate chain + openssl verify -CAfile /path/to/ca.crt /path/to/cert.crt + ``` + +4. **Server name mismatch**: + ```ini + # Set correct server name for SNI + TLSServerName victorialogs.logging.svc + ``` + +5. 
**TLS version incompatibility**: + ```ini + # Adjust TLS version + TLSMinVersion 1.2 + TLSMaxVersion 1.3 + ``` + +### Dynamic Routing Not Working + +#### Symptoms +- Logs always go to default client +- Expected client not created +- "client not found" errors + +#### Solutions + +1. **Check JSONPath configuration**: + ```ini + # Verify path matches log structure + DynamicHostPath {"kubernetes": {"namespace_name": "namespace"}} + ``` + +2. **Verify regex pattern**: + ```ini + # Test regex against namespace names + DynamicHostRegex ^shoot-- + ``` + +3. **Check controller sync**: + ```ini + # Increase sync timeout + ControllerSyncTimeout 120s + ``` + +4. **Review cluster state routing**: + ```ini + # Ensure state-based routing is configured + SendLogsToMainClusterWhenIsInReadyState true + ``` + +5. **Check namespace metadata**: + ```ini + # Verify logs contain expected metadata + LogLevel debug + # Look for "extracted dynamic host" messages + ``` + +### Export Errors + +#### Symptoms +- `dque_export_errors_total` metric increasing +- "context deadline exceeded" errors +- "connection reset" errors + +#### Solutions + +1. **Timeout too short**: + ```ini + # Increase export timeout + DQueBatchProcessorExportTimeout 60s + Timeout 60s + ``` + +2. **Backend overloaded**: + - Reduce export rate: + ```ini + ThrottleEnabled true + ThrottleRequestsPerSec 50 + ``` + +3. **Network issues**: + - Check network connectivity + - Test with smaller batches: + ```ini + DQueBatchProcessorMaxBatchSize 128 + ``` + +4. **Backend errors**: + - Check backend logs for error details + - Verify backend configuration + +### Kubernetes Metadata Missing + +#### Symptoms +- Logs missing pod_name, namespace, container_name +- Logs dropped when `DropLogEntryWithoutK8sMetadata` is true + +#### Solutions + +1. **Enable fallback to tag**: + ```ini + FallbackToTagWhenMetadataIsMissing true + ``` + +2. 
**Check tag configuration**: + ```ini + TagKey tag + TagPrefix kubernetes\\.var\\.log\\.containers + TagExpression \\.([^_]+)_([^_]+)_(.+)-([a-z0-9]{64})\\.log$ + ``` + +3. **Verify Fluent Bit Kubernetes filter**: + ```ini + [Filter] + Name kubernetes + Match kubernetes.* + Kube_URL https://kubernetes.default.svc:443 + Kube_CA_File /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + Kube_Token_File /var/run/secrets/kubernetes.io/serviceaccount/token + ``` + +4. **Check service account permissions**: + ```bash + # Verify RBAC allows reading pods/namespaces + kubectl auth can-i get pods --as=system:serviceaccount:logging:fluent-bit + ``` + +## Debug Mode + +Enable comprehensive debugging: + +```ini +[Output] + Name gardener + Match * + LogLevel debug + Pprof true +``` + +### Accessing Debug Information + +1. **Metrics**: + ```bash + curl http://localhost:2021/metrics + ``` + +2. **Health Check**: + ```bash + curl http://localhost:2021/healthz + ``` + +3. **CPU Profile**: + ```bash + go tool pprof http://localhost:2021/debug/pprof/profile + ``` + +4. **Heap Profile**: + ```bash + go tool pprof http://localhost:2021/debug/pprof/heap + ``` + +5. **Goroutines**: + ```bash + curl http://localhost:2021/debug/pprof/goroutine?debug=2 + ``` + +## Performance Issues + +### High CPU Usage + +1. **Compression overhead**: + ```ini + # Disable compression if CPU-constrained + Compression 0 + ``` + +2. **Too many regex operations**: + - Simplify tag expressions + - Use Kubernetes filter instead of tag parsing + +3. **Too many clients**: + - Review dynamic routing configuration + - Reduce client count if possible + +### High Disk I/O + +1. **Frequent syncs**: + ```ini + # Disable fsync for better performance + DQueSync false + ``` + +2. **Small segments**: + ```ini + # Increase segment size + DQueSegmentSize 1000 + ``` + +3. 
**Disk queue location**: + - Use faster storage for queue directory + - Consider tmpfs for ephemeral environments: + ```ini + DQueDir /dev/shm/fluent-bit-buffers + ``` + +## Metrics Analysis + +### Key Metrics to Monitor + +```bash +# Queue health +curl -s http://localhost:2021/metrics | grep dque_queue_size + +# Export performance +curl -s http://localhost:2021/metrics | grep dque_export_duration_seconds + +# Error rate +curl -s http://localhost:2021/metrics | grep dque_export_errors_total + +# Drop rate +curl -s http://localhost:2021/metrics | grep dque_dropped_total +``` + +### Alert Recommendations + +1. **Queue Growing**: + - Alert: `dque_queue_size > 400` for 5 minutes + - Action: Investigate backend performance or increase export rate + +2. **High Error Rate**: + - Alert: `rate(dque_export_errors_total[5m]) > 0.1` + - Action: Check backend connectivity and logs + +3. **Logs Dropped**: + - Alert: `rate(dque_dropped_total[5m]) > 0` + - Action: Increase queue size or export rate + +4. **High Export Latency**: + - Alert: `histogram_quantile(0.95, sum by (le) (rate(dque_export_duration_seconds_bucket[5m]))) > 10` + - Action: Check network latency or backend performance + +## Getting Help + +If you've tried the solutions above and still have issues: + +1. **Collect debug information**: + ```bash + # Plugin logs + kubectl logs -n logging daemonset/fluent-bit --tail=1000 > fluent-bit.log + + # Metrics + curl http://localhost:2021/metrics > metrics.txt + + # Configuration + kubectl get configmap fluent-bit-config -o yaml > config.yaml + ``` + +2. **Check known issues**: + - GitHub Issues: [https://github.com/gardener/logging/issues](https://github.com/gardener/logging/issues) + +3. 
**Ask for help**: + - Create a GitHub issue with debug information + - Join Gardener Slack: [#gardener](https://kubernetes.slack.com/messages/gardener) + +## Additional Resources + +- [Configuration Guide](configuration.md) +- [Architecture](architecture.md) +- [Usage Guide](usage.md) +- [pkg/client/README.md](../pkg/client/README.md) + diff --git a/docs/usage.md b/docs/usage.md new file mode 100644 index 000000000..fcf1e3ec3 --- /dev/null +++ b/docs/usage.md @@ -0,0 +1,424 @@ +# Usage Guide + +This document provides instructions for building, installing, and using the Gardener Fluent Bit OTLP Output Plugin. + +## Building + +### Prerequisites + +- Go 1.23+ +- gcc (for cgo, required by Fluent Bit plugin interface) +- make + +### Build the Plugin + +```bash +make plugin +``` + +This builds the plugin as a shared library at `build/output_plugin.so`. + +### Build the Event Logger + +```bash +make event-logger +``` + +### Run Tests + +```bash +make test +``` + +### Build Docker Images + +```bash +make docker-images +``` + +## Installation + +### With Fluent Bit Binary + +If you have Fluent Bit installed in your `$PATH`: + +```bash +fluent-bit -e /path/to/build/output_plugin.so -c fluent-bit.conf +``` + +### Using plugins.conf + +You can also configure Fluent Bit to load the plugin automatically via `plugins.conf`: + +```ini +[PLUGINS] + Path /path/to/build/output_plugin.so +``` + +### Docker/Kubernetes + +Mount the plugin as a volume and configure Fluent Bit to load it: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluent-bit-config +data: + plugins.conf: | + [PLUGINS] + Path /fluent-bit/plugins/output_plugin.so + + fluent-bit.conf: | + [Service] + Flush 1 + Daemon off + Log_Level info + Parsers_File parsers.conf + Plugins_File plugins.conf + + [Input] + Name tail + Path /var/log/containers/*.log + Parser cri + Tag kubernetes.* + Refresh_Interval 5 + Mem_Buf_Limit 5MB + Skip_Long_Lines On + + [Output] + Name gardener + Match kubernetes.* + SeedType 
OTLPGRPC + Endpoint victorialogs.logging.svc:4317 + LogLevel info +``` + +## Basic Usage + +### Simple Configuration + +Minimal configuration to send logs to a VictoriaLogs backend: + +```ini +[Service] + Flush 1 + Daemon off + Log_Level info + HTTP_Server On + HTTP_Listen 0.0.0.0 + HTTP_Port 2020 + +[Input] + Name tail + Path /var/log/containers/*.log + Parser cri + Tag kubernetes.* + Refresh_Interval 5 + Mem_Buf_Limit 5MB + Skip_Long_Lines On + +[Output] + Name gardener + Match kubernetes.* + SeedType OTLPGRPC + Endpoint victorialogs.logging.svc:4317 + LogLevel info +``` + +### Multiple Plugin Instances + +You can run multiple plugin instances in the same Fluent Bit process to route logs to different backends: + +```ini +# Route Kubernetes logs to Seed cluster +[Output] + Name gardener + Match kubernetes.* + SeedType OTLPGRPC + Endpoint victorialogs-seed.logging.svc:4317 + DQueDir /fluent-bit/buffers/kubernetes + DQueName kubernetes-logs + +# Route systemd logs to different endpoint +[Output] + Name gardener + Match systemd.* + SeedType OTLPHTTP + Endpoint https://victorialogs-systemd.logging.svc/insert/opentelemetry/v1/logs + DQueDir /fluent-bit/buffers/systemd + DQueName systemd-logs +``` + +## Running the Plugin + +### Local Development + +```bash +# Build the plugin +make plugin + +# Run with Fluent Bit +fluent-bit -e ./build/output_plugin.so -c examples/fluent-bit.conf +``` + +### Debug Mode + +Enable debug logging and profiling: + +```ini +[Output] + Name gardener + Match * + SeedType stdout + LogLevel debug + Pprof true +``` + +Then access: +- Logs: stdout +- Metrics: `http://localhost:2021/metrics` +- Health: `http://localhost:2021/healthz` +- Profiling: `http://localhost:2021/debug/pprof/` + +### Production Deployment + +For production, use the Docker images provided by the project: + +```yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: fluent-bit + namespace: logging +spec: + selector: + matchLabels: + app: fluent-bit + template: + 
metadata: + labels: + app: fluent-bit + spec: + serviceAccountName: fluent-bit + containers: + - name: fluent-bit + image: europe-docker.pkg.dev/gardener-project/releases/fluent-bit-output:latest + resources: + limits: + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + volumeMounts: + - name: varlog + mountPath: /var/log + readOnly: true + - name: varlibdockercontainers + mountPath: /var/lib/docker/containers + readOnly: true + - name: fluent-bit-config + mountPath: /fluent-bit/etc/ + - name: buffer + mountPath: /fluent-bit/buffers + volumes: + - name: varlog + hostPath: + path: /var/log + - name: varlibdockercontainers + hostPath: + path: /var/lib/docker/containers + - name: fluent-bit-config + configMap: + name: fluent-bit-config + - name: buffer + emptyDir: {} +``` + +## Common Use Cases + +### Gardener Seed Cluster + +Send logs from Seed cluster to central VictoriaLogs: + +```ini +[Output] + Name gardener + Match * + SeedType OTLPGRPC + Endpoint victorialogs.garden.svc:4317 + + # TLS configuration + TLSCertFile /etc/ssl/fluent-bit/tls.crt + TLSKeyFile /etc/ssl/fluent-bit/tls.key + TLSCAFile /etc/ssl/fluent-bit/ca.crt + + # Buffering + DQueDir /fluent-bit/buffers + DQueName seed-logs + + # Metadata + HostnameValue ${SEED_NAME} + Origin seed +``` + +### Gardener Shoot Control Plane + +Route logs from Shoot control planes to both Seed and Shoot clusters: + +```ini +[Output] + Name gardener + Match kubernetes.* + + # Default Seed client + SeedType OTLPGRPC + Endpoint victorialogs-seed.logging.svc:4317 + + # Dynamic Shoot routing + ShootType OTLPGRPC + DynamicHostPath {"kubernetes": {"namespace_name": "namespace"}} + DynamicHostPrefix victorialogs. 
+ DynamicHostSuffix .svc.cluster.local:4317 + DynamicHostRegex ^shoot-- + + # State-based routing + SendLogsToMainClusterWhenIsInReadyState true + SendLogsToMainClusterWhenIsInHibernatingState false + SendLogsToDefaultClientWhenClusterIsInHibernatingState true + + # Buffering per client + DQueDir /fluent-bit/buffers/shoot-cp + DQueName shoot-cp-logs +``` + +### Multi-Tenant Setup + +Route logs to different tenants based on namespace: + +```ini +[Output] + Name gardener + Match kubernetes.* + + # HTTP client for multi-tenancy + SeedType OTLPHTTP + Endpoint https://victorialogs.example.com/insert/opentelemetry/v1/logs + + # Extract tenant from namespace and add as header + DynamicHostPath {"kubernetes": {"namespace_name": "namespace"}} + Headers X-Scope-OrgID ${NAMESPACE} + + # Authentication + TLSCertFile /etc/ssl/fluent-bit/tls.crt + TLSKeyFile /etc/ssl/fluent-bit/tls.key +``` + +## Verification + +### Check Plugin Status + +```bash +# Check Fluent Bit logs +kubectl logs -n logging daemonset/fluent-bit + +# Check metrics +curl http://localhost:2021/metrics + +# Check health +curl http://localhost:2021/healthz +``` + +### Verify Logs in Backend + +```bash +# VictoriaLogs query +curl -G 'http://victorialogs:9428/select/logsql/query' \ + --data-urlencode 'query={origin="seed"} | limit 10' + +# Check OTLP endpoint +grpcurl -plaintext victorialogs:4317 list +``` + +### Debug Issues + +```bash +# Enable debug logging +# Set LogLevel debug in config + +# Check queue status +curl http://localhost:2021/metrics | grep dque_queue_size + +# Profile CPU usage +go tool pprof http://localhost:2021/debug/pprof/profile + +# Check goroutines +curl http://localhost:2021/debug/pprof/goroutine?debug=2 +``` + +## Performance Tuning + +### High-Volume Environments + +```ini +[Output] + Name gardener + Match * + SeedType OTLPGRPC + Endpoint victorialogs.logging.svc:4317 + + # Larger batches + DQueBatchProcessorMaxBatchSize 512 + DQueBatchProcessorMaxQueueSize 2048 + + # Compression + 
Compression 1 + + # Faster flushing + DQueBatchProcessorExportInterval 500ms + + # Buffering + DQueDir /fluent-bit/buffers + DQueSegmentSize 1000 +``` + +### Low-Latency Requirements + +```ini +[Output] + Name gardener + Match * + SeedType OTLPGRPC + Endpoint victorialogs.logging.svc:4317 + + # Smaller batches, faster export + DQueBatchProcessorMaxBatchSize 64 + DQueBatchProcessorExportInterval 100ms + + # No compression + Compression 0 +``` + +### Memory-Constrained Environments + +```ini +[Output] + Name gardener + Match * + SeedType OTLPGRPC + Endpoint victorialogs.logging.svc:4317 + + # Smaller queue + DQueBatchProcessorMaxQueueSize 256 + DQueBatchProcessorMaxBatchSize 128 + + # Smaller segments + DQueSegmentSize 250 +``` + +## Next Steps + +- Review [Configuration Guide](configuration.md) for detailed options +- See [Troubleshooting Guide](troubleshooting.md) for common issues +- Check [Architecture](architecture.md) for design details + diff --git a/docs/usage/README.md b/docs/usage/README.md deleted file mode 100644 index 27829e33a..000000000 --- a/docs/usage/README.md +++ /dev/null @@ -1,54 +0,0 @@ -# Logging Plugin - -This guide is about Gardener Logging, how it is organized and how to use the dashboard to view the log data of Kubernetes clusters. - -## Cluster level logging - -Log data is fundamental for the successful operation activities of Kubernetes landscapes. It is used for investigating problems and monitoring cluster activity. - -Cluster level logging is the recommended way to collect and store log data for Kubernetes cluster components. With cluster level logging the log data is externalized -in a logging backend where the log lifecycle management is independent from the lifecycle management of the Kubernetes resources. - -Cluster level logging is not available by default with [Kubernetes](https://kubernetes.io/docs/concepts/cluster-administration/logging/#cluster-level-logging-architectures) and consumers have to additionally implement it. 
-The Kubernetes project only provides basic logging capabilities via `kubectl logs` where the kubelet keeps one terminated container with its logs. -When a pod is evicted from the node, all corresponding containers are also evicted, along with their logs. -This is why the default log storage solution is considered short-lived and not sufficient when you want to properly operate a Kubernetes environment. - -Gardener, as an advanced Kubernetes management solution, follows the general recommendations and offers a cluster level logging solution to ensure proper log storage for all managed Kubernetes resources. -The log management is setup when a new cluster is created. -Log collection is organized using [fluent-bit](https://fluentbit.io). -Log storage and search is organized using [Vali](https://github.com/credativ/vali). -Log visualization is available using [Plutono](https://github.com/credativ/plutono) that is deployed with predefined dashboard and visualization for every shoot cluster. - -Using Kubernetes operators can benefit from different capabilities like accessing the logs for -already terminated containers and performing fast and sophisticated search queries for investigating long-lasting or recurring problems based on logs from a long period of time. - -In this guide, you will find out how to explore the log data for your clusters. - -## Exploring logs - -The sections below describe how access Grafana and use it to view the log data of your Kubernetes cluster. - -### Accessing Plutono - -Plutono UI is visible on the Shoot panel in the Gardner Dashboard App. Usually it follows a naming convention of the seeds clusters and can be bookmarked for convinience. - -### Using Plutono - -There are two options to explore log messages in Plutono. - -#### Predefined Dashboards - -The `Plutono` dashboards containing logs table are tagged with label `logging` for convinient dashboard filtering. - -#### Explore tab - -The second option is to use the **Explore** tab. 
- -The explore tab allows filtering logs from the connected backend using the `Vali` LogQL. The latter is completly compatible with loki logql. The filters can be build either by selecting fields in the `Log Browser` or by entering the desired filters manuall. The UI supports auto completion of the filter names for convinience. - -An Example filter are: - -- `{pod_name="kube-apiserver-1234-1234"}` to select logs from the given pod -- `{pod_name=~"kube-apiserver.+"}` to use a regex in as pod name -- `sum(count_over_time({container_name="updater"}[5m]))` to aggregate logs count from a given container over time diff --git a/example/kind/cluster-crd.yaml b/example/kind/cluster-crd.yaml deleted file mode 100644 index d17405d9c..000000000 --- a/example/kind/cluster-crd.yaml +++ /dev/null @@ -1,76 +0,0 @@ ---- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - controller-gen.kubebuilder.io/version: v0.14.0 - name: clusters.extensions.gardener.cloud -spec: - conversion: - strategy: None - group: extensions.gardener.cloud - names: - kind: Cluster - listKind: ClusterList - plural: clusters - singular: cluster - scope: Cluster - versions: - - additionalPrinterColumns: - - description: creation timestamp - jsonPath: .metadata.creationTimestamp - name: Age - type: date - name: v1alpha1 - schema: - openAPIV3Schema: - description: Cluster is a specification for a Cluster resource. - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. 
- In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: ClusterSpec is the spec for a Cluster resource. - properties: - cloudProfile: - description: |- - CloudProfile is a raw extension field that contains the cloudprofile resource referenced - by the shoot that has to be reconciled. - type: object - x-kubernetes-preserve-unknown-fields: true - seed: - description: |- - Seed is a raw extension field that contains the seed resource referenced by the shoot that - has to be reconciled. - type: object - x-kubernetes-preserve-unknown-fields: true - shoot: - description: Shoot is a raw extension field that contains the shoot resource that has to be reconciled. - type: object - x-kubernetes-preserve-unknown-fields: true - required: - - cloudProfile - - seed - - shoot - type: object - required: - - spec - type: object - served: true - storage: true - subresources: - status: {} diff --git a/example/kind/fluent-bit-vali-conf.yaml b/example/kind/fluent-bit-vali-conf.yaml deleted file mode 100644 index de5e7274f..000000000 --- a/example/kind/fluent-bit-vali-conf.yaml +++ /dev/null @@ -1,41 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: fluent-bit-vali-conf - namespace: fluent-bit -data: - fluent-bit.conf: | - [Service] - Daemon false - Http_Listen 0.0.0.0 - Http_Port 2020 - Http_Server true - Log_Level info - - [Input] - Name dummy - Tag dummy - Interval_sec 3 - Dummy {"message":"The log message","namespace_name":"localhost"} - Metadata {"severity":"INFO","job":"dummy"} - - [Output] - Name gardenervali - Match * - Labels {origin="dummy"} - Url http://vali-0.vali.svc:3100/vali/api/v1/push - LogLevel debug - BatchWait 3s - BatchSize 10240 - LineFormat json - SortByTimestamp true - DropSingleKey false - RemoveKeys stream - MaxRetries 3 - Timeout 10s - MinBackoff 15s - Buffer true - BufferType dque - QueueDir /fluent-bit/buffers - 
QueueSync normal - QueueName dummy diff --git a/example/kind/fluent-bit-vali.yaml b/example/kind/fluent-bit-vali.yaml deleted file mode 100644 index b1114a6f3..000000000 --- a/example/kind/fluent-bit-vali.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: fluent-bit-vali - namespace: fluent-bit -spec: - selector: - matchLabels: - app: fluent-bit-vali - template: - metadata: - labels: - app: fluent-bit-vali - spec: - tolerations: - - operator: Exists - containers: - - name: fluent-bit-vali - image: fluent-bit-vali - volumeMounts: - - name: fluent-bit-vali-conf - mountPath: /fluent-bit/config/fluent-bit.conf - subPath: fluent-bit.conf - volumes: - - name: fluent-bit-vali-conf - configMap: - name: fluent-bit-vali-conf diff --git a/example/kind/kind-config.yaml b/example/kind/kind-config.yaml deleted file mode 100644 index 2869cfa8f..000000000 --- a/example/kind/kind-config.yaml +++ /dev/null @@ -1,28 +0,0 @@ ---- -apiVersion: kind.x-k8s.io/v1alpha4 -kind: Cluster -nodes: -- role: control-plane - labels: - topology.kubernetes.io/zone: "0" - extraPortMappings: - - containerPort: 30443 - hostPort: 6443 - kubeadmConfigPatches: - - | - kind: ClusterConfiguration - apiServer: - extraArgs: - authorization-mode: RBAC,Node - - | - apiVersion: kubelet.config.k8s.io/v1beta1 - kind: KubeletConfiguration - maxPods: 50 - serializeImagePulls: false - registryPullQPS: 10 - registryBurst: 20 - -networking: - ipFamily: ipv4 - podSubnet: 10.0.0.0/24 - serviceSubnet: 10.100.0.0/16 diff --git a/example/kind/vali.yaml b/example/kind/vali.yaml deleted file mode 100644 index 894814a2c..000000000 --- a/example/kind/vali.yaml +++ /dev/null @@ -1,56 +0,0 @@ ---- -apiVersion: v1 -kind: Service -metadata: - name: vali - labels: - app: vali -spec: - ports: - - port: 3100 - name: vali - clusterIP: None - selector: - app: vali ---- -# create vali statefulset -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: vali -spec: - serviceName: vali - 
replicas: 1 - selector: - matchLabels: - app: vali - template: - metadata: - labels: - app: vali - spec: - containers: - - name: vali - image: ghcr.io/credativ/vali:v2.2.27 - ports: - - containerPort: 3100 - name: vali - volumeMounts: - - name: vali - mountPath: /data/vali - volumes: - - name: vali - emptyDir: {} ---- -apiVersion: v1 -kind: Service -metadata: - name: vali-0 - labels: - app: vali-0 -spec: - ports: - - port: 3100 - name: vali - selector: - statefulset.kubernetes.io/pod-name: "vali-0" diff --git a/example/performance-test/Makefile b/example/performance-test/Makefile index 511f0722c..1c93813e7 100644 --- a/example/performance-test/Makefile +++ b/example/performance-test/Makefile @@ -11,10 +11,8 @@ LOGS_DELAY ?= 25s # Delay between log messages SCENARIO ?= shoot # Test scenario: "shoot" or "seed" -LOGCLI_BIN := $(DIR)/bin/logcli - # Default target -.PHONY: help logcli setup run fetch down clean test check check-kubeconfig +.PHONY: help setup run fetch down clean test check check-kubeconfig help: @echo "Available targets:" @echo " setup - Install fluent-bit chart and apply cluster CRD" @@ -23,7 +21,6 @@ help: @echo " check - Show current number of logs in Vali" @echo " down - Clean up test clusters and log generation jobs" @echo " clean - Remove all test components" - @echo " logcli - Download and install logcli tool" @echo "" @echo "Environment variables:" @echo " CLUSTERS - Number of clusters to create (default: $(CLUSTERS))" @@ -31,26 +28,8 @@ help: @echo " LOGS - Number of logs per job (default: $(LOGS))" @echo " LOGS_DELAY - Delay between log messages (default: $(LOGS_DELAY))" -# Download and install logcli tool using a dedicated Go module - -logcli: $(LOGCLI_BIN) - @echo "logcli present: $(LOGCLI_BIN)" - -$(LOGCLI_BIN): - @echo "Building logcli from credativ/vali (clone & build) ..." 
- @mkdir -p $(DIR)/bin - @TMP_DIR=$$(mktemp -d) && \ - cd $$TMP_DIR && \ - git clone --depth 1 https://github.com/credativ/vali.git && \ - cd vali && \ - echo "Building logcli..." && \ - CGO_ENABLED=0 GOTOOLCHAIN=go1.22.0 go build -ldflags="-s -w" -tags netgo -o logcli ./cmd/logcli && \ - mv $$TMP_DIR/vali/logcli $(DIR)/bin/logcli - @rm -rf $$TMP_DIR - @echo "logcli installed: bin/logcli" - # Install fluent-bit chart and apply cluster CRD -setup: logcli check-kubeconfig +setup: check-kubeconfig @echo "Setting up logging performance test environment..." @$(DIR)/setup.sh diff --git a/example/performance-test/charts/fluent-bit-plugin/Chart.yaml b/example/performance-test/charts/fluent-bit-plugin/Chart.yaml index 1a85a82f3..70b14ee8c 100644 --- a/example/performance-test/charts/fluent-bit-plugin/Chart.yaml +++ b/example/performance-test/charts/fluent-bit-plugin/Chart.yaml @@ -3,4 +3,4 @@ name: fluent-bit-plugin description: A Helm chart for Kubernetes type: application version: 0.1.0 -appVersion: "0.67.0" +appVersion: "1.0.0" diff --git a/example/performance-test/charts/fluent-bit-plugin/dashboards/plutono-fluent-bit-dashboard.json b/example/performance-test/charts/fluent-bit-plugin/dashboards/plutono-fluent-bit-dashboard.json index ed8ce5c5c..1f44b2063 100644 --- a/example/performance-test/charts/fluent-bit-plugin/dashboards/plutono-fluent-bit-dashboard.json +++ b/example/performance-test/charts/fluent-bit-plugin/dashboards/plutono-fluent-bit-dashboard.json @@ -16,7 +16,7 @@ "editable": true, "gnetId": 7752, "graphTooltip": 1, - "iteration": 1763975703415, + "iteration": 1766128205437, "links": [], "panels": [ { @@ -59,7 +59,7 @@ "overrides": [] }, "gridPos": { - "h": 5, + "h": 3, "w": 4, "x": 0, "y": 1 @@ -85,79 +85,80 @@ { "exemplar": true, "expr": "sum(fluentbit_input_records_total{pod=~\"$pod\"})", + "instant": false, "interval": "", "legendFormat": "", "queryType": "randomWalk", "refId": "A" } ], - "title": "[Fluentbit] Input Plugin Total Records", + "title": 
"Input plugin total records", "type": "stat" }, { "datasource": "prometheus", - "description": "", + "description": "Average Input bytes processing rate of all fluent-bits.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, - "displayName": "Records", "mappings": [], - "min": 0, "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { - "color": "blue", + "color": "green", "value": null } ] }, - "unit": "short" + "unit": "deckbytes" }, "overrides": [] }, "gridPos": { - "h": 5, - "w": 4, + "h": 6, + "w": 5, "x": 4, "y": 1 }, - "id": 74, + "id": 2, + "links": [], "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", "reduceOptions": { "calcs": [ - "lastNotNull" + "mean" ], "fields": "", "values": false }, - "text": {}, - "textMode": "value" + "showThresholdLabels": false, + "showThresholdMarkers": false, + "text": {} }, "pluginVersion": "7.5.42", "targets": [ { "exemplar": true, - "expr": "sum(fluentbit_output_proc_records_total{pod=~\"$pod\"})", + "expr": "sum(rate(fluentbit_input_bytes_total{pod=~\"$pod\"}[$__rate_interval])) by (name) / 1024 ", + "format": "time_series", + "hide": false, "interval": "", - "legendFormat": "", - "queryType": "randomWalk", + "intervalFactor": 1, + "legendFormat": "{{name}}", "refId": "A" } ], - "title": "[Fluentbit] Output Plugin Total Records", - "type": "stat" + "timeFrom": null, + "timeShift": null, + "title": "Input Plugin bytes/s", + "type": "gauge" }, { "datasource": "prometheus", - "description": "Average Input bytes processing rate of all fluent-bits.", + "description": "Output plugin bytes processing rate of all fluent-bits.", "fieldConfig": { "defaults": { "color": { @@ -169,7 +170,7 @@ "mode": "absolute", "steps": [ { - "color": "green", + "color": "blue", "value": null } ] @@ -179,12 +180,12 @@ "overrides": [] }, "gridPos": { - "h": 5, - "w": 8, - "x": 8, + "h": 6, + "w": 5, + "x": 9, "y": 1 }, - "id": 2, + "id": 9, "links": [], "options": { 
"reduceOptions": { @@ -202,7 +203,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(fluentbit_input_bytes_total{pod=~\"$pod\"}[$__rate_interval])) by (name) / 1024 ", + "expr": "sum(rate(fluentbit_output_proc_bytes_total{pod=~\"$pod\"}[$__rate_interval])) by (name) /1024", "format": "time_series", "hide": false, "interval": "", @@ -213,12 +214,12 @@ ], "timeFrom": null, "timeShift": null, - "title": "[Fluentbit] Input Plugin Avg Bytes", + "title": "Output Plugin bytes/s", "type": "gauge" }, { "datasource": "prometheus", - "description": "Output plugin bytes processing rate of all fluent-bits.", + "description": "Output plugin errors per name", "fieldConfig": { "defaults": { "color": { @@ -235,17 +236,17 @@ } ] }, - "unit": "deckbytes" + "unit": "short" }, "overrides": [] }, "gridPos": { - "h": 5, - "w": 8, - "x": 16, + "h": 6, + "w": 5, + "x": 14, "y": 1 }, - "id": 9, + "id": 96, "links": [], "options": { "reduceOptions": { @@ -263,7 +264,68 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(fluentbit_output_proc_bytes_total{pod=~\"$pod\"}[$__rate_interval])) by (name) /1024", + "expr": "sum(fluentbit_output_errors_total{pod=~\"$pod\"}) by (name)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Output Plugin errors", + "type": "gauge" + }, + { + "datasource": "prometheus", + "description": "Output plugin dropped records per name", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 19, + "y": 1 + }, + "id": 99, + "links": [], + "options": { + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + 
"showThresholdMarkers": false, + "text": {} + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum(fluentbit_output_dropped_records_total{pod=~\"$pod\"}) by (name)", "format": "time_series", "hide": false, "interval": "", @@ -274,9 +336,70 @@ ], "timeFrom": null, "timeShift": null, - "title": "[Fluentbit] Output Plugin Avg Bytes", + "title": "Output Plugin dropped records", "type": "gauge" }, + { + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "displayName": "Records", + "mappings": [], + "min": 0, + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 4 + }, + "id": 74, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "value" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum(fluentbit_output_proc_records_total{pod=~\"$pod\"})", + "interval": "", + "legendFormat": "", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Output Plugin total records", + "type": "stat" + }, { "datasource": "prometheus", "fieldConfig": { @@ -290,7 +413,7 @@ "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 10, + "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "graph": false, @@ -324,21 +447,22 @@ "overrides": [] }, "gridPos": { - "h": 7, + "h": 6, "w": 12, "x": 0, - "y": 6 + "y": 7 }, "id": 69, "options": { "graph": {}, "legend": { "calcs": [ + "last", "mean", - "sum" + "max" ], "displayMode": "table", - "placement": "right" + "placement": "bottom" }, "tooltipOptions": { "mode": "single" @@ -353,11 +477,19 @@ "legendFormat": "{{name}}", 
"queryType": "randomWalk", "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(rate(fluentbit_input_storage_overlimit{pod=~\"$pod\"}[$__rate_interval])) by (name)", + "hide": false, + "interval": "", + "legendFormat": "{{name}}", + "refId": "B" } ], "timeFrom": null, "timeShift": null, - "title": "[Fluentbit] Rate Input Plugin Records ", + "title": "Input Plugin records/s", "type": "timeseries" }, { @@ -372,7 +504,7 @@ "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 10, + "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "graph": false, @@ -410,21 +542,22 @@ "overrides": [] }, "gridPos": { - "h": 7, + "h": 6, "w": 12, "x": 12, - "y": 6 + "y": 7 }, "id": 71, "options": { "graph": {}, "legend": { "calcs": [ + "last", "mean", - "sum" + "max" ], "displayMode": "table", - "placement": "right" + "placement": "bottom" }, "tooltipOptions": { "mode": "single" @@ -441,7 +574,7 @@ "refId": "A" } ], - "title": "[Fluentbit] Output Plugin Records", + "title": "Output Plugin records/s", "type": "timeseries" }, { @@ -499,8 +632,9 @@ "graph": {}, "legend": { "calcs": [ + "last", "mean", - "sum" + "max" ], "displayMode": "table", "placement": "right" @@ -523,7 +657,7 @@ ], "timeFrom": null, "timeShift": null, - "title": "[Fluentbit] Input Records Per Second", + "title": "Input Plugin pods records/s", "type": "timeseries" }, { @@ -583,8 +717,9 @@ "graph": {}, "legend": { "calcs": [ + "last", "mean", - "sum" + "max" ], "displayMode": "table", "placement": "right" @@ -607,41 +742,49 @@ ], "timeFrom": null, "timeShift": null, - "title": "[Fluentbit] Output Records Per Second", + "title": "Output Plugin pods records/s", "type": "timeseries" }, - { - "collapsed": false, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 86, - "panels": [], - "title": "Output Plugin", - "type": "row" - }, { "datasource": "prometheus", "description": "", "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + 
"mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 2, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true }, - "displayName": "Records", "mappings": [], - "min": 0, - "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null + }, + { + "color": "red", + "value": 80 } ] }, @@ -650,308 +793,46 @@ "overrides": [] }, "gridPos": { - "h": 5, - "w": 3, + "h": 8, + "w": 12, "x": 0, - "y": 22 + "y": 21 }, - "id": 75, + "id": 122, "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { + "graph": {}, + "legend": { "calcs": [ - "lastNotNull" + "last" ], - "fields": "", - "values": false + "displayMode": "table", + "placement": "right" }, - "text": {}, - "textMode": "value" + "tooltipOptions": { + "mode": "single" + } }, "pluginVersion": "7.5.42", "targets": [ { "exemplar": true, - "expr": "sum(fluentbit_vali_gardener_incoming_logs_total{pod=~\"$pod\",host=~\"$host\"})", + "expr": "fluentbit_storage_mem_chunks{pod=~\"$pod\"}", + "hide": false, "interval": "", - "legendFormat": "", + "legendFormat": "mem-{{pod}}", "queryType": "randomWalk", - "refId": "A" + "refId": "B" } ], - "title": "[Output Plugin] Input Total Records", - "type": "stat" + "title": "Chunks in memory", + "type": "timeseries" }, { "datasource": "prometheus", - "description": "Total records send to output plugin clients", "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" - }, - "displayName": "Records", - "mappings": [], - "min": 0, - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "blue", - "value": null - } - ] - }, - "unit": "short" - }, 
- "overrides": [] - }, - "gridPos": { - "h": 5, - "w": 3, - "x": 3, - "y": 22 - }, - "id": 76, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "value" - }, - "pluginVersion": "7.5.42", - "targets": [ - { - "exemplar": true, - "expr": "sum(fluentbit_vali_gardener_forwarded_logs_total{pod=~\"$pod\",host=~\".+$host.+\"})", - "interval": "", - "legendFormat": "", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "[Output Plugin] Forwarder Total Records", - "type": "stat" - }, - { - "datasource": "prometheus", - "description": "Total errors by output plugin", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "displayName": "Records", - "mappings": [], - "min": 0, - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "green", - "value": 0 - }, - { - "color": "#EAB839", - "value": 10 - }, - { - "color": "orange", - "value": 20 - }, - { - "color": "red", - "value": 30 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 5, - "w": 2, - "x": 6, - "y": 22 - }, - "id": 78, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "value" - }, - "pluginVersion": "7.5.42", - "targets": [ - { - "exemplar": true, - "expr": "sum(fluentbit_vali_gardener_errors_total{pod=~\"$pod\",host=~\"$host\"})", - "interval": "", - "legendFormat": "", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "[Output Plugin] Total Errors", - "type": "stat" - }, - { - "datasource": "prometheus", - "description": "Total records dropped by output plugin", - "fieldConfig": { - "defaults": { - 
"color": { - "mode": "thresholds" - }, - "displayName": "Records", - "mappings": [], - "min": 0, - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "purple", - "value": null - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 5, - "w": 2, - "x": 8, - "y": 22 - }, - "id": 77, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "value" - }, - "pluginVersion": "7.5.42", - "targets": [ - { - "exemplar": true, - "expr": "sum(fluentbit_vali_gardener_dropped_logs_total{pod=~\"$pod\",host=~\"$host\"})", - "interval": "", - "legendFormat": "", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "[Output Plugin] Dropped Total Records", - "type": "stat" - }, - { - "datasource": "prometheus", - "description": "Total records without metadata", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "displayName": "Records", - "mappings": [], - "min": 0, - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "purple", - "value": null - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 5, - "w": 2, - "x": 10, - "y": 22 - }, - "id": 79, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "value" - }, - "pluginVersion": "7.5.42", - "targets": [ - { - "exemplar": true, - "expr": "sum(fluentbit_vali_gardener_logs_without_metadata_total{pod=~\"$pod\",host=~\"$host\"})", - "interval": "", - "legendFormat": "", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "[Output Plugin] No Metadata Records", - "type": "stat" - }, - { - "datasource": "prometheus", - 
"description": "buffer sizes output plugin", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "mode": "palette-classic" }, "custom": { "axisLabel": "", @@ -975,13 +856,16 @@ "spanNulls": true }, "mappings": [], - "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null + }, + { + "color": "red", + "value": 80 } ] }, @@ -990,16 +874,16 @@ "overrides": [] }, "gridPos": { - "h": 5, + "h": 8, "w": 12, "x": 12, - "y": 22 + "y": 21 }, - "id": 95, + "id": 139, "options": { + "graph": {}, "legend": { "calcs": [ - "mean", "last" ], "displayMode": "table", @@ -1013,20 +897,18 @@ "targets": [ { "exemplar": true, - "expr": "fluentbit_vali_gardener_dque_size{pod=~\"$pod\",name=~\"$host\"}", - "instant": false, + "expr": "fluentbit_storage_fs_chunks{pod=~\"$pod\"}", "interval": "", - "legendFormat": "{{pod}}/{{name}}", + "legendFormat": "{{pod}}", "queryType": "randomWalk", "refId": "A" } ], - "title": "[Output Plugin] DQue Current Size", + "title": "Chunks on filesystem", "type": "timeseries" }, { "datasource": "prometheus", - "description": "Output plugin total logs", "fieldConfig": { "defaults": { "color": { @@ -1060,26 +942,29 @@ { "color": "green", "value": null + }, + { + "color": "red", + "value": 80 } ] }, - "unit": "rps" + "unit": "short" }, "overrides": [] }, "gridPos": { "h": 8, - "w": 11, + "w": 12, "x": 0, - "y": 27 + "y": 29 }, - "id": 56, + "id": 120, "options": { "graph": {}, "legend": { "calcs": [ - "mean", - "sum" + "last" ], "displayMode": "table", "placement": "right" @@ -1092,22 +977,19 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(fluentbit_vali_gardener_incoming_logs_total{pod=~\"$pod\",host=~\"$host\"}[$__rate_interval])) by (host)", - "instant": false, + "expr": "fluentbit_storage_fs_chunks_up{pod=~\"$pod\"}", + "hide": false, "interval": "", - "legendFormat": "{{host}}", - "refId": "A" + "legendFormat": "up-{{pod}}", + "queryType": "randomWalk", + "refId": "B" } ], - 
"timeFrom": null, - "timeShift": null, - "title": "[Output Plugin] Incoming Logs Rate", - "transformations": [], + "title": "Chunks on filesystem - up", "type": "timeseries" }, { "datasource": "prometheus", - "description": "Output plugin total logs", "fieldConfig": { "defaults": { "color": { @@ -1141,26 +1023,29 @@ { "color": "green", "value": null + }, + { + "color": "red", + "value": 80 } ] }, - "unit": "rps" + "unit": "short" }, "overrides": [] }, "gridPos": { "h": 8, - "w": 13, - "x": 11, - "y": 27 + "w": 12, + "x": 12, + "y": 29 }, - "id": 80, + "id": 140, "options": { "graph": {}, "legend": { "calcs": [ - "mean", - "sum" + "last" ], "displayMode": "table", "placement": "right" @@ -1173,753 +1058,2158 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(fluentbit_vali_gardener_forwarded_logs_total{pod=~\"$pod\",host=~\".+$host.+\"}[$__rate_interval])) by (host)", - "instant": false, + "expr": "fluentbit_storage_fs_chunks_down{pod=~\"$pod\"}", + "hide": false, "interval": "", - "legendFormat": "{{host}}", - "refId": "A" + "legendFormat": "down-{{pod}}", + "queryType": "randomWalk", + "refId": "B" } ], - "timeFrom": null, - "timeShift": null, - "title": "[Output Plugin] Forwarded Logs Rate", - "transformations": [], + "title": "Chunks on filesystem - down", "type": "timeseries" }, { - "datasource": "prometheus", - "description": "Output plugin errors", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "graph": false, - "legend": false, - "tooltip": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": 
"errors" - }, - "overrides": [] - }, + "collapsed": true, + "datasource": null, "gridPos": { - "h": 8, - "w": 8, + "h": 1, + "w": 24, "x": 0, - "y": 35 - }, - "id": 82, - "options": { - "graph": {}, - "legend": { - "calcs": [ - "mean", - "sum" - ], - "displayMode": "table", - "placement": "right" - }, - "tooltipOptions": { - "mode": "single" - } + "y": 37 }, - "pluginVersion": "7.5.42", - "targets": [ + "id": 86, + "panels": [ { - "exemplar": true, - "expr": "sum(rate(fluentbit_vali_gardener_errors_total{pod=~\"$pod\",host=~\"$host\"}[$__rate_interval])) by (host)", - "instant": false, - "interval": "", - "legendFormat": "{{host}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "[Output Plugin] Errors Logs Rate", - "transformations": [], - "type": "timeseries" - }, - { - "datasource": "prometheus", - "description": "Output plugin dropped logs", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "graph": false, - "legend": false, - "tooltip": false + "datasource": "prometheus", + "description": "Output plugin input records", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "displayName": "Records", + "mappings": [], + "min": 0, + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 2 + }, + "id": 75, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - "showPoints": "never", - "spanNulls": 
true - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "errors" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 7, - "x": 8, - "y": 35 - }, - "id": 89, - "options": { - "graph": {}, - "legend": { - "calcs": [ - "mean", - "sum" + "text": {}, + "textMode": "value" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum(fluentbit_gardener_incoming_logs_total{pod=~\"$pod\",host=~\"$host\"})", + "interval": "", + "legendFormat": "", + "queryType": "randomWalk", + "refId": "A" + } ], - "displayMode": "table", - "placement": "right" + "title": "Total incoming records", + "type": "stat" }, - "tooltipOptions": { - "mode": "single" - } - }, - "pluginVersion": "7.5.42", - "targets": [ { - "exemplar": true, - "expr": "sum(rate(fluentbit_vali_gardener_dropped_logs_total{pod=~\"$pod\",host=~\"$host\"}[$__rate_interval])) by (host)", - "instant": false, - "interval": "", - "legendFormat": "{{host}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "[Output Plugin] Dropped Logs Rate", - "transformations": [], - "type": "timeseries" - }, - { - "datasource": "prometheus", - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "graph": false, - "legend": false, - "tooltip": false + "datasource": "prometheus", + "description": "Total records send to output plugin clients", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "displayName": "Records", + "mappings": [], + "min": 0, + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - 
"scaleDistribution": { - "type": "linear" + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 2 + }, + "id": 76, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - "showPoints": "never", - "spanNulls": true - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "errors" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 9, - "x": 15, - "y": 35 - }, - "id": 90, - "options": { - "graph": {}, - "legend": { - "calcs": [ - "mean", - "sum" + "text": {}, + "textMode": "value" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum(fluentbit_gardener_output_client_logs_total{pod=~\"$pod\",host=~\".+$host.svc.cluster.local:.+\"})", + "interval": "", + "legendFormat": "", + "queryType": "randomWalk", + "refId": "A" + } ], - "displayMode": "table", - "placement": "right" + "title": "Total output records", + "type": "stat" }, - "tooltipOptions": { - "mode": "single" - } - }, - "pluginVersion": "7.5.42", - "targets": [ { - "exemplar": true, - "expr": "sum(rate(fluentbit_vali_gardener_logs_without_metadata_total{pod=~\"$pod\",host=~\".*$host.*\"}[$__rate_interval])) by (host)", - "instant": false, - "interval": "", - "legendFormat": "{{host}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "[Output Plugin] Logs without metadata Rate", - "transformations": [], - "type": "timeseries" - }, - { - "collapsed": false, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 43 - }, - "id": 92, - "panels": [], - "title": "ValiTail", - "type": "row" - }, - { - "datasource": "prometheus", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - 
"color": "green", - "value": null - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 5, - "x": 0, - "y": 44 - }, - "id": 94, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 8, + "y": 2 + }, + "id": 142, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum by (type)(fluentbit_gardener_clients_total{pod=~\"$pod\"})", + "interval": "", + "legendFormat": "{{type}}", + "queryType": "randomWalk", + "refId": "A" + } ], - "fields": "", - "values": false + "title": "Total sending clients", + "type": "stat" }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "7.5.42", - "targets": [ { - "exemplar": true, - "expr": "sum(valitail_sent_entries_total{pod=~\"$pod\",host=~\"$url\"})", - "interval": "", - "legendFormat": "{{host}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "ValiTail Sent Logs Total", - "type": "stat" - }, - { - "datasource": "prometheus", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "graph": false, - "legend": false, - "tooltip": false + 
"datasource": "prometheus", + "description": "Total errors by output plugin", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "displayName": "Records", + "mappings": [], + "min": 0, + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 10 + }, + { + "color": "orange", + "value": 20 + }, + { + "color": "red", + "value": 30 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 14, + "y": 2 + }, + "id": 78, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "text": {}, + "textMode": "value" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum(fluentbit_gardener_errors_total{pod=~\"$pod\",host=~\"$host\"})", + "interval": "", + "legendFormat": "", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Total errors", + "type": "stat" + }, + { + "datasource": "prometheus", + "description": "Total records dropped by output plugin", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "displayName": "Records", + "mappings": [], + "min": 0, + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + }, + "unit": "short" }, - "showPoints": "never", - "spanNulls": true - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 17, + "y": 2 + }, + "id": 77, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + 
"orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "value" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum(fluentbit_gardener_dropped_logs_total{pod=~\"$pod\",host=~\".*$host.*\"})", + "interval": "", + "legendFormat": "", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Total dropped records", + "type": "stat" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "green", + "mode": "fixed" }, - { - "color": "red", - "value": 80 + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] } - ] - }, - "unit": "rps" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 2 + }, + "id": 134, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum(output_plugin_otel_sdk_exporter_log_exported_total{server_address=~\".+$host.svc.cluster.local\"})", + "interval": "", + "legendFormat": "", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Total OTel SDK exports", + "type": "stat" }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 19, - "x": 5, - "y": 44 - }, - "id": 97, - "options": { - "graph": {}, - "legend": { - "calcs": [ - "mean", - "sum" + { + "datasource": "prometheus", + "description": "Output plugin total logs", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + 
"graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "rps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 13, + "x": 0, + "y": 7 + }, + "id": 56, + "options": { + "graph": {}, + "legend": { + "calcs": [ + "last", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(fluentbit_gardener_incoming_logs_total{pod=~\"$pod\",host=~\"$host\"}[$__rate_interval])) by (host)", + "instant": false, + "interval": "", + "legendFormat": "{{host}}", + "refId": "A" + } ], - "displayMode": "table", - "placement": "right" + "timeFrom": null, + "timeShift": null, + "title": "Incoming records/s", + "transformations": [], + "type": "timeseries" }, - "tooltipOptions": { - "mode": "single" - } - }, - "pluginVersion": "7.5.42", - "targets": [ { - "exemplar": true, - "expr": "sum(rate(valitail_sent_entries_total{pod=~\"$pod\",host=~\".+$host.+\"}[$__rate_interval])) by (host)", - "interval": "", - "legendFormat": "{{host}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "ValiTail Client Send Logs", - "type": "timeseries" - }, - { - "datasource": "prometheus", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "graph": false, - "legend": false, - "tooltip": false + "datasource": "prometheus", + "description": "Output plugin total logs", + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "rps" }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 11, + "x": 13, + "y": 7 + }, + "id": 80, + "options": { + "graph": {}, + "legend": { + "calcs": [ + "last", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right" }, - "showPoints": "never", - "spanNulls": true - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(fluentbit_gardener_output_client_logs_total{pod=~\"$pod\",host=~\".+$host.svc.cluster.local.*\"}[$__rate_interval])) by (host)", + "instant": false, + "interval": "", + "legendFormat": "{{host}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Output records/s", + "transformations": [], + "type": "timeseries" + }, + { + "datasource": "prometheus", + "description": "buffer sizes output plugin", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "decbytes" + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + 
"tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 13, + "x": 0, + "y": 15 + }, + "id": 95, + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "fluentbit_gardener_dque_size{pod=~\"$pod\",name=~\"$host\"}", + "instant": false, + "interval": "", + "legendFormat": "{{pod}}/{{name}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "DQue current size", + "type": "timeseries" }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 52 - }, - "id": 103, - "options": { - "graph": {}, - "legend": { - "calcs": [ - "mean", - "sum" + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 11, + "x": 13, + "y": 15 + }, + "id": 138, + "options": { + "graph": {}, + "legend": { + "calcs": [ + "last" + ], + 
"displayMode": "list", + "placement": "right" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum by (host)(fluentbit_gardener_buffered_logs{pod=~\"$pod\",host=~\"$host\"})", + "interval": "", + "legendFormat": "", + "queryType": "randomWalk", + "refId": "A" + } ], - "displayMode": "table", - "placement": "right" + "title": "DQue Buffered records", + "type": "timeseries" }, - "tooltipOptions": { - "mode": "single" - } - }, - "pluginVersion": "7.5.42", - "targets": [ { - "exemplar": true, - "expr": "sum(rate(valitail_sent_bytes_total{pod=~\"$pod\",host=~\".+$host.+\"}[$__rate_interval])) by (host)", - "interval": "", - "legendFormat": "{{host}}", - "queryType": "randomWalk", - "refId": "A" + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 13, + "x": 0, + "y": 23 + }, + "id": 136, + "options": { + "graph": {}, + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum by (host)(rate(fluentbit_gardener_exported_client_logs_total{pod=~\"$pod\",host=~\"$host\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{host}}", + 
"queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Exported records/s", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "description": "Logs throttled by output plugin clients", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 11, + "x": 13, + "y": 23 + }, + "id": 118, + "options": { + "graph": {}, + "legend": { + "calcs": [ + "last", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum by (host) (rate(fluentbit_gardener_throttled_logs_total{pod=~\"$pod\",host=~\"$host\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{host}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Throttled records/s", + "type": "timeseries" } ], - "title": "ValiTail Send Bytes Rate", - "type": "timeseries" + "title": "Output Plugin", + "type": "row" }, { - "datasource": "prometheus", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "graph": false, - "legend": false, - "tooltip": false + "collapsed": true, + "datasource": 
null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 100, + "panels": [ + { + "datasource": "prometheus", + "description": "CPU time spent in user and system mode", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percentunit" }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 104, + "options": { + "legend": { + "calcs": [ + "last", + "mean" + ], + "displayMode": "table", + "placement": "right" }, - "showPoints": "never", - "spanNulls": true - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "rate(process_cpu_seconds_total{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - CPU usage", + "refId": "A" + } + ], + "title": "[Process] CPU Usage", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "description": "Network bytes sent and received", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "decbytes" + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + 
"barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 45 + }, + "id": 106, + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "rate(process_network_transmit_bytes_total{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - TX", + "refId": "A" + }, + { + "exemplar": true, + "expr": "rate(process_network_receive_bytes_total{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - RX", + "refId": "B" + } + ], + "title": "[Process] Network I/O Rate", + "type": "timeseries" }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 52 - }, - "id": 104, - "options": { - "graph": {}, - "legend": { - "calcs": [ - "mean", - "sum" + { + "datasource": "prometheus", + "description": "Memory allocated in heap and currently in use", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": 
"linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 53 + }, + "id": 101, + "options": { + "legend": { + "calcs": [ + "last", + "max" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "go_memstats_heap_alloc_bytes{pod=~\"$pod\"}", + "interval": "", + "legendFormat": "{{pod}} - heap alloc", + "refId": "A" + }, + { + "exemplar": true, + "expr": "go_memstats_heap_inuse_bytes{pod=~\"$pod\"}", + "interval": "", + "legendFormat": "{{pod}} - heap in use", + "refId": "B" + }, + { + "exemplar": true, + "expr": "go_memstats_heap_idle_bytes{pod=~\"$pod\"}", + "interval": "", + "legendFormat": "{{pod}} - heap idle", + "refId": "C" + } + ], + "title": "[Go] Heap Memory Usage", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "description": "Number of heap objects allocated", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 60 + }, + "id": 107, + "options": { + "legend": { + "calcs": [ + "last", + "max" + ], 
+ "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "go_memstats_heap_objects{pod=~\"$pod\"}", + "interval": "", + "legendFormat": "{{pod}} - heap objects", + "refId": "A" + }, + { + "exemplar": true, + "expr": "rate(go_memstats_mallocs_total{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - alloc rate", + "refId": "B" + }, + { + "exemplar": true, + "expr": "rate(go_memstats_frees_total{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - free rate", + "refId": "C" + } + ], + "title": "[Go] Heap Objects & Allocation Rate", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "description": "Number of goroutines currently running", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1000 + }, + { + "color": "red", + "value": 10000 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 68 + }, + "id": 102, + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + 
"expr": "go_goroutines{pod=~\"$pod\"}", + "interval": "", + "legendFormat": "{{pod}}", + "refId": "A" + } ], - "displayMode": "table", - "placement": "right" + "title": "[Go] Goroutines", + "type": "timeseries" }, - "tooltipOptions": { - "mode": "single" - } - }, - "pluginVersion": "7.5.42", - "targets": [ { - "exemplar": true, - "expr": "sum(rate(valitail_dropped_entries_total{pod=~\"$pod\",host=~\"$url\"}[$__rate_interval])) by (host)", - "interval": "", - "legendFormat": "{{host}}", - "queryType": "randomWalk", - "refId": "A" + "datasource": "prometheus", + "description": "GC pause duration and frequency", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 75 + }, + "id": 103, + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "rate(go_gc_duration_seconds_sum{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - GC duration rate", + "refId": "A" + }, + { + "exemplar": true, + "expr": "rate(go_gc_duration_seconds_count{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - GC frequency", + "refId": "B" + } + ], + 
"title": "[Go] GC Duration & Frequency", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "description": "Process memory usage - resident and virtual", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 83 + }, + "id": 105, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "process_resident_memory_bytes{pod=~\"$pod\"}", + "interval": "", + "legendFormat": "{{pod}} - resident memory", + "refId": "A" + }, + { + "exemplar": true, + "expr": "process_virtual_memory_bytes{pod=~\"$pod\"}", + "interval": "", + "legendFormat": "{{pod}} - virtual memory", + "refId": "B" + } + ], + "title": "[Process] Memory Usage", + "type": "timeseries" } ], - "title": "ValiTail Dropped Entries", - "type": "timeseries" + "title": "Output Plugin Go Runtime & Process Metrics", + "type": "row" }, { - "datasource": "prometheus", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "graph": false, - 
"legend": false, - "tooltip": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, + "collapsed": true, + "datasource": null, "gridPos": { - "h": 8, - "w": 12, + "h": 1, + "w": 24, "x": 0, - "y": 60 + "y": 39 }, - "id": 99, - "options": { - "graph": {}, - "legend": { - "calcs": [ - "last" + "id": 110, + "panels": [ + { + "datasource": "prometheus", + "description": "RPC call duration in milliseconds", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 100 + }, + { + "color": "red", + "value": 500 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 39 + }, + "id": 111, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.95, rate(output_plugin_rpc_client_duration_milliseconds_bucket{pod=~\"$pod\"}[$__rate_interval]))", + "interval": 
"", + "legendFormat": "{{pod}} - p95", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, rate(output_plugin_rpc_client_duration_milliseconds_bucket{pod=~\"$pod\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{pod}} - p99", + "refId": "B" + }, + { + "exemplar": true, + "expr": "rate(output_plugin_rpc_client_duration_milliseconds_sum{pod=~\"$pod\"}[$__rate_interval]) / rate(output_plugin_rpc_client_duration_milliseconds_count{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - avg", + "refId": "C" + } ], - "displayMode": "table", - "placement": "right" + "title": "[OTLP gRPC] RPC Duration", + "type": "timeseries" }, - "tooltipOptions": { - "mode": "single" - } - }, - "pluginVersion": "7.5.42", - "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.50, sum(rate(valitail_request_duration_seconds_bucket{host=~\".+$host.+\",status_code=~\".+\"}[5m])) by (le,status_code,host)) * 1000 ", - "hide": false, - "interval": "", - "legendFormat": "p50 / status code {{status_code}}", - "queryType": "randomWalk", - "refId": "C" + "datasource": "prometheus", + "description": "RPC call rate per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 112, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + 
"mean" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "rate(output_plugin_rpc_client_duration_milliseconds_count{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - {{rpc_method}}", + "refId": "A" + } + ], + "title": "[OTLP gRPC] RPC Call Rate", + "type": "timeseries" }, { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(valitail_request_duration_seconds_bucket{host=~\".+$host.+\",status_code=~\".+\"}[5m])) by (le,status_code,host)) * 1000 ", - "interval": "", - "legendFormat": "p95 / status code {{status_code}}", - "queryType": "randomWalk", - "refId": "A" + "datasource": "prometheus", + "description": "Size of RPC request and response messages", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 47 + }, + "id": 113, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.95, 
rate(output_plugin_rpc_client_request_size_bytes_bucket{pod=~\"$pod\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{pod}} - request p95", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.95, rate(output_plugin_rpc_client_response_size_bytes_bucket{pod=~\"$pod\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{pod}} - response p95", + "refId": "B" + }, + { + "exemplar": true, + "expr": "rate(output_plugin_rpc_client_request_size_bytes_sum{pod=~\"$pod\"}[$__rate_interval]) / rate(output_plugin_rpc_client_request_size_bytes_count{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - request avg", + "refId": "C" + }, + { + "exemplar": true, + "expr": "rate(output_plugin_rpc_client_response_size_bytes_sum{pod=~\"$pod\"}[$__rate_interval]) / rate(output_plugin_rpc_client_response_size_bytes_count{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - response avg", + "refId": "D" + } + ], + "title": "[OTLP gRPC] Request/Response Message Size", + "type": "timeseries" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(valitail_request_duration_seconds_bucket{host=~\".+$host.+\",status_code=~\".+\"}[5m])) by (le,status_code,host))* 1000", - "hide": false, - "interval": "", - "legendFormat": "p99 / status code {{status_code}}", - "queryType": "randomWalk", - "refId": "B" - } - ], - "title": "ValiTail Request Percentiles", - "type": "timeseries" - }, - { - "datasource": "prometheus", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "graph": false, - "legend": false, - "tooltip": false + "datasource": "prometheus", + "description": "Number of messages per RPC call", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + 
"custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 47 + }, + "id": 114, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right" }, - "showPoints": "never", - "spanNulls": true - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "rate(output_plugin_rpc_client_requests_per_rpc_sum{pod=~\"$pod\"}[$__rate_interval]) / rate(output_plugin_rpc_client_requests_per_rpc_count{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - avg requests per RPC", + "refId": "A" + }, + { + "exemplar": true, + "expr": "rate(output_plugin_rpc_client_responses_per_rpc_sum{pod=~\"$pod\"}[$__rate_interval]) / rate(output_plugin_rpc_client_responses_per_rpc_count{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - avg responses per RPC", + "refId": "B" + } + ], + "title": "[OTLP gRPC] Messages Per RPC", + "type": "timeseries" }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, 
- "x": 12, - "y": 60 - }, - "id": 101, - "options": { - "graph": {}, - "legend": { - "calcs": [ - "last" + { + "datasource": "prometheus", + "description": "gRPC response status codes distribution", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [ + { + "options": { + "0": { + "text": "OK" + }, + "1": { + "text": "CANCELLED" + }, + "2": { + "text": "UNKNOWN" + }, + "3": { + "text": "INVALID_ARGUMENT" + }, + "4": { + "text": "DEADLINE_EXCEEDED" + }, + "5": { + "text": "NOT_FOUND" + }, + "6": { + "text": "ALREADY_EXISTS" + }, + "7": { + "text": "PERMISSION_DENIED" + }, + "8": { + "text": "RESOURCE_EXHAUSTED" + }, + "9": { + "text": "FAILED_PRECONDITION" + }, + "10": { + "text": "ABORTED" + }, + "11": { + "text": "OUT_OF_RANGE" + }, + "12": { + "text": "UNIMPLEMENTED" + }, + "13": { + "text": "INTERNAL" + }, + "14": { + "text": "UNAVAILABLE" + }, + "15": { + "text": "DATA_LOSS" + }, + "16": { + "text": "UNAUTHENTICATED" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 55 + }, + "id": 116, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": 
"rate(output_plugin_rpc_client_responses_per_rpc_count{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - status {{rpc_grpc_status_code}}", + "refId": "A" + } ], - "displayMode": "table", - "placement": "right" + "title": "[OTLP gRPC] Response Status Codes", + "type": "timeseries" }, - "tooltipOptions": { - "mode": "single" - } - }, - "pluginVersion": "7.5.42", - "targets": [ { - "exemplar": true, - "expr": "sum(rate(valitail_request_duration_seconds_sum{pod=~\"$pod\",host=~\".+$host.+\"}[5m])) by (host)\n/\nsum(rate(valitail_request_duration_seconds_count{pod=~\"$pod\",host=~\".+$host.+\"}[5m])) by (host)\n* 1000", - "interval": "", - "legendFormat": "{{host}}", - "queryType": "randomWalk", - "refId": "A" + "datasource": "prometheus", + "description": "Network throughput for gRPC communication", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 63 + }, + "id": 115, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "rate(output_plugin_rpc_client_request_size_bytes_sum{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + 
"legendFormat": "{{pod}} - request bytes/sec", + "refId": "A" + }, + { + "exemplar": true, + "expr": "rate(output_plugin_rpc_client_response_size_bytes_sum{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - response bytes/sec", + "refId": "B" + } + ], + "title": "[OTLP gRPC] Network Throughput", + "type": "timeseries" } ], - "title": "ValiTail Avg Latency", - "type": "timeseries" + "title": "OTLP gRPC Client Metrics", + "type": "row" } ], "refresh": "5s", @@ -1932,12 +3222,8 @@ "allValue": "fluent-bit.+", "current": { "selected": false, - "text": [ - "All" - ], - "value": [ - "$__all" - ] + "text": "All", + "value": "$__all" }, "datasource": "prometheus", "definition": "label_values(pod)", @@ -1946,17 +3232,17 @@ "hide": 0, "includeAll": true, "label": "Pod", - "multi": true, + "multi": false, "name": "pod", "options": [], "query": { "query": "label_values(pod)", "refId": "prometheus-pod-Variable-Query" }, - "refresh": 1, + "refresh": 2, "regex": "fluent-bit.+", "skipUrlSync": false, - "sort": 0, + "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", @@ -1971,58 +3257,23 @@ "value": "$__all" }, "datasource": "prometheus", - "definition": "label_values(fluentbit_vali_gardener_incoming_logs_total,host)", + "definition": "label_values(fluentbit_gardener_incoming_logs_total,host)", "description": "Output plugin host target", "error": null, "hide": 0, "includeAll": true, "label": "Host", - "multi": true, + "multi": false, "name": "host", "options": [], "query": { - "query": "label_values(fluentbit_vali_gardener_incoming_logs_total,host)", - "refId": "StandardVariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".+", - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "prometheus", - "definition": 
"label_values(valitail_sent_entries_total, host)", - "description": null, - "error": null, - "hide": 2, - "includeAll": true, - "label": "Url", - "multi": true, - "name": "url", - "options": [], - "query": { - "query": "label_values(valitail_sent_entries_total, host)", + "query": "label_values(fluentbit_gardener_incoming_logs_total,host)", "refId": "StandardVariableQuery" }, "refresh": 2, - "regex": "", + "regex": "/garden|shoot--logging--dev.+/", "skipUrlSync": false, - "sort": 0, + "sort": 3, "tagValuesQuery": "", "tags": [], "tagsQuery": "", @@ -2032,7 +3283,7 @@ ] }, "time": { - "from": "now-30m", + "from": "now-15m", "to": "now" }, "timepicker": { @@ -2060,7 +3311,7 @@ "30d" ] }, - "timezone": "", + "timezone": "UTC", "title": "Fluent Bit", "uid": "fluentbit", "version": 1 diff --git a/example/performance-test/charts/fluent-bit-plugin/dashboards/plutono-otel-collector-dashboard.json b/example/performance-test/charts/fluent-bit-plugin/dashboards/plutono-otel-collector-dashboard.json new file mode 100644 index 000000000..268c9f134 --- /dev/null +++ b/example/performance-test/charts/fluent-bit-plugin/dashboards/plutono-otel-collector-dashboard.json @@ -0,0 +1,1936 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "iteration": 1765781327778, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [], + "title": "Overview & Health", + "type": "row" + }, + { + "datasource": "prometheus", + "description": "Collector uptime in hours", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "green", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + 
"value": null + }, + { + "color": "yellow", + "value": 3600 + }, + { + "color": "green", + "value": 86400 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "otelcol_process_uptime_seconds_total{pod=~\"$pod\"}", + "interval": "", + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "Collector Uptime", + "type": "stat" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "green", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 1 + }, + "id": 46, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum(otelcol_receiver_accepted_log_records_total{pod=~\"$pod\"}) by (receiver)", + "interval": "", + "legendFormat": "", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Receiver accepted logs", + "type": "stat" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + 
"unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 47, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum(otelcol_exporter_sent_log_records_total{pod=~\"$pod\"}) by (exporter)", + "interval": "", + "legendFormat": "{{exporter}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Exporter sent logs", + "type": "stat" + }, + { + "datasource": "prometheus", + "description": "Success rate percentage", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 95 + }, + { + "color": "green", + "value": 99 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 6 + }, + "id": 5, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum(otelcol_exporter_sent_log_records_total{pod=~\"$pod\"}) / (sum(otelcol_exporter_sent_log_records_total{pod=~\"$pod\"}) + sum(otelcol_exporter_send_failed_log_records_total{pod=~\"$pod\"})) * 100", + "interval": "", + "legendFormat": "Success rate", + "refId": "A" + } + ], + "title": "Export Success Rate", + "type": "gauge" + }, + { + "datasource": "prometheus", + "description": "Total log records successfully sent to destination", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 6 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum(otelcol_exporter_sent_log_records_total{pod=~\"$pod\"})", + "interval": "", + "legendFormat": "Total sent", + "refId": "A" + } + ], + "title": "Total Logs Sent", + "type": "stat" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 6 + }, + "id": 49, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum(otelcol_receiver_failed_log_records_total{pod=~\"$pod\"}) by (receiver)", + "interval": "", + "legendFormat": "", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Receiver failed logs", + "type": "stat" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + 
"color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 6 + }, + "id": 50, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum(otelcol_exporter_send_failed_log_records_total{pod=~\"$pod\"}) by (exporter)", + "interval": "", + "legendFormat": "{{exporter}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Exporter send failed logs", + "type": "stat" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 18, + "y": 6 + }, + "id": 48, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum(otelcol_receiver_refused_log_records_total{pod=~\"$pod\"}) by (receiver)", + "interval": "", + "legendFormat": "", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Receiver refused logs", + "type": "stat" + }, + { + "datasource": "prometheus", + "description": "Total log records failed to send", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 
1 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 21, + "y": 6 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "sum(otelcol_exporter_send_failed_log_records_total{pod=~\"$pod\"})", + "interval": "", + "legendFormat": "Total failed", + "refId": "A" + } + ], + "title": "Total Logs Failed", + "type": "stat" + }, + { + "collapsed": true, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 10, + "panels": [ + { + "datasource": "prometheus", + "description": "Rate of log records accepted, refused, and failed by receiver", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*failed.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*refused.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", 
+ "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*accepted.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "rate(otelcol_receiver_accepted_log_records_total{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - {{receiver}} - accepted", + "refId": "A" + }, + { + "exemplar": true, + "expr": "rate(otelcol_receiver_refused_log_records_total{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - {{receiver}} - refused", + "refId": "B" + }, + { + "exemplar": true, + "expr": "rate(otelcol_receiver_failed_log_records_total{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - {{receiver}} - failed", + "refId": "C" + } + ], + "title": "[Receiver] Log Records Rate", + "type": "timeseries" + } + ], + "title": "Receiver Metrics", + "type": "row" + }, + { + "collapsed": true, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 20, + "panels": [ + { + "datasource": "prometheus", + "description": "Rate of items incoming and outgoing from processors", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": 
"linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "rate(otelcol_processor_incoming_items_total{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - {{processor}} - incoming", + "refId": "A" + }, + { + "exemplar": true, + "expr": "rate(otelcol_processor_outgoing_items_total{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - {{processor}} - outgoing", + "refId": "B" + } + ], + "title": "[Processor] Items Rate", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "description": "Batch processor metrics - batch size distribution", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + 
"displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.95, rate(otelcol_processor_batch_batch_send_size_bucket{pod=~\"$pod\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{pod}} - {{processor}} - p95 batch size", + "refId": "A" + }, + { + "exemplar": true, + "expr": "rate(otelcol_processor_batch_batch_send_size_sum{pod=~\"$pod\"}[$__rate_interval]) / rate(otelcol_processor_batch_batch_send_size_count{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - {{processor}} - avg batch size", + "refId": "B" + } + ], + "title": "[Processor] Batch Send Size", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "description": "Number of times batch was sent due to timeout", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 23, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": 
"rate(otelcol_processor_batch_timeout_trigger_send_total{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - {{processor}} - timeout triggers", + "refId": "A" + } + ], + "title": "[Processor] Batch Timeout Triggers", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "description": "Batch processor metadata cardinality", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 100 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "otelcol_processor_batch_metadata_cardinality{pod=~\"$pod\"}", + "interval": "", + "legendFormat": "{{pod}} - {{processor}} - cardinality", + "refId": "A" + } + ], + "title": "[Processor] Batch Metadata Cardinality", + "type": "timeseries" + } + ], + "title": "Processor Metrics", + "type": "row" + }, + { + "collapsed": true, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 13 + }, + "id": 30, + "panels": [ + { + "datasource": "prometheus", + "description": "Rate of log records sent 
and failed by exporter", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*failed.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*sent.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 31, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "rate(otelcol_exporter_sent_log_records_total{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - {{exporter}} - sent", + "refId": "A" + }, + { + "exemplar": true, + "expr": "rate(otelcol_exporter_send_failed_log_records_total{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - {{exporter}} - failed", + "refId": "B" + } + ], + "title": "[Exporter] Log Records Rate", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "description": "Exporter queue size and capacity", + "fieldConfig": { + 
"defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 1, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*capacity.*" + }, + "properties": [ + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 22 + }, + "id": 32, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "otelcol_exporter_queue_size{pod=~\"$pod\"}", + "interval": "", + "legendFormat": "{{pod}} - {{exporter}} - queue size", + "refId": "A" + }, + { + "exemplar": true, + "expr": "otelcol_exporter_queue_capacity{pod=~\"$pod\"}", + "interval": "", + "legendFormat": "{{pod}} - {{exporter}} - queue capacity", + "refId": "B" + } + ], + "title": "[Exporter] Queue Size vs Capacity", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "description": "Exporter batch send size distribution", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + 
"gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 22 + }, + "id": 33, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.95, rate(otelcol_exporter_queue_batch_send_size_bucket{pod=~\"$pod\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{pod}} - {{exporter}} - p95 batch size", + "refId": "A" + }, + { + "exemplar": true, + "expr": "rate(otelcol_exporter_queue_batch_send_size_sum{pod=~\"$pod\"}[$__rate_interval]) / rate(otelcol_exporter_queue_batch_send_size_count{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - {{exporter}} - avg batch size", + "refId": "B" + } + ], + "title": "[Exporter] Batch Send Size", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "description": "Queue utilization percentage", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 34, + "options": { + "orientation": "auto", + "reduceOptions": { + 
"calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "(otelcol_exporter_queue_size{pod=~\"$pod\"} / otelcol_exporter_queue_capacity{pod=~\"$pod\"}) * 100", + "interval": "", + "legendFormat": "{{pod}} - {{exporter}}", + "refId": "A" + } + ], + "title": "[Exporter] Queue Utilization %", + "type": "gauge" + } + ], + "title": "Exporter Metrics", + "type": "row" + }, + { + "collapsed": true, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 40, + "panels": [ + { + "datasource": "prometheus", + "description": "CPU usage by OTel collector process", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 63 + }, + "id": 41, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "rate(otelcol_process_cpu_seconds_total{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - CPU usage", + "refId": "A" + } + ], + "title": "[Process] CPU 
Usage", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "description": "Physical memory (RSS) used by collector", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 63 + }, + "id": 42, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "otelcol_process_memory_rss_bytes{pod=~\"$pod\"}", + "interval": "", + "legendFormat": "{{pod}} - RSS memory", + "refId": "A" + } + ], + "title": "[Process] Memory RSS", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "description": "Go runtime heap memory metrics", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + 
"steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 71 + }, + "id": 43, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + "expr": "otelcol_process_runtime_heap_alloc_bytes{pod=~\"$pod\"}", + "interval": "", + "legendFormat": "{{pod}} - heap alloc", + "refId": "A" + }, + { + "exemplar": true, + "expr": "otelcol_process_runtime_total_sys_memory_bytes{pod=~\"$pod\"}", + "interval": "", + "legendFormat": "{{pod}} - total sys memory", + "refId": "B" + } + ], + "title": "[Process] Go Runtime Heap Memory", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "description": "Cumulative heap allocations rate", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 71 + }, + "id": 44, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "exemplar": true, + 
"expr": "rate(otelcol_process_runtime_total_alloc_bytes_total{pod=~\"$pod\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}} - allocation rate", + "refId": "A" + } + ], + "title": "[Process] Heap Allocation Rate", + "type": "timeseries" + } + ], + "title": "Process & Resource Metrics", + "type": "row" + } + ], + "refresh": "5s", + "schemaVersion": 27, + "style": "dark", + "tags": [ + "otel", + "opentelemetry", + "collector" + ], + "templating": { + "list": [ + { + "allValue": ".*otel-collector.+", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "prometheus", + "definition": "label_values(otelcol_process_uptime_seconds_total, pod)", + "description": "OTel Collector pod selector", + "error": null, + "hide": 0, + "includeAll": true, + "label": "Pod", + "multi": false, + "name": "pod", + "options": [], + "query": { + "query": "label_values(otelcol_process_uptime_seconds_total, pod)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "/logging-otel-collector-.+/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "UTC", + "title": "OpenTelemetry Collector", + "uid": "otel-collector", + "version": 1 +} \ No newline at end of file diff --git a/example/performance-test/charts/fluent-bit-plugin/dashboards/plutono-victorialogs-dashboard.json b/example/performance-test/charts/fluent-bit-plugin/dashboards/plutono-victorialogs-dashboard.json new file mode 100644 index 000000000..2e30e7d3a --- /dev/null +++ b/example/performance-test/charts/fluent-bit-plugin/dashboards/plutono-victorialogs-dashboard.json @@ 
-0,0 +1,906 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "iteration": 1766091411295, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [], + "title": "Overview", + "type": "row" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "expr": "sum(vl_rows_ingested_total{pod=~\"$pod\"})", + "refId": "A" + } + ], + "title": "Total Rows Ingested", + "type": "stat" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "expr": 
"sum(vl_bytes_ingested_total{pod=~\"$pod\"})", + "refId": "A" + } + ], + "title": "Total Bytes Ingested", + "type": "stat" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "expr": "sum(vl_storage_rows{pod=~\"$pod\"})", + "refId": "A" + } + ], + "title": "Storage Rows", + "type": "stat" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 18, + "y": 1 + }, + "id": 33, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "expr": "(vl_total_disk_space_bytes{pod=~\"$pod\"} - vl_free_disk_space_bytes{pod=~\"$pod\"}) / vl_total_disk_space_bytes{pod=~\"$pod\"} * 100", + "legendFormat": "{{pod}} - {{path}}", + "refId": "A" + } + ], + "title": "Disk Usage %", + "type": "gauge" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 21, + "y": 1 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.42", + "targets": [ + { + "expr": "sum(vl_streams_created_total{pod=~\"$pod\"})", + "refId": "A" + } + ], + "title": "Total Streams", + "type": "stat" + }, + { + "collapsed": true, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 10, + "panels": [ + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "rps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "rate(vl_rows_ingested_total{pod=~\"$pod\"}[$__rate_interval])", + "legendFormat": "{{pod}} - {{type}}", + "refId": "A" + } + ], + "title": "Ingestion Rate 
(rows/sec)", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "rate(vl_bytes_ingested_total{pod=~\"$pod\"}[$__rate_interval])", + "legendFormat": "{{pod}} - {{type}}", + "refId": "A" + } + ], + "title": "Ingestion Rate (bytes/sec)", + "type": "timeseries" + } + ], + "title": "Ingestion", + "type": "row" + }, + { + "collapsed": true, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 20, + "panels": [ + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "rate(vl_http_requests_total{pod=~\"$pod\"}[$__rate_interval])", + "legendFormat": "{{pod}} - {{path}}", + "refId": "A" + } + ], + "title": "HTTP Request Rate", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "rate(vl_http_request_duration_seconds_sum{pod=~\"$pod\"}[$__rate_interval]) / rate(vl_http_request_duration_seconds_count{pod=~\"$pod\"}[$__rate_interval])", + "legendFormat": "{{pod}} - {{path}}", + "refId": "A" + } + ], + "title": "HTTP Request Duration", + 
"type": "timeseries" + } + ], + "title": "HTTP API", + "type": "row" + }, + { + "collapsed": true, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 30, + "panels": [ + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 31, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "vl_data_size_bytes{pod=~\"$pod\"}", + "legendFormat": "{{pod}} - {{type}}", + "refId": "A" + } + ], + "title": "Data Size", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 
null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 32, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "vl_uncompressed_data_size_bytes{pod=~\"$pod\"} / (vl_compressed_data_size_bytes{pod=~\"$pod\"} > 0)", + "legendFormat": "{{pod}} - {{type}}", + "refId": "A" + } + ], + "title": "Compression Ratio", + "type": "timeseries" + } + ], + "title": "Storage", + "type": "row" + } + ], + "refresh": "5s", + "schemaVersion": 27, + "style": "dark", + "tags": [ + "victorialogs", + "logs" + ], + "templating": { + "list": [ + { + "allValue": ".*victorialogs.+", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "prometheus", + "definition": "label_values(vl_rows_ingested_total, pod)", + "description": "VictoriaLogs pod selector", + "error": null, + "hide": 0, + "includeAll": true, + "label": "Pod", + "multi": false, + "name": "pod", + "options": [], + "query": { + "query": "label_values(vl_rows_ingested_total, pod)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "/logging-victorialogs-.+/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "UTC", + "title": "VictoriaLogs", + "uid": "victorialogs", + "version": 1 +} \ No newline at end of file diff --git
a/example/performance-test/charts/fluent-bit-plugin/templates/_helpers.tpl b/example/performance-test/charts/fluent-bit-plugin/templates/_helpers.tpl index 3e3d250ab..7f1eed271 100644 --- a/example/performance-test/charts/fluent-bit-plugin/templates/_helpers.tpl +++ b/example/performance-test/charts/fluent-bit-plugin/templates/_helpers.tpl @@ -20,10 +20,17 @@ Expand the name for prometheus resources. {{- end }} {{/* -Expand the name for vali resources. +Expand the name for victorialogs resources: the "fluent-bit-plugin.name" value suffixed with "-victorialogs", cut to 63 characters with any trailing "-" removed. */}} -{{- define "fluent-bit-plugin.valiName" -}} -{{- printf "%s-vali" (include "fluent-bit-plugin.name" .) | trunc 63 | trimSuffix "-" }} +{{- define "fluent-bit-plugin.victorialogsName" -}} +{{- printf "%s-victorialogs" (include "fluent-bit-plugin.name" .) | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Expand the name for otel-collector resources: the "fluent-bit-plugin.name" value suffixed with "-otel-collector", cut to 63 characters with any trailing "-" removed. +*/}} +{{- define "fluent-bit-plugin.otelCollectorName" -}} +{{- printf "%s-otel-collector" (include "fluent-bit-plugin.name" .) | trunc 63 | trimSuffix "-" }} {{- end }} @@ -114,11 +121,31 @@ app.kubernetes.io/instance: {{ .Release.Name }} {{- end }} {{/* -Vali Common labels +VictoriaLogs Common labels: chart label, the VictoriaLogs selector labels, the app version (only when .Chart.AppVersion is set) and managed-by. +*/}} +{{- define "fluent-bit-plugin.victorialogsLabels" -}} +helm.sh/chart: {{ include "fluent-bit-plugin.chart" . }} +{{ include "fluent-bit-plugin.victorialogsSelectorLabels" . 
}} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +VictoriaLogs Selector labels: app.kubernetes.io/name (the victorialogs resource name) and app.kubernetes.io/instance (the release name). +*/}} +{{- define "fluent-bit-plugin.victorialogsSelectorLabels" -}} +app.kubernetes.io/name: {{ include "fluent-bit-plugin.victorialogsName" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +OtelCollector Common labels: chart label, the OtelCollector selector labels and the app version (only when .Chart.AppVersion is set). */}} -{{- define "fluent-bit-plugin.valiLabels" -}} +{{- define "fluent-bit-plugin.otelCollectorLabels" -}} helm.sh/chart: {{ include "fluent-bit-plugin.chart" .
}} -{{ include "fluent-bit-plugin.valiSelectorLabels" . }} +{{ include "fluent-bit-plugin.otelCollectorSelectorLabels" . }} {{- if .Chart.AppVersion }} app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} {{- end }} @@ -126,10 +153,10 @@ app.kubernetes.io/managed-by: {{ .Release.Service }} {{- end }} {{/* -Vali Selector labels +OtelCollector Selector labels: app.kubernetes.io/name (the otel-collector resource name) and app.kubernetes.io/instance (the release name). */}} -{{- define "fluent-bit-plugin.valiSelectorLabels" -}} -app.kubernetes.io/name: {{ include "fluent-bit-plugin.valiName" . }} +{{- define "fluent-bit-plugin.otelCollectorSelectorLabels" -}} +app.kubernetes.io/name: {{ include "fluent-bit-plugin.otelCollectorName" . }} app.kubernetes.io/instance: {{ .Release.Name }} {{- end }} diff --git a/example/performance-test/charts/fluent-bit-plugin/templates/fluent-bit-config.yaml b/example/performance-test/charts/fluent-bit-plugin/templates/fluent-bit-config.yaml index 9a0b46f68..d6a6c8f50 100644 --- a/example/performance-test/charts/fluent-bit-plugin/templates/fluent-bit-config.yaml +++ b/example/performance-test/charts/fluent-bit-plugin/templates/fluent-bit-config.yaml @@ -7,6 +7,43 @@ metadata: labels: {{- include "fluent-bit-plugin.labels" .
| nindent 4 }} data:
+  systemd.lua: |
+    -- Fluent Bit Lua filter callback: builds a new record from journald fields.
+    function set_systemd_record(tag, timestamp, record)
+      local utc = os.date("!*t", timestamp["sec"])
+      -- %09d zero-pads the nanoseconds; with %s, 5000000 ns was rendered as ".5000000".
+      local t = string.format("%4d-%02d-%02dT%02d:%02d:%02d.%09dZ",
+        utc["year"], utc["month"], utc["day"],
+        utc["hour"], utc["min"], utc["sec"], timestamp["nsec"])
+
+      local new_record = {}
+      new_record["time"] = t
+      new_record["log"] = record["MESSAGE"]
+      new_record["process.command"] = record["_EXE"]
+      new_record["process.command_line"] = record["_CMDLINE"]
+      new_record["process.pid"] = record["_PID"]
+      new_record["host.id"] = record["_MACHINE_ID"]
+      new_record["service.name"] = record["_SYSTEMD_UNIT"]
+      new_record["service.namespace"] = record["_SYSTEMD_SLICE"]
+      return 1, timestamp, new_record -- 1: emit the returned timestamp and record
+    end
+
+  set_time_to_record.lua: |
+    -- Fluent Bit Lua filter callback: adds a UTC "time" field to records that carry "logtag".
+    function set_time_to_record(tag, timestamp, record)
+      -- Records without "logtag" are passed through unmodified (return code 0).
+      if record["logtag"] == nil then
+        return 0, timestamp, record
+      end
+      local utc = os.date("!*t", timestamp["sec"])
+      -- %09d keeps the fractional part nine digits wide so it reads as nanoseconds.
+      local t = string.format("%4d-%02d-%02dT%02d:%02d:%02d.%09dZ",
+        utc["year"], utc["month"], utc["day"],
+        utc["hour"], utc["min"], utc["sec"], timestamp["nsec"])
+      record["time"] = t
+      return 1, timestamp, record
+    end
+
 add_tag_to_record.lua: | function add_tag_to_record(tag, timestamp, record) record["tag"] = tag @@ -65,7 +102,7 @@ data: [PARSER] Name containerd-parser Format regex - Regex ^(?