From 1c74ea9d137b82ca0c3ca361cf0ad938d2db5568 Mon Sep 17 00:00:00 2001 From: Shirly Radco Date: Thu, 12 Mar 2026 20:34:33 +0200 Subject: [PATCH] add alerts_effective_active_at metric Expose a Prometheus gauge metric whose value is the activeAt Unix timestamp for every effective alert (firing, pending, silenced). Labels include all alerts labels after relabeling plus enrichment labels and alertstate. Annotations are excluded since they are available from the alert rule definition. Signed-off-by: Shirly Radco Co-authored-by: AI Assistant --- internal/managementrouter/health_get_test.go | 4 + pkg/k8s/enrich_active_at_test.go | 106 ++++++++++++++++++ pkg/k8s/prometheus_alerts.go | 72 +++++++++++- pkg/management/management.go | 9 ++ .../metrics/alerts_collector.go | 2 +- .../metrics/alerts_collector_test.go | 2 +- .../metrics/leader_election.go | 0 pkg/management/types.go | 6 + pkg/server.go | 16 ++- test/e2e/alerts_effective_metric_test.go | 2 +- 10 files changed, 210 insertions(+), 9 deletions(-) create mode 100644 pkg/k8s/enrich_active_at_test.go rename pkg/{ => management}/metrics/alerts_collector.go (100%) rename pkg/{ => management}/metrics/alerts_collector_test.go (99%) rename pkg/{ => management}/metrics/leader_election.go (100%) diff --git a/internal/managementrouter/health_get_test.go b/internal/managementrouter/health_get_test.go index 1dc7976a2..4d88234b7 100644 --- a/internal/managementrouter/health_get_test.go +++ b/internal/managementrouter/health_get_test.go @@ -9,6 +9,7 @@ import ( "testing" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "k8s.io/client-go/rest" "github.com/openshift/monitoring-plugin/internal/managementrouter" "github.com/openshift/monitoring-plugin/pkg/k8s" @@ -64,6 +65,9 @@ func (s *stubClient) UpdateAlertRuleClassification(_ context.Context, _ manageme func (s *stubClient) BulkUpdateAlertRuleClassification(_ context.Context, _ []management.UpdateRuleClassificationRequest) []error { return nil } +func (s *stubClient) MetricsHandler(_ context.Context, _ *rest.Config) (http.Handler, error) { + return nil, nil +} // newStubRouter builds a router backed by stub and adds a Bearer token header // to requests via the helper get/getNoAuth methods. diff --git a/pkg/k8s/enrich_active_at_test.go b/pkg/k8s/enrich_active_at_test.go new file mode 100644 index 000000000..95e205591 --- /dev/null +++ b/pkg/k8s/enrich_active_at_test.go @@ -0,0 +1,106 @@ +package k8s + +import ( + "testing" + "time" +) + +func TestEnrichActiveAt_ReplacesAlertmanagerTimestamp(t *testing.T) { + amTime := time.Date(2026, 3, 10, 12, 0, 0, 0, time.UTC) + promTime := time.Date(2026, 3, 9, 8, 0, 0, 0, time.UTC) + + amAlerts := []PrometheusAlert{{ + Labels: map[string]string{"alertname": "HighCPU", "severity": "critical", AlertSourceLabel: "platform", AlertBackendLabel: "am"}, + ActiveAt: amTime, + }} + promAlerts := []PrometheusAlert{{ + Labels: map[string]string{"alertname": "HighCPU", "severity": "critical", AlertSourceLabel: "platform", AlertBackendLabel: "prom"}, + ActiveAt: promTime, + }} + + enrichActiveAt(amAlerts, promAlerts) + + if !amAlerts[0].ActiveAt.Equal(promTime) { + t.Errorf("expected ActiveAt=%v, got %v", promTime, amAlerts[0].ActiveAt) + } +} + +func TestEnrichActiveAt_NoMatchKeepsOriginal(t *testing.T) { + amTime := time.Date(2026, 3, 10, 12, 0, 0, 0, time.UTC) + + amAlerts := []PrometheusAlert{{ + Labels: map[string]string{"alertname": "HighCPU", "severity": "critical"}, + ActiveAt: amTime, + }} + promAlerts := []PrometheusAlert{{ + Labels: map[string]string{"alertname": "DiskFull", "severity": "warning"}, + ActiveAt: time.Date(2026, 3, 9, 8, 0, 0, 0, time.UTC), + }} + + enrichActiveAt(amAlerts, promAlerts) + + if !amAlerts[0].ActiveAt.Equal(amTime) { + t.Errorf("expected ActiveAt to stay %v, got %v", amTime, amAlerts[0].ActiveAt) + } +} + +func TestEnrichActiveAt_EmptyPromAlerts(t *testing.T) { + amTime := time.Date(2026, 3, 10, 12, 0, 0, 0, time.UTC) + + amAlerts := []PrometheusAlert{{ + Labels: map[string]string{"alertname": "HighCPU"}, + ActiveAt: amTime, + }} + + enrichActiveAt(amAlerts, nil) + + if !amAlerts[0].ActiveAt.Equal(amTime) { + t.Errorf("expected ActiveAt to stay %v, got %v", amTime, amAlerts[0].ActiveAt) + } +} + +func TestEnrichActiveAt_SkipsZeroPromActiveAt(t *testing.T) { + amTime := time.Date(2026, 3, 10, 12, 0, 0, 0, time.UTC) + + amAlerts := []PrometheusAlert{{ + Labels: map[string]string{"alertname": "HighCPU"}, + ActiveAt: amTime, + }} + promAlerts := []PrometheusAlert{{ + Labels: map[string]string{"alertname": "HighCPU"}, + }} + + enrichActiveAt(amAlerts, promAlerts) + + if !amAlerts[0].ActiveAt.Equal(amTime) { + t.Errorf("expected ActiveAt to stay %v when prom has zero time, got %v", amTime, amAlerts[0].ActiveAt) + } +} + +func TestAlertFingerprint_IgnoresMetadataLabels(t *testing.T) { + fp1 := alertFingerprint(map[string]string{ + "alertname": "HighCPU", + "severity": "critical", + AlertSourceLabel: "platform", + AlertBackendLabel: "am", + }) + fp2 := alertFingerprint(map[string]string{ + "alertname": "HighCPU", + "severity": "critical", + AlertSourceLabel: "platform", + AlertBackendLabel: "prom", + }) + + if fp1 != fp2 { + t.Errorf("fingerprints should match when only metadata labels differ:\n fp1=%q\n fp2=%q", fp1, fp2) + } +} + +func TestAlertFingerprint_DifferentLabelsProduceDifferentKeys(t *testing.T) { + fp1 := alertFingerprint(map[string]string{"alertname": "HighCPU", "severity": "critical"}) + fp2 := alertFingerprint(map[string]string{"alertname": "HighCPU", "severity": "warning"}) + + if fp1 == fp2 { + t.Error("fingerprints should differ when label values differ") + } +} diff --git a/pkg/k8s/prometheus_alerts.go b/pkg/k8s/prometheus_alerts.go index 155c94bbe..0b560fbed 100644 --- a/pkg/k8s/prometheus_alerts.go +++ b/pkg/k8s/prometheus_alerts.go @@ -10,6 +10,7 @@ import ( "net/http" "net/url" "os" + "sort" "strings" "sync" "time" @@ -278,11 +279,21 @@ func (pa *prometheusAlerts) routeHealth(ctx context.Context, namespace string, r return health } +// getAlertsForSource fetches alerts from both Alertmanager and Prometheus in +// parallel and merges the results. The fallback strategy is: +// - Both succeed: AM (firing+silenced) + Prom pending, with AM timestamps +// enriched from Prometheus activeAt. +// - AM only: AM alerts returned as-is (no Prom data to enrich from). +// - Prom only: all Prom alerts returned (AM was unreachable). +// - Both fail: error propagated from Prometheus. func (pa *prometheusAlerts) getAlertsForSource(ctx context.Context, namespace string, promRouteName string, amRouteName string, source string) ([]PrometheusAlert, error) { amAlerts, amErr := pa.getAlertmanagerAlerts(ctx, namespace, amRouteName, source) promAlerts, promErr := pa.getAlertsViaProxy(ctx, namespace, promRouteName, source) if amErr == nil { + if promErr == nil { + enrichActiveAt(amAlerts, promAlerts) + } pending := filterAlertsByState(promAlerts, "pending") return append(amAlerts, pending...), nil } @@ -337,15 +348,17 @@ func (pa *prometheusAlerts) getUserWorkloadAlertsViaAlertmanager(ctx context.Con } } - pending, err := pa.getAlertsViaProxy(ctx, UserWorkloadMonitoringNamespace, UserWorkloadRouteName, AlertSourceUser) + promAlerts, err := pa.getAlertsViaProxy(ctx, UserWorkloadMonitoringNamespace, UserWorkloadRouteName, AlertSourceUser) if err != nil { - pending, err = pa.getPrometheusAlertsViaService(ctx, UserWorkloadMonitoringNamespace, UserWorkloadPrometheusServiceName, UserWorkloadPrometheusPort, AlertSourceUser) + promAlerts, err = pa.getPrometheusAlertsViaService(ctx, UserWorkloadMonitoringNamespace, UserWorkloadPrometheusServiceName, UserWorkloadPrometheusPort, AlertSourceUser) if err != nil { return alerts, nil } } - return append(alerts, filterAlertsByState(pending, "pending")...), nil + // Enrich before filtering: AM alerts need activeAt from all Prom states. + enrichActiveAt(alerts, promAlerts) + return append(alerts, filterAlertsByState(promAlerts, "pending")...), nil } func (pa *prometheusAlerts) getPrometheusAlertsViaService(ctx context.Context, namespace string, serviceName string, port int32, source string) ([]PrometheusAlert, error) { @@ -776,6 +789,59 @@ func filterAlertsByState(alerts []PrometheusAlert, state string) []PrometheusAle return out } +// enrichActiveAt replaces ActiveAt in Alertmanager-sourced alerts with the +// authoritative value from Prometheus. Alertmanager only exposes startsAt +// (when it received the alert), while Prometheus tracks the true activeAt +// (when the alert condition first became true). +func enrichActiveAt(amAlerts, promAlerts []PrometheusAlert) { + if len(promAlerts) == 0 { + return + } + + lookup := make(map[string]time.Time, len(promAlerts)) + for _, alert := range promAlerts { + fp := alertFingerprint(alert.Labels) + if !alert.ActiveAt.IsZero() { + lookup[fp] = alert.ActiveAt + } + } + + for i := range amAlerts { + fp := alertFingerprint(amAlerts[i].Labels) + if activeAt, ok := lookup[fp]; ok { + amAlerts[i].ActiveAt = activeAt + } + } +} + +// alertFingerprint builds a stable identity key from an alert's labels, +// excluding metadata labels injected by this plugin (source, backend). +// This matches the same alert *instance* across Alertmanager and Prometheus +// (which may differ only in injected metadata). It is distinct from the +// alert rule ID (GetAlertingRuleId) which identifies the *rule definition* +// and is computed from the rule spec (name, expr, duration, static labels). +func alertFingerprint(labels map[string]string) string { + keys := make([]string, 0, len(labels)) + for k := range labels { + if k == AlertSourceLabel || k == AlertBackendLabel { + continue + } + keys = append(keys, k) + } + sort.Strings(keys) + + var b strings.Builder + for i, k := range keys { + if i > 0 { + b.WriteByte('\xff') + } + b.WriteString(k) + b.WriteByte('\xfe') + b.WriteString(labels[k]) + } + return b.String() +} + func mapAlertmanagerState(state string) string { if state == "active" { return "firing" diff --git a/pkg/management/management.go b/pkg/management/management.go index b7eec3c09..84c9a8d84 100644 --- a/pkg/management/management.go +++ b/pkg/management/management.go @@ -1,9 +1,14 @@ package management import ( + "context" + "net/http" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/rest" "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management/metrics" ) type client struct { @@ -21,3 +26,7 @@ type client struct { func (c *client) isPlatformManagedPrometheusRule(nn types.NamespacedName) bool { return c.k8sClient.Namespace().IsClusterMonitoringNamespace(nn.Namespace) } + +func (c *client) MetricsHandler(ctx context.Context, kubeConfig *rest.Config) (http.Handler, error) { + return metrics.NewHandler(ctx, c, kubeConfig) +} diff --git a/pkg/metrics/alerts_collector.go b/pkg/management/metrics/alerts_collector.go similarity index 100% rename from pkg/metrics/alerts_collector.go rename to pkg/management/metrics/alerts_collector.go index 63aa18e83..fd9dd9bed 100644 --- a/pkg/metrics/alerts_collector.go +++ b/pkg/management/metrics/alerts_collector.go @@ -11,9 +11,9 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/sirupsen/logrus" + "k8s.io/client-go/rest" "github.com/openshift/monitoring-plugin/pkg/k8s" - "k8s.io/client-go/rest" ) var metricsLog = logrus.WithField("module", "metrics") diff --git a/pkg/metrics/alerts_collector_test.go b/pkg/management/metrics/alerts_collector_test.go similarity index 99% rename from pkg/metrics/alerts_collector_test.go rename to pkg/management/metrics/alerts_collector_test.go index 6cc30efbe..5ce053834 100644 --- a/pkg/metrics/alerts_collector_test.go +++ b/pkg/management/metrics/alerts_collector_test.go @@ -10,7 +10,7 @@ import ( dto "github.com/prometheus/client_model/go" "github.com/openshift/monitoring-plugin/pkg/k8s" - "github.com/openshift/monitoring-plugin/pkg/metrics" + "github.com/openshift/monitoring-plugin/pkg/management/metrics" ) type mockAlertsFetcher struct { diff --git a/pkg/metrics/leader_election.go b/pkg/management/metrics/leader_election.go similarity index 100% rename from pkg/metrics/leader_election.go rename to pkg/management/metrics/leader_election.go diff --git a/pkg/management/types.go b/pkg/management/types.go index 0f7d71f4b..e2c54c872 100644 --- a/pkg/management/types.go +++ b/pkg/management/types.go @@ -2,8 +2,10 @@ package management import ( "context" + "net/http" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "k8s.io/client-go/rest" "github.com/openshift/monitoring-plugin/pkg/k8s" ) @@ -51,6 +53,10 @@ type Client interface { // GetAlertingHealth retrieves alerting health details GetAlertingHealth(ctx context.Context) (k8s.AlertingHealth, error) + + // MetricsHandler returns an HTTP handler that exposes alert management metrics. + // It handles leader election internally using the provided kubeConfig. + MetricsHandler(ctx context.Context, kubeConfig *rest.Config) (http.Handler, error) } // PrometheusRuleOptions specifies options for selecting PrometheusRule resources and groups diff --git a/pkg/server.go b/pkg/server.go index 323c83656..5b2fe23dd 100644 --- a/pkg/server.go +++ b/pkg/server.go @@ -171,7 +171,10 @@ func createHTTPServer(ctx context.Context, cfg *Config) (*http.Server, error) { log.Info("alert management API enabled") } - router, pluginConfig := setupRoutes(cfg, managementClient) + router, pluginConfig, err := setupRoutes(ctx, cfg, managementClient, k8sconfig) + if err != nil { + return nil, fmt.Errorf("failed to set up routes: %w", err) + } router.Use(corsHeaderMiddleware()) tlsConfig := &tls.Config{} @@ -262,7 +265,7 @@ func createHTTPServer(ctx context.Context, cfg *Config) (*http.Server, error) { return httpServer, nil } -func setupRoutes(cfg *Config, managementClient management.Client) (*mux.Router, *PluginConfig) { +func setupRoutes(ctx context.Context, cfg *Config, managementClient management.Client, k8sconfig *rest.Config) (*mux.Router, *PluginConfig, error) { configHandlerFunc, pluginConfig := configHandler(cfg) router := mux.NewRouter() @@ -277,11 +280,18 @@ func setupRoutes(cfg *Config, managementClient management.Client) (*mux.Router, if managementClient != nil { managementRouter := managementrouter.New(managementClient) router.PathPrefix("/api/v1/alerting").Handler(managementRouter) + + metricsHandler, err := managementClient.MetricsHandler(ctx, k8sconfig) + if err != nil { + return nil, nil, fmt.Errorf("failed to start alert management metrics: %w", err) + } + router.Path("/metrics").Handler(metricsHandler) + log.Info("alert management metrics started") } router.PathPrefix("/").Handler(filesHandler(http.Dir(cfg.StaticPath))) - return router, pluginConfig + return router, pluginConfig, nil } func setupProxyRoutes(cfg *Config, k8sclient *dynamic.DynamicClient, kind proxy.KindType) *mux.Router { diff --git a/test/e2e/alerts_effective_metric_test.go b/test/e2e/alerts_effective_metric_test.go index cd59897f7..1c73a5065 100644 --- a/test/e2e/alerts_effective_metric_test.go +++ b/test/e2e/alerts_effective_metric_test.go @@ -12,7 +12,7 @@ import ( "k8s.io/apimachinery/pkg/util/wait" "github.com/openshift/monitoring-plugin/pkg/k8s" - "github.com/openshift/monitoring-plugin/pkg/metrics" + "github.com/openshift/monitoring-plugin/pkg/management/metrics" "github.com/openshift/monitoring-plugin/test/e2e/framework" )