diff --git a/internal/managementrouter/health_get_test.go b/internal/managementrouter/health_get_test.go
index 1dc7976a2..4d88234b7 100644
--- a/internal/managementrouter/health_get_test.go
+++ b/internal/managementrouter/health_get_test.go
@@ -9,6 +9,7 @@ import (
 	"testing"
 
 	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+	"k8s.io/client-go/rest"
 
 	"github.com/openshift/monitoring-plugin/internal/managementrouter"
 	"github.com/openshift/monitoring-plugin/pkg/k8s"
@@ -64,6 +65,9 @@ func (s *stubClient) UpdateAlertRuleClassification(_ context.Context, _ manageme
 func (s *stubClient) BulkUpdateAlertRuleClassification(_ context.Context, _ []management.UpdateRuleClassificationRequest) []error {
 	return nil
 }
+func (s *stubClient) MetricsHandler(_ context.Context, _ *rest.Config) (http.Handler, error) {
+	return nil, nil
+}
 
 // newStubRouter builds a router backed by stub and adds a Bearer token header
 // to requests via the helper get/getNoAuth methods.
diff --git a/pkg/k8s/enrich_active_at_test.go b/pkg/k8s/enrich_active_at_test.go
new file mode 100644
index 000000000..95e205591
--- /dev/null
+++ b/pkg/k8s/enrich_active_at_test.go
@@ -0,0 +1,106 @@
+package k8s
+
+import (
+	"testing"
+	"time"
+)
+
+func TestEnrichActiveAt_ReplacesAlertmanagerTimestamp(t *testing.T) {
+	amTime := time.Date(2026, 3, 10, 12, 0, 0, 0, time.UTC)
+	promTime := time.Date(2026, 3, 9, 8, 0, 0, 0, time.UTC)
+
+	amAlerts := []PrometheusAlert{{
+		Labels:   map[string]string{"alertname": "HighCPU", "severity": "critical", AlertSourceLabel: "platform", AlertBackendLabel: "am"},
+		ActiveAt: amTime,
+	}}
+	promAlerts := []PrometheusAlert{{
+		Labels:   map[string]string{"alertname": "HighCPU", "severity": "critical", AlertSourceLabel: "platform", AlertBackendLabel: "prom"},
+		ActiveAt: promTime,
+	}}
+
+	enrichActiveAt(amAlerts, promAlerts)
+
+	if !amAlerts[0].ActiveAt.Equal(promTime) {
+		t.Errorf("expected ActiveAt=%v, got %v", promTime, amAlerts[0].ActiveAt)
+	}
+}
+
+func TestEnrichActiveAt_NoMatchKeepsOriginal(t *testing.T) {
+	amTime := time.Date(2026, 3, 10, 12, 0, 0, 0, time.UTC)
+
+	amAlerts := []PrometheusAlert{{
+		Labels:   map[string]string{"alertname": "HighCPU", "severity": "critical"},
+		ActiveAt: amTime,
+	}}
+	promAlerts := []PrometheusAlert{{
+		Labels:   map[string]string{"alertname": "DiskFull", "severity": "warning"},
+		ActiveAt: time.Date(2026, 3, 9, 8, 0, 0, 0, time.UTC),
+	}}
+
+	enrichActiveAt(amAlerts, promAlerts)
+
+	if !amAlerts[0].ActiveAt.Equal(amTime) {
+		t.Errorf("expected ActiveAt to stay %v, got %v", amTime, amAlerts[0].ActiveAt)
+	}
+}
+
+func TestEnrichActiveAt_EmptyPromAlerts(t *testing.T) {
+	amTime := time.Date(2026, 3, 10, 12, 0, 0, 0, time.UTC)
+
+	amAlerts := []PrometheusAlert{{
+		Labels:   map[string]string{"alertname": "HighCPU"},
+		ActiveAt: amTime,
+	}}
+
+	enrichActiveAt(amAlerts, nil)
+
+	if !amAlerts[0].ActiveAt.Equal(amTime) {
+		t.Errorf("expected ActiveAt to stay %v, got %v", amTime, amAlerts[0].ActiveAt)
+	}
+}
+
+func TestEnrichActiveAt_SkipsZeroPromActiveAt(t *testing.T) {
+	amTime := time.Date(2026, 3, 10, 12, 0, 0, 0, time.UTC)
+
+	amAlerts := []PrometheusAlert{{
+		Labels:   map[string]string{"alertname": "HighCPU"},
+		ActiveAt: amTime,
+	}}
+	promAlerts := []PrometheusAlert{{
+		Labels: map[string]string{"alertname": "HighCPU"},
+	}}
+
+	enrichActiveAt(amAlerts, promAlerts)
+
+	if !amAlerts[0].ActiveAt.Equal(amTime) {
+		t.Errorf("expected ActiveAt to stay %v when prom has zero time, got %v", amTime, amAlerts[0].ActiveAt)
+	}
+}
+
+func TestAlertFingerprint_IgnoresMetadataLabels(t *testing.T) {
+	fp1 := alertFingerprint(map[string]string{
+		"alertname":       "HighCPU",
+		"severity":        "critical",
+		AlertSourceLabel:  "platform",
+		AlertBackendLabel: "am",
+	})
+	fp2 := alertFingerprint(map[string]string{
+		"alertname":       "HighCPU",
+		"severity":        "critical",
+		AlertSourceLabel:  "platform",
+		AlertBackendLabel: "prom",
+	})
+
+	if fp1 != fp2 {
+		t.Errorf("fingerprints should match when only metadata labels differ:\n fp1=%q\n fp2=%q", fp1, fp2)
+	}
+}
+
+func TestAlertFingerprint_DifferentLabelsProduceDifferentKeys(t *testing.T) {
+	fp1 := alertFingerprint(map[string]string{"alertname": "HighCPU", "severity": "critical"})
+	fp2 := alertFingerprint(map[string]string{"alertname": "HighCPU", "severity": "warning"})
+
+	if fp1 == fp2 {
+		t.Error("fingerprints should differ when label values differ")
+	}
+}
diff --git a/pkg/k8s/prometheus_alerts.go b/pkg/k8s/prometheus_alerts.go
index 155c94bbe..0b560fbed 100644
--- a/pkg/k8s/prometheus_alerts.go
+++ b/pkg/k8s/prometheus_alerts.go
@@ -10,6 +10,7 @@ import (
 	"net/http"
 	"net/url"
 	"os"
+	"sort"
 	"strings"
 	"sync"
 	"time"
@@ -278,11 +279,21 @@ func (pa *prometheusAlerts) routeHealth(ctx context.Context, namespace string, r
 	return health
 }
 
+// getAlertsForSource fetches alerts from both Alertmanager and Prometheus
+// and merges the results. The fallback strategy is:
+//   - Both succeed: AM (firing+silenced) + Prom pending, with AM timestamps
+//     enriched from Prometheus activeAt.
+//   - AM only: AM alerts returned as-is (no Prom data to enrich from).
+//   - Prom only: all Prom alerts returned (AM was unreachable).
+//   - Both fail: error propagated from Prometheus.
 func (pa *prometheusAlerts) getAlertsForSource(ctx context.Context, namespace string, promRouteName string, amRouteName string, source string) ([]PrometheusAlert, error) {
 	amAlerts, amErr := pa.getAlertmanagerAlerts(ctx, namespace, amRouteName, source)
 	promAlerts, promErr := pa.getAlertsViaProxy(ctx, namespace, promRouteName, source)
 
 	if amErr == nil {
+		if promErr == nil {
+			enrichActiveAt(amAlerts, promAlerts)
+		}
 		pending := filterAlertsByState(promAlerts, "pending")
 		return append(amAlerts, pending...), nil
 	}
@@ -337,15 +348,17 @@ func (pa *prometheusAlerts) getUserWorkloadAlertsViaAlertmanager(ctx context.Con
 		}
 	}
 
-	pending, err := pa.getAlertsViaProxy(ctx, UserWorkloadMonitoringNamespace, UserWorkloadRouteName, AlertSourceUser)
+	promAlerts, err := pa.getAlertsViaProxy(ctx, UserWorkloadMonitoringNamespace, UserWorkloadRouteName, AlertSourceUser)
 	if err != nil {
-		pending, err = pa.getPrometheusAlertsViaService(ctx, UserWorkloadMonitoringNamespace, UserWorkloadPrometheusServiceName, UserWorkloadPrometheusPort, AlertSourceUser)
+		promAlerts, err = pa.getPrometheusAlertsViaService(ctx, UserWorkloadMonitoringNamespace, UserWorkloadPrometheusServiceName, UserWorkloadPrometheusPort, AlertSourceUser)
 		if err != nil {
 			return alerts, nil
 		}
 	}
 
-	return append(alerts, filterAlertsByState(pending, "pending")...), nil
+	// Enrich before filtering: AM alerts need activeAt from all Prom states.
+	enrichActiveAt(alerts, promAlerts)
+	return append(alerts, filterAlertsByState(promAlerts, "pending")...), nil
 }
 
 func (pa *prometheusAlerts) getPrometheusAlertsViaService(ctx context.Context, namespace string, serviceName string, port int32, source string) ([]PrometheusAlert, error) {
@@ -776,6 +789,59 @@ func filterAlertsByState(alerts []PrometheusAlert, state string) []PrometheusAle
 	return out
 }
 
+// enrichActiveAt replaces ActiveAt in Alertmanager-sourced alerts with the
+// authoritative value from Prometheus. Alertmanager only exposes startsAt
+// (when it received the alert), while Prometheus tracks the true activeAt
+// (when the alert condition first became true).
+func enrichActiveAt(amAlerts, promAlerts []PrometheusAlert) {
+	if len(promAlerts) == 0 {
+		return
+	}
+
+	lookup := make(map[string]time.Time, len(promAlerts))
+	for _, alert := range promAlerts {
+		if alert.ActiveAt.IsZero() {
+			continue
+		}
+		lookup[alertFingerprint(alert.Labels)] = alert.ActiveAt
+	}
+
+	for i := range amAlerts {
+		fp := alertFingerprint(amAlerts[i].Labels)
+		if activeAt, ok := lookup[fp]; ok {
+			amAlerts[i].ActiveAt = activeAt
+		}
+	}
+}
+
+// alertFingerprint builds a stable identity key from an alert's labels,
+// excluding metadata labels injected by this plugin (source, backend).
+// This matches the same alert *instance* across Alertmanager and Prometheus
+// (which may differ only in injected metadata). It is distinct from the
+// alert rule ID (GetAlertingRuleId), which identifies the *rule definition*
+// and is computed from the rule spec (name, expr, duration, static labels).
+func alertFingerprint(labels map[string]string) string {
+	keys := make([]string, 0, len(labels))
+	for k := range labels {
+		if k == AlertSourceLabel || k == AlertBackendLabel {
+			continue
+		}
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+
+	var b strings.Builder
+	for i, k := range keys {
+		if i > 0 {
+			b.WriteByte('\xff')
+		}
+		b.WriteString(k)
+		b.WriteByte('\xfe')
+		b.WriteString(labels[k])
+	}
+	return b.String()
+}
+
 func mapAlertmanagerState(state string) string {
 	if state == "active" {
 		return "firing"
diff --git a/pkg/management/management.go b/pkg/management/management.go
index b7eec3c09..84c9a8d84 100644
--- a/pkg/management/management.go
+++ b/pkg/management/management.go
@@ -1,9 +1,14 @@
 package management
 
 import (
+	"context"
+	"net/http"
+
 	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/rest"
 
 	"github.com/openshift/monitoring-plugin/pkg/k8s"
+	"github.com/openshift/monitoring-plugin/pkg/management/metrics"
 )
 
 type client struct {
@@ -21,3 +26,7 @@
 func (c *client) isPlatformManagedPrometheusRule(nn types.NamespacedName) bool {
 	return c.k8sClient.Namespace().IsClusterMonitoringNamespace(nn.Namespace)
 }
+
+func (c *client) MetricsHandler(ctx context.Context, kubeConfig *rest.Config) (http.Handler, error) {
+	return metrics.NewHandler(ctx, c, kubeConfig)
+}
diff --git a/pkg/metrics/alerts_collector.go b/pkg/management/metrics/alerts_collector.go
similarity index 99%
rename from pkg/metrics/alerts_collector.go
rename to pkg/management/metrics/alerts_collector.go
index 63aa18e83..fd9dd9bed 100644
--- a/pkg/metrics/alerts_collector.go
+++ b/pkg/management/metrics/alerts_collector.go
@@ -11,9 +11,9 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promhttp"
 	"github.com/sirupsen/logrus"
+	"k8s.io/client-go/rest"
 
 	"github.com/openshift/monitoring-plugin/pkg/k8s"
-	"k8s.io/client-go/rest"
 )
 
 var metricsLog = logrus.WithField("module", "metrics")
diff --git a/pkg/metrics/alerts_collector_test.go b/pkg/management/metrics/alerts_collector_test.go
similarity index 99%
rename from pkg/metrics/alerts_collector_test.go
rename to pkg/management/metrics/alerts_collector_test.go
index 6cc30efbe..5ce053834 100644
--- a/pkg/metrics/alerts_collector_test.go
+++ b/pkg/management/metrics/alerts_collector_test.go
@@ -10,7 +10,7 @@ import (
 	dto "github.com/prometheus/client_model/go"
 
 	"github.com/openshift/monitoring-plugin/pkg/k8s"
-	"github.com/openshift/monitoring-plugin/pkg/metrics"
+	"github.com/openshift/monitoring-plugin/pkg/management/metrics"
 )
 
 type mockAlertsFetcher struct {
diff --git a/pkg/metrics/leader_election.go b/pkg/management/metrics/leader_election.go
similarity index 100%
rename from pkg/metrics/leader_election.go
rename to pkg/management/metrics/leader_election.go
diff --git a/pkg/management/types.go b/pkg/management/types.go
index 0f7d71f4b..e2c54c872 100644
--- a/pkg/management/types.go
+++ b/pkg/management/types.go
@@ -2,8 +2,10 @@ package management
 
 import (
 	"context"
+	"net/http"
 
 	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+	"k8s.io/client-go/rest"
 
 	"github.com/openshift/monitoring-plugin/pkg/k8s"
 )
@@ -51,6 +53,10 @@
 
 	// GetAlertingHealth retrieves alerting health details
 	GetAlertingHealth(ctx context.Context) (k8s.AlertingHealth, error)
+
+	// MetricsHandler returns an HTTP handler that exposes alert management metrics.
+	// It handles leader election internally using the provided kubeConfig.
+	MetricsHandler(ctx context.Context, kubeConfig *rest.Config) (http.Handler, error)
 }
 
 // PrometheusRuleOptions specifies options for selecting PrometheusRule resources and groups
diff --git a/pkg/server.go b/pkg/server.go
index 323c83656..5b2fe23dd 100644
--- a/pkg/server.go
+++ b/pkg/server.go
@@ -171,7 +171,10 @@ func createHTTPServer(ctx context.Context, cfg *Config) (*http.Server, error) {
 		log.Info("alert management API enabled")
 	}
 
-	router, pluginConfig := setupRoutes(cfg, managementClient)
+	router, pluginConfig, err := setupRoutes(ctx, cfg, managementClient, k8sconfig)
+	if err != nil {
+		return nil, fmt.Errorf("failed to set up routes: %w", err)
+	}
 	router.Use(corsHeaderMiddleware())
 
 	tlsConfig := &tls.Config{}
@@ -262,7 +265,7 @@ func createHTTPServer(ctx context.Context, cfg *Config) (*http.Server, error) {
 	return httpServer, nil
 }
 
-func setupRoutes(cfg *Config, managementClient management.Client) (*mux.Router, *PluginConfig) {
+func setupRoutes(ctx context.Context, cfg *Config, managementClient management.Client, k8sconfig *rest.Config) (*mux.Router, *PluginConfig, error) {
 	configHandlerFunc, pluginConfig := configHandler(cfg)
 
 	router := mux.NewRouter()
@@ -277,11 +280,18 @@ func setupRoutes(cfg *Config, managementClient management.Client) (*mux.Router,
 	if managementClient != nil {
 		managementRouter := managementrouter.New(managementClient)
 		router.PathPrefix("/api/v1/alerting").Handler(managementRouter)
+
+		metricsHandler, err := managementClient.MetricsHandler(ctx, k8sconfig)
+		if err != nil {
+			return nil, nil, fmt.Errorf("failed to start alert management metrics: %w", err)
+		}
+		router.Path("/metrics").Handler(metricsHandler)
+		log.Info("alert management metrics started")
 	}
 
 	router.PathPrefix("/").Handler(filesHandler(http.Dir(cfg.StaticPath)))
 
-	return router, pluginConfig
+	return router, pluginConfig, nil
 }
 
 func setupProxyRoutes(cfg *Config, k8sclient *dynamic.DynamicClient, kind proxy.KindType) *mux.Router {
diff --git a/test/e2e/alerts_effective_metric_test.go b/test/e2e/alerts_effective_metric_test.go
index cd59897f7..1c73a5065 100644
--- a/test/e2e/alerts_effective_metric_test.go
+++ b/test/e2e/alerts_effective_metric_test.go
@@ -12,7 +12,7 @@ import (
 	"k8s.io/apimachinery/pkg/util/wait"
 
 	"github.com/openshift/monitoring-plugin/pkg/k8s"
-	"github.com/openshift/monitoring-plugin/pkg/metrics"
+	"github.com/openshift/monitoring-plugin/pkg/management/metrics"
 
 	"github.com/openshift/monitoring-plugin/test/e2e/framework"
 )
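
For reviewers who want to poke at the fingerprint scheme outside the test suite, here is a minimal standalone sketch (not part of the patch). The label-name constants "alert_source" and "alert_backend" are assumptions standing in for k8s.AlertSourceLabel and k8s.AlertBackendLabel, whose real values live in pkg/k8s and are not shown in this diff:

package main

import (
	"fmt"
	"sort"
	"strings"
)

const (
	alertSourceLabel  = "alert_source"  // assumption: stand-in for k8s.AlertSourceLabel
	alertBackendLabel = "alert_backend" // assumption: stand-in for k8s.AlertBackendLabel
)

// fingerprint mirrors alertFingerprint from the patch: metadata labels are
// dropped, the remaining keys are sorted, and key/value pairs are joined
// with 0xff/0xfe separators.
func fingerprint(labels map[string]string) string {
	keys := make([]string, 0, len(labels))
	for k := range labels {
		if k == alertSourceLabel || k == alertBackendLabel {
			continue
		}
		keys = append(keys, k)
	}
	sort.Strings(keys)

	var b strings.Builder
	for i, k := range keys {
		if i > 0 {
			b.WriteByte('\xff') // separates one key/value pair from the next
		}
		b.WriteString(k)
		b.WriteByte('\xfe') // separates a key from its value
		b.WriteString(labels[k])
	}
	return b.String()
}

func main() {
	am := map[string]string{"alertname": "HighCPU", "severity": "critical", alertBackendLabel: "am"}
	prom := map[string]string{"alertname": "HighCPU", "severity": "critical", alertBackendLabel: "prom"}

	// Same identity labels, different injected backend label: keys collide on purpose.
	fmt.Println(fingerprint(am) == fingerprint(prom)) // true
}

The separator choice is worth noting: the bytes 0xFE and 0xFF can never occur in valid UTF-8 text, so joined pairs like ("a", "bc") and ("ab", "c") cannot produce the same key the way a plain string concatenation would.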