Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions internal/managementrouter/health_get_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"testing"

monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"k8s.io/client-go/rest"

"github.com/openshift/monitoring-plugin/internal/managementrouter"
"github.com/openshift/monitoring-plugin/pkg/k8s"
Expand Down Expand Up @@ -64,6 +65,9 @@ func (s *stubClient) UpdateAlertRuleClassification(_ context.Context, _ manageme
func (s *stubClient) BulkUpdateAlertRuleClassification(_ context.Context, _ []management.UpdateRuleClassificationRequest) []error {
return nil
}
// MetricsHandler satisfies the management.Client interface for tests.
// The stub serves no metrics, so it returns a nil handler and nil error.
func (s *stubClient) MetricsHandler(_ context.Context, _ *rest.Config) (http.Handler, error) {
	return nil, nil
}

// newStubRouter builds a router backed by stub and adds a Bearer token header
// to requests via the helper get/getNoAuth methods.
Expand Down
106 changes: 106 additions & 0 deletions pkg/k8s/enrich_active_at_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
package k8s

import (
"testing"
"time"
)

func TestEnrichActiveAt_ReplacesAlertmanagerTimestamp(t *testing.T) {
	// Prometheus knows the true activation time; Alertmanager only saw the
	// alert a day later. Enrichment must adopt the Prometheus timestamp.
	receivedAt := time.Date(2026, 3, 10, 12, 0, 0, 0, time.UTC)
	firedAt := time.Date(2026, 3, 9, 8, 0, 0, 0, time.UTC)

	fromAM := []PrometheusAlert{{
		Labels:   map[string]string{"alertname": "HighCPU", "severity": "critical", AlertSourceLabel: "platform", AlertBackendLabel: "am"},
		ActiveAt: receivedAt,
	}}
	fromProm := []PrometheusAlert{{
		Labels:   map[string]string{"alertname": "HighCPU", "severity": "critical", AlertSourceLabel: "platform", AlertBackendLabel: "prom"},
		ActiveAt: firedAt,
	}}

	enrichActiveAt(fromAM, fromProm)

	if got := fromAM[0].ActiveAt; !got.Equal(firedAt) {
		t.Errorf("expected ActiveAt=%v, got %v", firedAt, got)
	}
}

func TestEnrichActiveAt_NoMatchKeepsOriginal(t *testing.T) {
	// Prometheus reports an entirely different alert, so the Alertmanager
	// timestamp must survive untouched.
	originalTime := time.Date(2026, 3, 10, 12, 0, 0, 0, time.UTC)

	fromAM := []PrometheusAlert{{
		Labels:   map[string]string{"alertname": "HighCPU", "severity": "critical"},
		ActiveAt: originalTime,
	}}
	fromProm := []PrometheusAlert{{
		Labels:   map[string]string{"alertname": "DiskFull", "severity": "warning"},
		ActiveAt: time.Date(2026, 3, 9, 8, 0, 0, 0, time.UTC),
	}}

	enrichActiveAt(fromAM, fromProm)

	if !fromAM[0].ActiveAt.Equal(originalTime) {
		t.Errorf("expected ActiveAt to stay %v, got %v", originalTime, fromAM[0].ActiveAt)
	}
}

func TestEnrichActiveAt_EmptyPromAlerts(t *testing.T) {
	// With no Prometheus data at all, enrichment must be a no-op.
	originalTime := time.Date(2026, 3, 10, 12, 0, 0, 0, time.UTC)

	fromAM := []PrometheusAlert{{
		Labels:   map[string]string{"alertname": "HighCPU"},
		ActiveAt: originalTime,
	}}

	enrichActiveAt(fromAM, nil)

	if !fromAM[0].ActiveAt.Equal(originalTime) {
		t.Errorf("expected ActiveAt to stay %v, got %v", originalTime, fromAM[0].ActiveAt)
	}
}

func TestEnrichActiveAt_SkipsZeroPromActiveAt(t *testing.T) {
	// A matching Prometheus alert whose ActiveAt is the zero value carries
	// no information and must not overwrite the Alertmanager timestamp.
	originalTime := time.Date(2026, 3, 10, 12, 0, 0, 0, time.UTC)

	fromAM := []PrometheusAlert{{
		Labels:   map[string]string{"alertname": "HighCPU"},
		ActiveAt: originalTime,
	}}
	fromProm := []PrometheusAlert{{
		Labels: map[string]string{"alertname": "HighCPU"},
	}}

	enrichActiveAt(fromAM, fromProm)

	if !fromAM[0].ActiveAt.Equal(originalTime) {
		t.Errorf("expected ActiveAt to stay %v when prom has zero time, got %v", originalTime, fromAM[0].ActiveAt)
	}
}

func TestAlertFingerprint_IgnoresMetadataLabels(t *testing.T) {
fp1 := alertFingerprint(map[string]string{
"alertname": "HighCPU",
"severity": "critical",
AlertSourceLabel: "platform",
AlertBackendLabel: "am",
})
fp2 := alertFingerprint(map[string]string{
"alertname": "HighCPU",
"severity": "critical",
AlertSourceLabel: "platform",
AlertBackendLabel: "prom",
})

if fp1 != fp2 {
t.Errorf("fingerprints should match when only metadata labels differ:\n fp1=%q\n fp2=%q", fp1, fp2)
}
}

func TestAlertFingerprint_DifferentLabelsProduceDifferentKeys(t *testing.T) {
	// Identical label names with different values are distinct alerts.
	critical := alertFingerprint(map[string]string{"alertname": "HighCPU", "severity": "critical"})
	warning := alertFingerprint(map[string]string{"alertname": "HighCPU", "severity": "warning"})

	if critical == warning {
		t.Error("fingerprints should differ when label values differ")
	}
}
72 changes: 69 additions & 3 deletions pkg/k8s/prometheus_alerts.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"net/http"
"net/url"
"os"
"sort"
"strings"
"sync"
"time"
Expand Down Expand Up @@ -278,11 +279,21 @@ func (pa *prometheusAlerts) routeHealth(ctx context.Context, namespace string, r
return health
}

// getAlertsForSource fetches alerts from both Alertmanager and Prometheus in
// parallel and merges the results. The fallback strategy is:
// - Both succeed: AM (firing+silenced) + Prom pending, with AM timestamps
// enriched from Prometheus activeAt.
// - AM only: AM alerts returned as-is (no Prom data to enrich from).
// - Prom only: all Prom alerts returned (AM was unreachable).
// - Both fail: error propagated from Prometheus.
func (pa *prometheusAlerts) getAlertsForSource(ctx context.Context, namespace string, promRouteName string, amRouteName string, source string) ([]PrometheusAlert, error) {
amAlerts, amErr := pa.getAlertmanagerAlerts(ctx, namespace, amRouteName, source)
promAlerts, promErr := pa.getAlertsViaProxy(ctx, namespace, promRouteName, source)

if amErr == nil {
if promErr == nil {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if we handled the error, this check wouldn't be needed

enrichActiveAt(amAlerts, promAlerts)
}
pending := filterAlertsByState(promAlerts, "pending")
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we are only filtering if no amErr

why do we return only the pending promAlerts when there is no amErr,
but all promAlerts when there is an amErr?

return append(amAlerts, pending...), nil
}
Expand Down Expand Up @@ -337,15 +348,17 @@ func (pa *prometheusAlerts) getUserWorkloadAlertsViaAlertmanager(ctx context.Con
}
}

pending, err := pa.getAlertsViaProxy(ctx, UserWorkloadMonitoringNamespace, UserWorkloadRouteName, AlertSourceUser)
promAlerts, err := pa.getAlertsViaProxy(ctx, UserWorkloadMonitoringNamespace, UserWorkloadRouteName, AlertSourceUser)
if err != nil {
pending, err = pa.getPrometheusAlertsViaService(ctx, UserWorkloadMonitoringNamespace, UserWorkloadPrometheusServiceName, UserWorkloadPrometheusPort, AlertSourceUser)
promAlerts, err = pa.getPrometheusAlertsViaService(ctx, UserWorkloadMonitoringNamespace, UserWorkloadPrometheusServiceName, UserWorkloadPrometheusPort, AlertSourceUser)
if err != nil {
return alerts, nil
}
}

return append(alerts, filterAlertsByState(pending, "pending")...), nil
// Enrich before filtering: AM alerts need activeAt from all Prom states.
enrichActiveAt(alerts, promAlerts)
return append(alerts, filterAlertsByState(promAlerts, "pending")...), nil
Comment on lines +360 to +361
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not filter first then enrich?

}

func (pa *prometheusAlerts) getPrometheusAlertsViaService(ctx context.Context, namespace string, serviceName string, port int32, source string) ([]PrometheusAlert, error) {
Expand Down Expand Up @@ -776,6 +789,59 @@ func filterAlertsByState(alerts []PrometheusAlert, state string) []PrometheusAle
return out
}

// enrichActiveAt replaces ActiveAt in Alertmanager-sourced alerts with the
// authoritative value from Prometheus. Alertmanager only exposes startsAt
// (when it received the alert), while Prometheus tracks the true activeAt
// (when the alert condition first became true).
//
// Alerts with no Prometheus counterpart, or whose Prometheus activeAt is
// unset, keep their original timestamp. amAlerts is modified in place.
func enrichActiveAt(amAlerts, promAlerts []PrometheusAlert) {
	if len(amAlerts) == 0 || len(promAlerts) == 0 {
		return
	}

	// Index Prometheus activeAt values by label fingerprint. Check for the
	// zero time first so a valid Alertmanager timestamp is never clobbered
	// by an unset one, and so we skip the fingerprint work for entries we
	// would discard anyway.
	lookup := make(map[string]time.Time, len(promAlerts))
	for _, alert := range promAlerts {
		if alert.ActiveAt.IsZero() {
			continue
		}
		lookup[alertFingerprint(alert.Labels)] = alert.ActiveAt
	}
	if len(lookup) == 0 {
		return
	}

	for i := range amAlerts {
		if activeAt, ok := lookup[alertFingerprint(amAlerts[i].Labels)]; ok {
			amAlerts[i].ActiveAt = activeAt
		}
	}
}

// alertFingerprint builds a stable identity key from an alert's labels,
// excluding metadata labels injected by this plugin (source, backend).
// This matches the same alert *instance* across Alertmanager and Prometheus
// (which may differ only in injected metadata). It is distinct from the
// alert rule ID (GetAlertingRuleId) which identifies the *rule definition*
// and is computed from the rule spec (name, expr, duration, static labels).
func alertFingerprint(labels map[string]string) string {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

whats the difference between using this implementation and the one for the id label?

keys := make([]string, 0, len(labels))
for k := range labels {
if k == AlertSourceLabel || k == AlertBackendLabel {
continue
}
keys = append(keys, k)
}
sort.Strings(keys)

var b strings.Builder
for i, k := range keys {
if i > 0 {
b.WriteByte('\xff')
}
b.WriteString(k)
b.WriteByte('\xfe')
b.WriteString(labels[k])
}
return b.String()
}

func mapAlertmanagerState(state string) string {
if state == "active" {
return "firing"
Expand Down
9 changes: 9 additions & 0 deletions pkg/management/management.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
package management

import (
"context"
"net/http"

"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/rest"

"github.com/openshift/monitoring-plugin/pkg/k8s"
"github.com/openshift/monitoring-plugin/pkg/management/metrics"
)

type client struct {
Expand All @@ -21,3 +26,7 @@ type client struct {
// isPlatformManagedPrometheusRule reports whether the PrometheusRule named
// by nn lives in a cluster-monitoring namespace, i.e. is platform managed.
func (c *client) isPlatformManagedPrometheusRule(nn types.NamespacedName) bool {
	return c.k8sClient.Namespace().IsClusterMonitoringNamespace(nn.Namespace)
}

// MetricsHandler returns an HTTP handler that exposes alert management
// metrics, delegating construction to metrics.NewHandler. Per the Client
// interface contract, leader election is handled internally using the
// provided kubeConfig.
func (c *client) MetricsHandler(ctx context.Context, kubeConfig *rest.Config) (http.Handler, error) {
	return metrics.NewHandler(ctx, c, kubeConfig)
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/sirupsen/logrus"
"k8s.io/client-go/rest"

"github.com/openshift/monitoring-plugin/pkg/k8s"
"k8s.io/client-go/rest"
)

var metricsLog = logrus.WithField("module", "metrics")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import (
dto "github.com/prometheus/client_model/go"

"github.com/openshift/monitoring-plugin/pkg/k8s"
"github.com/openshift/monitoring-plugin/pkg/metrics"
"github.com/openshift/monitoring-plugin/pkg/management/metrics"
)

type mockAlertsFetcher struct {
Expand Down
6 changes: 6 additions & 0 deletions pkg/management/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@ package management

import (
"context"
"net/http"

monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"k8s.io/client-go/rest"

"github.com/openshift/monitoring-plugin/pkg/k8s"
)
Expand Down Expand Up @@ -51,6 +53,10 @@ type Client interface {

// GetAlertingHealth retrieves alerting health details
GetAlertingHealth(ctx context.Context) (k8s.AlertingHealth, error)

// MetricsHandler returns an HTTP handler that exposes alert management metrics.
// It handles leader election internally using the provided kubeConfig.
MetricsHandler(ctx context.Context, kubeConfig *rest.Config) (http.Handler, error)
}

// PrometheusRuleOptions specifies options for selecting PrometheusRule resources and groups
Expand Down
16 changes: 13 additions & 3 deletions pkg/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,10 @@ func createHTTPServer(ctx context.Context, cfg *Config) (*http.Server, error) {
log.Info("alert management API enabled")
}

router, pluginConfig := setupRoutes(cfg, managementClient)
router, pluginConfig, err := setupRoutes(ctx, cfg, managementClient, k8sconfig)
if err != nil {
return nil, fmt.Errorf("failed to set up routes: %w", err)
}
router.Use(corsHeaderMiddleware())

tlsConfig := &tls.Config{}
Expand Down Expand Up @@ -262,7 +265,7 @@ func createHTTPServer(ctx context.Context, cfg *Config) (*http.Server, error) {
return httpServer, nil
}

func setupRoutes(cfg *Config, managementClient management.Client) (*mux.Router, *PluginConfig) {
func setupRoutes(ctx context.Context, cfg *Config, managementClient management.Client, k8sconfig *rest.Config) (*mux.Router, *PluginConfig, error) {
configHandlerFunc, pluginConfig := configHandler(cfg)

router := mux.NewRouter()
Expand All @@ -277,11 +280,18 @@ func setupRoutes(cfg *Config, managementClient management.Client) (*mux.Router,
if managementClient != nil {
managementRouter := managementrouter.New(managementClient)
router.PathPrefix("/api/v1/alerting").Handler(managementRouter)

metricsHandler, err := managementClient.MetricsHandler(ctx, k8sconfig)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

like it

if err != nil {
return nil, nil, fmt.Errorf("failed to start alert management metrics: %w", err)
}
router.Path("/metrics").Handler(metricsHandler)
log.Info("alert management metrics started")
}

router.PathPrefix("/").Handler(filesHandler(http.Dir(cfg.StaticPath)))

return router, pluginConfig
return router, pluginConfig, nil
}

func setupProxyRoutes(cfg *Config, k8sclient *dynamic.DynamicClient, kind proxy.KindType) *mux.Router {
Expand Down
2 changes: 1 addition & 1 deletion test/e2e/alerts_effective_metric_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (
"k8s.io/apimachinery/pkg/util/wait"

"github.com/openshift/monitoring-plugin/pkg/k8s"
"github.com/openshift/monitoring-plugin/pkg/metrics"
"github.com/openshift/monitoring-plugin/pkg/management/metrics"
"github.com/openshift/monitoring-plugin/test/e2e/framework"
)

Expand Down