-
Notifications
You must be signed in to change notification settings - Fork 0
CNV-80608: PR12 add alerts_effective_active_at_timestamp_seconds metric #16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: alert-mgmt-restructured-11-orphan-gc
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,106 @@ | ||
| package k8s | ||
|
|
||
| import ( | ||
| "testing" | ||
| "time" | ||
| ) | ||
|
|
||
| // TestEnrichActiveAt_ReplacesAlertmanagerTimestamp checks that an Alertmanager | ||
| // alert whose labels differ from a Prometheus alert only in the backend label | ||
| // carries the Prometheus ActiveAt once enrichActiveAt has run. | ||
| func TestEnrichActiveAt_ReplacesAlertmanagerTimestamp(t *testing.T) { | ||
| amStart := time.Date(2026, 3, 10, 12, 0, 0, 0, time.UTC) | ||
| want := time.Date(2026, 3, 9, 8, 0, 0, 0, time.UTC) | ||
|
|
||
| fromAM := []PrometheusAlert{{ | ||
| Labels: map[string]string{"alertname": "HighCPU", "severity": "critical", AlertSourceLabel: "platform", AlertBackendLabel: "am"}, | ||
| ActiveAt: amStart, | ||
| }} | ||
| fromProm := []PrometheusAlert{{ | ||
| Labels: map[string]string{"alertname": "HighCPU", "severity": "critical", AlertSourceLabel: "platform", AlertBackendLabel: "prom"}, | ||
| ActiveAt: want, | ||
| }} | ||
|
|
||
| enrichActiveAt(fromAM, fromProm) | ||
|
|
||
| // The Alertmanager entry is mutated in place, so inspect the input slice. | ||
| if got := fromAM[0].ActiveAt; !got.Equal(want) { | ||
| t.Errorf("expected ActiveAt=%v, got %v", want, got) | ||
| } | ||
| } | ||
|
|
||
| // TestEnrichActiveAt_NoMatchKeepsOriginal checks that an Alertmanager alert | ||
| // keeps its own ActiveAt when no Prometheus alert carries the same labels. | ||
| func TestEnrichActiveAt_NoMatchKeepsOriginal(t *testing.T) { | ||
| want := time.Date(2026, 3, 10, 12, 0, 0, 0, time.UTC) | ||
|
|
||
| fromAM := []PrometheusAlert{{ | ||
| Labels: map[string]string{"alertname": "HighCPU", "severity": "critical"}, | ||
| ActiveAt: want, | ||
| }} | ||
| // Different alertname and severity, so the fingerprints cannot match. | ||
| unrelated := []PrometheusAlert{{ | ||
| Labels: map[string]string{"alertname": "DiskFull", "severity": "warning"}, | ||
| ActiveAt: time.Date(2026, 3, 9, 8, 0, 0, 0, time.UTC), | ||
| }} | ||
|
|
||
| enrichActiveAt(fromAM, unrelated) | ||
|
|
||
| if got := fromAM[0].ActiveAt; !got.Equal(want) { | ||
| t.Errorf("expected ActiveAt to stay %v, got %v", want, got) | ||
| } | ||
| } | ||
|
|
||
| // TestEnrichActiveAt_EmptyPromAlerts checks that passing a nil Prometheus | ||
| // slice leaves the Alertmanager alert's ActiveAt unchanged. | ||
| func TestEnrichActiveAt_EmptyPromAlerts(t *testing.T) { | ||
| want := time.Date(2026, 3, 10, 12, 0, 0, 0, time.UTC) | ||
|
|
||
| fromAM := []PrometheusAlert{{ | ||
| Labels: map[string]string{"alertname": "HighCPU"}, | ||
| ActiveAt: want, | ||
| }} | ||
|
|
||
| enrichActiveAt(fromAM, nil) | ||
|
|
||
| if got := fromAM[0].ActiveAt; !got.Equal(want) { | ||
| t.Errorf("expected ActiveAt to stay %v, got %v", want, got) | ||
| } | ||
| } | ||
|
|
||
| // TestEnrichActiveAt_SkipsZeroPromActiveAt checks that a Prometheus alert with | ||
| // matching labels but an unset (zero) ActiveAt does not overwrite the | ||
| // Alertmanager alert's timestamp. | ||
| func TestEnrichActiveAt_SkipsZeroPromActiveAt(t *testing.T) { | ||
| want := time.Date(2026, 3, 10, 12, 0, 0, 0, time.UTC) | ||
|
|
||
| fromAM := []PrometheusAlert{{ | ||
| Labels: map[string]string{"alertname": "HighCPU"}, | ||
| ActiveAt: want, | ||
| }} | ||
| // ActiveAt is deliberately left at its zero value. | ||
| fromProm := []PrometheusAlert{{ | ||
| Labels: map[string]string{"alertname": "HighCPU"}, | ||
| }} | ||
|
|
||
| enrichActiveAt(fromAM, fromProm) | ||
|
|
||
| if got := fromAM[0].ActiveAt; !got.Equal(want) { | ||
| t.Errorf("expected ActiveAt to stay %v when prom has zero time, got %v", want, got) | ||
| } | ||
| } | ||
|
|
||
| // TestAlertFingerprint_IgnoresMetadataLabels checks that two label sets which | ||
| // differ only in the AlertBackendLabel value produce the same fingerprint. | ||
| func TestAlertFingerprint_IgnoresMetadataLabels(t *testing.T) { | ||
| // labelsFor builds the shared label set, varying only the backend label. | ||
| labelsFor := func(backend string) map[string]string { | ||
| return map[string]string{ | ||
| "alertname": "HighCPU", | ||
| "severity": "critical", | ||
| AlertSourceLabel: "platform", | ||
| AlertBackendLabel: backend, | ||
| } | ||
| } | ||
|
|
||
| amKey := alertFingerprint(labelsFor("am")) | ||
| promKey := alertFingerprint(labelsFor("prom")) | ||
|
|
||
| if amKey == promKey { | ||
| return | ||
| } | ||
| t.Errorf("fingerprints should match when only metadata labels differ:\n fp1=%q\n fp2=%q", amKey, promKey) | ||
| } | ||
|
|
||
| // TestAlertFingerprint_DifferentLabelsProduceDifferentKeys checks that changing | ||
| // the value of a non-metadata label ("severity") changes the fingerprint. | ||
| func TestAlertFingerprint_DifferentLabelsProduceDifferentKeys(t *testing.T) { | ||
| criticalKey := alertFingerprint(map[string]string{"alertname": "HighCPU", "severity": "critical"}) | ||
| warningKey := alertFingerprint(map[string]string{"alertname": "HighCPU", "severity": "warning"}) | ||
|
|
||
| if criticalKey != warningKey { | ||
| return | ||
| } | ||
| t.Error("fingerprints should differ when label values differ") | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,6 +10,7 @@ import ( | |
| "net/http" | ||
| "net/url" | ||
| "os" | ||
| "sort" | ||
| "strings" | ||
| "sync" | ||
| "time" | ||
|
|
@@ -278,11 +279,21 @@ func (pa *prometheusAlerts) routeHealth(ctx context.Context, namespace string, r | |
| return health | ||
| } | ||
|
|
||
| // getAlertsForSource fetches alerts from both Alertmanager and Prometheus | ||
| // (one request after the other) and merges the results. The fallback strategy is: | ||
| // - Both succeed: AM (firing+silenced) + Prom pending, with AM timestamps | ||
| // enriched from Prometheus activeAt. | ||
| // - AM only: AM alerts returned as-is (no Prom data to enrich from). | ||
| // - Prom only: all Prom alerts returned (AM was unreachable). | ||
| // - Both fail: error propagated from Prometheus. | ||
| func (pa *prometheusAlerts) getAlertsForSource(ctx context.Context, namespace string, promRouteName string, amRouteName string, source string) ([]PrometheusAlert, error) { | ||
| amAlerts, amErr := pa.getAlertmanagerAlerts(ctx, namespace, amRouteName, source) | ||
| promAlerts, promErr := pa.getAlertsViaProxy(ctx, namespace, promRouteName, source) | ||
|
|
||
| if amErr == nil { | ||
| if promErr == nil { | ||
| enrichActiveAt(amAlerts, promAlerts) | ||
| } | ||
| pending := filterAlertsByState(promAlerts, "pending") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We are only filtering if there is no amErr; why would we return all promAlerts if no amErr? |
||
| return append(amAlerts, pending...), nil | ||
| } | ||
|
|
@@ -337,15 +348,17 @@ func (pa *prometheusAlerts) getUserWorkloadAlertsViaAlertmanager(ctx context.Con | |
| } | ||
| } | ||
|
|
||
| pending, err := pa.getAlertsViaProxy(ctx, UserWorkloadMonitoringNamespace, UserWorkloadRouteName, AlertSourceUser) | ||
| promAlerts, err := pa.getAlertsViaProxy(ctx, UserWorkloadMonitoringNamespace, UserWorkloadRouteName, AlertSourceUser) | ||
| if err != nil { | ||
| pending, err = pa.getPrometheusAlertsViaService(ctx, UserWorkloadMonitoringNamespace, UserWorkloadPrometheusServiceName, UserWorkloadPrometheusPort, AlertSourceUser) | ||
| promAlerts, err = pa.getPrometheusAlertsViaService(ctx, UserWorkloadMonitoringNamespace, UserWorkloadPrometheusServiceName, UserWorkloadPrometheusPort, AlertSourceUser) | ||
| if err != nil { | ||
| return alerts, nil | ||
| } | ||
| } | ||
|
|
||
| return append(alerts, filterAlertsByState(pending, "pending")...), nil | ||
| // Enrich before filtering: AM alerts need activeAt from all Prom states. | ||
| enrichActiveAt(alerts, promAlerts) | ||
| return append(alerts, filterAlertsByState(promAlerts, "pending")...), nil | ||
|
Comment on lines
+360
to
+361
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why not filter first, then enrich? |
||
| } | ||
|
|
||
| func (pa *prometheusAlerts) getPrometheusAlertsViaService(ctx context.Context, namespace string, serviceName string, port int32, source string) ([]PrometheusAlert, error) { | ||
|
|
@@ -776,6 +789,59 @@ func filterAlertsByState(alerts []PrometheusAlert, state string) []PrometheusAle | |
| return out | ||
| } | ||
|
|
||
| // enrichActiveAt overwrites ActiveAt on Alertmanager-sourced alerts, in place, | ||
| // with the ActiveAt of the Prometheus alert that has the same alertFingerprint. | ||
| // Alertmanager only exposes startsAt (when it received the alert), while | ||
| // Prometheus tracks the true activeAt (when the alert condition first became | ||
| // true). | ||
| // | ||
| // Alertmanager alerts with no matching Prometheus alert keep their original | ||
| // ActiveAt, and Prometheus alerts whose ActiveAt is the zero time are never | ||
| // used as a source. If several Prometheus alerts share a fingerprint, the last | ||
| // one in promAlerts wins. | ||
| func enrichActiveAt(amAlerts, promAlerts []PrometheusAlert) { | ||
| // Nothing to overwrite, or nothing to read from: skip building the lookup. | ||
| if len(amAlerts) == 0 || len(promAlerts) == 0 { | ||
| return | ||
| } | ||
|
|
||
| lookup := make(map[string]time.Time, len(promAlerts)) | ||
| for i := range promAlerts { | ||
| // Check the timestamp first so unusable alerts are never fingerprinted. | ||
| if promAlerts[i].ActiveAt.IsZero() { | ||
| continue | ||
| } | ||
| lookup[alertFingerprint(promAlerts[i].Labels)] = promAlerts[i].ActiveAt | ||
| } | ||
| if len(lookup) == 0 { | ||
| return | ||
| } | ||
|
|
||
| // Index into the slice so the caller's elements are updated, not copies. | ||
| for i := range amAlerts { | ||
| if activeAt, ok := lookup[alertFingerprint(amAlerts[i].Labels)]; ok { | ||
| amAlerts[i].ActiveAt = activeAt | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // alertFingerprint derives an identity key for one alert instance from its | ||
| // labels. The labels this plugin injects (AlertSourceLabel, AlertBackendLabel) | ||
| // are left out, so the same alert reported by Alertmanager and by Prometheus | ||
| // yields the same key even though that metadata differs. The remaining label | ||
| // names are sorted, which makes the key independent of map iteration order. | ||
| // | ||
| // This is not the alert rule ID (GetAlertingRuleId): that identifies the *rule | ||
| // definition* and is computed from the rule spec (name, expr, duration, static | ||
| // labels), whereas this key identifies a single alert *instance*. | ||
| func alertFingerprint(labels map[string]string) string { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. whats the difference between using this implementation and the one for the id label? |
||
| names := make([]string, 0, len(labels)) | ||
| for name := range labels { | ||
| if name != AlertSourceLabel && name != AlertBackendLabel { | ||
| names = append(names, name) | ||
| } | ||
| } | ||
| sort.Strings(names) | ||
|
|
||
| // Each pair is name, byte 0xfe, value; pairs are joined with byte 0xff. | ||
| pairs := make([]string, len(names)) | ||
| for i, name := range names { | ||
| pairs[i] = name + "\xfe" + labels[name] | ||
| } | ||
| return strings.Join(pairs, "\xff") | ||
| } | ||
|
|
||
| func mapAlertmanagerState(state string) string { | ||
| if state == "active" { | ||
| return "firing" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -171,7 +171,10 @@ func createHTTPServer(ctx context.Context, cfg *Config) (*http.Server, error) { | |
| log.Info("alert management API enabled") | ||
| } | ||
|
|
||
| router, pluginConfig := setupRoutes(cfg, managementClient) | ||
| router, pluginConfig, err := setupRoutes(ctx, cfg, managementClient, k8sconfig) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("failed to set up routes: %w", err) | ||
| } | ||
| router.Use(corsHeaderMiddleware()) | ||
|
|
||
| tlsConfig := &tls.Config{} | ||
|
|
@@ -262,7 +265,7 @@ func createHTTPServer(ctx context.Context, cfg *Config) (*http.Server, error) { | |
| return httpServer, nil | ||
| } | ||
|
|
||
| func setupRoutes(cfg *Config, managementClient management.Client) (*mux.Router, *PluginConfig) { | ||
| func setupRoutes(ctx context.Context, cfg *Config, managementClient management.Client, k8sconfig *rest.Config) (*mux.Router, *PluginConfig, error) { | ||
| configHandlerFunc, pluginConfig := configHandler(cfg) | ||
|
|
||
| router := mux.NewRouter() | ||
|
|
@@ -277,11 +280,18 @@ func setupRoutes(cfg *Config, managementClient management.Client) (*mux.Router, | |
| if managementClient != nil { | ||
| managementRouter := managementrouter.New(managementClient) | ||
| router.PathPrefix("/api/v1/alerting").Handler(managementRouter) | ||
|
|
||
| metricsHandler, err := managementClient.MetricsHandler(ctx, k8sconfig) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Like it. |
||
| if err != nil { | ||
| return nil, nil, fmt.Errorf("failed to start alert management metrics: %w", err) | ||
| } | ||
| router.Path("/metrics").Handler(metricsHandler) | ||
| log.Info("alert management metrics started") | ||
| } | ||
|
|
||
| router.PathPrefix("/").Handler(filesHandler(http.Dir(cfg.StaticPath))) | ||
|
|
||
| return router, pluginConfig | ||
| return router, pluginConfig, nil | ||
| } | ||
|
|
||
| func setupProxyRoutes(cfg *Config, k8sclient *dynamic.DynamicClient, kind proxy.KindType) *mux.Router { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we handled the error, this check wouldn't be needed.