diff --git a/internal/managementrouter/alerts_get.go b/internal/managementrouter/alerts_get.go new file mode 100644 index 000000000..abb0ab462 --- /dev/null +++ b/internal/managementrouter/alerts_get.go @@ -0,0 +1,109 @@ +package managementrouter + +import ( + "context" + "encoding/json" + "net/http" + + "github.com/openshift/monitoring-plugin/pkg/k8s" +) + +type GetAlertsResponse struct { + Data GetAlertsResponseData `json:"data"` + Warnings []string `json:"warnings,omitempty"` +} + +type GetAlertsResponseData struct { + Alerts []k8s.PrometheusAlert `json:"alerts"` +} + +func (hr *httpRouter) GetAlerts(w http.ResponseWriter, req *http.Request) { + state, labels, err := parseStateAndLabels(req.URL.Query()) + if err != nil { + writeError(w, http.StatusBadRequest, err.Error()) + return + } + ctx := req.Context() + + alerts, err := hr.managementClient.GetAlerts(ctx, k8s.GetAlertsRequest{ + Labels: labels, + State: state, + }) + if err != nil { + handleError(w, err) + return + } + + w.Header().Set("Content-Type", "application/json") + w.Header().Set("Cache-Control", "no-store") + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(GetAlertsResponse{ + Data: GetAlertsResponseData{ + Alerts: alerts, + }, + Warnings: hr.alertWarnings(ctx), + }); err != nil { + log.WithError(err).Warn("failed to encode alerts response") + } +} + +func (hr *httpRouter) alertWarnings(ctx context.Context) []string { + health, ok := hr.alertingHealth(ctx) + if !ok { + return nil + } + + warnings := []string{} + if health.UserWorkloadEnabled && health.UserWorkload != nil { + warnings = append(warnings, buildRouteWarnings(health.UserWorkload.Prometheus, k8s.UserWorkloadRouteName, "user workload Prometheus")...) + warnings = append(warnings, buildRouteWarnings(health.UserWorkload.Alertmanager, k8s.UserWorkloadAlertmanagerRouteName, "user workload Alertmanager")...) 
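+		// buildRouteWarnings returns nothing for a route whose fallback endpoint is reachable, so a healthy fallback suppresses these warnings.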
+ } + + return warnings +} + +//nolint:unused // used by the rules listing handler in a subsequent branch +func (hr *httpRouter) rulesWarnings(ctx context.Context) []string { + health, ok := hr.alertingHealth(ctx) + if !ok { + return nil + } + + if health.UserWorkloadEnabled && health.UserWorkload != nil { + return buildRouteWarnings(health.UserWorkload.Prometheus, k8s.UserWorkloadRouteName, "user workload Prometheus") + } + + return nil +} + +func (hr *httpRouter) alertingHealth(ctx context.Context) (k8s.AlertingHealth, bool) { + if hr.managementClient == nil { + return k8s.AlertingHealth{}, false + } + + health, err := hr.managementClient.GetAlertingHealth(ctx) + if err != nil { + log.WithError(err).Warn("alerting health unavailable") + return k8s.AlertingHealth{}, false + } + + return health, true +} + +func buildRouteWarnings(route k8s.AlertingRouteHealth, expectedName string, friendlyName string) []string { + if route.Name != "" && route.Name != expectedName { + return nil + } + if route.FallbackReachable { + return nil + } + + switch route.Status { + case k8s.RouteNotFound: + return []string{friendlyName + " route is missing"} + case k8s.RouteUnreachable: + return []string{friendlyName + " route is unreachable"} + default: + return nil + } +} diff --git a/internal/managementrouter/alerts_get_test.go b/internal/managementrouter/alerts_get_test.go new file mode 100644 index 000000000..1c931a00b --- /dev/null +++ b/internal/managementrouter/alerts_get_test.go @@ -0,0 +1,380 @@ +package managementrouter_test + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/prometheus/prometheus/model/relabel" + "k8s.io/apimachinery/pkg/util/intstr" + + "github.com/openshift/monitoring-plugin/internal/managementrouter" + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" +) + +// agFixture holds mocks and the router for GetAlerts handler tests. 
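+// Tests that replace functions on mockK8s call rebuild afterwards so the management client is constructed against the updated mocks.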
+type agFixture struct { + router http.Handler + mockK8s *testutils.MockClient + mockPrometheusAlerts *testutils.MockPrometheusAlertsInterface +} + +func newAGFixture(t *testing.T) *agFixture { + t.Helper() + f := &agFixture{ + mockPrometheusAlerts: &testutils.MockPrometheusAlertsInterface{}, + } + f.mockK8s = &testutils.MockClient{ + PrometheusAlertsFunc: func() k8s.PrometheusAlertsInterface { + return f.mockPrometheusAlerts + }, + } + f.rebuild() + return f +} + +func (f *agFixture) rebuild() { + mgmt := management.New(context.Background(), f.mockK8s) + f.router = managementrouter.New(mgmt) +} + +func (f *agFixture) get(t *testing.T, url string) *httptest.ResponseRecorder { + t.Helper() + req := httptest.NewRequest(http.MethodGet, url, nil) + req.Header.Set("Authorization", "Bearer test-token") + w := httptest.NewRecorder() + f.router.ServeHTTP(w, req) + return w +} + +func decodeAlertsResp(t *testing.T, w *httptest.ResponseRecorder) managementrouter.GetAlertsResponse { + t.Helper() + var resp managementrouter.GetAlertsResponse + if err := json.NewDecoder(w.Body).Decode(&resp); err != nil { + t.Fatalf("decode response: %v", err) + } + return resp +} + +func TestGetAlerts_ParsesFlatQueryParams(t *testing.T) { + f := newAGFixture(t) + var captured k8s.GetAlertsRequest + f.mockPrometheusAlerts.GetAlertsFunc = func(_ context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + captured = req + return []k8s.PrometheusAlert{}, nil + } + + w := f.get(t, "/api/v1/alerting/alerts?namespace=ns1&severity=critical&state=firing&team=sre") + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body) + } + if captured.State != "firing" { + t.Errorf("expected state=firing, got %q", captured.State) + } + if captured.Labels["namespace"] != "ns1" { + t.Errorf("expected namespace=ns1, got %q", captured.Labels["namespace"]) + } + if captured.Labels["severity"] != "critical" { + t.Errorf("expected severity=critical, got %q", captured.Labels["severity"]) + } + if captured.Labels["team"] != "sre" { + t.Errorf("expected team=sre, got %q", captured.Labels["team"]) + } +} + +func TestGetAlerts_ReturnsAllAlerts(t *testing.T) { + f := newAGFixture(t) + testAlerts := []k8s.PrometheusAlert{ + { + Labels: map[string]string{managementlabels.AlertNameLabel: "HighCPUUsage", "severity": "warning", "namespace": "default"}, + Annotations: map[string]string{"description": "CPU usage is high"}, + State: "firing", + ActiveAt: time.Now(), + }, + { + Labels: map[string]string{managementlabels.AlertNameLabel: "LowMemory", "severity": "critical", "namespace": "monitoring"}, + Annotations: map[string]string{"description": "Memory is running low"}, + State: "firing", + ActiveAt: time.Now(), + }, + } + f.mockPrometheusAlerts.SetActiveAlerts(testAlerts) + + w := f.get(t, "/api/v1/alerting/alerts") + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body) + } + if ct := w.Header().Get("Content-Type"); ct != "application/json" { + t.Errorf("expected Content-Type application/json, got %q", ct) + } + resp := decodeAlertsResp(t, w) + if len(resp.Data.Alerts) != 2 { + t.Fatalf("expected 2 alerts, got %d", len(resp.Data.Alerts)) + } + if resp.Data.Alerts[0].Labels[managementlabels.AlertNameLabel] != "HighCPUUsage" { + t.Errorf("alert[0] name mismatch: %s", resp.Data.Alerts[0].Labels[managementlabels.AlertNameLabel]) + } + if resp.Data.Alerts[1].Labels[managementlabels.AlertNameLabel] != "LowMemory" { + t.Errorf("alert[1] name mismatch: %s", 
resp.Data.Alerts[1].Labels[managementlabels.AlertNameLabel]) + } +} + +func TestGetAlerts_WarningsWhenUserWorkloadRoutesMissing(t *testing.T) { + f := newAGFixture(t) + f.mockK8s.AlertingHealthFunc = func(_ context.Context) (k8s.AlertingHealth, error) { + return k8s.AlertingHealth{ + UserWorkloadEnabled: true, + UserWorkload: &k8s.AlertingStackHealth{ + Prometheus: k8s.AlertingRouteHealth{Status: k8s.RouteNotFound}, + Alertmanager: k8s.AlertingRouteHealth{Status: k8s.RouteNotFound}, + }, + }, nil + } + f.rebuild() + + w := f.get(t, "/api/v1/alerting/alerts") + resp := decodeAlertsResp(t, w) + + warnSet := make(map[string]bool) + for _, w := range resp.Warnings { + warnSet[w] = true + } + if !warnSet["user workload Prometheus route is missing"] { + t.Errorf("expected Prometheus route warning, got: %v", resp.Warnings) + } + if !warnSet["user workload Alertmanager route is missing"] { + t.Errorf("expected Alertmanager route warning, got: %v", resp.Warnings) + } +} + +func TestGetAlerts_SuppressesWarningsWhenFallbacksHealthy(t *testing.T) { + f := newAGFixture(t) + f.mockK8s.AlertingHealthFunc = func(_ context.Context) (k8s.AlertingHealth, error) { + return k8s.AlertingHealth{ + UserWorkloadEnabled: true, + UserWorkload: &k8s.AlertingStackHealth{ + Prometheus: k8s.AlertingRouteHealth{Status: k8s.RouteUnreachable, FallbackReachable: true}, + Alertmanager: k8s.AlertingRouteHealth{Status: k8s.RouteUnreachable, FallbackReachable: true}, + }, + }, nil + } + f.rebuild() + + w := f.get(t, "/api/v1/alerting/alerts") + resp := decodeAlertsResp(t, w) + if len(resp.Warnings) != 0 { + t.Errorf("expected no warnings, got: %v", resp.Warnings) + } +} + +func TestGetAlerts_ReturnsEmptyWhenNoAlerts(t *testing.T) { + f := newAGFixture(t) + f.mockPrometheusAlerts.SetActiveAlerts([]k8s.PrometheusAlert{}) + + w := f.get(t, "/api/v1/alerting/alerts") + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body) + } + resp := decodeAlertsResp(t, w) + if len(resp.Data.Alerts) != 0 { + t.Errorf("expected empty alerts, got %d", len(resp.Data.Alerts)) + } +} + +func TestGetAlerts_Returns500OnError(t *testing.T) { + f := newAGFixture(t) + f.mockPrometheusAlerts.GetAlertsFunc = func(_ context.Context, _ k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return nil, fmt.Errorf("connection error") + } + + w := f.get(t, "/api/v1/alerting/alerts") + if w.Code != http.StatusInternalServerError { + t.Fatalf("expected 500, got %d: %s", w.Code, w.Body) + } + if body := w.Body.String(); !strings.Contains(body, "An unexpected error occurred") { + t.Errorf("expected error message, got: %s", body) + } +} + +func TestGetAlerts_ForwardsBearerToken(t *testing.T) { + f := newAGFixture(t) + var capturedCtx context.Context + f.mockPrometheusAlerts.GetAlertsFunc = func(ctx context.Context, _ k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + capturedCtx = ctx + return []k8s.PrometheusAlert{}, nil + } + + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/alerts", nil) + req.Header.Set("Authorization", "Bearer test-token-abc123") + w := httptest.NewRecorder() + f.router.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body) + } + if token := k8s.BearerTokenFromContext(capturedCtx); token != "test-token-abc123" { + t.Errorf("expected token test-token-abc123, got %q", token) + } +} + +func TestGetAlerts_MissingAuthHeaderReturns401(t *testing.T) { + f := newAGFixture(t) + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/alerts", nil) + w 
:= httptest.NewRecorder() + f.router.ServeHTTP(w, req) + + if w.Code != http.StatusUnauthorized { + t.Fatalf("expected 401, got %d: %s", w.Code, w.Body) + } +} + +func TestGetAlerts_EnrichesAlertWithRuleId(t *testing.T) { + f := newAGFixture(t) + baseRule := monitoringv1.Rule{ + Alert: "HighCPU", + Expr: intstr.FromString("node_cpu > 0.9"), + Labels: map[string]string{"severity": "critical"}, + } + ruleId := alertrule.GetAlertingRuleId(&baseRule) + + relabeledRule := monitoringv1.Rule{ + Alert: "HighCPU", + Expr: intstr.FromString("node_cpu > 0.9"), + Labels: map[string]string{ + managementlabels.AlertNameLabel: "HighCPU", + "severity": "critical", + k8s.AlertRuleLabelId: ruleId, + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "cluster-cpu-rules", + managementlabels.AlertingRuleLabelName: "my-alerting-rule", + }, + } + + f.mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(_ context.Context) []monitoringv1.Rule { return []monitoringv1.Rule{relabeledRule} }, + GetFunc: func(_ context.Context, id string) (monitoringv1.Rule, bool) { + if id == ruleId { + return relabeledRule, true + } + return monitoringv1.Rule{}, false + }, + ConfigFunc: func() []*relabel.Config { return []*relabel.Config{} }, + } + } + f.mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { return name == "openshift-monitoring" }, + } + } + f.mockPrometheusAlerts.SetActiveAlerts([]k8s.PrometheusAlert{ + { + Labels: map[string]string{ + managementlabels.AlertNameLabel: "HighCPU", + "severity": "critical", + k8s.AlertSourceLabel: k8s.AlertSourcePlatform, + k8s.AlertBackendLabel: "alertmanager", + }, + Annotations: map[string]string{"summary": "CPU is high"}, + State: "firing", + ActiveAt: time.Now(), + }, + }) + f.rebuild() + + w := f.get(t, "/api/v1/alerting/alerts") + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body) + } + resp := decodeAlertsResp(t, w) + if len(resp.Data.Alerts) != 1 { + t.Fatalf("expected 1 alert, got %d", len(resp.Data.Alerts)) + } + alert := resp.Data.Alerts[0] + if alert.AlertRuleId != ruleId { + t.Errorf("expected ruleId %s, got %s", ruleId, alert.AlertRuleId) + } + if alert.AlertComponent == "" { + t.Error("expected non-empty AlertComponent") + } + if alert.AlertLayer == "" { + t.Error("expected non-empty AlertLayer") + } +} + +func TestGetAlerts_EnrichesWithoutAlertingRuleCR(t *testing.T) { + f := newAGFixture(t) + baseRule := monitoringv1.Rule{ + Alert: "KubePodCrashLooping", + Expr: intstr.FromString("rate(kube_pod_restart_total[5m]) > 0"), + Labels: map[string]string{"severity": "warning"}, + } + ruleId := alertrule.GetAlertingRuleId(&baseRule) + + relabeledRule := monitoringv1.Rule{ + Alert: "KubePodCrashLooping", + Expr: intstr.FromString("rate(kube_pod_restart_total[5m]) > 0"), + Labels: map[string]string{ + managementlabels.AlertNameLabel: "KubePodCrashLooping", + "severity": "warning", + k8s.AlertRuleLabelId: ruleId, + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "kube-state-metrics", + }, + } + + f.mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(_ context.Context) []monitoringv1.Rule { return []monitoringv1.Rule{relabeledRule} }, + GetFunc: func(_ context.Context, id string) (monitoringv1.Rule, bool) { + 
if id == ruleId { + return relabeledRule, true + } + return monitoringv1.Rule{}, false + }, + ConfigFunc: func() []*relabel.Config { return []*relabel.Config{} }, + } + } + f.mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { return name == "openshift-monitoring" }, + } + } + f.mockPrometheusAlerts.SetActiveAlerts([]k8s.PrometheusAlert{ + { + Labels: map[string]string{ + managementlabels.AlertNameLabel: "KubePodCrashLooping", + "severity": "warning", + k8s.AlertSourceLabel: k8s.AlertSourcePlatform, + k8s.AlertBackendLabel: "alertmanager", + }, + State: "firing", + ActiveAt: time.Now(), + }, + }) + f.rebuild() + + w := f.get(t, "/api/v1/alerting/alerts") + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body) + } + resp := decodeAlertsResp(t, w) + if len(resp.Data.Alerts) != 1 { + t.Fatalf("expected 1 alert, got %d", len(resp.Data.Alerts)) + } + if resp.Data.Alerts[0].AlertRuleId != ruleId { + t.Errorf("expected ruleId %s, got %s", ruleId, resp.Data.Alerts[0].AlertRuleId) + } +} diff --git a/internal/managementrouter/query_filters.go b/internal/managementrouter/query_filters.go new file mode 100644 index 000000000..f8e3e5e9d --- /dev/null +++ b/internal/managementrouter/query_filters.go @@ -0,0 +1,35 @@ +package managementrouter + +import ( + "fmt" + "net/url" + "strings" +) + +var validStates = map[string]bool{ + "": true, + "pending": true, + "firing": true, + "silenced": true, +} + +// parseStateAndLabels returns the optional state filter and label matches. +// Any query param other than "state" is treated as a label match. +// Returns an error if the state value is not one of the known states. +func parseStateAndLabels(q url.Values) (string, map[string]string, error) { + state := strings.ToLower(strings.TrimSpace(q.Get("state"))) + if !validStates[state] { + return "", nil, fmt.Errorf("invalid state filter %q: must be one of pending, firing, silenced", q.Get("state")) + } + + labels := make(map[string]string) + for key, vals := range q { + if key == "state" { + continue + } + if len(vals) > 0 && strings.TrimSpace(vals[0]) != "" { + labels[strings.TrimSpace(key)] = strings.TrimSpace(vals[0]) + } + } + return state, labels, nil +} diff --git a/internal/managementrouter/router.go b/internal/managementrouter/router.go index 8bb62a765..8888648bb 100644 --- a/internal/managementrouter/router.go +++ b/internal/managementrouter/router.go @@ -40,6 +40,10 @@ func New(managementClient management.Client) *mux.Router { BaseURL: "/api/v1/alerting", BaseRouter: r, }) + // GET /alerts is not yet in the OpenAPI spec; registered manually + // until its branch adds the spec entry and generated bindings. 
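+	// e.g. GET /api/v1/alerting/alerts?state=firing&namespace=my-app; every query parameter other than "state" is treated as a label match.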
+ r.HandleFunc("/api/v1/alerting/alerts", hr.GetAlerts).Methods(http.MethodGet) + return r } diff --git a/pkg/alertcomponent/matcher.go b/pkg/alertcomponent/matcher.go new file mode 100644 index 000000000..8aa6f9227 --- /dev/null +++ b/pkg/alertcomponent/matcher.go @@ -0,0 +1,381 @@ +package alertcomponent + +import ( + "regexp" + + "github.com/prometheus/common/model" + + "github.com/openshift/monitoring-plugin/pkg/managementlabels" +) + +const ( + labelNamespace = "namespace" + labelSeverity = "severity" +) + +func ns(values ...string) LabelsMatcher { + return NewLabelsMatcher(labelNamespace, NewStringValuesMatcher(values...)) +} + +func alertNames(values ...string) LabelsMatcher { + return NewLabelsMatcher(managementlabels.AlertNameLabel, NewStringValuesMatcher(values...)) +} + +func regexAlertNames(regexes ...*regexp.Regexp) LabelsMatcher { + return NewLabelsMatcher(managementlabels.AlertNameLabel, NewRegexValuesMatcher(regexes...)) +} + +func labelValues(key string, values ...string) LabelsMatcher { + return NewLabelsMatcher(key, NewStringValuesMatcher(values...)) +} + +func comp(component string, ms ...LabelsMatcher) componentMatcher { + return componentMatcher{component: component, matchers: ms} +} + +// LabelsMatcher represents a matcher definition for a set of labels. +// It matches if all of the label matchers match the labels. +type LabelsMatcher interface { + Matches(labels model.LabelSet) (match bool, keys []model.LabelName) + Equals(other LabelsMatcher) bool +} + +func NewLabelsMatcher(key string, matcher ValueMatcher) LabelsMatcher { + return labelMatcher{key: key, matcher: matcher} +} + +func NewStringValuesMatcher(keys ...string) ValueMatcher { + return stringMatcher(keys) +} + +func NewRegexValuesMatcher(regexes ...*regexp.Regexp) ValueMatcher { + return regexpMatcher(regexes) +} + +// labelMatcher represents a matcher definition for a label. +type labelMatcher struct { + key string + matcher ValueMatcher +} + +// Matches implements the LabelsMatcher interface. +func (l labelMatcher) Matches(labels model.LabelSet) (bool, []model.LabelName) { + if l.matcher.Matches(string(labels[model.LabelName(l.key)])) { + return true, []model.LabelName{model.LabelName(l.key)} + } + return false, nil +} + +// Equals implements the LabelsMatcher interface. +func (l labelMatcher) Equals(other LabelsMatcher) bool { + ol, ok := other.(labelMatcher) + if !ok { + return false + } + return l.key == ol.key && l.matcher.Equals(ol.matcher) +} + +// ValueMatcher represents a matcher for a specific value. +// +// Multiple implementations are provided for different types of matchers. +type ValueMatcher interface { + Matches(value string) bool + Equals(other ValueMatcher) bool +} + +// stringMatcher is a matcher for a list of strings. +// +// It matches if the value is in the list of strings. +type stringMatcher []string + +func (s stringMatcher) Matches(value string) bool { + for _, v := range s { + if v == value { + return true + } + } + return false +} + +// Equals implements the ValueMatcher interface. +func (s stringMatcher) Equals(other ValueMatcher) bool { + o, ok := other.(stringMatcher) + if !ok { + return false + } + return equalsNoOrder(s, o) +} + +// regexpMatcher is a matcher for a list of regular expressions. +// +// It matches if the value matches any of the regular expressions. 
+type regexpMatcher []*regexp.Regexp + +func (r regexpMatcher) Matches(value string) bool { + for _, re := range r { + if re.MatchString(value) { + return true + } + } + return false +} + +// Equals implements the ValueMatcher interface. +func (r regexpMatcher) Equals(other ValueMatcher) bool { + o, ok := other.(regexpMatcher) + if !ok { + return false + } + s1 := make([]string, 0, len(r)) + for _, re := range r { + s1 = append(s1, re.String()) + } + s2 := make([]string, 0, len(o)) + for _, re := range o { + s2 = append(s2, re.String()) + } + return equalsNoOrder(s1, s2) +} + +func equalsNoOrder(a, b []string) bool { + if len(a) != len(b) { + return false + } + + seen := make(map[string]int, len(a)) + for _, v := range a { + seen[v]++ + } + for _, v := range b { + if seen[v] == 0 { + return false + } + seen[v]-- + } + return true +} + +// componentMatcher represents a matcher definition for a component. +// +// It matches if any of the label matchers match the labels. +type componentMatcher struct { + component string + matchers []LabelsMatcher +} + +// findComponent tries to determine a component for given labels using the provided matchers. +// +// It returns the component and the keys that matched. +// If no match is found, it returns an empty component and nil keys. +func findComponent(compMatchers []componentMatcher, labels model.LabelSet) ( + component string, keys []model.LabelName) { + for _, compMatcher := range compMatchers { + for _, labelsMatcher := range compMatcher.matchers { + if matches, keys := labelsMatcher.Matches(labels); matches { + return compMatcher.component, keys + } + } + } + return "", nil +} + +// componentMatcherFn is a function that tries matching provided labels to a component. +// It returns the layer, component and the keys from the labels that were used for matching. +// If no match is found, it returns an empty layer, component and nil keys. +type componentMatcherFn func(labels model.LabelSet) (layer, comp model.LabelValue, keys []model.LabelName) + +func evalMatcherFns(fns []componentMatcherFn, labels model.LabelSet) ( + layer, comp string, labelsSubset model.LabelSet) { + for _, fn := range fns { + if layer, comp, keys := fn(labels); layer != "" { + return string(layer), string(comp), getLabelsSubset(labels, keys...) + } + } + return "Others", "Others", getLabelsSubset(labels) +} + +// getLabelsSubset returns a subset of the labels with given keys. +func getLabelsSubset(m model.LabelSet, keys ...model.LabelName) model.LabelSet { + keys = append([]model.LabelName{ + model.LabelName(labelNamespace), + model.LabelName(managementlabels.AlertNameLabel), + model.LabelName(labelSeverity), + }, keys...) + return getMapSubset(m, keys...) +} + +// getMapSubset returns a subset of the labels with given keys. 
+func getMapSubset(m model.LabelSet, keys ...model.LabelName) model.LabelSet { + subset := make(model.LabelSet, len(keys)) + for _, key := range keys { + if val, ok := m[key]; ok { + subset[key] = val + } + } + return subset +} + +var ( + nodeAlerts []model.LabelValue = []model.LabelValue{ + "NodeClockNotSynchronising", + "KubeNodeNotReady", + "KubeNodeUnreachable", + "NodeSystemSaturation", + "NodeFilesystemSpaceFillingUp", + "NodeFilesystemAlmostOutOfSpace", + "NodeMemoryMajorPagesFaults", + "NodeNetworkTransmitErrs", + "NodeTextFileCollectorScrapeError", + "NodeFilesystemFilesFillingUp", + "NodeNetworkReceiveErrs", + "NodeClockSkewDetected", + "NodeFilesystemAlmostOutOfFiles", + "NodeWithoutOVNKubeNodePodRunning", + "InfraNodesNeedResizingSRE", + "NodeHighNumberConntrackEntriesUsed", + "NodeMemHigh", + "NodeNetworkInterfaceFlapping", + "NodeWithoutSDNPod", + "NodeCpuHigh", + "CriticalNodeNotReady", + "NodeFileDescriptorLimit", + "MCCPoolAlert", + "MCCDrainError", + "MCDRebootError", + "MCDPivotError", + } + + coreMatchers = []componentMatcher{ + comp("etcd", ns("openshift-etcd", "openshift-etcd-operator")), + comp("kube-apiserver", ns("openshift-kube-apiserver", "openshift-kube-apiserver-operator")), + comp("kube-controller-manager", ns("openshift-kube-controller-manager", "openshift-kube-controller-manager-operator", "kube-system")), + comp("kube-scheduler", ns("openshift-kube-scheduler", "openshift-kube-scheduler-operator")), + comp("machine-approver", ns("openshift-cluster-machine-approver", "openshift-machine-approver-operator")), + comp("machine-config", + ns("openshift-machine-config-operator"), + alertNames( + "HighOverallControlPlaneMemory", + "ExtremelyHighIndividualControlPlaneMemory", + "MissingMachineConfig", + "MCCBootImageUpdateError", + "KubeletHealthState", + "SystemMemoryExceedsReservation", + ), + ), + comp("version", + ns("openshift-cluster-version", "openshift-version-operator"), + alertNames("ClusterNotUpgradeable", "UpdateAvailable"), + ), + comp("dns", ns("openshift-dns", "openshift-dns-operator")), + comp("authentication", ns("openshift-authentication", "openshift-oauth-apiserver", "openshift-authentication-operator")), + comp("cert-manager", ns("openshift-cert-manager", "openshift-cert-manager-operator")), + comp("cloud-controller-manager", ns("openshift-cloud-controller-manager", "openshift-cloud-controller-manager-operator")), + comp("cloud-credential", ns("openshift-cloud-credential-operator")), + comp("cluster-api", ns("openshift-cluster-api", "openshift-cluster-api-operator")), + comp("config-operator", ns("openshift-config-operator")), + comp("kube-storage-version-migrator", ns("openshift-kube-storage-version-migrator", "openshift-kube-storage-version-migrator-operator")), + comp("image-registry", ns("openshift-image-registry", "openshift-image-registry-operator")), + comp("ingress", ns("openshift-ingress", "openshift-route-controller-manager", "openshift-ingress-canary", "openshift-ingress-operator")), + comp("console", ns("openshift-console", "openshift-console-operator")), + comp("insights", ns("openshift-insights", "openshift-insights-operator")), + comp("machine-api", ns("openshift-machine-api", "openshift-machine-api-operator")), + comp("monitoring", ns("openshift-monitoring", "openshift-monitoring-operator")), + comp("network", ns("openshift-network-operator", "openshift-ovn-kubernetes", "openshift-multus", "openshift-network-diagnostics", "openshift-sdn")), + comp("node-tuning", ns("openshift-cluster-node-tuning-operator", 
"openshift-node-tuning-operator")), + comp("openshift-apiserver", ns("openshift-apiserver", "openshift-apiserver-operator")), + comp("openshift-controller-manager", ns("openshift-controller-manager", "openshift-controller-manager-operator")), + comp("openshift-samples", ns("openshift-cluster-samples-operator", "openshift-samples-operator")), + comp("operator-lifecycle-manager", ns("openshift-operator-lifecycle-manager")), + comp("service-ca", ns("openshift-service-ca", "openshift-service-ca-operator")), + comp("storage", ns("openshift-storage", "openshift-cluster-csi-drivers", "openshift-cluster-storage-operator", "openshift-storage-operator")), + comp("vertical-pod-autoscaler", ns("openshift-vertical-pod-autoscaler", "openshift-vertical-pod-autoscaler-operator")), + comp("marketplace", ns("openshift-marketplace", "openshift-marketplace-operator")), + } + + workloadMatchers = []componentMatcher{ + comp("openshift-compliance", ns("openshift-compliance")), + comp("openshift-file-integrity", ns("openshift-file-integrity")), + comp("openshift-logging", ns("openshift-logging")), + comp("openshift-user-workload-monitoring", ns("openshift-user-workload-monitoring")), + comp("openshift-gitops", ns("openshift-gitops", "openshift-gitops-operator")), + comp("openshift-operators", ns("openshift-operators")), + comp("openshift-local-storage", ns("openshift-local-storage")), + comp("quay", labelValues("container", "quay-app", "quay-mirror", "quay-app-upgrade")), + comp("Argo", regexAlertNames(regexp.MustCompile("^Argo"))), + } +) + +var cvoAlerts = []model.LabelValue{"ClusterOperatorDown", "ClusterOperatorDegraded"} + +func cvoAlertsMatcher(labels model.LabelSet) (layer, comp model.LabelValue, keys []model.LabelName) { + for _, v := range cvoAlerts { + if labels[managementlabels.AlertNameLabel] == v { + component := labels["name"] + if component == "" { + component = "version" + } + return "cluster", component, nil + } + } + return "", "", nil +} + +func kubevirtOperatorMatcher(labels model.LabelSet) (layer, comp model.LabelValue, keys []model.LabelName) { + if labels["kubernetes_operator_part_of"] != "kubevirt" { + return "", "", nil + } + if labels["kubernetes_operator_component"] == "cnv-observability" { + return "", "", nil + } + if labels["operator_health_impact"] == "none" && labels["kubernetes_operator_component"] == "kubevirt" { + return "namespace", "OpenShift Virtualization Virtual Machine", []model.LabelName{ + "kubernetes_operator_part_of", + "kubernetes_operator_component", + "operator_health_impact", + } + } + return "cluster", "OpenShift Virtualization Operator", []model.LabelName{ + "kubernetes_operator_part_of", + "kubernetes_operator_component", + "operator_health_impact", + } +} + +func computeMatcher(labels model.LabelSet) (layer, comp model.LabelValue, keys []model.LabelName) { + for _, nodeAlert := range nodeAlerts { + if labels[managementlabels.AlertNameLabel] == nodeAlert { + component := "compute" + return "cluster", model.LabelValue(component), nil + } + } + return "", "", nil +} + +func coreMatcher(labels model.LabelSet) (layer, comp model.LabelValue, keys []model.LabelName) { + // Try matching against core components. + if component, keys := findComponent(coreMatchers, labels); component != "" { + return "cluster", model.LabelValue(component), keys + } + return "", "", nil +} + +func workloadMatcher(labels model.LabelSet) (layer, comp model.LabelValue, keys []model.LabelName) { + // Try matching against workload components. 
+ if component, keys := findComponent(workloadMatchers, labels); component != "" { + return "namespace", model.LabelValue(component), keys + } + return "", "", nil +} + +// DetermineComponent determines the component for a given set of labels. +// It returns the layer and component strings. +func DetermineComponent(labels model.LabelSet) (layer, component string) { + layer, component, _ = evalMatcherFns([]componentMatcherFn{ + cvoAlertsMatcher, + kubevirtOperatorMatcher, + computeMatcher, + coreMatcher, + workloadMatcher, + }, labels) + return layer, component +} diff --git a/pkg/k8s/alerting_health.go b/pkg/k8s/alerting_health.go new file mode 100644 index 000000000..0fdc40880 --- /dev/null +++ b/pkg/k8s/alerting_health.go @@ -0,0 +1,127 @@ +package k8s + +import ( + "context" + "fmt" + "strings" + "sync" + + "gopkg.in/yaml.v2" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/fields" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/cache" +) + +const ( + clusterMonitoringConfigMap = "cluster-monitoring-config" + clusterMonitoringConfigKey = "config.yaml" +) + +type clusterMonitoringConfig struct { + EnableUserWorkload bool `yaml:"enableUserWorkload"` +} + +// clusterMonitoringConfigManager watches the cluster-monitoring-config ConfigMap +// via an informer and caches the parsed enableUserWorkload value so that +// AlertingHealth never needs a live API call. +type clusterMonitoringConfigManager struct { + informer cache.SharedIndexInformer + + mu sync.RWMutex + enabled bool + err error +} + +func newClusterMonitoringConfigManager(ctx context.Context, clientset *kubernetes.Clientset) (*clusterMonitoringConfigManager, error) { + informer := cache.NewSharedIndexInformer( + cache.NewListWatchFromClient( + clientset.CoreV1().RESTClient(), + "configmaps", + ClusterMonitoringNamespace, + fields.OneTermEqualSelector("metadata.name", clusterMonitoringConfigMap), + ), + &corev1.ConfigMap{}, + 0, + cache.Indexers{}, + ) + + m := &clusterMonitoringConfigManager{ + informer: informer, + } + + _, err := informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + cm, ok := obj.(*corev1.ConfigMap) + if !ok { + return + } + m.handleUpdate(cm) + }, + UpdateFunc: func(_, newObj interface{}) { + cm, ok := newObj.(*corev1.ConfigMap) + if !ok { + return + } + m.handleUpdate(cm) + }, + DeleteFunc: func(_ interface{}) { + m.mu.Lock() + defer m.mu.Unlock() + m.enabled = false + m.err = nil + }, + }) + if err != nil { + return nil, fmt.Errorf("failed to add event handler to cluster-monitoring-config informer: %w", err) + } + + go informer.Run(ctx.Done()) + + if !cache.WaitForNamedCacheSync("ClusterMonitoringConfig informer", ctx.Done(), informer.HasSynced) { + return nil, fmt.Errorf("failed to sync ClusterMonitoringConfig informer") + } + + return m, nil +} + +func (m *clusterMonitoringConfigManager) handleUpdate(cm *corev1.ConfigMap) { + m.mu.Lock() + defer m.mu.Unlock() + + raw, ok := cm.Data[clusterMonitoringConfigKey] + if !ok || strings.TrimSpace(raw) == "" { + m.enabled = false + m.err = nil + return + } + + var cfg clusterMonitoringConfig + if err := yaml.Unmarshal([]byte(raw), &cfg); err != nil { + m.enabled = false + m.err = fmt.Errorf("parse cluster monitoring config.yaml: %w", err) + return + } + + m.enabled = cfg.EnableUserWorkload + m.err = nil +} + +func (m *clusterMonitoringConfigManager) userWorkloadEnabled() (bool, error) { + m.mu.RLock() + defer m.mu.RUnlock() + return m.enabled, m.err +} + +// AlertingHealth returns alerting route health and UWM 
enablement status. +func (c *client) AlertingHealth(ctx context.Context) (AlertingHealth, error) { + health := c.prometheusAlerts.alertingHealth(ctx) + + enabled, err := c.clusterMonitoringConfig.userWorkloadEnabled() + if err != nil { + return health, fmt.Errorf("failed to determine user workload enablement: %w", err) + } + health.UserWorkloadEnabled = enabled + + return health, nil +} diff --git a/pkg/k8s/client.go b/pkg/k8s/client.go index 6370270ff..e16be6dd2 100644 --- a/pkg/k8s/client.go +++ b/pkg/k8s/client.go @@ -5,6 +5,7 @@ import ( "fmt" osmv1client "github.com/openshift/client-go/monitoring/clientset/versioned" + routeclient "github.com/openshift/client-go/route/clientset/versioned" monitoringv1client "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned" "github.com/sirupsen/logrus" "k8s.io/client-go/kubernetes" @@ -21,11 +22,14 @@ type client struct { osmv1clientset *osmv1client.Clientset config *rest.Config + prometheusAlerts *prometheusAlerts + prometheusRuleManager *prometheusRuleManager alertRelabelConfigManager *alertRelabelConfigManager alertingRuleManager *alertingRuleManager namespaceManager *namespaceManager relabeledRulesManager *relabeledRulesManager + clusterMonitoringConfig *clusterMonitoringConfigManager } func NewClient(ctx context.Context, config *rest.Config) (Client, error) { @@ -44,6 +48,11 @@ func NewClient(ctx context.Context, config *rest.Config) (Client, error) { return nil, fmt.Errorf("failed to create osmv1 clientset: %w", err) } + routeClientset, err := routeclient.NewForConfig(config) + if err != nil { + return nil, fmt.Errorf("failed to create route clientset: %w", err) + } + c := &client{ clientset: clientset, monitoringv1clientset: monitoringv1clientset, @@ -56,6 +65,8 @@ func NewClient(ctx context.Context, config *rest.Config) (Client, error) { return nil, fmt.Errorf("failed to create PrometheusRule manager: %w", err) } + c.prometheusAlerts = newPrometheusAlerts(routeClientset, clientset.CoreV1(), config, c.prometheusRuleManager) + c.alertRelabelConfigManager, err = newAlertRelabelConfigManager(ctx, osmv1clientset, config) if err != nil { return nil, fmt.Errorf("failed to create alert relabel config manager: %w", err) @@ -71,6 +82,11 @@ func NewClient(ctx context.Context, config *rest.Config) (Client, error) { return nil, fmt.Errorf("failed to create namespace manager: %w", err) } + c.clusterMonitoringConfig, err = newClusterMonitoringConfigManager(ctx, clientset) + if err != nil { + return nil, fmt.Errorf("failed to create cluster monitoring config manager: %w", err) + } + c.relabeledRulesManager, err = newRelabeledRulesManager(ctx, c.namespaceManager, c.alertRelabelConfigManager, monitoringv1clientset, clientset) if err != nil { return nil, fmt.Errorf("failed to create relabeled rules config manager: %w", err) @@ -87,6 +103,10 @@ func (c *client) TestConnection(_ context.Context) error { return nil } +func (c *client) PrometheusAlerts() PrometheusAlertsInterface { + return c.prometheusAlerts +} + func (c *client) PrometheusRules() PrometheusRuleInterface { return c.prometheusRuleManager } diff --git a/pkg/k8s/const.go b/pkg/k8s/const.go index 699dc452e..ff9eaf4c5 100644 --- a/pkg/k8s/const.go +++ b/pkg/k8s/const.go @@ -3,4 +3,29 @@ package k8s const ( ClusterMonitoringNamespace = "openshift-monitoring" UserWorkloadMonitoringNamespace = "openshift-user-workload-monitoring" + + PlatformRouteName = "prometheus-k8s" + PlatformAlertmanagerRouteName = "alertmanager-main" + UserWorkloadRouteName = "prometheus-user-workload" + 
UserWorkloadAlertmanagerRouteName = "alertmanager-user-workload" + PrometheusAlertsPath = "/v1/alerts" + PrometheusRulesPath = "/v1/rules" + AlertmanagerAlertsPath = "/api/v2/alerts" + UserWorkloadAlertmanagerPort = 9095 + UserWorkloadPrometheusServiceName = "prometheus-user-workload-web" + UserWorkloadPrometheusPort = 9090 + + ThanosQuerierServiceName = "thanos-querier" + DefaultThanosQuerierTenancyRulesPort = 9093 + ThanosQuerierTenancyAlertsPath = "/api/v1/alerts" + ThanosQuerierTenancyRulesPath = "/api/v1/rules" + ServiceCAPath = "/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt" + + AlertSourceLabel = "openshift_io_alert_source" + AlertSourcePlatform = "platform" + AlertSourceUser = "user" + AlertBackendLabel = "openshift_io_alert_backend" + AlertBackendAM = "alertmanager" + AlertBackendProm = "prometheus" + AlertBackendThanos = "thanos" ) diff --git a/pkg/k8s/prometheus_alerts.go b/pkg/k8s/prometheus_alerts.go new file mode 100644 index 000000000..155c94bbe --- /dev/null +++ b/pkg/k8s/prometheus_alerts.go @@ -0,0 +1,941 @@ +package k8s + +import ( + "context" + "crypto/tls" + "crypto/x509" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "sync" + "time" + + routev1 "github.com/openshift/api/route/v1" + routeclient "github.com/openshift/client-go/route/clientset/versioned" + "github.com/sirupsen/logrus" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + corev1client "k8s.io/client-go/kubernetes/typed/core/v1" + "k8s.io/client-go/rest" +) + +var ( + prometheusLog = logrus.WithField("module", "k8s-prometheus") +) + +const ( + namespaceCacheTTL = 30 * time.Second + serviceHealthTimeout = 5 * time.Second + serviceRequestTimeout = 10 * time.Second + maxTenancyProbeTargets = 3 +) + +type namespaceCache struct { + mu sync.Mutex + expiresAt time.Time + ttl time.Duration + value []string +} + +func newNamespaceCache(ttl time.Duration) *namespaceCache { + return &namespaceCache{ttl: ttl} +} + +func (c *namespaceCache) get() ([]string, bool) { + if c == nil { + return nil, false + } + + c.mu.Lock() + defer c.mu.Unlock() + + if c.expiresAt.IsZero() || time.Now().After(c.expiresAt) { + return nil, false + } + return copyStringSlice(c.value), true +} + +func (c *namespaceCache) set(namespaces []string) { + if c == nil { + return + } + + c.mu.Lock() + defer c.mu.Unlock() + + c.value = copyStringSlice(namespaces) + c.expiresAt = time.Now().Add(c.ttl) +} + +type prometheusAlerts struct { + routeClient routeclient.Interface + coreClient corev1client.CoreV1Interface + config *rest.Config + ruleManager PrometheusRuleInterface + nsCache *namespaceCache +} + +// GetAlertsRequest holds parameters for filtering alerts +type GetAlertsRequest struct { + // Labels filters alerts by labels + Labels map[string]string + // State filters alerts by state: "firing", "pending", "silenced", or "" for all states + State string +} + +type PrometheusAlert struct { + Labels map[string]string `json:"labels"` + Annotations map[string]string `json:"annotations"` + State string `json:"state"` + ActiveAt time.Time `json:"activeAt"` + Value string `json:"value"` + + AlertRuleId string `json:"alertRuleId,omitempty"` + AlertComponent string `json:"alertComponent,omitempty"` + AlertLayer string `json:"alertLayer,omitempty"` +} + +type prometheusAlertsData struct { + Alerts []PrometheusAlert `json:"alerts"` +} + +type prometheusAlertsResponse struct { + Status string `json:"status"` + Data prometheusAlertsData `json:"data"` +} + +type 
prometheusRulesData struct { + Groups []PrometheusRuleGroup `json:"groups"` +} + +type prometheusRulesResponse struct { + Status string `json:"status"` + Data prometheusRulesData `json:"data"` +} + +type alertmanagerAlertStatus struct { + State string `json:"state"` +} + +type alertmanagerAlert struct { + Labels map[string]string `json:"labels"` + Annotations map[string]string `json:"annotations"` + StartsAt time.Time `json:"startsAt"` + EndsAt time.Time `json:"endsAt"` + GeneratorURL string `json:"generatorURL"` + Status alertmanagerAlertStatus `json:"status"` +} + +func newPrometheusAlerts(routeClient routeclient.Interface, coreClient corev1client.CoreV1Interface, config *rest.Config, ruleManager PrometheusRuleInterface) *prometheusAlerts { + return &prometheusAlerts{ + routeClient: routeClient, + coreClient: coreClient, + config: config, + ruleManager: ruleManager, + nsCache: newNamespaceCache(namespaceCacheTTL), + } +} + +func (pa *prometheusAlerts) GetAlerts(ctx context.Context, req GetAlertsRequest) ([]PrometheusAlert, error) { + platformAlerts, err := pa.getAlertsForSource(ctx, ClusterMonitoringNamespace, PlatformRouteName, PlatformAlertmanagerRouteName, AlertSourcePlatform) + if err != nil { + return nil, err + } + + userAlerts, err := pa.getUserWorkloadAlerts(ctx, req) + if err != nil { + prometheusLog.Warnf("failed to get user workload alerts: %v", err) + } + + mergedAlerts := append(platformAlerts, userAlerts...) + + out := make([]PrometheusAlert, 0, len(mergedAlerts)) + for _, a := range mergedAlerts { + // Filter alerts based on state if provided + if !matchesAlertState(req.State, a.State) { + continue + } + + // Filter alerts based on labels if provided + if !labelsMatch(&req, &a) { + continue + } + + out = append(out, a) + } + return out, nil +} + +func matchesAlertState(requestedState string, alertState string) bool { + if requestedState == "" { + return true + } + if requestedState == "firing" { + return alertState == "firing" || alertState == "silenced" + } + return alertState == requestedState +} + +func (pa *prometheusAlerts) GetRules(ctx context.Context, req GetRulesRequest) ([]PrometheusRuleGroup, error) { + platformRules, err := pa.getRulesViaProxy(ctx, ClusterMonitoringNamespace, PlatformRouteName, AlertSourcePlatform) + if err != nil { + return nil, err + } + + userRules, err := pa.getUserWorkloadRules(ctx, req) + if err != nil { + prometheusLog.Warnf("failed to get user workload rules: %v", err) + } + + groups := append(platformRules, userRules...) 
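+	// Platform groups come first; user workload groups are appended best effort (failures above are only logged).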
+ + matchers, err := compileRuleLabelMatchers(req) + if err != nil { + return nil, err + } + if len(matchers) == 0 { + return groups, nil + } + + return filterRuleGroupsByLabelMatchers(groups, matchers), nil +} + +func (pa *prometheusAlerts) alertingHealth(ctx context.Context) AlertingHealth { + userPrometheus := pa.routeHealth(ctx, UserWorkloadMonitoringNamespace, UserWorkloadRouteName, PrometheusRulesPath) + if userPrometheus.Status != RouteReachable { + if ok := pa.thanosTenancyReachable(ctx, ThanosQuerierTenancyAlertsPath); ok { + userPrometheus.FallbackReachable = true + } + } + + userAlertmanager := pa.routeHealth(ctx, UserWorkloadMonitoringNamespace, UserWorkloadAlertmanagerRouteName, AlertmanagerAlertsPath) + if userAlertmanager.Status != RouteReachable { + if ok := pa.serviceReachable(ctx, UserWorkloadMonitoringNamespace, UserWorkloadAlertmanagerRouteName, UserWorkloadAlertmanagerPort, AlertmanagerAlertsPath); ok { + userAlertmanager.FallbackReachable = true + } + } + + platformStack := pa.stackHealth(ctx, ClusterMonitoringNamespace, PlatformRouteName, PlatformAlertmanagerRouteName) + userWorkloadStack := AlertingStackHealth{ + Prometheus: userPrometheus, + Alertmanager: userAlertmanager, + } + + return AlertingHealth{ + Platform: &platformStack, + UserWorkload: &userWorkloadStack, + } +} + +func (pa *prometheusAlerts) stackHealth(ctx context.Context, namespace string, promRouteName string, amRouteName string) AlertingStackHealth { + return AlertingStackHealth{ + Prometheus: pa.routeHealth(ctx, namespace, promRouteName, PrometheusRulesPath), + Alertmanager: pa.routeHealth(ctx, namespace, amRouteName, AlertmanagerAlertsPath), + } +} + +func (pa *prometheusAlerts) routeHealth(ctx context.Context, namespace string, routeName string, path string) AlertingRouteHealth { + health := AlertingRouteHealth{ + Name: routeName, + Namespace: namespace, + } + + if pa.routeClient == nil { + health.Error = "route client is not configured" + return health + } + + route, err := pa.routeClient.RouteV1().Routes(namespace).Get(ctx, routeName, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + health.Status = RouteNotFound + health.Error = err.Error() + return health + } + health.Error = err.Error() + return health + } + + url := buildRouteURL(route.Spec.Host, route.Spec.Path, path) + client, err := pa.createHTTPClient() + if err != nil { + health.Status = RouteUnreachable + health.Error = err.Error() + return health + } + + if _, err := pa.executeRequest(ctx, client, url); err != nil { + health.Status = RouteUnreachable + health.Error = err.Error() + return health + } + + health.Status = RouteReachable + return health +} + +func (pa *prometheusAlerts) getAlertsForSource(ctx context.Context, namespace string, promRouteName string, amRouteName string, source string) ([]PrometheusAlert, error) { + amAlerts, amErr := pa.getAlertmanagerAlerts(ctx, namespace, amRouteName, source) + promAlerts, promErr := pa.getAlertsViaProxy(ctx, namespace, promRouteName, source) + + if amErr == nil { + pending := filterAlertsByState(promAlerts, "pending") + return append(amAlerts, pending...), nil + } + + if promErr != nil { + return nil, promErr + } + + return promAlerts, nil +} + +func (pa *prometheusAlerts) getUserWorkloadAlerts(ctx context.Context, req GetAlertsRequest) ([]PrometheusAlert, error) { + if shouldPreferUserAlertmanager(req.State) { + alerts, err := pa.getUserWorkloadAlertsViaAlertmanager(ctx) + if err == nil { + return alerts, nil + } + prometheusLog.Warnf("failed to get user workload 
alerts via alertmanager: %v", err) + } + + namespace := namespaceFromLabels(req.Labels) + if namespace != "" { + alerts, err := pa.getAlertsViaThanosTenancy(ctx, namespace, AlertSourceUser) + if err == nil { + return alerts, nil + } + prometheusLog.Warnf("failed to get user workload alerts via thanos tenancy: %v", err) + } + + userNamespaces := pa.userRuleNamespaces(ctx) + if len(userNamespaces) > 0 { + alerts, err := pa.getAlertsViaThanosTenancyNamespaces(ctx, userNamespaces, AlertSourceUser) + if err == nil { + return alerts, nil + } + prometheusLog.Warnf("failed to get user workload alerts via thanos tenancy namespaces: %v", err) + } + + return pa.getAlertsForSource(ctx, UserWorkloadMonitoringNamespace, UserWorkloadRouteName, UserWorkloadAlertmanagerRouteName, AlertSourceUser) +} + +func shouldPreferUserAlertmanager(state string) bool { + return state == "firing" || state == "silenced" +} + +func (pa *prometheusAlerts) getUserWorkloadAlertsViaAlertmanager(ctx context.Context) ([]PrometheusAlert, error) { + alerts, err := pa.getAlertmanagerAlerts(ctx, UserWorkloadMonitoringNamespace, UserWorkloadAlertmanagerRouteName, AlertSourceUser) + if err != nil { + alerts, err = pa.getAlertmanagerAlertsViaService(ctx, UserWorkloadMonitoringNamespace, UserWorkloadAlertmanagerRouteName, UserWorkloadAlertmanagerPort, AlertSourceUser) + if err != nil { + return nil, err + } + } + + pending, err := pa.getAlertsViaProxy(ctx, UserWorkloadMonitoringNamespace, UserWorkloadRouteName, AlertSourceUser) + if err != nil { + pending, err = pa.getPrometheusAlertsViaService(ctx, UserWorkloadMonitoringNamespace, UserWorkloadPrometheusServiceName, UserWorkloadPrometheusPort, AlertSourceUser) + if err != nil { + return alerts, nil + } + } + + return append(alerts, filterAlertsByState(pending, "pending")...), nil +} + +func (pa *prometheusAlerts) getPrometheusAlertsViaService(ctx context.Context, namespace string, serviceName string, port int32, source string) ([]PrometheusAlert, error) { + if _, hasDeadline := ctx.Deadline(); !hasDeadline { + timeoutCtx, cancel := context.WithTimeout(ctx, serviceRequestTimeout) + defer cancel() + ctx = timeoutCtx + } + + raw, err := pa.getServiceResponse(ctx, namespace, serviceName, port, PrometheusAlertsPath) + if err != nil { + return nil, err + } + + var alertsResp prometheusAlertsResponse + if err := json.Unmarshal(raw, &alertsResp); err != nil { + return nil, fmt.Errorf("decode prometheus response: %w", err) + } + + if alertsResp.Status != "success" { + return nil, fmt.Errorf("prometheus API returned non-success status: %s", alertsResp.Status) + } + + applyAlertMetadata(alertsResp.Data.Alerts, source, AlertBackendProm) + return alertsResp.Data.Alerts, nil +} + +func (pa *prometheusAlerts) getAlertmanagerAlertsViaService(ctx context.Context, namespace string, serviceName string, port int32, source string) ([]PrometheusAlert, error) { + raw, err := pa.getServiceResponse(ctx, namespace, serviceName, port, AlertmanagerAlertsPath) + if err != nil { + return nil, err + } + + converted, err := parseAlertmanagerResponse(raw) + if err != nil { + return nil, err + } + + applyAlertMetadata(converted, source, AlertBackendAM) + if len(converted) == 0 { + return []PrometheusAlert{}, nil + } + return converted, nil +} + +// parseAlertmanagerResponse unmarshals a raw Alertmanager GET /api/v2/alerts +// response and converts it to PrometheusAlert structs. No routing labels are +// added — callers that need them should call applyAlertMetadata. 
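+// Alerts in any state other than "active" or "suppressed" (for example "unprocessed") are dropped.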
+func parseAlertmanagerResponse(raw []byte) ([]PrometheusAlert, error) { + var amAlerts []alertmanagerAlert + if err := json.Unmarshal(raw, &amAlerts); err != nil { + return nil, fmt.Errorf("decode alertmanager response: %w", err) + } + + converted := make([]PrometheusAlert, 0, len(amAlerts)) + for _, alert := range amAlerts { + state := mapAlertmanagerState(alert.Status.State) + if state == "" { + continue + } + converted = append(converted, PrometheusAlert{ + Labels: alert.Labels, + Annotations: alert.Annotations, + State: state, + ActiveAt: alert.StartsAt, + }) + } + return converted, nil +} + +func (pa *prometheusAlerts) serviceReachable(ctx context.Context, namespace string, serviceName string, port int32, path string) bool { + healthCtx, cancel := context.WithTimeout(ctx, serviceHealthTimeout) + defer cancel() + + _, err := pa.getServiceResponse(healthCtx, namespace, serviceName, port, path) + return err == nil +} + +func (pa *prometheusAlerts) getServiceResponse(ctx context.Context, namespace string, serviceName string, port int32, path string) ([]byte, error) { + baseURL := fmt.Sprintf("https://%s.%s.svc:%d", serviceName, namespace, port) + requestURL := fmt.Sprintf("%s%s", baseURL, path) + + client, err := pa.createHTTPClient() + if err != nil { + return nil, err + } + + return pa.executeRequest(ctx, client, requestURL) +} + +func (pa *prometheusAlerts) thanosTenancyReachable(ctx context.Context, path string) bool { + namespaces := pa.userRuleNamespaces(ctx) + if len(namespaces) == 0 { + return false + } + + limit := maxTenancyProbeTargets + if limit <= 0 || limit > len(namespaces) { + limit = len(namespaces) + } + + for i := 0; i < limit; i++ { + healthCtx, cancel := context.WithTimeout(ctx, serviceHealthTimeout) + _, err := pa.getThanosTenancyResponse(healthCtx, path, namespaces[i]) + cancel() + + if err == nil { + return true + } + if isTenancyExpectedError(err) { + continue + } + return false + } + + return false +} + +// isTenancyExpectedError returns true for errors that are expected when probing +// Thanos tenancy endpoints across user namespaces — e.g. the namespace has no +// rules (404), the SA lacks access (401/403), or the namespace is not yet +// instrumented. These are skipped; only a network/server error aborts the probe. 
+func isTenancyExpectedError(err error) bool { + if err == nil { + return false + } + msg := strings.ToLower(err.Error()) + return strings.Contains(msg, "status 401") || + strings.Contains(msg, "status 403") || + strings.Contains(msg, "status 404") || + strings.Contains(msg, "unauthorized") || + strings.Contains(msg, "forbidden") || + strings.Contains(msg, "not found") +} + +func (pa *prometheusAlerts) getAlertsViaProxy(ctx context.Context, namespace string, routeName string, source string) ([]PrometheusAlert, error) { + raw, err := pa.getPrometheusResponse(ctx, namespace, routeName, PrometheusAlertsPath) + if err != nil { + return nil, err + } + + var alertsResp prometheusAlertsResponse + if err := json.Unmarshal(raw, &alertsResp); err != nil { + return nil, fmt.Errorf("decode prometheus response: %w", err) + } + + if alertsResp.Status != "success" { + return nil, fmt.Errorf("prometheus API returned non-success status: %s", alertsResp.Status) + } + + applyAlertMetadata(alertsResp.Data.Alerts, source, AlertBackendProm) + return alertsResp.Data.Alerts, nil +} + +func (pa *prometheusAlerts) getAlertsViaThanosTenancy(ctx context.Context, namespace string, source string) ([]PrometheusAlert, error) { + raw, err := pa.getThanosTenancyResponse(ctx, ThanosQuerierTenancyAlertsPath, namespace) + if err != nil { + return nil, err + } + + var alertsResp prometheusAlertsResponse + if err := json.Unmarshal(raw, &alertsResp); err != nil { + return nil, fmt.Errorf("decode thanos response: %w", err) + } + + if alertsResp.Status != "success" { + return nil, fmt.Errorf("thanos API returned non-success status: %s", alertsResp.Status) + } + + applyAlertMetadata(alertsResp.Data.Alerts, source, AlertBackendThanos) + return alertsResp.Data.Alerts, nil +} + +func (pa *prometheusAlerts) getAlertmanagerAlerts(ctx context.Context, namespace string, routeName string, source string) ([]PrometheusAlert, error) { + raw, err := pa.getPrometheusResponse(ctx, namespace, routeName, AlertmanagerAlertsPath) + if err != nil { + return nil, err + } + + converted, err := parseAlertmanagerResponse(raw) + if err != nil { + return nil, err + } + + applyAlertMetadata(converted, source, AlertBackendAM) + if len(converted) == 0 { + return []PrometheusAlert{}, nil + } + return converted, nil +} + +func (pa *prometheusAlerts) getUserWorkloadRules(ctx context.Context, req GetRulesRequest) ([]PrometheusRuleGroup, error) { + namespace := namespaceFromLabels(req.Labels) + if namespace != "" { + rules, err := pa.getRulesViaThanosTenancy(ctx, namespace, AlertSourceUser) + if err == nil { + return rules, nil + } + prometheusLog.Warnf("failed to get user workload rules via thanos tenancy: %v", err) + } + + userNamespaces := pa.userRuleNamespaces(ctx) + if len(userNamespaces) > 0 { + groups, err := pa.getRulesViaThanosTenancyNamespaces(ctx, userNamespaces, AlertSourceUser) + if err == nil { + return groups, nil + } + prometheusLog.Warnf("failed to get user workload rules via thanos tenancy namespaces: %v", err) + } + + return pa.getRulesViaProxy(ctx, UserWorkloadMonitoringNamespace, UserWorkloadRouteName, AlertSourceUser) +} + +func (pa *prometheusAlerts) userRuleNamespaces(ctx context.Context) []string { + if cached, ok := pa.nsCache.get(); ok { + return cached + } + + if pa.ruleManager == nil { + namespaces := pa.allNonPlatformNamespaces(ctx) + pa.nsCache.set(namespaces) + return namespaces + } + + prometheusRules, err := pa.ruleManager.List() + if err != nil { + prometheusLog.WithError(err).Warn("failed to list PrometheusRules for user 
namespace discovery") + namespaces := pa.allNonPlatformNamespaces(ctx) + pa.nsCache.set(namespaces) + return namespaces + } + + namespaces := map[string]struct{}{} + for _, pr := range prometheusRules { + if pr.Namespace == "" { + continue + } + if pr.Namespace == ClusterMonitoringNamespace || pr.Namespace == UserWorkloadMonitoringNamespace { + continue + } + namespaces[pr.Namespace] = struct{}{} + } + + out := make([]string, 0, len(namespaces)) + for ns := range namespaces { + out = append(out, ns) + } + pa.nsCache.set(out) + return out +} + +func (pa *prometheusAlerts) allNonPlatformNamespaces(ctx context.Context) []string { + if pa.coreClient == nil { + return nil + } + + namespaceList, err := pa.coreClient.Namespaces().List(ctx, metav1.ListOptions{}) + if err != nil { + prometheusLog.WithError(err).Warn("failed to list namespaces for user namespace discovery") + return nil + } + + out := make([]string, 0, len(namespaceList.Items)) + for _, ns := range namespaceList.Items { + if ns.Name == ClusterMonitoringNamespace || ns.Name == UserWorkloadMonitoringNamespace { + continue + } + out = append(out, ns.Name) + } + return out +} + +// fanOutThanosTenancy calls fetch for each namespace, accumulates results, and +// returns combined results (or the last error if nothing succeeded). +func fanOutThanosTenancy[T any](namespaces []string, fetch func(string) ([]T, error)) ([]T, error) { + var out []T + var lastErr error + for _, namespace := range namespaces { + results, err := fetch(namespace) + if err != nil { + lastErr = err + continue + } + out = append(out, results...) + } + if len(out) > 0 { + return out, nil + } + return out, lastErr +} + +func (pa *prometheusAlerts) getAlertsViaThanosTenancyNamespaces(ctx context.Context, namespaces []string, source string) ([]PrometheusAlert, error) { + return fanOutThanosTenancy(namespaces, func(ns string) ([]PrometheusAlert, error) { + return pa.getAlertsViaThanosTenancy(ctx, ns, source) + }) +} + +func (pa *prometheusAlerts) getRulesViaThanosTenancyNamespaces(ctx context.Context, namespaces []string, source string) ([]PrometheusRuleGroup, error) { + return fanOutThanosTenancy(namespaces, func(ns string) ([]PrometheusRuleGroup, error) { + return pa.getRulesViaThanosTenancy(ctx, ns, source) + }) +} + +func (pa *prometheusAlerts) getRulesViaProxy(ctx context.Context, namespace string, routeName string, source string) ([]PrometheusRuleGroup, error) { + raw, err := pa.getPrometheusResponse(ctx, namespace, routeName, PrometheusRulesPath) + if err != nil { + return nil, err + } + + var rulesResp prometheusRulesResponse + if err := json.Unmarshal(raw, &rulesResp); err != nil { + return nil, fmt.Errorf("decode prometheus response: %w", err) + } + + if rulesResp.Status != "success" { + return nil, fmt.Errorf("prometheus API returned non-success status: %s", rulesResp.Status) + } + + applyRuleSource(rulesResp.Data.Groups, source) + return rulesResp.Data.Groups, nil +} + +func (pa *prometheusAlerts) getRulesViaThanosTenancy(ctx context.Context, namespace string, source string) ([]PrometheusRuleGroup, error) { + raw, err := pa.getThanosTenancyResponse(ctx, ThanosQuerierTenancyRulesPath, namespace) + if err != nil { + return nil, err + } + + var rulesResp prometheusRulesResponse + if err := json.Unmarshal(raw, &rulesResp); err != nil { + return nil, fmt.Errorf("decode thanos response: %w", err) + } + + if rulesResp.Status != "success" { + return nil, fmt.Errorf("thanos API returned non-success status: %s", rulesResp.Status) + } + + 
applyRuleSource(rulesResp.Data.Groups, source) + return rulesResp.Data.Groups, nil +} + +func (pa *prometheusAlerts) getPrometheusResponse(ctx context.Context, namespace string, routeName string, path string) ([]byte, error) { + url, err := pa.buildPrometheusURL(ctx, namespace, routeName, path) + if err != nil { + return nil, err + } + client, err := pa.createHTTPClient() + if err != nil { + return nil, err + } + + return pa.executeRequest(ctx, client, url) +} + +func (pa *prometheusAlerts) getThanosTenancyResponse(ctx context.Context, path string, namespace string) ([]byte, error) { + if namespace == "" { + return nil, fmt.Errorf("namespace is required for thanos tenancy requests") + } + + baseURL := fmt.Sprintf("https://%s.%s.svc:%d", ThanosQuerierServiceName, ClusterMonitoringNamespace, DefaultThanosQuerierTenancyRulesPort) + requestURL := fmt.Sprintf("%s%s?namespace=%s", baseURL, path, url.QueryEscape(namespace)) + + client, err := pa.createHTTPClient() + if err != nil { + return nil, err + } + + return pa.executeRequest(ctx, client, requestURL) +} + +func (pa *prometheusAlerts) buildPrometheusURL(ctx context.Context, namespace string, routeName string, path string) (string, error) { + route, err := pa.fetchPrometheusRoute(ctx, namespace, routeName) + if err != nil { + return "", err + } + + return buildRouteURL(route.Spec.Host, route.Spec.Path, path), nil +} + +func (pa *prometheusAlerts) fetchPrometheusRoute(ctx context.Context, namespace string, routeName string) (*routev1.Route, error) { + if pa.routeClient == nil { + return nil, fmt.Errorf("route client is not configured") + } + + route, err := pa.routeClient.RouteV1().Routes(namespace).Get(ctx, routeName, metav1.GetOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to get prometheus route: %w", err) + } + + return route, nil +} + +func applyAlertMetadata(alerts []PrometheusAlert, source, backend string) { + for i := range alerts { + if alerts[i].Labels == nil { + alerts[i].Labels = map[string]string{} + } + alerts[i].Labels[AlertSourceLabel] = source + alerts[i].Labels[AlertBackendLabel] = backend + } +} + +func applyRuleSource(groups []PrometheusRuleGroup, source string) { + for gi := range groups { + for ri := range groups[gi].Rules { + rule := &groups[gi].Rules[ri] + if rule.Labels == nil { + rule.Labels = map[string]string{} + } + rule.Labels[AlertSourceLabel] = source + for ai := range rule.Alerts { + if rule.Alerts[ai].Labels == nil { + rule.Alerts[ai].Labels = map[string]string{} + } + rule.Alerts[ai].Labels[AlertSourceLabel] = source + } + } + } +} + +func filterAlertsByState(alerts []PrometheusAlert, state string) []PrometheusAlert { + out := make([]PrometheusAlert, 0, len(alerts)) + for _, alert := range alerts { + if alert.State == state { + out = append(out, alert) + } + } + return out +} + +func mapAlertmanagerState(state string) string { + if state == "active" { + return "firing" + } + if state == "suppressed" { + return "silenced" + } + return "" +} + +func buildRouteURL(host string, routePath string, requestPath string) string { + basePath := strings.TrimSuffix(routePath, "/") + if basePath == "" { + return fmt.Sprintf("https://%s%s", host, requestPath) + } + if requestPath == basePath || strings.HasPrefix(requestPath, basePath+"/") { + return fmt.Sprintf("https://%s%s", host, requestPath) + } + return fmt.Sprintf("https://%s%s%s", host, basePath, requestPath) +} + +func namespaceFromLabels(labels map[string]string) string { + if labels == nil { + return "" + } + return 
strings.TrimSpace(labels["namespace"]) +} + +func (pa *prometheusAlerts) createHTTPClient() (*http.Client, error) { + tlsConfig, err := pa.buildTLSConfig() + if err != nil { + return nil, err + } + + return &http.Client{ + Transport: &http.Transport{ + TLSClientConfig: tlsConfig, + }, + }, nil +} + +func (pa *prometheusAlerts) buildTLSConfig() (*tls.Config, error) { + caCertPool, err := pa.loadCACertPool() + if err != nil { + return nil, err + } + + return &tls.Config{ + MinVersion: tls.VersionTLS12, + RootCAs: caCertPool, + }, nil +} + +func (pa *prometheusAlerts) loadCACertPool() (*x509.CertPool, error) { + caCertPool, err := x509.SystemCertPool() + if err != nil { + caCertPool = x509.NewCertPool() + } + + if len(pa.config.CAData) > 0 { + caCertPool.AppendCertsFromPEM(pa.config.CAData) + return caCertPool, nil + } + + if pa.config.CAFile != "" { + caCert, err := os.ReadFile(pa.config.CAFile) + if err != nil { + return nil, fmt.Errorf("read CA cert file: %w", err) + } + caCertPool.AppendCertsFromPEM(caCert) + } + + // OpenShift service CA bundle for in-cluster service certs. + if serviceCA, err := os.ReadFile(ServiceCAPath); err == nil { + caCertPool.AppendCertsFromPEM(serviceCA) + } + + return caCertPool, nil +} + +func copyStringSlice(in []string) []string { + if len(in) == 0 { + return []string{} + } + + out := make([]string, len(in)) + copy(out, in) + return out +} + +func (pa *prometheusAlerts) executeRequest(ctx context.Context, client *http.Client, url string) ([]byte, error) { + req, err := pa.createAuthenticatedRequest(ctx, url) + if err != nil { + return nil, err + } + + return pa.performRequest(client, req) +} + +func (pa *prometheusAlerts) createAuthenticatedRequest(ctx context.Context, url string) (*http.Request, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return nil, fmt.Errorf("create request: %w", err) + } + + token := BearerTokenFromContext(ctx) + if token == "" { + var err error + token, err = pa.loadBearerToken() + if err != nil { + return nil, err + } + } + + req.Header.Set("Authorization", "Bearer "+token) + return req, nil +} + +func (pa *prometheusAlerts) loadBearerToken() (string, error) { + if pa.config.BearerToken != "" { + return pa.config.BearerToken, nil + } + + if pa.config.BearerTokenFile == "" { + return "", fmt.Errorf("no bearer token or token file configured") + } + + tokenBytes, err := os.ReadFile(pa.config.BearerTokenFile) + if err != nil { + return "", fmt.Errorf("load bearer token file: %w", err) + } + + return strings.TrimSpace(string(tokenBytes)), nil +} + +func (pa *prometheusAlerts) performRequest(client *http.Client, req *http.Request) ([]byte, error) { + resp, err := client.Do(req) + if err != nil { + return nil, fmt.Errorf("execute request: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("read response body: %w", err) + } + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("unexpected status %d: %s", resp.StatusCode, string(body)) + } + + return body, nil +} + +func labelsMatch(req *GetAlertsRequest, alert *PrometheusAlert) bool { + for key, value := range req.Labels { + if alertValue, exists := alert.Labels[key]; !exists || alertValue != value { + return false + } + } + + return true +} diff --git a/pkg/k8s/relabeled_rules.go b/pkg/k8s/relabeled_rules.go index 02452c385..a853630ea 100644 --- a/pkg/k8s/relabeled_rules.go +++ b/pkg/k8s/relabeled_rules.go @@ -9,7 +9,6 @@ import ( "sync" 
"time" - osmv1 "github.com/openshift/api/monitoring/v1" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" monitoringv1client "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned" "github.com/prometheus/common/model" @@ -260,11 +259,6 @@ func (rrm *relabeledRulesManager) collectAlerts(ctx context.Context, relabelConf alerts := make(map[string]monitoringv1.Rule) seenIDs := make(map[string]struct{}) - // Fetch all ARCs once from the informer cache (O(1) per-rule lookup below). - // This avoids O(n) live API server calls inside the per-rule loop that would - // cause exponential rate-limit backoff and stale cache data for new rules. - arcByName := rrm.arcsByName(ctx) - for _, obj := range rrm.prometheusRulesInformer.GetStore().List() { promRule, ok := obj.(*monitoringv1.PrometheusRule) if !ok { @@ -322,7 +316,7 @@ func (rrm *relabeledRulesManager) collectAlerts(ctx context.Context, relabelConf rule.Labels[managementlabels.AlertingRuleLabelName] = arName } - ruleManagedBy, relabelConfigManagedBy := rrm.determineManagedBy(promRule, alertRuleId, arcByName) + ruleManagedBy, relabelConfigManagedBy := rrm.determineManagedBy(ctx, promRule, alertRuleId) if ruleManagedBy != "" { rule.Labels[managementlabels.RuleManagedByLabel] = ruleManagedBy } @@ -390,28 +384,8 @@ func shortHash(id string, n int) string { return full[:n] } -// arcsByName builds a namespace/name → ARC map from the informer cache. -// Called once per sync cycle so that determineManagedBy can do O(1) lookups -// instead of one live API call per rule. -func (rrm *relabeledRulesManager) arcsByName(ctx context.Context) map[string]*osmv1.AlertRelabelConfig { - if rrm.alertRelabelConfigs == nil { - return nil - } - arcs, err := rrm.alertRelabelConfigs.List(ctx, "") - if err != nil { - log.Errorf("arcsByName: failed to list ARCs from cache: %v", err) - return nil - } - m := make(map[string]*osmv1.AlertRelabelConfig, len(arcs)) - for i := range arcs { - key := arcs[i].Namespace + "/" + arcs[i].Name - m[key] = &arcs[i] - } - return m -} - // determineManagedBy determines the openshift_io_rule_managed_by and openshift_io_relabel_config_managed_by label values -func (rrm *relabeledRulesManager) determineManagedBy(promRule *monitoringv1.PrometheusRule, alertRuleId string, arcByName map[string]*osmv1.AlertRelabelConfig) (string, string) { +func (rrm *relabeledRulesManager) determineManagedBy(ctx context.Context, promRule *monitoringv1.PrometheusRule, alertRuleId string) (string, string) { // Determine ruleManagedBy from PrometheusRule var ruleManagedBy string // If generated by AlertingRule CRD, do not mark as operator-managed; treat as user-via-platform @@ -425,14 +399,13 @@ func (rrm *relabeledRulesManager) determineManagedBy(promRule *monitoringv1.Prom } } - // Determine relabelConfigManagedBy only for platform rules using the - // pre-fetched cache map; no live API call is made here. 
+ // Determine relabelConfigManagedBy only for platform rules isPlatform := rrm.namespaceManager.IsClusterMonitoringNamespace(promRule.Namespace) var relabelConfigManagedBy string - if isPlatform && arcByName != nil { + if isPlatform && rrm.alertRelabelConfigs != nil { arcName := GetAlertRelabelConfigName(promRule.Name, alertRuleId) - key := promRule.Namespace + "/" + arcName - if arc, found := arcByName[key]; found { + arc, found, err := rrm.alertRelabelConfigs.Get(ctx, promRule.Namespace, arcName) + if err == nil && found { if IsManagedByGitOps(arc.Annotations, arc.Labels) { relabelConfigManagedBy = managementlabels.ManagedByGitOps } @@ -442,27 +415,13 @@ func (rrm *relabeledRulesManager) determineManagedBy(promRule *monitoringv1.Prom return ruleManagedBy, relabelConfigManagedBy } -// DetermineManagedBy determines the managed-by labels for a single PrometheusRule -// alert rule. Callers that have a user-scoped context (e.g. tests) can pass a -// live AlertRelabelConfigInterface; a targeted Get is performed for that one rule. +// DetermineManagedBy determines the managed-by labels for a PrometheusRule alert rule. func DetermineManagedBy(ctx context.Context, alertRelabelConfigs AlertRelabelConfigInterface, namespaceManager NamespaceInterface, promRule *monitoringv1.PrometheusRule, alertRuleId string) (string, string) { - // Single-rule path: fetch only the specific ARC with RBAC enforcement on the - // caller's context, then build a one-entry map for determineManagedBy. - var arcByName map[string]*osmv1.AlertRelabelConfig - if alertRelabelConfigs != nil && namespaceManager.IsClusterMonitoringNamespace(promRule.Namespace) { - arcName := GetAlertRelabelConfigName(promRule.Name, alertRuleId) - arc, found, err := alertRelabelConfigs.Get(ctx, promRule.Namespace, arcName) - if err == nil && found { - arcByName = map[string]*osmv1.AlertRelabelConfig{ - promRule.Namespace + "/" + arcName: arc, - } - } - } rrm := &relabeledRulesManager{ alertRelabelConfigs: alertRelabelConfigs, namespaceManager: namespaceManager, } - return rrm.determineManagedBy(promRule, alertRuleId, arcByName) + return rrm.determineManagedBy(ctx, promRule, alertRuleId) } func (rrm *relabeledRulesManager) List(ctx context.Context) []monitoringv1.Rule { diff --git a/pkg/k8s/relabeled_rules_test.go b/pkg/k8s/relabeled_rules_test.go deleted file mode 100644 index 1d10ef48c..000000000 --- a/pkg/k8s/relabeled_rules_test.go +++ /dev/null @@ -1,157 +0,0 @@ -package k8s - -import ( - "context" - "testing" - - osmv1 "github.com/openshift/api/monitoring/v1" - monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - "github.com/openshift/monitoring-plugin/pkg/managementlabels" -) - -// arcGetPanicInterface implements AlertRelabelConfigInterface and panics if -// Get is called. It is used to verify that the sync path never calls Get. 
-type arcGetPanicInterface struct { - arcs []osmv1.AlertRelabelConfig -} - -func (m *arcGetPanicInterface) List(_ context.Context, namespace string) ([]osmv1.AlertRelabelConfig, error) { - if namespace == "" { - return m.arcs, nil - } - var filtered []osmv1.AlertRelabelConfig - for _, a := range m.arcs { - if a.Namespace == namespace { - filtered = append(filtered, a) - } - } - return filtered, nil -} - -func (m *arcGetPanicInterface) Get(_ context.Context, _, _ string) (*osmv1.AlertRelabelConfig, bool, error) { - panic("Get must not be called during sync; use the arcByName cache map instead") -} - -func (m *arcGetPanicInterface) Create(_ context.Context, arc osmv1.AlertRelabelConfig) (*osmv1.AlertRelabelConfig, error) { - return &arc, nil -} - -func (m *arcGetPanicInterface) Update(_ context.Context, _ osmv1.AlertRelabelConfig) error { - return nil -} - -func (m *arcGetPanicInterface) Delete(_ context.Context, _, _ string) error { - return nil -} - -// stubNamespaceManager implements NamespaceInterface for tests. -type stubNamespaceManager struct { - platformNamespaces map[string]bool -} - -func (s *stubNamespaceManager) IsClusterMonitoringNamespace(name string) bool { - return s.platformNamespaces[name] -} - -// TestDetermineManagedBy_NeverCallsGet verifies that determineManagedBy -// uses the pre-fetched arcByName map and never issues a live Get call, -// even for platform-namespace rules with a matching ARC. -func TestDetermineManagedBy_NeverCallsGet(t *testing.T) { - const ( - namespace = "openshift-monitoring" - promRuleName = "test-rule" - alertRuleID = "abc123" - ) - - arcName := GetAlertRelabelConfigName(promRuleName, alertRuleID) - arc := osmv1.AlertRelabelConfig{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: namespace, - Name: arcName, - Annotations: map[string]string{ - "argocd.argoproj.io/managed-by": "some-app", - }, - }, - } - - rrm := &relabeledRulesManager{ - // arcGetPanicInterface panics if Get is called — this is the guard. - alertRelabelConfigs: &arcGetPanicInterface{arcs: []osmv1.AlertRelabelConfig{arc}}, - namespaceManager: &stubNamespaceManager{ - platformNamespaces: map[string]bool{namespace: true}, - }, - } - - promRule := &monitoringv1.PrometheusRule{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: namespace, - Name: promRuleName, - }, - } - - // Build arcByName from List (no Get call). - arcByName := rrm.arcsByName(context.Background()) - - // This must not panic (i.e. must not call Get). - ruleManagedBy, relabelConfigManagedBy := rrm.determineManagedBy(promRule, alertRuleID, arcByName) - - if ruleManagedBy != "" { - t.Errorf("expected empty ruleManagedBy, got %q", ruleManagedBy) - } - if relabelConfigManagedBy != managementlabels.ManagedByGitOps { - t.Errorf("expected relabelConfigManagedBy=%q, got %q", managementlabels.ManagedByGitOps, relabelConfigManagedBy) - } -} - -// TestDetermineManagedBy_NoARCMatch verifies that a platform rule with no -// matching ARC in the cache produces empty relabelConfigManagedBy. 
-func TestDetermineManagedBy_NoARCMatch(t *testing.T) { - const namespace = "openshift-monitoring" - - rrm := &relabeledRulesManager{ - alertRelabelConfigs: &arcGetPanicInterface{arcs: nil}, - namespaceManager: &stubNamespaceManager{ - platformNamespaces: map[string]bool{namespace: true}, - }, - } - - promRule := &monitoringv1.PrometheusRule{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: namespace, - Name: "some-rule", - }, - } - - arcByName := rrm.arcsByName(context.Background()) - _, relabelConfigManagedBy := rrm.determineManagedBy(promRule, "no-match-id", arcByName) - - if relabelConfigManagedBy != "" { - t.Errorf("expected empty relabelConfigManagedBy for no ARC match, got %q", relabelConfigManagedBy) - } -} - -// TestDetermineManagedBy_NonPlatformRuleSkipsARCLookup verifies that a -// user-workload rule (non-platform namespace) does not consult ARCs at all. -func TestDetermineManagedBy_NonPlatformRuleSkipsARCLookup(t *testing.T) { - rrm := &relabeledRulesManager{ - // Non-nil but panics on Get — confirms no lookup occurs. - alertRelabelConfigs: &arcGetPanicInterface{arcs: nil}, - namespaceManager: &stubNamespaceManager{platformNamespaces: map[string]bool{}}, - } - - promRule := &monitoringv1.PrometheusRule{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: "user-namespace", - Name: "user-rule", - }, - } - - arcByName := rrm.arcsByName(context.Background()) - _, relabelConfigManagedBy := rrm.determineManagedBy(promRule, "some-id", arcByName) - - if relabelConfigManagedBy != "" { - t.Errorf("expected empty relabelConfigManagedBy for non-platform rule, got %q", relabelConfigManagedBy) - } -} diff --git a/pkg/k8s/rule_label_matchers.go b/pkg/k8s/rule_label_matchers.go new file mode 100644 index 000000000..cf8eb1f51 --- /dev/null +++ b/pkg/k8s/rule_label_matchers.go @@ -0,0 +1,91 @@ +package k8s + +import ( + "fmt" + "strings" + + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/promql/parser" +) + +const namespaceLabelKey = "namespace" + +func compileRuleLabelMatchers(req GetRulesRequest) ([]*labels.Matcher, error) { + var out []*labels.Matcher + + for k, v := range req.Labels { + if strings.TrimSpace(k) == "" { + continue + } + if k == namespaceLabelKey { + continue + } + m, err := labels.NewMatcher(labels.MatchEqual, k, v) + if err != nil { + return nil, fmt.Errorf("invalid label matcher %q=%q: %w", k, v, err) + } + out = append(out, m) + } + + for _, raw := range req.Matchers { + sel := strings.TrimSpace(raw) + if sel == "" { + continue + } + if !strings.HasPrefix(sel, "{") || !strings.HasSuffix(sel, "}") { + sel = "{" + sel + "}" + } + matchers, err := parser.ParseMetricSelector(sel) + if err != nil { + return nil, fmt.Errorf("invalid matcher %q: %w", raw, err) + } + out = append(out, matchers...) 
+ } + + return out, nil +} + +func filterRuleGroupsByLabelMatchers(groups []PrometheusRuleGroup, matchers []*labels.Matcher) []PrometheusRuleGroup { + if len(matchers) == 0 || len(groups) == 0 { + return groups + } + + out := make([]PrometheusRuleGroup, 0, len(groups)) + for _, g := range groups { + kept := make([]PrometheusRule, 0, len(g.Rules)) + for _, r := range g.Rules { + if ruleMatchesLabelMatchers(r, matchers) { + kept = append(kept, r) + } + } + if len(kept) == 0 { + continue + } + g.Rules = kept + out = append(out, g) + } + + return out +} + +func ruleMatchesLabelMatchers(rule PrometheusRule, matchers []*labels.Matcher) bool { + if len(matchers) == 0 { + return true + } + + for _, m := range matchers { + val, ok := rule.Labels[m.Name] + if !ok { + // Prometheus semantics: negative matchers match missing labels. + if m.Type == labels.MatchNotEqual || m.Type == labels.MatchNotRegexp { + continue + } + return false + } + if !m.Matches(val) { + return false + } + } + + return true +} diff --git a/pkg/k8s/rule_label_matchers_test.go b/pkg/k8s/rule_label_matchers_test.go new file mode 100644 index 000000000..34169eaa7 --- /dev/null +++ b/pkg/k8s/rule_label_matchers_test.go @@ -0,0 +1,58 @@ +package k8s + +import "testing" + +func TestCompileRuleLabelMatchers_IgnoresNamespaceLabel(t *testing.T) { + matchers, err := compileRuleLabelMatchers(GetRulesRequest{ + Labels: map[string]string{ + "namespace": "ns-a", + "severity": "critical", + }, + }) + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if len(matchers) != 1 { + t.Fatalf("expected 1 matcher (severity), got %d", len(matchers)) + } + if matchers[0].Name != "severity" { + t.Fatalf("expected matcher for severity, got %q", matchers[0].Name) + } +} + +func TestRuleMatchesLabelMatchers_PrometheusMissingLabelSemantics(t *testing.T) { + neg, err := compileRuleLabelMatchers(GetRulesRequest{ + Matchers: []string{`missing!="x"`}, + }) + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if !ruleMatchesLabelMatchers(PrometheusRule{Labels: map[string]string{}}, neg) { + t.Fatalf("expected negative matcher to match missing label") + } + + pos, err := compileRuleLabelMatchers(GetRulesRequest{ + Matchers: []string{`missing="x"`}, + }) + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if ruleMatchesLabelMatchers(PrometheusRule{Labels: map[string]string{}}, pos) { + t.Fatalf("expected positive matcher not to match missing label") + } +} + +func TestCompileRuleLabelMatchers_AcceptsSelectorBody(t *testing.T) { + matchers, err := compileRuleLabelMatchers(GetRulesRequest{ + Matchers: []string{`severity=~"warning|critical"`}, + }) + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if len(matchers) != 1 { + t.Fatalf("expected 1 matcher, got %d", len(matchers)) + } + if matchers[0].Name != "severity" { + t.Fatalf("expected severity matcher, got %q", matchers[0].Name) + } +} diff --git a/pkg/k8s/types.go b/pkg/k8s/types.go index 102d5fccf..bf7b61b5b 100644 --- a/pkg/k8s/types.go +++ b/pkg/k8s/types.go @@ -21,6 +21,12 @@ type Client interface { // TestConnection tests the connection to the Kubernetes cluster TestConnection(ctx context.Context) error + // AlertingHealth returns alerting route and stack health details + AlertingHealth(ctx context.Context) (AlertingHealth, error) + + // PrometheusAlerts retrieves active Prometheus alerts + PrometheusAlerts() PrometheusAlertsInterface + // PrometheusRules returns the PrometheusRule interface PrometheusRules() PrometheusRuleInterface @@ -37,6 
+43,14 @@ type Client interface { Namespace() NamespaceInterface } +// PrometheusAlertsInterface defines operations for managing PrometheusAlerts +type PrometheusAlertsInterface interface { + // GetAlerts retrieves Prometheus alerts with optional state filtering + GetAlerts(ctx context.Context, req GetAlertsRequest) ([]PrometheusAlert, error) + // GetRules retrieves Prometheus alerting rules and active alerts + GetRules(ctx context.Context, req GetRulesRequest) ([]PrometheusRuleGroup, error) +} + // PrometheusRuleInterface defines operations for managing PrometheusRules type PrometheusRuleInterface interface { // List lists all PrometheusRules from the informer cache @@ -104,6 +118,37 @@ type RelabeledRulesInterface interface { Config() []*relabel.Config } +// RouteStatus describes the availability state of a monitoring route. +type RouteStatus string + +const ( + RouteNotFound RouteStatus = "notFound" + RouteUnreachable RouteStatus = "unreachable" + RouteReachable RouteStatus = "reachable" +) + +// AlertingRouteHealth describes route availability and reachability. +type AlertingRouteHealth struct { + Name string `json:"name"` + Namespace string `json:"namespace"` + Status RouteStatus `json:"status"` + FallbackReachable bool `json:"fallbackReachable,omitempty"` + Error string `json:"error,omitempty"` +} + +// AlertingStackHealth describes alerting health for a monitoring stack. +type AlertingStackHealth struct { + Prometheus AlertingRouteHealth `json:"prometheus"` + Alertmanager AlertingRouteHealth `json:"alertmanager"` +} + +// AlertingHealth provides alerting health details for platform and user workload stacks. +type AlertingHealth struct { + Platform *AlertingStackHealth `json:"platform"` + UserWorkloadEnabled bool `json:"userWorkloadEnabled"` + UserWorkload *AlertingStackHealth `json:"userWorkload"` +} + // NamespaceInterface defines operations for Namespaces type NamespaceInterface interface { // IsClusterMonitoringNamespace checks if a namespace has the openshift.io/cluster-monitoring=true label diff --git a/pkg/management/get_alerting_health.go b/pkg/management/get_alerting_health.go new file mode 100644 index 000000000..001d13f15 --- /dev/null +++ b/pkg/management/get_alerting_health.go @@ -0,0 +1,21 @@ +package management + +import ( + "context" + "time" + + "github.com/openshift/monitoring-plugin/pkg/k8s" +) + +const alertingHealthTimeout = 10 * time.Second + +// GetAlertingHealth retrieves alerting health details. 
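+// When the caller's context has no deadline, the call is bounded by alertingHealthTimeout.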
+func (c *client) GetAlertingHealth(ctx context.Context) (k8s.AlertingHealth, error) { + if _, hasDeadline := ctx.Deadline(); !hasDeadline { + timeoutCtx, cancel := context.WithTimeout(ctx, alertingHealthTimeout) + defer cancel() + ctx = timeoutCtx + } + + return c.k8sClient.AlertingHealth(ctx) +} diff --git a/pkg/management/get_alerts.go b/pkg/management/get_alerts.go new file mode 100644 index 000000000..9dea52e1c --- /dev/null +++ b/pkg/management/get_alerts.go @@ -0,0 +1,308 @@ +package management + +import ( + "context" + "fmt" + "strings" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/model/relabel" + "k8s.io/apimachinery/pkg/types" + + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/alertcomponent" + "github.com/openshift/monitoring-plugin/pkg/classification" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" +) + +var cvoAlertNames = map[string]struct{}{ + "ClusterOperatorDown": {}, + "ClusterOperatorDegraded": {}, +} + +func (c *client) GetAlerts(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + alerts, err := c.k8sClient.PrometheusAlerts().GetAlerts(ctx, req) + if err != nil { + return nil, fmt.Errorf("failed to get prometheus alerts: %w", err) + } + + configs := c.k8sClient.RelabeledRules().Config() + rules := c.k8sClient.RelabeledRules().List(ctx) + + result := make([]k8s.PrometheusAlert, 0, len(alerts)) + for _, alert := range alerts { + // Only apply relabel configs for platform alerts. User workload alerts + // already come from their own stack and should not be relabeled here. + if alert.Labels[k8s.AlertSourceLabel] != k8s.AlertSourceUser { + relabels, keep := relabel.Process(labels.FromMap(alert.Labels), configs...) + if !keep { + continue + } + alert.Labels = relabels.Map() + } + + // Add calculated rule ID and source when not present (labels enrichment) + c.setRuleIDAndSourceIfMissing(ctx, &alert, rules) + + // correlate alert -> base alert rule via subset matching against relabeled rules + alertRuleId := alert.Labels[k8s.AlertRuleLabelId] + component := "" + layer := "" + + bestRule, corrId := correlateAlertToRule(alert.Labels, rules) + if corrId != "" { + alertRuleId = corrId + } + if bestRule == nil && alertRuleId != "" { + if rule, ok := c.k8sClient.RelabeledRules().Get(ctx, alertRuleId); ok { + bestRule = &rule + } + } + + if bestRule != nil { + if src := c.deriveAlertSource(bestRule.Labels); src != "" { + alert.Labels[k8s.AlertSourceLabel] = src + } + component, layer = classifyFromRule(bestRule) + } else { + component, layer = classifyFromAlertLabels(alert.Labels) + } + + if cvoComponent, cvoLayer, ok := classifyCvoAlert(alert.Labels); ok { + component = cvoComponent + layer = cvoLayer + } + + // Dynamic classification: _from labels on the rule point to alert labels + // whose runtime values become the classification. Takes precedence over + // static classification labels. 
+ if bestRule != nil { + component, layer = ApplyDynamicClassification(bestRule.Labels, alert.Labels, component, layer) + } + + // keep label and optional enriched fields consistent + if alert.Labels[k8s.AlertRuleLabelId] == "" && alertRuleId != "" { + alert.Labels[k8s.AlertRuleLabelId] = alertRuleId + } + alert.AlertRuleId = alertRuleId + + alert.AlertComponent = component + alert.AlertLayer = layer + + delete(alert.Labels, managementlabels.ClassificationManagedByKey) + + result = append(result, alert) + } + + return result, nil +} + +func (c *client) setRuleIDAndSourceIfMissing(ctx context.Context, alert *k8s.PrometheusAlert, rules []monitoringv1.Rule) { + if alert.Labels[k8s.AlertRuleLabelId] == "" { + for _, existing := range rules { + if existing.Alert != alert.Labels[managementlabels.AlertNameLabel] { + continue + } + if !ruleMatchesAlert(existing.Labels, alert.Labels) { + continue + } + rid := alertrule.GetAlertingRuleId(&existing) + alert.Labels[k8s.AlertRuleLabelId] = rid + if alert.Labels[k8s.AlertSourceLabel] == "" { + if src := c.deriveAlertSource(existing.Labels); src != "" { + alert.Labels[k8s.AlertSourceLabel] = src + } + } + break + } + } + if alert.Labels[k8s.AlertSourceLabel] != "" { + return + } + if rid := alert.Labels[k8s.AlertRuleLabelId]; rid != "" { + if existing, ok := c.k8sClient.RelabeledRules().Get(ctx, rid); ok { + if src := c.deriveAlertSource(existing.Labels); src != "" { + alert.Labels[k8s.AlertSourceLabel] = src + } + } + } +} + +func ruleMatchesAlert(existingRuleLabels, alertLabels map[string]string) bool { + existingBusiness := filterBusinessLabels(existingRuleLabels) + for k, v := range existingBusiness { + lv, ok := alertLabels[k] + if !ok || lv != v { + return false + } + } + return true +} + +// correlateAlertToRule tries to find the base alert rule for the given alert labels +// by subset-matching against relabeled rules. 
+func correlateAlertToRule(alertLabels map[string]string, rules []monitoringv1.Rule) (*monitoringv1.Rule, string) { + // Determine best match: prefer rules with more labels (more specific) + var ( + bestId string + bestRule *monitoringv1.Rule + bestLabelCount int + ) + for i := range rules { + rule := &rules[i] + ruleLabels := sanitizeRuleLabels(rule.Labels) + if isSubset(ruleLabels, alertLabels) { + if len(ruleLabels) > bestLabelCount { + bestLabelCount = len(ruleLabels) + bestRule = rule + bestId = rule.Labels[k8s.AlertRuleLabelId] + } + } + } + if bestRule == nil { + return nil, "" + } + return bestRule, bestId +} + +// sanitizeRuleLabels removes meta labels that will not be present on alerts +func sanitizeRuleLabels(in map[string]string) map[string]string { + out := make(map[string]string, len(in)) + for k, v := range in { + if k == k8s.PrometheusRuleLabelNamespace || k == k8s.PrometheusRuleLabelName || k == k8s.AlertRuleLabelId { + continue + } + out[k] = v + } + return out +} + +// isSubset returns true if all key/value pairs in sub are present in sup +func isSubset(sub map[string]string, sup map[string]string) bool { + for k, v := range sub { + if sv, ok := sup[k]; !ok || sv != v { + return false + } + } + return true +} + +func (c *client) deriveAlertSource(ruleLabels map[string]string) string { + ns := ruleLabels[k8s.PrometheusRuleLabelNamespace] + name := ruleLabels[k8s.PrometheusRuleLabelName] + if ns == "" || name == "" { + return "" + } + if c.isPlatformManagedPrometheusRule(types.NamespacedName{Namespace: ns, Name: name}) { + return k8s.AlertSourcePlatform + } + return k8s.AlertSourceUser +} + +func classifyFromRule(rule *monitoringv1.Rule) (string, string) { + lbls := model.LabelSet{} + for k, v := range rule.Labels { + lbls[model.LabelName(k)] = model.LabelValue(v) + } + if _, ok := lbls["namespace"]; !ok { + if ns := rule.Labels[k8s.PrometheusRuleLabelNamespace]; ns != "" { + lbls["namespace"] = model.LabelValue(ns) + } + } + if rule.Alert != "" { + lbls[model.LabelName(managementlabels.AlertNameLabel)] = model.LabelValue(rule.Alert) + } + + layer, component := alertcomponent.DetermineComponent(lbls) + if component == "" || component == "Others" { + component = "other" + layer = deriveLayerFromSource(rule.Labels) + } + + component, layer = applyRuleScopedDefaults(rule.Labels, component, layer) + return component, layer +} + +func classifyFromAlertLabels(alertLabels map[string]string) (string, string) { + lbls := model.LabelSet{} + for k, v := range alertLabels { + lbls[model.LabelName(k)] = model.LabelValue(v) + } + layer, component := alertcomponent.DetermineComponent(lbls) + if component == "" || component == "Others" { + component = "other" + layer = deriveLayerFromSource(alertLabels) + } + component, layer = applyRuleScopedDefaults(alertLabels, component, layer) + return component, layer +} + +func deriveLayerFromSource(labels map[string]string) string { + if labels[k8s.AlertSourceLabel] == k8s.AlertSourcePlatform { + return "cluster" + } + if labels[k8s.PrometheusRuleLabelNamespace] == k8s.ClusterMonitoringNamespace { + return "cluster" + } + promSrc := labels["prometheus"] + if strings.HasPrefix(promSrc, "openshift-monitoring/") { + return "cluster" + } + return "namespace" +} + +// applyRuleScopedDefaults applies static classification labels from the rule. 
+func applyRuleScopedDefaults(ruleLabels map[string]string, component, layer string) (string, string) { + if ruleLabels == nil { + return component, layer + } + if v := strings.TrimSpace(ruleLabels[k8s.AlertRuleClassificationComponentKey]); v != "" { + if classification.ValidateComponent(v) { + component = v + } + } + if v := strings.TrimSpace(ruleLabels[k8s.AlertRuleClassificationLayerKey]); v != "" { + if classification.ValidateLayer(v) { + layer = strings.ToLower(strings.TrimSpace(v)) + } + } + return component, layer +} + +// applyDynamicClassification handles _from labels: the rule label points to an +// alert label whose runtime value becomes the classification. _from takes +// precedence over static classification labels. +func ApplyDynamicClassification(ruleLabels, alertLabels map[string]string, component, layer string) (string, string) { + if ruleLabels == nil { + return component, layer + } + if from := strings.TrimSpace(ruleLabels[k8s.AlertRuleClassificationComponentFromKey]); from != "" { + if classification.ValidatePromLabelName(from) { + if v := strings.TrimSpace(alertLabels[from]); v != "" && classification.ValidateComponent(v) { + component = v + } + } + } + if from := strings.TrimSpace(ruleLabels[k8s.AlertRuleClassificationLayerFromKey]); from != "" { + if classification.ValidatePromLabelName(from) { + if v := alertLabels[from]; classification.ValidateLayer(v) { + layer = strings.ToLower(strings.TrimSpace(v)) + } + } + } + return component, layer +} + +func classifyCvoAlert(alertLabels map[string]string) (string, string, bool) { + if _, ok := cvoAlertNames[alertLabels[managementlabels.AlertNameLabel]]; !ok { + return "", "", false + } + component := alertLabels["name"] + if component == "" { + component = "version" + } + return component, "cluster", true +} diff --git a/pkg/management/get_alerts_test.go b/pkg/management/get_alerts_test.go new file mode 100644 index 000000000..66b0e1902 --- /dev/null +++ b/pkg/management/get_alerts_test.go @@ -0,0 +1,465 @@ +package management_test + +import ( + "context" + "errors" + "strings" + "testing" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/prometheus/prometheus/model/relabel" + + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" +) + +func TestGetAlerts_ErrorPropagated(t *testing.T) { + ctx := context.Background() + mockK8s := &testutils.MockClient{ + PrometheusAlertsFunc: func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(_ context.Context, _ k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return nil, errors.New("failed to get alerts") + }, + } + }, + } + client := management.New(ctx, mockK8s) + + _, err := client.GetAlerts(ctx, k8s.GetAlertsRequest{}) + if err == nil { + t.Fatal("expected error, got nil") + } + if !strings.Contains(err.Error(), "failed to get prometheus alerts") { + t.Errorf("unexpected error: %v", err) + } +} + +func TestGetAlerts_ReturnsAllWithoutRelabelConfigs(t *testing.T) { + ctx := context.Background() + alert1 := k8s.PrometheusAlert{ + Labels: map[string]string{managementlabels.AlertNameLabel: "Alert1", "severity": "warning", "namespace": "default"}, + State: "firing", + } + alert2 := k8s.PrometheusAlert{ + Labels: 
map[string]string{managementlabels.AlertNameLabel: "Alert2", "severity": "critical", "namespace": "kube-system"}, + State: "pending", + } + mockK8s := &testutils.MockClient{ + PrometheusAlertsFunc: func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(_ context.Context, _ k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return []k8s.PrometheusAlert{alert1, alert2}, nil + }, + } + }, + RelabeledRulesFunc: func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ConfigFunc: func() []*relabel.Config { return []*relabel.Config{} }, + } + }, + } + client := management.New(ctx, mockK8s) + + alerts, err := client.GetAlerts(ctx, k8s.GetAlertsRequest{}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(alerts) != 2 { + t.Fatalf("expected 2 alerts, got %d", len(alerts)) + } + if alerts[0].Labels[managementlabels.AlertNameLabel] != "Alert1" { + t.Errorf("alert[0] name mismatch") + } + if alerts[1].Labels[managementlabels.AlertNameLabel] != "Alert2" { + t.Errorf("alert[1] name mismatch") + } +} + +func TestGetAlerts_AppliesStaticClassificationFromRelabeledRule(t *testing.T) { + ctx := context.Background() + alert1 := k8s.PrometheusAlert{ + Labels: map[string]string{managementlabels.AlertNameLabel: "Alert1", "severity": "warning", "namespace": "default"}, + State: "firing", + } + + rule := monitoringv1.Rule{ + Alert: "Alert1", + Labels: map[string]string{ + "severity": "warning", + "namespace": "default", + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "test-rule", + k8s.AlertRuleClassificationComponentKey: "networking", + k8s.AlertRuleClassificationLayerKey: "cluster", + }, + } + rule.Labels[k8s.AlertRuleLabelId] = alertrule.GetAlertingRuleId(&rule) + + mockK8s := &testutils.MockClient{ + PrometheusAlertsFunc: func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(_ context.Context, _ k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return []k8s.PrometheusAlert{alert1}, nil + }, + } + }, + RelabeledRulesFunc: func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(_ context.Context) []monitoringv1.Rule { return []monitoringv1.Rule{rule} }, + GetFunc: func(_ context.Context, id string) (monitoringv1.Rule, bool) { + if id == rule.Labels[k8s.AlertRuleLabelId] { + return rule, true + } + return monitoringv1.Rule{}, false + }, + ConfigFunc: func() []*relabel.Config { return []*relabel.Config{} }, + } + }, + NamespaceFunc: func() k8s.NamespaceInterface { + ns := &testutils.MockNamespaceInterface{} + ns.SetMonitoringNamespaces(map[string]bool{"openshift-monitoring": true}) + return ns + }, + } + client := management.New(ctx, mockK8s) + + alerts, err := client.GetAlerts(ctx, k8s.GetAlertsRequest{}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(alerts) != 1 { + t.Fatalf("expected 1 alert, got %d", len(alerts)) + } + if alerts[0].AlertComponent != "networking" { + t.Errorf("expected component=networking, got %q", alerts[0].AlertComponent) + } + if alerts[0].AlertLayer != "cluster" { + t.Errorf("expected layer=cluster, got %q", alerts[0].AlertLayer) + } +} + +func TestGetAlerts_DerivesComponentFromAlertLabel(t *testing.T) { + ctx := context.Background() + alertWithName := k8s.PrometheusAlert{ + Labels: map[string]string{ + managementlabels.AlertNameLabel: "Alert1", + "severity": "warning", + "namespace": "default", + 
"name": "kube_apiserver", + }, + State: "firing", + } + + rule := monitoringv1.Rule{ + Alert: "Alert1", + Labels: map[string]string{ + "severity": "warning", + "namespace": "default", + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "test-rule", + k8s.AlertRuleClassificationComponentFromKey: "name", + k8s.AlertRuleClassificationLayerKey: "namespace", + }, + } + rule.Labels[k8s.AlertRuleLabelId] = alertrule.GetAlertingRuleId(&rule) + + mockK8s := &testutils.MockClient{ + PrometheusAlertsFunc: func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(_ context.Context, _ k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return []k8s.PrometheusAlert{alertWithName}, nil + }, + } + }, + RelabeledRulesFunc: func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(_ context.Context) []monitoringv1.Rule { return []monitoringv1.Rule{rule} }, + GetFunc: func(_ context.Context, id string) (monitoringv1.Rule, bool) { + if id == rule.Labels[k8s.AlertRuleLabelId] { + return rule, true + } + return monitoringv1.Rule{}, false + }, + ConfigFunc: func() []*relabel.Config { return []*relabel.Config{} }, + } + }, + NamespaceFunc: func() k8s.NamespaceInterface { + ns := &testutils.MockNamespaceInterface{} + ns.SetMonitoringNamespaces(map[string]bool{"openshift-monitoring": true}) + return ns + }, + } + client := management.New(ctx, mockK8s) + + alerts, err := client.GetAlerts(ctx, k8s.GetAlertsRequest{}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(alerts) != 1 { + t.Fatalf("expected 1 alert, got %d", len(alerts)) + } + if alerts[0].AlertComponent != "kube_apiserver" { + t.Errorf("expected component=kube_apiserver, got %q", alerts[0].AlertComponent) + } + if alerts[0].AlertLayer != "namespace" { + t.Errorf("expected layer=namespace, got %q", alerts[0].AlertLayer) + } +} + +func TestGetAlerts_DerivesLayerFromAlertLabel(t *testing.T) { + ctx := context.Background() + alertWithLayer := k8s.PrometheusAlert{ + Labels: map[string]string{ + managementlabels.AlertNameLabel: "Alert1", + "severity": "warning", + "namespace": "default", + "tier": "Cluster", + }, + State: "firing", + } + + rule := monitoringv1.Rule{ + Alert: "Alert1", + Labels: map[string]string{ + "severity": "warning", + "namespace": "default", + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "test-rule", + k8s.AlertRuleClassificationComponentKey: "networking", + k8s.AlertRuleClassificationLayerFromKey: "tier", + }, + } + rule.Labels[k8s.AlertRuleLabelId] = alertrule.GetAlertingRuleId(&rule) + + mockK8s := &testutils.MockClient{ + PrometheusAlertsFunc: func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(_ context.Context, _ k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return []k8s.PrometheusAlert{alertWithLayer}, nil + }, + } + }, + RelabeledRulesFunc: func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(_ context.Context) []monitoringv1.Rule { return []monitoringv1.Rule{rule} }, + GetFunc: func(_ context.Context, id string) (monitoringv1.Rule, bool) { + if id == rule.Labels[k8s.AlertRuleLabelId] { + return rule, true + } + return monitoringv1.Rule{}, false + }, + ConfigFunc: func() []*relabel.Config { return []*relabel.Config{} }, + } + }, + NamespaceFunc: func() k8s.NamespaceInterface { + ns := 
&testutils.MockNamespaceInterface{} + ns.SetMonitoringNamespaces(map[string]bool{"openshift-monitoring": true}) + return ns + }, + } + client := management.New(ctx, mockK8s) + + alerts, err := client.GetAlerts(ctx, k8s.GetAlertsRequest{}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(alerts) != 1 { + t.Fatalf("expected 1 alert, got %d", len(alerts)) + } + if alerts[0].AlertComponent != "networking" { + t.Errorf("expected component=networking, got %q", alerts[0].AlertComponent) + } + // "Cluster" from alert label lowercased to "cluster" + if alerts[0].AlertLayer != "cluster" { + t.Errorf("expected layer=cluster, got %q", alerts[0].AlertLayer) + } +} + +func TestGetAlerts_UsesRuleLabelsAsDefaults(t *testing.T) { + ctx := context.Background() + alert := k8s.PrometheusAlert{ + Labels: map[string]string{ + "alertname": "AlertRuleDefaults", + "severity": "warning", + "namespace": "default", + k8s.AlertRuleClassificationComponentKey: "team_a", + k8s.AlertRuleClassificationLayerKey: "namespace", + }, + State: "firing", + } + + rule := monitoringv1.Rule{ + Alert: "AlertRuleDefaults", + Labels: map[string]string{ + "severity": "warning", + "namespace": "default", + k8s.AlertRuleClassificationComponentKey: "team_a", + k8s.AlertRuleClassificationLayerKey: "namespace", + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "defaults-rule", + }, + } + rule.Labels[k8s.AlertRuleLabelId] = alertrule.GetAlertingRuleId(&rule) + + mockK8s := &testutils.MockClient{ + PrometheusAlertsFunc: func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(_ context.Context, _ k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return []k8s.PrometheusAlert{alert}, nil + }, + } + }, + RelabeledRulesFunc: func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(_ context.Context) []monitoringv1.Rule { return []monitoringv1.Rule{rule} }, + GetFunc: func(_ context.Context, id string) (monitoringv1.Rule, bool) { + if id == rule.Labels[k8s.AlertRuleLabelId] { + return rule, true + } + return monitoringv1.Rule{}, false + }, + ConfigFunc: func() []*relabel.Config { return []*relabel.Config{} }, + } + }, + } + client := management.New(ctx, mockK8s) + + alerts, err := client.GetAlerts(ctx, k8s.GetAlertsRequest{}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(alerts) != 1 { + t.Fatalf("expected 1 alert, got %d", len(alerts)) + } + if alerts[0].AlertComponent != "team_a" { + t.Errorf("expected component=team_a, got %q", alerts[0].AlertComponent) + } + if alerts[0].AlertLayer != "namespace" { + t.Errorf("expected layer=namespace, got %q", alerts[0].AlertLayer) + } +} + +func TestGetAlerts_FallsBackToDefaultWhenNoMatchingRule(t *testing.T) { + ctx := context.Background() + alert1 := k8s.PrometheusAlert{ + Labels: map[string]string{managementlabels.AlertNameLabel: "Alert1", "severity": "warning", "namespace": "default"}, + State: "firing", + } + mockK8s := &testutils.MockClient{ + PrometheusAlertsFunc: func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(_ context.Context, _ k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return []k8s.PrometheusAlert{alert1}, nil + }, + } + }, + RelabeledRulesFunc: func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(_ context.Context) []monitoringv1.Rule { return []monitoringv1.Rule{} }, + 
ConfigFunc: func() []*relabel.Config { return []*relabel.Config{} }, + } + }, + } + client := management.New(ctx, mockK8s) + + alerts, err := client.GetAlerts(ctx, k8s.GetAlertsRequest{}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(alerts) != 1 { + t.Fatalf("expected 1 alert, got %d", len(alerts)) + } + if alerts[0].AlertComponent != "other" { + t.Errorf("expected component=other, got %q", alerts[0].AlertComponent) + } + if alerts[0].AlertLayer != "namespace" { + t.Errorf("expected layer=namespace, got %q", alerts[0].AlertLayer) + } +} + +func TestGetAlerts_FallsBackToDefaultWithMatchingRuleNoLabels(t *testing.T) { + ctx := context.Background() + alert1 := k8s.PrometheusAlert{ + Labels: map[string]string{managementlabels.AlertNameLabel: "Alert1", "severity": "warning", "namespace": "default"}, + State: "firing", + } + + rule := monitoringv1.Rule{ + Alert: "Alert1", + Labels: map[string]string{ + "severity": "warning", + "namespace": "default", + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "default-rule", + }, + } + rule.Labels[k8s.AlertRuleLabelId] = alertrule.GetAlertingRuleId(&rule) + + mockK8s := &testutils.MockClient{ + PrometheusAlertsFunc: func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(_ context.Context, _ k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return []k8s.PrometheusAlert{alert1}, nil + }, + } + }, + RelabeledRulesFunc: func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(_ context.Context) []monitoringv1.Rule { return []monitoringv1.Rule{rule} }, + GetFunc: func(_ context.Context, id string) (monitoringv1.Rule, bool) { + if id == rule.Labels[k8s.AlertRuleLabelId] { + return rule, true + } + return monitoringv1.Rule{}, false + }, + ConfigFunc: func() []*relabel.Config { return []*relabel.Config{} }, + } + }, + } + client := management.New(ctx, mockK8s) + + alerts, err := client.GetAlerts(ctx, k8s.GetAlertsRequest{}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(alerts) != 1 { + t.Fatalf("expected 1 alert, got %d", len(alerts)) + } + if alerts[0].AlertComponent != "other" { + t.Errorf("expected component=other, got %q", alerts[0].AlertComponent) + } + if alerts[0].AlertLayer != "cluster" { + t.Errorf("expected layer=cluster, got %q", alerts[0].AlertLayer) + } +} + +func TestGetAlerts_ReturnsEmptyList(t *testing.T) { + ctx := context.Background() + mockK8s := &testutils.MockClient{ + PrometheusAlertsFunc: func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(_ context.Context, _ k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return []k8s.PrometheusAlert{}, nil + }, + } + }, + RelabeledRulesFunc: func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ConfigFunc: func() []*relabel.Config { return []*relabel.Config{} }, + } + }, + } + client := management.New(ctx, mockK8s) + + alerts, err := client.GetAlerts(ctx, k8s.GetAlertsRequest{}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(alerts) != 0 { + t.Errorf("expected empty list, got %d", len(alerts)) + } +} diff --git a/pkg/management/management_suite_test.go b/pkg/management/management_suite_test.go new file mode 100644 index 000000000..d5fa1082b --- /dev/null +++ b/pkg/management/management_suite_test.go @@ -0,0 +1,15 @@ +package management_test + +import ( + "os" + "testing" + + 
"github.com/prometheus/common/model" +) + +func TestMain(m *testing.M) { + // LegacyValidation is required for tests that construct relabel configs + // containing label names with special characters (e.g. slashes). + model.NameValidationScheme = model.LegacyValidation //nolint:staticcheck + os.Exit(m.Run()) +} diff --git a/pkg/management/testutils/k8s_client_mock.go b/pkg/management/testutils/k8s_client_mock.go index 0b9adbdb2..370125966 100644 --- a/pkg/management/testutils/k8s_client_mock.go +++ b/pkg/management/testutils/k8s_client_mock.go @@ -17,6 +17,8 @@ import ( // by AlertingRules().Get) hit the same store. type MockClient struct { TestConnectionFunc func(ctx context.Context) error + AlertingHealthFunc func(ctx context.Context) (k8s.AlertingHealth, error) + PrometheusAlertsFunc func() k8s.PrometheusAlertsInterface PrometheusRulesFunc func() k8s.PrometheusRuleInterface AlertRelabelConfigsFunc func() k8s.AlertRelabelConfigInterface AlertingRulesFunc func() k8s.AlertingRuleInterface @@ -38,6 +40,20 @@ func (m *MockClient) TestConnection(ctx context.Context) error { return nil } +func (m *MockClient) AlertingHealth(ctx context.Context) (k8s.AlertingHealth, error) { + if m.AlertingHealthFunc != nil { + return m.AlertingHealthFunc(ctx) + } + return k8s.AlertingHealth{}, nil +} + +func (m *MockClient) PrometheusAlerts() k8s.PrometheusAlertsInterface { + if m.PrometheusAlertsFunc != nil { + return m.PrometheusAlertsFunc() + } + return &MockPrometheusAlertsInterface{} +} + func (m *MockClient) PrometheusRules() k8s.PrometheusRuleInterface { if m.PrometheusRulesFunc != nil { return m.PrometheusRulesFunc() @@ -88,7 +104,42 @@ func (m *MockClient) Namespace() k8s.NamespaceInterface { return m.namespace } -// MockPrometheusRuleInterface is a mock implementation of k8s.PrometheusRuleInterface +type MockPrometheusAlertsInterface struct { + GetAlertsFunc func(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) + GetRulesFunc func(ctx context.Context, req k8s.GetRulesRequest) ([]k8s.PrometheusRuleGroup, error) + + ActiveAlerts []k8s.PrometheusAlert + RuleGroups []k8s.PrometheusRuleGroup +} + +func (m *MockPrometheusAlertsInterface) SetActiveAlerts(alerts []k8s.PrometheusAlert) { + m.ActiveAlerts = alerts +} + +func (m *MockPrometheusAlertsInterface) SetRuleGroups(groups []k8s.PrometheusRuleGroup) { + m.RuleGroups = groups +} + +func (m *MockPrometheusAlertsInterface) GetAlerts(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + if m.GetAlertsFunc != nil { + return m.GetAlertsFunc(ctx, req) + } + if m.ActiveAlerts != nil { + return m.ActiveAlerts, nil + } + return []k8s.PrometheusAlert{}, nil +} + +func (m *MockPrometheusAlertsInterface) GetRules(ctx context.Context, req k8s.GetRulesRequest) ([]k8s.PrometheusRuleGroup, error) { + if m.GetRulesFunc != nil { + return m.GetRulesFunc(ctx, req) + } + if m.RuleGroups != nil { + return m.RuleGroups, nil + } + return []k8s.PrometheusRuleGroup{}, nil +} + type MockPrometheusRuleInterface struct { ListFunc func() ([]monitoringv1.PrometheusRule, error) GetFunc func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) diff --git a/pkg/management/types.go b/pkg/management/types.go index b636cfc50..74df9485b 100644 --- a/pkg/management/types.go +++ b/pkg/management/types.go @@ -4,6 +4,8 @@ import ( "context" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + + "github.com/openshift/monitoring-plugin/pkg/k8s" ) // Client is 
the interface for managing alert rules @@ -38,6 +40,12 @@ type Client interface { UpdateAlertRuleClassification(ctx context.Context, req UpdateRuleClassificationRequest) error // BulkUpdateAlertRuleClassification updates classification for multiple rule ids BulkUpdateAlertRuleClassification(ctx context.Context, items []UpdateRuleClassificationRequest) []error + + // GetAlerts retrieves Prometheus alerts + GetAlerts(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) + + // GetAlertingHealth retrieves the alerting stack health status + GetAlertingHealth(ctx context.Context) (k8s.AlertingHealth, error) } // PrometheusRuleOptions specifies options for selecting PrometheusRule resources and groups diff --git a/pkg/management/update_classification.go b/pkg/management/update_classification.go index 83d092bf3..cf03488c7 100644 --- a/pkg/management/update_classification.go +++ b/pkg/management/update_classification.go @@ -432,28 +432,3 @@ func getOriginalPlatformRuleFromPR(pr *monitoringv1.PrometheusRule, namespace st AdditionalInfo: fmt.Sprintf("in PrometheusRule %s/%s", namespace, name), } } - -// ApplyDynamicClassification resolves the effective component and layer for an -// alert by applying _from indirection. If a rule carries a component_from or -// layer_from label, the corresponding alert label value is used instead of the -// static default. Unresolvable or empty lookups fall back to the supplied -// defaults. -func ApplyDynamicClassification(ruleLabels, alertLabels map[string]string, defaultComponent, defaultLayer string) (string, string) { - component := defaultComponent - layer := defaultLayer - - if ruleLabels != nil { - if fromKey := ruleLabels[k8s.AlertRuleClassificationComponentFromKey]; fromKey != "" { - if v, ok := alertLabels[fromKey]; ok && v != "" { - component = v - } - } - if fromKey := ruleLabels[k8s.AlertRuleClassificationLayerFromKey]; fromKey != "" { - if v, ok := alertLabels[fromKey]; ok && v != "" { - layer = strings.ToLower(v) - } - } - } - - return component, layer -} diff --git a/test/e2e/get_alerts_test.go b/test/e2e/get_alerts_test.go new file mode 100644 index 000000000..25833a17e --- /dev/null +++ b/test/e2e/get_alerts_test.go @@ -0,0 +1,127 @@ +package e2e + +import ( + "context" + "encoding/json" + "net/http" + "testing" + "time" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/apimachinery/pkg/util/wait" + + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/test/e2e/framework" +) + +func TestGetAlerts(t *testing.T) { + f, err := framework.New() + if err != nil { + t.Fatalf("Failed to create framework: %v", err) + } + + ctx := context.Background() + + testNamespace, cleanup, err := f.CreateNamespace(ctx, "test-get-alerts", false) + if err != nil { + t.Fatalf("Failed to create test namespace: %v", err) + } + defer cleanup() + + forDuration := monitoringv1.Duration("1s") + alertName := "E2EGetAlertsTest" + + promRule := &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "e2e-get-alerts-rule", + Namespace: testNamespace, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "e2e-test-group", + Rules: []monitoringv1.Rule{ + { + Alert: alertName, + Expr: intstr.FromString("vector(1)"), + For: &forDuration, + Labels: map[string]string{ + "severity": "none", + "team": "e2e", + }, + Annotations: 
map[string]string{ + "summary": "E2E test alert for GET /alerts", + }, + }, + }, + }, + }, + }, + } + + _, err = f.Monitoringv1clientset.MonitoringV1().PrometheusRules(testNamespace).Create( + ctx, promRule, metav1.CreateOptions{}, + ) + if err != nil { + t.Fatalf("Failed to create PrometheusRule: %v", err) + } + + httpClient := f.HTTPClient() + err = wait.PollUntilContextTimeout(ctx, 5*time.Second, 3*time.Minute, true, func(ctx context.Context) (bool, error) { + alertsURL := f.PluginURL + "/api/v1/alerting/alerts" + req, err := http.NewRequestWithContext(ctx, http.MethodGet, alertsURL, nil) + if err != nil { + return false, err + } + if f.BearerToken != "" { + req.Header.Set("Authorization", "Bearer "+f.BearerToken) + } + + resp, err := httpClient.Do(req) + if err != nil { + t.Logf("Failed to query alerts: %v", err) + return false, nil + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + t.Logf("GET /alerts returned status %d, retrying", resp.StatusCode) + return false, nil + } + + var alertsResp struct { + Data struct { + Alerts []k8s.PrometheusAlert `json:"alerts"` + } `json:"data"` + } + if err := json.NewDecoder(resp.Body).Decode(&alertsResp); err != nil { + t.Logf("Failed to decode alerts response: %v", err) + return false, nil + } + + for _, alert := range alertsResp.Data.Alerts { + if alert.Labels["alertname"] == alertName { + if alert.State != "firing" && alert.State != "pending" { + t.Logf("Found alert %s but state is %q, waiting for firing/pending", alertName, alert.State) + return false, nil + } + if alert.Labels["severity"] != "none" { + t.Errorf("Expected severity=none, got %q", alert.Labels["severity"]) + } + t.Logf("Found alert %s in state %q", alertName, alert.State) + return true, nil + } + } + + t.Logf("Alert %s not found yet (got %d alerts total)", alertName, len(alertsResp.Data.Alerts)) + return false, nil + }) + + if err != nil { + t.Fatalf("Timeout waiting for alert to appear: %v", err) + } + + t.Log("GET /alerts e2e test passed successfully") +}
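For reference, a minimal standalone sketch of how a client outside the plugin might call the alerts endpoint exercised by the e2e test above. The endpoint path and the data.alerts response envelope are taken from TestGetAlerts; the PLUGIN_URL and BEARER_TOKEN environment variables are placeholders, and the custom CA/TLS handling used by the plugin's own HTTP client is omitted for brevity.

// alerts_client_sketch.go — illustrative only, not part of the change set.
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"os"
)

// alertsEnvelope mirrors the {"data":{"alerts":[...]}} shape decoded by the
// e2e test; only the fields needed for printing are declared here.
type alertsEnvelope struct {
	Data struct {
		Alerts []struct {
			Labels map[string]string `json:"labels"`
			State  string            `json:"state"`
		} `json:"alerts"`
	} `json:"data"`
}

func main() {
	// Placeholder inputs; in a real cluster these come from the plugin route
	// and a service-account token with access to monitoring APIs.
	pluginURL := os.Getenv("PLUGIN_URL")
	token := os.Getenv("BEARER_TOKEN")

	req, err := http.NewRequest(http.MethodGet, pluginURL+"/api/v1/alerting/alerts", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Authorization", "Bearer "+token)

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		panic(fmt.Sprintf("unexpected status: %d", resp.StatusCode))
	}

	var envelope alertsEnvelope
	if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
		panic(err)
	}
	for _, a := range envelope.Data.Alerts {
		fmt.Printf("%s\t%s\n", a.Labels["alertname"], a.State)
	}
}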