Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ require (
github.com/openshift/library-go v0.0.0-20240905123346-5bdbfe35a6f5
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.87.0
github.com/prometheus-operator/prometheus-operator/pkg/client v0.87.0
github.com/prometheus/client_golang v1.23.2
github.com/prometheus/client_model v0.6.2
github.com/prometheus/common v0.67.4
github.com/prometheus/prometheus v0.308.0
github.com/sirupsen/logrus v1.9.3
Expand Down Expand Up @@ -57,8 +59,6 @@ require (
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_golang v1.23.2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/procfs v0.16.1 // indirect
github.com/spf13/pflag v1.0.6 // indirect
github.com/x448/float16 v0.8.4 // indirect
Expand Down
52 changes: 52 additions & 0 deletions pkg/k8s/alert_relabel_config_gc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package k8s

import (
"context"

"github.com/openshift/monitoring-plugin/pkg/managementlabels"
)

// gcOrphanedARCs deletes AlertRelabelConfigs whose associated alert rule no
// longer exists. This handles the case where an operator (or manual action)
// removes rules from a PrometheusRule or deletes the CR entirely — the ARCs
// that were created by the plugin for classification/drop/stamp become orphans.
//
// Only ARCs carrying the plugin's alertRuleId annotation are considered.
// GitOps-managed ARCs are never deleted automatically; a warning is logged
// so that operators can clean them up manually.
func (rrm *relabeledRulesManager) gcOrphanedARCs(ctx context.Context, liveRuleIDs map[string]struct{}) {
if rrm.alertRelabelConfigs == nil {
return
}

arcs, err := rrm.alertRelabelConfigs.List(ctx, "")
if err != nil {
log.Errorf("orphan ARC GC: failed to list ARCs: %v", err)
return
}

for i := range arcs {
arc := &arcs[i]

ruleID, ok := arc.Annotations[managementlabels.ARCAnnotationAlertRuleIDKey]
if !ok || ruleID == "" {
continue
}

if _, alive := liveRuleIDs[ruleID]; alive {
continue
}

if IsManagedByGitOps(arc.Annotations, arc.Labels) {
log.Warnf("orphan ARC GC: ARC %s/%s (ruleId=%s) is orphaned but GitOps-managed — skipping deletion, manual cleanup required", arc.Namespace, arc.Name, ruleID)
continue
}

if err := rrm.alertRelabelConfigs.Delete(ctx, arc.Namespace, arc.Name); err != nil {
log.Errorf("orphan ARC GC: failed to delete ARC %s/%s: %v", arc.Namespace, arc.Name, err)
continue
}

log.Infof("orphan ARC GC: deleted orphaned ARC %s/%s (ruleId=%s)", arc.Namespace, arc.Name, ruleID)
}
}
168 changes: 168 additions & 0 deletions pkg/k8s/alert_relabel_config_gc_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
package k8s

import (
"context"
"testing"

osmv1 "github.com/openshift/api/monitoring/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/openshift/monitoring-plugin/pkg/managementlabels"
)

type mockARCInterface struct {
arcs map[string]*osmv1.AlertRelabelConfig
deleted []string
}

func (m *mockARCInterface) List(_ context.Context, _ string) ([]osmv1.AlertRelabelConfig, error) {
var result []osmv1.AlertRelabelConfig
for _, arc := range m.arcs {
result = append(result, *arc)
}
return result, nil
}

func (m *mockARCInterface) Get(_ context.Context, ns, name string) (*osmv1.AlertRelabelConfig, bool, error) {
if arc, ok := m.arcs[ns+"/"+name]; ok {
return arc, true, nil
}
return nil, false, nil
}

func (m *mockARCInterface) Create(_ context.Context, arc osmv1.AlertRelabelConfig) (*osmv1.AlertRelabelConfig, error) {
return &arc, nil
}

func (m *mockARCInterface) Update(_ context.Context, _ osmv1.AlertRelabelConfig) error { return nil }

func (m *mockARCInterface) Delete(_ context.Context, ns, name string) error {
m.deleted = append(m.deleted, ns+"/"+name)
delete(m.arcs, ns+"/"+name)
return nil
}

func newARC(ns, name, ruleID string, annotations, labels map[string]string) *osmv1.AlertRelabelConfig {
if annotations == nil {
annotations = map[string]string{}
}
if ruleID != "" {
annotations[managementlabels.ARCAnnotationAlertRuleIDKey] = ruleID
}
return &osmv1.AlertRelabelConfig{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: ns,
Annotations: annotations,
Labels: labels,
},
}
}

func TestGCOrphanedARCs_DeletesOrphan(t *testing.T) {
mock := &mockARCInterface{
arcs: map[string]*osmv1.AlertRelabelConfig{
"openshift-monitoring/arc-orphan": newARC("openshift-monitoring", "arc-orphan", "rule-gone", nil, nil),
},
}
rrm := &relabeledRulesManager{alertRelabelConfigs: mock}

rrm.gcOrphanedARCs(context.Background(), map[string]struct{}{})

if len(mock.deleted) != 1 || mock.deleted[0] != "openshift-monitoring/arc-orphan" {
t.Fatalf("expected orphan ARC to be deleted, got deleted=%v", mock.deleted)
}
}

func TestGCOrphanedARCs_KeepsLiveRule(t *testing.T) {
mock := &mockARCInterface{
arcs: map[string]*osmv1.AlertRelabelConfig{
"openshift-monitoring/arc-live": newARC("openshift-monitoring", "arc-live", "rule-alive", nil, nil),
},
}
rrm := &relabeledRulesManager{alertRelabelConfigs: mock}

rrm.gcOrphanedARCs(context.Background(), map[string]struct{}{"rule-alive": {}})

if len(mock.deleted) != 0 {
t.Fatalf("expected no deletions, got deleted=%v", mock.deleted)
}
}

func TestGCOrphanedARCs_SkipsGitOpsManaged(t *testing.T) {
mock := &mockARCInterface{
arcs: map[string]*osmv1.AlertRelabelConfig{
"openshift-monitoring/arc-gitops": newARC("openshift-monitoring", "arc-gitops", "rule-gone",
map[string]string{"argocd.argoproj.io/tracking-id": "some-id"}, nil),
},
}
rrm := &relabeledRulesManager{alertRelabelConfigs: mock}

rrm.gcOrphanedARCs(context.Background(), map[string]struct{}{})

if len(mock.deleted) != 0 {
t.Fatalf("expected GitOps-managed ARC to be preserved, got deleted=%v", mock.deleted)
}
}

func TestGCOrphanedARCs_SkipsARCWithoutAnnotation(t *testing.T) {
mock := &mockARCInterface{
arcs: map[string]*osmv1.AlertRelabelConfig{
"openshift-monitoring/arc-manual": newARC("openshift-monitoring", "arc-manual", "", nil, nil),
},
}
rrm := &relabeledRulesManager{alertRelabelConfigs: mock}

rrm.gcOrphanedARCs(context.Background(), map[string]struct{}{})

if len(mock.deleted) != 0 {
t.Fatalf("expected ARC without annotation to be preserved, got deleted=%v", mock.deleted)
}
}

func TestGCOrphanedARCs_MixedScenario(t *testing.T) {
mock := &mockARCInterface{
arcs: map[string]*osmv1.AlertRelabelConfig{
"openshift-monitoring/arc-live": newARC("openshift-monitoring", "arc-live", "rule-1", nil, nil),
"openshift-monitoring/arc-orphan1": newARC("openshift-monitoring", "arc-orphan1", "rule-deleted-1", nil, nil),
"openshift-monitoring/arc-orphan2": newARC("openshift-monitoring", "arc-orphan2", "rule-deleted-2", nil, nil),
"openshift-monitoring/arc-gitops": newARC("openshift-monitoring", "arc-gitops", "rule-deleted-3",
map[string]string{"argocd.argoproj.io/tracking-id": "t"}, nil),
"openshift-monitoring/arc-manual": newARC("openshift-monitoring", "arc-manual", "", nil, nil),
},
}
rrm := &relabeledRulesManager{alertRelabelConfigs: mock}

liveIDs := map[string]struct{}{"rule-1": {}}
rrm.gcOrphanedARCs(context.Background(), liveIDs)

deletedSet := map[string]bool{}
for _, d := range mock.deleted {
deletedSet[d] = true
}

if len(mock.deleted) != 2 {
t.Fatalf("expected 2 deletions, got %d: %v", len(mock.deleted), mock.deleted)
}
if !deletedSet["openshift-monitoring/arc-orphan1"] {
t.Error("expected arc-orphan1 to be deleted")
}
if !deletedSet["openshift-monitoring/arc-orphan2"] {
t.Error("expected arc-orphan2 to be deleted")
}
if deletedSet["openshift-monitoring/arc-live"] {
t.Error("arc-live should not have been deleted")
}
if deletedSet["openshift-monitoring/arc-gitops"] {
t.Error("arc-gitops should not have been deleted (GitOps-managed)")
}
if deletedSet["openshift-monitoring/arc-manual"] {
t.Error("arc-manual should not have been deleted (no annotation)")
}
}

func TestGCOrphanedARCs_NilInterface(t *testing.T) {
rrm := &relabeledRulesManager{alertRelabelConfigs: nil}
// Should not panic
rrm.gcOrphanedARCs(context.Background(), map[string]struct{}{})
}
19 changes: 13 additions & 6 deletions pkg/k8s/relabeled_rules.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ func newRelabeledRulesManager(ctx context.Context, namespaceManager NamespaceInt
return nil, fmt.Errorf("failed to sync RelabeledRulesConfig informer")
}

if err := rrm.sync(ctx); err != nil {
if err := rrm.sync(ctx, "initial-sync"); err != nil {
return nil, fmt.Errorf("initial relabeled rules sync failed: %w", err)
}

Expand Down Expand Up @@ -179,7 +179,7 @@ func (rrm *relabeledRulesManager) processNextWorkItem(ctx context.Context) bool

defer rrm.queue.Done(key)

if err := rrm.sync(ctx); err != nil {
if err := rrm.sync(ctx, key); err != nil {
log.Errorf("error syncing relabeled rules: %v", err)
rrm.queue.AddRateLimited(key)
return true
Expand All @@ -190,7 +190,7 @@ func (rrm *relabeledRulesManager) processNextWorkItem(ctx context.Context) bool
return true
}

func (rrm *relabeledRulesManager) sync(ctx context.Context) error {
func (rrm *relabeledRulesManager) sync(ctx context.Context, key string) error {
relabelConfigs, err := rrm.loadRelabelConfigs()
if err != nil {
return fmt.Errorf("failed to load relabel configs: %w", err)
Expand All @@ -200,13 +200,20 @@ func (rrm *relabeledRulesManager) sync(ctx context.Context) error {
rrm.relabelConfigs = relabelConfigs
rrm.mu.Unlock()

alerts := rrm.collectAlerts(ctx, relabelConfigs)
alerts, allRuleIDs := rrm.collectAlerts(ctx, relabelConfigs)

rrm.mu.Lock()
rrm.relabeledRules = alerts
rrm.mu.Unlock()

log.Infof("Synced %d relabeled rules in memory", len(alerts))

// GC orphaned ARCs only when triggered by PrometheusRule events or
// initial sync — secret-only changes cannot create orphans.
if key == "prometheus-rule-sync" || key == "initial-sync" {
rrm.gcOrphanedARCs(ctx, allRuleIDs)
}

return nil
}

Expand Down Expand Up @@ -255,7 +262,7 @@ func (rrm *relabeledRulesManager) loadRelabelConfigs() ([]*relabel.Config, error
return configs, nil
}

func (rrm *relabeledRulesManager) collectAlerts(ctx context.Context, relabelConfigs []*relabel.Config) map[string]monitoringv1.Rule {
func (rrm *relabeledRulesManager) collectAlerts(ctx context.Context, relabelConfigs []*relabel.Config) (map[string]monitoringv1.Rule, map[string]struct{}) {
alerts := make(map[string]monitoringv1.Rule)
seenIDs := make(map[string]struct{})

Expand Down Expand Up @@ -330,7 +337,7 @@ func (rrm *relabeledRulesManager) collectAlerts(ctx context.Context, relabelConf
}

log.Debugf("Collected %d alerts", len(alerts))
return alerts
return alerts, seenIDs
}

// alertingRuleOwner returns the name of the AlertingRule CR that generated
Expand Down
Loading