Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
69 commits
Select commit Hold shift + click to select a range
9bd8462
Add rulecheck CRD
baowj-678 Mar 11, 2024
280b232
init the framework
Mar 11, 2024
6c0b84e
Merge pull request #1 from wang-mask/rulecheck-crd
baowj-678 Mar 12, 2024
885177d
Update rulecheck CRD
baowj-678 Mar 12, 2024
345b80b
Merge branch 'init-framework' into rulecheck-crd
baowj-678 Mar 12, 2024
24bc5f2
Merge pull request #2 from wang-mask/rulecheck-crd
baowj-678 Mar 12, 2024
dab565f
add label field to the rulecheck and cgroupnofity config filed
Mar 12, 2024
5bbfad3
add update rulecheck and cgroupnotify func
Mar 12, 2024
4780e16
add rulecheck crd yaml
baowj-678 Mar 13, 2024
a36ffd1
Merge pull request #3 from wang-mask/rulecheck-crd
baowj-678 Mar 13, 2024
f56b787
fix update bug
Mar 13, 2024
ee063d5
cgroup merge init-framwork
yanxiaoqi932 Mar 14, 2024
c2c4de8
merge init with cgroup
Mar 15, 2024
f20ea63
merge init-framework
yanxiaoqi932 Mar 15, 2024
740428f
Update rulecheck CRD
baowj-678 Mar 15, 2024
5cc5692
Merge pull request #5 from wang-mask/dev-rulecheck
baowj-678 Mar 15, 2024
dba1937
add nodeinformer and fix cgroupinformer
Mar 16, 2024
cecf757
init the process framework
Mar 16, 2024
a80bc98
fix []string type
Mar 16, 2024
cb0d582
add node event
Mar 17, 2024
e8afda4
update informer
Mar 17, 2024
b965824
add priority to rulecheck crd
baowj-678 Mar 18, 2024
68bbee7
Merge pull request #6 from wang-mask/dev-rulecheck
baowj-678 Mar 18, 2024
1c0fe51
update updateRuleCheckConfig func
baowj-678 Mar 18, 2024
03b03b5
delete comment
Mar 18, 2024
fe1b1a9
delete rwmutux
Mar 18, 2024
517511a
update the framework and filter the event
Mar 18, 2024
e20e919
add init the manager
Mar 18, 2024
abd5b72
add rulecheck rbac
baowj-678 Mar 18, 2024
f409ab8
init empty HealthCheckConfig
baowj-678 Mar 18, 2024
e352725
Merge branch 'dev' of github.com:wang-mask/caelus into dev
baowj-678 Mar 18, 2024
666ac6c
fix rulecheck CRD yaml bug.
baowj-678 Mar 19, 2024
9fa6cb7
fix rulecheck nil bug.
baowj-678 Mar 19, 2024
159e7cb
fix group version error.
baowj-678 Mar 19, 2024
acaf1ba
update RuleCheck DetectAction Args type
baowj-678 Mar 20, 2024
1772e83
add RuleCheckAvailableFunc func
baowj-678 Mar 20, 2024
7ad6ad3
update rulecheck crd nodeselector type
baowj-678 Mar 20, 2024
8589974
fix tyep to *type and change label to nodeselector
Mar 20, 2024
9cd54f2
Merge remote-tracking branch 'refs/remotes/origin/dev' into dev
Mar 20, 2024
6cf0dda
fix
Mar 20, 2024
3bf0872
update SyncPeriod to 0
Mar 21, 2024
98117b8
add func to function parameters
Mar 21, 2024
3633c8e
fix updateRuleCheckConfig bug
baowj-678 Mar 21, 2024
e7c1cb9
add register.go
baowj-678 Mar 21, 2024
2864a9b
cgroupnotify merge
yanxiaoqi932 Mar 21, 2024
e1e7a80
add cgroupnotifies.yaml
yanxiaoqi932 Mar 21, 2024
682f6c7
update rulecheck priority type to int32
baowj-678 Mar 22, 2024
ffe8445
move crd yaml files to hack/yaml/caelus.yaml
baowj-678 Mar 22, 2024
577e5f1
merge rulecheck & cgroupnotify informer factory
baowj-678 Mar 22, 2024
56d1a4f
list all namespaces
baowj-678 Mar 24, 2024
d269658
modify priority type from *int to int
yanxiaoqi932 Mar 26, 2024
4184413
update readme
baowj-678 Mar 27, 2024
0f8b472
fix rulecheck time interval bug
baowj-678 Mar 27, 2024
af2b078
set status nil
yanxiaoqi932 Mar 27, 2024
9f4d6bf
modify rbac and rules.md
yanxiaoqi932 Mar 28, 2024
0b4449c
modify: delete cgroup.yaml
yanxiaoqi932 Mar 28, 2024
0a1f803
make rulecheck priority optional
baowj-678 Mar 28, 2024
1d371a6
add omitempty and modify nil pointer bug
yanxiaoqi932 Mar 28, 2024
9e593bd
add omitempty
yanxiaoqi932 Mar 28, 2024
e3b6856
modify newMemoryNotifyConfig
yanxiaoqi932 Mar 28, 2024
369227b
modify caelus.yaml
yanxiaoqi932 Mar 28, 2024
ce1e5ca
rulecheck omitempty
yanxiaoqi932 Mar 28, 2024
9ee903f
add error info
Mar 29, 2024
abd97d1
update readme
Mar 29, 2024
9d881fb
update role v1beta1 to v1
Mar 29, 2024
89b8a1c
remove rulecheck config labels and useless unit test.
baowj-678 Mar 29, 2024
ff5b800
delete types.NotifyConfig.Labels
yanxiaoqi932 Mar 29, 2024
2e52e7d
delete TODO comment
Apr 1, 2024
46e0d4e
Merge remote-tracking branch 'refs/remotes/origin/dev' into dev
Apr 1, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions cmd/caelus/app/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
"github.com/tencent/caelus/pkg/caelus/checkpoint"
"github.com/tencent/caelus/pkg/caelus/cpi"
"github.com/tencent/caelus/pkg/caelus/diskquota"
"github.com/tencent/caelus/pkg/caelus/healthcheck"
health "github.com/tencent/caelus/pkg/caelus/healthcheck"
"github.com/tencent/caelus/pkg/caelus/healthcheck/conflict"
"github.com/tencent/caelus/pkg/caelus/metrics"
"github.com/tencent/caelus/pkg/caelus/online"
Expand Down Expand Up @@ -244,9 +244,13 @@ func (o *options) initModules(caelus *types.CaelusConfig, ctx *context.CaelusCon
o.ApiOption.loadStatsMetric(metrics.StatsMetricDiskQuota, diskquotaManager)
}
// health check manager
healthCheckManager = health.NewHealthManager(types.InitHealthCheckConfigFunc(&caelus.Metrics.Node,
&caelus.Predicts[0].ReserveResource), stStore, resourceManager, qosManager,
conflictMn, podInformer)
nodeInformer := ctx.GetNodeFactory().Core().V1().Nodes()
ruleCheckInformer := ctx.GetCaelusFactory().Caelus().V1().RuleChecks()
cgroupNotifyInformer := ctx.GetCaelusFactory().Caelus().V1().CgroupNotifies()

healthCheckManager = health.NewHealthManager(stStore, resourceManager, qosManager, conflictMn, podInformer,
nodeInformer, ruleCheckInformer, cgroupNotifyInformer, &caelus.Predicts[0].ReserveResource,
types.RuleCheckAvailableFunc(&caelus.Metrics.Node, &caelus.Predicts[0].ReserveResource))
modules = append(modules, healthCheckManager)

return modules
Expand Down
65 changes: 54 additions & 11 deletions cmd/caelus/context/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,29 @@ package context
import (
"time"

caelusclient "github.com/tencent/caelus/pkg/generated/clientset/versioned"
caelusfake "github.com/tencent/caelus/pkg/generated/clientset/versioned/fake"
caelusinformers "github.com/tencent/caelus/pkg/generated/informers/externalversions"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/client-go/informers"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/fake"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/klog/v2"
)

// CaelusContext stores k8s client and factory
// CaelusContext holds the kubernetes and caelus clients together with their informer factories.
type CaelusContext struct {
// Master and Kubeconfig are passed to clientcmd.BuildConfigFromFlags when the clients are built.
Master string
Kubeconfig string
// NodeName is presumably the name of the node this agent runs on — TODO confirm against callers.
NodeName string
// kubeClient is the kubernetes clientset; lazyInit falls back to a fake clientset when no kube config can be built.
kubeClient clientset.Interface
// caelusClient is the clientset for caelus custom resources; lazyInit applies the same fake fallback to it.
caelusClient caelusclient.Interface
// nodeFactory and podFactory are the informer factories handed out by GetNodeFactory and GetPodFactory.
nodeFactory, podFactory informers.SharedInformerFactory
// caelusFactory is the informer factory handed out by GetCaelusFactory.
caelusFactory caelusinformers.SharedInformerFactory
}

const (
Expand All @@ -45,17 +52,35 @@ const (

// lazyInit builds the kubernetes client and the caelus client if they are not set yet
func (c *CaelusContext) lazyInit() {
if c.kubeClient != nil {
return
var kubeconfig *rest.Config = nil
var err error
if c.kubeClient == nil {
kubeconfig, err = clientcmd.BuildConfigFromFlags(c.Master, c.Kubeconfig)
if err != nil {
klog.Warning(err)
klog.Warning("fall back to creating fake kube-client")
// create a fake client to test caelus without k8s
c.kubeClient = fake.NewSimpleClientset()
} else {
c.kubeClient = clientset.NewForConfigOrDie(kubeconfig)
}

}
kubeconfig, err := clientcmd.BuildConfigFromFlags(c.Master, c.Kubeconfig)
if err != nil {
klog.Warning(err)
klog.Warning("fall back to creating fake kube-client")
// create a fake client to test caelus without k8s
c.kubeClient = fake.NewSimpleClientset()
} else {
c.kubeClient = clientset.NewForConfigOrDie(kubeconfig)

// init caelus client
if c.caelusClient == nil {
err = nil
if kubeconfig == nil {
kubeconfig, err = clientcmd.BuildConfigFromFlags(c.Master, c.Kubeconfig)
}
if err != nil {
klog.Warning(err)
klog.Warning("fall back to creating fake caelus-client")
// create a fake client to test caelus without k8s
c.caelusClient = caelusfake.NewSimpleClientset()
} else {
c.caelusClient = caelusclient.NewForConfigOrDie(kubeconfig)
}
}
}

Expand All @@ -65,6 +90,12 @@ func (c *CaelusContext) GetKubeClient() clientset.Interface {
return c.kubeClient
}

// GetCaelusClient returns the caelus clientset, building it on first use through lazyInit.
// When no kube config can be built, lazyInit installs a fake caelus clientset instead.
func (c *CaelusContext) GetCaelusClient() caelusclient.Interface {
c.lazyInit()
return c.caelusClient
}

// GetPodFactory returns pod factory
func (c *CaelusContext) GetPodFactory() informers.SharedInformerFactory {
if c.podFactory == nil {
Expand All @@ -79,6 +110,14 @@ func (c *CaelusContext) GetPodFactory() informers.SharedInformerFactory {
return c.podFactory
}

// GetCaelusFactory returns the shared informer factory for caelus custom
// resources. The factory is created on the first call, from the caelus client
// and with 0 passed as the resync period, and is reused on later calls.
func (c *CaelusContext) GetCaelusFactory() caelusinformers.SharedInformerFactory {
	if c.caelusFactory != nil {
		return c.caelusFactory
	}

	client := c.GetCaelusClient()
	c.caelusFactory = caelusinformers.NewSharedInformerFactoryWithOptions(client, 0)
	return c.caelusFactory
}

// GetNodeFactory returns node factory
func (c *CaelusContext) GetNodeFactory() informers.SharedInformerFactory {
if c.nodeFactory == nil {
Expand All @@ -105,4 +144,8 @@ func (c *CaelusContext) Run(stop <-chan struct{}) {
c.nodeFactory.Start(stop)
c.nodeFactory.WaitForCacheSync(stop)
}
if c.caelusFactory != nil {
c.caelusFactory.Start(stop)
c.caelusFactory.WaitForCacheSync(stop)
}
}
102 changes: 98 additions & 4 deletions doc/rules.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
# Detection configuration

Caelus dynamically checks abnormalities of various metrics based on [rules.json](../hack/config/rules.json), such as CPU
Based on the `RuleCheck` and `CgroupNotify` CRDs, Caelus dynamically checks abnormalities of various metrics,
such as CPU
usage or online latency, to make sure online jobs run normally. Batch jobs will be throttled or even killed if
interference is detected.
This document describes how to configure [rules.json](../hack/config/rules.json)
This document describes how to configure these CRDs.

> **Attention**: Multiple `RuleCheck` or `CgroupNotify` objects can exist in a cluster; they are merged based on their configured priority and creation time.

## 1.Rules
Rules are used to assign the algorithm to check if the timed metrics have significant fluctuations, such as EWMA,
Expand Down Expand Up @@ -103,7 +106,7 @@ type NodeCpu struct {
...
}
```
Node rules example:
Node rules json example:
```json
{
"name": "cpu",
Expand Down Expand Up @@ -145,6 +148,46 @@ type NodeCpu struct {
]
}
```
Node rules yaml example:
```yaml
apiVersion: caelus.io/v1
kind: RuleCheck
metadata:
name: caelus-test1
namespace: caelus-system
spec:
name: cpu
priority: 90
nodeSelector:
disktype: "ssd"
type: node
metrics:
- cpu_avg
checkInterval: 10
handleInterval: 10
recoverInterval: 15
rules:
- detects:
- name: expression
args: |-
{
"expression": "auto",
"warning_count": 10,
"warning_duration": "10s"
}
- actions:
- name: adjust
args: |-
{
"resources": [
{
"step": "1000m"
}]
}
- name: schedule
args: |-
{}
```

### Container rules
Container rules describe how to detect metrics of container level, the supported metrics could be found from
Expand All @@ -158,7 +201,7 @@ type CgroupStats struct {
}
```

Container rules example:
Container rules json example:
```json
{
"metrics": [
Expand All @@ -181,6 +224,34 @@ type CgroupStats struct {
]
}
```

Container rules yaml example:
```yaml
apiVersion: caelus.io/v1
kind: RuleCheck
metadata:
name: caelus-test2
namespace: caelus-system
spec:
name: cpu
priority: 90
nodeSelector:
disktype: "ssd"
type: container
metrics:
- nr_cpu_throttled
checkInterval: 5
handleInterval: 10
recoverInterval: 15
rules:
- detects:
- name: expression
args: |
{
"expression": "nr_cpu_throttled > 0"
}
```

### App rules
App rules describe how to detect metrics of app level, the metrics are provided by users themselves, in the way of
executable command or http server, as shown in the following example:
Expand Down Expand Up @@ -336,3 +407,26 @@ type CgroupStats struct {
]
}
```

Memory cgroup event yaml example:
```yaml
apiVersion: caelus.io/v1
kind: CgroupNotify
metadata:
name: caelus-test1
namespace: kube-system
spec:
priority: 90
nodeSelector:
type: "memory"
memory_cgroup:
pressures:
- cgroups: ["/kubepods/offline"]
pressure_level: "low"
duration: 3
count: 2
usages:
- cgroups: ["/kubepods/offline/test"]
margin_mb: 2048
duration: 60000
```
13 changes: 9 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ require (
github.com/docker/engine-api v0.4.0
github.com/emicklei/go-restful v2.9.5+incompatible
github.com/fatih/structs v1.1.0
github.com/fsnotify/fsnotify v1.6.0
github.com/google/cadvisor v0.46.0
github.com/guillermo/go.procmeminfo v0.0.0-20131127224636-be4355a9fb0e
github.com/json-iterator/go v1.1.12
Expand All @@ -22,15 +21,17 @@ require (
github.com/spf13/cobra v1.6.0
github.com/spf13/pflag v1.0.5
github.com/vishvananda/netlink v1.1.1-0.20200915183220-339a215d6544
golang.org/x/net v0.3.1-0.20221206200815-1e63c2f08a10
golang.org/x/net v0.8.0
golang.org/x/sys v0.6.0
gotest.tools v2.2.0+incompatible
k8s.io/api v0.26.0
k8s.io/apimachinery v0.26.0
k8s.io/apiserver v0.26.0
k8s.io/autoscaler/vertical-pod-autoscaler v0.10.0
k8s.io/client-go v0.26.0
k8s.io/code-generator v0.29.3
k8s.io/component-base v0.26.0
k8s.io/klog v1.0.0
k8s.io/klog/v2 v2.80.1
k8s.io/kubernetes v1.26.0
k8s.io/utils v0.0.0-20221107191617-1a15be271d1d
Expand All @@ -56,6 +57,7 @@ require (
github.com/emicklei/go-restful/v3 v3.9.0 // indirect
github.com/euank/go-kmsg-parser v2.0.0+incompatible // indirect
github.com/evanphx/json-patch v4.12.0+incompatible // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/go-logr/logr v1.2.3 // indirect
github.com/go-ole/go-ole v1.2.4 // indirect
github.com/go-openapi/jsonpointer v0.19.5 // indirect
Expand Down Expand Up @@ -95,15 +97,18 @@ require (
github.com/stretchr/testify v1.8.0 // indirect
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // indirect
github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae // indirect
golang.org/x/mod v0.9.0 // indirect
golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b // indirect
golang.org/x/term v0.3.0 // indirect
golang.org/x/text v0.5.0 // indirect
golang.org/x/term v0.6.0 // indirect
golang.org/x/text v0.8.0 // indirect
golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect
golang.org/x/tools v0.7.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/protobuf v1.28.1 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/gengo v0.0.0-20220902162205-c0856e24416d // indirect
k8s.io/kube-openapi v0.0.0-20221012153701-172d655c2280 // indirect
moul.io/http2curl v1.0.0 // indirect
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect
Expand Down
Loading