diff --git a/deployments/kai-scheduler/crds/kai.scheduler_schedulingshards.yaml b/deployments/kai-scheduler/crds/kai.scheduler_schedulingshards.yaml index 171bf1649..f71916d98 100644 --- a/deployments/kai-scheduler/crds/kai.scheduler_schedulingshards.yaml +++ b/deployments/kai-scheduler/crds/kai.scheduler_schedulingshards.yaml @@ -45,6 +45,16 @@ spec: spec: description: SchedulingShardSpec defines the desired state of SchedulingShard properties: + actions: + description: |- + Actions defines the scheduler actions to run in order. If not specified, defaults to + "allocate, consolidation, reclaim, preempt, stalegangeviction" (consolidation is + excluded when using spread placement strategy). + Available actions: allocate, consolidation, reclaim, preempt, stalegangeviction + Example: ["allocate", "reclaim", "preempt"] to disable stalegangeviction and consolidation. + items: + type: string + type: array args: additionalProperties: type: string diff --git a/deployments/kai-scheduler/templates/default-shard.yaml b/deployments/kai-scheduler/templates/default-shard.yaml index cdb828890..9c452f8eb 100644 --- a/deployments/kai-scheduler/templates/default-shard.yaml +++ b/deployments/kai-scheduler/templates/default-shard.yaml @@ -14,3 +14,7 @@ spec: gpu: {{ .Values.scheduler.placementStrategy }} cpu: {{ .Values.scheduler.placementStrategy }} {{- end }} + {{- if .Values.scheduler.actions }} + actions: + {{- toYaml .Values.scheduler.actions | nindent 4 }} + {{- end }} diff --git a/deployments/kai-scheduler/values.yaml b/deployments/kai-scheduler/values.yaml index 93ea5a389..8bc4775f3 100644 --- a/deployments/kai-scheduler/values.yaml +++ b/deployments/kai-scheduler/values.yaml @@ -75,6 +75,12 @@ scheduler: pullPolicy: IfNotPresent # tag: "" # Optional: Override global.tag or Chart.AppVersion placementStrategy: binpack + # actions defines the scheduler actions to run in order. + # If empty, defaults to: allocate, consolidation, reclaim, preempt, stalegangeviction + # (consolidation is excluded when using spread placement strategy) + # Available actions: allocate, consolidation, reclaim, preempt, stalegangeviction + # Example to disable stalegangeviction: ["allocate", "consolidation", "reclaim", "preempt"] + actions: [] ports: metricsPort: 8080 diff --git a/pkg/apis/kai/v1/schedulingshard_types.go b/pkg/apis/kai/v1/schedulingshard_types.go index d310537d3..311ca4bb6 100644 --- a/pkg/apis/kai/v1/schedulingshard_types.go +++ b/pkg/apis/kai/v1/schedulingshard_types.go @@ -32,6 +32,14 @@ const ( // SchedulingShardSpec defines the desired state of SchedulingShard type SchedulingShardSpec struct { + // Actions defines the scheduler actions to run in order. If not specified, defaults to + // "allocate, consolidation, reclaim, preempt, stalegangeviction" (consolidation is + // excluded when using spread placement strategy). + // Available actions: allocate, consolidation, reclaim, preempt, stalegangeviction + // Example: ["allocate", "reclaim", "preempt"] to disable stalegangeviction and consolidation. + // +kubebuilder:validation:Optional + Actions []string `json:"actions,omitempty"` + // Args specifies custom CLI arguments for the scheduler. These are merged with automatically generated flags. // Valid flags are those defined in the scheduler's code. Usage examples: // - To pass "--custom-flag=value": Args: {"custom-flag": "value"} diff --git a/pkg/apis/kai/v1/zz_generated.deepcopy.go b/pkg/apis/kai/v1/zz_generated.deepcopy.go index 3278767b2..e8deb9f99 100644 --- a/pkg/apis/kai/v1/zz_generated.deepcopy.go +++ b/pkg/apis/kai/v1/zz_generated.deepcopy.go @@ -394,6 +394,11 @@ func (in *SchedulingShardList) DeepCopyObject() runtime.Object { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SchedulingShardSpec) DeepCopyInto(out *SchedulingShardSpec) { *out = *in + if in.Actions != nil { + in, out := &in.Actions, &out.Actions + *out = make([]string, len(*in)) + copy(*out, *in) + } if in.Args != nil { in, out := &in.Args, &out.Args *out = make(map[string]string, len(*in)) diff --git a/pkg/operator/operands/scheduler/resources_for_shard.go b/pkg/operator/operands/scheduler/resources_for_shard.go index 28935ce1b..e9276c9d5 100644 --- a/pkg/operator/operands/scheduler/resources_for_shard.go +++ b/pkg/operator/operands/scheduler/resources_for_shard.go @@ -25,6 +25,7 @@ import ( "github.com/NVIDIA/KAI-scheduler/pkg/operator/operands/common" usagedbapi "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/cache/usagedb/api" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/conf" + "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/framework" ) const ( @@ -129,11 +130,18 @@ func (s *SchedulerForShard) configMapForShard( placementArguments := calculatePlacementArguments(shard.Spec.PlacementStrategy) innerConfig := conf.SchedulerConfiguration{} - actions := []string{"allocate"} - if placementArguments[gpuResource] != spreadStrategy && placementArguments[cpuResource] != spreadStrategy { - actions = append(actions, "consolidation") + var actions []string + if len(shard.Spec.Actions) > 0 { + // Use custom actions from shard spec + actions = shard.Spec.Actions + } else { + // Default actions + actions = []string{string(framework.Allocate)} + if placementArguments[gpuResource] != spreadStrategy && placementArguments[cpuResource] != spreadStrategy { + actions = append(actions, string(framework.Consolidation)) + } + actions = append(actions, string(framework.Reclaim), string(framework.Preempt), string(framework.StaleGangEviction)) } - actions = append(actions, []string{"reclaim", "preempt", "stalegangeviction"}...) innerConfig.Actions = strings.Join(actions, ", ")