Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions deployments/kai-scheduler/crds/kai.scheduler_schedulingshards.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,16 @@ spec:
spec:
description: SchedulingShardSpec defines the desired state of SchedulingShard
properties:
actions:
description: |-
Actions defines the scheduler actions to run in order. If not specified or empty, defaults to
"allocate, consolidation, reclaim, preempt, stalegangeviction" (consolidation is
excluded when the GPU or CPU placement strategy is spread).
Available actions: allocate, consolidation, reclaim, preempt, stalegangeviction
Example: ["allocate", "reclaim", "preempt"] to disable stalegangeviction and consolidation.
items:
type: string
type: array
args:
additionalProperties:
type: string
Expand Down
4 changes: 4 additions & 0 deletions deployments/kai-scheduler/templates/default-shard.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,7 @@ spec:
gpu: {{ .Values.scheduler.placementStrategy }}
cpu: {{ .Values.scheduler.placementStrategy }}
{{- end }}
{{- if .Values.scheduler.actions }}
actions:
{{- toYaml .Values.scheduler.actions | nindent 4 }}
{{- end }}
6 changes: 6 additions & 0 deletions deployments/kai-scheduler/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,12 @@ scheduler:
pullPolicy: IfNotPresent
# tag: "" # Optional: Override global.tag or Chart.AppVersion
placementStrategy: binpack
# actions defines the scheduler actions to run in order.
# If empty, defaults to: allocate, consolidation, reclaim, preempt, stalegangeviction
# (consolidation is excluded when using spread placement strategy)
# Available actions: allocate, consolidation, reclaim, preempt, stalegangeviction
# Example to disable stalegangeviction: ["allocate", "consolidation", "reclaim", "preempt"]
actions: []
ports:
metricsPort: 8080

Expand Down
8 changes: 8 additions & 0 deletions pkg/apis/kai/v1/schedulingshard_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@ const (

// SchedulingShardSpec defines the desired state of SchedulingShard
type SchedulingShardSpec struct {
// Actions defines the scheduler actions to run in order. If not specified or empty, defaults to
// "allocate, consolidation, reclaim, preempt, stalegangeviction" (consolidation is
// excluded when the GPU or CPU placement strategy is spread).
// Available actions: allocate, consolidation, reclaim, preempt, stalegangeviction
// Example: ["allocate", "reclaim", "preempt"] to disable stalegangeviction and consolidation.
// +kubebuilder:validation:Optional
Actions []string `json:"actions,omitempty"`

// Args specifies custom CLI arguments for the scheduler. These are merged with automatically generated flags.
// Valid flags are those defined in the scheduler's code. Usage examples:
// - To pass "--custom-flag=value": Args: {"custom-flag": "value"}
Expand Down
5 changes: 5 additions & 0 deletions pkg/apis/kai/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 12 additions & 4 deletions pkg/operator/operands/scheduler/resources_for_shard.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"github.com/NVIDIA/KAI-scheduler/pkg/operator/operands/common"
usagedbapi "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/cache/usagedb/api"
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/conf"
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/framework"
)

const (
Expand Down Expand Up @@ -129,11 +130,18 @@ func (s *SchedulerForShard) configMapForShard(
placementArguments := calculatePlacementArguments(shard.Spec.PlacementStrategy)
innerConfig := conf.SchedulerConfiguration{}

actions := []string{"allocate"}
if placementArguments[gpuResource] != spreadStrategy && placementArguments[cpuResource] != spreadStrategy {
actions = append(actions, "consolidation")
var actions []string
if len(shard.Spec.Actions) > 0 {
// Use custom actions from shard spec
actions = shard.Spec.Actions
} else {
// Default actions
actions = []string{string(framework.Allocate)}
if placementArguments[gpuResource] != spreadStrategy && placementArguments[cpuResource] != spreadStrategy {
actions = append(actions, string(framework.Consolidation))
}
actions = append(actions, string(framework.Reclaim), string(framework.Preempt), string(framework.StaleGangEviction))
}
Comment on lines +134 to 144
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Our convention is to set the defaults on the type (SetDefaultsWhereNeeded) and then we don't need them here.
Also please try to avoid obvious comments - they waste tokens.

actions = append(actions, []string{"reclaim", "preempt", "stalegangeviction"}...)

innerConfig.Actions = strings.Join(actions, ", ")

Expand Down