Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,8 @@ helm-k8s: helmify manifests kustomize clean-helm gen-kmm-charts gen-remediation-
$(MAKE) helm-docs
echo "dependency update, lint and pack charts"
cd $(shell pwd)/helm-charts-k8s; helm dependency update; helm lint .; cd ..; helm package helm-charts-k8s/ --destination ./helm-charts-k8s
mv $(shell pwd)/helm-charts-k8s/gpu-operator-charts-$(HELM_CHART_VERSION).tgz $(GPU_OPERATOR_CHART)
# Avoid $(GPU_OPERATOR_CHART) here: when exported as the chart dir, mv would be a no-op/same-file error.
mv $(shell pwd)/helm-charts-k8s/gpu-operator-charts-$(HELM_CHART_VERSION).tgz $(shell pwd)/helm-charts-k8s/$(HELM_OUTPUT_FILE_NAME)

.PHONY: bundle-build
bundle-build: operator-sdk manifests kustomize ## OpenShift Build OLM bundle.
Expand Down Expand Up @@ -602,6 +603,10 @@ helmify:
# Installs the chart referenced by GPU_OPERATOR_CHART as Helm release "amd-gpu-operator"
# in namespace kube-amd-gpu (--create-namespace creates it if missing), with
# helm-charts-k8s/values.yaml as the values file.
# The trailing *_CMD variables are defined outside this rule; presumably each expands
# to optional extra helm flags or to nothing -- confirm at their definitions.
helm-install: ## Deploy Helm Charts.
helm install -f helm-charts-k8s/values.yaml amd-gpu-operator ${GPU_OPERATOR_CHART} -n kube-amd-gpu --create-namespace ${SKIP_NFD_CMD} ${SKIP_KMM_CMD} ${SKIP_REMEDIATION_CONTROLLER_CMD} ${HELM_OC_CMD} ${SIM_ENABLE_CMD} ${SKIP_INSTALL_DEFAULT_CR_CMD}

# Re-runnable variant of helm-install: `helm upgrade --install` installs release
# "amd-gpu-operator" when it does not exist and upgrades it in place when it does.
# It passes the same values file, chart reference, namespace and optional *_CMD
# variables as helm-install.
.PHONY: helm-upgrade-install
helm-upgrade-install: ## Same as helm-install but safe when release amd-gpu-operator already exists.
helm upgrade --install -f helm-charts-k8s/values.yaml amd-gpu-operator ${GPU_OPERATOR_CHART} -n kube-amd-gpu --create-namespace ${SKIP_NFD_CMD} ${SKIP_KMM_CMD} ${SKIP_REMEDIATION_CONTROLLER_CMD} ${HELM_OC_CMD} ${SIM_ENABLE_CMD} ${SKIP_INSTALL_DEFAULT_CR_CMD}

.PHONY: helm-uninstall
helm-uninstall-k8s: ## Undeploy Helm Charts.
echo "Deleting all device configs before uninstalling operator..."
Expand Down
4 changes: 3 additions & 1 deletion api/v1alpha1/deviceconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,9 @@ type ConfigManagerSpec struct {
// +optional
ImageRegistrySecret *v1.LocalObjectReference `json:"imageRegistrySecret,omitempty"`

// config map to customize the config for config manager, if not specified default config will be applied
// ConfigMap holding DCM config.json. When set, the operator mounts this ConfigMap and does not create it.
// When omitted or name is empty, the operator mounts ConfigMap "default-dcm-config" and creates it in the
// DeviceConfig namespace if it does not already exist (same default payload as chart defaultDCMConfigMap).
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Config",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:configmap"}
// +optional
Config *v1.LocalObjectReference `json:"config,omitempty"`
Expand Down
10 changes: 6 additions & 4 deletions bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ metadata:
]
capabilities: Seamless Upgrades
categories: AI/Machine Learning,Monitoring
containerImage: docker.io/rocm/amd-gpu-operator:dev
createdAt: "2026-04-02T12:26:30Z"
containerImage: registry.test.pensando.io:5000/amd-gpu-operator:dev
createdAt: "2026-04-06T08:31:30Z"
description: |-
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
Expand Down Expand Up @@ -127,8 +127,10 @@ spec:
path: configManager
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:configManager
- description: config map to customize the config for config manager, if not
specified default config will be applied
- description: ConfigMap holding DCM config.json. When set, the operator mounts
this ConfigMap and does not create it. When omitted or name is empty, the
operator mounts ConfigMap "default-dcm-config" and creates it in the DeviceConfig
namespace if it does not already exist (same default payload as chart defaultDCMConfigMap).
displayName: Config
path: configManager.config
x-descriptors:
Expand Down
6 changes: 4 additions & 2 deletions bundle/manifests/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,10 @@ spec:
description: config manager
properties:
config:
description: config map to customize the config for config manager,
if not specified default config will be applied
description: |-
ConfigMap holding DCM config.json. When set, the operator mounts this ConfigMap and does not create it.
When omitted or name is empty, the operator mounts ConfigMap "default-dcm-config" and creates it in the
DeviceConfig namespace if it does not already exist (same default payload as chart defaultDCMConfigMap).
properties:
name:
default: ""
Expand Down
6 changes: 4 additions & 2 deletions config/crd/bases/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,10 @@ spec:
description: config manager
properties:
config:
description: config map to customize the config for config manager,
if not specified default config will be applied
description: |-
ConfigMap holding DCM config.json. When set, the operator mounts this ConfigMap and does not create it.
When omitted or name is empty, the operator mounts ConfigMap "default-dcm-config" and creates it in the
DeviceConfig namespace if it does not already exist (same default payload as chart defaultDCMConfigMap).
properties:
name:
default: ""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,10 @@ spec:
path: configManager
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:configManager
- description: config map to customize the config for config manager, if not
specified default config will be applied
- description: ConfigMap holding DCM config.json. When set, the operator mounts
this ConfigMap and does not create it. When omitted or name is empty, the
operator mounts ConfigMap "default-dcm-config" and creates it in the DeviceConfig
namespace if it does not already exist (same default payload as chart defaultDCMConfigMap).
displayName: Config
path: configManager.config
x-descriptors:
Expand Down
12 changes: 12 additions & 0 deletions docs/dcm/device-config-manager-configmap.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,18 @@

The Device Config Manager (DCM) monitors for and applies different configurations on the nodes in your cluster. This is done by defining different profiles that can then be applied to each node in your cluster. As such, DCM relies on a Kubernetes ConfigMap that contains the definitions of each configuration profile. This ConfigMap is required to be present for the Device Config Manager to function properly. Once profiles have been defined, specific node labels can be put on the nodes in the cluster to specify which profile should be applied. DCM monitors for any changes in the ConfigMap or changes to the profile node label and applies the correct configuration accordingly. This ConfigMap approach helps to simplify the rollout of different config profiles across all the nodes in the cluster.

## How the GPU Operator uses this ConfigMap

When **Device Config Manager** is enabled on a **DeviceConfig** (`spec.configManager.enable: true`), the GPU Operator deploys DCM on nodes that match the DeviceConfig selector.

**Default ConfigMap name.** If you do **not** set `spec.configManager.config` (or you leave the referenced name empty), the operator mounts a ConfigMap named **`default-dcm-config`** in the **same namespace as the DeviceConfig**. If that object is missing, the operator **creates** it and fills it with a built-in default `config.json` (partition profiles and related settings suitable for typical use). If `default-dcm-config` already exists—for example because your Helm install created it—the operator **does not** overwrite it.

**Custom ConfigMap.** If you set `spec.configManager.config` to a specific ConfigMap name, the operator mounts **that** ConfigMap only. You must create and maintain it; the operator does not populate it for you.

**Where DCM reads config.** The ConfigMap is mounted into the DCM container so that the profiles are available under **`/etc/config-manager/`**, usually as the file **`config.json`** (the `config.json` key in the ConfigMap’s `data`).

**Helm install.** The GPU Operator Helm chart can optionally install the same **`default-dcm-config`** object in the release namespace at deploy time, so the ConfigMap may already be present before you apply a DeviceConfig. Whether it comes from Helm or from the operator, behavior is the same: the default name is used when you omit a custom reference, and an existing object is preserved.

## ConfigMap

As mentioned, the `config.json` data specifies different GPU partitioning profiles that can be set on the GPU nodes in your cluster. Below is an example Device Config Manager ConfigMap. This example ConfigMap is also available in the GPU Operator repo: [_example/configManager/configmap.yaml_](https://github.com/ROCm/gpu-operator/blob/main/example/configManager/configmap.yaml)
Expand Down
2 changes: 1 addition & 1 deletion example/configManager/deviceconfigs_example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ spec:
enable: True

# image for the device-config-manager container
image: "rocm/device-config-manager:v1.4.0"
image: "rocm/device-config-manager:v1.4.1"

# image pull policy for config manager set to always to pull image of latest version
imagePullPolicy: Always
Expand Down
71 changes: 71 additions & 0 deletions hack/k8s-patch/metadata-patch/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,77 @@ kmmModuleLoader:
serviceAccount:
annotations: {}
kubernetesClusterDomain: cluster.local
# Default ConfigMap for Device Config Manager when DeviceConfig.spec.configManager.config is omitted.
# The name must match internal/configmanager.DefaultDCMConfigMapName ("default-dcm-config").
# Override .data to supply partition profiles; see docs/dcm/device-config-manager-configmap.md.
defaultDCMConfigMap:
# -- Install the default ConfigMap in the operator release namespace (recommended when using DCM without an explicit config reference).
install: true
# -- ConfigMap metadata.name; must stay aligned with the operator default mount name.
name: default-dcm-config
# -- Keys become ConfigMap data keys (typically config.json with DCM JSON content).
data:
config.json: |
{
"gpu-config-profiles": {
"default": {
"profiles": [
{
"computePartition": "SPX",
"memoryPartition": "NPS1"
}
]
},
"cpx_nps1_all": {
"profiles": [
{
"computePartition": "CPX",
"memoryPartition": "NPS1"
}
]
},
"cpx_nps4_all": {
"profiles": [
{
"computePartition": "CPX",
"memoryPartition": "NPS4"
}
]
},
"dpx_nps2_all": {
"profiles": [
{
"computePartition": "DPX",
"memoryPartition": "NPS2"
}
]
},
"qpx_nps1_all": {
"profiles": [
{
"computePartition": "QPX",
"memoryPartition": "NPS1"
}
]
},
"heterogeneous_example": {
"profiles": [
{
"computePartition": "CPX",
"memoryPartition": "NPS1",
"numGPUsAssigned": 2
},
{
"computePartition": "SPX",
"memoryPartition": "NPS1"
}
]
}
},
"gpuClientSystemdServices": {
"names": ["amd-metrics-exporter", "gpuagent"]
}
}
managerConfig:
controllerManagerConfigYaml: |-
healthProbeBindAddress: :8081
Expand Down
2 changes: 1 addition & 1 deletion helm-charts-k8s/Chart.lock
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ dependencies:
repository: file://./charts/remediation-crds
version: v1.0.0
digest: sha256:0806f6b6d7aa21be77bf1c91e720ae3238338a16f107df450a53b02ef940db1b
generated: "2026-04-02T12:26:25.920315689Z"
generated: "2026-04-06T08:31:27.592910404Z"
5 changes: 4 additions & 1 deletion helm-charts-k8s/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ The AMD GPU Operator is licensed under the [Apache License 2.0](LICENSE).

## gpu-operator-charts

![Version: v0.0.1](https://img.shields.io/badge/Version-v0.0.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: dev](https://img.shields.io/badge/AppVersion-dev-informational?style=flat-square)
![Version: v0.0.1](https://img.shields.io/badge/Version-v0.0.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: dev](https://img.shields.io/badge/AppVersion-test--img-informational?style=flat-square)

AMD GPU Operator simplifies the deployment and management of AMD Instinct GPU accelerators within Kubernetes clusters.

Expand Down Expand Up @@ -164,6 +164,9 @@ Kubernetes: `>= 1.29.0-0`
| controllerManager.nodeSelector | object | `{}` | Node selector for AMD GPU operator controller manager deployment |
| crds.defaultCR.install | bool | `true` | Deploy default DeviceConfig during helm chart installation |
| crds.defaultCR.upgrade | bool | `false` | Deploy / Patch default DeviceConfig during helm chart upgrade. Be careful about this option: 1. Your customized change on default DeviceConfig may be overwritten 2. Your existing DeviceConfig may conflict with upgraded default DeviceConfig |
| defaultDCMConfigMap.data | object | `{"config.json":"{\n \"gpu-config-profiles\": {\n \"default\": {\n \"profiles\": [\n {\n \"computePartition\": \"SPX\",\n \"memoryPartition\": \"NPS1\"\n }\n ]\n },\n \"cpx_nps1_all\": {\n \"profiles\": [\n {\n \"computePartition\": \"CPX\",\n \"memoryPartition\": \"NPS1\"\n }\n ]\n },\n \"cpx_nps4_all\": {\n \"profiles\": [\n {\n \"computePartition\": \"CPX\",\n \"memoryPartition\": \"NPS4\"\n }\n ]\n },\n \"dpx_nps2_all\": {\n \"profiles\": [\n {\n \"computePartition\": \"DPX\",\n \"memoryPartition\": \"NPS2\"\n }\n ]\n },\n \"qpx_nps1_all\": {\n \"profiles\": [\n {\n \"computePartition\": \"QPX\",\n \"memoryPartition\": \"NPS1\"\n }\n ]\n },\n \"heterogeneous_example\": {\n \"profiles\": [\n {\n \"computePartition\": \"CPX\",\n \"memoryPartition\": \"NPS1\",\n \"numGPUsAssigned\": 2\n },\n {\n \"computePartition\": \"SPX\",\n \"memoryPartition\": \"NPS1\"\n }\n ]\n }\n },\n \"gpuClientSystemdServices\": {\n \"names\": [\"amd-metrics-exporter\", \"gpuagent\"]\n }\n}\n"}` | Keys become ConfigMap data keys (typically config.json with DCM JSON content). |
| defaultDCMConfigMap.install | bool | `true` | Install the default ConfigMap in the operator release namespace (recommended when using DCM without an explicit config reference). |
| defaultDCMConfigMap.name | string | `"default-dcm-config"` | ConfigMap metadata.name; must stay aligned with the operator default mount name. |
| deviceConfig.spec.commonConfig.initContainerImage | string | `"busybox:1.36"` | init container image |
| deviceConfig.spec.commonConfig.utilsContainer.image | string | `"docker.io/rocm/amd-gpu-operator-utils:latest"` | gpu operator utility container image |
| deviceConfig.spec.commonConfig.utilsContainer.imagePullPolicy | string | `"IfNotPresent"` | utility container image pull policy |
Expand Down
6 changes: 4 additions & 2 deletions helm-charts-k8s/crds/deviceconfig-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,10 @@ spec:
description: config manager
properties:
config:
description: config map to customize the config for config manager,
if not specified default config will be applied
description: |-
ConfigMap holding DCM config.json. When set, the operator mounts this ConfigMap and does not create it.
When omitted or name is empty, the operator mounts ConfigMap "default-dcm-config" and creates it in the
DeviceConfig namespace if it does not already exist (same default payload as chart defaultDCMConfigMap).
properties:
name:
default: ""
Expand Down
71 changes: 71 additions & 0 deletions helm-charts-k8s/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,77 @@ kmmModuleLoader:
serviceAccount:
annotations: {}
kubernetesClusterDomain: cluster.local
# Default ConfigMap for Device Config Manager when DeviceConfig.spec.configManager.config is omitted.
# The name must match internal/configmanager.DefaultDCMConfigMapName ("default-dcm-config").
# Override .data to supply partition profiles; see docs/dcm/device-config-manager-configmap.md.
defaultDCMConfigMap:
# -- Install the default ConfigMap in the operator release namespace (recommended when using DCM without an explicit config reference).
install: true
# -- ConfigMap metadata.name; must stay aligned with the operator default mount name.
name: default-dcm-config
# -- Keys become ConfigMap data keys (typically config.json with DCM JSON content).
data:
config.json: |
{
"gpu-config-profiles": {
"default": {
"profiles": [
{
"computePartition": "SPX",
"memoryPartition": "NPS1"
}
]
},
"cpx_nps1_all": {
"profiles": [
{
"computePartition": "CPX",
"memoryPartition": "NPS1"
}
]
},
"cpx_nps4_all": {
"profiles": [
{
"computePartition": "CPX",
"memoryPartition": "NPS4"
}
]
},
"dpx_nps2_all": {
"profiles": [
{
"computePartition": "DPX",
"memoryPartition": "NPS2"
}
]
},
"qpx_nps1_all": {
"profiles": [
{
"computePartition": "QPX",
"memoryPartition": "NPS1"
}
]
},
"heterogeneous_example": {
"profiles": [
{
"computePartition": "CPX",
"memoryPartition": "NPS1",
"numGPUsAssigned": 2
},
{
"computePartition": "SPX",
"memoryPartition": "NPS1"
}
]
}
},
"gpuClientSystemdServices": {
"names": ["amd-metrics-exporter", "gpuagent"]
}
}
managerConfig:
controllerManagerConfigYaml: |-
healthProbeBindAddress: :8081
Expand Down
Loading