diff --git a/Makefile b/Makefile index 90f440d9a..9b8dcc1dd 100644 --- a/Makefile +++ b/Makefile @@ -411,7 +411,8 @@ helm-k8s: helmify manifests kustomize clean-helm gen-kmm-charts gen-remediation- $(MAKE) helm-docs echo "dependency update, lint and pack charts" cd $(shell pwd)/helm-charts-k8s; helm dependency update; helm lint .; cd ..; helm package helm-charts-k8s/ --destination ./helm-charts-k8s - mv $(shell pwd)/helm-charts-k8s/gpu-operator-charts-$(HELM_CHART_VERSION).tgz $(GPU_OPERATOR_CHART) + # Avoid $(GPU_OPERATOR_CHART) here: when exported as the chart dir, mv would be a no-op/same-file error. + mv $(shell pwd)/helm-charts-k8s/gpu-operator-charts-$(HELM_CHART_VERSION).tgz $(shell pwd)/helm-charts-k8s/$(HELM_OUTPUT_FILE_NAME) .PHONY: bundle-build bundle-build: operator-sdk manifests kustomize ## OpenShift Build OLM bundle. @@ -602,6 +603,10 @@ helmify: helm-install: ## Deploy Helm Charts. helm install -f helm-charts-k8s/values.yaml amd-gpu-operator ${GPU_OPERATOR_CHART} -n kube-amd-gpu --create-namespace ${SKIP_NFD_CMD} ${SKIP_KMM_CMD} ${SKIP_REMEDIATION_CONTROLLER_CMD} ${HELM_OC_CMD} ${SIM_ENABLE_CMD} ${SKIP_INSTALL_DEFAULT_CR_CMD} +.PHONY: helm-upgrade-install +helm-upgrade-install: ## Same as helm-install but safe when release amd-gpu-operator already exists. + helm upgrade --install -f helm-charts-k8s/values.yaml amd-gpu-operator ${GPU_OPERATOR_CHART} -n kube-amd-gpu --create-namespace ${SKIP_NFD_CMD} ${SKIP_KMM_CMD} ${SKIP_REMEDIATION_CONTROLLER_CMD} ${HELM_OC_CMD} ${SIM_ENABLE_CMD} ${SKIP_INSTALL_DEFAULT_CR_CMD} + .PHONY: helm-uninstall helm-uninstall-k8s: ## Undeploy Helm Charts. echo "Deleting all device configs before uninstalling operator..." 
diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go index 72c4115e7..69ae60079 100644 --- a/api/v1alpha1/deviceconfig_types.go +++ b/api/v1alpha1/deviceconfig_types.go @@ -602,7 +602,9 @@ type ConfigManagerSpec struct { // +optional ImageRegistrySecret *v1.LocalObjectReference `json:"imageRegistrySecret,omitempty"` - // config map to customize the config for config manager, if not specified default config will be applied + // ConfigMap holding DCM config.json. When set, the operator mounts this ConfigMap and does not create it. + // When omitted or name is empty, the operator mounts ConfigMap "default-dcm-config" and creates it in the + // DeviceConfig namespace if it does not already exist (same default payload as chart defaultDCMConfigMap). //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Config",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:configmap"} // +optional Config *v1.LocalObjectReference `json:"config,omitempty"` diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index a627341a4..e6cf37ecf 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -35,8 +35,8 @@ metadata: ] capabilities: Seamless Upgrades categories: AI/Machine Learning,Monitoring - containerImage: docker.io/rocm/amd-gpu-operator:dev - createdAt: "2026-04-02T12:26:30Z" + containerImage: registry.test.pensando.io:5000/amd-gpu-operator:dev + createdAt: "2026-04-06T08:31:30Z" description: |- Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) @@ -127,8 +127,10 @@ spec: path: configManager x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:configManager - - description: config map 
to customize the config for config manager, if not - specified default config will be applied + - description: ConfigMap holding DCM config.json. When set, the operator mounts + this ConfigMap and does not create it. When omitted or name is empty, the + operator mounts ConfigMap "default-dcm-config" and creates it in the DeviceConfig + namespace if it does not already exist (same default payload as chart defaultDCMConfigMap). displayName: Config path: configManager.config x-descriptors: diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml index 2737a0a3d..72d0786e9 100644 --- a/bundle/manifests/amd.com_deviceconfigs.yaml +++ b/bundle/manifests/amd.com_deviceconfigs.yaml @@ -111,8 +111,10 @@ spec: description: config manager properties: config: - description: config map to customize the config for config manager, - if not specified default config will be applied + description: |- + ConfigMap holding DCM config.json. When set, the operator mounts this ConfigMap and does not create it. + When omitted or name is empty, the operator mounts ConfigMap "default-dcm-config" and creates it in the + DeviceConfig namespace if it does not already exist (same default payload as chart defaultDCMConfigMap). properties: name: default: "" diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml index 9c862da92..13f0f16db 100644 --- a/config/crd/bases/amd.com_deviceconfigs.yaml +++ b/config/crd/bases/amd.com_deviceconfigs.yaml @@ -107,8 +107,10 @@ spec: description: config manager properties: config: - description: config map to customize the config for config manager, - if not specified default config will be applied + description: |- + ConfigMap holding DCM config.json. When set, the operator mounts this ConfigMap and does not create it. 
+ When omitted or name is empty, the operator mounts ConfigMap "default-dcm-config" and creates it in the + DeviceConfig namespace if it does not already exist (same default payload as chart defaultDCMConfigMap). properties: name: default: "" diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml index 366246b9e..cb7388616 100644 --- a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml @@ -94,8 +94,10 @@ spec: path: configManager x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:configManager - - description: config map to customize the config for config manager, if not - specified default config will be applied + - description: ConfigMap holding DCM config.json. When set, the operator mounts + this ConfigMap and does not create it. When omitted or name is empty, the + operator mounts ConfigMap "default-dcm-config" and creates it in the DeviceConfig + namespace if it does not already exist (same default payload as chart defaultDCMConfigMap). displayName: Config path: configManager.config x-descriptors: diff --git a/docs/dcm/device-config-manager-configmap.md b/docs/dcm/device-config-manager-configmap.md index 55013f66a..b8a9fab38 100644 --- a/docs/dcm/device-config-manager-configmap.md +++ b/docs/dcm/device-config-manager-configmap.md @@ -2,6 +2,18 @@ The Device Config Manager (DCM) job is to monitor for and apply different configurations on nodes in your cluster. This is done by defining different profiles that can then be applied to each node on your cluster. As such, DCM relies on a Kubernetes ConfigMap that contains the definitions of each configuration profile. This ConfigMap is required to be present for the Device Config Manager to function properly. 
Once profiles have been defined, specific node labels can be put on the nodes in the cluster to specify which profile should be applied. DCM monitors for any changes in the ConfigMap or changes to the profile node label and applies the correct configuration accordingly. This ConfigMap approach helps to simplify the rollout of different config profiles across all the nodes in the cluster. +## How the GPU Operator uses this ConfigMap + +When **Device Config Manager** is enabled on a **DeviceConfig** (`spec.configManager.enable: true`), the GPU Operator deploys DCM on nodes that match the DeviceConfig selector. + +**Default ConfigMap name.** If you do **not** set `spec.configManager.config` (or you leave the referenced name empty), the operator mounts a ConfigMap named **`default-dcm-config`** in the **same namespace as the DeviceConfig**. If that object is missing, the operator **creates** it and fills it with a built-in default `config.json` (partition profiles and related settings suitable for typical use). If `default-dcm-config` already exists—for example because your Helm install created it—the operator **does not** overwrite it. + +**Custom ConfigMap.** If you set `spec.configManager.config` to a specific ConfigMap name, the operator mounts **that** ConfigMap only. You must create and maintain it; the operator does not populate it for you. + +**Where DCM reads config.** The ConfigMap is mounted into the DCM container so profiles are available under **`/etc/config-manager/`**, usually as **`config.json`** in the ConfigMap’s `data`. + +**Helm install.** The GPU Operator Helm chart can optionally install the same **`default-dcm-config`** object in the release namespace at deploy time, so the ConfigMap may already be present before you apply a DeviceConfig. Whether it comes from Helm or from the operator, behavior is the same: the default name is used when you omit a custom reference, and an existing object is preserved. 
+ ## ConfigMap As mentioned, the `config.json` data specifies different GPU partitioning profiles that can be set on the GPU nodes in your cluster. Below is an example Device Config Manager ConfigMap. This example ConfigMap is also available in the GPU Operator repo here: [_example/configmap.yaml_](https://github.com/ROCm/gpu-operator/blob/main/example/configManager/configmap.yaml) diff --git a/example/configManager/deviceconfigs_example.yaml b/example/configManager/deviceconfigs_example.yaml index dc0679e20..9b44b59fb 100644 --- a/example/configManager/deviceconfigs_example.yaml +++ b/example/configManager/deviceconfigs_example.yaml @@ -12,7 +12,7 @@ spec: enable: True # image for the device-config-manager container - image: "rocm/device-config-manager:v1.4.0" + image: "rocm/device-config-manager:v1.4.1" # image pull policy for config manager set to always to pull image of latest version imagePullPolicy: Always diff --git a/hack/k8s-patch/metadata-patch/values.yaml b/hack/k8s-patch/metadata-patch/values.yaml index 195901c86..64f8a5502 100644 --- a/hack/k8s-patch/metadata-patch/values.yaml +++ b/hack/k8s-patch/metadata-patch/values.yaml @@ -361,6 +361,77 @@ kmmModuleLoader: serviceAccount: annotations: {} kubernetesClusterDomain: cluster.local +# Default ConfigMap for Device Config Manager when DeviceConfig.spec.configManager.config is omitted. +# The name must match internal/configmanager.DefaultDCMConfigMapName ("default-dcm-config"). +# Override .data to supply partition profiles; see docs/dcm/device-config-manager-configmap.md. +defaultDCMConfigMap: + # -- Install the default ConfigMap in the operator release namespace (recommended when using DCM without an explicit config reference). + install: true + # -- ConfigMap metadata.name; must stay aligned with the operator default mount name. + name: default-dcm-config + # -- Keys become ConfigMap data keys (typically config.json with DCM JSON content). 
+ data: + config.json: | + { + "gpu-config-profiles": { + "default": { + "profiles": [ + { + "computePartition": "SPX", + "memoryPartition": "NPS1" + } + ] + }, + "cpx_nps1_all": { + "profiles": [ + { + "computePartition": "CPX", + "memoryPartition": "NPS1" + } + ] + }, + "cpx_nps4_all": { + "profiles": [ + { + "computePartition": "CPX", + "memoryPartition": "NPS4" + } + ] + }, + "dpx_nps2_all": { + "profiles": [ + { + "computePartition": "DPX", + "memoryPartition": "NPS2" + } + ] + }, + "qpx_nps1_all": { + "profiles": [ + { + "computePartition": "QPX", + "memoryPartition": "NPS1" + } + ] + }, + "heterogeneous_example": { + "profiles": [ + { + "computePartition": "CPX", + "memoryPartition": "NPS1", + "numGPUsAssigned": 2 + }, + { + "computePartition": "SPX", + "memoryPartition": "NPS1" + } + ] + } + }, + "gpuClientSystemdServices": { + "names": ["amd-metrics-exporter", "gpuagent"] + } + } managerConfig: controllerManagerConfigYaml: |- healthProbeBindAddress: :8081 diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock index 9fc17b6e4..4a370e68b 100644 --- a/helm-charts-k8s/Chart.lock +++ b/helm-charts-k8s/Chart.lock @@ -9,4 +9,4 @@ dependencies: repository: file://./charts/remediation-crds version: v1.0.0 digest: sha256:0806f6b6d7aa21be77bf1c91e720ae3238338a16f107df450a53b02ef940db1b -generated: "2026-04-02T12:26:25.920315689Z" +generated: "2026-04-06T08:31:27.592910404Z" diff --git a/helm-charts-k8s/README.md b/helm-charts-k8s/README.md index 09885c1b9..045f7927a 100644 --- a/helm-charts-k8s/README.md +++ b/helm-charts-k8s/README.md @@ -125,7 +125,7 @@ The AMD GPU Operator is licensed under the [Apache License 2.0](LICENSE). 
## gpu-operator-charts -![Version: v0.0.1](https://img.shields.io/badge/Version-v0.0.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: dev](https://img.shields.io/badge/AppVersion-dev-informational?style=flat-square) +![Version: v0.0.1](https://img.shields.io/badge/Version-v0.0.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: dev](https://img.shields.io/badge/AppVersion-test--img-informational?style=flat-square) AMD GPU Operator simplifies the deployment and management of AMD Instinct GPU accelerators within Kubernetes clusters. @@ -164,6 +164,9 @@ Kubernetes: `>= 1.29.0-0` | controllerManager.nodeSelector | object | `{}` | Node selector for AMD GPU operator controller manager deployment | | crds.defaultCR.install | bool | `true` | Deploy default DeviceConfig during helm chart installation | | crds.defaultCR.upgrade | bool | `false` | Deploy / Patch default DeviceConfig during helm chart upgrade. Be careful about this option: 1. Your customized change on default DeviceConfig may be overwritten 2. 
Your existing DeviceConfig may conflict with upgraded default DeviceConfig | +| defaultDCMConfigMap.data | object | `{"config.json":"{\n \"gpu-config-profiles\": {\n \"default\": {\n \"profiles\": [\n {\n \"computePartition\": \"SPX\",\n \"memoryPartition\": \"NPS1\"\n }\n ]\n },\n \"cpx_nps1_all\": {\n \"profiles\": [\n {\n \"computePartition\": \"CPX\",\n \"memoryPartition\": \"NPS1\"\n }\n ]\n },\n \"cpx_nps4_all\": {\n \"profiles\": [\n {\n \"computePartition\": \"CPX\",\n \"memoryPartition\": \"NPS4\"\n }\n ]\n },\n \"dpx_nps2_all\": {\n \"profiles\": [\n {\n \"computePartition\": \"DPX\",\n \"memoryPartition\": \"NPS2\"\n }\n ]\n },\n \"qpx_nps1_all\": {\n \"profiles\": [\n {\n \"computePartition\": \"QPX\",\n \"memoryPartition\": \"NPS1\"\n }\n ]\n },\n \"heterogeneous_example\": {\n \"profiles\": [\n {\n \"computePartition\": \"CPX\",\n \"memoryPartition\": \"NPS1\",\n \"numGPUsAssigned\": 2\n },\n {\n \"computePartition\": \"SPX\",\n \"memoryPartition\": \"NPS1\"\n }\n ]\n }\n },\n \"gpuClientSystemdServices\": {\n \"names\": [\"amd-metrics-exporter\", \"gpuagent\"]\n }\n}\n"}` | Keys become ConfigMap data keys (typically config.json with DCM JSON content). | +| defaultDCMConfigMap.install | bool | `true` | Install the default ConfigMap in the operator release namespace (recommended when using DCM without an explicit config reference). | +| defaultDCMConfigMap.name | string | `"default-dcm-config"` | ConfigMap metadata.name; must stay aligned with the operator default mount name. 
| | deviceConfig.spec.commonConfig.initContainerImage | string | `"busybox:1.36"` | init container image | | deviceConfig.spec.commonConfig.utilsContainer.image | string | `"docker.io/rocm/amd-gpu-operator-utils:latest"` | gpu operator utility container image | | deviceConfig.spec.commonConfig.utilsContainer.imagePullPolicy | string | `"IfNotPresent"` | utility container image pull policy | diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml index 0602ed862..61c77e555 100644 --- a/helm-charts-k8s/crds/deviceconfig-crd.yaml +++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml @@ -116,8 +116,10 @@ spec: description: config manager properties: config: - description: config map to customize the config for config manager, - if not specified default config will be applied + description: |- + ConfigMap holding DCM config.json. When set, the operator mounts this ConfigMap and does not create it. + When omitted or name is empty, the operator mounts ConfigMap "default-dcm-config" and creates it in the + DeviceConfig namespace if it does not already exist (same default payload as chart defaultDCMConfigMap). properties: name: default: "" diff --git a/helm-charts-k8s/values.yaml b/helm-charts-k8s/values.yaml index 195901c86..64f8a5502 100644 --- a/helm-charts-k8s/values.yaml +++ b/helm-charts-k8s/values.yaml @@ -361,6 +361,77 @@ kmmModuleLoader: serviceAccount: annotations: {} kubernetesClusterDomain: cluster.local +# Default ConfigMap for Device Config Manager when DeviceConfig.spec.configManager.config is omitted. +# The name must match internal/configmanager.DefaultDCMConfigMapName ("default-dcm-config"). +# Override .data to supply partition profiles; see docs/dcm/device-config-manager-configmap.md. +defaultDCMConfigMap: + # -- Install the default ConfigMap in the operator release namespace (recommended when using DCM without an explicit config reference). 
+ install: true + # -- ConfigMap metadata.name; must stay aligned with the operator default mount name. + name: default-dcm-config + # -- Keys become ConfigMap data keys (typically config.json with DCM JSON content). + data: + config.json: | + { + "gpu-config-profiles": { + "default": { + "profiles": [ + { + "computePartition": "SPX", + "memoryPartition": "NPS1" + } + ] + }, + "cpx_nps1_all": { + "profiles": [ + { + "computePartition": "CPX", + "memoryPartition": "NPS1" + } + ] + }, + "cpx_nps4_all": { + "profiles": [ + { + "computePartition": "CPX", + "memoryPartition": "NPS4" + } + ] + }, + "dpx_nps2_all": { + "profiles": [ + { + "computePartition": "DPX", + "memoryPartition": "NPS2" + } + ] + }, + "qpx_nps1_all": { + "profiles": [ + { + "computePartition": "QPX", + "memoryPartition": "NPS1" + } + ] + }, + "heterogeneous_example": { + "profiles": [ + { + "computePartition": "CPX", + "memoryPartition": "NPS1", + "numGPUsAssigned": 2 + }, + { + "computePartition": "SPX", + "memoryPartition": "NPS1" + } + ] + } + }, + "gpuClientSystemdServices": { + "names": ["amd-metrics-exporter", "gpuagent"] + } + } managerConfig: controllerManagerConfigYaml: |- healthProbeBindAddress: :8081 diff --git a/internal/configmanager/configmanager.go b/internal/configmanager/configmanager.go index 8b9b3ef6e..011b6bd41 100644 --- a/internal/configmanager/configmanager.go +++ b/internal/configmanager/configmanager.go @@ -33,17 +33,24 @@ limitations under the License. 
package configmanager import ( + "context" + _ "embed" "fmt" "os" + "strings" "github.com/rh-ecosystem-edge/kernel-module-management/pkg/labels" appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" utils "github.com/ROCm/gpu-operator/internal" @@ -55,9 +62,26 @@ const ( ConfigManagerName = "device-config-manager" defaultSAName = "amd-gpu-operator-config-manager" defaultInitContainerImage = "busybox:1.36" + // DefaultDCMConfigMapName is the ConfigMap metadata.name mounted when spec.configManager.config + // is unset or has an empty name (see api/v1alpha1 ConfigManagerSpec.Config godoc). + // The DeviceConfig reconciler creates this ConfigMap in the DeviceConfig namespace when DCM is + // enabled and no explicit config reference is set (see EnsureDefaultDCMConfigMap). + // The Helm chart also installs the same object by default (values: defaultDCMConfigMap); if it + // already exists, the reconciler leaves it unchanged. + DefaultDCMConfigMapName = "default-dcm-config" + // ConfigManagerConfigVolumeName is the Pod volume and VolumeMount name for the DCM ConfigMap. + ConfigManagerConfigVolumeName = "config-manager-config-volume" + // DefaultDCMConfigMountPath is where the DCM container expects ConfigMap data (e.g. config.json). + DefaultDCMConfigMountPath = "/etc/config-manager/" ) -var configManagerLabelPair = []string{"app.kubernetes.io/name", ConfigManagerName} +var ( + // defaultDCMConfigJSON is embedded from default_dcm_config.json; keep Helm defaultDCMConfigMap values aligned. 
+ //go:embed default_dcm_config.json + defaultDCMConfigJSON []byte + + configManagerLabelPair = []string{"app.kubernetes.io/name", ConfigManagerName} +) //go:generate mockgen -source=configmanager.go -package=configmanager -destination=mock_configmanager.go ConfigManager type ConfigManager interface { @@ -74,6 +98,54 @@ func NewConfigManager(scheme *runtime.Scheme) ConfigManager { } } +// EnsureDefaultDCMConfigMap creates ConfigMap DefaultDCMConfigMapName in devConfig.Namespace when DCM is enabled +// and spec.configManager.config is unset or has an empty name. If the ConfigMap already exists, it is unchanged. +func EnsureDefaultDCMConfigMap(ctx context.Context, c client.Client, devConfig *amdv1alpha1.DeviceConfig) error { + if devConfig == nil { + return nil + } + tr := devConfig.Spec.ConfigManager + if tr.Enable == nil || !*tr.Enable { + return nil + } + if tr.Config != nil && strings.TrimSpace(tr.Config.Name) != "" { + return nil + } + + payload := strings.TrimSpace(string(defaultDCMConfigJSON)) + if payload == "" { + return fmt.Errorf("embedded default_dcm_config.json is empty") + } + + logger := log.FromContext(ctx) + nn := types.NamespacedName{Namespace: devConfig.Namespace, Name: DefaultDCMConfigMapName} + var cm v1.ConfigMap + switch err := c.Get(ctx, nn, &cm); { + case err == nil: + return nil + case !k8serrors.IsNotFound(err): + return fmt.Errorf("get ConfigMap %s: %w", nn.String(), err) + } + + newCM := &v1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: DefaultDCMConfigMapName, + Namespace: devConfig.Namespace, + Labels: map[string]string{ + "app.kubernetes.io/name": "gpu-operator", + "app.kubernetes.io/component": "device-config-manager", + "app.kubernetes.io/managed-by": "gpu-operator", + }, + }, + Data: map[string]string{"config.json": payload}, + } + if err := c.Create(ctx, newCM); err != nil { + return fmt.Errorf("create ConfigMap %s: %w", nn.String(), err) + } + logger.Info("created default DCM ConfigMap", "namespace", devConfig.Namespace, 
"name", DefaultDCMConfigMapName) + return nil +} + func (nl *configManager) SetConfigManagerAsDesired(ds *appsv1.DaemonSet, devConfig *amdv1alpha1.DeviceConfig) error { if ds == nil { return fmt.Errorf("daemon set is not initialized, zero pointer") @@ -196,21 +268,22 @@ func (nl *configManager) SetConfigManagerAsDesired(ds *appsv1.DaemonSet, devConf }, } - if trSpec.Config != nil { - volumes = append(volumes, v1.Volume{ - Name: "config-manager-config-volume", - VolumeSource: v1.VolumeSource{ - ConfigMap: &v1.ConfigMapVolumeSource{ - LocalObjectReference: *trSpec.Config, - }, - }, - }) - - containerVolumeMounts = append(containerVolumeMounts, v1.VolumeMount{ - Name: "config-manager-config-volume", - MountPath: "/etc/config-manager/", - }) + configRef := v1.LocalObjectReference{Name: DefaultDCMConfigMapName} + if trSpec.Config != nil && trSpec.Config.Name != "" { + configRef.Name = trSpec.Config.Name } + volumes = append(volumes, v1.Volume{ + Name: ConfigManagerConfigVolumeName, + VolumeSource: v1.VolumeSource{ + ConfigMap: &v1.ConfigMapVolumeSource{ + LocalObjectReference: configRef, + }, + }, + }) + containerVolumeMounts = append(containerVolumeMounts, v1.VolumeMount{ + Name: ConfigManagerConfigVolumeName, + MountPath: DefaultDCMConfigMountPath, + }) matchLabels := map[string]string{ "daemonset-name": devConfig.Name, diff --git a/internal/configmanager/default_dcm_config.json b/internal/configmanager/default_dcm_config.json new file mode 100644 index 000000000..c0dbc8948 --- /dev/null +++ b/internal/configmanager/default_dcm_config.json @@ -0,0 +1,60 @@ +{ + "gpu-config-profiles": { + "default": { + "profiles": [ + { + "computePartition": "SPX", + "memoryPartition": "NPS1" + } + ] + }, + "cpx_nps1_all": { + "profiles": [ + { + "computePartition": "CPX", + "memoryPartition": "NPS1" + } + ] + }, + "cpx_nps4_all": { + "profiles": [ + { + "computePartition": "CPX", + "memoryPartition": "NPS4" + } + ] + }, + "dpx_nps2_all": { + "profiles": [ + { + "computePartition": 
"DPX", + "memoryPartition": "NPS2" + } + ] + }, + "qpx_nps1_all": { + "profiles": [ + { + "computePartition": "QPX", + "memoryPartition": "NPS1" + } + ] + }, + "heterogeneous_example": { + "profiles": [ + { + "computePartition": "CPX", + "memoryPartition": "NPS1", + "numGPUsAssigned": 2 + }, + { + "computePartition": "SPX", + "memoryPartition": "NPS1" + } + ] + } + }, + "gpuClientSystemdServices": { + "names": ["amd-metrics-exporter", "gpuagent"] + } +} diff --git a/internal/controllers/device_config_reconciler.go b/internal/controllers/device_config_reconciler.go index d72c58354..f48a851f1 100644 --- a/internal/controllers/device_config_reconciler.go +++ b/internal/controllers/device_config_reconciler.go @@ -1457,6 +1457,10 @@ func (dcrh *deviceConfigReconcilerHelper) handleConfigManager(ctx context.Contex return dcrh.finalizeConfigManager(ctx, devConfig) } + if err := configmanager.EnsureDefaultDCMConfigMap(ctx, dcrh.client, devConfig); err != nil { + return fmt.Errorf("ensure default DCM ConfigMap: %w", err) + } + opRes, err := controllerutil.CreateOrPatch(ctx, dcrh.client, ds, func() error { return dcrh.configmanagerHandler.SetConfigManagerAsDesired(ds, devConfig) }) diff --git a/tests/e2e/cluster_test.go b/tests/e2e/cluster_test.go index 6317bc7d6..0a6116a26 100644 --- a/tests/e2e/cluster_test.go +++ b/tests/e2e/cluster_test.go @@ -353,6 +353,64 @@ func (s *E2ESuite) checkDeviceConfigManagerStatus(devCfg *v1alpha1.DeviceConfig, }, 5*time.Minute, 5*time.Second) } +func podSpecHasDCMConfigVolumeMount(spec *v1.PodSpec) bool { + check := func(mounts []v1.VolumeMount) bool { + for _, m := range mounts { + if m.Name == configmanager.ConfigManagerConfigVolumeName && m.MountPath == configmanager.DefaultDCMConfigMountPath { + return true + } + } + return false + } + for i := range spec.Containers { + if check(spec.Containers[i].VolumeMounts) { + return true + } + } + for i := range spec.InitContainers { + if check(spec.InitContainers[i].VolumeMounts) { + return true + 
} + } + return false +} + +// verifyDCMConfigMapVolumeRef asserts the DCM DaemonSet has a ConfigMap volume +// (configmanager.ConfigManagerConfigVolumeName) pointing at expectedConfigMapName, and that some +// workload or init container mounts that volume at configmanager.DefaultDCMConfigMountPath. +func (s *E2ESuite) verifyDCMConfigMapVolumeRef(devCfg *v1alpha1.DeviceConfig, ns string, expectedConfigMapName string, c *C) { + dsName := devCfg.Name + "-" + configmanager.ConfigManagerName + assert.Eventually(c, func() bool { + ds, err := s.clientSet.AppsV1().DaemonSets(ns).Get(context.TODO(), dsName, metav1.GetOptions{}) + if err != nil { + logger.Errorf("verifyDCMConfigMapVolumeRef: get DS %s: %v", dsName, err) + return false + } + spec := &ds.Spec.Template.Spec + var volOK bool + for _, vol := range spec.Volumes { + if vol.Name != configmanager.ConfigManagerConfigVolumeName || vol.ConfigMap == nil { + continue + } + volOK = true + if vol.ConfigMap.Name != expectedConfigMapName { + logger.Errorf("verifyDCMConfigMapVolumeRef: want ConfigMap %q, got %q", expectedConfigMapName, vol.ConfigMap.Name) + return false + } + break + } + if !volOK { + logger.Errorf("verifyDCMConfigMapVolumeRef: volume %q not found or not a ConfigMap", configmanager.ConfigManagerConfigVolumeName) + return false + } + if !podSpecHasDCMConfigVolumeMount(spec) { + logger.Errorf("verifyDCMConfigMapVolumeRef: no container VolumeMount for volume %q at %q", configmanager.ConfigManagerConfigVolumeName, configmanager.DefaultDCMConfigMountPath) + return false + } + return true + }, 2*time.Minute, 3*time.Second) +} + func (s *E2ESuite) checkDRADriverStatus(devCfg *v1alpha1.DeviceConfig, ns string, c *C) { dsName := utils.DRADriverName(devCfg.Name) assert.Eventually(c, func() bool { @@ -1397,9 +1455,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) { } func (s *E2ESuite) TestWorkloadRequestedGPUsHomogeneousSingle(c *C) { - if s.simEnable { - skipTest(c, "Skipping for non amd gpu testbed") - } + 
s.skipDCMTestIfSIMRequiresGPU(c) if !dcmImageDefined { skipTest(c, "skip DCM test because E2E_DCM_IMAGE is not defined") } @@ -1477,9 +1533,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUsHomogeneousSingle(c *C) { } func (s *E2ESuite) TestWorkloadRequestedGPUsHomogeneousMixed(c *C) { - if s.simEnable { - skipTest(c, "Skipping for non amd gpu testbed") - } + s.skipDCMTestIfSIMRequiresGPU(c) if !dcmImageDefined { skipTest(c, "skip DCM test because E2E_DCM_IMAGE is not defined") } @@ -1556,9 +1610,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUsHomogeneousMixed(c *C) { } func (s *E2ESuite) TestWorkloadRequestedGPUsHeterogeneousMixed(c *C) { - if s.simEnable { - skipTest(c, "Skipping for non amd gpu testbed") - } + s.skipDCMTestIfSIMRequiresGPU(c) if !dcmImageDefined { skipTest(c, "skip DCM test because E2E_DCM_IMAGE is not defined") } diff --git a/tests/e2e/dcm_e2e_test.go b/tests/e2e/dcm_e2e_test.go index f388c43fe..bb634ae4d 100644 --- a/tests/e2e/dcm_e2e_test.go +++ b/tests/e2e/dcm_e2e_test.go @@ -34,6 +34,7 @@ import ( "github.com/stretchr/testify/assert" . "gopkg.in/check.v1" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/fields" ) @@ -47,6 +48,15 @@ func init() { dcmImage, dcmImageDefined = os.LookupEnv("E2E_DCM_IMAGE") } +// skipDCMTestIfSIMRequiresGPU skips tests that need a real AMD GPU (partitioning, workload, successful partition logs). +// Omit this for K8s-only checks (DaemonSet, ConfigMap volume, CR enable/disable) and for negative profile validation +// that does not require hardware—those may run under SIM (no GPU). 
+func (s *E2ESuite) skipDCMTestIfSIMRequiresGPU(c *C) {
+	if s.simEnable {
+		skipTest(c, "skip DCM test in SIM mode (no GPU)")
+	}
+}
+
 type GPUConfigProfiles struct {
 	ProfilesList map[string]*GPUConfigProfile `json:"gpu-config-profiles,omitempty"`
 }
@@ -306,18 +316,48 @@ func (s *E2ESuite) createConfigMap() GPUConfigProfiles {
 	return profileslist
 }
 
+// ensureDefaultDCMConfigMap ensures the ConfigMap the operator mounts when spec.configManager.config is unset exists
+// with valid config.json. If default-dcm-config already exists with non-empty config.json (e.g. installed by the
+// GPU Operator Helm chart defaultDCMConfigMap), it is left unchanged. Otherwise a test fixture is applied via
+// CreateConfigMap (delete-if-exists + create), matching clusters that do not pre-install the ConfigMap.
+func (s *E2ESuite) ensureDefaultDCMConfigMap(c *C) {
+	name := configmanager.DefaultDCMConfigMapName
+	cm, err := s.clientSet.CoreV1().ConfigMaps(s.ns).Get(context.TODO(), name, metav1.GetOptions{})
+	if err == nil {
+		if cfg, ok := cm.Data["config.json"]; ok && strings.TrimSpace(cfg) != "" {
+			logger.Infof("Using existing ConfigMap %s/%s with config.json (e.g. Helm defaultDCMConfigMap); not overwriting.", s.ns, name)
+			return
+		}
+		logger.Infof("ConfigMap %s/%s exists but config.json is missing or empty; applying test fixture.", s.ns, name)
+	} else if !apierrors.IsNotFound(err) {
+		assert.NoError(c, err, "get ConfigMap %s", name)
+		return
+	}
+
+	profileslist := s.createConfigMap()
+	cfgData, err := json.Marshal(profileslist)
+	assert.NoError(c, err, "failed to marshal config data for default DCM ConfigMap")
+	err = utils.CreateConfigMap(context.TODO(), s.clientSet, s.ns, name, map[string]string{
+		"config.json": string(cfgData),
+	})
+	assert.NoError(c, err, "failed to create default DCM ConfigMap %s", name)
+}
+
 func (s *E2ESuite) configMapHelper(c *C) {
 	logger.Infof("###BEGIN TESTCASE###\n")
 	// check to see existing deviceconfig DS pods
 	_, err := s.dClient.DeviceConfigs(s.ns).Get(s.cfgName, metav1.GetOptions{})
 	assert.Errorf(c, err, fmt.Sprintf("config %v exists", s.cfgName))
 
+	s.ensureDefaultDCMConfigMap(c)
+
 	// fetch the CR
 	devCfg := s.getDeviceConfigForDCM(c)
 	logger.Infof("create device-config %+v", devCfg.Spec.ConfigManager)
 	s.createDeviceConfig(devCfg, c)
 	s.checkDeviceConfigManagerStatus(devCfg, s.ns, c)
+	s.verifyDCMConfigMapVolumeRef(devCfg, s.ns, configmanager.DefaultDCMConfigMapName, c)
 	logger.Infof("SUCCESSFULLY DEPLOYED DCM DAEMONSET")
 
 	profileslist := s.createConfigMap()
@@ -348,6 +388,7 @@ func (s *E2ESuite) configMapHelper(c *C) {
 	_, err = s.dClient.DeviceConfigs(s.ns).Update(updConfig)
 	assert.NoError(c, err, "failed to update %v", updConfig.Name)
 	s.checkDeviceConfigManagerStatus(updConfig, s.ns, c)
+	s.verifyDCMConfigMapVolumeRef(updConfig, s.ns, devCfg.Name, c)
 }
 
 func (s *E2ESuite) getWorkerNode(c *C) string {
@@ -384,10 +425,32 @@ func (s *E2ESuite) eventHelper(reason string, event_type string) bool {
 	return false
 }
 
+func (s *E2ESuite) TestDCMDefaultConfigMapWhenConfigOmitted(c *C) {
+	if !dcmImageDefined {
+		skipTest(c, "skip DCM test because E2E_DCM_IMAGE is not defined")
+	}
+	// Runs under SIM: validates operator DaemonSet + default ConfigMap mount only.
+	logger.Infof("###BEGIN TestDCMDefaultConfigMapWhenConfigOmitted###\n")
+	_, err := s.dClient.DeviceConfigs(s.ns).Get(s.cfgName, metav1.GetOptions{})
+	assert.Errorf(c, err, "deviceconfig %q must not exist before test", s.cfgName)
+
+	s.ensureDefaultDCMConfigMap(c)
+	devCfg := s.getDeviceConfigForDCM(c)
+	assert.Nil(c, devCfg.Spec.ConfigManager.Config, "Config must be omitted to test default ConfigMap mount")
+	logger.Infof("create device-config (config omitted) %+v", devCfg.Spec.ConfigManager)
+	s.createDeviceConfig(devCfg, c)
+	s.checkDeviceConfigManagerStatus(devCfg, s.ns, c)
+	s.verifyDCMConfigMapVolumeRef(devCfg, s.ns, configmanager.DefaultDCMConfigMapName, c)
+	logger.Infof("DCM DaemonSet mounts default ConfigMap %q as expected", configmanager.DefaultDCMConfigMapName)
+	s.deleteDeviceConfig(devCfg, c)
+	logger.Infof("###END TestDCMDefaultConfigMapWhenConfigOmitted###\n")
+}
+
 func (s *E2ESuite) TestDCMConfigMapCreation(c *C) {
 	if !dcmImageDefined {
 		skipTest(c, "skip DCM test because E2E_DCM_IMAGE is not defined")
 	}
+	// Runs under SIM: CR/ConfigMap/DaemonSet lifecycle and volume ref (no GPU partitioning required).
 	s.configMapHelper(c)
 	if s.eventHelper("SuccessfulCreate", "Normal") {
 		logger.Infof("###DCM deployed successfully with a config map###\n")
@@ -397,9 +460,7 @@
 }
 
 func (s *E2ESuite) TestDCMConfigMapPartitionHomogenous(c *C) {
-	if s.simEnable {
-		skipTest(c, "Skipping for non amd gpu testbed")
-	}
+	s.skipDCMTestIfSIMRequiresGPU(c)
 	if !dcmImageDefined {
 		skipTest(c, "skip DCM test because E2E_DCM_IMAGE is not defined")
 	}
@@ -425,9 +486,7 @@
 }
 
 func (s *E2ESuite) TestDCMConfigMapPartitionHeterogenous(c *C) {
-	if s.simEnable {
-		skipTest(c, "Skipping for non amd gpu testbed")
-	}
+	s.skipDCMTestIfSIMRequiresGPU(c)
 	if !dcmImageDefined {
 		skipTest(c, "skip DCM test because E2E_DCM_IMAGE is not defined")
 	}
@@ -452,9 +511,7 @@
 }
 
 func (s *E2ESuite) TestDCMPartitionNPS4(c *C) {
-	if s.simEnable {
-		skipTest(c, "Skipping for non amd gpu testbed")
-	}
+	s.skipDCMTestIfSIMRequiresGPU(c)
 	if !dcmImageDefined {
 		skipTest(c, "skip DCM test because E2E_DCM_IMAGE is not defined")
 	}
@@ -479,9 +536,7 @@
 }
 
 func (s *E2ESuite) TestDCMInvalidComputeType(c *C) {
-	if s.simEnable {
-		skipTest(c, "Skipping for non amd gpu testbed")
-	}
+	// Runs under SIM: negative profile validation (no successful partitioning required).
 	if !dcmImageDefined {
 		skipTest(c, "skip DCM test because E2E_DCM_IMAGE is not defined")
 	}
@@ -506,9 +561,7 @@
 }
 
 func (s *E2ESuite) TestDCMInvalidMemoryType(c *C) {
-	if s.simEnable {
-		skipTest(c, "Skipping for non amd gpu testbed")
-	}
+	// Runs under SIM: negative profile validation (no successful partitioning required).
 	if !dcmImageDefined {
 		skipTest(c, "skip DCM test because E2E_DCM_IMAGE is not defined")
 	}
@@ -533,9 +586,7 @@
 }
 
 func (s *E2ESuite) TestDCMInvalidGPUFilter(c *C) {
-	if s.simEnable {
-		skipTest(c, "Skipping for non amd gpu testbed")
-	}
+	// Runs under SIM: negative profile validation (no successful partitioning required).
 	if !dcmImageDefined {
 		skipTest(c, "skip DCM test because E2E_DCM_IMAGE is not defined")
 	}
@@ -558,9 +609,7 @@
 }
 
 func (s *E2ESuite) TestDCMDefaultPartition(c *C) {
-	if s.simEnable {
-		skipTest(c, "Skipping for non amd gpu testbed")
-	}
+	s.skipDCMTestIfSIMRequiresGPU(c)
 	if !dcmImageDefined {
 		skipTest(c, "skip DCM test because E2E_DCM_IMAGE is not defined")
 	}
@@ -570,12 +619,15 @@
 	_, err := s.dClient.DeviceConfigs(s.ns).Get(s.cfgName, metav1.GetOptions{})
 	assert.Errorf(c, err, fmt.Sprintf("config %v exists", s.cfgName))
 
+	s.ensureDefaultDCMConfigMap(c)
+
 	// fetch the CR
 	devCfg := s.getDeviceConfigForDCM(c)
 	logger.Infof("create device-config %+v", devCfg.Spec.ConfigManager)
 	s.createDeviceConfig(devCfg, c)
 	s.checkDeviceConfigManagerStatus(devCfg, s.ns, c)
+	s.verifyDCMConfigMapVolumeRef(devCfg, s.ns, configmanager.DefaultDCMConfigMapName, c)
 	logger.Infof("SUCCESSFULLY DEPLOYED DCM DAEMONSET")
 
 	time.Sleep(30 * time.Second)
@@ -602,7 +654,10 @@
 }
 
 func (s *E2ESuite) TestConfigManagerDeploymentOnly(c *C) {
-	// Run on SIM and Non SIM Setups
+	if !dcmImageDefined {
+		skipTest(c, "skip DCM test because E2E_DCM_IMAGE is not defined")
+	}
+	// Runs under SIM: enable/disable DCM and DaemonSet + default ConfigMap mount only.
 	configManagerEnable := false
 	logger.Infof("###BEGIN TESTCASE 1###\n")
 	// check to see existing deviceconfig DS pods
@@ -622,11 +677,14 @@
 	configManagerEnable = true
 	updConfig.Spec.ConfigManager.Enable = &configManagerEnable
 
+	s.ensureDefaultDCMConfigMap(c)
+
 	logger.Infof("update dcm-config %+v", updConfig.Spec.ConfigManager)
 	_, err = s.dClient.DeviceConfigs(s.ns).Update(updConfig)
 	assert.NoError(c, err, "failed to update %v", updConfig.Name)
 	s.checkDeviceConfigManagerStatus(updConfig, s.ns, c)
+	s.verifyDCMConfigMapVolumeRef(updConfig, s.ns, configmanager.DefaultDCMConfigMapName, c)
 	logger.Infof("SUCCESSFULLY DEPLOYED DCM DAEMONSET")
 
 	logger.Infof("###END TESTCASE###\n")