From 7db5de1652ac50e77c296fe0c0c667f8473acba3 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Thu, 8 Jan 2026 15:05:42 +0800 Subject: [PATCH 1/3] add gpu vendor tag for working node --- src/cluster-configuration/deploy/start.sh.template | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/cluster-configuration/deploy/start.sh.template b/src/cluster-configuration/deploy/start.sh.template index 216d8696..5642c40f 100644 --- a/src/cluster-configuration/deploy/start.sh.template +++ b/src/cluster-configuration/deploy/start.sh.template @@ -48,6 +48,19 @@ echo kubectl label nodes {{ cluster_cfg['layout']['machine-list'][host]['hostnam {%- endif %} {%- if 'pai-worker' in cluster_cfg['layout']['machine-list'][host] and cluster_cfg['layout']['machine-list'][host]['pai-worker'] == 'true' %} echo kubectl label --overwrite=true nodes {{ cluster_cfg['layout']['machine-list'][host]['hostname'] }} pai-worker=true || exit $? + {%- set machine_type = cluster_cfg['layout']['machine-list'][host]['machine-type'] %} + {%- if machine_type in cluster_cfg['layout']['machine-sku'] and 'computing-device' in cluster_cfg['layout']['machine-sku'][machine_type] %} + {%- set device_type = cluster_cfg['layout']['machine-sku'][machine_type]['computing-device']['type'] %} + {%- if device_type == 'nvidia.com/gpu' %} +echo kubectl label --overwrite=true nodes {{ cluster_cfg['layout']['machine-list'][host]['hostname'] }} vendor=nvidia || exit $? + {%- elif device_type == 'amd.com/gpu' %} +echo kubectl label --overwrite=true nodes {{ cluster_cfg['layout']['machine-list'][host]['hostname'] }} vendor=amd || exit $? + {%- else %} +echo kubectl label --overwrite=true nodes {{ cluster_cfg['layout']['machine-list'][host]['hostname'] }} vendor=unknown || exit $? + {%- endif %} + {%- else %} +echo kubectl label --overwrite=true nodes {{ cluster_cfg['layout']['machine-list'][host]['hostname'] }} vendor=cpu || exit $? 
+ {%- endif %} {%- else %} echo kubectl label nodes {{ cluster_cfg['layout']['machine-list'][host]['hostname'] }} pai-worker- || exit $? {%- endif %} From 846e0946652575ccd04e84bca6180415914d436a Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Thu, 8 Jan 2026 15:53:13 +0800 Subject: [PATCH 2/3] add node selector to deploy device plugin only on GPU nodes with correct tag --- src/device-plugin/deploy/start.sh.template | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/device-plugin/deploy/start.sh.template b/src/device-plugin/deploy/start.sh.template index 91b8b598..3f8d81f3 100644 --- a/src/device-plugin/deploy/start.sh.template +++ b/src/device-plugin/deploy/start.sh.template @@ -36,7 +36,8 @@ pushd $(dirname "$0") > /dev/null s/^([[:space:]]*)allowPrivilegeEscalation: false.*$/\1privileged: false/ G s/(^[[:space:]]*allowPrivilegeEscalation: false.*)\n([[:space:]]*privileged: false)/\1\n\2/ -}'; +}' \ +| sed '/^[[:space:]]*tolerations:/i\ nodeSelector:\n vendor: nvidia'; cat <<'YAML' imagePullSecrets: - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }} @@ -50,7 +51,8 @@ YAML {% if 'amd.com/gpu' in cluster_cfg['device-plugin']['devices'] %} { curl -s https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml \ -| sed 's|rocm/k8s-device-plugin|{{ cluster_cfg['cluster']['docker-registry']['prefix'] }}k8s-rocm-device-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|'; +| sed 's|rocm/k8s-device-plugin|{{ cluster_cfg['cluster']['docker-registry']['prefix'] }}k8s-rocm-device-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|' \ +| sed '/^[[:space:]]*tolerations:/i\ nodeSelector:\n vendor: amd'; cat <<'YAML' imagePullSecrets: - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }} From 1b5d18f65a0d46903a933a8f1a777c10f4fb0792 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Fri, 9 Jan 2026 08:00:08 +0800 Subject: [PATCH 3/3] fix the redundant nodeselector for amd device 
plugin --- src/device-plugin/deploy/start.sh.template | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/device-plugin/deploy/start.sh.template b/src/device-plugin/deploy/start.sh.template index 3f8d81f3..7aaeeeec 100644 --- a/src/device-plugin/deploy/start.sh.template +++ b/src/device-plugin/deploy/start.sh.template @@ -52,7 +52,11 @@ YAML { curl -s https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml \ | sed 's|rocm/k8s-device-plugin|{{ cluster_cfg['cluster']['docker-registry']['prefix'] }}k8s-rocm-device-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|' \ -| sed '/^[[:space:]]*tolerations:/i\ nodeSelector:\n vendor: amd'; +| sed -E '/^[[:space:]]*nodeSelector:[[:space:]]*$/{ + n + s/^([[:space:]]*)(.*)$/\1vendor: amd\ +\1\2/ +}'; cat <<'YAML' imagePullSecrets: - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}