diff --git a/src/cluster-configuration/deploy/start.sh.template b/src/cluster-configuration/deploy/start.sh.template index 216d8696..5642c40f 100644 --- a/src/cluster-configuration/deploy/start.sh.template +++ b/src/cluster-configuration/deploy/start.sh.template @@ -48,6 +48,19 @@ echo kubectl label nodes {{ cluster_cfg['layout']['machine-list'][host]['hostnam {%- endif %} {%- if 'pai-worker' in cluster_cfg['layout']['machine-list'][host] and cluster_cfg['layout']['machine-list'][host]['pai-worker'] == 'true' %} echo kubectl label --overwrite=true nodes {{ cluster_cfg['layout']['machine-list'][host]['hostname'] }} pai-worker=true || exit $? + {%- set machine_type = cluster_cfg['layout']['machine-list'][host]['machine-type'] %} + {%- if machine_type in cluster_cfg['layout']['machine-sku'] and 'computing-device' in cluster_cfg['layout']['machine-sku'][machine_type] %} + {%- set device_type = cluster_cfg['layout']['machine-sku'][machine_type]['computing-device']['type'] %} + {%- if device_type == 'nvidia.com/gpu' %} +echo kubectl label --overwrite=true nodes {{ cluster_cfg['layout']['machine-list'][host]['hostname'] }} vendor=nvidia || exit $? + {%- elif device_type == 'amd.com/gpu' %} +echo kubectl label --overwrite=true nodes {{ cluster_cfg['layout']['machine-list'][host]['hostname'] }} vendor=amd || exit $? + {%- else %} +echo kubectl label --overwrite=true nodes {{ cluster_cfg['layout']['machine-list'][host]['hostname'] }} vendor=unknown || exit $? + {%- endif %} + {%- else %} +echo kubectl label --overwrite=true nodes {{ cluster_cfg['layout']['machine-list'][host]['hostname'] }} vendor=cpu || exit $? + {%- endif %} {%- else %} echo kubectl label nodes {{ cluster_cfg['layout']['machine-list'][host]['hostname'] }} pai-worker- || exit $? {%- endif %} diff --git a/src/device-plugin/deploy/start.sh.template b/src/device-plugin/deploy/start.sh.template index 91b8b598..7aaeeeec 100644 --- a/src/device-plugin/deploy/start.sh.template +++ b/src/device-plugin/deploy/start.sh.template @@ -36,7 +36,8 @@ pushd $(dirname "$0") > /dev/null s/^([[:space:]]*)allowPrivilegeEscalation: false.*$/\1privileged: false/ G s/(^[[:space:]]*allowPrivilegeEscalation: false.*)\n([[:space:]]*privileged: false)/\1\n\2/ -}'; +}' \ +| sed '/^[[:space:]]*tolerations:/i\ nodeSelector:\n vendor: nvidia'; cat <<'YAML' imagePullSecrets: - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }} @@ -50,7 +51,12 @@ YAML {% if 'amd.com/gpu' in cluster_cfg['device-plugin']['devices'] %} { curl -s https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml \ -| sed 's|rocm/k8s-device-plugin|{{ cluster_cfg['cluster']['docker-registry']['prefix'] }}k8s-rocm-device-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|'; +| sed 's|rocm/k8s-device-plugin|{{ cluster_cfg['cluster']['docker-registry']['prefix'] }}k8s-rocm-device-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|' \ +| sed -E '/^[[:space:]]*nodeSelector:[[:space:]]*$/{ + n + s/^([[:space:]]*)(.*)$/\1vendor: amd\ +\1\2/ +}'; cat <<'YAML' imagePullSecrets: - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}