-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathshowuserjobgpu
More file actions
executable file
·196 lines (168 loc) · 7.49 KB
/
Copy pathshowuserjobgpu
File metadata and controls
executable file
·196 lines (168 loc) · 7.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/bin/bash
#
# showuserjobgpu - report GPU utilisation for a user's Slurm jobs.
#
# Usage: showuserjobgpu [user]
#   user  account to inspect; defaults to the invoking user.
set -euo pipefail

# Fall back to `id -un` when $USER is not exported: under `set -u` a bare
# $USER would abort the script before anything useful is printed.
TARGET_USER=${1:-${USER:-$(id -un)}}
echo "Checking GPU utilization for $TARGET_USER"

# 1. Get Job ID, Node List, and State
JOBS=$(squeue -u "$TARGET_USER" -t RUNNING,COMPLETING --noheader -o "%i|%N|%T")

# Count the jobs from the snapshot we just took instead of asking squeue a
# second time: the reported number then always matches the rows processed
# later, and the scheduler is queried only once.
if [[ -z "$JOBS" ]]; then
  Num_Jobs=0
else
  # $JOBS is non-empty here, so at least one line matches and grep exits 0.
  Num_Jobs=$(grep -c . <<< "$JOBS")
fi
echo "$TARGET_USER has $Num_Jobs jobs"

if [[ -z "$JOBS" ]]; then
  echo "No running or completing jobs found."
  exit 0
fi
#######################################
# Expand a Slurm-style GPU index list into the comma-separated form that
# `nvidia-smi -i` accepts.
# Arguments: $1 - comma-separated indices and/or inclusive ranges ("0-2,4")
# Outputs:   expanded list on stdout ("0,1,2,4"). Tokens that are neither a
#            plain number nor a number-number range are dropped silently;
#            an empty or fully unparsable input yields an empty line.
#######################################
expand_gpu_ids_for_nvidia_smi() {
  local raw="${1:-}"
  local -a parts=() out=()
  # Everything is local so the caller's variables (notably any `i`, `start`
  # or `end`) are never clobbered.
  local part first last i

  IFS=',' read -ra parts <<< "$raw"
  # ${parts[@]+...} keeps `set -u` happy on bash < 4.4 when the list is empty.
  for part in ${parts[@]+"${parts[@]}"}; do
    if [[ "$part" =~ ^([0-9]+)-([0-9]+)$ ]]; then
      # Force base 10: a zero-padded bound such as "08" would otherwise be
      # parsed as an invalid octal literal and abort the arithmetic loop.
      first=$((10#${BASH_REMATCH[1]}))
      last=$((10#${BASH_REMATCH[2]}))
      for ((i = first; i <= last; i++)); do
        out+=("$i")
      done
    elif [[ "$part" =~ ^[0-9]+$ ]]; then
      out+=("$part")
    fi
  done

  # Join with commas; "${out[*]-}" expands to "" for an empty array.
  local IFS=,
  echo "${out[*]-}"
}
#######################################
# Record which GPU indices a job holds on each of its nodes.
# Runs `scontrol show job <id> -d` and, for every "Nodes=... (IDX:...)" line,
# stores the raw index list under each expanded hostname.
# Globals:   NODE_GPU_IDS (written) - must already be declared with
#            `declare -A` by the caller; entries are added, never removed.
# Arguments: $1 - Slurm job id
# Returns:   0 always; scontrol failures simply leave the map untouched.
#######################################
parse_job_node_gpu_map() {
  local jobid="$1"
  local scontrol_out line nodes_expr gpu_ids rest host
  local nodes_re='^[[:space:]]*Nodes=([^ ]+)'
  # A GRES entry starts right after "=" or "," - anchoring there keeps node
  # names that merely contain "gpu" (e.g. gpu01) from matching.
  local gpu_idx_re='(^|[=,])gpu[^(,[:space:]]*\(IDX:([^)]*)'

  # Best effort: a job that vanished between squeue and now is not an error.
  scontrol_out="$(scontrol show job "$jobid" -d 2>/dev/null || true)"

  while IFS= read -r line; do
    [[ "$line" =~ $nodes_re ]] || continue
    nodes_expr="${BASH_REMATCH[1]}"
    [[ "$line" == *"IDX:"* ]] || continue

    if [[ "$line" =~ $gpu_idx_re ]]; then
      # Prefer the index list of the gpu entry, so an allocation such as
      # "gpu:2(IDX:2-3),nic:1(IDX:0)" does not report the NIC index.
      gpu_ids="${BASH_REMATCH[2]}"
    else
      # No recognisable gpu entry: fall back to the last IDX: on the line.
      rest="${line##*IDX:}"
      gpu_ids="${rest%%\)*}"
    fi
    [[ -n "$gpu_ids" ]] || continue

    while IFS= read -r host; do
      [[ -n "$host" ]] || continue
      NODE_GPU_IDS["$host"]="$gpu_ids"
    done < <(scontrol show hostnames "$nodes_expr" 2>/dev/null || true)
  done <<< "$scontrol_out"

  return 0
}
# Emit one table row: the first eight arguments joined by tabs, then a newline.
# Column order: JobID, Node, State, GPU list, GPU index, GPU utilisation,
# GPU memory used, GPU name.
printf_row() {
  local -a cells=("$1" "$2" "$3" "$4" "$5" "$6" "$7" "$8")
  # "${cells[*]}" joins on the first character of IFS, i.e. a tab here.
  local IFS=$'\t'
  printf '%s\n' "${cells[*]}"
}
# -----------------------------------------------------------------
# Main report: one tab-separated row per (job, node, GPU), aligned by
# `column`. Reads the pipe-delimited job list in $JOBS (JobID|NodeList|State)
# and uses printf_row, parse_job_node_gpu_map and
# expand_gpu_ids_for_nvidia_smi defined earlier in this script.
# -----------------------------------------------------------------

# Print $1 with leading and trailing whitespace removed.
# Pure parameter expansion: unlike `echo | xargs` it cannot fail on quotes or
# backslashes in the value (which would abort the script under `set -e`).
# Interior whitespace is left as is.
trim_field() {
  local s="${1:-}"
  s="${s#"${s%%[![:space:]]*}"}"
  s="${s%"${s##*[![:space:]]}"}"
  printf '%s' "$s"
}

# hostname -> Slurm GPU index list for the job currently being processed.
declare -A NODE_GPU_IDS=()

{ printf_row "JobID" "Node" "State" "GPU‑list" "GPU‑idx" "GPU‑util(%)" \
    "Mem‑used(MiB)" "GPU‑name"
  printf_row "--------" "--------" "--------" "--------" \
    "--------" "--------" "--------" "--------"
  #printf '%*s\n' "$(tput cols)" '' | tr ' ' '-' # visual separator line

  while IFS='|' read -r JOBID NODELIST JOBSTATE; do
    # Skip malformed or empty lines (defensive).
    [[ -z ${JOBID:-} || -z ${NODELIST:-} || -z ${JOBSTATE:-} ]] && continue

    # 1. COMPLETING jobs stay in the table, but their GPUs are not queried.
    if [[ $JOBSTATE == COMPLETING || $JOBSTATE == CG ]]; then
      printf_row "$JOBID" "—" "$JOBSTATE" "—" "—" "—" "—" "—"
      continue
    fi

    # 2. Expand the compressed node list into individual hostnames.
    NODES=$(scontrol show hostnames "$NODELIST" 2>/dev/null || true)
    if [[ -z ${NODES:-} ]]; then
      echo "WARNING: could not expand nodes for JobID $JOBID (NODELIST=$NODELIST)" >&2
      continue
    fi

    # 3. Rebuild the node -> GPU-ids map for this job. It must be emptied
    #    first: otherwise entries left by a previous job would be attributed
    #    to this one whenever both touch the same node.
    NODE_GPU_IDS=()
    parse_job_node_gpu_map "$JOBID"

    # 4. Walk over every node the job touches.
    for node in $NODES; do
      GPU_IDS="${NODE_GPU_IDS[$node]:-}"
      [[ -z ${GPU_IDS:-} ]] && continue # no GPU on this node -> skip

      # Turn e.g. "0-2,4" into "0,1,2,4" for `nvidia-smi -i`.
      GPU_IDS_FOR_NVIDIA_SMI=$(expand_gpu_ids_for_nvidia_smi "$GPU_IDS")
      if [[ -z "$GPU_IDS_FOR_NVIDIA_SMI" ]]; then
        # Nothing parsable: an empty `-i` argument would only produce a
        # malformed remote command, so report it without contacting the node.
        echo "WARNING: could not parse GPU IDs '$GPU_IDS' for JobID $JOBID on $node" >&2
        printf_row "$JOBID" "$node" "$JOBSTATE" "$GPU_IDS" "—" "—" "—" "—"
        continue
      fi

      # Query only the GPUs that belong to the job. The id list consists of
      # digits and commas only, so it is safe to splice into the remote
      # command line.
      SMICMD="nvidia-smi -i $GPU_IDS_FOR_NVIDIA_SMI"
      SMICMD+=" --query-gpu=index,utilization.gpu,memory.used,name"
      SMICMD+=" --format=csv,noheader,nounits"

      # Run over SSH; a failure or an empty answer becomes a warning plus a
      # placeholder row so the node still shows up in the table.
      if ! SSH_OUT=$(ssh -n -q -o ConnectTimeout=5 "$node" "$SMICMD" < /dev/null 2>/dev/null) \
        || [[ -z "$SSH_OUT" ]]; then
        echo "WARNING: failed to query nvidia-smi for JobID $JOBID on $node (GPU IDs: $GPU_IDS_FOR_NVIDIA_SMI)" >&2
        printf_row "$JOBID" "$node" "$JOBSTATE" "$GPU_IDS" "—" "—" "—" "—"
        continue
      fi

      # 5. One output row per GPU reported by nvidia-smi.
      while IFS=',' read -r IDX UTIL MEM NAME; do
        IDX=$(trim_field "$IDX")
        UTIL=$(trim_field "$UTIL")
        MEM=$(trim_field "$MEM")
        NAME=$(trim_field "$NAME")
        # Ignore blank lines in the remote output.
        [[ -n "${IDX}${UTIL}${MEM}${NAME}" ]] || continue
        printf_row "$JOBID" "$node" "$JOBSTATE" "$GPU_IDS" "$IDX" "$UTIL" "$MEM" "$NAME"
      done <<< "$SSH_OUT"
    done
  done <<< "$JOBS"
} | column -t -s $'\t'
# ------------------------------------------------------------
# End of script
# ------------------------------------------------------------
#while IFS='|' read -r JOBID NODELIST JOBSTATE; do
# [ -z "${JOBID:-}" ] && continue
# [ -z "${NODELIST:-}" ] && continue
# [ -z "${JOBSTATE:-}" ] && continue
#
# # If it's a job that is completing, report it and skip GPU query
# if [[ "$JOBSTATE" == "COMPLETING" || "$JOBSTATE" == "CG" ]]; then
# echo "------------------------------------------------"
# echo "JobID: $JOBID | NodeList: $NODELIST | State: $JOBSTATE"
# echo "------------------------------------------------"
# continue
# fi
#
# NODES=$(scontrol show hostnames "$NODELIST" 2>/dev/null || true)
# if [ -z "${NODES:-}" ]; then
# echo "WARNING: could not expand nodes for JobID $JOBID (NODELIST=$NODELIST)" >&2
# continue
# fi
#
# declare -A NODE_GPU_IDS
# parse_job_node_gpu_map "$JOBID"
#
# for node in $NODES; do
# GPU_IDS="${NODE_GPU_IDS[$node]:-}"
#
# if [ -z "${GPU_IDS:-}" ]; then
# continue
# fi
#
# GPU_IDS_FOR_NVIDIA_SMI=$(expand_gpu_ids_for_nvidia_smi "$GPU_IDS")
#
# echo "------------------------------------------------"
# echo "JobID: $JOBID | Node: $node"
# echo "Slurm Allocated GPU Index: $GPU_IDS"
# echo "------------------------------------------------"
#
# if ! ssh -n -q -o ConnectTimeout=5 "$node" "nvidia-smi -i $GPU_IDS_FOR_NVIDIA_SMI --query-gpu=index,utilization.gpu,memory.used,name --format=csv,noheader" < /dev/null; then
# echo "WARNING: failed to query nvidia-smi for JobID $JOBID on node $node (GPU IDs: $GPU_IDS_FOR_NVIDIA_SMI)" >&2
# fi
# done
#done <<< "$JOBS"