-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgetjobutilurl
More file actions
executable file
·230 lines (188 loc) · 6.5 KB
/
Copy pathgetjobutilurl
File metadata and controls
executable file
·230 lines (188 loc) · 6.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
#!/usr/bin/env bash
# Print the command synopsis on stdout, then end the script with status 0.
show_usage() {
  printf '%s\n' \
    "Usage:" \
    " getjobutilurl <slurm_job_id>"
  exit 0
}
# just for better error handling
set -euo pipefail

# Require exactly one argument (the Slurm job ID) before any accounting query
# is issued. The :- default keeps a missing argument from tripping `set -u`,
# so the usage text is shown instead of an "unbound variable" error.
if [[ $# -ne 1 ]]; then
  show_usage
fi
JID="${1:-}"

# color styling (cyan), only when stdout is a terminal
if [[ -t 1 ]]; then
  C=$'\033[0;36m' # cyan
  R=$'\033[0m'    # reset
else
  C=""
  R=""
fi

# Print one sacct field for the job's allocation record (-X), without header
# (-n) and in parsable form (-P).
# Arguments: $1 - sacct field name (e.g. nodelist, start, alloctres)
# Outputs:   the raw field value(s) on stdout; nothing if sacct fails
# A failed query deliberately yields an empty value instead of aborting under
# `set -e`, so the empty-node-list check later in the script can report it.
sacct_field() {
  sacct -j "$JID" -X -nP -o "$1" 2>/dev/null || true
}

NL=$(sacct_field nodelist)
#[[ $? = 0 ]] && show_usage

# SLURM_TIME_FORMAT=%s asks sacct for epoch seconds
JS=$(SLURM_TIME_FORMAT=%s sacct_field start)
JE=$(SLURM_TIME_FORMAT=%s sacct_field end)

# Pull fields for job allocation
ALLOCCPUS=$(sacct_field alloccpus)
REQMEM=$(sacct_field reqmem)
ALLOCTRES=$(sacct_field alloctres)

# pull job state and user (first record only)
JOB_STATE=$(sacct_field state | head -n 1)
JOB_USER=$(sacct_field user | head -n 1)

# pull cpu and gpu indices from the detailed job view; each extraction ends in
# `|| true` because grep exits non-zero when the token is absent
SCONTROL_OUT="$(scontrol show job -d "$JID" 2>/dev/null || true)"
JOB_GRES="$(printf '%s\n' "$SCONTROL_OUT" | grep -oE 'JOB_GRES=[^ ]+' | head -n 1 | cut -d= -f2 || true)"
CPU_IDS="$(printf '%s\n' "$SCONTROL_OUT" | grep -oE 'CPU_IDs=[^ ]+' | head -n 1 | cut -d= -f2 || true)"
GPU_IDX="$(printf '%s\n' "$SCONTROL_OUT" | grep -oE 'IDX:[0-9,-]+' | head -n 1 | cut -d: -f2 || true)"
# Expand a Slurm index list into a plain comma-separated list of integers.
#   "0,1,3" -> "0,1,3"     "5-6" -> "5,6"     "0-2,7" -> "0,1,2,7"
# Arguments: $1 - comma-separated tokens, each either N or N-M
# Outputs:   the expanded list on stdout (an empty line for empty input);
#            tokens that are neither N nor N-M are dropped, and a range
#            whose start exceeds its end contributes nothing
expand_idx_list() {
  local spec="${1:-}"
  if [[ -z "$spec" ]]; then
    echo ""
    return
  fi

  # Everything is local so the caller's variables are never clobbered.
  local -a tokens=() expanded=()
  local token lo hi n
  IFS=',' read -ra tokens <<< "$spec"

  for token in "${tokens[@]}"; do
    if [[ "$token" =~ ^([0-9]+)-([0-9]+)$ ]]; then
      # Force base 10: a bound with a leading zero (e.g. "08") would
      # otherwise be rejected as invalid octal by shell arithmetic.
      lo=$((10#${BASH_REMATCH[1]}))
      hi=$((10#${BASH_REMATCH[2]}))
      for ((n = lo; n <= hi; n++)); do
        expanded+=("$n")
      done
    elif [[ "$token" =~ ^[0-9]+$ ]]; then
      expanded+=("$token")
    fi
  done

  # join with commas; the "-" default keeps an empty result from being an
  # unbound-variable error under `set -u` on older bash versions
  local IFS=,
  echo "${expanded[*]-}"
}
# multi-node jobs: per-node CPU_IDs and GPU IDX, keyed by hostname
declare -A NODE_CPU_IDS
declare -A NODE_GPU_IDX

# A100 HPE GPU SLURM to NVIDIA-SMI mapping: indices are swapped pairwise
# (0<->2, 1<->3, 4<->6, 5<->7).
declare -A MAP=(
  [0]=2 [2]=0
  [1]=3 [3]=1
  [4]=6 [6]=4
  [5]=7 [7]=5
)

# Translate a comma-separated GPU index list for a single host.
# Globals:   MAP (read)
# Arguments: $1 - hostname
#            $2 - comma-separated GPU indices
# Outputs:   the list on stdout, remapped through MAP when the host is one of
#            the hard-coded A100 HPE nodes (tc-gpu001..tc-gpu004) and unchanged
#            for any other host; indices without a MAP entry pass through
map_gpu_idx_for_host() {
  local host="$1" idx_csv="$2"

  case "$host" in
    tc-gpu001 | tc-gpu002 | tc-gpu003 | tc-gpu004) ;;
    *)
      printf '%s\n' "$idx_csv"
      return 0
      ;;
  esac

  if [[ -z "$idx_csv" ]]; then
    printf '\n'
    return 0
  fi

  local -a idx=()
  local k cur
  IFS=',' read -ra idx <<< "$idx_csv"
  for k in "${!idx[@]}"; do
    cur="${idx[$k]}"
    # an empty element cannot be used as an associative-array subscript
    [[ -n "$cur" ]] || continue
    idx[$k]="${MAP[$cur]:-$cur}"
  done

  local IFS=,
  printf '%s\n' "${idx[*]}"
}

while IFS= read -r line; do
  # only consider lines that include Nodes= and CPU_IDs=
  [[ "$line" == *"Nodes="* ]] || continue
  [[ "$line" == *"CPU_IDs="* ]] || continue

  # `|| true`: grep exits non-zero when the token is missing, which would
  # otherwise fail the pipeline under pipefail
  nodes_expr="$(printf '%s\n' "$line" | grep -oE 'Nodes=[^ ]+' | cut -d= -f2 || true)"
  cpu_ids_val="$(printf '%s\n' "$line" | grep -oE 'CPU_IDs=[^ ]+' | cut -d= -f2 || true)"
  # text between the last "IDX:" on the line and the following ")"
  gpu_idx_raw="$(printf '%s\n' "$line" | sed -n 's/.*IDX:\([^)]*\).*/\1/p' | head -n1 || true)"

  gpu_idx_expanded=""
  if [[ -n "$gpu_idx_raw" ]]; then
    gpu_idx_expanded="$(expand_idx_list "$gpu_idx_raw")"
  fi

  [[ -z "${nodes_expr:-}" ]] && continue

  # Expand the node expression into hostnames first and remap per host, so a
  # grouped expression such as tc-gpu[001-002] gets the A100 mapping as well.
  while IFS= read -r host; do
    [[ -n "$host" ]] || continue
    if [[ -n "${cpu_ids_val:-}" ]]; then
      NODE_CPU_IDS["$host"]="$cpu_ids_val"
    fi
    if [[ -n "${gpu_idx_expanded:-}" ]]; then
      NODE_GPU_IDX["$host"]="$(map_gpu_idx_for_host "$host" "$gpu_idx_expanded")"
    fi
  done < <(scontrol show hostnames "$nodes_expr" 2>/dev/null)
done < <(printf '%s\n' "${SCONTROL_OUT:-}" | sed -n '/^[[:space:]]*Nodes=/p')
# Exactly one positional argument (the job ID) is accepted.
(( $# == 1 )) || show_usage

# A blank or "Unknown" node list means there is nothing to report on.
case "${NL:-}" in
  "" | Unknown)
    echo "ERROR: Job not found or accounting not available yet for: $JID or crosscheck you are on the correct cluster"
    echo "Try: squeue -j $JID"
    exit 1
    ;;
esac

# Node count = number of hostnames the node list expands to.
NUM_NODES=$(scontrol show hostnames "$NL" 2>/dev/null | wc -l)

# Flag jobs that span more than one node.
if (( ${NUM_NODES:-0} > 1 )); then
  IS_MULTI_NODE=1
else
  IS_MULTI_NODE=0
fi
# Normalize Slurm times to UTC
# - sacct reports local time
# - currently grafana expects UTC for timestamps (the workaround is using browser time in grafana)
#
# Arguments: $1 - a time value: epoch seconds, or a timestamp string in local
#                 time; anything from the first '|' onward is ignored
# Outputs:   YYYY-MM-DDTHH:MM:SS.000Z (UTC) on stdout, or an empty line when
#            the value is empty, "Unknown" or "None"
# Returns:   0 on success; the exit status of date(1) when the value cannot
#            be parsed
to_utc_z() {
  local t="${1:-}"
  local epoch

  # keep only the first '|'-separated field, then trim surrounding whitespace
  t="${t%%|*}"
  t="${t#"${t%%[![:space:]]*}"}"
  t="${t%"${t##*[![:space:]]}"}"

  # Placeholders are checked after normalizing, so padded or piped values
  # such as " None " or "Unknown|" are recognized too.
  if [[ -z "$t" || "$t" == "Unknown" || "$t" == "None" ]]; then
    echo ""
    return 0
  fi

  if [[ "$t" =~ ^[0-9]+$ ]]; then
    # already epoch seconds
    epoch="$t"
  else
    # Parse in the local time zone first. With -u, date would read the string
    # itself as UTC and no local->UTC conversion would take place.
    epoch="$(date -d "$t" +%s)" || return
  fi

  date -u -d "@$epoch" +"%Y-%m-%dT%H:%M:%S.000Z"
}
# Start of the dashboard time window (empty when the start time is not known).
FROM_UTC="$(to_utc_z "$JS")"

# End of the time window: a running job has no usable end time (empty,
# Unknown or None), in which case the window stays open-ended at "now".
TO_PARAM="now"
if [[ -n "${JE:-}" && "$JE" != "Unknown" && "$JE" != "None" ]]; then
  TO_UTC="$(to_utc_z "$JE")"
  if [[ -n "$TO_UTC" ]]; then
    TO_PARAM="$TO_UTC"
  fi
fi

# Job context GPU count from AllocTRES: value of the first "gres/gpu" entry in
# the comma-separated key=value list, or 0 when there is none
GPUCOUNT="0"
if [[ -n "${ALLOCTRES:-}" ]]; then
  gpu_field="$(echo "$ALLOCTRES" | tr ',' '\n' | awk -F'=' '$1=="gres/gpu"{print $2; exit}')"
  if [[ -n "${gpu_field:-}" ]]; then
    GPUCOUNT="$gpu_field"
  fi
fi

# quick header just for support context
echo
echo "${C}Job:${R} $JID ${C}User:${R} ${JOB_USER:-Unknown} ${C}State:${R} ${JOB_STATE:-Unknown}"
echo "${C}Allocation:${R} cpus=${ALLOCCPUS:-Unknown} mem=${REQMEM:-Unknown} gpus=$GPUCOUNT"
echo "${C}GRES:${R} ${JOB_GRES:-Unknown}"
echo "${C}Time window (UTC):${R} from=$FROM_UTC to=$TO_PARAM"
echo "${C}Nodes:${R} ${NUM_NODES:-Unknown}"
echo

# Pick the dashboard for the current cluster. SYSNAME comes from the
# environment; the :- default lets an unset value reach the "Unknown cluster"
# branch instead of aborting with an unbound-variable error under `set -u`.
case "${SYSNAME:-}" in
  falcon)
    DASHBOARD_URL="https://dashboard.arc.vt.edu/d/ac5tx82/arc-cluster-node-efficiency-falcon"
    ;;
  owl)
    DASHBOARD_URL="https://dashboard.arc.vt.edu/d/acgqw9w/arc-cluster-node-efficiency-owl"
    ;;
  tinkercliffs)
    DASHBOARD_URL="https://dashboard.arc.vt.edu/d/ac4cwcb/arc-cluster-node-efficiency-tinkercliffs"
    ;;
  *)
    echo "ERROR: Unknown cluster '${SYSNAME:-}'"
    exit 1
    ;;
esac

# One entry per allocated node: dashboard link plus the CPU/GPU indices.
for NN in $(scontrol show hostnames "$NL"); do
  # per-node CPU/GPU indices; fall back to the job-level values, then "Unknown"
  NN_CPU_IDS="${NODE_CPU_IDS[$NN]:-${CPU_IDS:-Unknown}}"
  NN_GPU_IDX="${NODE_GPU_IDX[$NN]:-${GPU_IDX:-Unknown}}"
  printf "%s:\n" "${C}${NN}${R}"
  echo " ${DASHBOARD_URL}?var-interval=1m&orgId=1&from=${FROM_UTC}&to=${TO_PARAM}&timezone=browser&var-hostname=${NN}"
  echo " ${C}CPU IDs:${R} ${NN_CPU_IDS:-Unknown}"
  echo " ${C}GPU IDX:${R} ${NN_GPU_IDX:-Unknown}"
  echo
done