-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathshowjobprocessesusage
More file actions
129 lines (104 loc) · 3.92 KB
/
Copy pathshowjobprocessesusage
File metadata and controls
129 lines (104 loc) · 3.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env bash
#
# showjobprocessesusage - list CPU, memory and GPU usage of a Slurm job's
# processes.
#
# Usage: showjobprocessesusage <SlurmJobID> <run_nvidia_smi>
#   SlurmJobID      job ID handed to `scontrol listpids`
#   run_nvidia_smi  1 to query nvidia-smi for per-process GPU usage;
#                   other integer values skip the GPU lookup
#
# Output (stdout): one "PID|%CPU|MEM(MB)|GPUs|CMD" row per process, followed
# by a "Total||<MB>||" row when the job's cgroup memory file exists.
set -euo pipefail

if [[ $# -ne 2 ]]; then
  # Diagnostics go to stderr so they never mix with the pipe-delimited rows.
  echo "Usage: $0 <SlurmJobID> <run_nvidia_smi>" >&2
  exit 1
fi

jobid="$1"
run_nvidia_smi="$2"
# Collect the job's PIDs from `scontrol listpids`: drop the header row and
# keep only first-column values made purely of digits (this discards '-1'
# and any other non-numeric entry). A failing scontrol yields an empty list.
mapfile -t pids < <(
  scontrol listpids "$jobid" 2>/dev/null \
    | awk 'NR > 1 && $1 ~ /^[0-9]+$/ { print $1 }' \
    || true
)

# Nothing to report: say so and finish successfully.
if (( ${#pids[@]} == 0 )); then
  echo "No numeric PIDs found for job $jobid (scontrol returned none or scontrol failed)."
  exit 0
fi
# Map GPU UUID to GPU index (1:1)
declare -A UUID_TO_INDEX=()

#######################################
# Fill UUID_TO_INDEX from "index, uuid" CSV lines read on stdin (the layout
# requested from `nvidia-smi --query-gpu=index,uuid --format=csv,noheader`).
# Lines without a numeric index or without a UUID are ignored: an empty UUID
# would be an invalid associative-array subscript and abort the script.
# Globals:
#   UUID_TO_INDEX (written)
# Arguments:
#   none (reads stdin)
#######################################
load_uuid_to_index() {
  local index uuid
  # Splitting on comma, space and tab separates the two fields and strips
  # the padding around them without spawning an external trim command.
  while IFS=$', \t' read -r index uuid; do
    [[ "$index" =~ ^[0-9]+$ && -n "$uuid" ]] || continue
    UUID_TO_INDEX["$uuid"]="$index"
  done
}

if [ "${run_nvidia_smi:-0}" -eq 1 ]; then
  load_uuid_to_index < <(nvidia-smi --query-gpu=index,uuid --format=csv,noheader,nounits)
fi
# Map PID to list of GPU UUIDs (1:N even though most likely 1:1)
declare -A PID_TO_UUID=()

#######################################
# Fill PID_TO_UUID from "gpu_uuid, pid" CSV lines read on stdin (the layout
# requested from `nvidia-smi --query-compute-apps=gpu_uuid,pid`).
# A PID seen on several lines gets a space-separated list of UUIDs.
# Lines without a UUID or without a numeric PID are ignored: an empty PID
# would be an invalid associative-array subscript and abort the script.
# Globals:
#   PID_TO_UUID (written)
# Arguments:
#   none (reads stdin)
#######################################
load_pid_to_uuid() {
  local uuid pid
  # Splitting on comma, space and tab separates the two fields and strips
  # the padding around them without spawning an external trim command.
  while IFS=$', \t' read -r uuid pid; do
    [[ -n "$uuid" && "$pid" =~ ^[0-9]+$ ]] || continue
    # Append uuid to existing list for this pid
    if [[ -n "${PID_TO_UUID[$pid]+x}" ]]; then
      PID_TO_UUID["$pid"]+=" $uuid"
    else
      PID_TO_UUID["$pid"]="$uuid"
    fi
  done
}

if [ "${run_nvidia_smi:-0}" -eq 1 ]; then
  load_pid_to_uuid < <(nvidia-smi --query-compute-apps=gpu_uuid,pid --format=csv,noheader,nounits)
fi
# Debug aid (disabled): print every PID with its GPU UUIDs and their indices
#for pid in "${!PID_TO_UUID[@]}"; do
# for uuid in ${PID_TO_UUID[$pid]}; do
# echo "PID $pid has GPU $uuid and IDX ${UUID_TO_INDEX[$uuid]}"
# done
#done
#######################################
# Map a PID to the indices of the GPUs it uses.
# Globals:
#   PID_TO_UUID   (read) PID -> space-separated GPU UUIDs
#   UUID_TO_INDEX (read) GPU UUID -> GPU index
# Arguments:
#   $1 - PID to look up
# Outputs:
#   Comma-separated GPU indices on stdout (e.g. "0,2"); an empty line when
#   the PID has no GPU entry or none of its UUIDs has a known index.
#######################################
get_gpu_idxs_for_pid() {
  local pid=${1:-}
  local uuid_list="" uuid joined=""
  local -a uuids=()

  # An empty key is not a valid associative-array subscript, so only look
  # the PID up when there is one.
  if [[ -n "$pid" ]]; then
    uuid_list=${PID_TO_UUID[$pid]:-}
  fi

  # if PID not in map, return empty
  if [[ -z "$uuid_list" ]]; then
    echo ""
    return
  fi

  IFS=' ' read -ra uuids <<<"$uuid_list"
  for uuid in "${uuids[@]}"; do
    # Under `set -u` expanding a missing key is a fatal "unbound variable"
    # error that would discard the whole list; skip unknown UUIDs instead.
    [[ -n "${UUID_TO_INDEX[$uuid]+x}" ]] || continue
    joined+="${joined:+,}${UUID_TO_INDEX[$uuid]}"
  done
  echo "$joined"
}
#for pid in "${!PID_TO_UUID[@]}"; do
# echo "PID $pid uses GPU indices: $(get_gpu_idxs_for_pid "$pid")"
#done
# Print header
#printf "%-8s %6s %10s %15s %s\n" "PID" "%CPU" "MEM(MB)" "GPUs" "CMD"
#printf "%-8s %6s %10s %15s %s\n" "--------" "------" "---------" "--------------" "----"
# For each PID, fetch %CPU, RSS (KB) and full command

#######################################
# Format one `ps -o %cpu=,rss=,args=` line as an output row.
# Arguments:
#   $1 - PID
#   $2 - ps output line: "<%CPU> <RSS in KB> <command line>"
#   $3 - comma-separated GPU indices for the PID (may be empty)
# Outputs:
#   "PID|%CPU|MEM(MB)|GPUs|CMD" on stdout; MEM is RSS/1024 with two decimals
#   ("N/A" when RSS is not numeric) and CMD is cut by the %.100s format.
#######################################
print_process_row() {
  local pid=$1 psline=$2 gpus=${3:-}
  local cpu="" rss="" cmd="" mem_mb

  # read drops leading whitespace, peels off the first two fields and leaves
  # the remainder (inner spacing intact) in cmd, so commands containing
  # spaces survive without extra sed/awk processes.
  read -r cpu rss cmd <<<"$psline" || true

  # Convert RSS (KB) to MB with 2 decimal places. If rss is non-numeric, show N/A
  if [[ "$rss" =~ ^[0-9]+$ ]]; then
    mem_mb=$(awk -v kb="$rss" 'BEGIN{ printf "%.2f", kb/1024 }')
  else
    mem_mb="N/A"
  fi

  # Filter (disabled): skip commands containing slurmstepd or sleep
  #if [[ "$cmd" =~ (slurmstepd:|sshd:|sleep[[:space:]]+100000000|job[0-9]+/slurm_script) ]]; then
  #  return 0
  #fi

  # Print row
  #printf "%-8s %6s %10s %15s %s\n" \
  printf "%s|%s|%s|%s|%.100s\n" \
    "$pid" "$cpu" "$mem_mb" "$gpus" "$cmd"
}

for pid in "${pids[@]}"; do
  # if process does not exist anymore, ps will produce empty output
  # use ps to get %cpu, rss (KB), and args
  psline=$(ps -p "$pid" -o %cpu=,rss=,args= 2>/dev/null || true)
  if [[ -z "${psline//[[:space:]]/}" ]]; then
    # either ps failed (no such pid) or empty output
    continue
  fi
  print_process_row "$pid" "$psline" "$(get_gpu_idxs_for_pid "$pid")"
done
# Total memory currently used by the job on the node (in MB)

#######################################
# Print the job's current cgroup memory usage as a "Total||<MB>||" row.
# Arguments:
#   $1 - Slurm job ID
#   $2 - (optional) directory containing the per-job "job_<id>" cgroup
#        directories; defaults to the slurmstepd scope under /sys/fs/cgroup
# Outputs:
#   "Total||<MB>||" on stdout, where MB is memory.current divided by
#   1048576 (integer division). Prints nothing when the file is missing,
#   cannot be read, or does not hold a plain integer.
#######################################
print_job_total_memory() {
  local jobid=$1
  local scope_dir=${2:-/sys/fs/cgroup/system.slice/slurmstepd.scope}
  local mem_file="${scope_dir}/job_${jobid}/memory.current"
  local mem_bytes mem_mb

  [[ -f "$mem_file" ]] || return 0
  # The job may end between the check above and this read; treat that as
  # "no total available" instead of letting `set -e` abort the script.
  mem_bytes=$(<"$mem_file") || return 0
  [[ "$mem_bytes" =~ ^[0-9]+$ ]] || return 0

  mem_mb=$((mem_bytes / 1048576))
  echo "Total||${mem_mb}||"
}

print_job_total_memory "$jobid"