Add mooncake dockerfile build#771
Conversation
There was a problem hiding this comment.
Pull request overview
Adds Mooncake to the ATOM Docker multi-stage “parallel build” image so it can be built in its own stage and then merged into the final atom_image runtime.
Changes:
- Extend the BuildKit parallel stage layout to include a new `build_mooncake` stage.
- Add a `build_mooncake` stage that installs dependencies, builds Mooncake with HIP/etcd enabled, and installs its Python wheel.
- Copy Mooncake runtime artifacts (libs, etcd binaries, Python package) into the final `atom_image` stage and validate import.
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
| openssh-server openmpi-bin openmpi-common libopenmpi-dev && \ | ||
| rm -rf /var/lib/apt/lists/* | ||
|
|
||
| RUN apt-get update && apt-get remove -y golang golang-go 2>/dev/null || true && \ |
| wget -q https://go.dev/dl/go1.22.2.linux-amd64.tar.gz && \ | ||
| tar -C /usr/local -xzf go1.22.2.linux-amd64.tar.gz && \ | ||
| rm go1.22.2.linux-amd64.tar.gz |
| COPY --from=build_mooncake /opt/venv/lib/python3.12/site-packages/mooncake /opt/venv/lib/python3.12/site-packages/mooncake | ||
| COPY --from=build_mooncake /opt/venv/lib/python3.12/site-packages/mooncake-*.dist-info /opt/venv/lib/python3.12/site-packages/ | ||
| RUN apt-get update && \ | ||
| apt-get install -y --no-install-recommends librdmacm-dev libibverbs-dev rdma-core && \ |
There was a problem hiding this comment.
Pull request overview
Copilot reviewed 5 out of 5 changed files in this pull request and generated 4 comments.
Comments suppressed due to low confidence (4)
.github/workflows/atom-benchmark.yaml:393
- Same issue here: `inputs.pd-benchmark` is not a valid property access for an input id containing `-`. Use `inputs['pd-benchmark']` or rename the input key to avoid hyphens.
if: >-
!cancelled()
&& (github.event_name == 'schedule'
|| inputs.pd-benchmark != false)
needs: [benchmark]
.github/scripts/atom_pd_test.sh:406
`local idx=0` is used at top-level inside the `launch-all` branch. `local` is only valid within functions; this will cause the script to exit. Use `idx=0` instead (or move this logic into a function).
launch_proxy "$local_ip"
sleep 2
# Launch consumer on each decode node (different port per node)
local idx=0
while IFS= read -r decode_host; do
[ -z "$decode_host" ] && continue
consumer_port=$((CONSUMER_BASE_PORT + idx))
.github/scripts/atom_pd_test.sh:523
`local idx=0` is used at top-level in the `dump-logs` branch, which will error because `local` is only valid inside functions. Change it to `idx=0` (or wrap this section in a function) to prevent the script from aborting.
if [ "$TYPE" == "dump-logs" ]; then
echo "========== Proxy Log =========="
cat "$PROXY_LOG" 2>/dev/null || echo "(no proxy log)"
echo ""
echo "========== Producer Log =========="
cat "$PRODUCER_LOG" 2>/dev/null || echo "(no producer log)"
local idx=0
while IFS= read -r decode_host; do
.github/scripts/atom_pd_test.sh:194
- Same issue for consumers: `"kv_connector": "mooncake"` is not a registered KV connector backend in ATOM today, so consumer `openai_server` instances will fail at startup. Register the backend or use the existing supported connector key.
kv_config=$(cat <<KVJSON
{
"kv_role": "kv_consumer",
"kv_connector": "mooncake",
"proxy_ip": "${local_ip}",
"proxy_ping_port": ${DISCOVERY_PORT},
"http_port": ${consumer_port}
}
| if: >- | ||
| !cancelled() | ||
| && (github.event_name == 'schedule' | ||
| || inputs.pd-benchmark != false) | ||
| needs: [benchmark] |
| echo "========== Node Assignment ==========" | ||
| echo " Prefill: $(get_local_hostname) (${local_ip})" | ||
| local idx=0 | ||
| while IFS= read -r dh; do | ||
| [ -z "$dh" ] && continue | ||
| dip=$(get_remote_ip "$dh") | ||
| echo " Decode #${idx}: ${dh} (${dip})" | ||
| idx=$((idx + 1)) |
| ssh -o StrictHostKeyChecking=no "${HOSTS[$i]}" bash -l <<EOF || true | ||
| docker stop atom-pd-consumer-${i} 2>/dev/null || true | ||
| docker rm atom-pd-consumer-${i} 2>/dev/null || true | ||
| EOF |
| kv_config=$(cat <<KVJSON | ||
| { | ||
| "kv_role": "kv_producer", | ||
| "kv_connector": "mooncake", | ||
| "proxy_ip": "${local_ip}", | ||
| "proxy_ping_port": ${DISCOVERY_PORT}, | ||
| "http_port": ${PRODUCER_PORT} | ||
| } |
| COPY --from=build_mooncake /opt/venv/lib/python3.12/site-packages/mooncake /opt/venv/lib/python3.12/site-packages/mooncake | ||
| COPY --from=build_mooncake /opt/venv/lib/python3.12/site-packages/mooncake-*.dist-info /opt/venv/lib/python3.12/site-packages/ | ||
| RUN apt-get update && \ | ||
| apt-get install -y --no-install-recommends librdmacm-dev libibverbs-dev rdma-core && \ |
| strategy: | ||
| fail-fast: false | ||
| max-parallel: 1 | ||
| matrix: | ||
| model: ${{ fromJson(needs.load-pd-models.outputs.models_json) }} | ||
| config: ${{ fromJson(needs.parse-pd-params.outputs.matrix_json) }} | ||
|
|
| - name: Setup SSH | ||
| run: | | ||
| mkdir -p ~/.ssh && chmod 700 ~/.ssh | ||
| if [ -n "${{ secrets.PD_SSH_KEY }}" ]; then | ||
| echo "${{ secrets.PD_SSH_KEY }}" > ~/.ssh/id_rsa | ||
| chmod 600 ~/.ssh/id_rsa | ||
| fi | ||
| IFS=',' read -ra NODES <<< "$PD_NODE_LIST" | ||
| for node in "${NODES[@]}"; do | ||
| ssh-keyscan -H "$(echo "$node" | xargs)" >> ~/.ssh/known_hosts 2>/dev/null || true | ||
| done |
| # PREFILL_ARGS, DECODE_ARGS, PROXY_PORT, PRODUCER_PORT, CONSUMER_PORT, | ||
| # ATOM_DOCKER_IMAGE, ISL, OSL, CONC, RANDOM_RANGE_RATIO, RESULT_FILENAME |
| # ── RDMA NIC detection (ported from mori/docker/ci_run.sh) ───────── | ||
|
|
||
| detect_nic_type() { | ||
| if [[ -n "${MORI_NIC_TYPE:-}" ]]; then echo "$MORI_NIC_TYPE"; return; fi | ||
| local bnxt=0 mlx5=0 ionic=0 | ||
| if [[ -d /sys/class/infiniband ]]; then | ||
| for dev in /sys/class/infiniband/*; do | ||
| local name; name=$(basename "$dev") | ||
| case "$name" in | ||
| bnxt_re*) ((bnxt++)) ;; mlx5*) ((mlx5++)) ;; ionic*) ((ionic++)) ;; | ||
| *) | ||
| local drv; drv=$(basename "$(readlink -f "$dev/device/driver" 2>/dev/null)" 2>/dev/null || true) | ||
| case "$drv" in bnxt*) ((bnxt++)) ;; mlx5*) ((mlx5++)) ;; ionic*) ((ionic++)) ;; esac ;; | ||
| esac | ||
| done | ||
| fi | ||
| if (( bnxt >= mlx5 && bnxt >= ionic && bnxt > 0 )); then echo "bnxt" | ||
| elif (( ionic >= mlx5 && ionic > 0 )); then echo "ionic" | ||
| else echo "mlx5"; fi | ||
| } |
|
|
||
| SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=5 -o BatchMode=yes" |
| "concurrency": [32, 64, 128, 256, 512, 1024, 2048], | ||
| "random_range_ratio": 0.8 | ||
| }, | ||
| { | ||
| "isl": 8192, | ||
| "osl": 1024, | ||
| "concurrency": [32, 64, 128, 256, 512, 1024, 2048], |
2af9c26 to
22c597e
Compare
22c597e to
a4fab45
Compare
There was a problem hiding this comment.
Pull request overview
Copilot reviewed 5 out of 5 changed files in this pull request and generated 4 comments.
Comments suppressed due to low confidence (1)
.github/workflows/atom-benchmark.yaml:658
- Same heredoc indentation issue here: the `EOF` terminator is indented, so bash will not close the heredoc correctly. Unindent the terminator (or use `<<-EOF` with tabs) to ensure remote cleanup commands execute reliably.
ssh -o StrictHostKeyChecking=no "${HOSTS[$i]}" bash -l <<EOF || true
docker stop atom-pd-consumer-${i} 2>/dev/null || true
docker rm atom-pd-consumer-${i} 2>/dev/null || true
EOF
done
| ssh -o StrictHostKeyChecking=no "${HOSTS[$i]}" bash -l <<EOF || true | ||
| docker stop atom-pd-consumer-${i} 2>/dev/null || true | ||
| docker rm atom-pd-consumer-${i} 2>/dev/null || true | ||
| EOF | ||
| done |
| - name: Install PD dependencies (prefill node) | ||
| run: | | ||
| docker exec atom-pd-prefill bash -lc \ | ||
| "pip install msgpack msgspec quart mooncake-transfer-engine" |
| "pip install msgpack msgspec quart mooncake-transfer-engine" | ||
|
|
| detect_nic_type() { | ||
| if [[ -n "${MORI_NIC_TYPE:-}" ]]; then echo "$MORI_NIC_TYPE"; return; fi | ||
| local bnxt=0 mlx5=0 ionic=0 | ||
| if [[ -d /sys/class/infiniband ]]; then | ||
| for dev in /sys/class/infiniband/*; do | ||
| local name; name=$(basename "$dev") | ||
| case "$name" in | ||
| bnxt_re*) ((bnxt++)) ;; mlx5*) ((mlx5++)) ;; ionic*) ((ionic++)) ;; | ||
| *) | ||
| local drv; drv=$(basename "$(readlink -f "$dev/device/driver" 2>/dev/null)" 2>/dev/null || true) | ||
| case "$drv" in bnxt*) ((bnxt++)) ;; mlx5*) ((mlx5++)) ;; ionic*) ((ionic++)) ;; esac ;; | ||
| esac | ||
| done | ||
| fi | ||
| if (( bnxt >= mlx5 && bnxt >= ionic && bnxt > 0 )); then echo "bnxt" | ||
| elif (( ionic >= mlx5 && ionic > 0 )); then echo "ionic" | ||
| else echo "mlx5"; fi | ||
| } | ||
|
|
||
| find_host_ibverbs() { | ||
| for c in /usr/lib64/libibverbs.so.1 /lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1; do | ||
| local resolved; resolved=$(readlink -f "$c" 2>/dev/null || true) | ||
| if [[ -f "$resolved" ]]; then echo "$resolved"; return; fi | ||
| done | ||
| } | ||
|
|
||
| nic_mount_flags() { | ||
| local nic_type="$1" flags=() | ||
| case "$nic_type" in | ||
| bnxt) | ||
| local hib; hib=$(find_host_ibverbs) | ||
| [[ -n "$hib" ]] && flags+=(-v "$hib:/lib/x86_64-linux-gnu/libibverbs.so.1") | ||
| for lib in /usr/local/lib/libbnxt_re-rdmav*.so; do | ||
| [[ -f "$lib" ]] && flags+=(-v "$lib:/usr/lib/x86_64-linux-gnu/libibverbs/$(basename "$lib")") | ||
| done | ||
| for lib in /usr/local/lib/libbnxt_re.so; do | ||
| [[ -f "$lib" ]] && flags+=(-v "$lib:/usr/lib/x86_64-linux-gnu/$(basename "$lib")") | ||
| done | ||
| [[ -d /etc/libibverbs.d ]] && flags+=(-v /etc/libibverbs.d:/etc/libibverbs.d:ro) | ||
| ;; | ||
| ionic) | ||
| local hib; hib=$(find_host_ibverbs) | ||
| [[ -n "$hib" ]] && flags+=(-v "$hib:/lib/x86_64-linux-gnu/libibverbs.so.1") | ||
| for dir in /usr/local/lib /usr/lib/x86_64-linux-gnu; do | ||
| for lib in "$dir"/libionic*.so; do | ||
| if [[ -f "$lib" ]]; then | ||
| local real; real=$(readlink -f "$lib") | ||
| [[ -f "$real" ]] && flags+=(-v "$real:$real") | ||
| flags+=(-v "$lib:/usr/lib/x86_64-linux-gnu/$(basename "$lib")") | ||
| fi | ||
| done | ||
| done | ||
| local pdir=/usr/lib/x86_64-linux-gnu/libibverbs | ||
| if [[ -d "$pdir" ]]; then | ||
| for lib in "$pdir"/libionic-rdmav*.so; do | ||
| [[ -f "$lib" ]] && flags+=(-v "$lib:$lib") | ||
| done | ||
| fi | ||
| [[ -d /etc/libibverbs.d ]] && flags+=(-v /etc/libibverbs.d:/etc/libibverbs.d:ro) | ||
| ;; | ||
| esac | ||
| echo "${flags[@]}" | ||
| } |
a4fab45 to
0142da3
Compare
There was a problem hiding this comment.
Pull request overview
Copilot reviewed 5 out of 5 changed files in this pull request and generated 4 comments.
Comments suppressed due to low confidence (2)
.github/workflows/atom-benchmark.yaml:511
- In the PD workflow container launch, NIC_TYPE can be detected as "ionic" (see case on infiniband devices), but only the bnxt path mounts extra RDMA user-space libraries. Either drop ionic detection here or add the corresponding ionic mount logic (similar to the bnxt mounts) so RDMA-enabled ionic runners work consistently.
NIC_TYPE="mlx5"
if [ -d /sys/class/infiniband ]; then
for dev in /sys/class/infiniband/*; do
name=$(basename "$dev")
case "$name" in bnxt_re*) NIC_TYPE="bnxt"; break ;; ionic*) NIC_TYPE="ionic"; break ;; esac
done
fi
if [ "$NIC_TYPE" = "bnxt" ]; then
for c in /usr/lib64/libibverbs.so.1 /lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1; do
resolved=$(readlink -f "$c" 2>/dev/null || true)
.github/scripts/atom_pd_test.sh:292
- In the remote consumer launch path, NIC_TYPE can become "ionic", but only the bnxt branch populates NIC_MOUNTS. This makes the ionic detection misleading and can break RDMA setups that require extra ionic userspace libraries. Either handle the ionic case or simplify detection to only the NIC types you actively support here.
# NIC detection for RDMA userspace libs (ported from mori)
NIC_MOUNTS=""
NIC_TYPE="mlx5"
if [ -d /sys/class/infiniband ]; then
for dev in /sys/class/infiniband/*; do
name=\$(basename "\$dev")
case "\$name" in bnxt_re*) NIC_TYPE="bnxt"; break ;; ionic*) NIC_TYPE="ionic"; break ;; esac
done
fi
if [ "\$NIC_TYPE" = "bnxt" ]; then
for c in /usr/lib64/libibverbs.so.1 /lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1; do
resolved=\$(readlink -f "\$c" 2>/dev/null || true)
| # ── RDMA NIC detection (ported from mori/docker/ci_run.sh) ───────── | ||
|
|
||
| detect_nic_type() { | ||
| if [[ -n "${MORI_NIC_TYPE:-}" ]]; then echo "$MORI_NIC_TYPE"; return; fi | ||
| local bnxt=0 mlx5=0 ionic=0 | ||
| if [[ -d /sys/class/infiniband ]]; then | ||
| for dev in /sys/class/infiniband/*; do | ||
| local name; name=$(basename "$dev") | ||
| case "$name" in | ||
| bnxt_re*) ((bnxt++)) ;; mlx5*) ((mlx5++)) ;; ionic*) ((ionic++)) ;; | ||
| *) | ||
| local drv; drv=$(basename "$(readlink -f "$dev/device/driver" 2>/dev/null)" 2>/dev/null || true) | ||
| case "$drv" in bnxt*) ((bnxt++)) ;; mlx5*) ((mlx5++)) ;; ionic*) ((ionic++)) ;; esac ;; | ||
| esac | ||
| done | ||
| fi | ||
| if (( bnxt >= mlx5 && bnxt >= ionic && bnxt > 0 )); then echo "bnxt" | ||
| elif (( ionic >= mlx5 && ionic > 0 )); then echo "ionic" | ||
| else echo "mlx5"; fi | ||
| } | ||
|
|
||
| find_host_ibverbs() { | ||
| for c in /usr/lib64/libibverbs.so.1 /lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1; do | ||
| local resolved; resolved=$(readlink -f "$c" 2>/dev/null || true) | ||
| if [[ -f "$resolved" ]]; then echo "$resolved"; return; fi | ||
| done | ||
| } | ||
|
|
| python -m atom.benchmarks.benchmark_serving \ | ||
| --model="$MODEL_PATH" --backend=vllm --base-url="http://localhost:${PROXY_PORT}" \ | ||
| --dataset-name=random \ | ||
| --random-input-len="$ISL" --random-output-len="$OSL" --random-range-ratio="$RANDOM_RANGE_RATIO" \ | ||
| --max-concurrency="$CONC" \ | ||
| --num-prompts="${NUM_PROMPTS_OVERRIDE:-$(( CONC * 10 ))}" \ | ||
| --trust-remote-code \ | ||
| --num-warmups="$(( CONC * 2 ))" \ |
| COPY --from=build_mooncake /opt/venv/lib/python3.12/site-packages/mooncake /opt/venv/lib/python3.12/site-packages/mooncake | ||
| COPY --from=build_mooncake /opt/venv/lib/python3.12/site-packages/mooncake-*.dist-info /opt/venv/lib/python3.12/site-packages/ | ||
| RUN apt-get update && \ | ||
| apt-get install -y --no-install-recommends librdmacm-dev libibverbs-dev rdma-core && \ |
| strategy: | ||
| fail-fast: false | ||
| max-parallel: 1 | ||
| matrix: | ||
| model: ${{ fromJson(needs.load-pd-models.outputs.models_json) }} | ||
| config: ${{ fromJson(needs.parse-pd-params.outputs.matrix_json) }} | ||
|
|
…e install The custom RCCL .deb (built from a specific commit) conflicts with the version expected by the ROCm apt repo's rocm-hip package. This caused apt-get install rdma-core to fail with unresolvable dependencies. Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
There was a problem hiding this comment.
Pull request overview
Copilot reviewed 5 out of 5 changed files in this pull request and generated 8 comments.
Comments suppressed due to low confidence (1)
.github/workflows/atom-benchmark.yaml:809
- Appending `pd_models.json` into the `models` list here will overwrite existing display-name mappings when the PD model shares the same `path`/HF last-segment and has no `suffix`. For example, `deepseek-ai/DeepSeek-R1-0528` already exists in `models.json`, and both PD entries reuse that `path`, so `mapping['deepseek-r1-0528']` will end up pointing at a PD display name and mislabel the standard single-node DeepSeek results on the dashboard. To avoid collisions, ensure PD configs produce a distinct key (e.g., add a `suffix` in `pd_models.json`, or update the mapping key logic to prefer `prefix` when present).
python3 -c "
import json
models = json.loads(open('.github/benchmark/models.json').read())
# Include PD model configs in dashboard mapping
try:
models += json.loads(open('.github/benchmark/pd_models.json').read())
except FileNotFoundError:
pass
mapping = {}
for m in models:
display = m.get('display','')
path = m.get('path','')
suffix = m.get('suffix','')
if not display or not path: continue
hf_name = path.split('/')[-1].lower()
# HF path-derived key (old data used model_id last segment + optional suffix)
hf_key = hf_name + suffix.lower() if suffix else hf_name
mapping[hf_key] = display
| zip unzip wget gcc make libtool autoconf cmake \ | ||
| librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool \ | ||
| libibverbs-dev rdma-core \ | ||
| openssh-server openmpi-bin openmpi-common libopenmpi-dev && \ |
| COPY --from=build_mooncake /usr/local/lib/libmooncake* /usr/local/lib/ | ||
| COPY --from=build_mooncake /usr/local/lib/libetcd_wrapper.so /usr/local/lib/ | ||
| COPY --from=build_mooncake /usr/local/lib/libgrpc* /usr/local/lib/ | ||
| COPY --from=build_mooncake /usr/local/lib/libprotobuf* /usr/local/lib/ | ||
| COPY --from=build_mooncake /usr/local/lib/libabsl* /usr/local/lib/ | ||
| COPY --from=build_mooncake /usr/local/lib/libupb* /usr/local/lib/ | ||
| COPY --from=build_mooncake /usr/local/lib/libre2* /usr/local/lib/ |
| COPY --from=build_mooncake /opt/venv/lib/python3.12/site-packages/mooncake-*.dist-info /opt/venv/lib/python3.12/site-packages/ | ||
| RUN apt-mark hold rccl rccl-dev rocm-hip && \ | ||
| apt-get update && \ | ||
| apt-get install -y --no-install-recommends librdmacm-dev libibverbs-dev && \ |
| - name: Setup SSH | ||
| run: | | ||
| mkdir -p ~/.ssh && chmod 700 ~/.ssh | ||
| if [ -n "${{ secrets.PD_SSH_KEY }}" ]; then | ||
| echo "${{ secrets.PD_SSH_KEY }}" > ~/.ssh/id_rsa | ||
| chmod 600 ~/.ssh/id_rsa | ||
| fi | ||
| IFS=',' read -ra NODES <<< "$PD_NODE_LIST" | ||
| for node in "${NODES[@]}"; do | ||
| ssh-keyscan -H "$(echo "$node" | xargs)" >> ~/.ssh/known_hosts 2>/dev/null || true | ||
| done |
| - name: Install PD dependencies (prefill node) | ||
| run: | | ||
| docker exec atom-pd-prefill bash -lc \ | ||
| "pip install msgpack msgspec quart mooncake-transfer-engine" |
| run: | | ||
| if [ -d "/models" ]; then model_path="/models/${{ env.MODEL_PATH }}" | ||
| else model_path="${{ env.MODEL_PATH }}"; fi | ||
|
|
||
| docker exec atom-pd-prefill bash -lc "set -euo pipefail | ||
| .github/scripts/atom_pd_test.sh launch-all $model_path ${{ inputs.extra_args || '' }}" | ||
|
|
| PROXY_PORT=${PROXY_PORT:-10001} | ||
| PRODUCER_PORT=${PRODUCER_PORT:-8003} | ||
| CONSUMER_BASE_PORT=${CONSUMER_BASE_PORT:-8004} | ||
| DISCOVERY_PORT=${DISCOVERY_PORT:-36367} | ||
| NUM_DECODE_NODES=${NUM_DECODE_NODES:-1} |
| # ── RDMA NIC detection (ported from mori/docker/ci_run.sh) ───────── | ||
|
|
||
| detect_nic_type() { | ||
| if [[ -n "${MORI_NIC_TYPE:-}" ]]; then echo "$MORI_NIC_TYPE"; return; fi | ||
| local bnxt=0 mlx5=0 ionic=0 |
| RUN rm -rf /usr/local/go && \ | ||
| wget -q https://go.dev/dl/go1.22.2.linux-amd64.tar.gz && \ | ||
| tar -C /usr/local -xzf go1.22.2.linux-amd64.tar.gz && \ | ||
| rm go1.22.2.linux-amd64.tar.gz && \ | ||
| go version |
| librdmacm-dev libibverbs-dev \ | ||
| libgflags-dev libgoogle-glog-dev \ | ||
| libjsoncpp-dev libyaml-cpp-dev \ | ||
| libnuma-dev libcurl4-openssl-dev \ | ||
| libunwind-dev libc-ares-dev && \ |
| docker exec atom-pd-prefill bash -lc \ | ||
| "pip install msgpack msgspec quart mooncake-transfer-engine" | ||
|
|
| run: | | ||
| if [ -d "/models" ]; then model_path="/models/${{ env.MODEL_PATH }}" | ||
| else model_path="${{ env.MODEL_PATH }}"; fi | ||
|
|
||
| docker exec atom-pd-prefill bash -lc "set -euo pipefail | ||
| .github/scripts/atom_pd_test.sh launch-all $model_path ${{ inputs.extra_args || '' }}" |
| # ── RDMA NIC detection (ported from mori/docker/ci_run.sh) ───────── | ||
|
|
||
| detect_nic_type() { | ||
| if [[ -n "${MORI_NIC_TYPE:-}" ]]; then echo "$MORI_NIC_TYPE"; return; fi | ||
| local bnxt=0 mlx5=0 ionic=0 | ||
| if [[ -d /sys/class/infiniband ]]; then | ||
| for dev in /sys/class/infiniband/*; do | ||
| local name; name=$(basename "$dev") | ||
| case "$name" in | ||
| bnxt_re*) ((bnxt++)) ;; mlx5*) ((mlx5++)) ;; ionic*) ((ionic++)) ;; | ||
| *) | ||
| local drv; drv=$(basename "$(readlink -f "$dev/device/driver" 2>/dev/null)" 2>/dev/null || true) | ||
| case "$drv" in bnxt*) ((bnxt++)) ;; mlx5*) ((mlx5++)) ;; ionic*) ((ionic++)) ;; esac ;; | ||
| esac | ||
| done | ||
| fi | ||
| if (( bnxt >= mlx5 && bnxt >= ionic && bnxt > 0 )); then echo "bnxt" | ||
| elif (( ionic >= mlx5 && ionic > 0 )); then echo "ionic" | ||
| else echo "mlx5"; fi | ||
| } | ||
|
|
||
| find_host_ibverbs() { | ||
| for c in /usr/lib64/libibverbs.so.1 /lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1; do | ||
| local resolved; resolved=$(readlink -f "$c" 2>/dev/null || true) | ||
| if [[ -f "$resolved" ]]; then echo "$resolved"; return; fi | ||
| done | ||
| } | ||
|
|
| if docker ps -q -f "name=atom-" 2>/dev/null | grep -q .; then | ||
| echo "BUSY:docker"; exit 1 | ||
| fi | ||
| USED=$(rocm-smi --showmemuse 2>/dev/null | grep "VRAM%" | awk '{print $NF}' | awk '$1 > 0' | wc -l) |
| docker run -dt --device=/dev/kfd \$DEVICE_FLAG \$IB_FLAG \ | ||
| \$NIC_MOUNTS \ | ||
| \$MODEL_MOUNT \ | ||
| -w /workspace --ipc=host --group-add video \ | ||
| --shm-size=16G --privileged --cap-add=SYS_PTRACE \ | ||
| --security-opt seccomp=unconfined \ | ||
| --ulimit memlock=-1 --ulimit stack=67108864 \ | ||
| -e ATOM_DISABLE_MMAP=true \ | ||
| -e NCCL_SOCKET_IFNAME=lo \ | ||
| -e AITER_LOG_LEVEL=WARNING \ | ||
| --network=host \ | ||
| --name ${container_name} \ | ||
| ${ATOM_DOCKER_IMAGE:-rocm/atom-dev:latest} |
| docker exec ${container_name} bash -lc \ | ||
| "pip install msgpack msgspec quart mooncake-transfer-engine" | ||
|
|
Summary
Technical Details
Test Plan
Test Result
Submission Checklist