From b3432ed2c306417044234207bbf2191602e5f436 Mon Sep 17 00:00:00 2001 From: wanzhenchn Date: Wed, 6 May 2026 08:42:32 +0000 Subject: [PATCH 1/5] [ci] add Qwen3.5 Dense/MoE models accuracy validation for atom-plugined sglang --- .github/benchmark/sglang_models_accuracy.json | 36 +++++++++++++++ .../atom-sglang-accuracy-validation.yaml | 45 +++++++++++++++++++ .github/workflows/atom-sglang-test.yaml | 18 ++++++++ 3 files changed, 99 insertions(+) diff --git a/.github/benchmark/sglang_models_accuracy.json b/.github/benchmark/sglang_models_accuracy.json index 2580c133d..d644d3a27 100644 --- a/.github/benchmark/sglang_models_accuracy.json +++ b/.github/benchmark/sglang_models_accuracy.json @@ -10,5 +10,41 @@ "accuracy_baseline": null, "accuracy_baseline_model": "deepseek-ai/DeepSeek-R1-0528", "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." + }, + { + "model_name": "Qwen3.5-35B-A3B-FP8 TP2", + "model_path": "Qwen/Qwen3.5-35B-A3B-FP8", + "extraArgs": "--tensor-parallel-size 2", + "env_vars": "", + "runner": "linux-atom-mi35x-4", + "test_level": "nightly", + "accuracy_threshold": 0.89, + "accuracy_baseline": null, + "accuracy_baseline_model": "Qwen/Qwen3.5-35B-A3B-FP8", + "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." + }, + { + "model_name": "Qwen3.5-27B-FP8 TP2", + "model_path": "Qwen/Qwen3.5-27B-FP8", + "extraArgs": "--tensor-parallel-size 2", + "env_vars": "", + "runner": "linux-atom-mi35x-4", + "test_level": "nightly", + "accuracy_threshold": 0.88, + "accuracy_baseline": null, + "accuracy_baseline_model": "Qwen/Qwen3.5-27B-FP8", + "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." + }, + { + "model_name": "Qwen3.5-35B-A3B TP2", + "model_path": "Qwen/Qwen3.5-35B-A3B", + "extraArgs": "--tensor-parallel-size 2", + "env_vars": "", + "runner": "linux-atom-mi35x-4", + "test_level": "nightly", + "accuracy_threshold": 0.95, + "accuracy_baseline": null, + "accuracy_baseline_model": "Qwen/Qwen3.5-35B-A3B", + "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." } ] diff --git a/.github/workflows/atom-sglang-accuracy-validation.yaml b/.github/workflows/atom-sglang-accuracy-validation.yaml index 7e6b9baa1..81b3e6a84 100644 --- a/.github/workflows/atom-sglang-accuracy-validation.yaml +++ b/.github/workflows/atom-sglang-accuracy-validation.yaml @@ -14,6 +14,21 @@ on: required: false type: boolean default: false + run_qwen35_35b_a3b_fp8_tp2: + description: "Qwen3.5-35B-A3B-FP8 TP2" + required: false + type: boolean + default: false + run_qwen35_27b_fp8_tp2: + description: "Qwen3.5-27B-FP8 TP2" + required: false + type: boolean + default: false + run_qwen35_35b_a3b_tp2: + description: "Qwen3.5-35B-A3B TP2" + required: false + type: boolean + default: false upload_accuracy_to_dashboard: description: "Optional: upload SGLANG accuracy results to dashboard after this manual run" required: false @@ -55,6 +70,9 @@ jobs: id: meta env: RUN_DSR1_FP8_TP4: ${{ inputs.run_dsr1_fp8_tp4 }} + RUN_QWEN35_35B_A3B_FP8_TP2: ${{ inputs.run_qwen35_35b_a3b_fp8_tp2 }} + RUN_QWEN35_27B_FP8_TP2: ${{ inputs.run_qwen35_27b_fp8_tp2 }} + RUN_QWEN35_35B_A3B_TP2: ${{ inputs.run_qwen35_35b_a3b_tp2 }} run: | set -euo pipefail @@ -76,6 +94,33 @@ jobs: "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1", "runner": "linux-atom-mi35x-4", }, + { + "toggle_env": "RUN_QWEN35_35B_A3B_FP8_TP2", + "model_name": "Qwen3.5-35B-A3B-FP8 TP2", + "model_path": "Qwen/Qwen3.5-35B-A3B-FP8", + "extra_args": "--tensor-parallel-size 2", + "accuracy_test_threshold": 0.89, + "env_vars": "", + "runner": "linux-atom-mi35x-4", + }, + { + "toggle_env": "RUN_QWEN35_27B_FP8_TP2", + "model_name": "Qwen3.5-27B-FP8 TP2", + "model_path": "Qwen/Qwen3.5-27B-FP8", + "extra_args": "--tensor-parallel-size 2", + "accuracy_test_threshold": 0.88, + "env_vars": "", + "runner": "linux-atom-mi35x-4", + }, + { + "toggle_env": "RUN_QWEN35_35B_A3B_TP2", + "model_name": "Qwen3.5-35B-A3B TP2", + "model_path": "Qwen/Qwen3.5-35B-A3B", + "extra_args": "--tensor-parallel-size 2", + "accuracy_test_threshold": 0.95, + "env_vars": "", + "runner": "linux-atom-mi35x-4", + }, ] selected = [] diff --git a/.github/workflows/atom-sglang-test.yaml b/.github/workflows/atom-sglang-test.yaml index 4cd447d1d..1ac256522 100644 --- a/.github/workflows/atom-sglang-test.yaml +++ b/.github/workflows/atom-sglang-test.yaml @@ -127,6 +127,24 @@ jobs: ATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1 accuracy_test_threshold: 0.92 runner: linux-atom-mi35x-4 + - model_name: "Qwen3.5-35B-A3B-FP8 TP2" + model_path: "Qwen/Qwen3.5-35B-A3B-FP8" + extra_args: "--tensor-parallel-size 2" + env_vars: "" + accuracy_test_threshold: 0.89 + runner: linux-atom-mi35x-4 + - model_name: "Qwen3.5-27B-FP8 TP2" + model_path: "Qwen/Qwen3.5-27B-FP8" + extra_args: "--tensor-parallel-size 2" + env_vars: "" + accuracy_test_threshold: 0.88 + runner: linux-atom-mi35x-4 + - model_name: "Qwen3.5-35B-A3B TP2" + model_path: "Qwen/Qwen3.5-35B-A3B" + extra_args: "--tensor-parallel-size 2" + env_vars: "" + accuracy_test_threshold: 0.95 + runner: linux-atom-mi35x-4 runs-on: ${{ matrix.runner }} timeout-minutes: 180 env: From 248a91a539e31e48efbe4cad93eec173cc7aad38 Mon Sep 17 00:00:00 2001 From: wanzhenchn Date: Fri, 8 May 2026 09:52:38 +0000 Subject: [PATCH 2/5] [ci][benchmark] add Qwen3.5-397B-A13B-FP8 TP4/TP8 benchmark case on MI35X --- .../benchmark/sglang_benchmark_models.json | 22 +++++++++++++++++++ .github/workflows/atom-sglang-benchmark.yaml | 14 ++++++++++++ 2 files changed, 36 insertions(+) diff --git a/.github/benchmark/sglang_benchmark_models.json b/.github/benchmark/sglang_benchmark_models.json index 8ddee78d0..cd37fa6cf 100644 --- a/.github/benchmark/sglang_benchmark_models.json +++ b/.github/benchmark/sglang_benchmark_models.json @@ -51,5 +51,27 @@ "bench_args": "", "runner": "atom-mi355-8gpu-oot-benchmark", "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" + }, + { + "display": "Qwen3.5-397B-A17B-FP8 TP4", + "dashboard_model": "Qwen3.5-397B-A17B-FP8-tp4", + "source_path": "Qwen/Qwen3.5-397B-A17B-FP8", + "path": "Qwen/Qwen3.5-397B-A17B-FP8", + "prefix": "qwen3-5-397b-a17b-fp8-tp4", + "extra_args": "--trust-remote-code --tensor-parallel-size 4", + "bench_args": "", + "runner": "atom-mi355-8gpu-oot-benchmark", + "env_vars": "" + }, + { + "display": "Qwen3.5-397B-A17B-FP8 TP8", + "dashboard_model": "Qwen3.5-397B-A17B-FP8", + "source_path": "Qwen/Qwen3.5-397B-A17B-FP8", + "path": "Qwen/Qwen3.5-397B-A17B-FP8", + "prefix": "qwen3-5-397b-a17b-fp8-tp8", + "extra_args": "--trust-remote-code --tensor-parallel-size 8", + "bench_args": "", + "runner": "atom-mi355-8gpu-oot-benchmark", + "env_vars": "" } ] diff --git a/.github/workflows/atom-sglang-benchmark.yaml b/.github/workflows/atom-sglang-benchmark.yaml index 66720b924..867d47f71 100644 --- a/.github/workflows/atom-sglang-benchmark.yaml +++ b/.github/workflows/atom-sglang-benchmark.yaml @@ -27,6 +27,14 @@ on: description: "DeepSeek-R1-0528-MXFP4 FP4 TP8 EP8" type: boolean default: false + qwen3-5-397b-a17b-fp8-tp4: + description: "Qwen3.5-397B-A17B-FP8 TP4" + type: boolean + default: false + qwen3-5-397b-a17b-fp8-tp8: + description: "Qwen3.5-397B-A17B-FP8 TP8" + type: boolean + default: false sglang_image: description: "Optional SGLang benchmark image override. Leave empty to use sglang-latest on main or rebuild from the selected non-main branch." type: string @@ -217,6 +225,8 @@ jobs: ENABLE_DEEPSEEK_R1_FP4_TP8: ${{ inputs.deepseek-r1-fp4-tp8 }} ENABLE_DEEPSEEK_R1_FP4_TP4: ${{ inputs.deepseek-r1-fp4-tp4 }} ENABLE_DEEPSEEK_R1_FP4_TP8_EP8: ${{ inputs.deepseek-r1-fp4-tp8-ep8 }} + ENABLE_QWEN3_5_397B_A17B_FP8_TP4: ${{ inputs.qwen3-5-397b-a17b-fp8-tp4 }} + ENABLE_QWEN3_5_397B_A17B_FP8_TP8: ${{ inputs.qwen3-5-397b-a17b-fp8-tp8 }} run: | MODELS_JSON="$(jq -c ' map(select( @@ -225,6 +235,8 @@ jobs: or (.prefix == "deepseek-r1-fp4-tp8" and env.ENABLE_DEEPSEEK_R1_FP4_TP8 == "true") or (.prefix == "deepseek-r1-fp4-tp4" and env.ENABLE_DEEPSEEK_R1_FP4_TP4 == "true") or (.prefix == "deepseek-r1-fp4-tp8-ep8" and env.ENABLE_DEEPSEEK_R1_FP4_TP8_EP8 == "true") + or (.prefix == "qwen3-5-397b-a17b-fp8-tp4" and env.ENABLE_QWEN3_5_397B_A17B_FP8_TP4 == "true") + or (.prefix == "qwen3-5-397b-a17b-fp8-tp8" and env.ENABLE_QWEN3_5_397B_A17B_FP8_TP8 == "true") )) ' .github/benchmark/sglang_benchmark_models.json)" echo "models_json=${MODELS_JSON}" >> "$GITHUB_OUTPUT" @@ -475,6 +487,8 @@ jobs: deepseek-r1-fp4-tp8) echo "enabled=${{ inputs.deepseek-r1-fp4-tp8 }}" >> "$GITHUB_OUTPUT" ;; deepseek-r1-fp4-tp4) echo "enabled=${{ inputs.deepseek-r1-fp4-tp4 }}" >> "$GITHUB_OUTPUT" ;; deepseek-r1-fp4-tp8-ep8) echo "enabled=${{ inputs.deepseek-r1-fp4-tp8-ep8 }}" >> "$GITHUB_OUTPUT" ;; + qwen3-5-397b-a17b-fp8-tp4) echo "enabled=${{ inputs.qwen3-5-397b-a17b-fp8-tp4 }}" >> "$GITHUB_OUTPUT" ;; + qwen3-5-397b-a17b-fp8-tp8) echo "enabled=${{ inputs.qwen3-5-397b-a17b-fp8-tp8 }}" >> "$GITHUB_OUTPUT" ;; *) echo "enabled=true" >> "$GITHUB_OUTPUT" ;; esac From 6afd51f66f6e8487bb1d33a69e85977a1837bea3 Mon Sep 17 00:00:00 2001 From: wanzhenchn Date: Tue, 12 May 2026 07:26:11 +0000 Subject: [PATCH 3/5] [doc] fix qwen3.5 recipe for atom_sglang --- recipes/atom_sglang/Qwen3_5.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/recipes/atom_sglang/Qwen3_5.md b/recipes/atom_sglang/Qwen3_5.md index 6baf61781..a63e2c8dc 100644 --- a/recipes/atom_sglang/Qwen3_5.md +++ b/recipes/atom_sglang/Qwen3_5.md @@ -55,15 +55,15 @@ RESULT_FILENAME=${model}-tp${tp}-${ISL}-${OSL}-${CONC}-${RANDOM_RANGE_RATIO}.jso python3 -m sglang.bench_serving --backend sglang-oai-chat \ --model ${model_path} \ --base-url=http://127.0.0.1:30000 \ - --max-concurrency 16 \ - --num-prompts "$(( CONC * 5 ))" \ + --max-concurrency 16 \ + --num-prompts "$(( CONC * 5 ))" \ --request-rate inf \ --dataset-name random \ --random-input-len ${ISL} \ --random-output-len ${OSL} \ --random-range-ratio ${RANDOM_RANGE_RATIO} \ --warmup-requests $(( CONC * 2 )) \ - --disable-ignore-eos \ + --disable-ignore-eos \ --output-file ${RESULT_FILENAME} \ --trust-remote-code ``` @@ -86,10 +86,9 @@ Then append `--profile` to the `sglang.bench_serving` command in Step 3. ```bash lm_eval --model local-completions \ - --model_args model=${model_path},base_url=http://localhost:30000/v1/completions,num_concurrent=256,max_retries=2,tokenized_requests=False,trust_remote_code=True \ + --model_args model=${model_path},base_url=http://localhost:30000/v1/completions,num_concurrent=65,max_retries=1,tokenized_requests=False,trust_remote_code=True \ --tasks gsm8k \ - --batch_size auto \ - --num_fewshot 5 \ + --num_fewshot 3 \ --trust_remote_code ``` From 5854747110ba327b166716f45e2f692db30eddcd Mon Sep 17 00:00:00 2001 From: zhuyuhua-v Date: Mon, 11 May 2026 07:08:15 +0000 Subject: [PATCH 4/5] update aiter whl download flow Signed-off-by: zhuyuhua-v --- .github/workflows/atom-sglang-test.yaml | 145 +++++++++++++++++++----- 1 file changed, 115 insertions(+), 30 deletions(-) diff --git a/.github/workflows/atom-sglang-test.yaml b/.github/workflows/atom-sglang-test.yaml index 1ac256522..2ebf08d27 100644 --- a/.github/workflows/atom-sglang-test.yaml +++ b/.github/workflows/atom-sglang-test.yaml @@ -50,55 +50,140 @@ jobs: aiter_artifact_id: ${{ steps.download.outputs.aiter_artifact_id }} aiter_wheel_name: ${{ steps.download.outputs.aiter_wheel_name }} steps: - - name: Find and download latest aiter wheel + - name: Prefer latest main aiter wheel manifest and fallback to artifact id: download run: | set -euo pipefail - echo "=== Finding latest aiter-whl-main artifact from ROCm/aiter ===" + echo "=== Trying latest main aiter wheel manifest from S3 first ===" + S3_MAIN_MANIFEST_URL="https://rocm.frameworks-nightlies.amd.com/whl-staging/gfx942-gfx950/main/latest.json" API_URL="https://api.github.com" AUTH_HEADER="Authorization: token ${{ secrets.GITHUB_TOKEN }}" AITER_TEST_WORKFLOW_ID=179476100 - RUNS=$(curl -s -H "$AUTH_HEADER" \ - "$API_URL/repos/ROCm/aiter/actions/workflows/$AITER_TEST_WORKFLOW_ID/runs?per_page=100&branch=main&event=push") - ARTIFACT_ID="" ARTIFACT_NAME="" - for RUN_ID in $(echo "$RUNS" | jq -r '.workflow_runs[].id'); do - ARTIFACT_JSON=$(curl -s -H "$AUTH_HEADER" \ - "$API_URL/repos/ROCm/aiter/actions/runs/$RUN_ID/artifacts" \ - | jq '[.artifacts[] | select(.name | startswith("aiter-whl-main")) | select(.expired == false)] | first') - - if [ "$ARTIFACT_JSON" != "null" ] && [ -n "$ARTIFACT_JSON" ]; then - ARTIFACT_ID=$(echo "$ARTIFACT_JSON" | jq -r '.id') - ARTIFACT_NAME=$(echo "$ARTIFACT_JSON" | jq -r '.name') - echo "Found artifact in run $RUN_ID: $ARTIFACT_NAME (ID: $ARTIFACT_ID)" - break + ARTIFACT_RUN_ID="" + ARTIFACT_RUN_SHA="" + ARTIFACT_RUN_CREATED_AT="" + + resolve_download_url() { + python3 -c 'import sys + from urllib.parse import quote, unquote, urlsplit, urlunsplit + parts = urlsplit(sys.argv[1]) + encoded_path = "/".join(quote(unquote(segment), safe="") for segment in parts.path.split("/")) + print(urlunsplit((parts.scheme, parts.netloc, encoded_path, parts.query, parts.fragment)))' "$1" + } + + find_latest_artifact() { + local runs_json artifact_json run_id + + if [ -n "$ARTIFACT_ID" ] && [ "$ARTIFACT_ID" != "null" ]; then + return 0 fi - done - if [ -z "$ARTIFACT_ID" ] || [ "$ARTIFACT_ID" = "null" ]; then - echo "ERROR: No aiter-whl-main artifact found in recent Aiter Test runs" - exit 1 - fi + echo "=== Finding latest aiter-whl-* artifact from ROCm/aiter ===" + runs_json=$(curl -fsSL -H "$AUTH_HEADER" \ + "$API_URL/repos/ROCm/aiter/actions/workflows/$AITER_TEST_WORKFLOW_ID/runs?per_page=100&branch=main&event=push") + + for run_id in $(echo "$runs_json" | jq -r '.workflow_runs[].id'); do + artifact_json=$(curl -fsSL -H "$AUTH_HEADER" \ + "$API_URL/repos/ROCm/aiter/actions/runs/$run_id/artifacts" \ + | jq '[.artifacts[] | select(.name | startswith("aiter-whl-")) | select(.expired == false)] | sort_by(.created_at) | last') + + if [ "$artifact_json" != "null" ] && [ -n "$artifact_json" ]; then + ARTIFACT_ID=$(echo "$artifact_json" | jq -r '.id') + ARTIFACT_NAME=$(echo "$artifact_json" | jq -r '.name') + ARTIFACT_RUN_ID="$run_id" + ARTIFACT_RUN_SHA=$(echo "$runs_json" | jq -r --arg run_id "$run_id" '.workflow_runs[] | select((.id | tostring) == $run_id) | .head_sha') + ARTIFACT_RUN_CREATED_AT=$(echo "$runs_json" | jq -r --arg run_id "$run_id" '.workflow_runs[] | select((.id | tostring) == $run_id) | .created_at') + echo "Found artifact in run $ARTIFACT_RUN_ID: $ARTIFACT_NAME (ID: $ARTIFACT_ID, SHA: $ARTIFACT_RUN_SHA)" + return 0 + fi + done + + return 1 + } + + download_from_s3_manifest() { + local manifest_file manifest_fetch_url manifest_branch manifest_timestamp manifest_commit wheel_name wheel_url resolved_wheel_url + + mkdir -p aiter-whl + rm -f aiter-whl/amd_aiter*.whl + + manifest_file=$(mktemp) + trap 'rm -f "$manifest_file"' RETURN + manifest_fetch_url="${S3_MAIN_MANIFEST_URL}?ts=$(date +%s)" + curl -fsSL -H "Cache-Control: no-cache" "$manifest_fetch_url" -o "$manifest_file" || return 1 + + manifest_branch=$(jq -r '.branch // empty' "$manifest_file") + manifest_timestamp=$(jq -r '.timestamp // empty' "$manifest_file") + manifest_commit=$(jq -r '.commit // empty' "$manifest_file") + wheel_name=$(jq -r '.wheel_name // empty' "$manifest_file") + wheel_url=$(jq -r '.wheel_url // empty' "$manifest_file") + + if [ "$manifest_branch" != "main" ] || [ -z "$manifest_timestamp" ] || [ -z "$manifest_commit" ] || [ -z "$wheel_name" ] || [ -z "$wheel_url" ]; then + echo "Invalid latest main wheel manifest" + return 1 + fi - echo "=== Downloading artifact ===" - mkdir -p aiter-whl - curl -s -L -H "$AUTH_HEADER" \ - "$API_URL/repos/ROCm/aiter/actions/artifacts/$ARTIFACT_ID/zip" \ - -o aiter-whl.zip - unzip -o aiter-whl.zip -d aiter-whl - rm -f aiter-whl.zip + if find_latest_artifact; then + if [ -n "$ARTIFACT_RUN_SHA" ] && [ "$manifest_commit" != "$ARTIFACT_RUN_SHA" ]; then + if [ -n "$ARTIFACT_RUN_CREATED_AT" ] && [[ "$manifest_timestamp" < "$ARTIFACT_RUN_CREATED_AT" ]]; then + echo "Manifest commit $manifest_commit is older than latest artifact run $ARTIFACT_RUN_ID ($ARTIFACT_RUN_SHA); treating manifest as stale" + return 1 + fi + echo "Manifest commit $manifest_commit differs from latest artifact run $ARTIFACT_RUN_ID ($ARTIFACT_RUN_SHA), but manifest timestamp is not older" + fi + else + echo "No GitHub fallback artifact found while checking manifest freshness" + fi + + resolved_wheel_url=$(resolve_download_url "$wheel_url") + + echo "Selected latest main wheel manifest: $S3_MAIN_MANIFEST_URL" + echo "Manifest timestamp: $manifest_timestamp" + echo "Manifest commit: $manifest_commit" + echo "Manifest wheel: $wheel_name" + echo "Downloading manifest-selected wheel: $resolved_wheel_url" + curl -fsSL "$resolved_wheel_url" -o "aiter-whl/$wheel_name" || return 1 + echo "Downloaded wheel from manifest: aiter-whl/$wheel_name" + + rm -f "$manifest_file" + trap - RETURN + } + + download_from_artifact() { + echo "=== Falling back to latest aiter-whl-* artifact from ROCm/aiter ===" + find_latest_artifact || { + echo "ERROR: No aiter-whl-* artifact found in recent Aiter Test runs" + return 1 + } + + mkdir -p aiter-whl + rm -f aiter-whl/amd_aiter*.whl + curl -fsSL -H "$AUTH_HEADER" \ + "$API_URL/repos/ROCm/aiter/actions/artifacts/$ARTIFACT_ID/zip" \ + -o aiter-whl.zip + unzip -o aiter-whl.zip -d aiter-whl + rm -f aiter-whl.zip + } + + if download_from_s3_manifest; then + echo "Using wheel from S3 main manifest" + else + echo "Main wheel manifest download failed, falling back to GitHub artifact" + download_from_artifact + fi AITER_WHL=$(ls -t aiter-whl/amd_aiter*.whl 2>/dev/null | head -1) if [ -z "$AITER_WHL" ]; then - echo "ERROR: No amd_aiter wheel found in artifact" - ls -la aiter-whl/ + echo "ERROR: No amd_aiter wheel available after S3/artifact attempts" + ls -la aiter-whl/ || true exit 1 fi - echo "Downloaded wheel: $AITER_WHL" + echo "Selected wheel: $AITER_WHL" echo "aiter_artifact_id=${ARTIFACT_ID}" >> "$GITHUB_OUTPUT" echo "aiter_wheel_name=$(basename "$AITER_WHL")" >> "$GITHUB_OUTPUT" From d39fd507fb20f31b5f6785cdfe240a077a6b2bec Mon Sep 17 00:00:00 2001 From: zhuyuhua-v Date: Wed, 13 May 2026 05:13:19 -0500 Subject: [PATCH 5/5] update auto benchmark and add more cases Signed-off-by: zhuyuhua-v --- .../benchmark/sglang_benchmark_models.json | 37 +++++--- .github/benchmark/sglang_models_accuracy.json | 41 +++++--- .github/scripts/atom_sglang_test.sh | 25 +++-- .../atom-sglang-accuracy-validation.yaml | 57 ++++++----- .github/workflows/atom-sglang-benchmark.yaml | 94 ++++++++++++++----- .github/workflows/atom-sglang-test.yaml | 21 ++--- 6 files changed, 179 insertions(+), 96 deletions(-) diff --git a/.github/benchmark/sglang_benchmark_models.json b/.github/benchmark/sglang_benchmark_models.json index 0d0fe789e..d4ef2890d 100644 --- a/.github/benchmark/sglang_benchmark_models.json +++ b/.github/benchmark/sglang_benchmark_models.json @@ -7,6 +7,7 @@ "extra_args": "--trust-remote-code --tensor-parallel-size 8", "bench_args": "", "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" }, { @@ -18,6 +19,7 @@ "extra_args": "--trust-remote-code --tensor-parallel-size 4", "bench_args": "", "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" }, { @@ -28,6 +30,7 @@ "extra_args": "--trust-remote-code --tensor-parallel-size 8", "bench_args": "", "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" }, { @@ -39,18 +42,20 @@ "extra_args": "--trust-remote-code --tensor-parallel-size 4", "bench_args": "", "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" }, { - "display": "DeepSeek-R1-0528-MXFP4 FP4 TP8 EP8", - "dashboard_model": "DeepSeek-R1-0528-MXFP4-tp8-ep8", - "source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "prefix": "deepseek-r1-fp4-tp8-ep8", - "extra_args": "--trust-remote-code --tensor-parallel-size 8 --expert-parallel-size 8", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" + "display": "DeepSeek-R1-0528-MXFP4 FP4 TP8 EP8", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-tp8-ep8", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", + "path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", + "prefix": "deepseek-r1-fp4-tp8-ep8", + "extra_args": "--trust-remote-code --tensor-parallel-size 8 --expert-parallel-size 8", + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" }, { "display": "Qwen3.5-397B-A17B-FP8 TP4", @@ -58,10 +63,11 @@ "source_path": "Qwen/Qwen3.5-397B-A17B-FP8", "path": "Qwen/Qwen3.5-397B-A17B-FP8", "prefix": "qwen3-5-397b-a17b-fp8-tp4", - "extra_args": "--trust-remote-code --tensor-parallel-size 4", + "extra_args": "--tensor-parallel-size 4 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", "bench_args": "", - "runner": "atom-mi355-8gpu-oot-benchmark", - "env_vars": "" + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "B", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0" }, { "display": "Qwen3.5-397B-A17B-FP8 TP8", @@ -69,9 +75,10 @@ "source_path": "Qwen/Qwen3.5-397B-A17B-FP8", "path": "Qwen/Qwen3.5-397B-A17B-FP8", "prefix": "qwen3-5-397b-a17b-fp8-tp8", - "extra_args": "--trust-remote-code --tensor-parallel-size 8", + "extra_args": "--tensor-parallel-size 8 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", "bench_args": "", - "runner": "atom-mi355-8gpu-oot-benchmark", - "env_vars": "" + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "B", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0" } ] diff --git a/.github/benchmark/sglang_models_accuracy.json b/.github/benchmark/sglang_models_accuracy.json index fecd1b6f8..e2e8548b7 100644 --- a/.github/benchmark/sglang_models_accuracy.json +++ b/.github/benchmark/sglang_models_accuracy.json @@ -14,8 +14,8 @@ { "model_name": "Qwen3.5-35B-A3B-FP8 TP2", "model_path": "Qwen/Qwen3.5-35B-A3B-FP8", - "extraArgs": "--tensor-parallel-size 2", - "env_vars": "", + "extraArgs": "--tensor-parallel-size 2 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0", "runner": "linux-atom-mi35x-4", "test_level": "nightly", "accuracy_threshold": 0.76, @@ -24,27 +24,40 @@ "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." }, { - "model_name": "Qwen3.5-27B-FP8 TP2", - "model_path": "Qwen/Qwen3.5-27B-FP8", - "extraArgs": "--tensor-parallel-size 2", - "env_vars": "", + "model_name": "Qwen3.5-35B-A3B TP2", + "model_path": "Qwen/Qwen3.5-35B-A3B", + "extraArgs": "--tensor-parallel-size 2 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0", "runner": "linux-atom-mi35x-4", "test_level": "nightly", - "accuracy_threshold": 0.88, + "accuracy_threshold": 0.83, "accuracy_baseline": null, - "accuracy_baseline_model": "Qwen/Qwen3.5-27B-FP8", + "accuracy_baseline_model": "Qwen/Qwen3.5-35B-A3B", "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." }, { - "model_name": "Qwen3.5-35B-A3B TP2", - "model_path": "Qwen/Qwen3.5-35B-A3B", - "extraArgs": "--tensor-parallel-size 2", - "env_vars": "", + "model_name": "Qwen3.5-397B-A17B-FP8 TP4", + "model_path": "Qwen/Qwen3.5-397B-A17B-FP8", + "extraArgs": "--tensor-parallel-size 4 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0", "runner": "linux-atom-mi35x-4", "test_level": "nightly", - "accuracy_threshold": 0.95, + "accuracy_threshold": 0.83, "accuracy_baseline": null, - "accuracy_baseline_model": "Qwen/Qwen3.5-35B-A3B", + "accuracy_baseline_model": "Qwen/Qwen3.5-397B-A17B-FP8", + "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." + }, + { + "model_name": "Qwen3.5-397B-A17B-FP8 TP8", + "model_path": "Qwen/Qwen3.5-397B-A17B-FP8", + "extraArgs": "--tensor-parallel-size 8 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0", + "runner": "linux-atom-mi35x-8", + "test_level": "nightly", + "accuracy_threshold": 0.83, + "accuracy_baseline": null, + "accuracy_baseline_model": "Qwen/Qwen3.5-397B-A17B-FP8", + "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." }, { "model_name": "DeepSeek-R1-FP8 TP8", diff --git a/.github/scripts/atom_sglang_test.sh b/.github/scripts/atom_sglang_test.sh index 1940e4418..1f1cbeb33 100644 --- a/.github/scripts/atom_sglang_test.sh +++ b/.github/scripts/atom_sglang_test.sh @@ -12,6 +12,7 @@ set -euo pipefail # Optional environment variables: # SGLANG_EXTRA_ARGS # SGLANG_ENV_VARS +# SGLANG_DEFAULT_SERVER_ARGS # SGLANG_PORT # SGLANG_HOST # MAX_WAIT_RETRIES @@ -146,11 +147,6 @@ launch_server() { local resolved_model_path resolved_model_path=$(resolve_model_path "${MODEL_PATH}") - local -a extra_arg_array=() - if [[ -n "${MODEL_EXTRA_ARGS}" ]]; then - read -r -a extra_arg_array <<< "${MODEL_EXTRA_ARGS}" - fi - prepare_runtime_paths export AITER_QUICK_REDUCE_QUANTIZATION="${AITER_QUICK_REDUCE_QUANTIZATION:-INT4}" @@ -168,6 +164,19 @@ launch_server() { done <<< "$(printf '%b' "${MODEL_ENV_VARS}")" fi + local default_server_args + default_server_args=${SGLANG_DEFAULT_SERVER_ARGS---trust-remote-code --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.8 --page-size 1 --disable-radix-cache} + + local -a default_arg_array=() + if [[ -n "${default_server_args}" ]]; then + read -r -a default_arg_array <<< "${default_server_args}" + fi + + local -a extra_arg_array=() + if [[ -n "${MODEL_EXTRA_ARGS}" ]]; then + read -r -a extra_arg_array <<< "${MODEL_EXTRA_ARGS}" + fi + rm -rf /root/.cache rm -f "${SGLANG_PID_FILE}" "${SGLANG_LOG_FILE}" || true @@ -182,11 +191,7 @@ launch_server() { --model-path "${resolved_model_path}" \ --host "${SGLANG_HOST}" \ --port "${SGLANG_PORT}" \ - --trust-remote-code \ - --kv-cache-dtype fp8_e4m3 \ - --mem-fraction-static 0.8 \ - --page-size 1 \ - --disable-radix-cache \ + "${default_arg_array[@]}" \ "${extra_arg_array[@]}" \ > "${SGLANG_LOG_FILE}" 2>&1 & diff --git a/.github/workflows/atom-sglang-accuracy-validation.yaml b/.github/workflows/atom-sglang-accuracy-validation.yaml index 59b50d586..d51a3cff1 100644 --- a/.github/workflows/atom-sglang-accuracy-validation.yaml +++ b/.github/workflows/atom-sglang-accuracy-validation.yaml @@ -19,13 +19,18 @@ on: required: false type: boolean default: false - run_qwen35_27b_fp8_tp2: - description: "Qwen3.5-27B-FP8 TP2" + run_qwen35_35b_a3b_tp2: + description: "Qwen3.5-35B-A3B TP2" required: false type: boolean default: false - run_qwen35_35b_a3b_tp2: - description: "Qwen3.5-35B-A3B TP2" + run_qwen35_397b_a17b_fp8_tp4: + description: "Qwen3.5-397B-A17B-FP8 TP4" + required: false + type: boolean + default: false + run_qwen35_397b_a17b_fp8_tp8: + description: "Qwen3.5-397B-A17B-FP8 TP8" required: false type: boolean default: false @@ -86,8 +91,9 @@ jobs: env: RUN_DSR1_FP8_TP4: ${{ inputs.run_dsr1_fp8_tp4 }} RUN_QWEN35_35B_A3B_FP8_TP2: ${{ inputs.run_qwen35_35b_a3b_fp8_tp2 }} - RUN_QWEN35_27B_FP8_TP2: ${{ inputs.run_qwen35_27b_fp8_tp2 }} RUN_QWEN35_35B_A3B_TP2: ${{ inputs.run_qwen35_35b_a3b_tp2 }} + RUN_QWEN35_397B_A17B_FP8_TP4: ${{ inputs.run_qwen35_397b_a17b_fp8_tp4 }} + RUN_QWEN35_397B_A17B_FP8_TP8: ${{ inputs.run_qwen35_397b_a17b_fp8_tp8 }} RUN_DSR1_FP8_TP8: ${{ inputs.run_dsr1_fp8_tp8 }} RUN_DSR1_FP4_TP4: ${{ inputs.run_dsr1_fp4_tp4 }} RUN_DSR1_FP4_TP8: ${{ inputs.run_dsr1_fp4_tp8 }} @@ -116,29 +122,38 @@ jobs: "toggle_env": "RUN_QWEN35_35B_A3B_FP8_TP2", "model_name": "Qwen3.5-35B-A3B-FP8 TP2", "model_path": "Qwen/Qwen3.5-35B-A3B-FP8", - "extra_args": "--tensor-parallel-size 2", - "accuracy_test_threshold": 0.89, - "env_vars": "", - "runner": "linux-atom-mi35x-4", - }, - { - "toggle_env": "RUN_QWEN35_27B_FP8_TP2", - "model_name": "Qwen3.5-27B-FP8 TP2", - "model_path": "Qwen/Qwen3.5-27B-FP8", - "extra_args": "--tensor-parallel-size 2", - "accuracy_test_threshold": 0.88, - "env_vars": "", + "extra_args": "--tensor-parallel-size 2 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "accuracy_test_threshold": 0.76, + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0", "runner": "linux-atom-mi35x-4", }, { "toggle_env": "RUN_QWEN35_35B_A3B_TP2", "model_name": "Qwen3.5-35B-A3B TP2", "model_path": "Qwen/Qwen3.5-35B-A3B", - "extra_args": "--tensor-parallel-size 2", - "accuracy_test_threshold": 0.95, - "env_vars": "", + "extra_args": "--tensor-parallel-size 2 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "accuracy_test_threshold": 0.83, + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0", + "runner": "linux-atom-mi35x-4", + }, + { + "toggle_env": "RUN_QWEN35_397B_A17B_FP8_TP4", + "model_name": "Qwen3.5-397B-A17B-FP8 TP4", + "model_path": "Qwen/Qwen3.5-397B-A17B-FP8", + "extra_args": "--tensor-parallel-size 4 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "accuracy_test_threshold": 0.83, + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0", "runner": "linux-atom-mi35x-4", - }, + }, + { + "toggle_env": "RUN_QWEN35_397B_A17B_FP8_TP8", + "model_name": "Qwen3.5-397B-A17B-FP8 TP8", + "model_path": "Qwen/Qwen3.5-397B-A17B-FP8", + "extra_args": "--tensor-parallel-size 8 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "accuracy_test_threshold": 0.83, + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0", + "runner": "linux-atom-mi35x-8", + }, { "toggle_env": "RUN_DSR1_FP8_TP8", "model_name": "DeepSeek-R1-FP8 TP8", diff --git a/.github/workflows/atom-sglang-benchmark.yaml b/.github/workflows/atom-sglang-benchmark.yaml index 9aa4ae451..84680999b 100644 --- a/.github/workflows/atom-sglang-benchmark.yaml +++ b/.github/workflows/atom-sglang-benchmark.yaml @@ -6,8 +6,8 @@ concurrency: on: schedule: - # Nightly at 01:00 Beijing time (17:00 UTC on the previous day) - - cron: '0 17 * * *' + # Weekday nightly at 23:00 Beijing time (15:00 UTC) + - cron: '0 15 * * 1-5' workflow_dispatch: inputs: deepseek-r1-fp8-tp8: @@ -282,6 +282,7 @@ jobs: outputs: models_json: ${{ steps.load.outputs.models_json }} has_enabled_models: ${{ steps.load.outputs.has_enabled_models }} + selected_group: ${{ steps.load.outputs.selected_group }} steps: - uses: actions/checkout@v6 - id: load @@ -294,26 +295,77 @@ jobs: ENABLE_QWEN3_5_397B_A17B_FP8_TP4: ${{ inputs.qwen3-5-397b-a17b-fp8-tp4 }} ENABLE_QWEN3_5_397B_A17B_FP8_TP8: ${{ inputs.qwen3-5-397b-a17b-fp8-tp8 }} run: | - MODELS_JSON="$(jq -c ' - map(select( - env.GITHUB_EVENT_NAME == "schedule" - or (.prefix == "deepseek-r1-fp8-tp8" and env.ENABLE_DEEPSEEK_R1_FP8_TP8 == "true") - or (.prefix == "deepseek-r1-fp8-tp4" and env.ENABLE_DEEPSEEK_R1_FP8_TP4 == "true") - or (.prefix == "deepseek-r1-fp4-tp8" and env.ENABLE_DEEPSEEK_R1_FP4_TP8 == "true") - or (.prefix == "deepseek-r1-fp4-tp4" and env.ENABLE_DEEPSEEK_R1_FP4_TP4 == "true") - or (.prefix == "deepseek-r1-fp4-tp8-ep8" and env.ENABLE_DEEPSEEK_R1_FP4_TP8_EP8 == "true") - or (.prefix == "qwen3-5-397b-a17b-fp8-tp4" and env.ENABLE_QWEN3_5_397B_A17B_FP8_TP4 == "true") - or (.prefix == "qwen3-5-397b-a17b-fp8-tp8" and env.ENABLE_QWEN3_5_397B_A17B_FP8_TP8 == "true") - )) - ' .github/benchmark/sglang_benchmark_models.json)" - echo "models_json=${MODELS_JSON}" >> "$GITHUB_OUTPUT" - if [ "${MODELS_JSON}" = "[]" ]; then - echo "has_enabled_models=false" >> "$GITHUB_OUTPUT" - echo "No models selected for SGLang benchmark." - else - echo "has_enabled_models=true" >> "$GITHUB_OUTPUT" - echo "Selected models: ${MODELS_JSON}" + set -euo pipefail + + export BEIJING_WEEKDAY="$(TZ=Asia/Shanghai date +%u)" + python3 - <<'PY' >> "$GITHUB_OUTPUT" + import json + import os + import sys + from pathlib import Path + + models = json.loads( + Path(".github/benchmark/sglang_benchmark_models.json").read_text( + encoding="utf-8" + ) + ) + + event = os.environ["GITHUB_EVENT_NAME"] + selected_group = "" + + if event == "schedule": + weekday = int(os.environ["BEIJING_WEEKDAY"]) + if weekday in (1, 3): + selected_group = "A-DEEPSEEK" + selected = [m for m in models if m.get("nightly_group", "A") == "A"] + elif weekday in (2, 4): + selected_group = "B-QWEN35" + selected = [m for m in models if m.get("nightly_group") == "B"] + elif weekday == 5: + selected_group = "C-ALL" + selected = list(models) + else: + selected_group = "SKIP-WEEKEND" + selected = [] + else: + enabled_by_prefix = { + "deepseek-r1-fp8-tp8": os.environ.get("ENABLE_DEEPSEEK_R1_FP8_TP8", ""), + "deepseek-r1-fp8-tp4": os.environ.get("ENABLE_DEEPSEEK_R1_FP8_TP4", ""), + "deepseek-r1-fp4-tp8": os.environ.get("ENABLE_DEEPSEEK_R1_FP4_TP8", ""), + "deepseek-r1-fp4-tp4": os.environ.get("ENABLE_DEEPSEEK_R1_FP4_TP4", ""), + "deepseek-r1-fp4-tp8-ep8": os.environ.get("ENABLE_DEEPSEEK_R1_FP4_TP8_EP8", ""), + "qwen3-5-397b-a17b-fp8-tp4": os.environ.get("ENABLE_QWEN3_5_397B_A17B_FP8_TP4", ""), + "qwen3-5-397b-a17b-fp8-tp8": os.environ.get("ENABLE_QWEN3_5_397B_A17B_FP8_TP8", ""), + } + selected = [ + model + for model in models + if enabled_by_prefix.get(str(model.get("prefix")), "").lower() == "true" + ] + + if selected_group: + print(f"Scheduled SGLang benchmark group: {selected_group}", file=sys.stderr) + if not selected: + print("No models selected for SGLang benchmark.", file=sys.stderr) + else: + print("Selected SGLang benchmark models:", file=sys.stderr) + for model in selected: + print(f" - {model['display']} ({model['prefix']})", file=sys.stderr) + + print(f"models_json={json.dumps(selected, separators=(',', ':'))}") + print(f"selected_group={selected_group}") + print(f"has_enabled_models={'true' if selected else 'false'}") + PY + + - name: Print selected models + env: + SELECTED_GROUP: ${{ steps.load.outputs.selected_group }} + MODELS_JSON: ${{ steps.load.outputs.models_json }} + run: | + if [[ -n "${SELECTED_GROUP}" ]]; then + echo "Scheduled SGLang benchmark group: ${SELECTED_GROUP}" fi + printf 'Selected models: %s\n' "${MODELS_JSON}" build-benchmark-matrix: name: Build SGLang benchmark matrix diff --git a/.github/workflows/atom-sglang-test.yaml b/.github/workflows/atom-sglang-test.yaml index 764d3c9eb..2a5d8572f 100644 --- a/.github/workflows/atom-sglang-test.yaml +++ b/.github/workflows/atom-sglang-test.yaml @@ -222,21 +222,12 @@ jobs: runner: linux-atom-mi35x-4 - model_name: "Qwen3.5-35B-A3B-FP8 TP2" model_path: "Qwen/Qwen3.5-35B-A3B-FP8" - extra_args: "--tensor-parallel-size 2" - env_vars: "" - accuracy_test_threshold: 0.89 - runner: linux-atom-mi35x-4 - - model_name: "Qwen3.5-27B-FP8 TP2" - model_path: "Qwen/Qwen3.5-27B-FP8" - extra_args: "--tensor-parallel-size 2" - env_vars: "" - accuracy_test_threshold: 0.88 - runner: linux-atom-mi35x-4 - - model_name: "Qwen3.5-35B-A3B TP2" - model_path: "Qwen/Qwen3.5-35B-A3B" - extra_args: "--tensor-parallel-size 2" - env_vars: "" - accuracy_test_threshold: 0.95 + extra_args: "--tensor-parallel-size 2 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache" + env_vars: | + SGLANG_DEFAULT_SERVER_ARGS= + SGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models + ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0 + accuracy_test_threshold: 0.76 runner: linux-atom-mi35x-4 runs-on: ${{ matrix.runner }} timeout-minutes: 180