diff --git a/.github/benchmark/sglang_benchmark_models.json b/.github/benchmark/sglang_benchmark_models.json index b06227844..d4ef2890d 100644 --- a/.github/benchmark/sglang_benchmark_models.json +++ b/.github/benchmark/sglang_benchmark_models.json @@ -7,6 +7,7 @@ "extra_args": "--trust-remote-code --tensor-parallel-size 8", "bench_args": "", "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" }, { @@ -18,6 +19,7 @@ "extra_args": "--trust-remote-code --tensor-parallel-size 4", "bench_args": "", "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" }, { @@ -28,6 +30,7 @@ "extra_args": "--trust-remote-code --tensor-parallel-size 8", "bench_args": "", "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" }, { @@ -39,17 +42,43 @@ "extra_args": "--trust-remote-code --tensor-parallel-size 4", "bench_args": "", "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" }, { - "display": "DeepSeek-R1-0528-MXFP4 FP4 TP8 EP8", - "dashboard_model": "DeepSeek-R1-0528-MXFP4-tp8-ep8", - "source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "prefix": "deepseek-r1-fp4-tp8-ep8", - "extra_args": "--trust-remote-code --tensor-parallel-size 8 --expert-parallel-size 8", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" + "display": "DeepSeek-R1-0528-MXFP4 FP4 TP8 EP8", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-tp8-ep8", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", + "path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", + "prefix": "deepseek-r1-fp4-tp8-ep8", + "extra_args": "--trust-remote-code --tensor-parallel-size 8 --expert-parallel-size 8", + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" + }, + { + "display": "Qwen3.5-397B-A17B-FP8 TP4", + "dashboard_model": "Qwen3.5-397B-A17B-FP8-tp4", + "source_path": "Qwen/Qwen3.5-397B-A17B-FP8", + "path": "Qwen/Qwen3.5-397B-A17B-FP8", + "prefix": "qwen3-5-397b-a17b-fp8-tp4", + "extra_args": "--tensor-parallel-size 4 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "B", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0" + }, + { + "display": "Qwen3.5-397B-A17B-FP8 TP8", + "dashboard_model": "Qwen3.5-397B-A17B-FP8", + "source_path": "Qwen/Qwen3.5-397B-A17B-FP8", + "path": "Qwen/Qwen3.5-397B-A17B-FP8", + "prefix": "qwen3-5-397b-a17b-fp8-tp8", + "extra_args": "--tensor-parallel-size 8 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "B", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0" } ] diff --git a/.github/benchmark/sglang_models_accuracy.json b/.github/benchmark/sglang_models_accuracy.json index 1fa97d41f..e2e8548b7 100644 --- a/.github/benchmark/sglang_models_accuracy.json +++ b/.github/benchmark/sglang_models_accuracy.json @@ -11,6 +11,54 @@ "accuracy_baseline_model": "deepseek-ai/DeepSeek-R1-0528", "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." }, + { + "model_name": "Qwen3.5-35B-A3B-FP8 TP2", + "model_path": "Qwen/Qwen3.5-35B-A3B-FP8", + "extraArgs": "--tensor-parallel-size 2 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0", + "runner": "linux-atom-mi35x-4", + "test_level": "nightly", + "accuracy_threshold": 0.76, + "accuracy_baseline": null, + "accuracy_baseline_model": "Qwen/Qwen3.5-35B-A3B-FP8", + "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." + }, + { + "model_name": "Qwen3.5-35B-A3B TP2", + "model_path": "Qwen/Qwen3.5-35B-A3B", + "extraArgs": "--tensor-parallel-size 2 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0", + "runner": "linux-atom-mi35x-4", + "test_level": "nightly", + "accuracy_threshold": 0.83, + "accuracy_baseline": null, + "accuracy_baseline_model": "Qwen/Qwen3.5-35B-A3B", + "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." + }, + { + "model_name": "Qwen3.5-397B-A17B-FP8 TP4", + "model_path": "Qwen/Qwen3.5-397B-A17B-FP8", + "extraArgs": "--tensor-parallel-size 4 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0", + "runner": "linux-atom-mi35x-4", + "test_level": "nightly", + "accuracy_threshold": 0.83, + "accuracy_baseline": null, + "accuracy_baseline_model": "Qwen/Qwen3.5-397B-A17B-FP8", + "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." + }, + { + "model_name": "Qwen3.5-397B-A17B-FP8 TP8", + "model_path": "Qwen/Qwen3.5-397B-A17B-FP8", + "extraArgs": "--tensor-parallel-size 8 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0", + "runner": "linux-atom-mi35x-8", + "test_level": "nightly", + "accuracy_threshold": 0.83, + "accuracy_baseline": null, + "accuracy_baseline_model": "Qwen/Qwen3.5-397B-A17B-FP8", + "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." + }, { "model_name": "DeepSeek-R1-FP8 TP8", "model_path": "deepseek-ai/DeepSeek-R1-0528", diff --git a/.github/scripts/atom_sglang_test.sh b/.github/scripts/atom_sglang_test.sh index 1940e4418..1f1cbeb33 100644 --- a/.github/scripts/atom_sglang_test.sh +++ b/.github/scripts/atom_sglang_test.sh @@ -12,6 +12,7 @@ set -euo pipefail # Optional environment variables: # SGLANG_EXTRA_ARGS # SGLANG_ENV_VARS +# SGLANG_DEFAULT_SERVER_ARGS # SGLANG_PORT # SGLANG_HOST # MAX_WAIT_RETRIES @@ -146,11 +147,6 @@ launch_server() { local resolved_model_path resolved_model_path=$(resolve_model_path "${MODEL_PATH}") - local -a extra_arg_array=() - if [[ -n "${MODEL_EXTRA_ARGS}" ]]; then - read -r -a extra_arg_array <<< "${MODEL_EXTRA_ARGS}" - fi - prepare_runtime_paths export AITER_QUICK_REDUCE_QUANTIZATION="${AITER_QUICK_REDUCE_QUANTIZATION:-INT4}" @@ -168,6 +164,19 @@ launch_server() { done <<< "$(printf '%b' "${MODEL_ENV_VARS}")" fi + local default_server_args + default_server_args=${SGLANG_DEFAULT_SERVER_ARGS---trust-remote-code --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.8 --page-size 1 --disable-radix-cache} + + local -a default_arg_array=() + if [[ -n "${default_server_args}" ]]; then + read -r -a default_arg_array <<< "${default_server_args}" + fi + + local -a extra_arg_array=() + if [[ -n "${MODEL_EXTRA_ARGS}" ]]; then + read -r -a extra_arg_array <<< "${MODEL_EXTRA_ARGS}" + fi + rm -rf /root/.cache rm -f "${SGLANG_PID_FILE}" "${SGLANG_LOG_FILE}" || true @@ -182,11 +191,7 @@ launch_server() { --model-path "${resolved_model_path}" \ --host "${SGLANG_HOST}" \ --port "${SGLANG_PORT}" \ - --trust-remote-code \ - --kv-cache-dtype fp8_e4m3 \ - --mem-fraction-static 0.8 \ - --page-size 1 \ - --disable-radix-cache \ + "${default_arg_array[@]}" \ "${extra_arg_array[@]}" \ > "${SGLANG_LOG_FILE}" 2>&1 & diff --git a/.github/workflows/atom-sglang-accuracy-validation.yaml b/.github/workflows/atom-sglang-accuracy-validation.yaml index 2c9eaaa92..d51a3cff1 100644 --- a/.github/workflows/atom-sglang-accuracy-validation.yaml +++ b/.github/workflows/atom-sglang-accuracy-validation.yaml @@ -14,6 +14,26 @@ on: required: false type: boolean default: false + run_qwen35_35b_a3b_fp8_tp2: + description: "Qwen3.5-35B-A3B-FP8 TP2" + required: false + type: boolean + default: false + run_qwen35_35b_a3b_tp2: + description: "Qwen3.5-35B-A3B TP2" + required: false + type: boolean + default: false + run_qwen35_397b_a17b_fp8_tp4: + description: "Qwen3.5-397B-A17B-FP8 TP4" + required: false + type: boolean + default: false + run_qwen35_397b_a17b_fp8_tp8: + description: "Qwen3.5-397B-A17B-FP8 TP8" + required: false + type: boolean + default: false run_dsr1_fp8_tp8: description: "DeepSeek-R1-FP8 TP8" required: false @@ -70,6 +90,10 @@ jobs: id: meta env: RUN_DSR1_FP8_TP4: ${{ inputs.run_dsr1_fp8_tp4 }} + RUN_QWEN35_35B_A3B_FP8_TP2: ${{ inputs.run_qwen35_35b_a3b_fp8_tp2 }} + RUN_QWEN35_35B_A3B_TP2: ${{ inputs.run_qwen35_35b_a3b_tp2 }} + RUN_QWEN35_397B_A17B_FP8_TP4: ${{ inputs.run_qwen35_397b_a17b_fp8_tp4 }} + RUN_QWEN35_397B_A17B_FP8_TP8: ${{ inputs.run_qwen35_397b_a17b_fp8_tp8 }} RUN_DSR1_FP8_TP8: ${{ inputs.run_dsr1_fp8_tp8 }} RUN_DSR1_FP4_TP4: ${{ inputs.run_dsr1_fp4_tp4 }} RUN_DSR1_FP4_TP8: ${{ inputs.run_dsr1_fp4_tp8 }} @@ -94,6 +118,42 @@ jobs: "env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1", "runner": "linux-atom-mi35x-4", }, + { + "toggle_env": "RUN_QWEN35_35B_A3B_FP8_TP2", + "model_name": "Qwen3.5-35B-A3B-FP8 TP2", + "model_path": "Qwen/Qwen3.5-35B-A3B-FP8", + "extra_args": "--tensor-parallel-size 2 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "accuracy_test_threshold": 0.76, + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0", + "runner": "linux-atom-mi35x-4", + }, + { + "toggle_env": "RUN_QWEN35_35B_A3B_TP2", + "model_name": "Qwen3.5-35B-A3B TP2", + "model_path": "Qwen/Qwen3.5-35B-A3B", + "extra_args": "--tensor-parallel-size 2 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "accuracy_test_threshold": 0.83, + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0", + "runner": "linux-atom-mi35x-4", + }, + { + "toggle_env": "RUN_QWEN35_397B_A17B_FP8_TP4", + "model_name": "Qwen3.5-397B-A17B-FP8 TP4", + "model_path": "Qwen/Qwen3.5-397B-A17B-FP8", + "extra_args": "--tensor-parallel-size 4 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "accuracy_test_threshold": 0.83, + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0", + "runner": "linux-atom-mi35x-4", + }, + { + "toggle_env": "RUN_QWEN35_397B_A17B_FP8_TP8", + "model_name": "Qwen3.5-397B-A17B-FP8 TP8", + "model_path": "Qwen/Qwen3.5-397B-A17B-FP8", + "extra_args": "--tensor-parallel-size 8 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "accuracy_test_threshold": 0.83, + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0", + "runner": "linux-atom-mi35x-8", + }, { "toggle_env": "RUN_DSR1_FP8_TP8", "model_name": "DeepSeek-R1-FP8 TP8", diff --git a/.github/workflows/atom-sglang-benchmark.yaml b/.github/workflows/atom-sglang-benchmark.yaml index a925da28a..84680999b 100644 --- a/.github/workflows/atom-sglang-benchmark.yaml +++ b/.github/workflows/atom-sglang-benchmark.yaml @@ -6,8 +6,8 @@ concurrency: on: schedule: - # Nightly at 01:00 Beijing time (17:00 UTC on the previous day) - - cron: '0 17 * * *' + # Weekday nightly at 23:00 Beijing time (15:00 UTC) + - cron: '0 15 * * 1-5' workflow_dispatch: inputs: deepseek-r1-fp8-tp8: @@ -30,6 +30,14 @@ on: description: "DeepSeek-R1-0528-MXFP4 FP4 TP8 EP8" type: boolean default: false + qwen3-5-397b-a17b-fp8-tp4: + description: "Qwen3.5-397B-A17B-FP8 TP4" + type: boolean + default: false + qwen3-5-397b-a17b-fp8-tp8: + description: "Qwen3.5-397B-A17B-FP8 TP8" + type: boolean + default: false sglang_image: description: "Optional SGLang benchmark image override. Leave empty to use sglang-latest on main or rebuild from the selected non-main branch." type: string @@ -274,6 +282,7 @@ jobs: outputs: models_json: ${{ steps.load.outputs.models_json }} has_enabled_models: ${{ steps.load.outputs.has_enabled_models }} + selected_group: ${{ steps.load.outputs.selected_group }} steps: - uses: actions/checkout@v6 - id: load @@ -283,25 +292,80 @@ jobs: ENABLE_DEEPSEEK_R1_FP4_TP8: ${{ inputs.deepseek-r1-fp4-tp8 }} ENABLE_DEEPSEEK_R1_FP4_TP4: ${{ inputs.deepseek-r1-fp4-tp4 }} ENABLE_DEEPSEEK_R1_FP4_TP8_EP8: ${{ inputs.deepseek-r1-fp4-tp8-ep8 }} + ENABLE_QWEN3_5_397B_A17B_FP8_TP4: ${{ inputs.qwen3-5-397b-a17b-fp8-tp4 }} + ENABLE_QWEN3_5_397B_A17B_FP8_TP8: ${{ inputs.qwen3-5-397b-a17b-fp8-tp8 }} run: | - MODELS_JSON="$(jq -c ' - map(select( - env.GITHUB_EVENT_NAME == "schedule" - or (.prefix == "deepseek-r1-fp8-tp8" and env.ENABLE_DEEPSEEK_R1_FP8_TP8 == "true") - or (.prefix == "deepseek-r1-fp8-tp4" and env.ENABLE_DEEPSEEK_R1_FP8_TP4 == "true") - or (.prefix == "deepseek-r1-fp4-tp8" and env.ENABLE_DEEPSEEK_R1_FP4_TP8 == "true") - or (.prefix == "deepseek-r1-fp4-tp4" and env.ENABLE_DEEPSEEK_R1_FP4_TP4 == "true") - or (.prefix == "deepseek-r1-fp4-tp8-ep8" and env.ENABLE_DEEPSEEK_R1_FP4_TP8_EP8 == "true") - )) - ' .github/benchmark/sglang_benchmark_models.json)" - echo "models_json=${MODELS_JSON}" >> "$GITHUB_OUTPUT" - if [ "${MODELS_JSON}" = "[]" ]; then - echo "has_enabled_models=false" >> "$GITHUB_OUTPUT" - echo "No models selected for SGLang benchmark." - else - echo "has_enabled_models=true" >> "$GITHUB_OUTPUT" - echo "Selected models: ${MODELS_JSON}" + set -euo pipefail + + export BEIJING_WEEKDAY="$(TZ=Asia/Shanghai date +%u)" + python3 - <<'PY' >> "$GITHUB_OUTPUT" + import json + import os + import sys + from pathlib import Path + + models = json.loads( + Path(".github/benchmark/sglang_benchmark_models.json").read_text( + encoding="utf-8" + ) + ) + + event = os.environ["GITHUB_EVENT_NAME"] + selected_group = "" + + if event == "schedule": + weekday = int(os.environ["BEIJING_WEEKDAY"]) + if weekday in (1, 3): + selected_group = "A-DEEPSEEK" + selected = [m for m in models if m.get("nightly_group", "A") == "A"] + elif weekday in (2, 4): + selected_group = "B-QWEN35" + selected = [m for m in models if m.get("nightly_group") == "B"] + elif weekday == 5: + selected_group = "C-ALL" + selected = list(models) + else: + selected_group = "SKIP-WEEKEND" + selected = [] + else: + enabled_by_prefix = { + "deepseek-r1-fp8-tp8": os.environ.get("ENABLE_DEEPSEEK_R1_FP8_TP8", ""), + "deepseek-r1-fp8-tp4": os.environ.get("ENABLE_DEEPSEEK_R1_FP8_TP4", ""), + "deepseek-r1-fp4-tp8": os.environ.get("ENABLE_DEEPSEEK_R1_FP4_TP8", ""), + "deepseek-r1-fp4-tp4": os.environ.get("ENABLE_DEEPSEEK_R1_FP4_TP4", ""), + "deepseek-r1-fp4-tp8-ep8": os.environ.get("ENABLE_DEEPSEEK_R1_FP4_TP8_EP8", ""), + "qwen3-5-397b-a17b-fp8-tp4": os.environ.get("ENABLE_QWEN3_5_397B_A17B_FP8_TP4", ""), + "qwen3-5-397b-a17b-fp8-tp8": os.environ.get("ENABLE_QWEN3_5_397B_A17B_FP8_TP8", ""), + } + selected = [ + model + for model in models + if enabled_by_prefix.get(str(model.get("prefix")), "").lower() == "true" + ] + + if selected_group: + print(f"Scheduled SGLang benchmark group: {selected_group}", file=sys.stderr) + if not selected: + print("No models selected for SGLang benchmark.", file=sys.stderr) + else: + print("Selected SGLang benchmark models:", file=sys.stderr) + for model in selected: + print(f" - {model['display']} ({model['prefix']})", file=sys.stderr) + + print(f"models_json={json.dumps(selected, separators=(',', ':'))}") + print(f"selected_group={selected_group}") + print(f"has_enabled_models={'true' if selected else 'false'}") + PY + + - name: Print selected models + env: + SELECTED_GROUP: ${{ steps.load.outputs.selected_group }} + MODELS_JSON: ${{ steps.load.outputs.models_json }} + run: | + if [[ -n "${SELECTED_GROUP}" ]]; then + echo "Scheduled SGLang benchmark group: ${SELECTED_GROUP}" fi + printf 'Selected models: %s\n' "${MODELS_JSON}" build-benchmark-matrix: name: Build SGLang benchmark matrix @@ -547,6 +611,8 @@ jobs: deepseek-r1-fp4-tp8) echo "enabled=${{ inputs.deepseek-r1-fp4-tp8 }}" >> "$GITHUB_OUTPUT" ;; deepseek-r1-fp4-tp4) echo "enabled=${{ inputs.deepseek-r1-fp4-tp4 }}" >> "$GITHUB_OUTPUT" ;; deepseek-r1-fp4-tp8-ep8) echo "enabled=${{ inputs.deepseek-r1-fp4-tp8-ep8 }}" >> "$GITHUB_OUTPUT" ;; + qwen3-5-397b-a17b-fp8-tp4) echo "enabled=${{ inputs.qwen3-5-397b-a17b-fp8-tp4 }}" >> "$GITHUB_OUTPUT" ;; + qwen3-5-397b-a17b-fp8-tp8) echo "enabled=${{ inputs.qwen3-5-397b-a17b-fp8-tp8 }}" >> "$GITHUB_OUTPUT" ;; *) echo "enabled=true" >> "$GITHUB_OUTPUT" ;; esac diff --git a/.github/workflows/atom-sglang-test.yaml b/.github/workflows/atom-sglang-test.yaml index 71e1b9568..2a5d8572f 100644 --- a/.github/workflows/atom-sglang-test.yaml +++ b/.github/workflows/atom-sglang-test.yaml @@ -220,6 +220,15 @@ jobs: ATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1 accuracy_test_threshold: 0.91 runner: linux-atom-mi35x-4 + - model_name: "Qwen3.5-35B-A3B-FP8 TP2" + model_path: "Qwen/Qwen3.5-35B-A3B-FP8" + extra_args: "--tensor-parallel-size 2 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache" + env_vars: | + SGLANG_DEFAULT_SERVER_ARGS= + SGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models + ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0 + accuracy_test_threshold: 0.76 + runner: linux-atom-mi35x-4 runs-on: ${{ matrix.runner }} timeout-minutes: 180 env: diff --git a/recipes/atom_sglang/Qwen3_5.md b/recipes/atom_sglang/Qwen3_5.md index 6baf61781..a63e2c8dc 100644 --- a/recipes/atom_sglang/Qwen3_5.md +++ b/recipes/atom_sglang/Qwen3_5.md @@ -55,15 +55,15 @@ RESULT_FILENAME=${model}-tp${tp}-${ISL}-${OSL}-${CONC}-${RANDOM_RANGE_RATIO}.jso python3 -m sglang.bench_serving --backend sglang-oai-chat \ --model ${model_path} \ --base-url=http://127.0.0.1:30000 \ - --max-concurrency 16 \ - --num-prompts "$(( CONC * 5 ))" \ + --max-concurrency 16 \ + --num-prompts "$(( CONC * 5 ))" \ --request-rate inf \ --dataset-name random \ --random-input-len ${ISL} \ --random-output-len ${OSL} \ --random-range-ratio ${RANDOM_RANGE_RATIO} \ --warmup-requests $(( CONC * 2 )) \ - --disable-ignore-eos \ + --disable-ignore-eos \ --output-file ${RESULT_FILENAME} \ --trust-remote-code ``` @@ -86,10 +86,9 @@ Then append `--profile` to the `sglang.bench_serving` command in Step 3. ```bash lm_eval --model local-completions \ - --model_args model=${model_path},base_url=http://localhost:30000/v1/completions,num_concurrent=256,max_retries=2,tokenized_requests=False,trust_remote_code=True \ + --model_args model=${model_path},base_url=http://localhost:30000/v1/completions,num_concurrent=65,max_retries=1,tokenized_requests=False,trust_remote_code=True \ --tasks gsm8k \ - --batch_size auto \ - --num_fewshot 5 \ + --num_fewshot 3 \ --trust_remote_code ```