diff --git a/.github/workflows/export-models.yml b/.github/workflows/export-models.yml
new file mode 100644
index 00000000000..2b5b22630f6
--- /dev/null
+++ b/.github/workflows/export-models.yml
@@ -0,0 +1,461 @@
+name: export-models
+
+on:
+  schedule:
+    # Run daily at midnight UTC
+    - cron: '0 0 * * *'
+  pull_request:
+    paths:
+      - .github/workflows/export-models.yml
+  # Allow manual trigger from the Actions web UI
+  workflow_dispatch:
+    inputs:
+      models:
+        description: 'Comma-separated list of models to export (e.g., mv3,mv2,meta-llama/Llama-3.2-1B)'
+        required: false
+        type: string
+        default: 'mv3,mv2,ic4,ic3,resnet50,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf,google/gemma-3-1b-it'
+      configs:
+        description: 'Comma-separated list of export configs (leave empty for auto-detection based on the model)'
+        required: false
+        type: string
+      s3_prefix:
+        description: 'S3 prefix path for uploaded models (default: executorch-models)'
+        required: false
+        type: string
+        default: 'executorch-models'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+jobs:
+  set-parameters:
+    runs-on: ubuntu-22.04
+    outputs:
+      models_matrix: ${{ steps.set-parameters.outputs.models_matrix }}
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: 'false'
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      - name: Set parameters
+        id: set-parameters
+        shell: bash
+        run: |
+          set -eux
+
+          # Default models for scheduled runs
+          DEFAULT_MODELS="mv3,mv2,ic4,ic3,resnet50,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf,google/gemma-3-1b-it"
+
+          MODELS="${{ inputs.models }}"
+          if [ -z "$MODELS" ]; then
+            MODELS="$DEFAULT_MODELS"
+          fi
+
+          CONFIGS="${{ inputs.configs }}"
+
+          # Generate the matrix JSON
+          python3 << 'EOF'
+          import json
+          import os
+          import sys
+          import re
+
+          sys.path.append(os.path.abspath('.'))
+          from examples.models import MODEL_NAME_TO_MODEL
+
+          models_str = os.environ.get('MODELS', '')
+          configs_str = os.environ.get('CONFIGS', '')
+
+          models = [m.strip() for m in models_str.split(',') if m.strip()]
+          configs = [c.strip() for c in configs_str.split(',') if c.strip()] if configs_str else None
+
+          # Predefined benchmark configurations (subset from gather_benchmark_configs.py)
+          BENCHMARK_CONFIGS = {
+              "xplat": [
+                  "xnnpack_q8",
+                  "hf_xnnpack_custom_spda_kv_cache_8da4w",
+                  "et_xnnpack_custom_spda_kv_cache_8da4w",
+                  "llama3_fb16",
+                  "llama3_spinquant",
+                  "llama3_qlora",
+              ],
+          }
+
+          def is_valid_huggingface_model_id(model_name: str) -> bool:
+              pattern = r"^[a-zA-Z0-9-_]+/[a-zA-Z0-9-_.]+$"
+              return bool(re.match(pattern, model_name))
+
+          def generate_compatible_configs(model_name: str):
+              configs = []
+              if is_valid_huggingface_model_id(model_name):
+                  configs.append("hf_xnnpack_custom_spda_kv_cache_8da4w")
+                  if model_name.startswith("meta-llama/"):
+                      repo_name = model_name.split("meta-llama/")[1]
+                      if "qlora" in repo_name.lower():
+                          configs = ["llama3_qlora"]
+                      elif "spinquant" in repo_name.lower():
+                          configs = ["llama3_spinquant"]
+                      else:
+                          configs.extend(["llama3_fb16",
+                                          "et_xnnpack_custom_spda_kv_cache_8da4w"])
+                  if model_name.startswith("Qwen/Qwen3"):
+                      configs.append("et_xnnpack_custom_spda_kv_cache_8da4w")
+              elif model_name in MODEL_NAME_TO_MODEL:
+                  configs.append("xnnpack_q8")
+              return configs
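+
+          # For example (derived from the rules above): "meta-llama/Llama-3.2-1B" yields
+          # ["hf_xnnpack_custom_spda_kv_cache_8da4w", "llama3_fb16",
+          #  "et_xnnpack_custom_spda_kv_cache_8da4w"], while a non-HF registry name
+          # such as "mv3" yields ["xnnpack_q8"].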
"et_xnnpack_custom_spda_kv_cache_8da4w"]) + if model_name.startswith("Qwen/Qwen3"): + configs.append("et_xnnpack_custom_spda_kv_cache_8da4w") + elif model_name in MODEL_NAME_TO_MODEL: + configs.append("xnnpack_q8") + return configs + + matrix_entries = [] + for model in models: + model_configs = configs if configs else generate_compatible_configs(model) + for config in model_configs: + matrix_entries.append({ + "model": model, + "config": config + }) + + matrix = {"include": matrix_entries} + + # Write to GITHUB_OUTPUT + github_output = os.environ.get('GITHUB_OUTPUT', '') + output_line = f"models_matrix={json.dumps(matrix)}" + if github_output: + with open(github_output, 'a') as f: + f.write(output_line + '\n') + else: + print(f"::set-output name=models_matrix::{json.dumps(matrix)}") + + print(f"Generated matrix with {len(matrix_entries)} entries:") + for entry in matrix_entries: + print(f" - {entry['model']} with {entry['config']}") + EOF + env: + MODELS: ${{ inputs.models || 'mv3,mv2,ic4,ic3,resnet50,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf,google/gemma-3-1b-it' }} + CONFIGS: ${{ inputs.configs }} + + export-models: + name: export-models + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + needs: set-parameters + secrets: inherit + strategy: + matrix: ${{ fromJson(needs.set-parameters.outputs.models_matrix) }} + fail-fast: false + with: + runner: linux.2xlarge.memory + docker-image: ci-image:executorch-ubuntu-22.04-qnn-sdk + submodules: 'recursive' + timeout: 60 + upload-artifact: exported-models + upload-artifact-to-s3: true + secrets-env: EXECUTORCH_HF_TOKEN + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + echo "::group::Setting up dev environment" + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + if [[ "${{ matrix.config }}" == *"qnn"* ]]; then + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh + PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh + fi + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" + # Install requirements for export_llama + PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh + + pip install -U "huggingface_hub[cli]" + python -m huggingface_hub.commands.huggingface_cli login --token $SECRET_EXECUTORCH_HF_TOKEN + pip install accelerate sentencepiece + pip list + + S3_PREFIX="${{ inputs.s3_prefix || 'executorch-models' }}" + ARTIFACTS_DIR_NAME="artifacts-to-be-uploaded/${S3_PREFIX}/${{ matrix.model }}_${{ matrix.config }}" + echo "::endgroup::" + + echo "::group::Exporting ${{ matrix.config }} model: ${{ matrix.model }}" + BUILD_MODE="cmake" + + if [[ "${{ matrix.model }}" =~ ^[^/]+/[^/]+$ ]]; then + # HuggingFace model. 
+        if [[ "${{ matrix.model }}" =~ ^[^/]+/[^/]+$ ]]; then
+          # HuggingFace model. Assume the pattern is always like "<org>/<repo>"
+          HF_MODEL_REPO=${{ matrix.model }}
+          OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"
+
+          # Convert the HF checkpoint to ET via the etLLM path
+          if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
+            if [[ "${{ matrix.config }}" == "llama3_spinquant" ]]; then
+              # SpinQuant
+              # Download the prequantized checkpoint from Hugging Face
+              DOWNLOADED_PATH=$(
+                bash .ci/scripts/download_hf_hub.sh \
+                  --model_id "${HF_MODEL_REPO}" \
+                  --files "tokenizer.model" "params.json" "consolidated.00.pth"
+              )
+              # Export using ExecuTorch's model definition
+              python -m extension.llm.export.export_llm \
+                base.model_class="llama3_2" \
+                base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+                base.params="${DOWNLOADED_PATH}/params.json" \
+                model.use_sdpa_with_kv_cache=true \
+                backend.xnnpack.enabled=true \
+                backend.xnnpack.extended_ops=true \
+                base.preq_mode="preq_8da4w_out_8da8w" \
+                base.preq_group_size=32 \
+                export.max_seq_length=2048 \
+                export.max_context_length=2048 \
+                export.output_name="${OUT_ET_MODEL_NAME}.pte" \
+                model.use_kv_cache=true \
+                model.dtype_override=fp32 \
+                base.preq_embedding_quantize=\'8,0\' \
+                quantization.use_spin_quant=native \
+                base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"'
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            elif [[ "${{ matrix.config }}" == "llama3_qlora" ]]; then
+              # QAT + LoRA
+              # Download the prequantized checkpoint from Hugging Face
+              DOWNLOADED_PATH=$(
+                bash .ci/scripts/download_hf_hub.sh \
+                  --model_id "${HF_MODEL_REPO}" \
+                  --files "tokenizer.model" "params.json" "consolidated.00.pth"
+              )
+              # Export using ExecuTorch's model definition
+              python -m extension.llm.export.export_llm \
+                base.model_class="llama3_2" \
+                base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+                base.params="${DOWNLOADED_PATH}/params.json" \
+                quantization.use_qat=true \
+                base.use_lora=16 \
+                base.preq_mode="preq_8da4w_out_8da8w" \
+                base.preq_group_size=32 \
+                base.preq_embedding_quantize=\'8,0\' \
+                model.use_sdpa_with_kv_cache=true \
+                model.use_kv_cache=true \
+                backend.xnnpack.enabled=true \
+                backend.xnnpack.extended_ops=true \
+                model.dtype_override=fp32 \
+                export.max_seq_length=2048 \
+                export.max_context_length=2048 \
+                export.output_name="${OUT_ET_MODEL_NAME}.pte" \
+                base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"'
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            elif [[ "${{ matrix.config }}" == "llama3_fb16" ]]; then
+              # Original BF16 version, without any quantization
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+              python -m extension.llm.export.export_llm \
+                base.model_class="llama3_2" \
+                base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+                base.params="${DOWNLOADED_PATH}/params.json" \
+                model.use_kv_cache=true \
+                model.use_sdpa_with_kv_cache=true \
+                backend.xnnpack.enabled=true \
+                model.dtype_override=bf16 \
+                base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \
+                export.output_name="${OUT_ET_MODEL_NAME}.pte"
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
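+              # The et_xnnpack_custom_spda_kv_cache_8da4w branch below follows the same
+              # recipe but adds 8da4w linear quantization (8-bit dynamic activations,
+              # 4-bit weights), 8-bit embeddings, and XNNPACK extended ops.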
+            elif [[ "${{ matrix.config }}" == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+              python -m extension.llm.export.export_llm \
+                base.model_class=llama3_2 \
+                base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+                base.params="${DOWNLOADED_PATH}/params.json" \
+                model.use_kv_cache=true \
+                model.use_sdpa_with_kv_cache=true \
+                model.dtype_override=fp32 \
+                backend.xnnpack.enabled=true \
+                backend.xnnpack.extended_ops=true \
+                quantization.qmode=8da4w \
+                quantization.group_size=32 \
+                quantization.embedding_quantize=\'8,0\' \
+                base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \
+                export.output_name="${OUT_ET_MODEL_NAME}.pte"
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            elif [[ "${{ matrix.config }}" == "llama3_qnn_htp" ]]; then
+              export QNN_SDK_ROOT=/tmp/qnn/2.37.0.250724
+              export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
+              export PYTHONPATH=$(pwd)/..
+
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+              python -m examples.qualcomm.oss_scripts.llama3_2.llama -- \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                --tokenizer_model "${DOWNLOADED_PATH}/tokenizer.model" \
+                --compile_only \
+                --ptq 16a4w \
+                -m SM8650 \
+                --model_size 1B \
+                --model_mode kv \
+                --prompt "Once"
+
+              OUT_ET_MODEL_NAME="llama3_2_qnn"  # Qualcomm's script hard-codes this name
+              find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \;
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            fi
+          elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then
+            if [[ "${{ matrix.config }}" == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
+              python -m extension.llm.export.export_llm \
+                base.model_class=qwen3_0_6b \
+                base.params=examples/models/qwen3/config/0_6b_config.json \
+                model.use_kv_cache=true \
+                model.use_sdpa_with_kv_cache=true \
+                model.dtype_override=fp32 \
+                backend.xnnpack.enabled=true \
+                backend.xnnpack.extended_ops=true \
+                quantization.qmode=8da4w \
+                quantization.group_size=32 \
+                quantization.embedding_quantize=\'8,0\' \
+                base.metadata='"{\"get_bos_id\":151644,\"get_eos_ids\":[151645]}"' \
+                export.output_name="${OUT_ET_MODEL_NAME}.pte"
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            fi
+          fi
+
+          if [[ "${{ matrix.config }}" == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+            DOWNLOADED_PATH=$(
+              bash .ci/scripts/download_hf_hub.sh \
+                --model_id "${HF_MODEL_REPO}" \
+                --files "tokenizer.json"
+            )
+            echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"
+
+            # Install optimum-executorch
+            OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+            git clone https://github.com/huggingface/optimum-executorch
+            pushd optimum-executorch
+            # There is no release yet; for CI stability, always test the same pinned commit from main
+            git checkout $OPTIMUM_ET_COMMIT
+            python install_dev.py --skip_override_torch
+            pip list
+
+            ARGS=(
+              "--model" "${HF_MODEL_REPO}"
+              "--task" "text-generation"
+              "--recipe" "xnnpack"
+              "--use_custom_sdpa"
+              "--use_custom_kv_cache"
+              "--qlinear" "8da4w"
+              "--qembedding" "8w"
+              "--output_dir" ".."
+            )
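+            # --qlinear 8da4w and --qembedding 8w request the same quantization schemes
+            # as the etLLM exports above, and --output_dir ".." writes model.pte to the
+            # repo root (we are inside optimum-executorch/) for the mv after popd.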
+
+            optimum-cli export executorch "${ARGS[@]}"
+            popd
+
+            mv model.pte ${OUT_ET_MODEL_NAME}.pte
+            ls -lh "${OUT_ET_MODEL_NAME}.pte"
+          fi
+
+          zip -j model.zip ${OUT_ET_MODEL_NAME}.pte ${DOWNLOADED_PATH}/tokenizer.*
+          ls -lh model.zip
+          mkdir -p ${ARTIFACTS_DIR_NAME}
+          mv model.zip ${ARTIFACTS_DIR_NAME}
+          ls -lh ${ARTIFACTS_DIR_NAME}
+        elif [[ "${{ matrix.model }}" == "llama" ]]; then
+          # Install requirements for export_llama
+          PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
+          # Test llama2
+          if [[ "${{ matrix.config }}" == *"xnnpack"* ]]; then
+            DELEGATE_CONFIG="xnnpack+custom+qe"
+          elif [[ "${{ matrix.config }}" == *"qnn"* ]]; then
+            DELEGATE_CONFIG="qnn"
+          else
+            echo "Unsupported delegate ${{ matrix.config }}"
+            exit 1
+          fi
+          DTYPE="fp32"
+          PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \
+            -model "${{ matrix.model }}" \
+            -build_tool "${BUILD_MODE}" \
+            -dtype "${DTYPE}" \
+            -mode "${DELEGATE_CONFIG}" \
+            -upload "${ARTIFACTS_DIR_NAME}"
+        else
+          PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh \
+            "${{ matrix.model }}" \
+            "${BUILD_MODE}" \
+            "${{ matrix.config }}" \
+            "${ARTIFACTS_DIR_NAME}"
+        fi
+        echo "::endgroup::"
+
+        # Create a manifest file with export metadata
+        echo "::group::Creating export manifest"
+        cat > ${ARTIFACTS_DIR_NAME}/manifest.json << EOF
+        {
+          "model": "${{ matrix.model }}",
+          "config": "${{ matrix.config }}",
+          "exported_at": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
+          "github_run_id": "${{ github.run_id }}",
+          "github_sha": "${{ github.sha }}",
+          "github_ref": "${{ github.ref }}"
+        }
+        EOF
+        cat ${ARTIFACTS_DIR_NAME}/manifest.json
+        echo "::endgroup::"
+
+  # Summary job that creates an index of all exported models
+  create-export-summary:
+    name: create-export-summary
+    runs-on: ubuntu-22.04
+    needs:
+      - set-parameters
+      - export-models
+    if: always()
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: 'false'
+
+      - name: Authenticate with AWS
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
+          role-duration-seconds: 3600
+          aws-region: us-east-1
+
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+
+      - name: Create export summary
+        shell: bash
+        env:
+          S3_BUCKET: gha-artifacts
+          S3_PREFIX: ${{ github.repository }}/${{ github.run_id }}/artifacts
+        run: |
+          set -eux
+
+          pip install awscli
+
+          # List all exported artifacts
+          echo "Listing exported models from S3..."
+          aws s3 ls "s3://${S3_BUCKET}/${S3_PREFIX}/" --recursive || echo "No artifacts found"
+
+          # Create a summary
+          cat > export_summary.json << EOF
+          {
+            "workflow_run_id": "${{ github.run_id }}",
+            "workflow_run_url": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}",
+            "exported_at": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
+            "s3_base_path": "s3://${S3_BUCKET}/${S3_PREFIX}",
+            "models_matrix": ${{ needs.set-parameters.outputs.models_matrix }}
+          }
+          EOF
+
+          cat export_summary.json
+
+          # Upload the summary
+          aws s3 cp export_summary.json "s3://${S3_BUCKET}/${S3_PREFIX}/export_summary.json"
+
+          echo "Export summary uploaded to s3://${S3_BUCKET}/${S3_PREFIX}/export_summary.json"
+          echo ""
+          echo "=== Exported Models ==="
+          echo "Models can be downloaded from:"
+          echo "https://gha-artifacts.s3.amazonaws.com/${S3_PREFIX}/"
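+
+          # Example (hypothetical object key; the exact layout depends on how
+          # linux_job_v2 uploads the artifacts-to-be-uploaded directory): a single
+          # exported model could then be fetched with
+          #   aws s3 cp "s3://${S3_BUCKET}/${S3_PREFIX}/executorch-models/mv3_xnnpack_q8/model.zip" .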