From 0f812b5e3c514144a2d20f806654a3c69a58731e Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Tue, 20 Jan 2026 22:01:53 -0800 Subject: [PATCH 1/3] Add workflow to export models and upload to S3 This workflow exports ExecutorTorch models and uploads them to S3 for easy access. It can be triggered manually from the Actions webpage or runs automatically on a daily schedule. Supported models include in-tree models (mv3, mv2, ic4, resnet50, etc.) and HuggingFace models (Llama, Qwen, SmolLM, OLMo, Gemma). --- .github/workflows/export-models.yml | 461 ++++++++++++++++++++++++++++ 1 file changed, 461 insertions(+) create mode 100644 .github/workflows/export-models.yml diff --git a/.github/workflows/export-models.yml b/.github/workflows/export-models.yml new file mode 100644 index 00000000000..b08809282d1 --- /dev/null +++ b/.github/workflows/export-models.yml @@ -0,0 +1,461 @@ +name: export-models + +on: + schedule: + # Run daily at midnight UTC + - cron: '0 0 * * *' + pull_request: + paths: + - .github/workflows/export-models.yml + # Allow manual trigger from Actions webpage + workflow_dispatch: + inputs: + models: + description: 'Comma-separated list of models to export (e.g., mv3,mv2,meta-llama/Llama-3.2-1B)' + required: false + type: string + default: 'mv3,mv2,ic4,ic3,resnet50,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf,google/gemma-3-1b-it' + configs: + description: 'Comma-separated list of export configs (leave empty for auto-detection based on model)' + required: false + type: string + s3_prefix: + description: 'S3 prefix path for uploaded models (default: executorch-models)' + required: false + type: string + default: 'executorch-models' + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + set-parameters: + runs-on: ubuntu-22.04 + outputs: + models_matrix: ${{ steps.set-parameters.outputs.models_matrix }} + steps: + - uses: actions/checkout@v3 + with: + submodules: 'false' + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Set parameters + id: set-parameters + shell: bash + run: | + set -eux + + # Default models for scheduled runs + DEFAULT_MODELS="mv3,mv2,ic4,ic3,resnet50,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf,google/gemma-3-1b-it" + + MODELS="${{ inputs.models }}" + if [ -z "$MODELS" ]; then + MODELS="$DEFAULT_MODELS" + fi + + CONFIGS="${{ inputs.configs }}" + + # Generate the matrix JSON + python3 << 'EOF' + import json + import os + import sys + import re + + sys.path.append(os.path.abspath('.')) + from examples.models import MODEL_NAME_TO_MODEL + + models_str = os.environ.get('MODELS', '') + configs_str = os.environ.get('CONFIGS', '') + + models = [m.strip() for m in models_str.split(',') if m.strip()] + configs = [c.strip() for c in configs_str.split(',') if c.strip()] if configs_str else None + + # Predefined benchmark configurations (subset from gather_benchmark_configs.py) + BENCHMARK_CONFIGS = { + "xplat": [ + "xnnpack_q8", + "hf_xnnpack_custom_spda_kv_cache_8da4w", + "et_xnnpack_custom_spda_kv_cache_8da4w", + 
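+            # The three llama3_* configs below correspond to the Meta Llama 3.2 export
+            # paths handled later in this workflow: plain BF16 (llama3_fb16),
+            # prequantized SpinQuant (llama3_spinquant), and QAT+LoRA (llama3_qlora).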
"llama3_fb16", + "llama3_spinquant", + "llama3_qlora", + ], + } + + def is_valid_huggingface_model_id(model_name: str) -> bool: + pattern = r"^[a-zA-Z0-9-_]+/[a-zA-Z0-9-_.]+$" + return bool(re.match(pattern, model_name)) + + def generate_compatible_configs(model_name: str): + configs = [] + if is_valid_huggingface_model_id(model_name): + configs.append("hf_xnnpack_custom_spda_kv_cache_8da4w") + if model_name.startswith("meta-llama/"): + repo_name = model_name.split("meta-llama/")[1] + if "qlora" in repo_name.lower(): + configs = ["llama3_qlora"] + elif "spinquant" in repo_name.lower(): + configs = ["llama3_spinquant"] + else: + configs.extend(["llama3_fb16", "et_xnnpack_custom_spda_kv_cache_8da4w"]) + if model_name.startswith("Qwen/Qwen3"): + configs.append("et_xnnpack_custom_spda_kv_cache_8da4w") + elif model_name in MODEL_NAME_TO_MODEL: + configs.append("xnnpack_q8") + return configs + + matrix_entries = [] + for model in models: + model_configs = configs if configs else generate_compatible_configs(model) + for config in model_configs: + matrix_entries.append({ + "model": model, + "config": config + }) + + matrix = {"include": matrix_entries} + + # Write to GITHUB_OUTPUT + github_output = os.environ.get('GITHUB_OUTPUT', '') + output_line = f"models_matrix={json.dumps(matrix)}" + if github_output: + with open(github_output, 'a') as f: + f.write(output_line + '\n') + else: + print(f"::set-output name=models_matrix::{json.dumps(matrix)}") + + print(f"Generated matrix with {len(matrix_entries)} entries:") + for entry in matrix_entries: + print(f" - {entry['model']} with {entry['config']}") + EOF + env: + MODELS: ${{ inputs.models || 'mv3,mv2,ic4,ic3,resnet50,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf,google/gemma-3-1b-it' }} + CONFIGS: ${{ inputs.configs }} + + export-models: + name: export-models + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + needs: set-parameters + secrets: inherit + strategy: + matrix: ${{ fromJson(needs.set-parameters.outputs.models_matrix) }} + fail-fast: false + with: + runner: linux.2xlarge.memory + docker-image: ci-image:executorch-ubuntu-22.04-qnn-sdk + submodules: 'recursive' + timeout: 60 + upload-artifact: exported-models + upload-artifact-to-s3: true + secrets-env: EXECUTORCH_HF_TOKEN + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + echo "::group::Setting up dev environment" + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + if [[ ${{ matrix.config }} == *"qnn"* ]]; then + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh + PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh + fi + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" + # Install requirements for export_llama + PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh + + pip install -U "huggingface_hub[cli]" + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + pip install accelerate sentencepiece + pip list + + S3_PREFIX="${{ inputs.s3_prefix || 'executorch-models' }}" + ARTIFACTS_DIR_NAME="artifacts-to-be-uploaded/${S3_PREFIX}/${{ matrix.model }}_${{ matrix.config }}" + echo "::endgroup::" + + echo "::group::Exporting ${{ matrix.config }} model: ${{ matrix.model }}" + BUILD_MODE="cmake" + + if [[ ${{ 
matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then + # HuggingFace model. Assume the pattern is always like "/" + HF_MODEL_REPO=${{ matrix.model }} + OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}" + + # Convert HF checkpoint to ET via etLLM path + if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then + if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then + # SpinQuant + # Download prequantized chceckpoint from Hugging Face + DOWNLOADED_PATH=$( + bash .ci/scripts/download_hf_hub.sh \ + --model_id "${HF_MODEL_REPO}" \ + --files "tokenizer.model" "params.json" "consolidated.00.pth" + ) + # Export using ExecuTorch's model definition + python -m extension.llm.export.export_llm \ + base.model_class="llama3_2" \ + base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ + base.params="${DOWNLOADED_PATH}/params.json" \ + model.use_sdpa_with_kv_cache=true \ + backend.xnnpack.enabled=true \ + backend.xnnpack.extended_ops=true \ + base.preq_mode="preq_8da4w_out_8da8w" \ + base.preq_group_size=32 \ + export.max_seq_length=2048 \ + export.max_context_length=2048 \ + export.output_name="${OUT_ET_MODEL_NAME}.pte" \ + model.use_kv_cache=true \ + model.dtype_override=fp32 \ + base.preq_embedding_quantize=\'8,0\' \ + quantization.use_spin_quant=native \ + base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' + ls -lh "${OUT_ET_MODEL_NAME}.pte" + elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then + # QAT + LoRA + # Download prequantized chceckpoint from Hugging Face + DOWNLOADED_PATH=$( + bash .ci/scripts/download_hf_hub.sh \ + --model_id "${HF_MODEL_REPO}" \ + --files "tokenizer.model" "params.json" "consolidated.00.pth" + ) + # Export using ExecuTorch's model definition + python -m extension.llm.export.export_llm \ + base.model_class="llama3_2" \ + base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ + base.params="${DOWNLOADED_PATH}/params.json" \ + quantization.use_qat=true \ + base.use_lora=16 \ + base.preq_mode="preq_8da4w_out_8da8w" \ + base.preq_group_size=32 \ + base.preq_embedding_quantize=\'8,0\' \ + model.use_sdpa_with_kv_cache=true \ + model.use_kv_cache=true \ + backend.xnnpack.enabled=true \ + backend.xnnpack.extended_ops=true \ + model.dtype_override=fp32 \ + export.max_seq_length=2048 \ + export.max_context_length=2048 \ + export.output_name="${OUT_ET_MODEL_NAME}.pte" \ + base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' + ls -lh "${OUT_ET_MODEL_NAME}.pte" + elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then + # Original BF16 version, without any quantization + DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") + python -m extension.llm.export.export_llm \ + base.model_class="llama3_2" \ + base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ + base.params="${DOWNLOADED_PATH}/params.json" \ + model.use_kv_cache=true \ + model.use_sdpa_with_kv_cache=true \ + backend.xnnpack.enabled=true \ + model.dtype_override=bf16 \ + base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \ + export.output_name="${OUT_ET_MODEL_NAME}.pte" + ls -lh "${OUT_ET_MODEL_NAME}.pte" + elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then + DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") + python -m 
extension.llm.export.export_llm \ + base.model_class=llama3_2 \ + base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ + base.params="${DOWNLOADED_PATH}/params.json" \ + model.use_kv_cache=true \ + model.use_sdpa_with_kv_cache=true \ + model.dtype_override=fp32 \ + backend.xnnpack.enabled=true \ + backend.xnnpack.extended_ops=true \ + quantization.qmode=8da4w \ + quantization.group_size=32 \ + quantization.embedding_quantize=\'8,0\' \ + base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \ + export.output_name="${OUT_ET_MODEL_NAME}.pte" + ls -lh "${OUT_ET_MODEL_NAME}.pte" + elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then + export QNN_SDK_ROOT=/tmp/qnn/2.37.0.250724 + export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/ + export PYTHONPATH=$(pwd)/.. + + DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") + python -m examples.qualcomm.oss_scripts.llama3_2.llama -- \ + --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ + --params "${DOWNLOADED_PATH}/params.json" \ + --tokenizer_model "${DOWNLOADED_PATH}/tokenizer.model" \ + --compile_only \ + --ptq 16a4w \ + -m SM8650 \ + --model_size 1B \ + --model_mode kv \ + --prompt "Once" + + OUT_ET_MODEL_NAME="llama3_2_qnn" # Qualcomm hard-coded it in their script + find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \; + ls -lh "${OUT_ET_MODEL_NAME}.pte" + fi + elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then + if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then + DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json") + python -m extension.llm.export.export_llm \ + base.model_class=qwen3_0_6b \ + base.params=examples/models/qwen3/config/0_6b_config.json \ + model.use_kv_cache=true \ + model.use_sdpa_with_kv_cache=true \ + model.dtype_override=fp32 \ + backend.xnnpack.enabled=true \ + backend.xnnpack.extended_ops=true \ + quantization.qmode=8da4w \ + quantization.group_size=32 \ + quantization.embedding_quantize=\'8,0\' \ + base.metadata='"{\"get_bos_id\":151644,\"get_eos_ids\":[151645]}"' \ + export.output_name="${OUT_ET_MODEL_NAME}.pte" + ls -lh "${OUT_ET_MODEL_NAME}.pte" + fi + fi + + if [[ ${{ matrix.config }} == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then + DOWNLOADED_PATH=$( + bash .ci/scripts/download_hf_hub.sh \ + --model_id "${HF_MODEL_REPO}" \ + --files "tokenizer.json" + ) + echo "tokenizer.json is downloaded to $DOWNLOADED_PATH" + + # Install optimum-executorch + OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + git clone https://github.com/huggingface/optimum-executorch + pushd optimum-executorch + # There is no release yet, for CI stability, always test from the same commit on main + git checkout $OPTIMUM_ET_COMMIT + python install_dev.py --skip_override_torch + pip list + + ARGS=( + "--model" "${HF_MODEL_REPO}" + "--task" "text-generation" + "--recipe" "xnnpack" + "--use_custom_sdpa" + "--use_custom_kv_cache" + "--qlinear" "8da4w" + "--qembedding" "8w" + "--output_dir" ".." 
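+            # torchao-style quantization flags used above: "8da4w" = 8-bit dynamic
+            # activations with 4-bit weights for linear layers, "8w" = 8-bit
+            # weight-only quantization of the embedding table.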
+ ) + + optimum-cli export executorch "${ARGS[@]}" + popd + + mv model.pte ${OUT_ET_MODEL_NAME}.pte + ls -lh "${OUT_ET_MODEL_NAME}.pte" + fi + + zip -j model.zip ${OUT_ET_MODEL_NAME}.pte ${DOWNLOADED_PATH}/tokenizer.* + ls -lh model.zip + mkdir -p ${ARTIFACTS_DIR_NAME} + mv model.zip ${ARTIFACTS_DIR_NAME} + ls -lh ${ARTIFACTS_DIR_NAME} + elif [[ ${{ matrix.model }} == "llama" ]]; then + # Install requirements for export_llama + PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh + # Test llama2 + if [[ ${{ matrix.config }} == *"xnnpack"* ]]; then + DELEGATE_CONFIG="xnnpack+custom+qe" + elif [[ ${{ matrix.config }} == *"qnn"* ]]; then + DELEGATE_CONFIG="qnn" + else + echo "Unsupported delegate ${{ matrix.config }}" + exit 1 + fi + DTYPE="fp32" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \ + -model "${{ matrix.model }}" \ + -build_tool "${BUILD_MODE}" \ + -dtype "${DTYPE}" \ + -mode "${DELEGATE_CONFIG}" \ + -upload "${ARTIFACTS_DIR_NAME}" + else + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh \ + "${{ matrix.model }}" \ + "${BUILD_MODE}" \ + "${{ matrix.config }}" \ + "${ARTIFACTS_DIR_NAME}" + fi + echo "::endgroup::" + + # Create a manifest file with export metadata + echo "::group::Creating export manifest" + cat > ${ARTIFACTS_DIR_NAME}/manifest.json << EOF + { + "model": "${{ matrix.model }}", + "config": "${{ matrix.config }}", + "exported_at": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")", + "github_run_id": "${{ github.run_id }}", + "github_sha": "${{ github.sha }}", + "github_ref": "${{ github.ref }}" + } + EOF + cat ${ARTIFACTS_DIR_NAME}/manifest.json + echo "::endgroup::" + + # Summary job that creates an index of all exported models + create-export-summary: + name: create-export-summary + runs-on: ubuntu-22.04 + needs: + - set-parameters + - export-models + if: always() + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v3 + with: + submodules: 'false' + + - name: Authenticate with AWS + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results + role-duration-seconds: 3600 + aws-region: us-east-1 + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Create export summary + shell: bash + env: + S3_BUCKET: gha-artifacts + S3_PREFIX: ${{ github.repository }}/${{ github.run_id }}/artifacts + run: | + set -eux + + pip install awscli + + # List all exported artifacts + echo "Listing exported models from S3..." 
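+          # Assumes the reusable linux job uploads everything staged under
+          # artifacts-to-be-uploaded/ to this prefix, so each model's zip should
+          # appear at <s3_prefix>/<model>_<config>/model.zip in the listing below.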
+ aws s3 ls "s3://${S3_BUCKET}/${S3_PREFIX}/" --recursive || echo "No artifacts found" + + # Create a summary + cat > export_summary.json << EOF + { + "workflow_run_id": "${{ github.run_id }}", + "workflow_run_url": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}", + "exported_at": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")", + "s3_base_path": "s3://${S3_BUCKET}/${S3_PREFIX}", + "models_matrix": ${{ needs.set-parameters.outputs.models_matrix }} + } + EOF + + cat export_summary.json + + # Upload the summary + aws s3 cp export_summary.json "s3://${S3_BUCKET}/${S3_PREFIX}/export_summary.json" + + echo "Export summary uploaded to s3://${S3_BUCKET}/${S3_PREFIX}/export_summary.json" + echo "" + echo "=== Exported Models ===" + echo "Models can be downloaded from:" + echo "https://gha-artifacts.s3.amazonaws.com/${S3_PREFIX}/" From abebf2302bd145460dd13d275749c621d749bd81 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Tue, 20 Jan 2026 22:20:29 -0800 Subject: [PATCH 2/3] Quote matrix variables in bash conditions for robustness Properly quote ${{ matrix.config }} and ${{ matrix.model }} in bash conditional expressions to prevent potential word splitting issues. --- .github/workflows/export-models.yml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/export-models.yml b/.github/workflows/export-models.yml index b08809282d1..d92feda10fd 100644 --- a/.github/workflows/export-models.yml +++ b/.github/workflows/export-models.yml @@ -159,7 +159,7 @@ jobs: echo "::group::Setting up dev environment" CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - if [[ ${{ matrix.config }} == *"qnn"* ]]; then + if [[ "${{ matrix.config }}" == *"qnn"* ]]; then PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh fi @@ -179,14 +179,14 @@ jobs: echo "::group::Exporting ${{ matrix.config }} model: ${{ matrix.model }}" BUILD_MODE="cmake" - if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then + if [[ "${{ matrix.model }}" =~ ^[^/]+/[^/]+$ ]]; then # HuggingFace model. 
Assume the pattern is always like "/" HF_MODEL_REPO=${{ matrix.model }} OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}" # Convert HF checkpoint to ET via etLLM path if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then - if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then + if [[ "${{ matrix.config }}" == "llama3_spinquant" ]]; then # SpinQuant # Download prequantized chceckpoint from Hugging Face DOWNLOADED_PATH=$( @@ -213,7 +213,7 @@ jobs: quantization.use_spin_quant=native \ base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' ls -lh "${OUT_ET_MODEL_NAME}.pte" - elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then + elif [[ "${{ matrix.config }}" == "llama3_qlora" ]]; then # QAT + LoRA # Download prequantized chceckpoint from Hugging Face DOWNLOADED_PATH=$( @@ -241,7 +241,7 @@ jobs: export.output_name="${OUT_ET_MODEL_NAME}.pte" \ base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' ls -lh "${OUT_ET_MODEL_NAME}.pte" - elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then + elif [[ "${{ matrix.config }}" == "llama3_fb16" ]]; then # Original BF16 version, without any quantization DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") python -m extension.llm.export.export_llm \ @@ -255,7 +255,7 @@ jobs: base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \ export.output_name="${OUT_ET_MODEL_NAME}.pte" ls -lh "${OUT_ET_MODEL_NAME}.pte" - elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then + elif [[ "${{ matrix.config }}" == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") python -m extension.llm.export.export_llm \ base.model_class=llama3_2 \ @@ -272,7 +272,7 @@ jobs: base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \ export.output_name="${OUT_ET_MODEL_NAME}.pte" ls -lh "${OUT_ET_MODEL_NAME}.pte" - elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then + elif [[ "${{ matrix.config }}" == "llama3_qnn_htp" ]]; then export QNN_SDK_ROOT=/tmp/qnn/2.37.0.250724 export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/ export PYTHONPATH=$(pwd)/.. @@ -294,7 +294,7 @@ jobs: ls -lh "${OUT_ET_MODEL_NAME}.pte" fi elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then - if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then + if [[ "${{ matrix.config }}" == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." 
--files "tokenizer.json") python -m extension.llm.export.export_llm \ base.model_class=qwen3_0_6b \ @@ -313,7 +313,7 @@ jobs: fi fi - if [[ ${{ matrix.config }} == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then + if [[ "${{ matrix.config }}" == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then DOWNLOADED_PATH=$( bash .ci/scripts/download_hf_hub.sh \ --model_id "${HF_MODEL_REPO}" \ @@ -353,13 +353,13 @@ jobs: mkdir -p ${ARTIFACTS_DIR_NAME} mv model.zip ${ARTIFACTS_DIR_NAME} ls -lh ${ARTIFACTS_DIR_NAME} - elif [[ ${{ matrix.model }} == "llama" ]]; then + elif [[ "${{ matrix.model }}" == "llama" ]]; then # Install requirements for export_llama PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh # Test llama2 - if [[ ${{ matrix.config }} == *"xnnpack"* ]]; then + if [[ "${{ matrix.config }}" == *"xnnpack"* ]]; then DELEGATE_CONFIG="xnnpack+custom+qe" - elif [[ ${{ matrix.config }} == *"qnn"* ]]; then + elif [[ "${{ matrix.config }}" == *"qnn"* ]]; then DELEGATE_CONFIG="qnn" else echo "Unsupported delegate ${{ matrix.config }}" From 36e5c29a8f0fd79da6e70c497f66ba59d7a21172 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Tue, 20 Jan 2026 22:46:57 -0800 Subject: [PATCH 3/3] Fix huggingface-cli not found by using python -m Use python module invocation instead of CLI binary to avoid PATH issues in conda environment. --- .github/workflows/export-models.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/export-models.yml b/.github/workflows/export-models.yml index d92feda10fd..2b5b22630f6 100644 --- a/.github/workflows/export-models.yml +++ b/.github/workflows/export-models.yml @@ -168,7 +168,7 @@ jobs: PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh pip install -U "huggingface_hub[cli]" - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + python -m huggingface_hub.commands.huggingface_cli login --token $SECRET_EXECUTORCH_HF_TOKEN pip install accelerate sentencepiece pip list
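
Usage sketch (not part of the patches; the run id, repository path, and model/config pair
are illustrative): once a run of this workflow finishes, a single exported model can be
pulled from the bucket written by the summary job, following the layout it echoes at the end:

    # Hypothetical values -- substitute a real run id and an exported <model>_<config> name.
    RUN_ID=1234567890
    aws s3 cp \
      "s3://gha-artifacts/pytorch/executorch/${RUN_ID}/artifacts/executorch-models/mv3_xnnpack_q8/model.zip" .
    unzip model.zip   # contains the exported .pte (plus tokenizer files for LLM exports)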