[ML] Reapply: Run allowlist validation in PyTorch edge pipeline (#3007)

edsavage · web-flow · commit 48a1e664d503 · 2026-03-26T14:08:21.000+13:00
The Linux build/test Docker images don't include Python 3 (it's only
used during image builds to compile PyTorch, then dropped from the
multi-stage final image). Move the allowlist validation out of
run_tests.sh into a dedicated pipeline step that runs on a python:3.12
agent image and is uploaded only for run_pytorch_tests builds.
diff --git a/.buildkite/pipeline.json.py b/.buildkite/pipeline.json.py
@@ -84,6 +84,14 @@ def main():
                                                        ".buildkite/pipelines/check_build_regression.yml.sh",
                                                        soft_fail=True))
 
+    # Validate the PyTorch allowlist against HuggingFace models when
+    # triggered from the PyTorch edge pipeline.  Runs in a python:3
+    # container since the build/test images don't include Python.
+    if config.run_pytorch_tests:
+        pipeline_steps.append(pipeline_steps.generate_step("Upload PyTorch allowlist validation",
+                                                           ".buildkite/pipelines/validate_pytorch_allowlist.yml.sh",
+                                                           soft_fail=True))
+
     pipeline["env"] = env
     pipeline["steps"] = pipeline_steps
     print(json.dumps(pipeline, indent=2))
diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License
+# 2.0 and the following additional limitation. Functionality enabled by the
+# files subject to the Elastic License 2.0 may only be used in production when
+# invoked by an Elasticsearch process with a license key installed that permits
+# use of machine learning features. You may not use this file except in
+# compliance with the Elastic License 2.0 and the foregoing additional
+# limitation.
+
+cat <<'EOL'
+steps:
+  - label: "Validate PyTorch allowlist :torch:"
+    key: "validate_pytorch_allowlist"
+    timeout_in_minutes: 60
+    command:
+        - "if [ ! -f dev-tools/extract_model_ops/validate_allowlist.py ]; then echo 'validate_allowlist.py not found, skipping'; exit 0; fi"
+        - "pip install -r dev-tools/extract_model_ops/requirements.txt"
+        - "python3 dev-tools/extract_model_ops/validate_allowlist.py --config dev-tools/extract_model_ops/validation_models.json --pt-dir dev-tools/extract_model_ops/es_it_models --verbose"
+EOL
+
+# When ML_BUILD_STEP_KEYS (comma-separated) is set, depend on those build
+# steps so validation doesn't start before the pipeline is fully generated.
+if [ -n "${ML_BUILD_STEP_KEYS:-}" ]; then
+    echo '    depends_on:'
+    IFS=',' read -ra STEP_KEYS <<< "$ML_BUILD_STEP_KEYS"
+    for key in "${STEP_KEYS[@]}"; do
+        echo "        - \"${key}\""
+    done
+fi
+
+cat <<'EOL'
+    allow_dependency_failure: true
+    agents:
+      image: "python:3.12"
+      memory: "32G"
+      ephemeralStorage: "30G"
+    notify:
+      - github_commit_status:
+          context: "Validate PyTorch allowlist"
+EOL
diff --git a/.buildkite/scripts/steps/run_tests.sh b/.buildkite/scripts/steps/run_tests.sh
@@ -105,28 +105,6 @@ else
         -P cmake/run-all-tests-parallel.cmake || TEST_OUTCOME=$?
 fi
 
-# --- PyTorch allowlist validation ---
-# When triggered from the PyTorch edge pipeline, run the Python-based
-# allowlist validation which traces live HuggingFace models with the
-# new PyTorch version and verifies every op is in ALLOWED_OPERATIONS.
-VALIDATION_OUTCOME=0
-if [[ "${GITHUB_PR_COMMENT_VAR_ACTION:-}" == "run_pytorch_tests" ]] && [ -f cmake/run-validation.cmake ]; then
-    echo "--- Validating PyTorch allowlist against HuggingFace models"
-    cmake \
-        -DSOURCE_DIR="$(pwd)" \
-        -DVALIDATE_CONFIG="$(pwd)/dev-tools/extract_model_ops/validation_models.json" \
-        -DVALIDATE_PT_DIR="$(pwd)/dev-tools/extract_model_ops/es_it_models" \
-        -DVALIDATE_VERBOSE=TRUE \
-        -DOPTIONAL=TRUE \
-        -P cmake/run-validation.cmake || VALIDATION_OUTCOME=$?
-
-    if [[ $VALIDATION_OUTCOME -ne 0 ]]; then
-        echo "^^^ +++"
-        echo "Allowlist validation failed — the new PyTorch version may introduce ops not in ALLOWED_OPERATIONS."
-        echo "See dev-tools/extract_model_ops/README.md for how to update the allowlist."
-    fi
-fi
-
 # Upload test results
 echo "--- Uploading test results"
 TEST_RESULTS_ARCHIVE=${OS}-${HARDWARE_ARCH}-unit_test_results.tgz
@@ -139,6 +117,4 @@ else
     echo "No test results archive created"
 fi
 
-if [[ $TEST_OUTCOME -ne 0 || $VALIDATION_OUTCOME -ne 0 ]]; then
-    exit 1
-fi
+exit $TEST_OUTCOME
diff --git a/dev-tools/extract_model_ops/torchscript_utils.py b/dev-tools/extract_model_ops/torchscript_utils.py
@@ -145,13 +145,18 @@ def load_and_trace_hf_model(model_name: str, quantize: bool = False,
     attention_mask = inputs["attention_mask"]
 
     try:
-        return torch.jit.trace(
+        traced = torch.jit.trace(
             model, (input_ids, attention_mask), strict=False)
     except Exception as exc:
         print(f"    TRACE WARNING: {exc}", file=sys.stderr)
         print("    Falling back to torch.jit.script...", file=sys.stderr)
         try:
-            return torch.jit.script(model)
+            traced = torch.jit.script(model)
         except Exception as exc2:
             print(f"    SCRIPT ERROR: {exc2}", file=sys.stderr)
             return None
+
+    # Free the original HF model to reduce peak memory when validating
+    # many models sequentially.
+    del model, tokenizer, inputs
+    return traced
diff --git a/dev-tools/extract_model_ops/validate_allowlist.py b/dev-tools/extract_model_ops/validate_allowlist.py
@@ -29,6 +29,7 @@
 """
 
 import argparse
+import gc
 import re
 import sys
 from pathlib import Path
@@ -104,30 +105,44 @@ def validate_model(model_name: str,
                    allowed: set[str],
                    forbidden: set[str],
                    verbose: bool,
-                   quantize: bool = False) -> bool:
-    """Validate one HuggingFace model. Returns True if all ops pass."""
+                   quantize: bool = False,
+                   auto_class: str | None = None,
+                   config_overrides: dict | None = None) -> str:
+    """Validate one HuggingFace model.
+
+    Returns "pass", "fail" (op validation failed), or "skip" (could not
+    load/trace — e.g. private model without HF_TOKEN).
+    """
     label = f"{model_name} (quantized)" if quantize else model_name
     print(f"  {label}...", file=sys.stderr)
-    traced = load_and_trace_hf_model(model_name, quantize=quantize)
+    traced = load_and_trace_hf_model(model_name, quantize=quantize,
+                                     auto_class=auto_class,
+                                     config_overrides=config_overrides)
     if traced is None:
-        print(f"    FAILED (could not load/trace)", file=sys.stderr)
-        return False
+        print(f"    SKIPPED (could not load/trace)", file=sys.stderr)
+        return "skip"
     ops = collect_inlined_ops(traced)
-    return check_ops(ops, allowed, forbidden, verbose)
+    result = "pass" if check_ops(ops, allowed, forbidden, verbose) else "fail"
+    del traced
+    gc.collect()
+    return result
 
 
 def validate_pt_file(name: str,
                      pt_path: str,
                      allowed: set[str],
                      forbidden: set[str],
-                     verbose: bool) -> bool:
-    """Validate a local TorchScript .pt file. Returns True if all ops pass."""
+                     verbose: bool) -> str:
+    """Validate a local TorchScript .pt file.
+
+    Returns "pass", "fail", or "skip".
+    """
     print(f"  {name} ({pt_path})...", file=sys.stderr)
     ops = load_pt_and_collect_ops(pt_path)
     if ops is None:
-        print(f"    FAILED (could not load)", file=sys.stderr)
-        return False
-    return check_ops(ops, allowed, forbidden, verbose)
+        print(f"    SKIPPED (could not load)", file=sys.stderr)
+        return "skip"
+    return "pass" if check_ops(ops, allowed, forbidden, verbose) else "fail"
 
 
 def main():
@@ -151,7 +166,7 @@ def main():
     print(f"Parsed {len(allowed)} allowed ops and {len(forbidden)} "
           f"forbidden ops from {SUPPORTED_OPS_CC.name}", file=sys.stderr)
 
-    results: dict[str, bool] = {}
+    results: dict[str, str] = {}
 
     models = load_model_config(args.config)
 
@@ -161,7 +176,9 @@ def main():
     for arch, spec in models.items():
         results[arch] = validate_model(
             spec["model_id"], allowed, forbidden, args.verbose,
-            quantize=spec["quantized"])
+            quantize=spec["quantized"],
+            auto_class=spec.get("auto_class"),
+            config_overrides=spec.get("config_overrides"))
 
     if args.pt_dir and args.pt_dir.is_dir():
         pt_files = sorted(args.pt_dir.glob("*.pt"))
@@ -175,26 +192,32 @@ def main():
 
     print(file=sys.stderr)
     print("=" * 60, file=sys.stderr)
-    all_pass = all(results.values())
-    for key, passed in results.items():
-        status = "PASS" if passed else "FAIL"
+    for key, status in results.items():
+        display = status.upper()
         if key.startswith("pt:"):
-            print(f"  {key}: {status}", file=sys.stderr)
+            print(f"  {key}: {display}", file=sys.stderr)
         else:
             spec = models[key]
             label = spec["model_id"]
             if spec["quantized"]:
                 label += " (quantized)"
-            print(f"  {key} ({label}): {status}", file=sys.stderr)
+            print(f"  {key} ({label}): {display}", file=sys.stderr)
+
+    failed = [a for a, s in results.items() if s == "fail"]
+    skipped = [a for a, s in results.items() if s == "skip"]
+    passed = [a for a, s in results.items() if s == "pass"]
 
     print("=" * 60, file=sys.stderr)
-    if all_pass:
-        print("All models PASS - no false positives.", file=sys.stderr)
-    else:
-        failed = [a for a, p in results.items() if not p]
-        print(f"FAILED models: {', '.join(failed)}", file=sys.stderr)
+    print(f"{len(passed)} passed, {len(failed)} failed, "
+          f"{len(skipped)} skipped", file=sys.stderr)
+
+    if skipped:
+        print(f"Skipped (could not load/trace — may need HF_TOKEN "
+              f"for private models): {', '.join(skipped)}", file=sys.stderr)
+    if failed:
+        print(f"FAILED (op validation): {', '.join(failed)}", file=sys.stderr)
 
-    sys.exit(0 if all_pass else 1)
+    sys.exit(0 if not failed else 1)
 
 
 if __name__ == "__main__":