From 81f711b1ed38f52d884b75186511943f8948013a Mon Sep 17 00:00:00 2001 From: Misiu Godfrey Date: Sun, 19 Apr 2026 06:18:09 +0000 Subject: [PATCH 01/25] slurm: rebuild presto-nvl72 launcher layer from SpaceMicePOC Port the presto/slurm/presto-nvl72/ tree from misiug/NewClusterScripts (which was based on misiug/SpaceMicePOC) onto current main. The branch was lagging far enough that non-slurm upstream features had landed that duplicated much of its work (custom-queries plumbing, analyze-tables check, benchmark config output). Only the slurm/cluster-specific layer is carried forward here; non-slurm duplicates are dropped in favor of upstream. New scripts: - defaults.env: cluster-wide env vars (workspace, scratch, node list) - enroot-decompress.sh: gzip/zstd auto-detect decompressor for .sqsh - pull_ghcr_image.sh: GHCR -> .sqsh conversion on compute node - run_interactive.sh: one-shot interactive shell with container - gen-tpch-data.slurm + launch-gen-data.sh: TPC-H parquet generation - run-analyze-tables.{sh,slurm} + launch-analyze-tables.sh: table analysis - run-sweep.sh: node x scale-factor sweep wired to post_results.py - get_avg_col.sh, get_luke_col.sh: awk helpers for cli.log parsing Updated scripts: functions.sh (metadata injection, collect_results, config/metastore setup, worker orchestration), launch-run.sh, run-presto-benchmarks.{sh,slurm}, run_multiple.sh, README.md. Validation wiring is deliberately absent: PR #275 (misiug/validationSplit) introduces the upstream validate_results.py and plumbs it through run_benchmark.sh/post_results.py. Once that merges, slurm scripts will be rebased to call the upstream flow rather than carrying their own validation. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- presto/slurm/presto-nvl72/README.md | 12 +- presto/slurm/presto-nvl72/defaults.env | 32 ++ .../slurm/presto-nvl72/enroot-decompress.sh | 27 ++ presto/slurm/presto-nvl72/functions.sh | 356 ++++++++++++++++-- presto/slurm/presto-nvl72/gen-tpch-data.slurm | 64 ++++ presto/slurm/presto-nvl72/get_avg_col.sh | 13 + presto/slurm/presto-nvl72/get_luke_col.sh | 13 + .../presto-nvl72/launch-analyze-tables.sh | 129 +++++++ presto/slurm/presto-nvl72/launch-gen-data.sh | 89 +++++ presto/slurm/presto-nvl72/launch-run.sh | 64 +++- presto/slurm/presto-nvl72/pull_ghcr_image.sh | 97 +++++ .../slurm/presto-nvl72/run-analyze-tables.sh | 89 +++++ .../presto-nvl72/run-analyze-tables.slurm | 107 ++++++ .../presto-nvl72/run-presto-benchmarks.sh | 12 +- .../presto-nvl72/run-presto-benchmarks.slurm | 34 +- presto/slurm/presto-nvl72/run-sweep.sh | 99 +++++ presto/slurm/presto-nvl72/run_interactive.sh | 23 ++ presto/slurm/presto-nvl72/run_multiple.sh | 4 +- 18 files changed, 1193 insertions(+), 71 deletions(-) create mode 100644 presto/slurm/presto-nvl72/defaults.env create mode 100755 presto/slurm/presto-nvl72/enroot-decompress.sh create mode 100644 presto/slurm/presto-nvl72/gen-tpch-data.slurm create mode 100755 presto/slurm/presto-nvl72/get_avg_col.sh create mode 100755 presto/slurm/presto-nvl72/get_luke_col.sh create mode 100755 presto/slurm/presto-nvl72/launch-analyze-tables.sh create mode 100755 presto/slurm/presto-nvl72/launch-gen-data.sh create mode 100755 presto/slurm/presto-nvl72/pull_ghcr_image.sh create mode 100755 presto/slurm/presto-nvl72/run-analyze-tables.sh create mode 100755 presto/slurm/presto-nvl72/run-analyze-tables.slurm create mode 100755 presto/slurm/presto-nvl72/run-sweep.sh create mode 100755 presto/slurm/presto-nvl72/run_interactive.sh diff --git a/presto/slurm/presto-nvl72/README.md b/presto/slurm/presto-nvl72/README.md index 007f3bcd..b2fb24e3 100644 --- a/presto/slurm/presto-nvl72/README.md +++ 
b/presto/slurm/presto-nvl72/README.md @@ -56,7 +56,7 @@ Key variables: - NUM_ITERATIONS: required by the job; launcher defaults to 1 (`-i/--iterations` to override) - NUM_NODES: derived from Slurm allocation; provided via `-n/--nodes` to launcher - REPO_ROOT: auto-detected from script location -- LOGS_DIR: `${SCRIPT_DIR}/logs` by default (log files are timestamped; old logs archived to `logs/archive/`) +- LOGS: `${SCRIPT_DIR}/logs` by default - IMAGE_DIR, DATA, CONFIGS: see below or override via environment if needed Other defaults: @@ -82,10 +82,10 @@ squeue -u $USER # Monitor job output tail -f presto-tpch-run_n_sf_i_.out -# Check logs during execution (filenames include a run timestamp) -tail -f logs/coord_*.log -tail -f logs/cli_*.log -tail -f logs/worker_0_*.log +# Check logs during execution +tail -f logs/coord.log +tail -f logs/cli.log +tail -f logs/worker_0.log ``` ## Coordinator IP and Web UI @@ -124,7 +124,7 @@ Results are saved to: ### Coordinator fails to start Check coordinator logs: ```bash -cat logs/coord_*.log +cat logs/coord.log ``` ### Workers not registering diff --git a/presto/slurm/presto-nvl72/defaults.env b/presto/slurm/presto-nvl72/defaults.env new file mode 100644 index 00000000..9a8e9c1b --- /dev/null +++ b/presto/slurm/presto-nvl72/defaults.env @@ -0,0 +1,32 @@ +#!/bin/bash +# ============================================================================== +# Cluster & path defaults for presto-nvl72 benchmark scripts. +# ============================================================================== +# All variables use the := pattern: export a variable before sourcing this file +# (or before running a script that sources it) to override any default. +# +# Multiple checkouts of velox-testing are supported — each checkout gets its own +# scratch/output namespace via VT_WORKSPACE, derived from the repo path. +# Override VT_WORKSPACE to use a custom prefix instead. 
+# ==============================================================================
+
+# --- Workspace identifier (per-clone isolation) ---
+_vt_path="${VT_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." 2>/dev/null && pwd)}"
+: "${VT_WORKSPACE:=$(echo "${_vt_path#"${HOME}"/}" | tr '/' '-')}"  # HOME is quoted so it is stripped literally, not matched as a glob pattern
+unset _vt_path
+
+# --- Scratch-based paths ---
+: "${DATA:=/scratch/${USER}/tpch-rs-float-no-delta}"
+: "${IMAGE_DIR:=/scratch/${USER}/images/presto}"
+: "${EXPECTED_RESULTS_BASE:=/scratch/${USER}/tpch-rs-no-delta-expected}"
+
+# --- Output paths ---
+: "${IMAGES_DIR:=${HOME}/${VT_WORKSPACE}/images}"
+: "${RESULTS_BASE:=${HOME}/${VT_WORKSPACE}/results}"
+
+# --- Shared filesystem paths (not per-user, but overridable) ---
+: "${HIVE_METASTORE_SOURCE:=/mnt/data/tpch-rs/HIVE-METASTORE-MG-260313}"
+
+# --- SLURM node defaults (cluster-specific) ---
+: "${DEFAULT_NODELIST:=presto-gb200-gcn-[01-13,15-16]}"
+: "${DEFAULT_SINGLE_NODE:=presto-gb200-gcn-01}"
diff --git a/presto/slurm/presto-nvl72/enroot-decompress.sh b/presto/slurm/presto-nvl72/enroot-decompress.sh
new file mode 100755
index 00000000..fbe1038a
--- /dev/null
+++ b/presto/slurm/presto-nvl72/enroot-decompress.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Auto-detecting decompressor for enroot image layer downloads.
+# Used as ENROOT_GZIP_PROGRAM to support both gzip and OCI tar+zstd layers.
+# Called by enroot as: enroot-decompress.sh -d -f -c (args are ignored) + +tmp=$(mktemp) +trap 'rm -f "$tmp"' EXIT + +# Peek at first 4 bytes to detect compression format without buffering the full stream +dd bs=1 count=4 2>/dev/null > "$tmp" +magic=$(od -A n -N 4 -t x1 "$tmp" | tr -d ' \n') + +case "$magic" in + 1f8b*) + # gzip + { cat "$tmp"; cat; } | gzip -d -f -c + ;; + 28b52ffd*) + # zstd (magic: 0xFD2FB528 stored little-endian = 28 b5 2f fd) + { cat "$tmp"; cat; } | zstd -d -f -c + ;; + *) + # Unknown format — pass through unchanged + cat "$tmp" + cat + ;; +esac diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 19605321..3b3ef1ba 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -4,25 +4,24 @@ # Validates job preconditions and assigns default values for presto execution. function setup { - [ -z "$SLURM_JOB_NAME" ] && echo "required argument '--job-name' not specified" && exit 1 - [ -z "$SLURM_JOB_ACCOUNT" ] && echo "required argument '--account' not specified" && exit 1 - [ -z "$SLURM_JOB_PARTITION" ] && echo "required argument '--partition' not specified" && exit 1 - [ -z "$SLURM_NNODES" ] && echo "required argument '--nodes' not specified" && exit 1 + [ -z "${SLURM_JOB_NAME:-}" ] && echo "required argument '--job-name' not specified" && exit 1 + [ -z "${SLURM_JOB_ACCOUNT:-}" ] && echo "warning: '--account' not specified" + [ -z "${SLURM_JOB_PARTITION:-}" ] && echo "warning: '--partition' not specified" + [ -z "${SLURM_NNODES:-}" ] && echo "required argument '--nodes' not specified" && exit 1 [ -z "$IMAGE_DIR" ] && echo "IMAGE_DIR must be set" && exit 1 - [ -z "$LOGS_DIR" ] && echo "LOGS_DIR must be set" && exit 1 - [ -z "$SERVER_START_TIMESTAMP" ] && echo "SERVER_START_TIMESTAMP must be set" && exit 1 + [ -z "$LOGS" ] && echo "LOGS must be set" && exit 1 [ -z "$CONFIGS" ] && echo "CONFIGS must be set" && exit 1 [ -z "$NUM_NODES" ] && echo "NUM_NODES must be 
set" && exit 1 [ -z "$NUM_GPUS_PER_NODE" ] && echo "NUM_GPUS_PER_NODE env variable must be set" && exit 1 [ ! -d "$VT_ROOT" ] && echo "VT_ROOT must be a valid directory" && exit 1 [ ! -d "$DATA" ] && echo "DATA must be a valid directory" && exit 1 - if [ ! -d ${VT_ROOT}/.hive_metastore ]; then - echo "Copying hive metastore from data source." - copy_hive_metastore - else - echo "Hive metastore already exists. Reusing." - fi + #if [ ! -d ${VT_ROOT}/.hive_metastore ]; then + # echo "Copying hive metastore from data source." + # copy_hive_metastore + #else + # echo "Hive metastore already exists. Reusing." + #fi [ ! -d ${VT_ROOT}/.hive_metastore/tpchsf${SCALE_FACTOR} ] && echo "Schema for SF ${SCALE_FACTOR} does not exist in hive metastore." && exit 1 @@ -59,46 +58,70 @@ function validate_environment_preconditions { # Execute script through the coordinator image (used for coordinator and cli executables) function run_coord_image { [ $# -ne 2 ] && echo_error "$0 expected one argument for '