diff --git a/benchmark_reporting_tools/post_results.py b/benchmark_reporting_tools/post_results.py index 318466fc..178fc0c7 100644 --- a/benchmark_reporting_tools/post_results.py +++ b/benchmark_reporting_tools/post_results.py @@ -93,10 +93,12 @@ class BenchmarkMetadata: kind: str | None = None execution_number: int = 1 worker_count: int | None = None + node_count: int | None = None scale_factor: int | None = None gpu_count: int | None = None num_drivers: int | None = None gpu_name: str | None = None + image_digest: str | None = None @classmethod def from_parsed(cls, raw: dict) -> "BenchmarkMetadata": @@ -259,8 +261,9 @@ def _parse_args() -> argparse.Namespace: ) parser.add_argument( "--identifier-hash", - help="Unique identifier hash for software environment (e.g. a container image digest).", - required=True, + default=None, + help="Unique identifier hash for software environment (e.g. a container image digest). " + "If omitted, the image_digest from benchmark_result.json context is used.", ) parser.add_argument( "--version", @@ -299,6 +302,26 @@ def _parse_args() -> argparse.Namespace: help="Benchmark definition name", required=True, ) + parser.add_argument( + "--velox-branch", + default=None, + help="Velox branch used to build the worker image.", + ) + parser.add_argument( + "--velox-repo", + default=None, + help="Velox repository used to build the worker image.", + ) + parser.add_argument( + "--presto-branch", + default=None, + help="Presto branch used to build the worker image.", + ) + parser.add_argument( + "--presto-repo", + default=None, + help="Presto repository used to build the worker image.", + ) parser.add_argument( "--concurrency-streams", help="Number of concurrency streams to use for the benchmark run", @@ -355,6 +378,10 @@ def _build_submission_payload( is_official: bool, asset_ids: list[int] | None = None, concurrency_streams: int = 1, + velox_branch: str | None = None, + velox_repo: str | None = None, + presto_branch: str | None = None, + 
presto_repo: str | None = None, ) -> dict: """Build a BenchmarkSubmission payload from parsed dataclasses. @@ -449,6 +476,16 @@ def _query_sort_key(name: str): if v is not None } + engine_config_payload = engine_config.serialize() if engine_config else {} + if velox_branch or velox_repo or presto_branch or presto_repo: + engine_config_payload = { + **engine_config_payload, + "velox_branch": velox_branch, + "velox_repo": velox_repo, + "presto_branch": presto_branch, + "presto_repo": presto_repo, + } + return { "sku_name": sku_name, "storage_configuration_name": storage_configuration_name, @@ -461,11 +498,11 @@ def _query_sort_key(name: str): "commit_hash": commit_hash, }, "run_at": benchmark_metadata.timestamp.isoformat(), - "node_count": 1, + "node_count": benchmark_metadata.node_count or 1, "gpu_count": benchmark_metadata.gpu_count or 0, "query_logs": query_logs, "concurrency_streams": concurrency_streams, - "engine_config": engine_config.serialize() if engine_config else {}, + "engine_config": engine_config_payload, "extra_info": extra_info, "is_official": is_official, "asset_ids": asset_ids, @@ -550,7 +587,7 @@ async def _process_benchmark_dir( storage_configuration_name: str, cache_state: str, engine_name: str | None, - identifier_hash: str, + identifier_hash: str | None, version: str | None, commit_hash: str | None, is_official: bool, @@ -563,6 +600,10 @@ async def _process_benchmark_dir( concurrency_streams: int = 1, config_dir: Path | None = None, logs_dir: Path | None = None, + velox_branch: str | None = None, + velox_repo: str | None = None, + presto_branch: str | None = None, + presto_repo: str | None = None, ) -> int: """Process a benchmark directory and post results to API. @@ -589,6 +630,18 @@ async def _process_benchmark_dir( print(f" Error loading metadata: {e}", file=sys.stderr) return 1 + # Fall back to the container image_digest captured in the benchmark + # results context when no explicit identifier_hash was provided on the CLI. 
+ if identifier_hash is None: + identifier_hash = benchmark_metadata.image_digest + if identifier_hash is None: + print( + " Error: --identifier-hash was not provided and benchmark_result.json " + "context has no image_digest to fall back to.", + file=sys.stderr, + ) + return 1 + # Resolve config directory: explicit override → auto-detect from variant effective_config_dir = config_dir variant = _ENGINE_TO_VARIANT.get(benchmark_metadata.engine) @@ -669,6 +722,10 @@ async def _process_benchmark_dir( is_official=is_official, asset_ids=asset_ids, concurrency_streams=concurrency_streams, + velox_branch=velox_branch, + velox_repo=velox_repo, + presto_branch=presto_branch, + presto_repo=presto_repo, ) except Exception as e: print(f" Error building payload for '{bench_name}': {e}", file=sys.stderr) @@ -747,6 +804,10 @@ async def main() -> int: concurrency_streams=args.concurrency_streams, config_dir=Path(args.config_dir) if args.config_dir else None, logs_dir=Path(args.logs_dir) if args.logs_dir else None, + velox_branch=args.velox_branch, + velox_repo=args.velox_repo, + presto_branch=args.presto_branch, + presto_repo=args.presto_repo, ) return result diff --git a/presto/docker/config/template/etc_coordinator/config_native.properties b/presto/docker/config/template/etc_coordinator/config_native.properties index 3e6ad0dc..dfd73fc4 100644 --- a/presto/docker/config/template/etc_coordinator/config_native.properties +++ b/presto/docker/config/template/etc_coordinator/config_native.properties @@ -58,7 +58,7 @@ query.execution-policy=phased # Kill queries based on total reservation on blocked nodes to recover memory. query.low-memory-killer.policy=total-reservation-on-blocked-nodes # Upper limit on query wall time to keep tests bounded. -query.max-execution-time=10m +query.max-execution-time=60m # Keep metadata of up to 1000 queries for UI and debugging. query.max-history=1000 # Memory quotas per node and cluster to protect stability. 
diff --git a/presto/scripts/generate_presto_config.sh b/presto/scripts/generate_presto_config.sh index 611b4393..25f39cb7 100755 --- a/presto/scripts/generate_presto_config.sh +++ b/presto/scripts/generate_presto_config.sh @@ -135,7 +135,10 @@ EOF fi if [[ "${VARIANT_TYPE}" == "cpu" ]]; then - echo "cluster-tag=native-cpu" >>${COORD_CONFIG} + echo "cluster-tag=native-cpu" >> ${COORD_CONFIG} + # cuDF has no effect in CPU mode but leaving cudf.enabled=true in the worker + # config causes noisy startup warnings; force it off for CPU runs. + sed -i 's/^cudf\.enabled=true/cudf.enabled=false/' ${WORKER_CONFIG} fi # for Java variant, disable some Parquet properties which are now rejected @@ -162,7 +165,7 @@ fi # We want to propagate any changes from the original worker config to the new worker configs even if # we did not re-generate the configs. -if [[ -n "$NUM_WORKERS" && "$VARIANT_TYPE" == "gpu" ]]; then +if [[ -n "$NUM_WORKERS" ]]; then if [[ -n ${GPU_IDS:-} ]]; then WORKER_IDS=($(echo "$GPU_IDS" | tr ',' ' ')) else diff --git a/presto/slurm/presto-nvl72/README.md b/presto/slurm/presto-nvl72/README.md index 007f3bcd..1d71508f 100644 --- a/presto/slurm/presto-nvl72/README.md +++ b/presto/slurm/presto-nvl72/README.md @@ -6,72 +6,124 @@ This directory contains scripts for running Presto TPC-H benchmarks on CoreWeave ``` presto-nvl72/ -├── run-presto-benchmarks.slurm # Main slurm job script with configuration -├── run-presto-benchmarks.sh # Execution script -├── launch-run.sh # Convenience launcher -├── functions.sh # Presto helper functions -├── echo_helpers.sh # Logging helpers -├── logs/ # Execution logs -└── result_dir/ # Benchmark results +├── defaults.env # Cluster-specific path defaults (override via env) +├── functions.sh # Presto helper functions +├── echo_helpers.sh # Logging helpers +├── enroot-decompress.sh # Auto-detecting decompressor for enroot image pulls +│ +├── pull_ghcr_image.sh # Pull a GHCR image and save as .sqsh +│ +├── launch-run.sh # Submit a 
benchmark run job +├── run-presto-benchmarks.slurm # SLURM job script for benchmarks +├── run-presto-benchmarks.sh # Benchmark execution logic +│ +├── launch-analyze-tables.sh # Submit an analyze-tables job +├── run-analyze-tables.slurm # SLURM job script for ANALYZE TABLE +├── run-analyze-tables.sh # Analyze-tables execution logic +│ +├── launch-gen-data.sh # Submit a TPC-H data generation job +├── gen-tpch-data.slurm # SLURM job script for data generation +│ +├── run-sweep.sh # Run benchmark + post results for a sweep of configs +├── run_interactive.sh # Start an interactive Presto session +│ +├── logs/ # Execution logs +└── result_dir/ # Benchmark results ``` ## Quick Start -### Running the benchmark via launcher (recommended) +### 1. Pull container images + +Images must be pre-pulled as `.sqsh` files before running benchmarks: ```bash cd presto/slurm/presto-nvl72 -./launch-run.sh -n -s [-i ] [additional sbatch options] +./pull_ghcr_image.sh ghcr.io/rapidsai/velox-testing-images: +./pull_ghcr_image.sh ghcr.io/rapidsai/velox-testing-images: +``` + +Images are saved to `${IMAGE_DIR}` (default: `/scratch/${USER}/images/presto`). + +### 2. Generate TPC-H data (one-time per scale factor) + +```bash +# Pull the tpchgen-cli image first +./pull_ghcr_image.sh ghcr.io/rapidsai/velox-testing-images:tpchgen-cli + +./launch-gen-data.sh -s -o +``` + +### 3. Analyze tables (one-time per scale factor / image version) + +```bash +./launch-analyze-tables.sh -s -n \ + -w -c +``` + +### 4. 
Run benchmarks + +```bash +./launch-run.sh -n <nodes> -s <scale_factor> \ + -w <worker_image> -c <coord_image> \ + [-i <iterations>] # examples -./launch-run.sh -n 8 -s 3000 -./launch-run.sh -n 4 -s 10000 -i 3 --partition gpu --account myacct +./launch-run.sh -n 8 -s 3000 \ + -w presto-native-worker-gpu-v1 -c presto-coordinator-v1 + +./launch-run.sh -n 4 -s 10000 -i 3 \ + -w presto-native-worker-gpu-v1 -c presto-coordinator-v1 \ + --partition gpu --account myacct ``` The launcher: -- requires node count (-n/--nodes) and scale factor (-s/--scale-factor) -- accepts optional iterations (-i/--iterations, default 1) -- embeds nodes/SF/iterations in .out/.err filenames -- prints the first node’s hostname/IP when allocated and a ready-to-run SSH port-forward command to access the Presto Web UI on your machine (http://localhost:9200) +- requires node count (`-n/--nodes`), scale factor (`-s/--scale-factor`), worker image (`-w/--worker-image`), and coordinator image (`-c/--coord-image`) +- accepts optional iterations (`-i/--iterations`, default 2) +- embeds nodes/SF/iterations in `.out`/`.err` filenames +- prints a ready-to-run SSH port-forward command to access the Presto Web UI at http://localhost:9200 -### Submitting directly (advanced) +### 5. Run a sweep ```bash -export SCALE_FACTOR=3000 -export NUM_ITERATIONS=1 -sbatch --nodes 8 \ - --output "presto-tpch-run_n8_sf3000_i1_%j.out" \ - --error "presto-tpch-run_n8_sf3000_i1_%j.err" \ - --export "ALL,SCALE_FACTOR=${SCALE_FACTOR},NUM_ITERATIONS=${NUM_ITERATIONS}" \ - run-presto-benchmarks.slurm +./run-sweep.sh \ + --sku-name raplab-gb200-nvl72 \ + --storage-configuration-name <storage_configuration_name> \ + --velox-branch <velox_branch> \ + --presto-branch <presto_branch> \ + --velox-repo <velox_repo> \ + --presto-repo <presto_repo> \ + [-n "8 4"] \ + [-s "3000 10000"] \ + [-i <iterations>] ``` -## Configuration +`--cache-state` is derived automatically: `lukewarm` for 1 iteration, `warm` for 2+. Pass `--cache-state` explicitly to override. -Primary configuration is passed via the launcher flags and environment. The `.slurm` script validates that required variables are set. 
+## Configuration -Key variables: +### Key variables (set via launcher flags or environment) -- SCALE_FACTOR: required (provided via `-s/--scale-factor`) -- NUM_ITERATIONS: required by the job; launcher defaults to 1 (`-i/--iterations` to override) -- NUM_NODES: derived from Slurm allocation; provided via `-n/--nodes` to launcher -- REPO_ROOT: auto-detected from script location -- LOGS_DIR: `${SCRIPT_DIR}/logs` by default (log files are timestamped; old logs archived to `logs/archive/`) -- IMAGE_DIR, DATA, CONFIGS: see below or override via environment if needed +| Variable | Source | Description | +|---|---|---| +| `SCALE_FACTOR` | `-s/--scale-factor` | TPC-H scale factor (required) | +| `NODES_COUNT` | `-n/--nodes` | Number of SLURM nodes (required) | +| `WORKER_IMAGE` | `-w/--worker-image` | Worker image name, without `.sqsh` (required) | +| `COORD_IMAGE` | `-c/--coord-image` | Coordinator image name, without `.sqsh` (required) | +| `NUM_ITERATIONS` | `-i/--iterations` | Benchmark iterations (default: 2) | +| `NUM_GPUS_PER_NODE` | `-g/--num-gpus-per-node` | GPUs per node (default: 4) | -Other defaults: -- WORKER_IMAGE: `presto-native-worker-gpu` -- NUM_GPUS_PER_NODE: `4` -- DATA: `/mnt/data/tpch-rs` -- IMAGE_DIR: `/mnt/data/images/presto` -- CONFIGS: `${REPO_ROOT}/presto/docker/config/generated/gpu` +### Path defaults (`defaults.env`) -### SBATCH Directives +Override any of these by exporting before running: -- **Time limit**: 1 hour (adjust `--time` if needed) -- **Node allocation**: Full node (144 CPUs, 4 GPUs, exclusive) -- **Memory**: All available (`--mem=0`) -- `--nodes`, `--output`, and `--error` are passed by the launcher instead of being embedded in the `.slurm` file. 
+| Variable | Default | Description | +|---|---|---| +| `DATA` | `/scratch/${USER}/tpch-rs-float-no-delta` | TPC-H parquet dataset root | +| `IMAGE_DIR` | `/scratch/${USER}/images/presto` | Directory containing `.sqsh` image files | +| `RESULTS_BASE` | `${HOME}/${VT_WORKSPACE}/results` | Benchmark result output root | +| `HIVE_METASTORE_SHARED_ROOT` | `/scratch/${USER}/shared_hive_metadata` | Shared pre-analyzed metastore snapshots | +| `HIVE_METASTORE_VERSION` | `HIVE-METASTORE-20260419-no-delta` | Metastore snapshot version tag | ## Monitoring @@ -80,21 +132,18 @@ Other defaults: squeue -u $USER # Monitor job output -tail -f presto-tpch-run_n_sf_i_.out +tail -f presto-tpch-run_n_sf_i_.out -# Check logs during execution (filenames include a run timestamp) -tail -f logs/coord_*.log -tail -f logs/cli_*.log -tail -f logs/worker_0_*.log +# Check logs during execution +tail -f logs/coord.log +tail -f logs/cli.log +tail -f logs/worker_0.log ``` -## Coordinator IP and Web UI +## Coordinator Web UI -After submission, the launcher waits until nodes are allocated, then prints: -- the first node’s hostname/IP -- an SSH port-forward command you can run locally to access the Presto Web UI - -Example output snippet: +After submission, the launcher waits until nodes are allocated, then prints an SSH +port-forward command you can run locally: ```text Run this command on a machine to get access to the webUI: @@ -102,45 +151,65 @@ Run this command on a machine to get access to the webUI: The UI will be available at http://localhost:9200 ``` -## Results +## Reusing an analyzed Hive metastore across runs + +Running `ANALYZE TABLE` from scratch on every clone is expensive. The launchers +publish and consume pre-analyzed metastore snapshots keyed by version string and +scale factor. 
+ +Two env vars control sharing (defined in `defaults.env`): -Results are saved to: -- **Logs**: `logs/` directory -- **CSV Summary**: `result_dir/summary.csv` -- **Historical Results**: `${REPO_ROOT}/benchmark-storage/YYYY/MM/DD/` +- `HIVE_METASTORE_SHARED_ROOT` — directory on a cluster-visible filesystem where + snapshots live. +- `HIVE_METASTORE_VERSION` — version tag. Bump it when the worker image or parquet + data format changes so stale snapshots don't leak into runs against a newer image. + Unset to disable sharing entirely. -## Prerequisites +Layout: `$HIVE_METASTORE_SHARED_ROOT/$HIVE_METASTORE_VERSION/tpchsf<SF>/…` -1. **Container images** must exist in `${IMAGE_DIR}`: - - `presto-coordinator.sqsh` - - `presto-native-worker-gpu.sqsh` or `presto-native-worker-cpu.sqsh` +### Consuming — the default path -2. **Data directory** must be accessible at `${DATA}` (will be mounted in containers) +With defaults, a benchmark run just works: -3. **velox-testing repo** will be auto-cloned to `${REPO_ROOT}/velox-testing` if not present +```bash +./launch-run.sh -n 2 -s 3000 -i 1 -w <worker_image> -c <coord_image> +``` + +`setup` in the SLURM job populates `.hive_metastore/tpchsf<SF>/` from the shared +snapshot when the local copy is absent. If neither local nor shared is available the +run fails fast with a message pointing at `launch-analyze-tables.sh`. + +### Publishing — run once per (version, SF) to seed a new slot + +```bash +export HIVE_METASTORE_VERSION=HIVE-METASTORE-20260419-no-delta # or a new tag +./launch-analyze-tables.sh -s <scale_factor> -n <nodes> -w <worker_image> -c <coord_image> +# On success, if the target slot is empty it gets populated atomically. +# Subsequent analyze runs with the same (version, SF) skip the publish. 
+``` + +### Disabling sharing (fall back to per-clone analyze) + +```bash +unset HIVE_METASTORE_VERSION +./launch-analyze-tables.sh -s -n -w -c +./launch-run.sh -n -s -i -w -c +``` ## Troubleshooting ### Coordinator fails to start -Check coordinator logs: ```bash -cat logs/coord_*.log +cat logs/coord.log ``` ### Workers not registering -Check worker logs: ```bash cat logs/worker_*.log ``` ### Image not found -Verify images exist: -```bash -ls -lh /mnt/data/images/presto/*.sqsh -``` - -### Data directory issues -Verify data path is accessible: +Pull the image first: ```bash -ls -la /mnt/data/tpch-presto +./pull_ghcr_image.sh ghcr.io/rapidsai/velox-testing-images: ``` diff --git a/presto/slurm/presto-nvl72/defaults.env b/presto/slurm/presto-nvl72/defaults.env new file mode 100644 index 00000000..adfc7a93 --- /dev/null +++ b/presto/slurm/presto-nvl72/defaults.env @@ -0,0 +1,35 @@ +#!/bin/bash +# ============================================================================== +# Cluster & path defaults for presto-nvl72 benchmark scripts. +# ============================================================================== +# All variables use the := pattern: export a variable before sourcing this file +# (or before running a script that sources it) to override any default. +# +# Multiple checkouts of velox-testing are supported — each checkout gets its own +# scratch/output namespace via VT_WORKSPACE, derived from the repo path. +# Override VT_WORKSPACE to use a custom prefix instead. +# ============================================================================== + +# --- Workspace identifier (per-clone isolation) --- +_vt_path="${VT_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." 2>/dev/null && pwd)}" +: "${VT_WORKSPACE:=$(echo "${_vt_path#${HOME}/}" | tr '/' '-')}" +unset _vt_path + +# --- Scratch-based paths --- +# DATA points at the canonical TPC-H parquet tree on this cluster. +# Override DATA in the env to point at a different layout. 
+: "${DATA:=/scratch/${USER}/tpch-rs-float-no-delta}" +: "${IMAGE_DIR:=/scratch/${USER}/images/presto}" +: "${EXPECTED_RESULTS_BASE:=/scratch/${USER}/tpch-rs-no-delta-expected}" + +# --- Output paths --- +: "${RESULTS_BASE:=${HOME}/${VT_WORKSPACE}/results}" + +# --- Shared Hive metastore --- +# Pre-analyzed metastores are published here by launch-analyze-tables.sh and +# consumed by launch-run.sh, keyed by HIVE_METASTORE_VERSION + scale factor. +# Default version string points at the current cluster-wide published snapshot +# (SFs 1000/3000/10000/30000 against DATA above). Bump when the worker image +# or parquet encoding changes, or unset to disable sharing. +: "${HIVE_METASTORE_SHARED_ROOT:=/scratch/${USER}/shared_hive_metadata}" +: "${HIVE_METASTORE_VERSION:=HIVE-METASTORE-20260419-no-delta}" diff --git a/presto/slurm/presto-nvl72/enroot-decompress.sh b/presto/slurm/presto-nvl72/enroot-decompress.sh new file mode 100755 index 00000000..8cac4212 --- /dev/null +++ b/presto/slurm/presto-nvl72/enroot-decompress.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Auto-detecting decompressor for enroot image layer downloads. +# Used as ENROOT_GZIP_PROGRAM to support both gzip and OCI tar+zstd layers. 
+# Called by enroot as: enroot-decompress.sh -d -f -c (args are ignored) + +tmp=$(mktemp) +trap 'rm -f "$tmp"' EXIT + +# Peek at first 4 bytes to detect compression format without buffering the full stream +dd bs=1 count=4 2>/dev/null > "$tmp" +magic=$(od -A n -N 4 -t x1 "$tmp" | tr -d ' \n') + +case "$magic" in + 1f8b*) + # gzip + { cat "$tmp"; cat; } | gzip -d -f -c + ;; + 28b52ffd*) + # zstd (magic: 0xFD2FB528 stored little-endian = 28 b5 2f fd) + { cat "$tmp"; cat; } | zstd -d -f -c + ;; + *) + # Unknown format — pass through unchanged + cat "$tmp" + cat + ;; +esac diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 19605321..9a66009c 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -4,27 +4,33 @@ # Validates job preconditions and assigns default values for presto execution. function setup { - [ -z "$SLURM_JOB_NAME" ] && echo "required argument '--job-name' not specified" && exit 1 - [ -z "$SLURM_JOB_ACCOUNT" ] && echo "required argument '--account' not specified" && exit 1 - [ -z "$SLURM_JOB_PARTITION" ] && echo "required argument '--partition' not specified" && exit 1 - [ -z "$SLURM_NNODES" ] && echo "required argument '--nodes' not specified" && exit 1 + [ -z "${SLURM_JOB_NAME:-}" ] && echo "required argument '--job-name' not specified" && exit 1 + [ -z "${SLURM_JOB_ACCOUNT:-}" ] && echo "warning: '--account' not specified" + [ -z "${SLURM_JOB_PARTITION:-}" ] && echo "warning: '--partition' not specified" + [ -z "${SLURM_NNODES:-}" ] && echo "required argument '--nodes' not specified" && exit 1 [ -z "$IMAGE_DIR" ] && echo "IMAGE_DIR must be set" && exit 1 - [ -z "$LOGS_DIR" ] && echo "LOGS_DIR must be set" && exit 1 - [ -z "$SERVER_START_TIMESTAMP" ] && echo "SERVER_START_TIMESTAMP must be set" && exit 1 + [ -z "$LOGS" ] && echo "LOGS must be set" && exit 1 [ -z "$CONFIGS" ] && echo "CONFIGS must be set" && exit 1 [ -z "$NUM_NODES" ] && echo "NUM_NODES must be 
set" && exit 1 [ -z "$NUM_GPUS_PER_NODE" ] && echo "NUM_GPUS_PER_NODE env variable must be set" && exit 1 [ ! -d "$VT_ROOT" ] && echo "VT_ROOT must be a valid directory" && exit 1 [ ! -d "$DATA" ] && echo "DATA must be a valid directory" && exit 1 - if [ ! -d ${VT_ROOT}/.hive_metastore ]; then - echo "Copying hive metastore from data source." - copy_hive_metastore - else - echo "Hive metastore already exists. Reusing." + # If sharing is opted in (HIVE_METASTORE_VERSION set) and the local + # snapshot is missing, try to populate it from the shared location. + if [[ -n "${HIVE_METASTORE_VERSION:-}" && ! -d "${VT_ROOT}/.hive_metastore/tpchsf${SCALE_FACTOR}" ]]; then + populate_hive_metastore_from_shared fi - [ ! -d ${VT_ROOT}/.hive_metastore/tpchsf${SCALE_FACTOR} ] && echo "Schema for SF ${SCALE_FACTOR} does not exist in hive metastore." && exit 1 + if [ ! -d "${VT_ROOT}/.hive_metastore/tpchsf${SCALE_FACTOR}" ]; then + echo "Schema for SF ${SCALE_FACTOR} is not present in ${VT_ROOT}/.hive_metastore." + if [[ -n "${HIVE_METASTORE_VERSION:-}" ]]; then + echo "Shared slot $(shared_metastore_slot) is also empty; publish one by running launch-analyze-tables.sh with the same HIVE_METASTORE_VERSION set." + else + echo "Run launch-analyze-tables.sh -s ${SCALE_FACTOR} first, or set HIVE_METASTORE_VERSION to consume a pre-published snapshot." + fi + exit 1 + fi generate_configs @@ -59,46 +65,66 @@ function validate_environment_preconditions { # Execute script through the coordinator image (used for coordinator and cli executables) function run_coord_image { [ $# -ne 2 ] && echo_error "$0 expected one argument for '