-
Notifications
You must be signed in to change notification settings - Fork 19
Updated Slurm Scripts #313
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
81f711b
3238142
5203575
ec58e17
d8f5505
151a578
13e1071
9ab7a2d
160c361
0d3b578
677e5c3
c28013d
a6ac563
4084e50
f3c4b8a
86cc385
1acd024
77e7195
18f0e59
5b07779
ec1ec4b
79adceb
02725ad
48a09d8
1675daf
f5978f2
b6b2f41
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -93,10 +93,12 @@ class BenchmarkMetadata: | |
| kind: str | None = None | ||
| execution_number: int = 1 | ||
| worker_count: int | None = None | ||
| node_count: int | None = None | ||
| scale_factor: int | None = None | ||
| gpu_count: int | None = None | ||
| num_drivers: int | None = None | ||
| gpu_name: str | None = None | ||
| image_digest: str | None = None | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe, by default, the images are deleted after 30 days. Is this information that we want to persist in the database?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right now the Benchmarking DB requires an identifier for run submissions and we default to the image_digest. I think even if the image is deleted this is useful for identifying an image uniquely. |
||
|
|
||
| @classmethod | ||
| def from_parsed(cls, raw: dict) -> "BenchmarkMetadata": | ||
|
|
@@ -259,8 +261,9 @@ def _parse_args() -> argparse.Namespace: | |
| ) | ||
| parser.add_argument( | ||
| "--identifier-hash", | ||
| help="Unique identifier hash for software environment (e.g. a container image digest).", | ||
| required=True, | ||
| default=None, | ||
| help="Unique identifier hash for software environment (e.g. a container image digest). " | ||
| "If omitted, the image_digest from benchmark_result.json context is used.", | ||
| ) | ||
| parser.add_argument( | ||
| "--version", | ||
|
|
@@ -299,6 +302,26 @@ def _parse_args() -> argparse.Namespace: | |
| help="Benchmark definition name", | ||
| required=True, | ||
| ) | ||
| parser.add_argument( | ||
| "--velox-branch", | ||
| default=None, | ||
| help="Velox branch used to build the worker image.", | ||
| ) | ||
| parser.add_argument( | ||
| "--velox-repo", | ||
| default=None, | ||
| help="Velox repository used to build the worker image.", | ||
| ) | ||
| parser.add_argument( | ||
| "--presto-branch", | ||
| default=None, | ||
| help="Presto branch used to build the worker image.", | ||
| ) | ||
| parser.add_argument( | ||
| "--presto-repo", | ||
| default=None, | ||
| help="Presto repository used to build the worker image.", | ||
| ) | ||
| parser.add_argument( | ||
| "--concurrency-streams", | ||
| help="Number of concurrency streams to use for the benchmark run", | ||
|
|
@@ -355,6 +378,10 @@ def _build_submission_payload( | |
| is_official: bool, | ||
| asset_ids: list[int] | None = None, | ||
| concurrency_streams: int = 1, | ||
| velox_branch: str | None = None, | ||
| velox_repo: str | None = None, | ||
| presto_branch: str | None = None, | ||
| presto_repo: str | None = None, | ||
| ) -> dict: | ||
| """Build a BenchmarkSubmission payload from parsed dataclasses. | ||
|
|
||
|
|
@@ -449,6 +476,16 @@ def _query_sort_key(name: str): | |
| if v is not None | ||
| } | ||
|
|
||
| engine_config_payload = engine_config.serialize() if engine_config else {} | ||
| if velox_branch or velox_repo or presto_branch or presto_repo: | ||
| engine_config_payload = { | ||
| **engine_config_payload, | ||
| "velox_branch": velox_branch, | ||
| "velox_repo": velox_repo, | ||
| "presto_branch": presto_branch, | ||
| "presto_repo": presto_repo, | ||
| } | ||
|
|
||
| return { | ||
| "sku_name": sku_name, | ||
| "storage_configuration_name": storage_configuration_name, | ||
|
|
@@ -461,11 +498,11 @@ def _query_sort_key(name: str): | |
| "commit_hash": commit_hash, | ||
| }, | ||
| "run_at": benchmark_metadata.timestamp.isoformat(), | ||
| "node_count": 1, | ||
| "node_count": benchmark_metadata.node_count or 1, | ||
| "gpu_count": benchmark_metadata.gpu_count or 0, | ||
| "query_logs": query_logs, | ||
| "concurrency_streams": concurrency_streams, | ||
| "engine_config": engine_config.serialize() if engine_config else {}, | ||
| "engine_config": engine_config_payload, | ||
| "extra_info": extra_info, | ||
| "is_official": is_official, | ||
| "asset_ids": asset_ids, | ||
|
|
@@ -550,7 +587,7 @@ async def _process_benchmark_dir( | |
| storage_configuration_name: str, | ||
| cache_state: str, | ||
| engine_name: str | None, | ||
| identifier_hash: str, | ||
| identifier_hash: str | None, | ||
| version: str | None, | ||
| commit_hash: str | None, | ||
| is_official: bool, | ||
|
|
@@ -563,6 +600,10 @@ async def _process_benchmark_dir( | |
| concurrency_streams: int = 1, | ||
| config_dir: Path | None = None, | ||
| logs_dir: Path | None = None, | ||
| velox_branch: str | None = None, | ||
| velox_repo: str | None = None, | ||
| presto_branch: str | None = None, | ||
| presto_repo: str | None = None, | ||
| ) -> int: | ||
| """Process a benchmark directory and post results to API. | ||
|
|
||
|
|
@@ -589,6 +630,18 @@ async def _process_benchmark_dir( | |
| print(f" Error loading metadata: {e}", file=sys.stderr) | ||
| return 1 | ||
|
|
||
| # Fall back to the container image_digest captured in the benchmark | ||
| # results context when no explicit identifier_hash was provided on the CLI. | ||
| if identifier_hash is None: | ||
| identifier_hash = benchmark_metadata.image_digest | ||
| if identifier_hash is None: | ||
| print( | ||
| " Error: --identifier-hash was not provided and benchmark_result.json " | ||
| "context has no image_digest to fall back to.", | ||
| file=sys.stderr, | ||
| ) | ||
| return 1 | ||
|
|
||
| # Resolve config directory: explicit override → auto-detect from variant | ||
| effective_config_dir = config_dir | ||
| variant = _ENGINE_TO_VARIANT.get(benchmark_metadata.engine) | ||
|
|
@@ -669,6 +722,10 @@ async def _process_benchmark_dir( | |
| is_official=is_official, | ||
| asset_ids=asset_ids, | ||
| concurrency_streams=concurrency_streams, | ||
| velox_branch=velox_branch, | ||
| velox_repo=velox_repo, | ||
| presto_branch=presto_branch, | ||
| presto_repo=presto_repo, | ||
| ) | ||
| except Exception as e: | ||
| print(f" Error building payload for '{bench_name}': {e}", file=sys.stderr) | ||
|
|
@@ -747,6 +804,10 @@ async def main() -> int: | |
| concurrency_streams=args.concurrency_streams, | ||
| config_dir=Path(args.config_dir) if args.config_dir else None, | ||
| logs_dir=Path(args.logs_dir) if args.logs_dir else None, | ||
| velox_branch=args.velox_branch, | ||
| velox_repo=args.velox_repo, | ||
| presto_branch=args.presto_branch, | ||
| presto_repo=args.presto_repo, | ||
| ) | ||
|
|
||
| return result | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it make sense to also capture the number of workers per node?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure there's much use for a "number of workers per node" field since there is already a "number of workers" and "number of nodes"? Unless there was some use-case for an uneven number of workers per node (not currently a supported use-case), then I think that value should always be derivable from the existing information.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For instance, if we have 6 workers and 2 nodes, is there a guarantee that each node would have 3 workers, or can it be 4 workers on one node and 2 on another node (assuming each node has 4 GPUs)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right now it's guaranteed to be an even split. In CPU-mode you always get one worker per node. In GPU-mode you always get
NUM_GPUS_PER_NODEworkers per node (default 4).The API to the launch script requires you to specify number of nodes (-n) and optionally number of gpus (workers) per node (-g). So the number of workers is dynamically calculated based on this and guarantees an even worker split among nodes.