diff --git a/src/cloudai/models/scenario.py b/src/cloudai/models/scenario.py index 276cac2e5..751915095 100644 --- a/src/cloudai/models/scenario.py +++ b/src/cloudai/models/scenario.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -108,7 +108,7 @@ def tdef_model_dump(self, by_alias: bool) -> dict: "extra_env_vars": self.extra_env_vars if self.extra_env_vars else None, "cmd_args": self.cmd_args.model_dump(by_alias=by_alias) if self.cmd_args else None, "git_repos": [repo.model_dump() for repo in self.git_repos] if self.git_repos else None, - "nsys": self.nsys.model_dump() if self.nsys else None, + "nsys": self.nsys.model_dump(exclude_unset=True) if self.nsys else None, } return {k: v for k, v in data.items() if v is not None} diff --git a/tests/test_test_scenario.py b/tests/test_test_scenario.py index c2af1373b..9e305d093 100644 --- a/tests/test_test_scenario.py +++ b/tests/test_test_scenario.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
class TestNsysMerging:
    """Check how a scenario's ``[Tests.nsys]`` table combines with the nsys settings of the base test definition."""

    @staticmethod
    def _register_nccl(parser: TestScenarioParser, **base_nsys) -> None:
        """Replace the parser's test mapping with a single ``nccl`` definition.

        Keyword arguments, when present, are forwarded to ``NsysConfiguration`` and attached to the
        definition. With no keyword arguments the definition is built without an ``nsys`` argument at all
        (it is omitted, not passed as ``None``).
        """
        optional = {}
        if base_nsys:
            from cloudai.core import NsysConfiguration

            optional["nsys"] = NsysConfiguration(**base_nsys)

        parser.test_mapping = {
            "nccl": NCCLTestDefinition(
                name="nccl",
                description="desc",
                test_template_name="NcclTest",
                cmd_args=NCCLCmdArgs(docker_image_url="fake://url/nccl"),
                **optional,
            )
        }

    @staticmethod
    def _resolve_first(parser: TestScenarioParser, scenario: dict):
        """Validate ``scenario`` as a ``TestScenarioModel`` and run ``_prepare_tdef`` on its first test."""
        parsed = TestScenarioModel.model_validate(scenario)
        return parser._prepare_tdef(parsed.tests[0])

    def test_nsys_partial_override_preserves_base_config(
        self, test_scenario_parser: TestScenarioParser, slurm_system: SlurmSystem
    ):
        """Overriding only ``output`` must keep every other nsys field from the base definition."""
        self._register_nccl(
            test_scenario_parser,
            enable=True,
            nsys_binary="/custom/nsys",
            output="/base/output",
            trace="cuda,nvtx",
            sample="cpu",
        )
        tdef = self._resolve_first(
            test_scenario_parser,
            toml.loads(
                """
                name = "test"

                [[Tests]]
                id = "1"
                test_name = "nccl"

                [Tests.nsys]
                output = "/scenario/output"
                """
            ),
        )

        merged = tdef.nsys
        assert merged is not None
        # Only the field named in the scenario changes ...
        assert merged.output == "/scenario/output"
        # ... while the remaining fields still carry the base values.
        assert merged.nsys_binary == "/custom/nsys"
        assert merged.trace == "cuda,nvtx"
        assert merged.sample == "cpu"
        assert merged.enable is True

    def test_nsys_multiple_fields_override(self, test_scenario_parser: TestScenarioParser, slurm_system: SlurmSystem):
        """Several scenario fields can be overridden at once without disturbing the untouched ones."""
        self._register_nccl(
            test_scenario_parser,
            enable=True,
            nsys_binary="/base/nsys",
            output="/base/output",
            trace="cuda",
            force_overwrite=False,
        )
        tdef = self._resolve_first(
            test_scenario_parser,
            toml.loads(
                """
                name = "test"

                [[Tests]]
                id = "1"
                test_name = "nccl"

                [Tests.nsys]
                output = "/new/output"
                force_overwrite = true
                """
            ),
        )

        merged = tdef.nsys
        assert merged is not None
        # Overridden by the scenario.
        assert merged.output == "/new/output"
        assert merged.force_overwrite is True
        # Inherited from the base definition.
        assert merged.nsys_binary == "/base/nsys"
        assert merged.trace == "cuda"
        assert merged.enable is True

    def test_nsys_scenario_adds_to_base_without_nsys(
        self, test_scenario_parser: TestScenarioParser, slurm_system: SlurmSystem
    ):
        """A scenario may introduce nsys settings when the base definition has none."""
        # No keyword arguments: the base definition is created without any nsys config.
        self._register_nccl(test_scenario_parser)
        tdef = self._resolve_first(
            test_scenario_parser,
            toml.loads(
                """
                name = "test"

                [[Tests]]
                id = "1"
                test_name = "nccl"

                [Tests.nsys]
                output = "/scenario/output"
                trace = "cuda,nvtx"
                """
            ),
        )

        merged = tdef.nsys
        assert merged is not None
        assert merged.output == "/scenario/output"
        assert merged.trace == "cuda,nvtx"
        # Fields the scenario did not mention; presumably these are the NsysConfiguration defaults.
        assert merged.enable is True
        assert merged.nsys_binary == "nsys"

    def test_nsys_disable_override(self, test_scenario_parser: TestScenarioParser, slurm_system: SlurmSystem):
        """A scenario can switch nsys off while the base ``output`` stays in place."""
        self._register_nccl(test_scenario_parser, enable=True, output="/base/output")
        tdef = self._resolve_first(
            test_scenario_parser,
            toml.loads(
                """
                name = "test"

                [[Tests]]
                id = "1"
                test_name = "nccl"

                [Tests.nsys]
                enable = false
                """
            ),
        )

        merged = tdef.nsys
        assert merged is not None
        assert merged.enable is False
        assert merged.output == "/base/output"