From 14d1cffbd36c54c18685bfed278dfb84dd2e266c Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 2 May 2025 09:22:37 -0500 Subject: [PATCH 01/18] initial commit for gen_on_manager -> gen_on_worker --- docs/FAQ.rst | 5 --- docs/data_structures/libE_specs.rst | 12 ++---- docs/platforms/aurora.rst | 22 +---------- docs/platforms/perlmutter.rst | 20 ---------- docs/platforms/platforms_index.rst | 22 ++--------- docs/running_libE.rst | 30 -------------- docs/tutorials/executor_forces_tutorial.rst | 38 ------------------ docs/tutorials/gpcam_tutorial.rst | 39 ++++++++++++------- libensemble/manager.py | 16 ++++---- libensemble/specs.py | 6 +-- .../test_GPU_gen_resources.py | 8 ++-- .../test_evaluate_existing_plus_gen.py | 3 +- .../test_persistent_uniform_sampling.py | 4 +- .../test_evaluate_mixed_sample.py | 1 - .../test_persistent_gp_multitask_ax.py | 5 +-- 15 files changed, 50 insertions(+), 181 deletions(-) diff --git a/docs/FAQ.rst b/docs/FAQ.rst index b8a3ea2ce5..0339dbb681 100644 --- a/docs/FAQ.rst +++ b/docs/FAQ.rst @@ -13,11 +13,6 @@ We recommend using the following options to help debug workflows:: logger.set_level("DEBUG") libE_specs["safe_mode"] = True -To make it easier to debug a generator try setting the **libE_specs** option ``gen_on_manager``. -To do so, add the following to your calling script:: - - libE_specs["gen_on_manager"] = True - With this, ``pdb`` breakpoints can be set as usual in the generator. For more debugging options see "How can I debug specific libEnsemble processes?" below. diff --git a/docs/data_structures/libE_specs.rst b/docs/data_structures/libE_specs.rst index c0ca141403..6ecac6e582 100644 --- a/docs/data_structures/libE_specs.rst +++ b/docs/data_structures/libE_specs.rst @@ -9,12 +9,7 @@ libEnsemble is primarily customized by setting options within a ``LibeSpecs`` cl from libensemble.specs import LibeSpecs - specs = LibeSpecs( - gen_on_manager=True, - save_every_k_gens=100, - sim_dirs_make=True, - nworkers=4 - ) + specs = LibeSpecs(save_every_k_gens=100, sim_dirs_make=True, nworkers=4) .. dropdown:: Settings by Category :open: @@ -31,9 +26,8 @@ libEnsemble is primarily customized by setting options within a ``LibeSpecs`` cl **nworkers** [int]: Number of worker processes in ``"local"``, ``"threads"``, or ``"tcp"``. - **gen_on_manager** [bool] = False - Instructs Manager process to run generator functions. - This generator function can access/modify user objects by reference. + **gen_on_worker** [bool] = False + Instructs Worker process to run generator instead of Manager. **mpi_comm** [MPI communicator] = ``MPI.COMM_WORLD``: libEnsemble MPI communicator. diff --git a/docs/platforms/aurora.rst b/docs/platforms/aurora.rst index af2e7cc160..4865ba0c18 100644 --- a/docs/platforms/aurora.rst +++ b/docs/platforms/aurora.rst @@ -57,7 +57,7 @@ simulations for each worker: .. code-block:: python # Instruct libEnsemble to exit after this many simulations - ensemble.exit_criteria = ExitCriteria(sim_max=nsim_workers*2) + ensemble.exit_criteria = ExitCriteria(sim_max=nsim_workers * 2) Now grab an interactive session on two nodes (or use the batch script at ``../submission_scripts/submit_pbs_aurora.sh``):: @@ -115,26 +115,6 @@ will use one GPU tile):: python run_libe_forces.py -n 25 -Running generator on the manager --------------------------------- - -An alternative is to run the generator on a thread on the manager. The -number of workers can then be set to the number of simulation workers. - -Change the ``libE_specs`` in **run_libe_forces.py** as follows: - -.. code-block:: python - - nsim_workers = ensemble.nworkers - - # Persistent gen does not need resources - ensemble.libE_specs = LibeSpecs( - gen_on_manager=True, - -then we can run with 12 (instead of 13) workers:: - - python run_libe_forces.py -n 12 - Dynamic resource assignment --------------------------- diff --git a/docs/platforms/perlmutter.rst b/docs/platforms/perlmutter.rst index 88d3f808b2..a2768e1d26 100644 --- a/docs/platforms/perlmutter.rst +++ b/docs/platforms/perlmutter.rst @@ -105,26 +105,6 @@ To see GPU usage, ssh into the node you are on in another window and run:: watch -n 0.1 nvidia-smi -Running generator on the manager -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -An alternative is to run the generator on a thread on the manager. The -number of workers can then be set to the number of simulation workers. - -Change the ``libE_specs`` in **run_libe_forces.py** as follows. - - .. code-block:: python - - nsim_workers = ensemble.nworkers - - # Persistent gen does not need resources - ensemble.libE_specs = LibeSpecs( - gen_on_manager=True, - -and run with:: - - python run_libe_forces.py -n 4 - To watch video ^^^^^^^^^^^^^^ diff --git a/docs/platforms/platforms_index.rst b/docs/platforms/platforms_index.rst index c06cdbe6fd..eca8ab9d58 100644 --- a/docs/platforms/platforms_index.rst +++ b/docs/platforms/platforms_index.rst @@ -24,10 +24,6 @@ simulation worker, and libEnsemble will distribute user applications across the node allocation. This is the **most common approach** where each simulation runs an MPI application. -The generator will run on a worker by default, but if running a single generator, -the :ref:`libE_specs` option **gen_on_manager** is recommended, -which runs the generator on the manager (using a thread) as below. - .. list-table:: :widths: 60 40 @@ -35,15 +31,6 @@ which runs the generator on the manager (using a thread) as below. :alt: centralized :scale: 55 - - In calling script: - - .. code-block:: python - :linenos: - - ensemble.libE_specs = LibeSpecs( - gen_on_manager=True, - ) - A SLURM batch script may include: .. code-block:: bash @@ -52,7 +39,9 @@ which runs the generator on the manager (using a thread) as below. python run_libe_forces.py --nworkers 3 -When using **gen_on_manager**, set ``nworkers`` to the number of workers desired for running simulations. +If running multiple generator processes instead, then set the +:ref:`libE_specs` option **gen_on_worker** so that multiple +worker processes can run multiple generator instances. Dedicated Mode ^^^^^^^^^^^^^^ @@ -87,8 +76,6 @@ remaining nodes in the allocation. python run_libe_forces.py --nworkers 3 -Note that **gen_on_manager** is not set in the above example. - Distributed Running ------------------- @@ -137,8 +124,7 @@ Zero-resource workers --------------------- Users with persistent ``gen_f`` functions may notice that the persistent workers -are still automatically assigned system resources. This can be resolved by using -the ``gen_on_manager`` option or by +are still automatically assigned system resources. This can be resolved by :ref:`fixing the number of resource sets`. Assigning GPUs diff --git a/docs/running_libE.rst b/docs/running_libE.rst index ae658e31c6..80af301f3f 100644 --- a/docs/running_libE.rst +++ b/docs/running_libE.rst @@ -12,13 +12,6 @@ determine the parameters/inputs for simulations. Simulator functions run and manage simulations, which often involve running a user application (see :doc:`Executor`). -.. note:: - As of version 1.3.0, the generator can be run as a thread on the manager, - using the :ref:`libE_specs` option **gen_on_manager**. - When using this option, set the number of workers desired for running - simulations. See :ref:`Running generator on the manager` - for more details. - To use libEnsemble, you will need a calling script, which in turn will specify generator and simulator functions. Many :doc:`examples` are available. @@ -162,29 +155,6 @@ If this example was run as:: No simulations will be able to run. -.. _gen-on-manager: - -Running generator on the manager --------------------------------- - -The majority of libEnsemble use cases run a single generator. The -:ref:`libE_specs` option **gen_on_manager** will cause -the generator function to run on a thread on the manager. This can run -persistent user functions, sharing data structures with the manager, and avoids -additional communication to a generator running on a worker. When using this -option, the number of workers specified should be the (maximum) number of -concurrent simulations. - -If modifying a workflow to use ``gen_on_manager`` consider the following. - -* Set ``nworkers`` to the number of workers desired for running simulations. -* If using :meth:`add_unique_random_streams()` - to seed random streams, the default generator seed will be zero. -* If you have a line like ``libE_specs["nresource_sets"] = nworkers -1``, this - line should be removed. -* If the generator does use resources, ``nresource_sets`` can be increased as needed - so that the generator and all simulations are resourced. - Environment Variables --------------------- diff --git a/docs/tutorials/executor_forces_tutorial.rst b/docs/tutorials/executor_forces_tutorial.rst index e01496734b..a083aa2a82 100644 --- a/docs/tutorials/executor_forces_tutorial.rst +++ b/docs/tutorials/executor_forces_tutorial.rst @@ -336,44 +336,6 @@ These may require additional browsing of the documentation to complete. ... -Running the generator on the manager ------------------------------------- - -As of version 1.3.0, the generator can be run on a thread on the manager, -using the :ref:`libE_specs` option **gen_on_manager**. - -Change the libE_specs as follows. - - .. code-block:: python - :linenos: - :lineno-start: 28 - - nsim_workers = ensemble.nworkers - - # Persistent gen does not need resources - ensemble.libE_specs = LibeSpecs( - gen_on_manager=True, - sim_dirs_make=True, - ensemble_dir_path="./test_executor_forces_tutorial", - ) - -When running set ``nworkers`` to the number of workers desired for running simulations. -E.g., Instead of: - -.. code-block:: bash - - python run_libe_forces.py --nworkers 5 - -use: - -.. code-block:: bash - - python run_libe_forces.py --nworkers 4 - -Note that as the generator random number seed will be zero instead of one, the checksum will change. - -For more information see :ref:`Running generator on the manager`. - Running forces application with input file ------------------------------------------ diff --git a/docs/tutorials/gpcam_tutorial.rst b/docs/tutorials/gpcam_tutorial.rst index a013c1b67e..096d5584ca 100644 --- a/docs/tutorials/gpcam_tutorial.rst +++ b/docs/tutorials/gpcam_tutorial.rst @@ -30,6 +30,7 @@ This version (and others) of the gpCAM generator can be found at `libensemble/ge from libensemble.message_numbers import EVAL_GEN_TAG, FINISHED_PERSISTENT_GEN_TAG, PERSIS_STOP, STOP_TAG from libensemble.tools.persistent_support import PersistentSupport + def persistent_gpCAM(H_in, persis_info, gen_specs, libE_info): """Run a batched gpCAM model to create a surrogate""" @@ -156,6 +157,7 @@ For running applications using parallel resources in the simulator see the `forc # Define our simulation function import numpy as np + def six_hump_camel(H, persis_info, sim_specs, _): """Six-Hump Camel sim_f.""" @@ -189,6 +191,8 @@ First we will create a cleanup script so we can easily re-run. # To rerun this notebook, we need to delete the ensemble directory. import shutil + + def cleanup(): try: shutil.rmtree("ensemble") @@ -218,31 +222,30 @@ If you wish to make your own functions based on the above, those can be imported nworkers = 4 - # When using gen_on_manager, nworkers is number of concurrent sims. # final_gen_send means the last evaluated points are returned to the generator to update the model. - libE_specs = LibeSpecs(nworkers=nworkers, gen_on_manager=True, final_gen_send=True) + libE_specs = LibeSpecs(nworkers=nworkers, final_gen_send=True) n = 2 # Input dimensions batch_size = 4 num_batches = 6 gen_specs = GenSpecs( - gen_f=persistent_gpCAM, # Generator function - persis_in=["f"], # Objective, defined in sim, is returned to gen + gen_f=persistent_gpCAM, # Generator function + persis_in=["f"], # Objective, defined in sim, is returned to gen outputs=[("x", float, (n,))], # Parameters (name, type, size) user={ "batch_size": batch_size, "lb": np.array([-2, -1]), # lower boundaries for n dimensions - "ub": np.array([2, 1]), # upper boundaries for n dimensions - "ask_max_iter": 5, # Number of iterations for ask (default 20) + "ub": np.array([2, 1]), # upper boundaries for n dimensions + "ask_max_iter": 5, # Number of iterations for ask (default 20) "rng_seed": 0, }, ) sim_specs = SimSpecs( - sim_f=six_hump_camel, # Simulator function - inputs=["x"], # Input field names. "x" defined in gen - outputs=[("f", float)], # Objective + sim_f=six_hump_camel, # Simulator function + inputs=["x"], # Input field names. "x" defined in gen + outputs=[("f", float)], # Objective ) # Starts one persistent generator. Simulated values are returned in batch. @@ -251,7 +254,7 @@ If you wish to make your own functions based on the above, those can be imported user={"async_return": False}, # False = batch returns ) - exit_criteria = ExitCriteria(sim_max=num_batches*batch_size) + exit_criteria = ExitCriteria(sim_max=num_batches * batch_size) # Initialize and run the ensemble. ensemble = Ensemble( @@ -272,7 +275,7 @@ At the end of our calling script we run the ensemble. H, persis_info, flag = ensemble.run() # Start the ensemble. Blocks until completion. ensemble.save_output("H_array", append_attrs=False) # Save H (history of all evaluated points) to file - pprint(H[["sim_id", "x", "f"]][:16]) # See first 16 results + pprint(H[["sim_id", "x", "f"]][:16]) # See first 16 results Rerun and test model at known points ------------------------------------ @@ -312,15 +315,21 @@ values at the test points. markersize = 10 plt.figure(figsize=(10, 5)) plt.plot( - num_sims, mse, marker="^", markeredgecolor="black", markeredgewidth=2, - markersize=markersize, linewidth=2, label="Mean squared error" + num_sims, + mse, + marker="^", + markeredgecolor="black", + markeredgewidth=2, + markersize=markersize, + linewidth=2, + label="Mean squared error", ) plt.xticks(num_sims) # Labeling the axes and the legend - plt.title('Mean Squared Error at test points') + plt.title("Mean Squared Error at test points") plt.xlabel("Number of simulations") - plt.ylabel('Mean squared error (rad$^2$)') + plt.ylabel("Mean squared error (rad$^2$)") legend = plt.legend(framealpha=1, edgecolor="black") # Increase edge width here plt.grid(True) plt.show() diff --git a/libensemble/manager.py b/libensemble/manager.py index c0cf02500e..149ab819b6 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -231,19 +231,19 @@ def __init__( (1, "stop_val", self.term_test_stop_val), ] - gen_on_manager = self.libE_specs.get("gen_on_manager", False) + gen_on_worker = self.libE_specs.get("gen_on_worker", False) - self.W = np.zeros(len(self.wcomms) + gen_on_manager, dtype=Manager.worker_dtype) - if gen_on_manager: + self.W = np.zeros(len(self.wcomms) + gen_on_worker, dtype=Manager.worker_dtype) + if gen_on_worker: + self.W["worker_id"] = np.arange(len(self.wcomms)) + 1 # [1, 2, 3, ...] + else: self.W["worker_id"] = np.arange(len(self.wcomms) + 1) # [0, 1, 2, ...] self.W[0]["gen_worker"] = True local_worker_comm = self._run_additional_worker(hist, sim_specs, gen_specs, libE_specs) self.wcomms = [local_worker_comm] + self.wcomms - else: - self.W["worker_id"] = np.arange(len(self.wcomms)) + 1 # [1, 2, 3, ...] - self.W = _WorkerIndexer(self.W, gen_on_manager) - self.wcomms = _WorkerIndexer(self.wcomms, gen_on_manager) + self.W = _WorkerIndexer(self.W, gen_on_worker) + self.wcomms = _WorkerIndexer(self.wcomms, gen_on_worker) temp_EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) self.resources = Resources.resources @@ -639,7 +639,7 @@ def _get_alloc_libE_info(self) -> dict: "use_resource_sets": self.use_resource_sets, "gen_num_procs": self.gen_num_procs, "gen_num_gpus": self.gen_num_gpus, - "gen_on_manager": self.libE_specs.get("gen_on_manager", False), + "gen_on_worker": self.libE_specs.get("gen_on_worker", False), } def _alloc_work(self, H: npt.NDArray, persis_info: dict) -> dict: diff --git a/libensemble/specs.py b/libensemble/specs.py index aa70018362..e2cc525636 100644 --- a/libensemble/specs.py +++ b/libensemble/specs.py @@ -182,10 +182,8 @@ class LibeSpecs(BaseModel): nworkers: Optional[int] = 0 """ Number of worker processes in ``"local"``, ``"threads"``, or ``"tcp"``.""" - gen_on_manager: Optional[bool] = False - """ Instructs Manager process to run generator functions. - This generator function can access/modify user objects by reference. - """ + gen_on_worker: Optional[bool] = False + """ Instructs Worker process to run generator instead of Manager.""" mpi_comm: Optional[Any] = None """ libEnsemble MPI communicator. Default: ``MPI.COMM_WORLD``""" diff --git a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py index d77088d7e4..48b7fee0d2 100644 --- a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py +++ b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py @@ -100,18 +100,18 @@ libE_specs["resource_info"] = {"cores_on_node": (nworkers * 2, nworkers * 4), "gpus_on_node": nworkers} base_libE_specs = libE_specs.copy() - for gen_on_manager in [False, True]: + for gen_on_worker in [False, True]: for run in range(5): # reset libE_specs = base_libE_specs.copy() - libE_specs["gen_on_manager"] = gen_on_manager + libE_specs["gen_on_worker"] = gen_on_worker persis_info = add_unique_random_streams({}, nworkers + 1) if run == 0: libE_specs["gen_num_procs"] = 2 elif run == 1: - if gen_on_manager: - print("SECOND LIBE CALL WITH GEN ON MANAGER") + if gen_on_worker: + print("SECOND LIBE CALL WITH GEN ON WORKER INSTEAD OF MANAGER") libE_specs["gen_num_gpus"] = 1 elif run == 2: persis_info["gen_num_gpus"] = 1 diff --git a/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py b/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py index fe3d8dad8e..7a16d70736 100644 --- a/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py +++ b/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py @@ -1,6 +1,6 @@ """ Test libEnsemble's capability to evaluate existing points and then generate -new samples via gen_on_manager. +new samples. Execute via one of the following commands (e.g. 3 workers): mpiexec -np 4 python test_evaluate_existing_sample.py @@ -43,7 +43,6 @@ def create_H0(persis_info, gen_specs, H0_size): if __name__ == "__main__": sampling = Ensemble(parse_args=True) - sampling.libE_specs.gen_on_manager = True sampling.sim_specs = SimSpecs(sim_f=sim_f, inputs=["x"], out=[("f", float)]) gen_specs = { diff --git a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py index 81a18a5285..643b4723d7 100644 --- a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py +++ b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py @@ -87,9 +87,9 @@ sim_specs["in"] = ["x", "obj_component"] # sim_specs["out"] = [("f", float), ("grad", float, n)] elif run == 3: - libE_specs["gen_on_manager"] = True + libE_specs["gen_on_worker"] = True elif run == 4: - libE_specs["gen_on_manager"] = False + libE_specs["gen_on_worker"] = False libE_specs["gen_workers"] = [2] # Perform the run diff --git a/libensemble/tests/regression_tests/test_evaluate_mixed_sample.py b/libensemble/tests/regression_tests/test_evaluate_mixed_sample.py index 481db84191..60e43fa57e 100644 --- a/libensemble/tests/regression_tests/test_evaluate_mixed_sample.py +++ b/libensemble/tests/regression_tests/test_evaluate_mixed_sample.py @@ -44,7 +44,6 @@ H0["sim_ended"][:500] = True sampling = Ensemble(parse_args=True) - sampling.libE_specs.gen_on_manager = True sampling.H0 = H0 sampling.sim_specs = SimSpecs(sim_f=sim_f, inputs=["x"], out=[("f", float)]) sampling.alloc_specs = AllocSpecs(alloc_f=alloc_f) diff --git a/libensemble/tests/regression_tests/test_persistent_gp_multitask_ax.py b/libensemble/tests/regression_tests/test_persistent_gp_multitask_ax.py index 8c589161ad..f88db4fe05 100644 --- a/libensemble/tests/regression_tests/test_persistent_gp_multitask_ax.py +++ b/libensemble/tests/regression_tests/test_persistent_gp_multitask_ax.py @@ -2,8 +2,6 @@ Example of multi-fidelity optimization using a persistent GP gen_func (calling Ax). -This test uses the gen_on_manager option (persistent generator runs on -a thread). Therefore nworkers is the number of simulation workers. Execute via one of the following commands: mpiexec -np 4 python test_persistent_gp_multitask_ax.py @@ -50,7 +48,7 @@ def run_simulation(H, persis_info, sim_specs, libE_info): z = 8 elif task == "cheap_model": z = 1 - print('in sim', task) + print("in sim", task) libE_output = np.zeros(1, dtype=sim_specs["out"]) calc_status = WORKER_DONE @@ -64,7 +62,6 @@ def run_simulation(H, persis_info, sim_specs, libE_info): # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": nworkers, is_manager, libE_specs, _ = parse_args() - libE_specs["gen_on_manager"] = True mt_params = { "name_hifi": "expensive_model", From ae40691b0bdc89b0a39747837917a062cf20d1f9 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 2 May 2025 09:24:35 -0500 Subject: [PATCH 02/18] fix --- docs/platforms/platforms_index.rst | 47 +++++++++++++----------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/docs/platforms/platforms_index.rst b/docs/platforms/platforms_index.rst index eca8ab9d58..843e65e5e6 100644 --- a/docs/platforms/platforms_index.rst +++ b/docs/platforms/platforms_index.rst @@ -24,20 +24,17 @@ simulation worker, and libEnsemble will distribute user applications across the node allocation. This is the **most common approach** where each simulation runs an MPI application. -.. list-table:: - :widths: 60 40 +.. image:: ../images/centralized_gen_on_manager.png + :alt: centralized + :scale: 55 - * - .. image:: ../images/centralized_gen_on_manager.png - :alt: centralized - :scale: 55 +A SLURM batch script may include: - A SLURM batch script may include: +.. code-block:: bash - .. code-block:: bash + #SBATCH --nodes 3 - #SBATCH --nodes 3 - - python run_libe_forces.py --nworkers 3 + python run_libe_forces.py --nworkers 3 If running multiple generator processes instead, then set the :ref:`libE_specs` option **gen_on_worker** so that multiple @@ -51,30 +48,28 @@ True, the MPI executor will not launch applications on nodes where libEnsemble P processes (manager and workers) are running. Workers launch applications onto the remaining nodes in the allocation. -.. list-table:: - :widths: 60 40 - * - .. image:: ../images/centralized_dedicated.png - :alt: centralized dedicated mode - :scale: 30 +.. image:: ../images/centralized_dedicated.png + :alt: centralized dedicated mode + :scale: 30 - - In calling script: +In calling script: - .. code-block:: python - :linenos: +.. code-block:: python + :linenos: - ensemble.libE_specs = LibeSpecs( - num_resource_sets=2, - dedicated_mode=True, - ) + ensemble.libE_specs = LibeSpecs( + num_resource_sets=2, + dedicated_mode=True, + ) - A SLURM batch script may include: +A SLURM batch script may include: - .. code-block:: bash +.. code-block:: bash - #SBATCH --nodes 3 + #SBATCH --nodes 3 - python run_libe_forces.py --nworkers 3 + python run_libe_forces.py --nworkers 3 Distributed Running ------------------- From d6cf1d3052408edaa0d7e6ae236566d88fc6f194 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 2 May 2025 10:04:31 -0500 Subject: [PATCH 03/18] additional note --- docs/overview_usecases.rst | 3 +++ docs/platforms/platforms_index.rst | 1 + 2 files changed, 4 insertions(+) diff --git a/docs/overview_usecases.rst b/docs/overview_usecases.rst index 56ad05b6c9..6d63ce8ff8 100644 --- a/docs/overview_usecases.rst +++ b/docs/overview_usecases.rst @@ -20,6 +20,9 @@ which perform computations via **user functions**: | +As of **v2.0** the **Manager** by default runs **a single generator**. This +is configurable. + The default allocator (``alloc_f``) instructs workers to run the simulator on the highest priority work from the generator. If a worker is idle and there is no work, that worker is instructed to call the generator. diff --git a/docs/platforms/platforms_index.rst b/docs/platforms/platforms_index.rst index 843e65e5e6..6daa319b94 100644 --- a/docs/platforms/platforms_index.rst +++ b/docs/platforms/platforms_index.rst @@ -59,6 +59,7 @@ In calling script: :linenos: ensemble.libE_specs = LibeSpecs( + gen_on_worker=True, num_resource_sets=2, dedicated_mode=True, ) From c54adb3d5063a6913fd9d85d3f5fe3e456f6ea19 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 2 May 2025 10:29:51 -0500 Subject: [PATCH 04/18] fixes, plus for test_manager_main we don't want to run the default worker0 during unit tests --- libensemble/manager.py | 7 ++++--- libensemble/tests/unit_tests/test_manager_main.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/libensemble/manager.py b/libensemble/manager.py index 149ab819b6..49ddebc8b3 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -232,8 +232,9 @@ def __init__( ] gen_on_worker = self.libE_specs.get("gen_on_worker", False) + len_W = len(self.wcomms) + 1 - gen_on_worker - self.W = np.zeros(len(self.wcomms) + gen_on_worker, dtype=Manager.worker_dtype) + self.W = np.zeros(len_W, dtype=Manager.worker_dtype) if gen_on_worker: self.W["worker_id"] = np.arange(len(self.wcomms)) + 1 # [1, 2, 3, ...] else: @@ -242,8 +243,8 @@ def __init__( local_worker_comm = self._run_additional_worker(hist, sim_specs, gen_specs, libE_specs) self.wcomms = [local_worker_comm] + self.wcomms - self.W = _WorkerIndexer(self.W, gen_on_worker) - self.wcomms = _WorkerIndexer(self.wcomms, gen_on_worker) + self.W = _WorkerIndexer(self.W, 1 - gen_on_worker) + self.wcomms = _WorkerIndexer(self.wcomms, 1 - gen_on_worker) temp_EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) self.resources = Resources.resources diff --git a/libensemble/tests/unit_tests/test_manager_main.py b/libensemble/tests/unit_tests/test_manager_main.py index 4e246eb570..e34bc76301 100644 --- a/libensemble/tests/unit_tests/test_manager_main.py +++ b/libensemble/tests/unit_tests/test_manager_main.py @@ -6,7 +6,7 @@ import libensemble.manager as man import libensemble.tests.unit_tests.setup as setup -libE_specs = {"comms": "local"} +libE_specs = {"comms": "local", "gen_on_worker": True} def test_term_test_1(): From dc2570d81c63a7d319f05e0ccd02ca09dfb9821d Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 13 Aug 2025 15:11:59 -0500 Subject: [PATCH 05/18] small fixes, comments, additional unit test for affirming alloc behavior with additional worker zero --- libensemble/manager.py | 4 +- .../test_allocation_funcs_and_support.py | 55 +++++++++++++++++++ libensemble/worker.py | 2 +- 3 files changed, 58 insertions(+), 3 deletions(-) diff --git a/libensemble/manager.py b/libensemble/manager.py index de36dc1319..fe6a4bd710 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -232,7 +232,7 @@ def __init__( ] gen_on_worker = self.libE_specs.get("gen_on_worker", False) - len_W = len(self.wcomms) + 1 - gen_on_worker + len_W = len(self.wcomms) + 1 - gen_on_worker # if gen_on_worker, len_W = len(self.wcomms) self.W = np.zeros(len_W, dtype=Manager.worker_dtype) if gen_on_worker: @@ -243,7 +243,7 @@ def __init__( local_worker_comm = self._run_additional_worker(hist, sim_specs, gen_specs, libE_specs) self.wcomms = [local_worker_comm] + self.wcomms - self.W = _WorkerIndexer(self.W, 1 - gen_on_worker) + self.W = _WorkerIndexer(self.W, 1 - gen_on_worker) # if gen on worker, then no additional worker self.wcomms = _WorkerIndexer(self.wcomms, 1 - gen_on_worker) temp_EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) diff --git a/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py b/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py index 6d056b1e01..9d1d45fece 100644 --- a/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py +++ b/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py @@ -34,6 +34,25 @@ ], ) +W_gen_mgr = np.array( + [ + (0, True, 0, 0, False, False), + (1, False, 0, 0, False, False), + (2, False, 0, 0, False, False), + (3, False, 0, 0, False, False), + (4, False, 0, 0, False, False), + ], + dtype=[ + ("worker_id", " Date: Wed, 13 Aug 2025 15:27:02 -0500 Subject: [PATCH 06/18] check worker zero for run_order for aposmm-ibcdfo-pounders --- .../regression_tests/test_persistent_aposmm_ibcdfo_pounders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_ibcdfo_pounders.py b/libensemble/tests/regression_tests/test_persistent_aposmm_ibcdfo_pounders.py index 7523704a0b..356b2017a9 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_ibcdfo_pounders.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_ibcdfo_pounders.py @@ -135,7 +135,7 @@ def synthetic_beamline_mapping(H, _, sim_specs): H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) if is_manager: - assert persis_info[1].get("run_order"), "Run_order should have been given back" + assert persis_info[0].get("run_order"), "Run_order should have been given back" assert flag == 0 save_libE_output(H, persis_info, __file__, nworkers) From a6fd67ce71a10ebcf6fd31cab65c12339652c368 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 13 Aug 2025 15:43:01 -0500 Subject: [PATCH 07/18] test_zero_resource_workers probably needs adjustment? --- .../tests/functionality_tests/test_zero_resource_workers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers.py b/libensemble/tests/functionality_tests/test_zero_resource_workers.py index c8f0786d06..4739c5bfb6 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers.py @@ -36,7 +36,7 @@ sim_app = "/path/to/fakeapp.x" comms = libE_specs["comms"] - libE_specs["zero_resource_workers"] = [1] + libE_specs["zero_resource_workers"] = [0] libE_specs["dedicated_mode"] = True libE_specs["enforce_worker_core_bounds"] = True From 3ce9ed4aeb40b7ef5f389b2776ec4720637119cd Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 11:20:15 -0500 Subject: [PATCH 08/18] fixes to accomodate the zeroth worker being a zero-resource worker --- libensemble/sim_funcs/run_line_check.py | 2 +- .../tests/functionality_tests/test_zero_resource_workers.py | 2 +- libensemble/tools/parse_args.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libensemble/sim_funcs/run_line_check.py b/libensemble/sim_funcs/run_line_check.py index 530b1e8d9b..96ef13a5da 100644 --- a/libensemble/sim_funcs/run_line_check.py +++ b/libensemble/sim_funcs/run_line_check.py @@ -22,7 +22,7 @@ def exp_nodelist_for_worker(exp_list, workerID, nodes_per_worker, persis_gens): node_list = comp.split(",") for node in node_list: node_name, node_num = node.split("-") - offset = workerID - (1 + persis_gens) + offset = workerID - (persis_gens) new_num = int(node_num) + int(nodes_per_worker * offset) new_node = "-".join([node_name, str(new_num)]) new_node_list.append(new_node) diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers.py b/libensemble/tests/functionality_tests/test_zero_resource_workers.py index 4739c5bfb6..d286aa7128 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers.py @@ -101,7 +101,7 @@ } alloc_specs = {"alloc_f": alloc_f} - persis_info = add_unique_random_streams({}, nworkers + 1) + persis_info = add_unique_random_streams({}, nworkers) exit_criteria = {"sim_max": (nsim_workers) * rounds} # Each worker has 2 nodes. Basic test list for portable options diff --git a/libensemble/tools/parse_args.py b/libensemble/tools/parse_args.py index 5e302c0ce0..c52b6f7c8f 100644 --- a/libensemble/tools/parse_args.py +++ b/libensemble/tools/parse_args.py @@ -73,7 +73,7 @@ def _mpi_parse_args(args): def _local_parse_args(args): """Parses arguments for forked processes using multiprocessing.""" libE_specs = {"comms": args.comms} - nworkers = args.nworkers + nworkers = args.nworkers + 1 # for manager if args.nresource_sets is not None: libE_specs["num_resource_sets"] = args.nresource_sets From 9c38da2e9ae9f3308456a23faef75a3d32068da9 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 11:21:52 -0500 Subject: [PATCH 09/18] fix the specified zero-resource worker --- .../functionality_tests/test_zero_resource_workers_subnode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py index 69ea2b559c..a7a57b584a 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py @@ -39,7 +39,7 @@ sim_app = "/path/to/fakeapp.x" comms = libE_specs["comms"] - libE_specs["zero_resource_workers"] = [1] + libE_specs["zero_resource_workers"] = [0] libE_specs["dedicated_mode"] = True libE_specs["enforce_worker_core_bounds"] = True @@ -100,7 +100,7 @@ } alloc_specs = {"alloc_f": alloc_f} - persis_info = add_unique_random_streams({}, nworkers + 1) + persis_info = add_unique_random_streams({}, nworkers) exit_criteria = {"sim_max": (nsim_workers) * rounds} # Each worker has 2 nodes. Basic test list for portable options From dc477724c8670af1b9491a1d2d23167584b5b0ce Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 12:01:38 -0500 Subject: [PATCH 10/18] tiny fixes --- ...daptive_workers_persistent_oversubscribe_rsets.py | 8 ++++---- .../functionality_tests/test_sim_dirs_per_worker.py | 12 ++++++------ .../test_zero_resource_workers_subnode.py | 2 +- libensemble/tools/parse_args.py | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/libensemble/tests/functionality_tests/test_runlines_adaptive_workers_persistent_oversubscribe_rsets.py b/libensemble/tests/functionality_tests/test_runlines_adaptive_workers_persistent_oversubscribe_rsets.py index fb730b966b..94bf1a387f 100644 --- a/libensemble/tests/functionality_tests/test_runlines_adaptive_workers_persistent_oversubscribe_rsets.py +++ b/libensemble/tests/functionality_tests/test_runlines_adaptive_workers_persistent_oversubscribe_rsets.py @@ -31,9 +31,9 @@ # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": nworkers, is_manager, libE_specs, _ = parse_args() - nsim_workers = nworkers - 1 + nsim_workers = nworkers - libE_specs["zero_resource_workers"] = [1] + libE_specs["zero_resource_workers"] = [0] rsets = nsim_workers * 2 libE_specs["num_resource_sets"] = rsets @@ -64,7 +64,7 @@ "persis_in": ["f", "x", "sim_id"], "out": [("priority", float), ("resource_sets", int), ("x", float, n), ("x_on_cube", float, n)], "user": { - "initial_batch_size": nworkers - 1, + "initial_batch_size": nworkers, "max_resource_sets": max_rsets, "lb": np.array([-3, -2]), "ub": np.array([3, 2]), @@ -91,7 +91,7 @@ "node_file": node_file, } # Name of file containing a node-list - persis_info = add_unique_random_streams({}, nworkers + 1) + persis_info = add_unique_random_streams({}, nworkers) exit_criteria = {"sim_max": 40, "wallclock_max": 300} # Perform the run diff --git a/libensemble/tests/functionality_tests/test_sim_dirs_per_worker.py b/libensemble/tests/functionality_tests/test_sim_dirs_per_worker.py index 69bb34ab84..a15ede9e92 100644 --- a/libensemble/tests/functionality_tests/test_sim_dirs_per_worker.py +++ b/libensemble/tests/functionality_tests/test_sim_dirs_per_worker.py @@ -23,14 +23,14 @@ from libensemble.tests.regression_tests.support import write_sim_func as sim_f from libensemble.tools import add_unique_random_streams, parse_args -nworkers, is_manager, libE_specs, _ = parse_args() +n_simworkers, is_manager, libE_specs, _ = parse_args() # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": sim_input_dir = "./sim_input_dir" dir_to_copy = sim_input_dir + "/copy_this" dir_to_symlink = sim_input_dir + "/symlink_this" - w_ensemble = "./ensemble_workdirs_w" + str(nworkers) + "_" + libE_specs.get("comms") + w_ensemble = "./ensemble_workdirs_w" + str(n_simworkers) + "_" + libE_specs.get("comms") print("creating ensemble dir: ", w_ensemble, flush=True) for dir in [sim_input_dir, dir_to_copy, dir_to_symlink]: @@ -60,7 +60,7 @@ }, } - persis_info = add_unique_random_streams({}, nworkers + 1) + persis_info = add_unique_random_streams({}, n_simworkers) exit_criteria = {"sim_max": 21} @@ -69,9 +69,9 @@ if is_manager: assert os.path.isdir(w_ensemble), f"Ensemble directory {w_ensemble} not created." worker_dir_sum = sum(["worker" in i for i in os.listdir(w_ensemble)]) - assert worker_dir_sum == nworkers, "Number of worker dirs ({}) does not match nworkers ({}).".format( - worker_dir_sum, nworkers - ) + assert ( + worker_dir_sum == n_simworkers + 1 + ), "Number of worker dirs ({}) does not match n_simworkers ({}).".format(worker_dir_sum, n_simworkers) input_copied = [] sim_dir_sum = 0 diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py index a7a57b584a..446142f5db 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py @@ -29,7 +29,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local -# TESTSUITE_NPROCS: 4 +# TESTSUITE_NPROCS: 3 # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": diff --git a/libensemble/tools/parse_args.py b/libensemble/tools/parse_args.py index c52b6f7c8f..5e302c0ce0 100644 --- a/libensemble/tools/parse_args.py +++ b/libensemble/tools/parse_args.py @@ -73,7 +73,7 @@ def _mpi_parse_args(args): def _local_parse_args(args): """Parses arguments for forked processes using multiprocessing.""" libE_specs = {"comms": args.comms} - nworkers = args.nworkers + 1 # for manager + nworkers = args.nworkers if args.nresource_sets is not None: libE_specs["num_resource_sets"] = args.nresource_sets From 46708391e8faac92faf1c086c1e7dbfe4b6ae8ad Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 14:08:11 -0500 Subject: [PATCH 11/18] fix run_order key in persis_info for handful of aposmm tests --- .../tests/regression_tests/test_persistent_aposmm_dfols.py | 2 +- .../tests/regression_tests/test_persistent_aposmm_periodic.py | 2 +- .../tests/regression_tests/test_persistent_aposmm_timeout.py | 2 +- .../tests/regression_tests/test_persistent_aposmm_with_grad.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py b/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py index 6e19930691..322f3d6633 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py @@ -102,7 +102,7 @@ def combine_component(x): H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) if is_manager: - assert persis_info[1].get("run_order"), "Run_order should have been given back" + assert persis_info[0].get("run_order"), "Run_order should have been given back" assert flag == 0 assert np.min(H["f"][H["sim_ended"]]) <= 3000, "Didn't find a value below 3000" diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_periodic.py b/libensemble/tests/regression_tests/test_persistent_aposmm_periodic.py index d99e8802a0..8cc215800d 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_periodic.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_periodic.py @@ -89,7 +89,7 @@ H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) if is_manager: - assert persis_info[1].get("run_order"), "Run_order should have been given back" + assert persis_info[0].get("run_order"), "Run_order should have been given back" min_ids = np.where(H["local_min"]) # The minima are known on this test problem. If the above [lb, ub] domain is diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_timeout.py b/libensemble/tests/regression_tests/test_persistent_aposmm_timeout.py index e61843fd71..e6014cbee3 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_timeout.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_timeout.py @@ -87,6 +87,6 @@ if is_manager: assert flag == 2, "Test should have timed out" - assert persis_info[1].get("run_order"), "Run_order should have been given back" + assert persis_info[0].get("run_order"), "Run_order should have been given back" min_ids = np.where(H["local_min"]) save_libE_output(H, persis_info, __file__, nworkers) diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py b/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py index f2d2f09cc0..75b3a582d9 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py @@ -121,7 +121,7 @@ H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs, H0=H0) if is_manager: - assert persis_info[1].get("run_order"), "Run_order should have been given back" + assert persis_info[0].get("run_order"), "Run_order should have been given back" assert ( len(persis_info[1]["run_order"]) >= gen_specs["user"]["stop_after_k_minima"] ), "This test should have many runs started." From befcecc2e9821df661f7f08c8248543bf1cb34fc Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 14:13:25 -0500 Subject: [PATCH 12/18] ditto --- .../test_persistent_uniform_sampling_nonblocking.py | 2 +- .../tests/regression_tests/test_persistent_aposmm_with_grad.py | 2 +- .../tests/regression_tests/test_persistent_fd_param_finder.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling_nonblocking.py b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling_nonblocking.py index 5425578849..b62cbe6b64 100644 --- a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling_nonblocking.py +++ b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling_nonblocking.py @@ -68,4 +68,4 @@ assert len(np.unique(H["gen_ended_time"])) == 2 save_libE_output(H, persis_info, __file__, nworkers) - assert persis_info[1]["spin_count"] > 0, "This should have been a nonblocking receive" + assert persis_info[0]["spin_count"] > 0, "This should have been a nonblocking receive" diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py b/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py index 75b3a582d9..48b1ae2ff6 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py @@ -123,7 +123,7 @@ if is_manager: assert persis_info[0].get("run_order"), "Run_order should have been given back" assert ( - len(persis_info[1]["run_order"]) >= gen_specs["user"]["stop_after_k_minima"] + len(persis_info[0]["run_order"]) >= gen_specs["user"]["stop_after_k_minima"] ), "This test should have many runs started." assert len(H) < exit_criteria["sim_max"], "Test should have stopped early due to 'stop_after_k_minima'" diff --git a/libensemble/tests/regression_tests/test_persistent_fd_param_finder.py b/libensemble/tests/regression_tests/test_persistent_fd_param_finder.py index ac01d5683b..de97470dc2 100644 --- a/libensemble/tests/regression_tests/test_persistent_fd_param_finder.py +++ b/libensemble/tests/regression_tests/test_persistent_fd_param_finder.py @@ -70,6 +70,6 @@ if fd_test.is_manager: assert len(H) < fd_test.exit_criteria.gen_max, "Problem didn't stop early, which should have been the case." - assert np.all(persis_info[1]["Fnoise"] > 0), "gen_f didn't find noise for all F_i components." + assert np.all(persis_info[0]["Fnoise"] > 0), "gen_f didn't find noise for all F_i components." fd_test.save_output(__file__) From 30403c70cfdbe6b366900235feebd367f0f784d3 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 15:36:44 -0500 Subject: [PATCH 13/18] fix workercount? --- .../functionality_tests/test_zero_resource_workers_subnode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py index 446142f5db..a7a57b584a 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py @@ -29,7 +29,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local -# TESTSUITE_NPROCS: 3 +# TESTSUITE_NPROCS: 4 # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": From 519930d37887c6069e1acfcb898e3b4ee1cf7b0f Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 15:55:26 -0500 Subject: [PATCH 14/18] trying to narrow down issue with this test... --- .../test_persistent_uniform_gen_decides_stop.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py b/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py index 68c8aaaa05..f6a1e5d57f 100644 --- a/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py +++ b/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py @@ -13,7 +13,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local -# TESTSUITE_NPROCS: 5 7 +# TESTSUITE_NPROCS: 3 5 # TESTSUITE_OS_SKIP: WIN import sys @@ -82,9 +82,7 @@ assert ( sum(counts == init_batch_size) >= ngens ), "The initial batch of each gen should be common among initial_batch_size number of points" - assert ( - len(counts) > 1 - ), "All gen_ended_times are the same; they should be different for the async case" + assert len(counts) > 1, "All gen_ended_times are the same; they should be different for the async case" gen_workers = np.unique(H["gen_worker"]) print("Generators that issued points", gen_workers) From 015cc4a90446a4759829a484eb64f1c52d8b340b Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 28 Aug 2025 11:11:47 -0500 Subject: [PATCH 15/18] still narrowing down issues with the zrw tests and having a zeroth worker... --- libensemble/sim_funcs/run_line_check.py | 6 +++--- .../test_mpi_runners_zrw_subnode_uneven.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/libensemble/sim_funcs/run_line_check.py b/libensemble/sim_funcs/run_line_check.py index 96ef13a5da..44827d3c90 100644 --- a/libensemble/sim_funcs/run_line_check.py +++ b/libensemble/sim_funcs/run_line_check.py @@ -22,7 +22,7 @@ def exp_nodelist_for_worker(exp_list, workerID, nodes_per_worker, persis_gens): node_list = comp.split(",") for node in node_list: node_name, node_num = node.split("-") - offset = workerID - (persis_gens) + offset = workerID # - (persis_gens) new_num = int(node_num) + int(nodes_per_worker * offset) new_node = "-".join([node_name, str(new_num)]) new_node_list.append(new_node) @@ -80,7 +80,7 @@ def runline_check_by_worker(H, persis_info, sim_specs, libE_info): exctr = Executor.executor test = sim_specs["user"]["tests"][0] exp_list = sim_specs["user"]["expect"] - p_gens = sim_specs["user"].get("persis_gens", 0) + # p_gens = sim_specs["user"].get("persis_gens", 0) task = exctr.submit( calc_type="sim", @@ -107,7 +107,7 @@ def runline_check_by_worker(H, persis_info, sim_specs, libE_info): else: wid_mod = wid - new_exp_list = exp_list[wid_mod - 1 - p_gens] + new_exp_list = exp_list[wid_mod - 1] # - p_gens] if outline != new_exp_list: print(f"Worker {wid}:\n outline is: {outline}\n exp is: {new_exp_list}", flush=True) diff --git a/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py b/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py index cc73d0e427..9c9f936edb 100644 --- a/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py +++ b/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py @@ -44,6 +44,7 @@ comms = libE_specs["comms"] libE_specs["dedicated_mode"] = True + libE_specs["zero_resource_workers"] = [0] libE_specs["enforce_worker_core_bounds"] = True # To allow visual checking - log file not used in test @@ -52,7 +53,7 @@ # For varying size test - relate node count to nworkers n_gens = 1 - nsim_workers = nworkers - n_gens + nsim_workers = nworkers # - n_gens if nsim_workers % 2 == 0: sys.exit( From 8c04b3a3212483165c31c3a10b4b222e5b23da29 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 5 Sep 2025 11:35:10 -0500 Subject: [PATCH 16/18] various fixes throughout the codebase to try making the number of workers (with gen_on_worker defaulting to False) clear to resources. other temporary debug adjusts --- libensemble/libE.py | 4 +++- libensemble/resources/worker_resources.py | 2 +- libensemble/sim_funcs/run_line_check.py | 2 +- .../functionality_tests/test_zero_resource_workers.py | 10 ++++------ libensemble/worker.py | 6 +++--- pyproject.toml | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/libensemble/libE.py b/libensemble/libE.py index af302d13c8..665553fe75 100644 --- a/libensemble/libE.py +++ b/libensemble/libE.py @@ -489,8 +489,10 @@ def libE_local(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, li wcomms = start_proc_team(libE_specs["nworkers"], sim_specs, gen_specs, libE_specs) # Set manager resources after the forkpoint. + # if libE_specs["gen_on_worker"] == True, -n reflects the exact number of workers + # if libE_specs["gen_on_worker"] == False: nworkers internally is the number of workers + 1 if resources is not None: - resources.set_resource_manager(libE_specs["nworkers"]) + resources.set_resource_manager(libE_specs["nworkers"] + (1 - libE_specs["gen_on_worker"])) if not libE_specs["disable_log_files"]: exit_logger = manager_logging_config(specs=libE_specs) diff --git a/libensemble/resources/worker_resources.py b/libensemble/resources/worker_resources.py index 5033b2aeee..8df45929cc 100644 --- a/libensemble/resources/worker_resources.py +++ b/libensemble/resources/worker_resources.py @@ -106,7 +106,7 @@ def get_index_list(num_workers: int, num_rsets: int, zero_resource_list: list[in """Map WorkerID to index into a nodelist""" index = 0 index_list = [] - for i in range(1, num_workers + 1): + for i in range(0, num_workers): if i in zero_resource_list: index_list.append(None) else: diff --git a/libensemble/sim_funcs/run_line_check.py b/libensemble/sim_funcs/run_line_check.py index 44827d3c90..9d16205d28 100644 --- a/libensemble/sim_funcs/run_line_check.py +++ b/libensemble/sim_funcs/run_line_check.py @@ -22,7 +22,7 @@ def exp_nodelist_for_worker(exp_list, workerID, nodes_per_worker, persis_gens): node_list = comp.split(",") for node in node_list: node_name, node_num = node.split("-") - offset = workerID # - (persis_gens) + offset = workerID new_num = int(node_num) + int(nodes_per_worker * offset) new_node = "-".join([node_name, str(new_num)]) new_node_list.append(new_node) diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers.py b/libensemble/tests/functionality_tests/test_zero_resource_workers.py index d286aa7128..93c4350786 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers.py @@ -9,8 +9,6 @@ The number of concurrent evaluations of the objective function will be 4-1=3. """ -import sys - import numpy as np from libensemble import logger @@ -23,7 +21,7 @@ from libensemble.tools import add_unique_random_streams, parse_args # logger.set_level("DEBUG") # For testing the test -logger.set_level("INFO") +logger.set_level("DEBUG") # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local @@ -49,7 +47,7 @@ # For varying size test - relate node count to nworkers in_place = libE_specs["zero_resource_workers"] n_gens = len(in_place) - nsim_workers = nworkers - n_gens + nsim_workers = nworkers # - n_gens comms = libE_specs["comms"] nodes_per_worker = 2 @@ -79,8 +77,8 @@ exctr = MPIExecutor(custom_info=mpi_customizer) exctr.register_app(full_path=sim_app, calc_type="sim") - if nworkers < 2: - sys.exit("Cannot run with a persistent worker if only one worker -- aborting...") + # if nworkers < 2: + # sys.exit("Cannot run with a persistent worker if only one worker -- aborting...") n = 2 sim_specs = { diff --git a/libensemble/worker.py b/libensemble/worker.py index 5f574a3336..e543e93bcc 100644 --- a/libensemble/worker.py +++ b/libensemble/worker.py @@ -177,7 +177,7 @@ def __init__( self.runners = {EVAL_SIM_TAG: self.sim_runner.run, EVAL_GEN_TAG: self.gen_runner.run} self.calc_iter = {EVAL_SIM_TAG: 0, EVAL_GEN_TAG: 0} Worker._set_executor(self.workerID, self.comm) - Worker._set_resources(self.workerID, self.comm) + Worker._set_resources(self.workerID, self.comm, self.libE_specs) self.EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) @staticmethod @@ -215,11 +215,11 @@ def _set_executor(workerID: int, comm: Comm) -> bool: return False @staticmethod - def _set_resources(workerID, comm: Comm) -> bool: + def _set_resources(workerID, comm: Comm, libE_specs) -> bool: """Sets worker ID in the resources, return True if set""" resources = Resources.resources if isinstance(resources, Resources): - resources.set_worker_resources(comm.get_num_workers(), workerID) + resources.set_worker_resources(comm.get_num_workers() + (1 - libE_specs["gen_on_worker"]), workerID) return True else: logger.debug(f"No resources set on worker {workerID}") diff --git a/pyproject.toml b/pyproject.toml index 68d5654da3..231afa5bbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -142,4 +142,4 @@ extend-exclude = ["*.bib", "*.xml", "docs/nitpicky"] disable_error_code = ["import-not-found", "import-untyped"] [dependency-groups] -dev = ["pyenchant", "enchant>=0.0.1,<0.0.2", "flake8-modern-annotations>=1.6.0,<2", "flake8-type-checking>=3.0.0,<4"] +dev = ["pyenchant", "enchant>=0.0.1,<0.0.2", "flake8-modern-annotations>=1.6.0,<2", "flake8-type-checking>=3.0.0,<4", "wat>=0.7.0,<0.8"] From fafff0ea5b0ab16809f00ad9f7cfe6a2f09fc3bf Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 12 Dec 2025 16:25:13 -0600 Subject: [PATCH 17/18] adjust map_workerid_to_index unit test for starting at workerID index zero --- .../tests/unit_tests/test_resources.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/libensemble/tests/unit_tests/test_resources.py b/libensemble/tests/unit_tests/test_resources.py index b87583f377..15db28ea07 100644 --- a/libensemble/tests/unit_tests/test_resources.py +++ b/libensemble/tests/unit_tests/test_resources.py @@ -694,23 +694,23 @@ def test_map_workerid_to_index(): zero_resource_list = [] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) - for workerID in range(1, num_workers + 1): - index = index_list[workerID - 1] - assert index == workerID - 1, "index incorrect. Received: " + str(index) + for workerID in range(0, num_workers): + index = index_list[workerID] + assert index == workerID, "index incorrect. Received: " + str(index) - zero_resource_list = [1] + zero_resource_list = [0] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) - for workerID in range(2, num_workers + 1): - index = index_list[workerID - 1] - assert index == workerID - 2, "index incorrect. Received: " + str(index) + for workerID in range(1, num_workers): + index = index_list[workerID] + assert index == workerID - 1, "index incorrect. Received: " + str(index) - zero_resource_list = [1, 2] + zero_resource_list = [0, 1] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) - for workerID in range(3, num_workers + 1): - index = index_list[workerID - 1] - assert index == workerID - 3, "index incorrect. Received: " + str(index) + for workerID in range(2, num_workers): + index = index_list[workerID] + assert index == workerID - 2, "index incorrect. Received: " + str(index) - zero_resource_list = [1, 3] + zero_resource_list = [0, 2] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) workerID = 2 From f4370683209460ccbdf97c90a564d1040273e0dd Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 18 Dec 2025 10:46:36 -0600 Subject: [PATCH 18/18] perhaps the default zero-resource-worker should be worker 0 - but does this actually solve anything? --- libensemble/specs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/specs.py b/libensemble/specs.py index 168bbcc380..16347084c5 100644 --- a/libensemble/specs.py +++ b/libensemble/specs.py @@ -469,7 +469,7 @@ class LibeSpecs(BaseModel): libEnsemble processes (manager and workers) are running. """ - zero_resource_workers: list[int] | None = [] + zero_resource_workers: list[int] | None = [0] """ list of workers that require no resources. For when a fixed mapping of workers to resources is required. Otherwise, use ``num_resource_sets``.