diff --git a/doc/source/userguide/discrete_ordinates_problems.rst b/doc/source/userguide/discrete_ordinates_problems.rst index b871f8bd0..5bce94717 100644 --- a/doc/source/userguide/discrete_ordinates_problems.rst +++ b/doc/source/userguide/discrete_ordinates_problems.rst @@ -305,6 +305,10 @@ The practical differences are: dependencies. * ``CBC`` does not support local sweep cycles. +When the ``AAH`` GPU all-at-once scheduler is used, its internal thread pool +is capped by the environment variable ``OPENSN_NUM_THREADS``. If the variable +is unset or invalid, OpenSn uses ``1`` thread. + .. note:: In the ``AAH`` implementation, the lagged data is tied specifically to @@ -417,7 +421,7 @@ to disable progress reporting. Internal threading used by the uncollided solver is capped by the environment variable ``OPENSN_NUM_THREADS``. If the variable is unset or invalid, OpenSn uses ``1`` thread. The current uncollided implementation applies this cap to -reflected-image projection and to the bulk-sweep group solve. +reflected-image projection and to the threaded bulk-sweep group solve. .. code-block:: python diff --git a/doc/source/userguide/solvers.rst b/doc/source/userguide/solvers.rst index 9bddc660d..bf97621b1 100644 --- a/doc/source/userguide/solvers.rst +++ b/doc/source/userguide/solvers.rst @@ -183,6 +183,11 @@ active for the problem. The default is ``True``. This should usually stay enabled for transient and k-eigen workflows unless you explicitly want a prompt-only model. +OpenSn also reads the environment variable ``OPENSN_NUM_THREADS``. If it is +unset or invalid, OpenSn uses ``1`` thread. In the current transport +implementation, this cap is applied to the ``AAH`` GPU all-at-once sweep +scheduler thread pool and to the threaded portions of the uncollided solver. + ``read_restart_path`` reads a full restart for continuing a compatible solve. ``read_initial_condition_path`` reads restart data as an initial condition; this is the option to use when a :py:class:`pyopensn.solver.TransientSolver` should diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.cc index f15dba7b9..7ecb98695 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.cc @@ -2,10 +2,46 @@ // SPDX-License-Identifier: MIT #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.h" +#include "framework/runtime.h" #include namespace opensn { +namespace +{ + +unsigned int +ResolveEnvThreadCount() +{ + const char* env_value = std::getenv("OPENSN_NUM_THREADS"); // NOLINT(concurrency-mt-unsafe) + if (env_value == nullptr) + return 1; + + try + { + const auto parsed_value = std::stoul(env_value); + return parsed_value > 0 ? static_cast(parsed_value) : 1U; + } + catch (const std::exception&) + { + return 1; + } +} + +std::size_t +ResolveWorkerCount(std::size_t requested_workers) +{ + if (requested_workers == 0) + return 0; + + const auto configured_threads = + opensn_num_threads > 1 ? opensn_num_threads : ResolveEnvThreadCount(); + const auto capped_workers = + std::min(requested_workers, std::max(1U, configured_threads)); + return std::max(1, capped_workers); +} + +} // namespace SPMD_ThreadPool::SPMD_ThreadPool(std::size_t n) { @@ -20,6 +56,7 @@ SPMD_ThreadPool::~SPMD_ThreadPool() void SPMD_ThreadPool::Resize(std::size_t n) { + n = ResolveWorkerCount(n); if (worker_threads_.size() == n) return; Stop(); diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cu index 9167f3e91..c04f30f2c 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cu +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cu @@ -9,6 +9,7 @@ #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.h" #include "caribou/main.hpp" +#include #include #include @@ -35,23 +36,16 @@ SweepScheduler::ScheduleAlgoAAO(SweepChunk& sweep_chunk) execution_order_[i] = i; } - // assign sweep task to thread pool (but not execution yet) - pool_.AssignTask( - [this, &sweep_chunk](std::size_t i) - { - auto* aahd = static_cast(angle_agg_[i].get()); - aahd->AngleSetAdvance(sweep_chunk, AngleSetStatus::EXECUTE); - }); - - // poll for readiness and launch threads + // poll for readiness and execute waves of ready angle sets on the capped worker pool while (!execution_order_.empty()) { + std::vector ready_angle_sets; for (auto it = execution_order_.begin(); it != execution_order_.end();) { auto* angle_set = static_cast(angle_agg_[*it].get()); if (angle_set->IsReady()) { - pool_.Run(*it); + ready_angle_sets.push_back(*it); std::swap(*it, execution_order_.back()); execution_order_.pop_back(); } @@ -60,8 +54,25 @@ SweepScheduler::ScheduleAlgoAAO(SweepChunk& sweep_chunk) ++it; } } + + if (ready_angle_sets.empty()) + continue; + + std::atomic next_ready(0); + pool_.ExecuteBatch( + [this, &sweep_chunk, &ready_angle_sets, &next_ready](std::size_t) + { + while (true) + { + const auto ready_idx = next_ready.fetch_add(1, std::memory_order_relaxed); + if (ready_idx >= ready_angle_sets.size()) + break; + + auto* aahd = static_cast(angle_agg_[ready_angle_sets[ready_idx]].get()); + aahd->AngleSetAdvance(sweep_chunk, AngleSetStatus::EXECUTE); + } + }); } - pool_.WaitAll(); // wait for sends and receive of delayed data for (auto& angle_set : angle_agg_) diff --git a/python/lib/solver.cc b/python/lib/solver.cc index 18acb4148..f06808113 100644 --- a/python/lib/solver.cc +++ b/python/lib/solver.cc @@ -912,11 +912,12 @@ WrapLBS(py::module& slv) These options are applied at problem creation. sweep_type : str, default="AAH" The sweep type to use. Must be one of `AAH` or `CBC`. Defaults to `AAH`. - Both sweep types support time-dependent (transient) mode. + The ``AAH`` all-at-once sweep scheduler uses an internal thread pool + whose worker count is capped by the ``OPENSN_NUM_THREADS`` environment + variable and defaults to ``1`` when the variable is unset or invalid. time_dependent : bool, default=False If true, the problem starts in time-dependent mode. Otherwise it starts in steady-state mode. Requires ``options.save_angular_flux=True``. - Both ``AAH`` and ``CBC`` sweep types support time-dependent mode. uncollided_flux : str, default="" HDF5 file generated by :class:`UncollidedSolver`. For steady-state forward fixed-source calculations, OpenSn uses its moments to @@ -928,8 +929,10 @@ WrapLBS(py::module& slv) this problem's scattering order. The same serial file may be read by any MPI partitioning of the matching mesh. use_gpus : bool, default=False - A flag specifying whether GPU acceleration is used for the sweep. Both - ```AAH``` and ```CBC``` sweep types support GPU acceleration. + A flag specifying whether GPU acceleration is used for the sweep. + GPU acceleration requires ``sweep_type="AAH"`` on a supported + problem. When the ``AAH`` GPU all-at-once scheduler is active, its internal + thread pool is still capped by ``OPENSN_NUM_THREADS``. )" ); do_problem.def( @@ -1469,7 +1472,9 @@ WrapUncollidedSolver(py::module& slv) Internal threading in the current uncollided implementation is capped by the ``OPENSN_NUM_THREADS`` environment variable and defaults to ``1`` when - the variable is unset or invalid. + the variable is unset or invalid. The current implementation applies this + cap to reflected-image projection and to the threaded bulk-sweep group + solve. )" ); // clang-format on diff --git a/test/python/modules/linear_boltzmann_solvers/transport_keigen/tests.json b/test/python/modules/linear_boltzmann_solvers/transport_keigen/tests.json index 0b8a8ec98..6fd965662 100644 --- a/test/python/modules/linear_boltzmann_solvers/transport_keigen/tests.json +++ b/test/python/modules/linear_boltzmann_solvers/transport_keigen/tests.json @@ -233,6 +233,9 @@ "comment": "3D 172G keigenvalue test using OpenMX MGXS cross-sections and power iteration", "num_procs": 4, "requires_gpu": "True", + "env": { + "OPENSN_NUM_THREADS": 16 + }, "checks": [ { "type": "KeyValuePair", @@ -247,6 +250,9 @@ "comment": "3D 172G keigenvalue test using OpenMX MGXS cross-sections and power iteration (CBC)", "num_procs": 4, "requires_gpu": "True", + "env": { + "OPENSN_NUM_THREADS": 16 + }, "checks": [ { "type": "KeyValuePair", @@ -492,6 +498,9 @@ "comment": "3D_2G Infinite medium testing pyramid elements of EXODUSII; should agree with the hex model", "num_procs": 1, "requires_gpu": "True", + "env": { + "OPENSN_NUM_THREADS": 16 + }, "checks": [ { "type": "KeyValuePair", @@ -506,6 +515,9 @@ "comment": "3D_2G Infinite medium testing pyramid elements of EXODUSII; should agree with the hex model (CBC)", "num_procs": 1, "requires_gpu": "True", + "env": { + "OPENSN_NUM_THREADS": 16 + }, "checks": [ { "type": "KeyValuePair", diff --git a/test/python/modules/linear_boltzmann_solvers/transport_steady/tests.json b/test/python/modules/linear_boltzmann_solvers/transport_steady/tests.json index af89bc3d5..f8f470d9f 100644 --- a/test/python/modules/linear_boltzmann_solvers/transport_steady/tests.json +++ b/test/python/modules/linear_boltzmann_solvers/transport_steady/tests.json @@ -42,6 +42,9 @@ "comment": "1D LinearBSolver Test - PWLD", "num_procs": 3, "requires_gpu": "True", + "env": { + "OPENSN_NUM_THREADS": 16 + }, "checks": [ { "type": "KeyValuePair", @@ -62,6 +65,9 @@ "comment": "1D LinearBSolver Test - PWLD (CBC)", "num_procs": 3, "requires_gpu": "True", + "env": { + "OPENSN_NUM_THREADS": 16 + }, "checks": [ { "type": "KeyValuePair", @@ -139,6 +145,9 @@ "comment": "2D LinearBSolver Test Unstructured grid - PWLD", "num_procs": 4, "requires_gpu": "True", + "env": { + "OPENSN_NUM_THREADS": 16 + }, "checks": [ { "type": "KeyValuePair", @@ -159,6 +168,9 @@ "comment": "2D LinearBSolver Test Unstructured grid - PWLD (CBC)", "num_procs": 4, "requires_gpu": "True", + "env": { + "OPENSN_NUM_THREADS": 16 + }, "checks": [ { "type": "KeyValuePair", @@ -217,6 +229,9 @@ "comment": "3D LinearBSolver Test Extruded Unstructured - PWLD", "num_procs": 4, "requires_gpu": "True", + "env": { + "OPENSN_NUM_THREADS": 16 + }, "checks": [ { "type": "KeyValuePair", @@ -237,6 +252,9 @@ "comment": "3D LinearBSolver Test Extruded Unstructured - PWLD (CBC)", "num_procs": 4, "requires_gpu": "True", + "env": { + "OPENSN_NUM_THREADS": 16 + }, "checks": [ { "type": "KeyValuePair", @@ -653,6 +671,9 @@ "comment": "3D LinearBSolver Test - PWLD", "num_procs": 4, "requires_gpu": "True", + "env": { + "OPENSN_NUM_THREADS": 16 + }, "checks": [ { "type": "KeyValuePair", @@ -673,6 +694,9 @@ "comment": "3D LinearBSolver Test - PWLD (CBC)", "num_procs": 4, "requires_gpu": "True", + "env": { + "OPENSN_NUM_THREADS": 16 + }, "checks": [ { "type": "KeyValuePair", @@ -731,6 +755,9 @@ "comment": "3D LinearBSolver Test - PWLD Reflecting BC", "requires_gpu": "True", "num_procs": 4, + "env": { + "OPENSN_NUM_THREADS": 16 + }, "checks": [ { "type": "KeyValuePair", @@ -751,6 +778,9 @@ "comment": "3D LinearBSolver Test - PWLD Reflecting BC (CBC)", "requires_gpu": "True", "num_procs": 4, + "env": { + "OPENSN_NUM_THREADS": 16 + }, "checks": [ { "type": "KeyValuePair", @@ -949,6 +979,9 @@ "comment": "3D LinearBSolver Test Extruded-Unstructured Mesh - PWLD", "requires_gpu": "True", "num_procs": 4, + "env": { + "OPENSN_NUM_THREADS": 16 + }, "checks": [ { "type": "KeyValuePair", @@ -1142,6 +1175,9 @@ "requires_gpu": "True", "num_procs": 1, "weight_class": "intermediate", + "env": { + "OPENSN_NUM_THREADS": 16 + }, "checks": [ { "type": "KeyValuePair", @@ -1527,6 +1563,9 @@ "comment": "Runtime boundary-condition mutation rebuild regression", "requires_gpu": "True", "num_procs": 4, + "env": { + "OPENSN_NUM_THREADS": 16 + }, "checks": [ { "type": "KeyValuePair",