diff --git a/doc/source/userguide/discrete_ordinates_problems.rst b/doc/source/userguide/discrete_ordinates_problems.rst
index b871f8bd0..5bce94717 100644
--- a/doc/source/userguide/discrete_ordinates_problems.rst
+++ b/doc/source/userguide/discrete_ordinates_problems.rst
@@ -305,6 +305,10 @@ The practical differences are:
   dependencies.
 * ``CBC`` does not support local sweep cycles.
 
+When the ``AAH`` GPU all-at-once scheduler is used, its internal thread pool
+is capped by the environment variable ``OPENSN_NUM_THREADS``. If the variable
+is unset or invalid, OpenSn uses ``1`` thread.
+
 .. note::
 
    In the ``AAH`` implementation, the lagged data is tied specifically to
@@ -417,7 +421,7 @@ to disable progress reporting.
 Internal threading used by the uncollided solver is capped by the environment
 variable ``OPENSN_NUM_THREADS``. If the variable is unset or invalid, OpenSn
 uses ``1`` thread. The current uncollided implementation applies this cap to
-reflected-image projection and to the bulk-sweep group solve.
+reflected-image projection and to the threaded bulk-sweep group solve.
 
 .. code-block:: python
 
diff --git a/doc/source/userguide/solvers.rst b/doc/source/userguide/solvers.rst
index 9bddc660d..bf97621b1 100644
--- a/doc/source/userguide/solvers.rst
+++ b/doc/source/userguide/solvers.rst
@@ -183,6 +183,11 @@ active for the problem. The default is ``True``. This should usually stay
 enabled for transient and k-eigen workflows unless you explicitly want a
 prompt-only model.
 
+OpenSn also reads the environment variable ``OPENSN_NUM_THREADS``. If it is
+unset or invalid, OpenSn uses ``1`` thread. In the current transport
+implementation, this cap is applied to the ``AAH`` GPU all-at-once sweep
+scheduler thread pool and to the threaded portions of the uncollided solver.
+
 ``read_restart_path`` reads a full restart for continuing a compatible solve.
 ``read_initial_condition_path`` reads restart data as an initial condition; this
 is the option to use when a :py:class:`pyopensn.solver.TransientSolver` should
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.cc
index f15dba7b9..7ecb98695 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.cc
@@ -2,10 +2,46 @@
 // SPDX-License-Identifier: MIT
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.h"
+#include "framework/runtime.h"
 #include <cassert>
 
 namespace opensn
 {
+namespace
+{
+
+unsigned int
+ResolveEnvThreadCount()
+{
+  const char* env_value = std::getenv("OPENSN_NUM_THREADS"); // NOLINT(concurrency-mt-unsafe)
+  if (env_value == nullptr)
+    return 1;
+
+  try
+  {
+    const auto parsed_value = std::stoul(env_value);
+    return parsed_value > 0 ? static_cast<unsigned int>(parsed_value) : 1U;
+  }
+  catch (const std::exception&)
+  {
+    return 1;
+  }
+}
+
+std::size_t
+ResolveWorkerCount(std::size_t requested_workers)
+{
+  if (requested_workers == 0)
+    return 0;
+
+  const auto configured_threads =
+    opensn_num_threads > 1 ? opensn_num_threads : ResolveEnvThreadCount();
+  const auto capped_workers =
+    std::min<std::size_t>(requested_workers, std::max(1U, configured_threads));
+  return std::max<std::size_t>(1, capped_workers);
+}
+
+} // namespace
 
 SPMD_ThreadPool::SPMD_ThreadPool(std::size_t n)
 {
@@ -20,6 +56,7 @@ SPMD_ThreadPool::~SPMD_ThreadPool()
 void
 SPMD_ThreadPool::Resize(std::size_t n)
 {
+  n = ResolveWorkerCount(n);
   if (worker_threads_.size() == n)
     return;
   Stop();
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cu
index 9167f3e91..c04f30f2c 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cu
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cu
@@ -9,6 +9,7 @@
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.h"
 #include "caribou/main.hpp"
+#include <atomic>
 #include <thread>
 #include <vector>
 
@@ -35,23 +36,16 @@ SweepScheduler::ScheduleAlgoAAO(SweepChunk& sweep_chunk)
     execution_order_[i] = i;
   }
 
-  // assign sweep task to thread pool (but not execution yet)
-  pool_.AssignTask(
-    [this, &sweep_chunk](std::size_t i)
-    {
-      auto* aahd = static_cast<AAHD_AngleSet*>(angle_agg_[i].get());
-      aahd->AngleSetAdvance(sweep_chunk, AngleSetStatus::EXECUTE);
-    });
-
-  // poll for readiness and launch threads
+  // poll for readiness and execute waves of ready angle sets on the capped worker pool
   while (!execution_order_.empty())
   {
+    std::vector<std::size_t> ready_angle_sets;
     for (auto it = execution_order_.begin(); it != execution_order_.end();)
     {
       auto* angle_set = static_cast<AAHD_AngleSet*>(angle_agg_[*it].get());
       if (angle_set->IsReady())
       {
-        pool_.Run(*it);
+        ready_angle_sets.push_back(*it);
         std::swap(*it, execution_order_.back());
         execution_order_.pop_back();
       }
@@ -60,8 +54,25 @@ SweepScheduler::ScheduleAlgoAAO(SweepChunk& sweep_chunk)
         ++it;
       }
     }
+
+    if (ready_angle_sets.empty())
+      continue;
+
+    std::atomic<std::size_t> next_ready(0);
+    pool_.ExecuteBatch(
+      [this, &sweep_chunk, &ready_angle_sets, &next_ready](std::size_t)
+      {
+        while (true)
+        {
+          const auto ready_idx = next_ready.fetch_add(1, std::memory_order_relaxed);
+          if (ready_idx >= ready_angle_sets.size())
+            break;
+
+          auto* aahd = static_cast<AAHD_AngleSet*>(angle_agg_[ready_angle_sets[ready_idx]].get());
+          aahd->AngleSetAdvance(sweep_chunk, AngleSetStatus::EXECUTE);
+        }
+      });
   }
-  pool_.WaitAll();
 
   // wait for sends and receive of delayed data
   for (auto& angle_set : angle_agg_)
diff --git a/python/lib/solver.cc b/python/lib/solver.cc
index 18acb4148..f06808113 100644
--- a/python/lib/solver.cc
+++ b/python/lib/solver.cc
@@ -912,11 +912,12 @@ WrapLBS(py::module& slv)
         These options are applied at problem creation.
     sweep_type : str, default="AAH"
         The sweep type to use. Must be one of `AAH` or `CBC`. Defaults to `AAH`.
-        Both sweep types support time-dependent (transient) mode.
+        The ``AAH`` all-at-once sweep scheduler uses an internal thread pool
+        whose worker count is capped by the ``OPENSN_NUM_THREADS`` environment
+        variable and defaults to ``1`` when the variable is unset or invalid.
     time_dependent : bool, default=False
         If true, the problem starts in time-dependent mode. Otherwise it starts in
         steady-state mode. Requires ``options.save_angular_flux=True``.
-        Both ``AAH`` and ``CBC`` sweep types support time-dependent mode.
     uncollided_flux : str, default=""
         HDF5 file generated by :class:`UncollidedSolver`. For steady-state
         forward fixed-source calculations, OpenSn uses its moments to
@@ -928,8 +929,10 @@ WrapLBS(py::module& slv)
         this problem's scattering order. The same serial file may be read by
         any MPI partitioning of the matching mesh.
     use_gpus : bool, default=False
-        A flag specifying whether GPU acceleration is used for the sweep. Both
-        ```AAH``` and ```CBC``` sweep types support GPU acceleration.
+        A flag specifying whether GPU acceleration is used for the sweep.
+        GPU acceleration requires ``sweep_type="AAH"`` on a supported
+        problem. When the ``AAH`` GPU all-at-once scheduler is active, its internal
+        thread pool is still capped by ``OPENSN_NUM_THREADS``.
     )"
   );
   do_problem.def(
@@ -1469,7 +1472,9 @@ WrapUncollidedSolver(py::module& slv)
 
     Internal threading in the current uncollided implementation is capped by
     the ``OPENSN_NUM_THREADS`` environment variable and defaults to ``1`` when
-    the variable is unset or invalid.
+    the variable is unset or invalid. The current implementation applies this
+    cap to reflected-image projection and to the threaded bulk-sweep group
+    solve.
     )"
   );
   // clang-format on
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_keigen/tests.json b/test/python/modules/linear_boltzmann_solvers/transport_keigen/tests.json
index 0b8a8ec98..6fd965662 100644
--- a/test/python/modules/linear_boltzmann_solvers/transport_keigen/tests.json
+++ b/test/python/modules/linear_boltzmann_solvers/transport_keigen/tests.json
@@ -233,6 +233,9 @@
     "comment": "3D 172G keigenvalue test using OpenMX MGXS cross-sections and power iteration",
     "num_procs": 4,
     "requires_gpu": "True",
+    "env": {
+      "OPENSN_NUM_THREADS": 16
+    },
     "checks": [
       {
         "type": "KeyValuePair",
@@ -247,6 +250,9 @@
     "comment": "3D 172G keigenvalue test using OpenMX MGXS cross-sections and power iteration (CBC)",
     "num_procs": 4,
     "requires_gpu": "True",
+    "env": {
+      "OPENSN_NUM_THREADS": 16
+    },
     "checks": [
       {
         "type": "KeyValuePair",
@@ -492,6 +498,9 @@
     "comment": "3D_2G Infinite medium testing pyramid elements of EXODUSII; should agree with the hex model",
     "num_procs": 1,
     "requires_gpu": "True",
+    "env": {
+      "OPENSN_NUM_THREADS": 16
+    },
     "checks": [
       {
         "type": "KeyValuePair",
@@ -506,6 +515,9 @@
     "comment": "3D_2G Infinite medium testing pyramid elements of EXODUSII; should agree with the hex model (CBC)",
     "num_procs": 1,
     "requires_gpu": "True",
+    "env": {
+      "OPENSN_NUM_THREADS": 16
+    },
     "checks": [
       {
         "type": "KeyValuePair",
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_steady/tests.json b/test/python/modules/linear_boltzmann_solvers/transport_steady/tests.json
index af89bc3d5..f8f470d9f 100644
--- a/test/python/modules/linear_boltzmann_solvers/transport_steady/tests.json
+++ b/test/python/modules/linear_boltzmann_solvers/transport_steady/tests.json
@@ -42,6 +42,9 @@
     "comment": "1D LinearBSolver Test - PWLD",
     "num_procs": 3,
     "requires_gpu": "True",
+    "env": {
+      "OPENSN_NUM_THREADS": 16
+    },
     "checks": [
       {
         "type": "KeyValuePair",
@@ -62,6 +65,9 @@
     "comment": "1D LinearBSolver Test - PWLD (CBC)",
     "num_procs": 3,
     "requires_gpu": "True",
+    "env": {
+      "OPENSN_NUM_THREADS": 16
+    },
     "checks": [
       {
         "type": "KeyValuePair",
@@ -139,6 +145,9 @@
     "comment": "2D LinearBSolver Test Unstructured grid - PWLD",
     "num_procs": 4,
     "requires_gpu": "True",
+    "env": {
+      "OPENSN_NUM_THREADS": 16
+    },
     "checks": [
       {
         "type": "KeyValuePair",
@@ -159,6 +168,9 @@
     "comment": "2D LinearBSolver Test Unstructured grid - PWLD (CBC)",
     "num_procs": 4,
     "requires_gpu": "True",
+    "env": {
+      "OPENSN_NUM_THREADS": 16
+    },
     "checks": [
       {
         "type": "KeyValuePair",
@@ -217,6 +229,9 @@
     "comment": "3D LinearBSolver Test Extruded Unstructured - PWLD",
     "num_procs": 4,
     "requires_gpu": "True",
+    "env": {
+      "OPENSN_NUM_THREADS": 16
+    },
     "checks": [
       {
         "type": "KeyValuePair",
@@ -237,6 +252,9 @@
     "comment": "3D LinearBSolver Test Extruded Unstructured - PWLD (CBC)",
     "num_procs": 4,
     "requires_gpu": "True",
+    "env": {
+      "OPENSN_NUM_THREADS": 16
+    },
     "checks": [
       {
         "type": "KeyValuePair",
@@ -653,6 +671,9 @@
     "comment": "3D LinearBSolver Test - PWLD",
     "num_procs": 4,
     "requires_gpu": "True",
+    "env": {
+      "OPENSN_NUM_THREADS": 16
+    },
     "checks": [
       {
         "type": "KeyValuePair",
@@ -673,6 +694,9 @@
     "comment": "3D LinearBSolver Test - PWLD (CBC)",
     "num_procs": 4,
     "requires_gpu": "True",
+    "env": {
+      "OPENSN_NUM_THREADS": 16
+    },
     "checks": [
       {
         "type": "KeyValuePair",
@@ -731,6 +755,9 @@
     "comment": "3D LinearBSolver Test - PWLD Reflecting BC",
     "requires_gpu": "True",
     "num_procs": 4,
+    "env": {
+      "OPENSN_NUM_THREADS": 16
+    },
     "checks": [
       {
         "type": "KeyValuePair",
@@ -751,6 +778,9 @@
     "comment": "3D LinearBSolver Test - PWLD Reflecting BC (CBC)",
     "requires_gpu": "True",
     "num_procs": 4,
+    "env": {
+      "OPENSN_NUM_THREADS": 16
+    },
     "checks": [
       {
         "type": "KeyValuePair",
@@ -949,6 +979,9 @@
     "comment": "3D LinearBSolver Test Extruded-Unstructured Mesh - PWLD",
     "requires_gpu": "True",
     "num_procs": 4,
+    "env": {
+      "OPENSN_NUM_THREADS": 16
+    },
     "checks": [
       {
         "type": "KeyValuePair",
@@ -1142,6 +1175,9 @@
     "requires_gpu": "True",
     "num_procs": 1,
     "weight_class": "intermediate",
+    "env": {
+      "OPENSN_NUM_THREADS": 16
+    },
     "checks": [
       {
         "type": "KeyValuePair",
@@ -1527,6 +1563,9 @@
     "comment": "Runtime boundary-condition mutation rebuild regression",
     "requires_gpu": "True",
     "num_procs": 4,
+    "env": {
+      "OPENSN_NUM_THREADS": 16
+    },
     "checks": [
       {
         "type": "KeyValuePair",