From 8cd33cdb6d96bf3da5f1e7343de6b6cb62ac62d1 Mon Sep 17 00:00:00 2001
From: Eappen Nelluvelil <eappen@tamu.edu>
Date: Tue, 31 Mar 2026 23:30:15 -0500
Subject: [PATCH 1/6] AVX optimizations for CBC sweep kernels

---
 .../sweep/angle_set/cbc_angle_set.cc          |  53 ++-
 .../sweep/angle_set/cbc_angle_set.h           |   2 +
 .../sweep/communicators/async_comm.h          |   9 -
 .../sweep/communicators/cbc_async_comm.cc     |  39 +-
 .../sweep/communicators/cbc_async_comm.cu     |  17 -
 .../sweep/communicators/cbc_async_comm.h      |  28 +-
 .../sweep/fluds/cbc_fluds.cc                  |  52 +--
 .../sweep/fluds/cbc_fluds.h                   |  13 +-
 .../sweep/fluds/cbc_fluds_common_data.cc      |  26 +-
 .../sweep/fluds/cbc_fluds_common_data.h       |   9 +
 .../sweep/fluds/cbcd_fluds.cu                 |  14 +-
 .../sweep/fluds/cbcd_fluds.h                  |  11 +-
 .../sweep/fluds/cbcd_fluds_common_data.cc     |   2 +
 .../sweep/fluds/cbcd_fluds_common_data.cu     |   9 +
 .../sweep/fluds/cbcd_fluds_common_data.h      |  10 +
 .../sweep/fluds/fluds.h                       |  25 +
 .../sweep_chunks/aah_avx_sweep_chunk.cc       | 204 ---------
 .../sweep_chunks/aah_sweep_chunk.h            |  11 +-
 .../sweep_chunks/aah_sweep_kernels.h          |  20 -
 .../sweep_chunks/avx_sweep_chunk_utils.h      | 241 ++++++++++
 .../sweep_chunks/cbc_avx_sweep_chunk.cc       | 427 ++++++++++++++++++
 .../sweep_chunks/cbc_sweep_chunk.cc           |  59 ++-
 .../sweep_chunks/cbc_sweep_chunk.h            |  20 +-
 23 files changed, 919 insertions(+), 382 deletions(-)
 delete mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cu
 create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/avx_sweep_chunk_utils.h
 create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_avx_sweep_chunk.cc
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.cc
index e9229a94dc..999675b039 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.cc
@@ -23,6 +23,7 @@ CBC_AngleSet::CBC_AngleSet(size_t id,
                            const MPICommunicatorSet& comm_set)
   : AngleSet(id, num_groups, spds, fluds, angle_indices, boundaries),
     cbc_spds_(dynamic_cast<const CBC_SPDS&>(spds_)),
+    ready_tasks_(),
     async_comm_(id, *fluds, comm_set)
 {
 }
@@ -42,14 +43,25 @@ CBC_AngleSet::AngleSetAdvance(SweepChunk& sweep_chunk, AngleSetStatus permission
     return AngleSetStatus::FINISHED;
 
   if (current_task_list_.empty())
+  {
     current_task_list_ = cbc_spds_.GetTaskList();
+    // Seed the ready list with every unfinished task that has no pending dependencies
+    ready_tasks_.reserve(current_task_list_.size());
+    for (size_t i = 0; i < current_task_list_.size(); ++i)
+      if ((current_task_list_[i].num_dependencies == 0) and (not current_task_list_[i].completed))
+        ready_tasks_.push_back(i);
+  }
 
   sweep_chunk.SetAngleSet(*this);
 
   auto tasks_who_received_data = async_comm_.ReceiveData();
 
   for (const std::uint64_t task_number : tasks_who_received_data)
-    --current_task_list_[task_number].num_dependencies;
+  {
+    if ((--current_task_list_[task_number].num_dependencies == 0) and
+        (not current_task_list_[task_number].completed))
+      ready_tasks_.push_back(task_number);
+  }
 
   async_comm_.SendData();
 
@@ -58,31 +70,28 @@ CBC_AngleSet::AngleSetAdvance(SweepChunk& sweep_chunk, AngleSetStatus permission
     if (not boundary->CheckAnglesReadyStatus(angles_))
       return AngleSetStatus::NOT_FINISHED;
 
-  bool all_tasks_completed = true;
-  bool a_task_executed = true;
-  while (a_task_executed)
+  while (not ready_tasks_.empty())
   {
-    a_task_executed = false;
-    for (auto& cell_task : current_task_list_)
+    const auto task_idx = ready_tasks_.back();
+    ready_tasks_.pop_back();
+    auto& cell_task = current_task_list_[task_idx];
+
+    sweep_chunk.SetCell(cell_task.cell_ptr, *this);
+    sweep_chunk.Sweep(*this);
+
+    for (const auto& local_task_num : cell_task.successors)
     {
-      if (not cell_task.completed)
-        all_tasks_completed = false;
-      if (cell_task.num_dependencies == 0 and not cell_task.completed)
-      {
-        sweep_chunk.SetCell(cell_task.cell_ptr, *this);
-        sweep_chunk.Sweep(*this);
-
-        for (std::uint64_t local_task_num : cell_task.successors)
-          --current_task_list_[local_task_num].num_dependencies;
-
-        cell_task.completed = true;
-        a_task_executed = true;
-        async_comm_.SendData();
-      }
-    } // for cell_task
+      if ((--current_task_list_[local_task_num].num_dependencies == 0) and
+          (not current_task_list_[local_task_num].completed))
+        ready_tasks_.push_back(local_task_num);
+    }
+
+    cell_task.completed = true;
+    ++num_completed_tasks;
     async_comm_.SendData();
   }
 
+  const bool all_tasks_completed = (num_completed_tasks == current_task_list_.size());
   const bool all_messages_sent = async_comm_.SendData();
 
   if (all_tasks_completed and all_messages_sent)
@@ -101,6 +110,8 @@ void
 CBC_AngleSet::ResetSweepBuffers()
 {
   current_task_list_.clear();
+  ready_tasks_.clear();
+  num_completed_tasks = 0;
   async_comm_.Reset();
   fluds_->ClearLocalAndReceivePsi();
   executed_ = false;
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h
index 36da1250e9..ba127849db 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h
@@ -59,6 +59,8 @@ class CBC_AngleSet : public AngleSet
 protected:
   const CBC_SPDS& cbc_spds_;
   std::vector<Task> current_task_list_;
+  std::vector<std::uint64_t> ready_tasks_;
+  size_t num_completed_tasks = 0;
   CBC_AsynchronousCommunicator async_comm_;
 };
 
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/async_comm.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/async_comm.h
index f9988256be..9d4f0a0da1 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/async_comm.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/async_comm.h
@@ -24,15 +24,6 @@ class AsynchronousCommunicator
 
   virtual ~AsynchronousCommunicator() = default;
 
-  virtual std::vector<double>& InitGetDownwindMessageData(int location_id,
-                                                          uint64_t cell_global_id,
-                                                          unsigned int face_id,
-                                                          size_t angle_set_id,
-                                                          size_t data_size)
-  {
-    OpenSnLogicalError("Method not implemented");
-  }
-
 protected:
   FLUDS& fluds_;
   const MPICommunicatorSet& comm_set_;
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cc
index 1cb29d434d..44c61c777e 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cc
@@ -3,12 +3,12 @@
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/spds.h"
-#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h"
 #include "framework/mesh/mesh_continuum/mesh_continuum.h"
 #include "framework/mpi/mpi_comm_set.h"
 #include "framework/logging/log.h"
 #include "framework/runtime.h"
 #include "caliper/cali.h"
+#include <cstring>
 #include <memory>
 
 namespace opensn
@@ -53,8 +53,12 @@ CBC_AsynchronousCommunicator::SendData()
       buffer_array.Write(cell_global_id);
       buffer_array.Write(face_id);
       buffer_array.Write(data_size);
-      for (const double value : data) // actual psi_data
-        buffer_array.Write(value);
+
+      auto& raw = buffer_array.Data();
+      const size_t old_size = raw.size();
+      const size_t num_bytes = data_size * sizeof(double);
+      raw.resize(old_size + num_bytes);
+      std::memcpy(raw.data() + old_size, data.data(), num_bytes);
     }
 
     for (auto& [locI, buffer] : locI_buffer_map)
@@ -94,10 +98,11 @@ CBC_AsynchronousCommunicator::ReceiveData()
 {
   CALI_CXX_MARK_SCOPE("CBC_AsynchronousCommunicator::ReceiveData");
 
-  using CellFaceKey = std::pair<uint64_t, unsigned int>; // cell_gid + face_id
-  std::map<CellFaceKey, std::vector<double>> received_messages;
+  std::unordered_map<FLUDS::CellFaceKey, std::vector<double>, FLUDS::CellFaceKeyHash>
+    received_messages;
   std::vector<uint64_t> cells_who_received_data;
   const auto& location_dependencies = fluds_.GetSPDS().GetLocationDependencies();
+  auto& deplocs_outgoing_messages = fluds_.GetDeplocsOutgoingMessages();
   for (int locJ : location_dependencies)
   {
     const auto& comm = comm_set_.LocICommunicator(opensn::mpi_comm.rank());
@@ -117,33 +122,19 @@ CBC_AsynchronousCommunicator::ReceiveData()
         const auto face_id = data_array.Read<unsigned int>();
         const auto data_size = data_array.Read<size_t>();
 
-        std::vector<double> psi_data;
-        psi_data.reserve(data_size);
-        for (size_t k = 0; k < data_size; ++k)
-          psi_data.push_back(data_array.Read<double>());
+        std::vector<double> psi_data(data_size);
+        const size_t num_bytes = data_size * sizeof(double);
+        std::memcpy(psi_data.data(), &data_array.Data()[data_array.Offset()], num_bytes);
+        data_array.Seek(data_array.Offset() + num_bytes);
 
-        received_messages[{cell_global_id, face_id}] = std::move(psi_data);
+        deplocs_outgoing_messages[{cell_global_id, face_id}] = std::move(psi_data);
         cells_who_received_data.push_back(
           fluds_.GetSPDS().GetGrid()->MapCellGlobalID2LocalID(cell_global_id));
       } // while not at end of buffer
     } // Process each message embedded in buffer
   }
 
-  auto* cbc_fluds = dynamic_cast<CBC_FLUDS*>(&fluds_);
-  if (cbc_fluds != nullptr)
-    cbc_fluds->GetDeplocsOutgoingMessages().merge(received_messages);
-  else
-    MergeDeplocsOutgoingMessages(received_messages);
-
   return cells_who_received_data;
 }
 
-#ifndef __OPENSN_WITH_GPU__
-void
-CBC_AsynchronousCommunicator::MergeDeplocsOutgoingMessages(
-  std::map<CBC_FLUDS::CellFaceKey, std::vector<double>>& received_messages)
-{
-}
-#endif
-
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cu
deleted file mode 100644
index e997a209d8..0000000000
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cu
+++ /dev/null
@@ -1,17 +0,0 @@
-// SPDX-FileCopyrightText: 2026 The OpenSn Authors <https://open-sn.github.io/opensn/>
-// SPDX-License-Identifier: MIT
-
-#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h"
-#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h"
-
-namespace opensn
-{
-
-void
-CBC_AsynchronousCommunicator::MergeDeplocsOutgoingMessages(
-  std::map<CellFaceKey, std::vector<double>>& received_messages)
-{
-  dynamic_cast<CBCD_FLUDS&>(fluds_).GetDeplocsOutgoingMessages().merge(received_messages);
-}
-
-} // namespace opensn
\ No newline at end of file
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h
index dbd8735157..ead2c03bd9 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h
@@ -4,9 +4,10 @@
 #pragma once
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/async_comm.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds.h"
 #include "framework/data_types/byte_array.h"
 #include "mpicpp-lite/mpicpp-lite.h"
-#include <map>
+#include <unordered_map>
 #include <vector>
 #include <cstdint>
 #include <cstddef>
@@ -33,7 +34,7 @@ class CBC_AsynchronousCommunicator : public AsynchronousCommunicator
                                                   uint64_t cell_global_id,
                                                   unsigned int face_id,
                                                   size_t angle_set_id,
-                                                  size_t data_size) override;
+                                                  size_t data_size);
 
   bool SendData();
 
@@ -48,9 +49,22 @@ class CBC_AsynchronousCommunicator : public AsynchronousCommunicator
 protected:
   const size_t angle_set_id_;
 
-  // location_id, cell_global_id, face_id
-  using MessageKey = std::tuple<int, uint64_t, unsigned int>;
-  std::map<MessageKey, std::vector<double>> outgoing_message_queue_;
+  /// location_id, cell_global_id, face_id
+  using MessageKey = std::tuple<int, std::uint64_t, unsigned int>;
+
+  /// Hash functor for MessageKey (combines field hashes in the style of boost::hash_combine).
+  struct MessageKeyHash
+  {
+    std::size_t operator()(const MessageKey& key) const noexcept
+    {
+      size_t h = std::hash<int>{}(std::get<0>(key));
+      h ^= std::hash<std::uint64_t>{}(std::get<1>(key)) + 0x9e3779b9 + (h << 6) + (h >> 2);
+      h ^= std::hash<unsigned int>{}(std::get<2>(key)) + 0x9e3779b9 + (h << 6) + (h >> 2);
+      return h;
+    }
+  };
+
+  std::unordered_map<MessageKey, std::vector<double>, MessageKeyHash> outgoing_message_queue_;
 
   struct BufferItem
   {
@@ -61,10 +75,6 @@ class CBC_AsynchronousCommunicator : public AsynchronousCommunicator
     ByteArray data_array;
   };
   std::vector<BufferItem> send_buffer_;
-
-  // cell_global_id, face_id
-  using CellFaceKey = std::pair<uint64_t, unsigned int>;
-  void MergeDeplocsOutgoingMessages(std::map<CellFaceKey, std::vector<double>>& received_messages);
 };
 
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.cc
index 362d1c4570..c6e85f9be4 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.cc
@@ -26,6 +26,16 @@ CBC_FLUDS::CBC_FLUDS(unsigned int num_groups,
     local_psi_data_size_(num_local_spatial_dofs_ * num_groups_and_angles_),
     local_psi_data_(local_psi_data_size_)
 {
+  const auto& grid = *spds_.GetGrid();
+  cell_psi_start_.resize(grid.local_cells.size());
+  for (const auto& cell : grid.local_cells)
+  {
+    cell_psi_start_[cell.local_id] =
+      (sdm_.MapDOFLocal(cell, 0, psi_uk_man_, 0, 0) / num_angles_in_gs_quadrature_ / num_groups_) *
+      num_groups_and_angles_;
+  }
+
+  deplocs_outgoing_messages_.reserve(common_data.GetNumIncomingNonlocalFaces());
 }
 
 const FLUDSCommonData&
@@ -37,38 +47,19 @@ CBC_FLUDS::GetCommonData() const
 double*
 CBC_FLUDS::UpwindPsi(const Cell& face_neighbor, unsigned int adj_cell_node, size_t as_ss_idx)
 {
-  // Map to face neighbor cell's first spatial DOF index
-  // (0 to (num_local_spatial_dofs_ - 1))
-  const size_t face_nbr_spatial_dof_0_index =
-    (sdm_.MapDOFLocal(face_neighbor, 0, psi_uk_man_, 0, 0) / num_angles_in_gs_quadrature_ /
-     num_groups_);
-
-  // Index to start of neighbor cell's data block in local_psi_data_
-  const size_t face_nbr_data_start_index = face_nbr_spatial_dof_0_index * num_groups_and_angles_;
-  const size_t addr_offset = adj_cell_node * num_groups_and_angles_ + as_ss_idx * num_groups_;
-  const size_t face_nbr_data_index = face_nbr_data_start_index + addr_offset;
-
-  assert((face_nbr_data_index >= 0) and (face_nbr_data_index < local_psi_data_.size()));
-
-  return &local_psi_data_[face_nbr_data_index];
+  const size_t index = cell_psi_start_[face_neighbor.local_id] +
+                       adj_cell_node * num_groups_and_angles_ + as_ss_idx * num_groups_;
+  assert(index < local_psi_data_.size());
+  return &local_psi_data_[index];
 }
 
 double*
 CBC_FLUDS::OutgoingPsi(const Cell& cell, unsigned int cell_node, size_t as_ss_idx)
 {
-  // Map to current cell's first spatial DOF index
-  // (0 to (num_local_spatial_dofs_ - 1))
-  const size_t cur_cell_spatial_dof_0_index =
-    (sdm_.MapDOFLocal(cell, 0, psi_uk_man_, 0, 0) / num_angles_in_gs_quadrature_ / num_groups_);
-
-  // Index to start of current cell's data block in local_psi_data_
-  const size_t cur_cell_data_start_index = cur_cell_spatial_dof_0_index * num_groups_and_angles_;
-  const size_t addr_offset = cell_node * num_groups_and_angles_ + as_ss_idx * num_groups_;
-  const size_t cur_cell_data_index = cur_cell_data_start_index + addr_offset;
-
-  assert((cur_cell_data_index >= 0) and (cur_cell_data_index < local_psi_data_.size()));
-
-  return &local_psi_data_[cur_cell_data_index];
+  const size_t index =
+    cell_psi_start_[cell.local_id] + cell_node * num_groups_and_angles_ + as_ss_idx * num_groups_;
+  assert(index < local_psi_data_.size());
+  return &local_psi_data_[index];
 }
 
 double*
@@ -77,12 +68,15 @@ CBC_FLUDS::NLUpwindPsi(uint64_t cell_global_id,
                        unsigned int face_node_mapped,
                        size_t as_ss_idx)
 {
-  std::vector<double>& psi = deplocs_outgoing_messages_.at({cell_global_id, face_id});
+  auto it = deplocs_outgoing_messages_.find({cell_global_id, face_id});
+  if (it == deplocs_outgoing_messages_.end())
+    return nullptr;
+  auto& psi = it->second;
   const size_t dof_map =
     face_node_mapped * num_groups_and_angles_ + //  Offset to start of data for face_node_mapped
     as_ss_idx * num_groups_;                    // Offset to start of data for angle_set_index
 
-  assert((dof_map >= 0) and (dof_map < psi.size()));
+  assert(dof_map < psi.size());
   return &psi[dof_map];
 }
 
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h
index dc232ba52a..ba7a6467bf 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h
@@ -8,7 +8,7 @@
 #include "framework/math/unknown_manager/unknown_manager.h"
 #include "framework/math/spatial_discretization/spatial_discretization.h"
 #include <cstddef>
-#include <map>
+#include <unordered_map>
 #include <functional>
 
 namespace opensn
@@ -82,14 +82,6 @@ class CBC_FLUDS : public FLUDS
   void AllocatePrelocIOutgoingPsi() override {}
   void AllocateDelayedPrelocIOutgoingPsi() override {}
 
-  // cell_global_id, face_id
-  using CellFaceKey = std::pair<uint64_t, unsigned int>;
-
-  std::map<CellFaceKey, std::vector<double>>& GetDeplocsOutgoingMessages()
-  {
-    return deplocs_outgoing_messages_;
-  }
-
 protected:
   const CBC_FLUDSCommonData& common_data_;
   const UnknownManager& psi_uk_man_;
@@ -107,7 +99,8 @@ class CBC_FLUDS : public FLUDS
 
   std::vector<std::vector<double>> boundryI_incoming_psi_;
 
-  std::map<CellFaceKey, std::vector<double>> deplocs_outgoing_messages_;
+  /// Pre-computed start index into local_psi_data_ for each local cell, indexed by cell local id
+  std::vector<size_t> cell_psi_start_;
 };
 
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.cc
index 3fe986500f..354b0fd3a0 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.cc
@@ -3,14 +3,38 @@
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/spds.h"
+#include "framework/mesh/cell/cell.h"
+#include "framework/mesh/mesh_continuum/mesh_continuum.h"
 
 namespace opensn
 {
 
 CBC_FLUDSCommonData::CBC_FLUDSCommonData(
   const SPDS& spds, const std::vector<CellFaceNodalMapping>& grid_nodal_mappings)
-  : FLUDSCommonData(spds, grid_nodal_mappings)
+  : FLUDSCommonData(spds, grid_nodal_mappings),
+    num_incoming_nonlocal_faces_(0),
+    num_outgoing_nonlocal_faces_(0)
 {
+  // Pre-compute non-local face counts for hash map capacity reservation
+  const auto& grid = *spds.GetGrid();
+  const auto& face_orientations = spds.GetCellFaceOrientations();
+
+  for (const auto& cell : grid.local_cells)
+  {
+    for (size_t f = 0; f < cell.faces.size(); ++f)
+    {
+      const auto& face = cell.faces[f];
+      const auto orientation = face_orientations[cell.local_id][f];
+
+      if ((not face.has_neighbor) or (face.IsNeighborLocal(&grid)))
+        continue;
+
+      if (orientation == FaceOrientation::INCOMING)
+        ++num_incoming_nonlocal_faces_;
+      else if (orientation == FaceOrientation::OUTGOING)
+        ++num_outgoing_nonlocal_faces_;
+    }
+  }
 }
 
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h
index 62f1a461f7..a1cd93f7ad 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h
@@ -5,6 +5,7 @@
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds_common_data.h"
 #include <cinttypes>
+#include <cstddef>
 
 namespace opensn
 {
@@ -14,6 +15,14 @@ class CBC_FLUDSCommonData : public FLUDSCommonData
 public:
   CBC_FLUDSCommonData(const SPDS& spds,
                       const std::vector<CellFaceNodalMapping>& grid_nodal_mappings);
+
+  size_t GetNumIncomingNonlocalFaces() const { return num_incoming_nonlocal_faces_; }
+
+  size_t GetNumOutgoingNonlocalFaces() const { return num_outgoing_nonlocal_faces_; }
+
+private:
+  size_t num_incoming_nonlocal_faces_;
+  size_t num_outgoing_nonlocal_faces_;
 };
 
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.cu
index 1007396682..dde99da1f6 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.cu
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.cu
@@ -3,6 +3,7 @@
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h"
@@ -50,6 +51,7 @@ CBCD_FLUDS::CBCD_FLUDS(size_t num_groups,
     device_saved_psi_ = crb::DeviceMemory<double>(local_psi_data_size_);
   }
   CreatePointerSet();
+  deplocs_outgoing_messages_.reserve(common_data.GetNumIncomingNonlocalFaces());
 }
 
 CBCD_FLUDS::~CBCD_FLUDS()
@@ -192,7 +194,8 @@ CBCD_FLUDS::CopyOutgoingPsiBackToHost(CBCDSweepChunk& sweep_chunk,
         const auto& face_data_size = num_face_nodes * num_groups_and_angles_;
         const int locality =
           sweep_chunk.GetCellTransportView(node.cell_local_id).FaceLocality(node.face_id);
-        auto& async_comm = *angle_set->GetCommunicator();
+        auto& async_comm =
+          static_cast<CBC_AsynchronousCommunicator&>(*angle_set->GetCommunicator());
         std::vector<double>* psi_nonlocal_outgoing =
           &async_comm.InitGetDownwindMessageData(locality,
                                                  face.neighbor_id,
@@ -261,12 +264,15 @@ CBCD_FLUDS::NLUpwindPsi(uint64_t cell_global_id,
                         unsigned int face_node_mapped,
                         size_t as_ss_idx)
 {
-  std::vector<double>& psi = deplocs_outgoing_messages_.at({cell_global_id, face_id});
+  auto it = deplocs_outgoing_messages_.find({cell_global_id, face_id});
+  if (it == deplocs_outgoing_messages_.end())
+    return nullptr;
+  auto& psi = it->second;
   const size_t dof_map =
     face_node_mapped * num_groups_and_angles_ + //  Offset to start of data for face_node_mapped
     as_ss_idx * num_groups_;                    // Offset to start of data for angle_set_index
 
-  assert((dof_map >= 0) and (dof_map < psi.size()));
+  assert(dof_map < psi.size());
   return &psi[dof_map];
 }
 
@@ -280,4 +286,4 @@ CBCD_FLUDS::NLOutgoingPsi(std::vector<double>* psi_nonlocal_outgoing,
   return &(*psi_nonlocal_outgoing)[addr_offset];
 }
 
-} // namespace opensn
\ No newline at end of file
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h
index 9090fe2bd8..f466af2052 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h
@@ -10,7 +10,7 @@
 #include "caribou/main.hpp"
 #include <cstddef>
 #include <functional>
-#include <map>
+#include <unordered_map>
 
 namespace crb = caribou;
 
@@ -93,13 +93,6 @@ class CBCD_FLUDS : public FLUDS
   void AllocatePrelocIOutgoingPsi() override {}
   void AllocateDelayedPrelocIOutgoingPsi() override {}
 
-  // cell_global_id, face_id
-  using CellFaceKey = std::pair<uint64_t, unsigned int>;
-  std::map<CellFaceKey, std::vector<double>>& GetDeplocsOutgoingMessages()
-  {
-    return deplocs_outgoing_messages_;
-  }
-
 private:
   /// Reference to the common data.
   const CBCD_FLUDSCommonData& common_data_;
@@ -138,8 +131,6 @@ class CBCD_FLUDS : public FLUDS
   void CreatePointerSet();
 
   std::vector<std::vector<double>> boundaryI_incoming_psi_;
-
-  std::map<CellFaceKey, std::vector<double>> deplocs_outgoing_messages_;
 };
 
 } // namespace opensn
\ No newline at end of file
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cc
index ecdcd4023c..411bcebd7f 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cc
@@ -15,7 +15,9 @@ CBCD_FLUDSCommonData::CBCD_FLUDSCommonData(
   : FLUDSCommonData(spds, grid_nodal_mappings),
     num_incoming_boundary_nodes_(0),
     num_outgoing_boundary_nodes_(0),
+    num_incoming_nonlocal_faces_(0),
     num_incoming_nonlocal_nodes_(0),
+    num_outgoing_nonlocal_faces_(0),
     num_outgoing_nonlocal_nodes_(0),
     device_cell_face_node_map_(nullptr),
     incoming_boundary_node_map_(),
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cu
index ee527e3bac..98d2294a72 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cu
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cu
@@ -51,6 +51,15 @@ CBCD_FLUDSCommonData::CopyFlattenedNodeIndexToDevice(const SpatialDiscretization
       const bool is_incoming_face = (orientation == FaceOrientation::INCOMING);
       const bool is_local_face = face.IsNeighborLocal(&grid);
       const bool is_boundary_face = not face.has_neighbor;
+
+      if ((not is_local_face) and (not is_boundary_face))
+      {
+        if (is_incoming_face)
+          ++num_incoming_nonlocal_faces_;
+        else if (is_outgoing_face)
+          ++num_outgoing_nonlocal_faces_;
+      }
+
       for (size_t fn = 0; fn < num_face_nodes; ++fn)
       {
         CBCD_NodeIndex node_index;
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h
index ad193c07e7..1d61b5201e 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h
@@ -35,6 +35,12 @@ class CBCD_FLUDSCommonData : public FLUDSCommonData
   /// Get number of outgoing non-local face nodes.
   std::size_t GetNumOutgoingNonlocalNodes() const { return num_outgoing_nonlocal_nodes_; }
 
+  /// Get number of incoming non-local faces.
+  std::size_t GetNumIncomingNonlocalFaces() const { return num_incoming_nonlocal_faces_; }
+
+  /// Get number of outgoing non-local faces.
+  std::size_t GetNumOutgoingNonlocalFaces() const { return num_outgoing_nonlocal_faces_; }
+
   /// Get incoming boundary node map.
   const std::vector<BoundaryNodeInfo>& GetIncomingBoundaryNodeMap() const
   {
@@ -67,8 +73,12 @@ class CBCD_FLUDSCommonData : public FLUDSCommonData
   size_t num_incoming_boundary_nodes_;
   /// Number of outgoing boundary face nodes.
   size_t num_outgoing_boundary_nodes_;
+  /// Number of incoming non-local faces.
+  size_t num_incoming_nonlocal_faces_;
   /// Number of incoming non-local face nodes.
   size_t num_incoming_nonlocal_nodes_;
+  /// Number of outgoing non-local faces.
+  size_t num_outgoing_nonlocal_faces_;
   /// Number of outgoing non-local face nodes.
   size_t num_outgoing_nonlocal_nodes_;
   /// Device pointer to cell-face-node map for angular flux buffer access.
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds.h
index 3177c4c4bc..1097113a74 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds.h
@@ -10,6 +10,8 @@
 #include <vector>
 #include <cstddef>
 #include <cstdint>
+#include <unordered_map>
+#include <utility>
 
 namespace opensn
 {
@@ -59,6 +61,27 @@ class FLUDS
 
   virtual ~FLUDS() = default;
 
+  /// Key identifying one face of one cell: (cell_global_id, face_id).
+  using CellFaceKey = std::pair<std::uint64_t, unsigned int>;
+
+  /// Hash functor for CellFaceKey; mixes both members with the boost::hash_combine formula.
+  struct CellFaceKeyHash
+  {
+    size_t operator()(const CellFaceKey& key) const noexcept
+    {
+      const size_t seed = std::hash<std::uint64_t>{}(key.first);
+      const size_t face_hash = std::hash<unsigned int>{}(key.second);
+      // seed ^ (hash + 0x9e3779b9 + (seed << 6) + (seed >> 2)), as in boost::hash_combine
+      return seed ^ (face_hash + 0x9e3779b9 + (seed << 6) + (seed >> 2));
+    }
+  };
+
+  std::unordered_map<CellFaceKey, std::vector<double>, CellFaceKeyHash>&
+  GetDeplocsOutgoingMessages() // Mutable reference to the (cell_global_id, face_id)-keyed map.
+  {
+    return deplocs_outgoing_messages_;
+  }
+
 protected:
   const unsigned int num_groups_;
   const size_t num_angles_;
@@ -71,6 +94,8 @@ class FLUDS
   std::vector<std::span<double>> prelocI_outgoing_psi_view_;
   std::vector<std::span<double>> delayed_prelocI_outgoing_psi_view_;
   std::vector<std::span<double>> delayed_prelocI_outgoing_psi_old_view_;
+
+  std::unordered_map<CellFaceKey, std::vector<double>, CellFaceKeyHash> deplocs_outgoing_messages_;
 };
 
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_avx_sweep_chunk.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_avx_sweep_chunk.cc
index 2c9ce44dc5..e369c8d505 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_avx_sweep_chunk.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_avx_sweep_chunk.cc
@@ -11,213 +11,9 @@
 #include <array>
 #include <vector>
 
-#if __AVX512F__ || __AVX2__
-#include <immintrin.h>
-#endif
-
-#if __clang__ || __INTEL_COMPILER
-#define PRAGMA_UNROLL _Pragma("unroll")
-#elif __GNUC__
-#define PRAGMA_UNROLL _Pragma("GCC unroll 8")
-#else
-#define PRAGMA_UNROLL
-#endif
-
 namespace opensn
 {
 
-namespace detail
-{
-
-#if __AVX512F__
-struct AVX512Ops
-{
-  using avx_vec = __m512d;
-  using avx_index = __m512i;
-
-  static inline avx_vec LoadSigma(const double* sigma) { return _mm512_loadu_pd(sigma); }
-  static inline avx_vec Set1(double x) { return _mm512_set1_pd(x); }
-  static inline avx_vec Add(const avx_vec& a, const avx_vec& b) { return _mm512_add_pd(a, b); }
-  static inline avx_vec Sub(const avx_vec& a, const avx_vec& b) { return _mm512_sub_pd(a, b); }
-  static inline avx_vec Mul(const avx_vec& a, const avx_vec& b) { return _mm512_mul_pd(a, b); }
-  static inline avx_vec Div(const avx_vec& a, const avx_vec& b) { return _mm512_div_pd(a, b); }
-  static inline avx_vec Fmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c)
-  {
-    // a + b * c
-    return _mm512_fmadd_pd(b, c, a);
-  }
-  static inline avx_vec Fnmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c)
-  {
-    // c - a * b
-    return _mm512_fnmadd_pd(a, b, c);
-  }
-  static inline avx_vec Reciprocal(const avx_vec& v) { return Div(Set1(1.0), v); }
-  static inline avx_vec Gather(const avx_index& idx, const double* base)
-  {
-    return _mm512_i64gather_pd(idx, base, sizeof(double));
-  }
-  static inline void Scatter(const avx_index& idx, double* base, const avx_vec& value)
-  {
-    _mm512_i64scatter_pd(base, idx, value, sizeof(double));
-  }
-};
-#elif __AVX2__
-struct AVX2Ops
-{
-  using avx_vec = __m256d;
-  using avx_index = __m128i;
-
-  static inline avx_vec LoadSigma(const double* sigma) { return _mm256_loadu_pd(sigma); }
-  static inline avx_vec Set1(double x) { return _mm256_set1_pd(x); }
-  static inline avx_vec Add(const avx_vec& a, const avx_vec& b) { return _mm256_add_pd(a, b); }
-  static inline avx_vec Sub(const avx_vec& a, const avx_vec& b) { return _mm256_sub_pd(a, b); }
-  static inline avx_vec Mul(const avx_vec& a, const avx_vec& b) { return _mm256_mul_pd(a, b); }
-  static inline avx_vec Div(const avx_vec& a, const avx_vec& b) { return _mm256_div_pd(a, b); }
-
-#if __FMA__
-  static inline avx_vec Fmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c)
-  {
-    return _mm256_fmadd_pd(b, c, a);
-  }
-  static inline avx_vec Fnmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c)
-  {
-    return _mm256_fnmadd_pd(a, b, c);
-  }
-#else
-  static inline avx_vec Fmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c)
-  {
-    return Add(a, Mul(b, c));
-  }
-  static inline avx_vec Fnmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c)
-  {
-    return Sub(c, Mul(a, b));
-  }
-#endif
-
-  static inline avx_vec Reciprocal(const avx_vec& v) { return Div(Set1(1.0), v); }
-  static inline avx_vec Gather(const avx_index& idx, const double* base)
-  {
-    return _mm256_i32gather_pd(base, idx, sizeof(double));
-  }
-  static inline void Scatter(const avx_index& idx, double* base, const avx_vec& value)
-  {
-    alignas(32) double buffer[simd_width];
-    _mm256_store_pd(buffer, value);
-    alignas(16) int offsets[simd_width];
-    _mm_store_si128(reinterpret_cast<__m128i*>(offsets), idx);
-    for (int lane = 0; lane < simd_width; ++lane)
-      base[offsets[lane]] = buffer[lane];
-  }
-};
-#endif
-
-template <class Ops, int N>
-struct GatherIndexBuilder
-{
-  static typename Ops::avx_index Build(int /*unused*/)
-  {
-    static_assert(sizeof(Ops) == 0, "SIMD gather index helper not implemented for this Ops type.");
-    return typename Ops::avx_index{};
-  }
-};
-
-#if __AVX512F__
-template <int N>
-struct GatherIndexBuilder<AVX512Ops, N>
-{
-  static AVX512Ops::avx_index Build(int row)
-  {
-    long long vals[simd_width];
-    for (int lane = 0; lane < simd_width; ++lane)
-      vals[lane] = static_cast<long long>(lane * N + row);
-    return _mm512_setr_epi64(
-      vals[0], vals[1], vals[2], vals[3], vals[4], vals[5], vals[6], vals[7]);
-  }
-};
-#elif __AVX2__
-template <int N>
-struct GatherIndexBuilder<AVX2Ops, N>
-{
-  static AVX2Ops::avx_index Build(int row)
-  {
-    int vals[simd_width];
-    for (int lane = 0; lane < simd_width; ++lane)
-      vals[lane] = lane * N + row;
-    return _mm_setr_epi32(vals[0], vals[1], vals[2], vals[3]);
-  }
-};
-#endif
-
-namespace
-{
-
-template <class Ops, int N>
-inline typename Ops::avx_index
-MakeGatherIndex(int row)
-{
-  return GatherIndexBuilder<Ops, N>::Build(row);
-}
-
-template <class Ops, int N>
-inline void
-SimdBatchSolve(const double* Am, const double* Mm, const double* sigma_t, double* __restrict b)
-{
-  using avx_vec = typename Ops::avx_vec;
-
-  avx_vec rhs[N];
-  PRAGMA_UNROLL
-  for (int row = 0; row < N; ++row)
-    rhs[row] = Ops::Gather(MakeGatherIndex<Ops, N>(row), b);
-
-  const avx_vec sigma = Ops::LoadSigma(sigma_t);
-  avx_vec A[N * N];
-  PRAGMA_UNROLL
-  for (int i = 0; i < N; ++i)
-  {
-    PRAGMA_UNROLL
-    for (int j = 0; j < N; ++j)
-    {
-      const avx_vec Amij = Ops::Set1(Am[i * N + j]);
-      const avx_vec Mmij = Ops::Set1(Mm[i * N + j]);
-      A[i * N + j] = Ops::Fmadd(Amij, sigma, Mmij);
-    }
-  }
-
-  auto entry = [&](int i, int j) -> avx_vec& { return A[i * N + j]; };
-  PRAGMA_UNROLL
-  for (int pivot = 0; pivot < N; ++pivot)
-  {
-    const avx_vec inv = Ops::Reciprocal(entry(pivot, pivot));
-    PRAGMA_UNROLL
-    for (int row = pivot + 1; row < N; ++row)
-    {
-      const avx_vec factor = Ops::Mul(entry(row, pivot), inv);
-      rhs[row] = Ops::Fnmadd(factor, rhs[pivot], rhs[row]);
-      PRAGMA_UNROLL
-      for (int col = pivot + 1; col < N; ++col)
-        entry(row, col) = Ops::Fnmadd(factor, entry(pivot, col), entry(row, col));
-    }
-  }
-
-  PRAGMA_UNROLL
-  for (int pivot = N - 1; pivot >= 0; --pivot)
-  {
-    avx_vec rhs_vec = rhs[pivot];
-    PRAGMA_UNROLL
-    for (int col = pivot + 1; col < N; ++col)
-      rhs_vec = Ops::Fnmadd(entry(pivot, col), rhs[col], rhs_vec);
-    rhs[pivot] = Ops::Mul(rhs_vec, Ops::Reciprocal(entry(pivot, pivot)));
-  }
-
-  PRAGMA_UNROLL
-  for (int row = 0; row < N; ++row)
-    Ops::Scatter(MakeGatherIndex<Ops, N>(row), b, rhs[row]);
-}
-
-} // namespace
-
-} // namespace detail
-
 template <unsigned int NumNodes, bool time_dependent>
 void
 AAH_Sweep_FixedN(AAHSweepData& data, AngleSet& angle_set)
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_chunk.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_chunk.h
index c3f04c4cd3..f84baa93b3 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_chunk.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_chunk.h
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/sweep_chunk.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/avx_sweep_chunk_utils.h"
 #include "modules/linear_boltzmann_solvers/lbs_problem/groupset/lbs_groupset.h"
 #include "framework/math/spatial_discretization/spatial_discretization.h"
 #include <cstdint>
@@ -13,16 +14,6 @@
 namespace opensn
 {
 
-// experimental, to be moved to a higher level header file
-static constexpr size_t simd_width =
-#if __AVX512F__
-  8; // 8 lanes (512-bit, doubles)
-#elif __AVX2__
-  4; // 4 lanes (256-bit, doubles)
-#else
-  1; // scalar
-#endif
-
 class DiscreteOrdinatesProblem;
 
 class AAHSweepChunk : public SweepChunk
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_kernels.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_kernels.h
index c6a327678f..4c1e3da001 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_kernels.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_kernels.h
@@ -42,26 +42,6 @@ struct AAHSweepData
   unsigned int group_block_size;      // used by fixed-N/AVX path
 };
 
-inline size_t
-ComputeGroupBlockSize(size_t gs_size)
-{
-  if (gs_size <= simd_width)
-    return gs_size;
-
-  size_t target = 0;
-  if (gs_size >= 16 * simd_width)
-    target = 4 * simd_width;
-  else if (gs_size >= 4 * simd_width)
-    target = 2 * simd_width;
-  else
-    target = 1 * simd_width;
-
-  target = std::min(target, gs_size);
-  if (target >= simd_width)
-    target = (target / simd_width) * simd_width;
-  return target;
-}
-
 /// Generic sweep kernel (scalar), parameterized by time dependence.
 template <bool time_dependent>
 inline void
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/avx_sweep_chunk_utils.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/avx_sweep_chunk_utils.h
new file mode 100644
index 0000000000..dc1af04119
--- /dev/null
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/avx_sweep_chunk_utils.h
@@ -0,0 +1,241 @@
+// SPDX-FileCopyrightText: 2026 The OpenSn Authors <https://open-sn.github.io/opensn/>
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+
+#if __AVX512F__ || __AVX2__
+#include <immintrin.h>
+#endif
+
+#if __clang__ || __INTEL_COMPILER
+#define PRAGMA_UNROLL _Pragma("unroll")
+#elif __GNUC__
+#define PRAGMA_UNROLL _Pragma("GCC unroll 8")
+#else
+#define PRAGMA_UNROLL
+#endif
+
+namespace opensn
+{
+
+inline constexpr size_t simd_width = // doubles per SIMD vector for the enabled ISA
+#if __AVX512F__
+  8; // 8 lanes (512-bit, doubles)
+#elif __AVX2__
+  4; // 4 lanes (256-bit, doubles)
+#else
+  1; // scalar
+#endif
+
+inline size_t
+ComputeGroupBlockSize(size_t gs_size)
+{
+  if (gs_size <= simd_width)
+    return gs_size;
+
+  // Use 4, 2 or 1 SIMD vectors per block depending on the group-set size.
+  size_t vectors_per_block = 1;
+  if (gs_size >= 16 * simd_width)
+    vectors_per_block = 4;
+  else if (gs_size >= 4 * simd_width)
+    vectors_per_block = 2;
+
+  // Clamp to the group-set size and round down to a whole number of SIMD vectors.
+  const size_t clamped = std::min(vectors_per_block * simd_width, gs_size);
+  if (clamped < simd_width)
+    return clamped;
+  return (clamped / simd_width) * simd_width;
+}
+
+namespace detail
+{
+
+#if __AVX512F__
+struct AVX512Ops // AVX-512 double-precision primitives (compiled only when __AVX512F__ is set)
+{
+  using avx_vec = __m512d;   // 512-bit vector of doubles
+  using avx_index = __m512i; // 64-bit element offsets consumed by Gather/Scatter
+
+  static inline avx_vec LoadSigma(const double* sigma) { return _mm512_loadu_pd(sigma); }
+  static inline avx_vec Set1(double x) { return _mm512_set1_pd(x); }
+  static inline avx_vec Add(const avx_vec& a, const avx_vec& b) { return _mm512_add_pd(a, b); }
+  static inline avx_vec Sub(const avx_vec& a, const avx_vec& b) { return _mm512_sub_pd(a, b); }
+  static inline avx_vec Mul(const avx_vec& a, const avx_vec& b) { return _mm512_mul_pd(a, b); }
+  static inline avx_vec Div(const avx_vec& a, const avx_vec& b) { return _mm512_div_pd(a, b); }
+  static inline avx_vec Fmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c)
+  {
+    // Returns a + b * c; the intrinsic computes (arg0 * arg1) + arg2, hence the (b, c, a) order.
+    return _mm512_fmadd_pd(b, c, a);
+  }
+  static inline avx_vec Fnmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c)
+  {
+    // Returns c - a * b.
+    return _mm512_fnmadd_pd(a, b, c);
+  }
+  static inline avx_vec Reciprocal(const avx_vec& v) { return Div(Set1(1.0), v); } // 1.0 / v
+  static inline avx_vec Gather(const avx_index& idx, const double* base)
+  {
+    return _mm512_i64gather_pd(idx, base, sizeof(double)); // loads base[idx[lane]] per lane
+  }
+  static inline void Scatter(const avx_index& idx, double* base, const avx_vec& value)
+  {
+    _mm512_i64scatter_pd(base, idx, value, sizeof(double)); // base[idx[lane]] = value[lane]
+  }
+};
+#elif __AVX2__
+struct AVX2Ops // AVX2 double-precision primitives (compiled when __AVX2__ but not __AVX512F__)
+{
+  using avx_vec = __m256d;   // 256-bit vector of doubles
+  using avx_index = __m128i; // 32-bit element offsets consumed by Gather/Scatter
+
+  static inline avx_vec LoadSigma(const double* sigma) { return _mm256_loadu_pd(sigma); }
+  static inline avx_vec Set1(double x) { return _mm256_set1_pd(x); }
+  static inline avx_vec Add(const avx_vec& a, const avx_vec& b) { return _mm256_add_pd(a, b); }
+  static inline avx_vec Sub(const avx_vec& a, const avx_vec& b) { return _mm256_sub_pd(a, b); }
+  static inline avx_vec Mul(const avx_vec& a, const avx_vec& b) { return _mm256_mul_pd(a, b); }
+  static inline avx_vec Div(const avx_vec& a, const avx_vec& b) { return _mm256_div_pd(a, b); }
+
+#if __FMA__
+  static inline avx_vec Fmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c)
+  {
+    return _mm256_fmadd_pd(b, c, a); // a + b * c
+  }
+  static inline avx_vec Fnmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c)
+  {
+    return _mm256_fnmadd_pd(a, b, c); // c - a * b
+  }
+#else
+  static inline avx_vec Fmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c)
+  {
+    return Add(a, Mul(b, c)); // a + b * c as a separate multiply and add
+  }
+  static inline avx_vec Fnmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c)
+  {
+    return Sub(c, Mul(a, b)); // c - a * b as a separate multiply and subtract
+  }
+#endif
+
+  static inline avx_vec Reciprocal(const avx_vec& v) { return Div(Set1(1.0), v); } // 1.0 / v
+  static inline avx_vec Gather(const avx_index& idx, const double* base)
+  {
+    return _mm256_i32gather_pd(base, idx, sizeof(double)); // loads base[idx[lane]] per lane
+  }
+  static inline void Scatter(const avx_index& idx, double* base, const avx_vec& value)
+  {
+    alignas(32) double buffer[simd_width]; // values and offsets are spilled to aligned arrays
+    _mm256_store_pd(buffer, value);
+    alignas(16) int offsets[simd_width];
+    _mm_store_si128(reinterpret_cast<__m128i*>(offsets), idx);
+    for (int lane = 0; lane < static_cast<int>(simd_width); ++lane)
+      base[offsets[lane]] = buffer[lane]; // one scalar store per lane
+  }
+};
+#endif
+
+template <class Ops, int N>
+struct GatherIndexBuilder // Primary template; instantiating Build() is a compile-time error.
+{
+  static typename Ops::avx_index Build(int /*unused*/)
+  {
+    static_assert(sizeof(Ops) == 0, "SIMD gather index helper not implemented for this Ops type.");
+    return typename Ops::avx_index{};
+  }
+};
+
+#if __AVX512F__
+template <int N>
+struct GatherIndexBuilder<AVX512Ops, N>
+{
+  /// Lane l gets element offset l * N + row, i.e. entry `row` of the l-th consecutive N-vector.
+  static AVX512Ops::avx_index Build(int row)
+  {
+    static_assert(simd_width == 8, "AVX-512 gather indices assume eight double lanes.");
+    const long long r = row;
+    return _mm512_setr_epi64(
+      r, N + r, 2 * N + r, 3 * N + r, 4 * N + r, 5 * N + r, 6 * N + r, 7 * N + r);
+  }
+};
+#elif __AVX2__
+template <int N>
+struct GatherIndexBuilder<AVX2Ops, N>
+{
+  /// Build the four 32-bit element offsets {row, N + row, 2N + row, 3N + row}: lane l addresses
+  /// entry `row` of the l-th consecutive N-vector.
+  static AVX2Ops::avx_index Build(int row)
+  {
+    static_assert(simd_width == 4, "AVX2 gather indices assume four double lanes.");
+    return _mm_setr_epi32(row, N + row, 2 * N + row, 3 * N + row);
+  }
+};
+#endif
+
+/// Build the index vector addressing entry `row` of each lane's consecutive N-vector.
+template <class Ops, int N>
+inline typename Ops::avx_index
+MakeGatherIndex(int row)
+{
+  return GatherIndexBuilder<Ops, N>::Build(row);
+}
+
+/// Solve (Am + sigma_t[lane] * Mm) x = b in place for one N x N system per SIMD lane.
+template <class Ops, int N>
+inline void
+SimdBatchSolve(const double* Am, const double* Mm, const double* sigma_t, double* __restrict b)
+{
+  using avx_vec = typename Ops::avx_vec;
+
+  avx_vec rhs[N]; // rhs[row]: entry `row` of every lane's right-hand side (lane stride N in b)
+  PRAGMA_UNROLL
+  for (int row = 0; row < N; ++row)
+    rhs[row] = Ops::Gather(MakeGatherIndex<Ops, N>(row), b);
+
+  const avx_vec sigma = Ops::LoadSigma(sigma_t);
+  avx_vec A[N * N];
+  PRAGMA_UNROLL
+  for (int i = 0; i < N; ++i)
+  {
+    PRAGMA_UNROLL
+    for (int j = 0; j < N; ++j)
+      A[i * N + j] = Ops::Fmadd(Ops::Set1(Am[i * N + j]), sigma, Ops::Set1(Mm[i * N + j]));
+  }
+
+  // Forward elimination (no pivoting or zero-pivot check); keep each pivot reciprocal for reuse.
+  avx_vec inv_diag[N];
+  auto entry = [&](int i, int j) -> avx_vec& { return A[i * N + j]; };
+  PRAGMA_UNROLL
+  for (int pivot = 0; pivot < N; ++pivot)
+  {
+    inv_diag[pivot] = Ops::Reciprocal(entry(pivot, pivot));
+    PRAGMA_UNROLL
+    for (int row = pivot + 1; row < N; ++row)
+    {
+      const avx_vec factor = Ops::Mul(entry(row, pivot), inv_diag[pivot]);
+      rhs[row] = Ops::Fnmadd(factor, rhs[pivot], rhs[row]);
+      PRAGMA_UNROLL
+      for (int col = pivot + 1; col < N; ++col)
+        entry(row, col) = Ops::Fnmadd(factor, entry(pivot, col), entry(row, col));
+    }
+  }
+
+  // Back substitution; row `pivot` is not modified after its reciprocal was taken above.
+  PRAGMA_UNROLL
+  for (int pivot = N - 1; pivot >= 0; --pivot)
+  {
+    avx_vec rhs_vec = rhs[pivot];
+    PRAGMA_UNROLL
+    for (int col = pivot + 1; col < N; ++col)
+      rhs_vec = Ops::Fnmadd(entry(pivot, col), rhs[col], rhs_vec);
+    rhs[pivot] = Ops::Mul(rhs_vec, inv_diag[pivot]);
+  }
+
+  PRAGMA_UNROLL
+  for (int row = 0; row < N; ++row)
+    Ops::Scatter(MakeGatherIndex<Ops, N>(row), b, rhs[row]);
+}
+
+} // namespace detail
+
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_avx_sweep_chunk.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_avx_sweep_chunk.cc
new file mode 100644
index 0000000000..0b541d316c
--- /dev/null
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_avx_sweep_chunk.cc
@@ -0,0 +1,427 @@
+// SPDX-FileCopyrightText: 2026 The OpenSn Authors <https://open-sn.github.io/opensn/>
+// SPDX-License-Identifier: MIT
+
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h"
+#include "framework/utils/error.h"
+#include "caliper/cali.h"
+#include <algorithm>
+#include <array>
+#include <vector>
+
+namespace opensn
+{
+
+/// CBC sweep kernel for a cell with exactly NumNodes nodes: for every angle of the angle set it
+/// assembles the upwinded system, solves all groups and stores flux moments and outgoing fluxes.
+template <unsigned int NumNodes, bool time_dependent>
+void
+CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set)
+{
+  CALI_CXX_MARK_SCOPE("CBC_Sweep_FixedN");
+
+  static_assert(NumNodes >= 2 and NumNodes <= 8);
+
+  const auto& groupset = data.groupset;
+  const auto& m2d_op = groupset.quadrature->GetMomentToDiscreteOperator();
+  const auto& d2m_op = groupset.quadrature->GetDiscreteToMomentOperator();
+
+  OpenSnInvalidArgumentIf(data.cell_num_nodes != static_cast<size_t>(NumNodes),
+                          "CBC_Sweep_FixedN invoked for an incompatible cell topology.");
+
+  const auto& face_orientations = angle_set.GetSPDS().GetCellFaceOrientations()[data.cell_local_id];
+  const auto& sigma_t = data.xs.at(data.cell.block_id)->GetSigmaTotal();
+
+  constexpr size_t matrix_size = static_cast<size_t>(NumNodes) * static_cast<size_t>(NumNodes);
+  auto idx = [](size_t i, size_t j) -> size_t { return i * NumNodes + j; };
+
+  std::array<double, matrix_size> mass_matrix{};
+  PRAGMA_UNROLL
+  for (size_t i = 0; i < NumNodes; ++i)
+  {
+    PRAGMA_UNROLL
+    for (size_t j = 0; j < NumNodes; ++j)
+      mass_matrix[idx(i, j)] = data.M(i, j);
+  }
+
+  std::vector<std::array<size_t, NumNodes>> moment_dof_map(data.num_moments);
+  for (unsigned int m = 0; m < data.num_moments; ++m)
+  {
+    PRAGMA_UNROLL
+    for (size_t i = 0; i < NumNodes; ++i)
+      moment_dof_map[m][i] = data.cell_transport_view.MapDOF(i, m, data.gs_gi);
+  }
+
+  std::array<double, matrix_size> Amat{};
+  std::vector<double> b(static_cast<std::size_t>(data.gs_size) * NumNodes, 0.0); // [group][node]
+  std::vector<double> sigma_block;
+  sigma_block.reserve(data.group_block_size);
+  std::vector<double> face_mu_values(data.cell_num_faces);
+
+  std::vector<double> tau_gsg;
+  if constexpr (time_dependent)
+  {
+    const auto& inv_velg = data.xs.at(data.cell.block_id)->GetInverseVelocity();
+    const double inv_theta = 1.0 / data.problem.GetTheta();
+    const double inv_dt = 1.0 / data.problem.GetTimeStep();
+
+    tau_gsg.assign(data.gs_size, 0.0);
+    for (size_t gsg = 0; gsg < data.gs_size; ++gsg)
+      tau_gsg[gsg] = inv_velg[data.gs_gi + gsg] * inv_theta * inv_dt;
+  }
+
+  const double* psi_old =
+    (time_dependent and data.psi_old)
+      ? &(*data.psi_old)[data.discretization.MapDOFLocal(data.cell, 0, groupset.psi_uk_man_, 0, 0)]
+      : nullptr;
+
+  const auto& as_angle_indices = angle_set.GetAngleIndices();
+
+  for (size_t as_ss_idx = 0; as_ss_idx < data.num_angles_in_as; ++as_ss_idx)
+  {
+    const auto direction_num = as_angle_indices[as_ss_idx];
+    const auto omega = groupset.quadrature->omegas[direction_num];
+    const auto wt = groupset.quadrature->weights[direction_num];
+
+    std::fill(b.begin(), b.end(), 0.0); // b is rebuilt for every angle
+
+    PRAGMA_UNROLL
+    for (size_t i = 0; i < NumNodes; ++i)
+    {
+      PRAGMA_UNROLL
+      for (size_t j = 0; j < NumNodes; ++j)
+        Amat[idx(i, j)] = omega.Dot(data.G(i, j));
+    }
+
+    for (size_t f = 0; f < data.cell_num_faces; ++f)
+      face_mu_values[f] = omega.Dot(data.cell.faces[f].normal);
+
+    for (size_t f = 0; f < data.cell_num_faces; ++f)
+    {
+      if (face_orientations[f] != FaceOrientation::INCOMING)
+        continue;
+
+      const auto& face = data.cell.faces[f];
+      const bool is_local_face = data.cell_transport_view.IsFaceLocal(f);
+      const bool is_boundary_face = not face.has_neighbor;
+      const auto* face_nodal_mapping =
+        &data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f);
+
+      const auto& Ms_f = data.M_surf[f];
+      const size_t num_face_nodes = data.cell_mapping.GetNumFaceNodes(f);
+      const double mu_f = -face_mu_values[f];
+
+      for (size_t fj = 0; fj < num_face_nodes; ++fj)
+      {
+        const int j = data.cell_mapping.MapFaceNode(f, fj);
+
+        const double* psi = nullptr;
+        if (is_local_face)
+          psi = data.fluds.UpwindPsi(*data.cell_transport_view.FaceNeighbor(f),
+                                     face_nodal_mapping->cell_node_mapping_[fj],
+                                     as_ss_idx);
+        else if (not is_boundary_face)
+          psi = data.fluds.NLUpwindPsi(
+            data.cell.global_id, f, face_nodal_mapping->face_node_mapping_[fj], as_ss_idx);
+        else
+          psi = angle_set.PsiBoundary(face.neighbor_id,
+                                      direction_num,
+                                      data.cell_local_id,
+                                      f,
+                                      fj,
+                                      data.gs_gi,
+                                      data.surface_source_active);
+
+        for (size_t fi = 0; fi < num_face_nodes; ++fi)
+        {
+          const int i = data.cell_mapping.MapFaceNode(f, fi);
+          const double mu_Nij = mu_f * Ms_f(i, j);
+          Amat[idx(i, j)] += mu_Nij;
+
+          if (not psi)
+            continue;
+
+          for (size_t gsg = 0; gsg < data.gs_size; ++gsg)
+            b[gsg * NumNodes + i] += psi[gsg] * mu_Nij;
+        }
+      }
+    }
+
+    const auto dir_moment_offset =
+      static_cast<std::size_t>(direction_num) * static_cast<std::size_t>(data.num_moments);
+    const double* __restrict m2d_row = m2d_op.data() + dir_moment_offset;
+    const double* __restrict d2m_row = d2m_op.data() + dir_moment_offset;
+
+    for (unsigned int g0 = 0; g0 < data.gs_size; g0 += data.group_block_size)
+    {
+      const auto g1 = std::min(g0 + data.group_block_size, static_cast<unsigned int>(data.gs_size));
+      const auto block_len = g1 - g0;
+      sigma_block.resize(block_len);
+
+      for (unsigned int gsg = g0; gsg < g1; ++gsg)
+      {
+        const size_t rel = gsg - g0;
+        double sigma_tg = sigma_t[data.gs_gi + gsg];
+        if constexpr (time_dependent)
+          sigma_tg += tau_gsg[gsg];
+        sigma_block[rel] = sigma_tg;
+
+        double* __restrict bg = &b[static_cast<std::size_t>(gsg) * NumNodes];
+        for (unsigned int m = 0; m < data.num_moments; ++m)
+        {
+          const double w = m2d_row[m];
+          std::array<double, NumNodes> nodal_source{};
+          for (size_t i = 0; i < NumNodes; ++i)
+            nodal_source[i] = w * data.source_moments[moment_dof_map[m][i] + gsg];
+
+          for (size_t i = 0; i < NumNodes; ++i)
+          {
+            double value = 0.0;
+            const double* row = &mass_matrix[idx(i, 0)];
+            PRAGMA_UNROLL
+            for (size_t j = 0; j < NumNodes; ++j)
+              value += row[j] * nodal_source[j];
+            bg[i] += value;
+          }
+        }
+      }
+
+      if constexpr (time_dependent)
+      {
+        if (data.include_rhs_time_term and psi_old)
+        {
+          for (size_t gsg = g0; gsg < g1; ++gsg)
+          {
+            const double tau = tau_gsg[gsg];
+            double* __restrict bg = &b[gsg * NumNodes];
+
+            for (size_t i = 0; i < NumNodes; ++i)
+            {
+              double value = 0.0;
+              const double* row = &mass_matrix[idx(i, 0)];
+              PRAGMA_UNROLL
+              for (size_t j = 0; j < NumNodes; ++j)
+              {
+                const size_t imap =
+                  j * data.groupset_angle_group_stride + direction_num * data.groupset_group_stride;
+                const double psi_old_val = psi_old[imap + gsg];
+                value += row[j] * psi_old_val;
+              }
+              bg[i] += tau * value;
+            }
+          }
+        }
+      }
+
+      size_t k = 0; // number of groups of this block already solved
+
+#if __AVX512F__
+      for (; k + simd_width <= block_len; k += simd_width)
+        detail::SimdBatchSolve<detail::AVX512Ops, NumNodes>(
+          Amat.data(), mass_matrix.data(), &sigma_block[k], &b[(g0 + k) * NumNodes]);
+#elif __AVX2__
+      for (; k + simd_width <= block_len; k += simd_width)
+        detail::SimdBatchSolve<detail::AVX2Ops, NumNodes>(
+          Amat.data(), mass_matrix.data(), &sigma_block[k], &b[(g0 + k) * NumNodes]);
+#endif
+
+      for (; k < block_len; ++k) // scalar solve for groups the SIMD batches did not cover
+      {
+        const size_t gsg = g0 + k;
+        const double sigma_tg = sigma_block[k];
+
+        std::array<double, matrix_size> A{};
+        PRAGMA_UNROLL
+        for (size_t i = 0; i < NumNodes; ++i)
+        {
+          PRAGMA_UNROLL
+          for (size_t j = 0; j < NumNodes; ++j)
+            A[idx(i, j)] = Amat[idx(i, j)] + sigma_tg * mass_matrix[idx(i, j)];
+        }
+
+        double* __restrict bg = &b[gsg * NumNodes];
+
+        for (size_t pivot = 0; pivot < NumNodes; ++pivot) // forward elimination, no pivoting
+        {
+          const double inv = 1.0 / A[idx(pivot, pivot)];
+          for (size_t row = pivot + 1; row < NumNodes; ++row)
+          {
+            const double factor = A[idx(row, pivot)] * inv;
+            bg[row] -= factor * bg[pivot];
+            PRAGMA_UNROLL
+            for (size_t col = pivot + 1; col < NumNodes; ++col)
+              A[idx(row, col)] -= factor * A[idx(pivot, col)];
+          }
+        }
+
+        for (size_t pivot = NumNodes; pivot-- > 0;) // back substitution, last row first
+        {
+          PRAGMA_UNROLL
+          for (size_t col = pivot + 1; col < NumNodes; ++col)
+            bg[pivot] -= A[idx(pivot, col)] * bg[col];
+          bg[pivot] /= A[idx(pivot, pivot)];
+        }
+      }
+
+      for (size_t gsg = g0; gsg < g1; ++gsg)
+      {
+        const double* __restrict bg = &b[gsg * NumNodes];
+        for (unsigned int m = 0; m < data.num_moments; ++m)
+        {
+          const double w = d2m_row[m];
+          PRAGMA_UNROLL
+          for (size_t i = 0; i < NumNodes; ++i)
+          {
+            // Reuse the DOF offset cached in moment_dof_map rather than calling MapDOF again.
+            data.destination_phi[moment_dof_map[m][i] + gsg] += w * bg[i];
+          }
+        }
+      }
+    }
+
+    if (data.save_angular_flux)
+    {
+      double* psi_new = &data.destination_psi[data.discretization.MapDOFLocal(
+        data.cell, 0, groupset.psi_uk_man_, 0, 0)];
+
+      double theta = 1.0;
+      double inv_theta = 1.0;
+      if constexpr (time_dependent)
+      {
+        theta = data.problem.GetTheta();
+        inv_theta = 1.0 / theta;
+      }
+
+      PRAGMA_UNROLL
+      for (size_t i = 0; i < NumNodes; ++i)
+      {
+        const size_t imap =
+          i * data.groupset_angle_group_stride + direction_num * data.groupset_group_stride;
+
+        for (size_t gsg = 0; gsg < data.gs_size; ++gsg)
+        {
+          const double psi_sol = b[gsg * NumNodes + i];
+          if constexpr (time_dependent)
+          {
+            const double psi_old_val = psi_old ? psi_old[imap + gsg] : 0.0;
+            psi_new[imap + gsg] = inv_theta * (psi_sol + (theta - 1.0) * psi_old_val);
+          }
+          else
+            psi_new[imap + gsg] = psi_sol;
+        }
+      }
+    }
+
+    for (size_t f = 0; f < data.cell_num_faces; ++f)
+    {
+      if (face_orientations[f] != FaceOrientation::OUTGOING)
+        continue;
+
+      const auto& face = data.cell.faces[f];
+      const bool is_local_face = data.cell_transport_view.IsFaceLocal(f);
+      const bool is_boundary_face = not face.has_neighbor;
+      const bool is_reflecting_boundary_face =
+        (is_boundary_face and angle_set.GetBoundaries()[face.neighbor_id]->IsReflecting());
+      const auto& IntF_shapeI = data.IntS_shapeI[f];
+
+      const int locality = data.cell_transport_view.FaceLocality(f);
+      const size_t num_face_nodes = data.cell_mapping.GetNumFaceNodes(f);
+      const auto& face_nodal_mapping =
+        data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f);
+      std::vector<double>* psi_nonlocal_outgoing = nullptr;
+
+      if (not is_boundary_face and not is_local_face)
+      {
+        auto* async_comm = dynamic_cast<CBC_AsynchronousCommunicator*>(angle_set.GetCommunicator());
+        const size_t data_size_for_msg = num_face_nodes * data.group_angle_stride;
+        psi_nonlocal_outgoing =
+          &async_comm->InitGetDownwindMessageData(locality,
+                                                  face.neighbor_id,
+                                                  face_nodal_mapping.associated_face_,
+                                                  angle_set.GetID(),
+                                                  data_size_for_msg);
+      }
+
+      const double mu_wt_f = wt * face_mu_values[f];
+
+      for (size_t fi = 0; fi < num_face_nodes; ++fi)
+      {
+        const int i = data.cell_mapping.MapFaceNode(f, fi);
+
+        if (is_boundary_face)
+        {
+          const double flux_i = mu_wt_f * IntF_shapeI(i);
+          for (size_t gsg = 0; gsg < data.gs_size; ++gsg)
+            data.cell_transport_view.AddOutflow(
+              f, data.gs_gi + gsg, flux_i * b[gsg * NumNodes + i]);
+        }
+
+        double* psi = nullptr;
+        if (is_local_face)
+          psi = data.fluds.OutgoingPsi(data.cell, i, as_ss_idx);
+        else if (not is_boundary_face)
+          psi = data.fluds.NLOutgoingPsi(psi_nonlocal_outgoing, fi, as_ss_idx);
+        else if (is_reflecting_boundary_face)
+          psi = angle_set.PsiReflected(face.neighbor_id, direction_num, data.cell_local_id, f, fi);
+
+        if (psi != nullptr)
+        {
+          for (size_t gsg = 0; gsg < data.gs_size; ++gsg)
+            psi[gsg] = b[gsg * NumNodes + i];
+        }
+      }
+    }
+  }
+}
+
+template <unsigned int NumNodes>
+void
+CBCSweepChunk::Sweep_FixedN(AngleSet& angle_set) // Build the sweep data, run the fixed-N kernel.
+{
+  CALI_CXX_MARK_SCOPE("CBCSweepChunk::Sweep_FixedN");
+
+  auto data = MakeCBCSweepData(discretization_,
+                               source_moments_,
+                               groupset_,
+                               xs_,
+                               num_moments_,
+                               max_num_cell_dofs_,
+                               SaveAngularFluxEnabled(),
+                               groupset_angle_group_stride_,
+                               groupset_group_stride_,
+                               destination_phi_,
+                               destination_psi_,
+                               include_rhs_time_term_,
+                               problem_,
+                               nullptr, // NOTE(review): presumably the psi_old argument - confirm
+                               group_block_size_,
+                               ctx_);
+
+  CBC_Sweep_FixedN<NumNodes, false>(data, angle_set); // time_dependent = false
+}
+
+template void CBC_Sweep_FixedN<2, false>(CBCSweepData&, AngleSet&); // time_dependent = false
+template void CBC_Sweep_FixedN<3, false>(CBCSweepData&, AngleSet&);
+template void CBC_Sweep_FixedN<4, false>(CBCSweepData&, AngleSet&);
+template void CBC_Sweep_FixedN<5, false>(CBCSweepData&, AngleSet&);
+template void CBC_Sweep_FixedN<6, false>(CBCSweepData&, AngleSet&);
+template void CBC_Sweep_FixedN<7, false>(CBCSweepData&, AngleSet&);
+template void CBC_Sweep_FixedN<8, false>(CBCSweepData&, AngleSet&);
+
+template void CBC_Sweep_FixedN<2, true>(CBCSweepData&, AngleSet&); // time_dependent = true
+template void CBC_Sweep_FixedN<3, true>(CBCSweepData&, AngleSet&);
+template void CBC_Sweep_FixedN<4, true>(CBCSweepData&, AngleSet&);
+template void CBC_Sweep_FixedN<5, true>(CBCSweepData&, AngleSet&);
+template void CBC_Sweep_FixedN<6, true>(CBCSweepData&, AngleSet&);
+template void CBC_Sweep_FixedN<7, true>(CBCSweepData&, AngleSet&);
+template void CBC_Sweep_FixedN<8, true>(CBCSweepData&, AngleSet&);
+
+template void CBCSweepChunk::Sweep_FixedN<2>(AngleSet&); // member entry points, NumNodes 2..8
+template void CBCSweepChunk::Sweep_FixedN<3>(AngleSet&);
+template void CBCSweepChunk::Sweep_FixedN<4>(AngleSet&);
+template void CBCSweepChunk::Sweep_FixedN<5>(AngleSet&);
+template void CBCSweepChunk::Sweep_FixedN<6>(AngleSet&);
+template void CBCSweepChunk::Sweep_FixedN<7>(AngleSet&);
+template void CBCSweepChunk::Sweep_FixedN<8>(AngleSet&);
+
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc
index 24bbf8ef5f..70ee7c82d3 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: MIT
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h"
 #include "modules/linear_boltzmann_solvers/lbs_problem/groupset/lbs_groupset.h"
 #include "framework/math/spatial_discretization/spatial_discretization.h"
@@ -26,6 +27,8 @@ CBCSweepChunk::CBCSweepChunk(DiscreteOrdinatesProblem& problem, LBSGroupset& gro
                problem.GetNumMoments(),
                problem.GetMaxCellDOFCount(),
                problem.GetMinCellDOFCount()),
+    sweep_impl_(&CBCSweepChunk::Sweep_Generic),
+    group_block_size_(0),
     fluds_(nullptr),
     gs_size_(0),
     gs_gi_(0),
@@ -40,6 +43,37 @@ CBCSweepChunk::CBCSweepChunk(DiscreteOrdinatesProblem& problem, LBSGroupset& gro
     cell_num_faces_(0),
     cell_num_nodes_(0)
 {
+  if ((min_num_cell_dofs_ == max_num_cell_dofs_) and (min_num_cell_dofs_ >= 2)
+      and (min_num_cell_dofs_ <= 8))
+  {
+    switch (min_num_cell_dofs_)
+    {
+      case 2:
+        sweep_impl_ = &CBCSweepChunk::Sweep_FixedN<2>;
+        break;
+      case 3:
+        sweep_impl_ = &CBCSweepChunk::Sweep_FixedN<3>;
+        break;
+      case 4:
+        sweep_impl_ = &CBCSweepChunk::Sweep_FixedN<4>;
+        break;
+      case 5:
+        sweep_impl_ = &CBCSweepChunk::Sweep_FixedN<5>;
+        break;
+      case 6:
+        sweep_impl_ = &CBCSweepChunk::Sweep_FixedN<6>;
+        break;
+      case 7:
+        sweep_impl_ = &CBCSweepChunk::Sweep_FixedN<7>;
+        break;
+      case 8:
+        sweep_impl_ = &CBCSweepChunk::Sweep_FixedN<8>;
+        break;
+      default:
+        break;
+    }
+  }
+  group_block_size_ = ComputeGroupBlockSize(groupset_.GetNumGroups());
 }
 
 void
@@ -69,14 +103,21 @@ CBCSweepChunk::SetCell(const Cell* cell_ptr, AngleSet& angle_set)
   cell_num_nodes_ = cell_mapping_->GetNumNodes();
 
   // Get cell matrices
-  G_ = unit_cell_matrices_[cell_local_id_].intV_shapeI_gradshapeJ;
-  M_ = unit_cell_matrices_[cell_local_id_].intV_shapeI_shapeJ;
-  M_surf_ = unit_cell_matrices_[cell_local_id_].intS_shapeI_shapeJ;
-  IntS_shapeI_ = unit_cell_matrices_[cell_local_id_].intS_shapeI;
+  const auto& unit_mats = unit_cell_matrices_[cell_local_id_];
+  G_ = &unit_mats.intV_shapeI_gradshapeJ;
+  M_ = &unit_mats.intV_shapeI_shapeJ;
+  M_surf_ = &unit_mats.intS_shapeI_shapeJ;
+  IntS_shapeI_ = &unit_mats.intS_shapeI;
 }
 
 void
 CBCSweepChunk::Sweep(AngleSet& angle_set)
+{
+  (this->*sweep_impl_)(angle_set); // chosen in the constructor: Sweep_Generic or Sweep_FixedN<N>
+}
+
+void
+CBCSweepChunk::Sweep_Generic(AngleSet& angle_set)
 {
   const auto& m2d_op = groupset_.quadrature->GetMomentToDiscreteOperator();
   const auto& d2m_op = groupset_.quadrature->GetDiscreteToMomentOperator();
@@ -108,7 +149,7 @@ CBCSweepChunk::Sweep(AngleSet& angle_set)
 
     for (size_t i = 0; i < cell_num_nodes_; ++i)
       for (size_t j = 0; j < cell_num_nodes_; ++j)
-        Amat(i, j) = omega.Dot(G_(i, j));
+        Amat(i, j) = omega.Dot((*G_)(i, j));
 
     // Update face orientations
     for (size_t f = 0; f < cell_num_faces_; ++f)
@@ -136,7 +177,7 @@ CBCSweepChunk::Sweep(AngleSet& angle_set)
         {
           const int j = cell_mapping_->MapFaceNode(f, fj);
 
-          const double mu_Nij = -face_mu_values[f] * M_surf_[f](i, j);
+          const double mu_Nij = -face_mu_values[f] * (*M_surf_)[f](i, j);
           Amat(i, j) += mu_Nij;
 
           const double* psi = nullptr;
@@ -194,7 +235,7 @@ CBCSweepChunk::Sweep(AngleSet& angle_set)
         double temp = 0.0;
         for (size_t j = 0; j < cell_num_nodes_; ++j)
         {
-          const double Mij = M_(i, j);
+          const double Mij = (*M_)(i, j);
           Atemp(i, j) = Amat(i, j) + Mij * sigma_tg;
           temp += Mij * source[j];
         }
@@ -244,7 +285,7 @@ CBCSweepChunk::Sweep(AngleSet& angle_set)
       const bool is_boundary_face = not face.has_neighbor;
       const bool is_reflecting_boundary_face =
         (is_boundary_face and angle_set.GetBoundaries()[face.neighbor_id]->IsReflecting());
-      const auto& IntF_shapeI = IntS_shapeI_[f];
+      const auto& IntF_shapeI = (*IntS_shapeI_)[f];
 
       const int locality = cell_transport_view_->FaceLocality(f);
       const size_t num_face_nodes = cell_mapping_->GetNumFaceNodes(f);
@@ -254,7 +295,7 @@ CBCSweepChunk::Sweep(AngleSet& angle_set)
 
       if (not is_boundary_face and not is_local_face)
       {
-        auto& async_comm = *angle_set.GetCommunicator();
+        auto& async_comm = static_cast<CBC_AsynchronousCommunicator&>(*angle_set.GetCommunicator());
         const size_t data_size_for_msg = num_face_nodes * group_angle_stride_;
         psi_nonlocal_outgoing =
           &async_comm.InitGetDownwindMessageData(locality,
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h
index 0e8bf6fbfc..a07b78daf2 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h
@@ -5,6 +5,7 @@
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/sweep_chunk.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/avx_sweep_chunk_utils.h"
 
 namespace opensn
 {
@@ -51,7 +52,16 @@ class CBCSweepChunk : public SweepChunk
    */
   void Sweep(AngleSet& angle_set) override;
 
-protected:
+private:
+  using SweepFunc = void (CBCSweepChunk::*)(AngleSet&);
+  SweepFunc sweep_impl_ = nullptr;
+
+  void Sweep_Generic(AngleSet& angle_set);
+  template <unsigned int NumNodes>
+  void Sweep_FixedN(AngleSet& angle_set);
+
+  unsigned int group_block_size_;
+
   CBC_FLUDS* fluds_;
   size_t gs_size_;
   unsigned int gs_gi_;
@@ -67,10 +77,10 @@ class CBCSweepChunk : public SweepChunk
   size_t cell_num_faces_;
   size_t cell_num_nodes_;
 
-  DenseMatrix<Vector3> G_;
-  DenseMatrix<double> M_;
-  std::vector<DenseMatrix<double>> M_surf_;
-  std::vector<Vector<double>> IntS_shapeI_;
+  const DenseMatrix<Vector3>* G_;
+  const DenseMatrix<double>* M_;
+  const std::vector<DenseMatrix<double>>* M_surf_;
+  const std::vector<Vector<double>>* IntS_shapeI_;
 };
 
 } // namespace opensn

From a0444eea18e470165fbcef9c248a80e92e137efb Mon Sep 17 00:00:00 2001
From: Eappen Nelluvelil <eappen@tamu.edu>
Date: Tue, 31 Mar 2026 23:30:58 -0500
Subject: [PATCH 2/6] Time-dependent CBC sweep chunk

---
 .../discrete_ordinates_problem.cc             |   7 +-
 .../sweep_chunks/cbc_sweep_chunk.cc           | 291 ++---------------
 .../sweep_chunks/cbc_sweep_chunk.h            |  31 +-
 .../sweep_chunks/cbc_sweep_chunk_shared.h     | 126 +++++++
 .../sweep_chunks/cbc_sweep_chunk_td.cc        | 148 +++++++++
 .../sweep_chunks/cbc_sweep_chunk_td.h         |  39 +++
 .../sweep_chunks/cbc_sweep_kernels.h          | 308 ++++++++++++++++++
 7 files changed, 661 insertions(+), 289 deletions(-)
 create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h
 create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.cc
 create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h
 create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h

diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc
index 74f1919c60..1a375321e1 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc
@@ -16,6 +16,7 @@
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_chunk.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_chunk_td.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/iterative_methods/sweep_wgs_context.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/io/discrete_ordinates_problem_io.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/iterative_methods/ags_linear_solver.h"
@@ -1882,10 +1883,6 @@ DiscreteOrdinatesProblem::SetSweepChunk(LBSGroupset& groupset)
 
   const bool use_time_dependent_chunk = (mode == SweepChunkMode::TIME_DEPENDENT);
 
-  if (use_time_dependent_chunk && sweep_type_ != "AAH")
-    throw std::invalid_argument(GetName() +
-                                ": Time dependent is only supported with sweep_type='AAH'.");
-
   if (sweep_type_ == "AAH")
   {
     if (use_time_dependent_chunk)
@@ -1896,6 +1893,8 @@ DiscreteOrdinatesProblem::SetSweepChunk(LBSGroupset& groupset)
   }
   else if (sweep_type_ == "CBC")
   {
+    if (use_time_dependent_chunk)
+      return std::make_shared<CBCSweepChunkTD>(*this, groupset);
     if (use_gpus_)
       return CreateCBCDSweepChunk(groupset);
     return std::make_shared<CBCSweepChunk>(*this, groupset);
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc
index 70ee7c82d3..b40c3d7ed0 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc
@@ -2,13 +2,7 @@
 // SPDX-License-Identifier: MIT
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.h"
-#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h"
-#include "modules/linear_boltzmann_solvers/lbs_problem/groupset/lbs_groupset.h"
-#include "framework/math/spatial_discretization/spatial_discretization.h"
-#include "framework/mesh/mesh_continuum/mesh_continuum.h"
-#include "framework/mesh/cell/cell.h"
-#include "framework/logging/log.h"
 #include "caliper/cali.h"
 
 namespace opensn
@@ -27,24 +21,11 @@ CBCSweepChunk::CBCSweepChunk(DiscreteOrdinatesProblem& problem, LBSGroupset& gro
                problem.GetNumMoments(),
                problem.GetMaxCellDOFCount(),
                problem.GetMinCellDOFCount()),
-    sweep_impl_(&CBCSweepChunk::Sweep_Generic),
-    group_block_size_(0),
-    fluds_(nullptr),
-    gs_size_(0),
-    gs_gi_(0),
-    num_angles_in_as_(0),
-    group_stride_(0),
-    group_angle_stride_(0),
-    surface_source_active_(false),
-    cell_(nullptr),
-    cell_local_id_(0),
-    cell_mapping_(nullptr),
-    cell_transport_view_(nullptr),
-    cell_num_faces_(0),
-    cell_num_nodes_(0)
+    problem_(problem),
+    sweep_impl_(&CBCSweepChunk::Sweep_Generic)
 {
-  if ((min_num_cell_dofs_ == max_num_cell_dofs_) and (min_num_cell_dofs_ >= 2)
-      and (min_num_cell_dofs_ <= 8))
+  if ((min_num_cell_dofs_ == max_num_cell_dofs_) and (min_num_cell_dofs_ >= 2) and
+      (min_num_cell_dofs_ <= 8))
   {
     switch (min_num_cell_dofs_)
     {
@@ -73,41 +54,23 @@ CBCSweepChunk::CBCSweepChunk(DiscreteOrdinatesProblem& problem, LBSGroupset& gro
         break;
     }
   }
+
   group_block_size_ = ComputeGroupBlockSize(groupset_.GetNumGroups());
 }
 
 void
 CBCSweepChunk::SetAngleSet(AngleSet& angle_set)
 {
-  CALI_CXX_MARK_SCOPE("CbcSweepChunk::SetAngleSet");
-
-  fluds_ = &dynamic_cast<CBC_FLUDS&>(angle_set.GetFLUDS());
-
-  gs_size_ = groupset_.GetNumGroups();
-  gs_gi_ = groupset_.first_group;
+  CALI_CXX_MARK_SCOPE("CBCSweepChunk::SetAngleSet");
 
-  surface_source_active_ = IsSurfaceSourceActive();
-  num_angles_in_as_ = angle_set.GetNumAngles();
-  group_stride_ = angle_set.GetNumGroups();
-  group_angle_stride_ = group_stride_ * num_angles_in_as_;
+  CBCBindAngleSetContext(ctx_, groupset_, IsSurfaceSourceActive(), angle_set);
 }
 
 void
 CBCSweepChunk::SetCell(const Cell* cell_ptr, AngleSet& angle_set)
 {
-  cell_ = cell_ptr;
-  cell_local_id_ = cell_ptr->local_id;
-  cell_mapping_ = &discretization_.GetCellMapping(*cell_);
-  cell_transport_view_ = &cell_transport_views_[cell_->local_id];
-  cell_num_faces_ = cell_->faces.size();
-  cell_num_nodes_ = cell_mapping_->GetNumNodes();
-
-  // Get cell matrices
-  const auto& unit_mats = unit_cell_matrices_[cell_local_id_];
-  G_ = &unit_mats.intV_shapeI_gradshapeJ;
-  M_ = &unit_mats.intV_shapeI_shapeJ;
-  M_surf_ = &unit_mats.intS_shapeI_shapeJ;
-  IntS_shapeI_ = &unit_mats.intS_shapeI;
+  static_cast<void>(angle_set); // unused here; the cast only silences unused-parameter warnings
+  CBCBindCellContext(ctx_, discretization_, unit_cell_matrices_, cell_transport_views_, cell_ptr);
 }
 
 void
@@ -119,220 +82,26 @@ CBCSweepChunk::Sweep(AngleSet& angle_set)
 void
 CBCSweepChunk::Sweep_Generic(AngleSet& angle_set)
 {
-  const auto& m2d_op = groupset_.quadrature->GetMomentToDiscreteOperator();
-  const auto& d2m_op = groupset_.quadrature->GetDiscreteToMomentOperator();
-
-  DenseMatrix<double> Amat(max_num_cell_dofs_, max_num_cell_dofs_);
-  DenseMatrix<double> Atemp(max_num_cell_dofs_, max_num_cell_dofs_);
-  std::vector<Vector<double>> b(gs_size_, Vector<double>(max_num_cell_dofs_));
-  std::vector<double> source(max_num_cell_dofs_);
-
-  const auto& face_orientations = angle_set.GetSPDS().GetCellFaceOrientations()[cell_local_id_];
-  std::vector<double> face_mu_values(cell_num_faces_);
-
-  const auto& sigma_t = xs_.at(cell_->block_id)->GetSigmaTotal();
-
-  // as = angle set
-  // ss = subset
-  const std::vector<std::uint32_t>& as_angle_indices = angle_set.GetAngleIndices();
-
-  for (size_t as_ss_idx = 0; as_ss_idx < num_angles_in_as_; ++as_ss_idx)
-  {
-    auto direction_num = as_angle_indices[as_ss_idx];
-    auto omega = groupset_.quadrature->omegas[direction_num];
-    auto wt = groupset_.quadrature->weights[direction_num];
-
-    // Reset right-hand side
-    for (size_t gsg = 0; gsg < gs_size_; ++gsg)
-      for (size_t i = 0; i < cell_num_nodes_; ++i)
-        b[gsg](i) = 0.0;
-
-    for (size_t i = 0; i < cell_num_nodes_; ++i)
-      for (size_t j = 0; j < cell_num_nodes_; ++j)
-        Amat(i, j) = omega.Dot((*G_)(i, j));
-
-    // Update face orientations
-    for (size_t f = 0; f < cell_num_faces_; ++f)
-      face_mu_values[f] = omega.Dot(cell_->faces[f].normal);
-
-    // Surface integrals
-    for (size_t f = 0; f < cell_num_faces_; ++f)
-    {
-      if (face_orientations[f] != FaceOrientation::INCOMING)
-        continue;
-
-      const auto& face = cell_->faces[f];
-      const bool is_local_face = cell_transport_view_->IsFaceLocal(f);
-      const bool is_boundary_face = not face.has_neighbor;
-      const auto* face_nodal_mapping =
-        &fluds_->GetCommonData().GetFaceNodalMapping(cell_local_id_, f);
-
-      // IntSf_mu_psi_Mij_dA
-      const size_t num_face_nodes = cell_mapping_->GetNumFaceNodes(f);
-      for (size_t fi = 0; fi < num_face_nodes; ++fi)
-      {
-        const int i = cell_mapping_->MapFaceNode(f, fi);
-
-        for (size_t fj = 0; fj < num_face_nodes; ++fj)
-        {
-          const int j = cell_mapping_->MapFaceNode(f, fj);
-
-          const double mu_Nij = -face_mu_values[f] * (*M_surf_)[f](i, j);
-          Amat(i, j) += mu_Nij;
-
-          const double* psi = nullptr;
-
-          if (is_local_face)
-            psi = fluds_->UpwindPsi(*cell_transport_view_->FaceNeighbor(f),
-                                    face_nodal_mapping->cell_node_mapping_[fj],
-                                    as_ss_idx);
-          else if (not is_boundary_face)
-            psi = fluds_->NLUpwindPsi(
-              cell_->global_id, f, face_nodal_mapping->face_node_mapping_[fj], as_ss_idx);
-          else
-            psi = angle_set.PsiBoundary(face.neighbor_id,
-                                        direction_num,
-                                        cell_local_id_,
-                                        f,
-                                        fj,
-                                        gs_gi_,
-                                        surface_source_active_);
-
-          if (psi != nullptr)
-            for (size_t gsg = 0; gsg < gs_size_; ++gsg)
-              b[gsg](i) += psi[gsg] * mu_Nij;
-        } // for face node j
-      } // for face node i
-    } // for f
-
-    const auto dir_moment_offset =
-      static_cast<std::size_t>(direction_num) * static_cast<std::size_t>(num_moments_);
-    const double* m2d_row = m2d_op.data() + dir_moment_offset;
-    const double* d2m_row = d2m_op.data() + dir_moment_offset;
-
-    // Looping over groups, assembling mass terms
-    for (unsigned int gsg = 0; gsg < gs_size_; ++gsg)
-    {
-      double sigma_tg = sigma_t[gs_gi_ + gsg];
-
-      // Contribute source moments q = M_n^T * q_moms
-      for (size_t i = 0; i < cell_num_nodes_; ++i)
-      {
-        double temp_src = 0.0;
-        for (unsigned int m = 0; m < num_moments_; ++m)
-        {
-          const auto ir = cell_transport_view_->MapDOF(i, m, gs_gi_ + gsg);
-          temp_src += m2d_row[m] * source_moments_[ir];
-        }
-        source[i] = temp_src;
-      }
-
-      // Mass matrix and source
-      // Atemp = Amat + sigma_tgr * M
-      // b += M * q
-      for (size_t i = 0; i < cell_num_nodes_; ++i)
-      {
-        double temp = 0.0;
-        for (size_t j = 0; j < cell_num_nodes_; ++j)
-        {
-          const double Mij = (*M_)(i, j);
-          Atemp(i, j) = Amat(i, j) + Mij * sigma_tg;
-          temp += Mij * source[j];
-        }
-        b[gsg](i) += temp;
-      }
-
-      // Solve system
-      GaussElimination(Atemp, b[gsg], static_cast<int>(cell_num_nodes_));
-    } // for gsg
-
-    // Update phi
-    for (unsigned int m = 0; m < num_moments_; ++m)
-    {
-      const auto wn_d2m = d2m_row[m];
-      for (size_t i = 0; i < cell_num_nodes_; ++i)
-      {
-        const auto ir = cell_transport_view_->MapDOF(i, m, gs_gi_);
-        for (size_t gsg = 0; gsg < gs_size_; ++gsg)
-          destination_phi_[ir + gsg] += wn_d2m * b[gsg](i);
-      }
-    }
-
-    // If requested, save angular fluxes during sweep
-    if (SaveAngularFluxEnabled())
-    {
-      double* cell_psi =
-        &destination_psi_[discretization_.MapDOFLocal(*cell_, 0, groupset_.psi_uk_man_, 0, 0)];
-
-      for (size_t i = 0; i < cell_num_nodes_; ++i)
-      {
-        const size_t addr_offset =
-          i * groupset_angle_group_stride_ + direction_num * groupset_group_stride_;
-
-        for (size_t gsg = 0; gsg < gs_size_; ++gsg)
-          cell_psi[addr_offset + gsg] = b[gsg](i);
-      }
-    }
-
-    // Perform outgoing surface operations
-    for (size_t f = 0; f < cell_num_faces_; ++f)
-    {
-      if (face_orientations[f] != FaceOrientation::OUTGOING)
-        continue;
-
-      const auto& face = cell_->faces[f];
-      const bool is_local_face = cell_transport_view_->IsFaceLocal(f);
-      const bool is_boundary_face = not face.has_neighbor;
-      const bool is_reflecting_boundary_face =
-        (is_boundary_face and angle_set.GetBoundaries()[face.neighbor_id]->IsReflecting());
-      const auto& IntF_shapeI = (*IntS_shapeI_)[f];
-
-      const int locality = cell_transport_view_->FaceLocality(f);
-      const size_t num_face_nodes = cell_mapping_->GetNumFaceNodes(f);
-      const auto& face_nodal_mapping =
-        fluds_->GetCommonData().GetFaceNodalMapping(cell_local_id_, f);
-      std::vector<double>* psi_nonlocal_outgoing = nullptr;
-
-      if (not is_boundary_face and not is_local_face)
-      {
-        auto& async_comm = static_cast<CBC_AsynchronousCommunicator&>(*angle_set.GetCommunicator());
-        const size_t data_size_for_msg = num_face_nodes * group_angle_stride_;
-        psi_nonlocal_outgoing =
-          &async_comm.InitGetDownwindMessageData(locality,
-                                                 face.neighbor_id,
-                                                 face_nodal_mapping.associated_face_,
-                                                 angle_set.GetID(),
-                                                 data_size_for_msg);
-      }
-
-      for (size_t fi = 0; fi < num_face_nodes; ++fi)
-      {
-        const int i = cell_mapping_->MapFaceNode(f, fi);
-
-        // Tally outflow for particle balance
-        if (is_boundary_face)
-        {
-          for (size_t gsg = 0; gsg < gs_size_; ++gsg)
-            cell_transport_view_->AddOutflow(
-              f, gs_gi_ + gsg, wt * face_mu_values[f] * b[gsg](i) * IntF_shapeI(i));
-        }
-
-        double* psi = nullptr;
-
-        if (is_local_face)
-          psi = fluds_->OutgoingPsi(*cell_, i, as_ss_idx);
-        else if (not is_boundary_face)
-          psi = fluds_->NLOutgoingPsi(psi_nonlocal_outgoing, fi, as_ss_idx);
-        else if (is_reflecting_boundary_face)
-          psi = angle_set.PsiReflected(face.neighbor_id, direction_num, cell_local_id_, f, fi);
-
-        // Write the solved angular flux to the determined location
-        if (psi != nullptr)
-          for (size_t gsg = 0; gsg < gs_size_; ++gsg)
-            psi[gsg] = b[gsg](i);
-      } // for fi
-    } // for face
-  } // for angleset/subset
+  CALI_CXX_MARK_SCOPE("CBCSweepChunk::Sweep_Generic");
+  // Bundle the chunk members and the context bound by SetAngleSet/SetCell for the shared kernel.
+  auto data = MakeCBCSweepData(discretization_,
+                               source_moments_,
+                               groupset_,
+                               xs_,
+                               num_moments_,
+                               max_num_cell_dofs_,
+                               SaveAngularFluxEnabled(),
+                               groupset_angle_group_stride_,
+                               groupset_group_stride_,
+                               destination_phi_,
+                               destination_psi_,
+                               include_rhs_time_term_,
+                               problem_,
+                               nullptr, // psi_old: none is supplied by this chunk
+                               group_block_size_,
+                               ctx_);
+  // NOTE(review): <false> presumably selects the kernel without the psi_old terms - confirm.
+  CBC_Sweep_Generic<false>(data, angle_set);
 }
 
-} // namespace opensn
\ No newline at end of file
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h
index a07b78daf2..5d8acaa305 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h
@@ -3,9 +3,9 @@
 
 #pragma once
 
-#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h"
-#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/sweep_chunk.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/avx_sweep_chunk_utils.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/sweep_chunk.h"
 
 namespace opensn
 {
@@ -52,6 +52,11 @@ class CBCSweepChunk : public SweepChunk
    */
   void Sweep(AngleSet& angle_set) override;
 
+protected:
+  DiscreteOrdinatesProblem& problem_;
+  CBCSweepChunkContext ctx_;
+  unsigned int group_block_size_ = 0;
+
 private:
   using SweepFunc = void (CBCSweepChunk::*)(AngleSet&);
   SweepFunc sweep_impl_ = nullptr;
@@ -59,28 +64,6 @@ class CBCSweepChunk : public SweepChunk
   void Sweep_Generic(AngleSet& angle_set);
   template <unsigned int NumNodes>
   void Sweep_FixedN(AngleSet& angle_set);
-
-  unsigned int group_block_size_;
-
-  CBC_FLUDS* fluds_;
-  size_t gs_size_;
-  unsigned int gs_gi_;
-  size_t num_angles_in_as_;
-  unsigned int group_stride_; // Stride for consecutive angles
-  size_t group_angle_stride_; // Stride for consecutive spatial DOFs
-  bool surface_source_active_;
-
-  const Cell* cell_;
-  std::uint32_t cell_local_id_;
-  const CellMapping* cell_mapping_;
-  CellLBSView* cell_transport_view_;
-  size_t cell_num_faces_;
-  size_t cell_num_nodes_;
-
-  const DenseMatrix<Vector3>* G_;
-  const DenseMatrix<double>* M_;
-  const std::vector<DenseMatrix<double>>* M_surf_;
-  const std::vector<Vector<double>>* IntS_shapeI_;
 };
 
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h
new file mode 100644
index 0000000000..13a8ae1f1b
--- /dev/null
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h
@@ -0,0 +1,126 @@
+// SPDX-FileCopyrightText: 2026 The OpenSn Authors <https://open-sn.github.io/opensn/>
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "framework/math/spatial_discretization/spatial_discretization.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h"
+
+namespace opensn
+{
+/// State bound per angle set and per cell for the CBC sweep chunks; pointers are never freed here.
+struct CBCSweepChunkContext
+{
+  CBC_FLUDS* fluds = nullptr; // FLUDS of the bound angle set (CBCBindAngleSetContext)
+  // Angle-set quantities, all written by CBCBindAngleSetContext.
+  size_t gs_size = 0;                 // groupset.GetNumGroups()
+  unsigned int gs_gi = 0;             // groupset.first_group
+  size_t num_angles_in_as = 0;        // angle_set.GetNumAngles()
+  unsigned int group_stride = 0;      // stride for consecutive angles
+  size_t group_angle_stride = 0;      // stride for consecutive spatial DOFs
+  bool surface_source_active = false; // value passed in by the caller
+  // Bound cell, written by CBCBindCellContext.
+  const Cell* cell = nullptr;
+  std::uint32_t cell_local_id = 0;            // cell->local_id
+  const CellMapping* cell_mapping = nullptr;  // discretization.GetCellMapping(*cell)
+  CellLBSView* cell_transport_view = nullptr; // cell_transport_views[cell->local_id]
+  size_t cell_num_faces = 0;                  // cell->faces.size()
+  size_t cell_num_nodes = 0;                  // cell_mapping->GetNumNodes()
+  // Entries of the bound cell's UnitCellMatrices, written by CBCBindCellContext.
+  const DenseMatrix<Vector3>* G = nullptr;                  // intV_shapeI_gradshapeJ
+  const DenseMatrix<double>* M = nullptr;                   // intV_shapeI_shapeJ
+  const std::vector<DenseMatrix<double>>* M_surf = nullptr; // intS_shapeI_shapeJ
+  const std::vector<Vector<double>>* IntS_shapeI = nullptr; // intS_shapeI
+};
+/// Stores the FLUDS, first group, group count and angle/group strides of \p angle_set in \p ctx.
+inline void
+CBCBindAngleSetContext(CBCSweepChunkContext& ctx,
+                       const LBSGroupset& groupset,
+                       bool surface_source_active,
+                       AngleSet& angle_set)
+{
+  ctx.fluds = &dynamic_cast<CBC_FLUDS&>(angle_set.GetFLUDS()); // std::bad_cast if not a CBC_FLUDS
+  ctx.surface_source_active = surface_source_active;
+  ctx.gs_gi = groupset.first_group;
+  ctx.gs_size = groupset.GetNumGroups();
+  ctx.group_stride = angle_set.GetNumGroups();
+  ctx.num_angles_in_as = angle_set.GetNumAngles();
+  ctx.group_angle_stride = ctx.num_angles_in_as * ctx.group_stride;
+}
+/// Points \p ctx at \p cell_ptr plus that cell's mapping, transport view and unit cell matrices.
+inline void
+CBCBindCellContext(CBCSweepChunkContext& ctx,
+                   const SpatialDiscretization& discretization,
+                   const std::vector<UnitCellMatrices>& unit_cell_matrices,
+                   std::vector<CellLBSView>& cell_transport_views,
+                   const Cell* cell_ptr)
+{
+  const Cell& cell = *cell_ptr; // precondition: cell_ptr is non-null (dereferenced unchecked)
+  ctx.cell = cell_ptr;
+  ctx.cell_local_id = cell.local_id;
+  ctx.cell_mapping = &discretization.GetCellMapping(cell);
+  ctx.cell_transport_view = &cell_transport_views[cell.local_id];
+  ctx.cell_num_faces = cell.faces.size();
+  ctx.cell_num_nodes = ctx.cell_mapping->GetNumNodes();
+  const UnitCellMatrices& mats = unit_cell_matrices[ctx.cell_local_id];
+  ctx.G = &mats.intV_shapeI_gradshapeJ;
+  ctx.M = &mats.intV_shapeI_shapeJ;
+  ctx.M_surf = &mats.intS_shapeI_shapeJ;
+  ctx.IntS_shapeI = &mats.intS_shapeI;
+}
+/// Bundles the chunk's arguments and the bound \p ctx into the CBCSweepData handed to the kernels.
+inline CBCSweepData
+MakeCBCSweepData(const SpatialDiscretization& discretization,
+                 const std::vector<double>& source_moments,
+                 const LBSGroupset& groupset,
+                 const BlockID2XSMap& xs,
+                 unsigned int num_moments,
+                 unsigned int max_num_cell_dofs,
+                 bool save_angular_flux,
+                 size_t groupset_angle_group_stride,
+                 size_t groupset_group_stride,
+                 std::vector<double>& destination_phi,
+                 std::vector<double>& destination_psi,
+                 bool include_rhs_time_term,
+                 DiscreteOrdinatesProblem& problem,
+                 const std::vector<double>* psi_old, // may be null; CBCSweepChunk passes nullptr
+                 unsigned int group_block_size,
+                 const CBCSweepChunkContext& ctx)
+{
+  return CBCSweepData{discretization, // positional; keep in sync with CBCSweepData's definition
+                      source_moments,
+                      groupset,
+                      xs,
+                      num_moments,
+                      max_num_cell_dofs,
+                      save_angular_flux,
+                      groupset_angle_group_stride,
+                      groupset_group_stride,
+                      destination_phi,
+                      destination_psi,
+                      ctx.surface_source_active,
+                      include_rhs_time_term,
+                      problem,
+                      psi_old,
+                      group_block_size,
+                      *ctx.fluds, // null unless CBCBindAngleSetContext has run
+                      *ctx.cell,  // this and the ctx pointers below: set by CBCBindCellContext
+                      ctx.cell_local_id,
+                      *ctx.cell_mapping,
+                      *ctx.cell_transport_view,
+                      ctx.cell_num_faces,
+                      ctx.cell_num_nodes,
+                      ctx.gs_size,
+                      ctx.gs_gi,
+                      ctx.num_angles_in_as,
+                      ctx.group_stride,
+                      ctx.group_angle_stride,
+                      *ctx.G,
+                      *ctx.M,
+                      *ctx.M_surf,
+                      *ctx.IntS_shapeI};
+}
+
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.cc
new file mode 100644
index 0000000000..0b261ceb48
--- /dev/null
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.cc
@@ -0,0 +1,148 @@
+// SPDX-FileCopyrightText: 2026 The OpenSn Authors <https://open-sn.github.io/opensn/>
+// SPDX-License-Identifier: MIT
+
+#include "framework/utils/error.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h"
+#include "caliper/cali.h"
+#include <stdexcept>
+
+namespace opensn
+{
+
+// Builds the time-dependent CBC sweep chunk for one groupset.
+//
+// The SweepChunk base receives the problem's new flux vectors, grid, discretization, unit cell
+// matrices, cell transport views, source moments, cross-section map, moment count and DOF
+// bounds; psi_old_ is bound to the problem's old angular flux vector for this groupset.
+//
+// Throws std::runtime_error when the problem is configured to use GPUs.
+CBCSweepChunkTD::CBCSweepChunkTD(DiscreteOrdinatesProblem& problem, LBSGroupset& groupset)
+  : SweepChunk(problem.GetPhiNewLocal(),
+               problem.GetPsiNewLocal()[groupset.id],
+               problem.GetGrid(),
+               problem.GetSpatialDiscretization(),
+               problem.GetUnitCellMatrices(),
+               problem.GetCellTransportViews(),
+               problem.GetQMomentsLocal(),
+               groupset,
+               problem.GetBlockID2XSMap(),
+               problem.GetNumMoments(),
+               problem.GetMaxCellDOFCount(),
+               problem.GetMinCellDOFCount()),
+    problem_(problem),
+    psi_old_(problem.GetPsiOldLocal()[groupset.id]),
+    sweep_impl_td_(&CBCSweepChunkTD::Sweep_Generic)
+{
+  // GPU execution is rejected up front, before any kernel selection.
+  if (problem.UseGPUs())
+    throw std::runtime_error("Time-dependent calculations do not yet support GPUs.\n");
+
+  // Fixed-size kernels, indexed by (DOF count - smallest_fixed_n). These are the same
+  // Sweep_FixedN specializations that are explicitly instantiated at the end of this file.
+  constexpr int smallest_fixed_n = 2;
+  constexpr int largest_fixed_n = 8;
+  static constexpr SweepFunc fixed_n_kernels[] = {
+    &CBCSweepChunkTD::Sweep_FixedN<2>,
+    &CBCSweepChunkTD::Sweep_FixedN<3>,
+    &CBCSweepChunkTD::Sweep_FixedN<4>,
+    &CBCSweepChunkTD::Sweep_FixedN<5>,
+    &CBCSweepChunkTD::Sweep_FixedN<6>,
+    &CBCSweepChunkTD::Sweep_FixedN<7>,
+    &CBCSweepChunkTD::Sweep_FixedN<8>,
+  };
+
+  // The member initializer already selected Sweep_Generic. It is replaced only when the min
+  // and max cell DOF counts agree and that count has an entry in the table above.
+  const bool uniform_node_count = (min_num_cell_dofs_ == max_num_cell_dofs_);
+  const bool has_fixed_kernel =
+    (min_num_cell_dofs_ >= smallest_fixed_n) and (min_num_cell_dofs_ <= largest_fixed_n);
+  if (uniform_node_count and has_fixed_kernel)
+    sweep_impl_td_ = fixed_n_kernels[min_num_cell_dofs_ - smallest_fixed_n];
+
+  // The group block size is computed from the number of groups in this groupset.
+  group_block_size_ = ComputeGroupBlockSize(groupset_.GetNumGroups());
+}
+
+void
+CBCSweepChunkTD::SetAngleSet(AngleSet& angle_set)
+{
+  CALI_CXX_MARK_SCOPE("CBCSweepChunkTD::SetAngleSet");
+  // ctx_ is handed to the binder so later Sweep() calls see this angle set (binder not shown).
+  CBCBindAngleSetContext(ctx_, groupset_, IsSurfaceSourceActive(), angle_set);
+}
+
+void
+CBCSweepChunkTD::SetCell(const Cell* cell_ptr, AngleSet& angle_set)
+{
+  static_cast<void>(angle_set); // angle_set is intentionally unused in this override
+  CBCBindCellContext(ctx_, discretization_, unit_cell_matrices_, cell_transport_views_, cell_ptr);
+}
+
+void
+CBCSweepChunkTD::Sweep(AngleSet& angle_set)
+{
+  (this->*sweep_impl_td_)(angle_set); // call the kernel selected in the constructor
+}
+
+void
+CBCSweepChunkTD::Sweep_Generic(AngleSet& angle_set)
+{
+  CALI_CXX_MARK_SCOPE("CBCSweepChunkTD::Sweep_Generic");
+  // Bundle the chunk members and ctx_ into the CBCSweepData consumed by the kernel.
+  auto data = MakeCBCSweepData(discretization_,
+                               source_moments_,
+                               groupset_,
+                               xs_,
+                               num_moments_,
+                               max_num_cell_dofs_,
+                               SaveAngularFluxEnabled(),
+                               groupset_angle_group_stride_,
+                               groupset_group_stride_,
+                               destination_phi_,
+                               destination_psi_,
+                               include_rhs_time_term_,
+                               problem_,
+                               &psi_old_,
+                               group_block_size_,
+                               ctx_);
+  // <true> selects the time-dependent code paths of the generic kernel.
+  CBC_Sweep_Generic<true>(data, angle_set);
+}
+
+template <unsigned int NumNodes>
+void
+CBCSweepChunkTD::Sweep_FixedN(AngleSet& angle_set)
+{
+  CALI_CXX_MARK_SCOPE("CBCSweepChunkTD::Sweep_FixedN");
+  // Same argument bundle as Sweep_Generic; only the kernel invoked at the end differs.
+  auto data = MakeCBCSweepData(discretization_,
+                               source_moments_,
+                               groupset_,
+                               xs_,
+                               num_moments_,
+                               max_num_cell_dofs_,
+                               SaveAngularFluxEnabled(),
+                               groupset_angle_group_stride_,
+                               groupset_group_stride_,
+                               destination_phi_,
+                               destination_psi_,
+                               include_rhs_time_term_,
+                               problem_,
+                               &psi_old_,
+                               group_block_size_,
+                               ctx_);
+  // <NumNodes, true>: compile-time node count, time-dependent code paths.
+  CBC_Sweep_FixedN<NumNodes, true>(data, angle_set);
+}
+// Explicit instantiations for the node counts the constructor can dispatch to (2 through 8).
+template void CBCSweepChunkTD::Sweep_FixedN<2>(AngleSet&);
+template void CBCSweepChunkTD::Sweep_FixedN<3>(AngleSet&);
+template void CBCSweepChunkTD::Sweep_FixedN<4>(AngleSet&);
+template void CBCSweepChunkTD::Sweep_FixedN<5>(AngleSet&);
+template void CBCSweepChunkTD::Sweep_FixedN<6>(AngleSet&);
+template void CBCSweepChunkTD::Sweep_FixedN<7>(AngleSet&);
+template void CBCSweepChunkTD::Sweep_FixedN<8>(AngleSet&);
+
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h
new file mode 100644
index 0000000000..5e99bb83ef
--- /dev/null
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h
@@ -0,0 +1,39 @@
+// SPDX-FileCopyrightText: 2026 The OpenSn Authors <https://open-sn.github.io/opensn/>
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/avx_sweep_chunk_utils.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/sweep_chunk.h"
+
+namespace opensn
+{
+// Time-dependent CBC sweep chunk: a SweepChunk whose IsTimeDependent() returns true.
+class CBCSweepChunkTD : public SweepChunk
+{
+public:
+  CBCSweepChunkTD(DiscreteOrdinatesProblem& problem, LBSGroupset& groupset);
+  ~CBCSweepChunkTD() override = default;
+  // SweepChunk overrides.
+  void SetAngleSet(AngleSet& angle_set) override;
+  void SetCell(const Cell* cell_ptr, AngleSet& angle_set) override;
+  void Sweep(AngleSet& angle_set) override;
+  bool IsTimeDependent() const override { return true; }
+
+protected:
+  using SweepFunc = void (CBCSweepChunkTD::*)(AngleSet&); // pointer to a sweep kernel member
+  void Sweep_Generic(AngleSet& angle_set);
+  template <unsigned int NumNodes>
+  void Sweep_FixedN(AngleSet& angle_set);
+
+  DiscreteOrdinatesProblem& problem_;
+  const std::vector<double>& psi_old_; // reference member: the vector must outlive this chunk
+  unsigned int group_block_size_ = 0;
+  CBCSweepChunkContext ctx_;
+
+private:
+  SweepFunc sweep_impl_td_ = nullptr; // kernel invoked by Sweep()
+};
+
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h
new file mode 100644
index 0000000000..7413c44ab7
--- /dev/null
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h
@@ -0,0 +1,308 @@
+// SPDX-FileCopyrightText: 2026 The OpenSn Authors <https://open-sn.github.io/opensn/>
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "framework/data_types/dense_matrix.h"
+#include "framework/data_types/vector.h"
+#include "framework/mesh/cell/cell.h"
+#include "framework/math/spatial_discretization/spatial_discretization.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h"
+#include <algorithm>
+
+namespace opensn
+{
+// Argument bundle (mostly references) read by the CBC sweep kernels declared in this header.
+struct CBCSweepData
+{
+  const SpatialDiscretization& discretization;
+  const std::vector<double>& source_moments;
+  const LBSGroupset& groupset;
+  const BlockID2XSMap& xs;
+  unsigned int num_moments;
+  unsigned int max_num_cell_dofs; // sizes the generic kernel's scratch matrices and vectors
+  bool save_angular_flux; // when true, the generic kernel writes destination_psi
+  size_t groupset_angle_group_stride;
+  size_t groupset_group_stride;
+  std::vector<double>& destination_phi; // the generic kernel accumulates into this with +=
+  std::vector<double>& destination_psi;
+  bool surface_source_active;
+  bool include_rhs_time_term; // gates the tau * psi_old source contribution
+  DiscreteOrdinatesProblem& problem;
+  const std::vector<double>* psi_old; // may be null; the generic kernel checks before use
+  unsigned int group_block_size;
+  // Per-cell bindings.
+  CBC_FLUDS& fluds;
+  const Cell& cell;
+  std::uint32_t cell_local_id;
+  const CellMapping& cell_mapping;
+  CellLBSView& cell_transport_view;
+  size_t cell_num_faces;
+  size_t cell_num_nodes;
+  // Groupset / angle-set sizes and strides.
+  size_t gs_size;
+  unsigned int gs_gi;
+  size_t num_angles_in_as;
+  unsigned int group_stride;
+  size_t group_angle_stride;
+  // Cell matrices: G is dotted with omega, M is scaled by sigma_t, the rest are per face.
+  const DenseMatrix<Vector3>& G;
+  const DenseMatrix<double>& M;
+  const std::vector<DenseMatrix<double>>& M_surf;
+  const std::vector<Vector<double>>& IntS_shapeI;
+};
+// Generic-size CBC sweep of the cell bound in data over all angles of angle_set.
+template <bool time_dependent>
+inline void
+CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set)
+{
+  const auto& groupset = data.groupset;
+  const auto& m2d_op = groupset.quadrature->GetMomentToDiscreteOperator();
+  const auto& d2m_op = groupset.quadrature->GetDiscreteToMomentOperator();
+  // Amat/Atemp/b/source are sized by max_num_cell_dofs; loops below index up to cell_num_nodes.
+  DenseMatrix<double> Amat(data.max_num_cell_dofs, data.max_num_cell_dofs);
+  DenseMatrix<double> Atemp(data.max_num_cell_dofs, data.max_num_cell_dofs);
+  std::vector<Vector<double>> b(data.gs_size, Vector<double>(data.max_num_cell_dofs));
+  std::vector<double> source(data.max_num_cell_dofs);
+  std::vector<double> face_mu_values(data.cell_num_faces);
+
+  const auto& face_orientations = angle_set.GetSPDS().GetCellFaceOrientations()[data.cell_local_id];
+  const auto& sigma_t = data.xs.at(data.cell.block_id)->GetSigmaTotal();
+  // Time-dependent only: tau_gsg[g] = inverse velocity / (theta * dt) for each groupset group.
+  std::vector<double> tau_gsg;
+  if constexpr (time_dependent)
+  {
+    const auto& inv_velg = data.xs.at(data.cell.block_id)->GetInverseVelocity();
+    const double theta = data.problem.GetTheta();
+    const double inv_theta = 1.0 / theta;
+    const double dt = data.problem.GetTimeStep();
+    const double inv_dt = 1.0 / dt;
+
+    tau_gsg.assign(data.gs_size, 0.0);
+    for (size_t gsg = 0; gsg < data.gs_size; ++gsg)
+      tau_gsg[gsg] = inv_velg[data.gs_gi + gsg] * inv_theta * inv_dt;
+  }
+  // Points at this cell's first psi_old entry; null unless time-dependent and psi_old is set.
+  const double* psi_old =
+    (time_dependent and data.psi_old)
+      ? &(*data.psi_old)[data.discretization.MapDOFLocal(data.cell, 0, groupset.psi_uk_man_, 0, 0)]
+      : nullptr;
+
+  const auto& as_angle_indices = angle_set.GetAngleIndices();
+
+  for (size_t as_ss_idx = 0; as_ss_idx < data.num_angles_in_as; ++as_ss_idx)
+  {
+    const auto direction_num = as_angle_indices[as_ss_idx];
+    const auto omega = groupset.quadrature->omegas[direction_num];
+    const auto wt = groupset.quadrature->weights[direction_num];
+    // Reset the right-hand sides for this angle.
+    for (size_t gsg = 0; gsg < data.gs_size; ++gsg)
+      for (size_t i = 0; i < data.cell_num_nodes; ++i)
+        b[gsg](i) = 0.0;
+    // Group-independent part of the matrix: Amat(i, j) = omega . G(i, j).
+    for (size_t i = 0; i < data.cell_num_nodes; ++i)
+      for (size_t j = 0; j < data.cell_num_nodes; ++j)
+        Amat(i, j) = omega.Dot(data.G(i, j));
+    // mu = omega . normal for every face of the cell.
+    for (size_t f = 0; f < data.cell_num_faces; ++f)
+      face_mu_values[f] = omega.Dot(data.cell.faces[f].normal);
+    // Incoming faces: add the surface term to Amat and the upwind flux to the right-hand sides.
+    for (size_t f = 0; f < data.cell_num_faces; ++f)
+    {
+      if (face_orientations[f] != FaceOrientation::INCOMING)
+        continue;
+
+      const auto& face = data.cell.faces[f];
+      const bool is_local_face = data.cell_transport_view.IsFaceLocal(f);
+      const bool is_boundary_face = not face.has_neighbor;
+      const auto* face_nodal_mapping =
+        &data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f);
+
+      const size_t num_face_nodes = data.cell_mapping.GetNumFaceNodes(f);
+      for (size_t fi = 0; fi < num_face_nodes; ++fi)
+      {
+        const int i = data.cell_mapping.MapFaceNode(f, fi);
+
+        for (size_t fj = 0; fj < num_face_nodes; ++fj)
+        {
+          const int j = data.cell_mapping.MapFaceNode(f, fj);
+          const double mu_Nij = -face_mu_values[f] * data.M_surf[f](i, j);
+          Amat(i, j) += mu_Nij;
+
+          const double* psi = nullptr;
+          // Upwind flux source: local-face neighbor, non-local neighbor, or boundary.
+          if (is_local_face)
+            psi = data.fluds.UpwindPsi(*data.cell_transport_view.FaceNeighbor(f),
+                                       face_nodal_mapping->cell_node_mapping_[fj],
+                                       as_ss_idx);
+          else if (not is_boundary_face)
+            psi = data.fluds.NLUpwindPsi(
+              data.cell.global_id, f, face_nodal_mapping->face_node_mapping_[fj], as_ss_idx);
+          else
+            psi = angle_set.PsiBoundary(face.neighbor_id,
+                                        direction_num,
+                                        data.cell_local_id,
+                                        f,
+                                        fj,
+                                        data.gs_gi,
+                                        data.surface_source_active);
+
+          if (psi != nullptr)
+            for (size_t gsg = 0; gsg < data.gs_size; ++gsg)
+              b[gsg](i) += psi[gsg] * mu_Nij;
+        }
+      }
+    }
+    // m2d_row / d2m_row point at offset direction_num * num_moments in each operator's data.
+    const auto dir_moment_offset =
+      static_cast<std::size_t>(direction_num) * static_cast<std::size_t>(data.num_moments);
+    const double* m2d_row = m2d_op.data() + dir_moment_offset;
+    const double* d2m_row = d2m_op.data() + dir_moment_offset;
+    // Per group: build the nodal source, assemble Amat + sigma_tg * M, and solve into b[gsg].
+    for (unsigned int gsg = 0; gsg < data.gs_size; ++gsg)
+    {
+      double sigma_tg = sigma_t[data.gs_gi + gsg];
+      if constexpr (time_dependent)
+        sigma_tg += tau_gsg[gsg];
+
+      for (size_t i = 0; i < data.cell_num_nodes; ++i)
+      {
+        double temp_src = 0.0;
+        for (unsigned int m = 0; m < data.num_moments; ++m)
+        {
+          const auto ir = data.cell_transport_view.MapDOF(i, m, data.gs_gi + gsg);
+          temp_src += m2d_row[m] * data.source_moments[ir];
+        }
+
+        if constexpr (time_dependent)
+        {
+          const size_t imap =
+            i * data.groupset_angle_group_stride + direction_num * data.groupset_group_stride;
+          if (data.include_rhs_time_term and psi_old)
+            temp_src += tau_gsg[gsg] * psi_old[imap + gsg];
+        }
+
+        source[i] = temp_src;
+      }
+
+      for (size_t i = 0; i < data.cell_num_nodes; ++i)
+      {
+        double temp = 0.0;
+        for (size_t j = 0; j < data.cell_num_nodes; ++j)
+        {
+          const double Mij = data.M(i, j);
+          Atemp(i, j) = Amat(i, j) + Mij * sigma_tg;
+          temp += Mij * source[j];
+        }
+        b[gsg](i) += temp;
+      }
+
+      GaussElimination(Atemp, b[gsg], static_cast<int>(data.cell_num_nodes));
+    }
+    // Accumulate the solved psi into destination_phi, weighted by the d2m entries.
+    for (unsigned int m = 0; m < data.num_moments; ++m)
+    {
+      const auto wn_d2m = d2m_row[m];
+      for (size_t i = 0; i < data.cell_num_nodes; ++i)
+      {
+        const auto ir = data.cell_transport_view.MapDOF(i, m, data.gs_gi);
+        for (size_t gsg = 0; gsg < data.gs_size; ++gsg)
+          data.destination_phi[ir + gsg] += wn_d2m * b[gsg](i);
+      }
+    }
+    // When requested, store the angular flux for this direction in destination_psi.
+    if (data.save_angular_flux)
+    {
+      double* psi_new = &data.destination_psi[data.discretization.MapDOFLocal(
+        data.cell, 0, groupset.psi_uk_man_, 0, 0)];
+
+      double theta = 1.0;
+      double inv_theta = 1.0;
+      if constexpr (time_dependent)
+      {
+        theta = data.problem.GetTheta();
+        inv_theta = 1.0 / theta;
+      }
+      // Time-dependent: psi_new = (psi_sol + (theta - 1) * psi_old) / theta; else psi_sol.
+      for (size_t i = 0; i < data.cell_num_nodes; ++i)
+      {
+        const size_t imap =
+          i * data.groupset_angle_group_stride + direction_num * data.groupset_group_stride;
+
+        for (size_t gsg = 0; gsg < data.gs_size; ++gsg)
+        {
+          const double psi_sol = b[gsg](i);
+          if constexpr (time_dependent)
+          {
+            const double psi_old_val = psi_old ? psi_old[imap + gsg] : 0.0;
+            psi_new[imap + gsg] = inv_theta * (psi_sol + (theta - 1.0) * psi_old_val);
+          }
+          else
+            psi_new[imap + gsg] = psi_sol;
+        }
+      }
+    }
+    // Outgoing faces: tally boundary outflow and copy psi to the face's downstream storage.
+    for (size_t f = 0; f < data.cell_num_faces; ++f)
+    {
+      if (face_orientations[f] != FaceOrientation::OUTGOING)
+        continue;
+
+      const auto& face = data.cell.faces[f];
+      const bool is_local_face = data.cell_transport_view.IsFaceLocal(f);
+      const bool is_boundary_face = not face.has_neighbor;
+      const bool is_reflecting_boundary_face =
+        (is_boundary_face and angle_set.GetBoundaries()[face.neighbor_id]->IsReflecting());
+      const auto& IntF_shapeI = data.IntS_shapeI[f];
+
+      const int locality = data.cell_transport_view.FaceLocality(f);
+      const size_t num_face_nodes = data.cell_mapping.GetNumFaceNodes(f);
+      const auto& face_nodal_mapping =
+        data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f);
+      std::vector<double>* psi_nonlocal_outgoing = nullptr;
+      // NOTE(review): the dynamic_cast result below is dereferenced without a null check.
+      if (not is_boundary_face and not is_local_face)
+      {
+        auto* async_comm = dynamic_cast<CBC_AsynchronousCommunicator*>(angle_set.GetCommunicator());
+        const size_t data_size_for_msg = num_face_nodes * data.group_angle_stride;
+        psi_nonlocal_outgoing =
+          &async_comm->InitGetDownwindMessageData(locality,
+                                                  face.neighbor_id,
+                                                  face_nodal_mapping.associated_face_,
+                                                  angle_set.GetID(),
+                                                  data_size_for_msg);
+      }
+
+      for (size_t fi = 0; fi < num_face_nodes; ++fi)
+      {
+        const int i = data.cell_mapping.MapFaceNode(f, fi);
+
+        if (is_boundary_face)
+        {
+          for (size_t gsg = 0; gsg < data.gs_size; ++gsg)
+            data.cell_transport_view.AddOutflow(
+              f, data.gs_gi + gsg, wt * face_mu_values[f] * b[gsg](i) * IntF_shapeI(i));
+        }
+
+        double* psi = nullptr;
+        if (is_local_face)
+          psi = data.fluds.OutgoingPsi(data.cell, i, as_ss_idx);
+        else if (not is_boundary_face)
+          psi = data.fluds.NLOutgoingPsi(psi_nonlocal_outgoing, fi, as_ss_idx);
+        else if (is_reflecting_boundary_face)
+          psi = angle_set.PsiReflected(face.neighbor_id, direction_num, data.cell_local_id, f, fi);
+
+        if (psi != nullptr)
+          for (size_t gsg = 0; gsg < data.gs_size; ++gsg)
+            psi[gsg] = b[gsg](i);
+      }
+    }
+  }
+}
+// Kernel templated on the node count; declared only (no definition in this header).
+template <unsigned int NumNodes, bool time_dependent>
+void CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set);
+
+} // namespace opensn

From 26def95e381c7d237ed4f33a5b71a4158b57fe90 Mon Sep 17 00:00:00 2001
From: Eappen Nelluvelil <eappen@tamu.edu>
Date: Tue, 31 Mar 2026 23:31:38 -0500
Subject: [PATCH 3/6] Transport transient tests for CBC

---
 .../transport_transient/tests.json            | 568 +++++++++++++++++-
 .../transient_init_leakage_pulse_decay_cbc.py | 102 ++++
 .../transient_init_precursor_decay_cbc.py     | 131 ++++
 .../transient_init_steady_state_source_cbc.py | 100 +++
 ...ransient_init_time_dependent_source_cbc.py | 102 ++++
 ...transient_init_zero_absorber_source_cbc.py |  95 +++
 ...eigen_1d_delayed_fission_prod_count_cbc.py |  88 +++
 ...sient_keigen_1d_delayed_prke_vs_stk_cbc.py | 171 ++++++
 .../transient_keigen_1d_prompt_step_cbc.py    | 106 ++++
 ...t_keigen_1d_theta_precursor_scaling_cbc.py | 102 ++++
 ...transient_keigen_2d_2g_delayed_step_cbc.py | 110 ++++
 ...gen_2d_2g_prompt_combine_velocities_cbc.py | 109 ++++
 ...nt_keigen_2d_delayed_prke_vs_stk_2p_cbc.py | 209 +++++++
 .../transient_keigen_2d_prompt_ramp_xs_cbc.py | 106 ++++
 ...ansient_keigen_3d_2g_prompt_step_xs_cbc.py | 107 ++++
 ...eigen_3d_6g_delayed_step_nu_sigma_f_cbc.py | 107 ++++
 ...ransient_keigen_3d_delayed_analytic_cbc.py | 157 +++++
 ...3d_delayed_prke_vs_stk_2p_callbacks_cbc.py | 228 +++++++
 ...nt_keigen_3d_delayed_prke_vs_stk_2p_cbc.py | 215 +++++++
 ...transient_keigen_3d_delayed_ramp_xs_cbc.py | 205 +++++++
 ...gen_3d_delayed_stiff_dt_sensitivity_cbc.py | 134 +++++
 ...transient_keigen_3d_prompt_analytic_cbc.py | 158 +++++
 ...ansient_keigen_3d_prompt_bc_leakage_cbc.py | 114 ++++
 ...ient_keigen_3d_prompt_mid_step_swap_cbc.py | 112 ++++
 .../transient_zero_3d_1g_pulse_inf_med_cbc.py | 124 ++++
 ...ent_zero_3d_1g_ramp_source_analytic_cbc.py | 136 +++++
 .../transient_zero_3d_1g_v0.5_inf_med_cbc.py  | 102 ++++
 .../transient_zero_3d_1g_v1_inf_med_cbc.py    | 107 ++++
 ...ransient_zero_3d_1g_v1_inf_med_swap_cbc.py | 131 ++++
 .../transient_zero_3d_1g_v2_inf_med_cbc.py    |  96 +++
 ...ient_zero_3d_2g_inf_med_downscatter_cbc.py | 132 ++++
 ...zero_3d_2g_inf_med_downscatter_swap_cbc.py | 148 +++++
 ...transient_zero_3d_2g_inf_med_pydrvr_cbc.py | 163 +++++
 ...t_zero_3d_2g_inf_med_pydrvr_ramp_dt_cbc.py | 169 ++++++
 .../transient_zero_3d_openmc_xs_cbc.py        | 107 ++++
 35 files changed, 5033 insertions(+), 18 deletions(-)
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_leakage_pulse_decay_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_precursor_decay_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_steady_state_source_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_time_dependent_source_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_zero_absorber_source_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_delayed_fission_prod_count_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_delayed_prke_vs_stk_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_prompt_step_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_theta_precursor_scaling_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_2g_delayed_step_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_2g_prompt_combine_velocities_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_delayed_prke_vs_stk_2p_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_prompt_ramp_xs_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_2g_prompt_step_xs_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_6g_delayed_step_nu_sigma_f_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_analytic_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_prke_vs_stk_2p_callbacks_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_prke_vs_stk_2p_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_ramp_xs_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_stiff_dt_sensitivity_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_analytic_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_bc_leakage_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_mid_step_swap_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_pulse_inf_med_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_ramp_source_analytic_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v0.5_inf_med_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v1_inf_med_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v1_inf_med_swap_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v2_inf_med_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_downscatter_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_downscatter_swap_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_pydrvr_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_pydrvr_ramp_dt_cbc.py
 create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_openmc_xs_cbc.py

diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/tests.json b/test/python/modules/linear_boltzmann_solvers/transport_transient/tests.json
index 92a5ff5ea8..9514ef6b63 100644
--- a/test/python/modules/linear_boltzmann_solvers/transport_transient/tests.json
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/tests.json
@@ -106,7 +106,7 @@
         "type": "KeyValuePair",
         "key": "Max phi(3s) = ",
         "goldvalue": 5.104477,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
       }
     ]
   },
@@ -119,7 +119,7 @@
         "type": "KeyValuePair",
         "key": "Max phi(1s) = ",
         "goldvalue": 2.330956,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
       }
     ]
   },
@@ -132,13 +132,13 @@
         "type": "KeyValuePair",
         "key": "Max phi0(2s) = ",
         "goldvalue": 3.658193,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
       },
       {
         "type": "KeyValuePair",
         "key": "Max phi1(2s) = ",
         "goldvalue": 1.027354,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
       }
     ]
   },
@@ -151,7 +151,7 @@
         "type": "KeyValuePair",
         "key": "Max phi(2s) = ",
         "goldvalue": 0.889725,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
       }
     ]
   },
@@ -164,7 +164,7 @@
         "type": "KeyValuePair",
         "key": "Max phi(1s) = ",
         "goldvalue": 1.736077,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
       }
     ]
   },
@@ -177,7 +177,7 @@
         "type": "KeyValuePair",
         "key": "Max phi(1s) = ",
         "goldvalue": 3.184793,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
       }
     ]
   },
@@ -190,13 +190,13 @@
         "type": "KeyValuePair",
         "key": "Max phi0(1s) = ",
         "goldvalue": 1.978816,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
       },
       {
         "type": "KeyValuePair",
         "key": "Max phi1(1s) = ",
         "goldvalue": 0.394171,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
       }
     ]
   },
@@ -209,13 +209,32 @@
         "type": "KeyValuePair",
         "key": "Max phi0 = ",
         "goldvalue": 3.672537,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
       },
       {
         "type": "KeyValuePair",
         "key": "Max phi1 = ",
         "goldvalue": 1.035548,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
+      }
+    ]
+  },
+  {
+    "file": "transient_zero_3d_2g_inf_med_pydrvr_cbc.py",
+    "comment": "3D unstructured mesh, 2 group, downscatter, zero-init transient, Python time step loop (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "KeyValuePair",
+        "key": "Max phi0 = ",
+        "goldvalue": 3.672537,
+        "abs_tol": 5e-06
+      },
+      {
+        "type": "KeyValuePair",
+        "key": "Max phi1 = ",
+        "goldvalue": 1.035548,
+        "abs_tol": 1e-06
       }
     ]
   },
@@ -228,13 +247,13 @@
         "type": "KeyValuePair",
         "key": "Max phi0 = ",
         "goldvalue": 3.674989,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
       },
       {
         "type": "KeyValuePair",
         "key": "Max phi1 = ",
         "goldvalue": 1.035541,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
       }
     ]
   },
@@ -251,6 +270,19 @@
       }
     ]
   },
+  {
+    "file": "transient_zero_3d_openmc_xs_cbc.py",
+    "comment": "3D orthogonal mesh, 30 group, zero-init transient, OpenMC cross sections (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "KeyValuePair",
+        "key": "Max phi(0.1s) = ",
+        "goldvalue": 51.057722,
+        "abs_tol": 0.0001
+      }
+    ]
+  },
   {
     "file": "transient_keigen_3d_prompt_analytic.py",
     "comment": "3D prompt-only analytic exponential check",
@@ -387,7 +419,7 @@
         "key": "FP_RATIO_ACTUAL",
         "wordnum": 1,
         "gold": 1.2,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
       },
       {
         "type": "FloatCompare",
@@ -408,7 +440,7 @@
         "key": "FP_RATIO_ACTUAL",
         "wordnum": 1,
         "gold": 1.2,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
       },
       {
         "type": "FloatCompare",
@@ -429,7 +461,7 @@
         "key": "FP_RATIO_ACTUAL",
         "wordnum": 1,
         "gold": 2.2,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
       },
       {
         "type": "FloatCompare",
@@ -492,7 +524,7 @@
         "key": "FP_RATIO_ACTUAL",
         "wordnum": 1,
         "gold": 1.2,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
       },
       {
         "type": "FloatCompare",
@@ -513,7 +545,7 @@
         "key": "FP_RATIO_ACTUAL",
         "wordnum": 1,
         "gold": 1.2,
-        "abs_tol": 0.000001
+        "abs_tol": 1e-06
       }
     ]
   },
@@ -572,5 +604,505 @@
         "abs_tol": 1e-12
       }
     ]
+  },
+  {
+    "file": "transient_init_leakage_pulse_decay_cbc.py",
+    "comment": "Leakage sanity: vacuum boundaries with source removal (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "LEAKAGE_DECAY_PASS",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_init_precursor_decay_cbc.py",
+    "comment": "Delayed precursor decay check with source removal (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "PRECURSOR_DECAY_PASS",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_init_steady_state_source_cbc.py",
+    "comment": "Steady-state source init: 1D absorber consistency (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "STEADY_INIT_PASS",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_init_time_dependent_source_cbc.py",
+    "comment": "Time-dependent source init: 1D absorber step consistency (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "TD_INIT_PASS",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_init_zero_absorber_source_cbc.py",
+    "comment": "Zero-init transient: 1D absorber with constant source (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "ZERO_INIT_PASS",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_1d_delayed_fission_prod_count_cbc.py",
+    "comment": "1D delayed fission production invariant to precursor count (1p vs 2p) (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "K_PRECURSOR_FPROD_PASS",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_1d_delayed_prke_vs_stk_cbc.py",
+    "comment": "1D delayed homogeneous step: PRKE vs space-time kinetics (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "PRKE_STK_PASS",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_1d_prompt_step_cbc.py",
+    "comment": "1D prompt-only xs step (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "FP_RATIO_ACTUAL",
+        "wordnum": 1,
+        "gold": 1.2,
+        "abs_tol": 1e-06
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_1d_theta_precursor_scaling_cbc.py",
+    "comment": "1D delayed fission source scales with theta (TransientSourceFunction check) (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "DELAYED_THETA_RATIO",
+        "wordnum": 1,
+        "gold": 2,
+        "abs_tol": 0.01
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_2d_2g_delayed_step_cbc.py",
+    "comment": "2D 2g delayed xs step (ratio) (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "FP_RATIO_ACTUAL",
+        "wordnum": 1,
+        "gold": 1.2,
+        "abs_tol": 1e-06
+      },
+      {
+        "type": "FloatCompare",
+        "key": "TRANSIENT_OK",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_2d_2g_prompt_combine_velocities_cbc.py",
+    "comment": "2D 2g prompt combine xs with group-wise velocities (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "FP_RATIO_ACTUAL",
+        "wordnum": 1,
+        "gold": 2.2,
+        "abs_tol": 1e-06
+      },
+      {
+        "type": "FloatCompare",
+        "key": "TRANSIENT_OK",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_2d_delayed_prke_vs_stk_2p_cbc.py",
+    "comment": "2D delayed homogeneous step: PRKE vs space-time kinetics (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "PRKE_STK_PASS",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_2d_prompt_ramp_xs_cbc.py",
+    "comment": "2D prompt-only ramp xs (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "TRANSIENT_OK",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_3d_2g_prompt_step_xs_cbc.py",
+    "comment": "3D 2g prompt-only step xs swap (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "FP_RATIO_ACTUAL",
+        "wordnum": 1,
+        "gold": 1.2,
+        "abs_tol": 1e-06
+      },
+      {
+        "type": "FloatCompare",
+        "key": "TRANSIENT_OK",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_3d_6g_delayed_step_nu_sigma_f_cbc.py",
+    "comment": "3D 6g, 2-precursor delayed step nu_sigma_f (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "FP_RATIO_ACTUAL",
+        "wordnum": 1,
+        "gold": 1.2,
+        "abs_tol": 1e-06
+      },
+      {
+        "type": "FloatCompare",
+        "key": "TRANSIENT_OK",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_3d_delayed_analytic_cbc.py",
+    "comment": "3D delayed analytic point-kinetics check (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "ANALYTIC_PASS",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_3d_delayed_prke_vs_stk_2p_callbacks_cbc.py",
+    "comment": "3D delayed homogeneous step: PRKE vs space-time kinetics (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "PRKE_STK_PASS",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_3d_delayed_prke_vs_stk_2p_cbc.py",
+    "comment": "3D delayed homogeneous step: PRKE vs space-time kinetics (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "PRKE_STK_PASS",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_3d_delayed_ramp_xs_cbc.py",
+    "comment": "3D delayed ramp xs with point-kinetics reference (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "TRANSIENT_OK",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_3d_delayed_stiff_dt_sensitivity_cbc.py",
+    "comment": "3D delayed stiff precursor dt (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "TRANSIENT_OK",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_3d_prompt_analytic_cbc.py",
+    "comment": "3D prompt-only analytic exponential check (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "ANALYTIC_PASS",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_3d_prompt_bc_leakage_cbc.py",
+    "comment": "3D prompt leakage vs reflecting boundary (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "TRANSIENT_OK",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_keigen_3d_prompt_mid_step_swap_cbc.py",
+    "comment": "3D prompt mid-step XS swap (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "TIME_AT_SWAP",
+        "wordnum": 1,
+        "gold": 0.07,
+        "abs_tol": 1e-10
+      },
+      {
+        "type": "FloatCompare",
+        "key": "FP_RATIO_AT_SWAP",
+        "wordnum": 1,
+        "gold": 1.2,
+        "abs_tol": 0.0001
+      },
+      {
+        "type": "FloatCompare",
+        "key": "TRANSIENT_OK",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_zero_3d_1g_pulse_inf_med_cbc.py",
+    "comment": "3D unstructured mesh, 1 group, vel = 1.0, zero-init transient pulse (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "KeyValuePair",
+        "key": "Max phi(3s) = ",
+        "goldvalue": 5.104477,
+        "abs_tol": 5e-06
+      }
+    ]
+  },
+  {
+    "file": "transient_zero_3d_1g_ramp_source_analytic_cbc.py",
+    "comment": "Zero-mode transient with ramped source and analytic check (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "FloatCompare",
+        "key": "RAMP_SOURCE_ANALYTIC_PASS",
+        "wordnum": 1,
+        "gold": 1,
+        "abs_tol": 1e-12
+      }
+    ]
+  },
+  {
+    "file": "transient_zero_3d_1g_v0.5_inf_med_cbc.py",
+    "comment": "3D unstructured mesh, 1 group, vel = 0.5, zero-init transient (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "KeyValuePair",
+        "key": "Max phi(2s) = ",
+        "goldvalue": 0.889725,
+        "abs_tol": 5e-06
+      }
+    ]
+  },
+  {
+    "file": "transient_zero_3d_1g_v1_inf_med_cbc.py",
+    "comment": "3D unstructured mesh, 1 group, vel = 1.0, zero-init transient (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "KeyValuePair",
+        "key": "Max phi(1s) = ",
+        "goldvalue": 2.330956,
+        "abs_tol": 5e-06
+      }
+    ]
+  },
+  {
+    "file": "transient_zero_3d_1g_v1_inf_med_swap_cbc.py",
+    "comment": "3D unstructured mesh, 1 group, vel = 1.0, zero-init transient, swap xs at 0.5s (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "KeyValuePair",
+        "key": "Max phi(1s) = ",
+        "goldvalue": 1.736077,
+        "abs_tol": 5e-06
+      }
+    ]
+  },
+  {
+    "file": "transient_zero_3d_1g_v2_inf_med_cbc.py",
+    "comment": "3D unstructured mesh, 1 group, vel = 2.0, zero-init transient (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "KeyValuePair",
+        "key": "Max phi(1s) = ",
+        "goldvalue": 3.184793,
+        "abs_tol": 5e-06
+      }
+    ]
+  },
+  {
+    "file": "transient_zero_3d_2g_inf_med_downscatter_cbc.py",
+    "comment": "3D unstructured mesh, 2 group, downscatter, zero-init transient (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "KeyValuePair",
+        "key": "Max phi0(2s) = ",
+        "goldvalue": 3.658193,
+        "abs_tol": 5e-06
+      },
+      {
+        "type": "KeyValuePair",
+        "key": "Max phi1(2s) = ",
+        "goldvalue": 1.027354,
+        "abs_tol": 1e-06
+      }
+    ]
+  },
+  {
+    "file": "transient_zero_3d_2g_inf_med_downscatter_swap_cbc.py",
+    "comment": "3D unstructured mesh, 2 group, downscatter, zero-init transient, swap xs at 0.5s (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "KeyValuePair",
+        "key": "Max phi0(1s) = ",
+        "goldvalue": 1.978816,
+        "abs_tol": 1e-06
+      },
+      {
+        "type": "KeyValuePair",
+        "key": "Max phi1(1s) = ",
+        "goldvalue": 0.394171,
+        "abs_tol": 1e-06
+      }
+    ]
+  },
+  {
+    "file": "transient_zero_3d_2g_inf_med_pydrvr_ramp_dt_cbc.py",
+    "comment": "3D unstructured mesh, 2 group, downscatter, zero-init transient, Python time step loop, ramp dt (CBC)",
+    "num_procs": 4,
+    "checks": [
+      {
+        "type": "KeyValuePair",
+        "key": "Max phi0 = ",
+        "goldvalue": 3.674989,
+        "abs_tol": 5e-06
+      },
+      {
+        "type": "KeyValuePair",
+        "key": "Max phi1 = ",
+        "goldvalue": 1.035541,
+        "abs_tol": 1e-06
+      }
+    ]
   }
 ]
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_leakage_pulse_decay_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_leakage_pulse_decay_cbc.py
new file mode 100644
index 0000000000..09b1d27ab7
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_leakage_pulse_decay_cbc.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Leakage sanity: 1D vacuum boundaries with source removal.
+
+Initialize from a steady-state source solve, then remove the source and
+advance one transient step. The scalar flux should decrease with leakage.
+LEAKAGE_DECAY_PASS is 1 if phi decreases and remains non-negative.
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.mesh import OrthogonalMeshGenerator
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.source import VolumetricSource
+    from pyopensn.aquad import GLProductQuadrature1DSlab
+    from pyopensn.solver import DiscreteOrdinatesProblem, SteadyStateSourceSolver, TransientSolver
+    from pyopensn.fieldfunc import FieldFunctionInterpolationVolume
+    from pyopensn.logvol import RPPLogicalVolume
+
+
+def max_phi(phys):  # "max" volume interpolation of scalar-flux field function 0
+    flux_ffs = phys.GetScalarFluxFieldFunction()
+    region = RPPLogicalVolume(infx=True, infy=True, infz=True)
+    probe = FieldFunctionInterpolationVolume()
+    probe.SetOperationType("max")
+    probe.SetLogicalVolume(region)
+    probe.AddFieldFunction(flux_ffs[0])
+    probe.Execute()
+    return probe.GetValue()
+
+
+if __name__ == "__main__":  # build the 1D problem, run the solves, print the pass flag
+    dx = 2.0 / 40
+    nodes = [i * dx for i in range(40 + 1)]  # 41 evenly spaced nodes from 0 to 2
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+
+    sigma_t = 1.0
+    Q = 2.0
+    dt = 0.05
+
+    xs = MultiGroupXS()
+    xs.CreateSimpleOneGroup(sigma_t, 0.0, 1.0)  # presumably (sigma_t, c, velocity) - confirm
+
+    source = VolumetricSource(block_ids=[0], group_strength=[Q], start_time=0.0, end_time=10.0)
+
+    pquad = GLProductQuadrature1DSlab(n_polar=4, scattering_order=0)
+
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=1,
+        groupsets=[
+            {
+                "groups_from_to": (0, 0),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "petsc_richardson",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+            }
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs}],
+        volumetric_sources=[source],
+        boundary_conditions=[  # vacuum on both named boundaries
+            {"name": "zmin", "type": "vacuum"},
+            {"name": "zmax", "type": "vacuum"},
+        ],
+        options={
+            "save_angular_flux": True,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",  # CBC variant of this test
+    )
+    # Steady-state source solve; the transient below starts from this state.
+    steady = SteadyStateSourceSolver(problem=phys)
+    steady.Initialize()
+    steady.Execute()
+
+    phi0 = max_phi(phys)  # reference value before the source is removed
+    # Clear the volumetric sources before taking the transient step.
+    phys.SetVolumetricSources(clear_volumetric_sources=True)
+
+    phys.SetTimeDependentMode()
+    # One transient step: stop_time equals dt.
+    solver = TransientSolver(problem=phys, dt=dt, theta=1.0, stop_time=dt, initial_state="existing")
+    solver.Initialize()
+    solver.Execute()
+
+    phi1 = max_phi(phys)
+    pass_flag = 1 if (phi1 >= 0.0 and phi1 < phi0) else 0  # decreased, still non-negative
+
+    if rank == 0:  # only rank 0 prints the result line
+        print(f"LEAKAGE_DECAY_PASS {pass_flag}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_precursor_decay_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_precursor_decay_cbc.py
new file mode 100644
index 0000000000..04b7e41a49
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_precursor_decay_cbc.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Delayed precursor decay: 1D delayed system with source removal.
+
+Compute a steady-state with precursors, remove the external source, and
+advance one transient step. The flux ratio should roughly follow
+exp(-lambda*dt) for the single precursor group. PRECURSOR_DECAY_PASS is 1
+if the ratio is within an absolute tolerance of 0.2 of that value.
+"""
+
+import math
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.mesh import OrthogonalMeshGenerator
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.source import VolumetricSource
+    from pyopensn.aquad import GLProductQuadrature1DSlab
+    from pyopensn.solver import DiscreteOrdinatesProblem, SteadyStateSourceSolver, TransientSolver
+    from pyopensn.fieldfunc import FieldFunctionInterpolationVolume
+    from pyopensn.logvol import RPPLogicalVolume
+
+
+def read_precursor_value(path, block_name):
+    """Return the second field of the first data row in the named BEGIN/END block."""
+    start_marker, stop_marker = f"{block_name}_BEGIN", f"{block_name}_END"
+    inside = False
+    with open(path, "r", encoding="utf-8") as handle:
+        for raw in handle:
+            text = raw.strip()
+            # Blank lines and '#' comment lines are ignored everywhere.
+            if text == "" or text[0] == "#":
+                continue
+            if text in (start_marker, stop_marker):
+                # Enter the block on the BEGIN marker, leave it on the END marker.
+                inside = text == start_marker
+                continue
+            if not inside:
+                continue
+            fields = text.split()
+            if len(fields) > 1:
+                return float(fields[1])
+    raise RuntimeError(f"Failed to find {block_name} in {path}")
+
+
+def max_phi(phys):  # "max" volume interpolation of scalar-flux field function 0
+    flux_ffs = phys.GetScalarFluxFieldFunction()
+    region = RPPLogicalVolume(infx=True, infy=True, infz=True)
+    probe = FieldFunctionInterpolationVolume()
+    probe.SetOperationType("max")
+    probe.SetLogicalVolume(region)
+    probe.AddFieldFunction(flux_ffs[0])
+    probe.Execute()
+    return probe.GetValue()
+
+
+if __name__ == "__main__":  # build the 1D problem, run the solves, print the pass flag
+    dx = 1.0 / 10
+    nodes = [i * dx for i in range(10 + 1)]  # 11 evenly spaced nodes from 0 to 1
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+
+    dt = 0.05
+    Q = 0.5
+    # Cross sections come from a file located next to this script.
+    xs_path = os.path.join(os.path.dirname(__file__), "xs1g_delayed_crit_1p.cxs")
+    xs = MultiGroupXS()
+    xs.LoadFromOpenSn(xs_path)
+
+    source = VolumetricSource(block_ids=[0], group_strength=[Q], start_time=0.0, end_time=10.0)
+
+    pquad = GLProductQuadrature1DSlab(n_polar=4, scattering_order=0)
+
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=1,
+        groupsets=[
+            {
+                "groups_from_to": (0, 0),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "petsc_richardson",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+            }
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs}],
+        volumetric_sources=[source],
+        boundary_conditions=[  # reflecting on both named boundaries
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={
+            "save_angular_flux": True,
+            "use_precursors": True,  # precursors enabled for this delayed test
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",  # CBC variant of this test
+    )
+    # Steady-state source solve; the transient below starts from this state.
+    steady = SteadyStateSourceSolver(problem=phys)
+    steady.Initialize()
+    steady.Execute()
+
+    phi0 = max_phi(phys)  # reference value before the source is removed
+    # Clear the volumetric sources before taking the transient step.
+    phys.SetVolumetricSources(clear_volumetric_sources=True)
+
+    phys.SetTimeDependentMode()
+    # One transient step: stop_time equals dt.
+    solver = TransientSolver(problem=phys, dt=dt, theta=1.0, stop_time=dt, initial_state="existing")
+    solver.Initialize()
+    solver.Execute()
+
+    phi1 = max_phi(phys)
+    ratio = phi1 / phi0 if phi0 > 0.0 else 0.0  # avoid dividing by a non-positive phi0
+    # The decay constant is parsed from the same cross-section file loaded above.
+    lam = read_precursor_value(xs_path, "PRECURSOR_DECAY_CONSTANTS")
+    expected = math.exp(-lam * dt)
+    pass_flag = 1 if abs(ratio - expected) < 0.2 else 0  # absolute tolerance of 0.2
+
+    if rank == 0:  # only rank 0 prints the result line
+        print(f"PRECURSOR_DECAY_PASS {pass_flag}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_steady_state_source_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_steady_state_source_cbc.py
new file mode 100644
index 0000000000..d5c620936d
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_steady_state_source_cbc.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Steady-state source initialization: 1D pure absorber with constant source.
+
+Initialize with a steady-state source solve, then advance one BE step with
+TransientSolver. The flux should remain at phi_ss = Q/sigma_t.
+STEADY_INIT_PASS is 1 if the transient step preserves the steady state.
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.mesh import OrthogonalMeshGenerator
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.source import VolumetricSource
+    from pyopensn.aquad import GLProductQuadrature1DSlab
+    from pyopensn.solver import DiscreteOrdinatesProblem, SteadyStateSourceSolver, TransientSolver
+    from pyopensn.fieldfunc import FieldFunctionInterpolationVolume
+    from pyopensn.logvol import RPPLogicalVolume
+
+
+def max_phi(phys):  # "max" volume interpolation of scalar-flux field function 0
+    flux_ffs = phys.GetScalarFluxFieldFunction()
+    region = RPPLogicalVolume(infx=True, infy=True, infz=True)
+    probe = FieldFunctionInterpolationVolume()
+    probe.SetOperationType("max")
+    probe.SetLogicalVolume(region)
+    probe.AddFieldFunction(flux_ffs[0])
+    probe.Execute()
+    return probe.GetValue()
+
+
+if __name__ == "__main__":  # build the 1D problem, run the solves, print the pass flag
+    dx = 1.0 / 10
+    nodes = [i * dx for i in range(10 + 1)]  # 11 evenly spaced nodes from 0 to 1
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+
+    sigma_t = 1.0
+    Q = 2.0
+    dt = 0.1
+
+    xs = MultiGroupXS()
+    xs.CreateSimpleOneGroup(sigma_t, 0.0, 1.0)  # presumably (sigma_t, c, velocity) - confirm
+
+    source = VolumetricSource(block_ids=[0], group_strength=[Q], start_time=0.0, end_time=10.0)
+
+    pquad = GLProductQuadrature1DSlab(n_polar=4, scattering_order=0)
+
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=1,
+        groupsets=[
+            {
+                "groups_from_to": (0, 0),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "petsc_richardson",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+            }
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs}],
+        volumetric_sources=[source],
+        boundary_conditions=[  # reflecting on both named boundaries
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={
+            "save_angular_flux": True,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",  # CBC variant of this test
+    )
+    # Steady-state source solve; the transient below starts from this state.
+    steady = SteadyStateSourceSolver(problem=phys)
+    steady.Initialize()
+    steady.Execute()
+
+    phys.SetTimeDependentMode()
+    # One transient step with the source still on: stop_time equals dt.
+    solver = TransientSolver(problem=phys, dt=dt, theta=1.0, stop_time=dt, initial_state="existing")
+    solver.Initialize()
+    solver.Execute()
+
+    phi_num = max_phi(phys)
+    phi_ss = Q / sigma_t  # steady-state value the transient step should preserve
+    rel_err = abs(phi_num - phi_ss) / phi_ss
+    pass_flag = 1 if rel_err < 1.0e-3 else 0  # relative tolerance of 1e-3
+
+    if rank == 0:  # only rank 0 prints the result line
+        print(f"STEADY_INIT_PASS {pass_flag}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_time_dependent_source_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_time_dependent_source_cbc.py
new file mode 100644
index 0000000000..69c5add538
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_time_dependent_source_cbc.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Time-dependent source initialization: 1D pure absorber with constant source.
+
+Initialize with a transient step from zero state, then advance one BE step with
+TransientSolver using the existing state. The next step should satisfy the analytic update
+phi^{n+1} = (phi^n + dt*Q)/(1 + sigma_t*dt).
+TD_INIT_PASS is 1 if the transient step matches the update within 2%.
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.mesh import OrthogonalMeshGenerator
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.source import VolumetricSource
+    from pyopensn.aquad import GLProductQuadrature1DSlab
+    from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver
+    from pyopensn.fieldfunc import FieldFunctionInterpolationVolume
+    from pyopensn.logvol import RPPLogicalVolume
+
+
+def max_phi(phys):  # "max" volume interpolation of scalar-flux field function 0
+    flux_ffs = phys.GetScalarFluxFieldFunction()
+    region = RPPLogicalVolume(infx=True, infy=True, infz=True)
+    probe = FieldFunctionInterpolationVolume()
+    probe.SetOperationType("max")
+    probe.SetLogicalVolume(region)
+    probe.AddFieldFunction(flux_ffs[0])
+    probe.Execute()
+    return probe.GetValue()
+
+
+if __name__ == "__main__":  # build the 1D problem, run two transient steps, print the flag
+    dx = 1.0 / 10
+    nodes = [i * dx for i in range(10 + 1)]  # 11 evenly spaced nodes from 0 to 1
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+
+    sigma_t = 1.0
+    Q = 1.2
+    dt = 0.1
+
+    xs = MultiGroupXS()
+    xs.CreateSimpleOneGroup(sigma_t, 0.0, 1.0)  # presumably (sigma_t, c, velocity) - confirm
+
+    source = VolumetricSource(block_ids=[0], group_strength=[Q], start_time=0.0, end_time=10.0)
+
+    pquad = GLProductQuadrature1DSlab(n_polar=4, scattering_order=0)
+
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=1,
+        time_dependent=True,  # time-dependent from construction; no steady solve here
+        groupsets=[
+            {
+                "groups_from_to": (0, 0),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "petsc_richardson",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+            }
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs}],
+        volumetric_sources=[source],
+        boundary_conditions=[  # reflecting on both named boundaries
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={
+            "save_angular_flux": True,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",  # CBC variant of this test
+    )
+    # First step, started from the "zero" initial state; its result is phi^n.
+    td_solver = TransientSolver(problem=phys, dt=dt, theta=1.0, stop_time=dt, initial_state="zero")
+    td_solver.Initialize()
+    td_solver.Execute()
+
+    phi_n = max_phi(phys)
+    # Second step, started from the existing state; its result is phi^{n+1}.
+    solver = TransientSolver(problem=phys, dt=dt, theta=1.0, stop_time=dt, initial_state="existing")
+    solver.Initialize()
+    solver.Execute()
+
+    phi_np1 = max_phi(phys)
+    phi_expected = (phi_n + dt * Q) / (1.0 + sigma_t * dt)  # analytic update for phi^{n+1}
+    rel_err = abs(phi_np1 - phi_expected) / phi_expected
+    pass_flag = 1 if rel_err < 0.02 else 0  # relative tolerance of 2%
+
+    if rank == 0:  # only rank 0 prints the result line
+        print(f"TD_INIT_PASS {pass_flag}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_zero_absorber_source_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_zero_absorber_source_cbc.py
new file mode 100644
index 0000000000..7ffa74b25f
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_zero_absorber_source_cbc.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Zero-init transient: 1D pure absorber with a constant volumetric source.
+
+A single Backward Euler step from zero should match the analytic
+phi = Q*dt/(1 + sigma_t*dt) for a homogeneous infinite medium.
+ZERO_INIT_PASS is 1 if the numeric and analytic values agree within 2%.
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.mesh import OrthogonalMeshGenerator
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.source import VolumetricSource
+    from pyopensn.aquad import GLProductQuadrature1DSlab
+    from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver
+    from pyopensn.fieldfunc import FieldFunctionInterpolationVolume
+    from pyopensn.logvol import RPPLogicalVolume
+
+
+def max_phi(phys):  # "max" volume interpolation of scalar-flux field function 0
+    flux_ffs = phys.GetScalarFluxFieldFunction()
+    region = RPPLogicalVolume(infx=True, infy=True, infz=True)
+    probe = FieldFunctionInterpolationVolume()
+    probe.SetOperationType("max")
+    probe.SetLogicalVolume(region)
+    probe.AddFieldFunction(flux_ffs[0])
+    probe.Execute()
+    return probe.GetValue()
+
+
+if __name__ == "__main__":  # build the 1D problem, take one transient step, print the flag
+    dx = 1.0 / 10
+    nodes = [i * dx for i in range(10 + 1)]  # 11 evenly spaced nodes from 0 to 1
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+
+    sigma_t = 1.0
+    Q = 1.5
+    dt = 0.1
+
+    xs = MultiGroupXS()
+    xs.CreateSimpleOneGroup(sigma_t, 0.0, 1.0)  # presumably (sigma_t, c, velocity) - confirm
+
+    source = VolumetricSource(block_ids=[0], group_strength=[Q], start_time=0.0, end_time=1.0)
+
+    pquad = GLProductQuadrature1DSlab(n_polar=4, scattering_order=0)
+
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=1,
+        time_dependent=True,  # time-dependent from construction; no steady solve here
+        groupsets=[
+            {
+                "groups_from_to": (0, 0),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "petsc_richardson",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+            }
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs}],
+        volumetric_sources=[source],
+        boundary_conditions=[  # reflecting on both named boundaries
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={
+            "save_angular_flux": True,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",  # CBC variant of this test
+    )
+    # Single step from the "zero" initial state: stop_time equals dt.
+    solver = TransientSolver(problem=phys, dt=dt, theta=1.0, stop_time=dt, initial_state="zero")
+    solver.Initialize()
+    solver.Execute()
+
+    phi_num = max_phi(phys)
+    phi_analytic = Q * dt / (1.0 + sigma_t * dt)  # analytic value after one step from zero
+    rel_err = abs(phi_num - phi_analytic) / phi_analytic
+    pass_flag = 1 if rel_err < 0.02 else 0  # relative tolerance of 2%
+
+    if rank == 0:  # only rank 0 prints the result line
+        print(f"ZERO_INIT_PASS {pass_flag}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_delayed_fission_prod_count_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_delayed_fission_prod_count_cbc.py
new file mode 100644
index 0000000000..d8c1fcbdbe
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_delayed_fission_prod_count_cbc.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+1D delayed fission production consistency across precursor counts.
+
+Same physics (prompt + delayed) but with 1 vs 2 precursors should
+yield the same steady-state total fission production. This test
+fails if delayed production is over-counted per precursor.
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.mesh import OrthogonalMeshGenerator
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.aquad import GLProductQuadrature1DSlab
+    from pyopensn.solver import DiscreteOrdinatesProblem, PowerIterationKEigenSolver
+
+
+def solve_and_get_fission_prod(xs_path):
+    """Run a CBC k-eigenvalue solve on a 40-cell reflecting slab.
+
+    Args:
+        xs_path: Path handed to MultiGroupXS.LoadFromOpenSn.
+    Returns:
+        The value of phys.ComputeFissionProduction("new") after the solve.
+    """
+    num_cells = 40
+    cell_width = 8.0 / num_cells
+    node_positions = [i * cell_width for i in range(num_cells + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[node_positions])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+
+    xs = MultiGroupXS()
+    xs.LoadFromOpenSn(xs_path)
+
+    groupset = {
+        "groups_from_to": (0, 0),
+        "angular_quadrature": GLProductQuadrature1DSlab(n_polar=4, scattering_order=0),
+        "inner_linear_method": "petsc_richardson",
+        "l_abs_tol": 1.0e-8,
+        "l_max_its": 200,
+    }
+    reflecting_faces = [{"name": face, "type": "reflecting"} for face in ("zmin", "zmax")]
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=1,
+        groupsets=[groupset],
+        xs_map=[{"block_ids": [0], "xs": xs}],
+        boundary_conditions=reflecting_faces,
+        options={
+            "use_precursors": True,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+
+    ksolver = PowerIterationKEigenSolver(problem=phys, max_iters=200, k_tol=1.0e-12)
+    ksolver.Initialize()
+    ksolver.Execute()
+
+    # Use the steady-state flux to compute total fission production.
+    return phys.ComputeFissionProduction("new")
+
+
+if __name__ == "__main__":
+    # Same physics with one vs. two precursors; production totals must agree.
+    here = os.path.dirname(__file__)
+    fp_1p = solve_and_get_fission_prod(os.path.join(here, "xs1g_delayed_crit_1p.cxs"))
+    fp_2p = solve_and_get_fission_prod(os.path.join(here, "xs1g_delayed_crit_2p.cxs"))
+
+    # Relative difference, with the denominator floored at 1.0.
+    denom = max(fp_1p, fp_2p, 1.0)
+    rel_diff = abs(fp_1p - fp_2p) / denom
+    tol = 1.0e-6
+    pass_flag = int(rel_diff < tol)
+
+    if rank == 0:
+        print(f"FP_1P {fp_1p:.8e} FP_2P {fp_2p:.8e}")
+        print(f"K_PRECURSOR_FPROD_PASS {pass_flag}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_delayed_prke_vs_stk_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_delayed_prke_vs_stk_cbc.py
new file mode 100644
index 0000000000..be9fcae998
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_delayed_prke_vs_stk_cbc.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+1D delayed transient: homogeneous step xs vs PRKE.
+
+Validate space-time kinetics against point-reactor kinetics (PRKE) for a
+homogeneous perturbation in a homogeneous system.
+
+1-group, 1 precursor, reflecting boundaries (infinite-medium). A step
+to a supercritical xs is applied at t=0. Space-time kinetics should follow
+PRKE for this homogeneous case.
+
+PRKE_STK_PASS is 1 if the space-time fission-production ratio matches PRKE within 2%
+for t<=0.2.
+"""
+
+import math
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.solver import (
+        DiscreteOrdinatesProblem,
+        PowerIterationKEigenSolver,
+        TransientSolver,
+    )
+    from pyopensn.aquad import GLProductQuadrature1DSlab
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.mesh import OrthogonalMeshGenerator
+
+
+def read_precursor_value(path, block_name):
+    """Return the second field of the first data row inside a named block.
+
+    Raises RuntimeError when the file holds no such row for block_name.
+    """
+    opening = f"{block_name}_BEGIN"
+    closing = f"{block_name}_END"
+    inside = False
+    with open(path, "r", encoding="utf-8") as handle:
+        for raw in handle:
+            text = raw.strip()
+            if not text or text.startswith("#"):
+                continue  # blank lines and comments carry no data
+            if text in (opening, closing):
+                inside = text == opening
+                continue
+            fields = text.split()
+            if inside and len(fields) >= 2:
+                return float(fields[1])
+    raise RuntimeError(f"Failed to find {block_name} in {path}")
+
+
+def prke_phi_ratio(t, beta, lam, rho, Lambda):
+    """One-precursor point-kinetics step response with phi(0)=1, C(0)=beta/(Lambda*lam)."""
+    root_sum = (rho - beta) / Lambda - lam
+    root_gap = math.sqrt(((rho - beta) / Lambda + lam) ** 2 + 4.0 * beta * lam / Lambda)
+    root_hi = 0.5 * (root_sum + root_gap)
+    root_lo = 0.5 * (root_sum - root_gap)
+    # Precursor-to-phi ratio carried by each exponential mode.
+    mode_hi = (beta / Lambda) / (root_hi + lam)
+    mode_lo = (beta / Lambda) / (root_lo + lam)
+
+    # Mode weights chosen so that both initial conditions hold.
+    weight_lo = (beta / (Lambda * lam) - mode_hi) / (mode_lo - mode_hi)
+    weight_hi = 1.0 - weight_lo
+    return weight_hi * math.exp(root_hi * t) + weight_lo * math.exp(root_lo * t)
+
+
+if __name__ == "__main__":
+    # 40 uniform cells spanning [0, 8].
+    dx = 8.0 / 40
+    nodes = [i * dx for i in range(40 + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+
+    here = os.path.dirname(__file__)
+    xs_super_path = os.path.join(here, "xs1g_delayed_super_1p.cxs")
+
+    xs_crit = MultiGroupXS()
+    xs_crit.LoadFromOpenSn(os.path.join(here, "xs1g_delayed_crit_1p.cxs"))
+
+    xs_super = MultiGroupXS()
+    xs_super.LoadFromOpenSn(xs_super_path)
+
+    pquad = GLProductQuadrature1DSlab(n_polar=4, scattering_order=0)
+
+    num_groups = 1
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        groupsets=[
+            {
+                "groups_from_to": (0, num_groups - 1),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "petsc_richardson",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+            },
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs_crit}],
+        boundary_conditions=[
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={
+            "save_angular_flux": True,
+            "use_precursors": True,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+
+    # k-eigenvalue solve; the transient below starts from the "existing" state.
+    keigen = PowerIterationKEigenSolver(problem=phys, max_iters=200, k_tol=1.0e-10)
+    keigen.Initialize()
+    keigen.Execute()
+
+    phys.SetTimeDependentMode()
+
+    solver = TransientSolver(problem=phys, initial_state="existing")
+    solver.Initialize()
+
+    # Apply homogeneous perturbation at t=0
+    phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}])
+
+    # PRKE parameters from xs
+    sigma_a = xs_super.sigma_a[0]
+    nu_sigma_f = xs_super.nu_sigma_f[0]
+    inv_vel = xs_super.inv_velocity[0]
+    v = 1.0 / inv_vel
+
+    # k_inf = nu_sigma_f / sigma_a for 1-group infinite medium
+    k_eff = nu_sigma_f / sigma_a
+    rho = (k_eff - 1.0) / k_eff
+
+    beta = read_precursor_value(xs_super_path, "PRECURSOR_FRACTIONAL_YIELDS")
+    lam = read_precursor_value(xs_super_path, "PRECURSOR_DECAY_CONSTANTS")
+    Lambda = 1.0 / (v * nu_sigma_f)
+
+    dt = 1.0e-2
+    solver.SetTimeStep(dt)
+    solver.SetTheta(1.0)
+
+    # Reference fission production, taken before the first transient step.
+    fp0 = phys.ComputeFissionProduction("new")
+
+    t_end = 0.2
+    rel_tol = 2.0e-2
+    ok = True
+    # Stop half a step short of t_end so accumulated round-off in the solver
+    # time cannot trigger an extra step beyond the documented t <= 0.2 window.
+    while phys.GetTime() < t_end - 0.5 * dt:
+        solver.Advance()
+        fp_new = phys.ComputeFissionProduction("new")
+        t_to = phys.GetTime()
+
+        ratio_num = fp_new / fp0
+        ratio_prke = prke_phi_ratio(t_to, beta, lam, rho, Lambda)
+        if abs(ratio_num - ratio_prke) > rel_tol * ratio_prke:
+            ok = False
+
+    if rank == 0:
+        print(f"PRKE_STK_PASS {1 if ok else 0}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_prompt_step_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_prompt_step_cbc.py
new file mode 100644
index 0000000000..156ce6b217
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_prompt_step_cbc.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+1D prompt-only transient: xs step change.
+
+1-group prompt-only. A step change scales macroscopic fission terms. With
+reflecting BCs, the FP ratio should match the scaling.
+
+FP_RATIO_ACTUAL = 1.2 from scaling sigma_f by 1.2
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.solver import (
+        DiscreteOrdinatesProblem,
+        PowerIterationKEigenSolver,
+        TransientSolver,
+    )
+    from pyopensn.aquad import GLProductQuadrature1DSlab
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.mesh import OrthogonalMeshGenerator
+
+if __name__ == "__main__":
+    # 40 uniform cells spanning [0, 8].
+    cell_width = 8.0 / 40
+    node_positions = [i * cell_width for i in range(40 + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[node_positions])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+
+    here = os.path.dirname(__file__)
+    xs_crit = MultiGroupXS()
+    xs_crit.LoadFromOpenSn(os.path.join(here, "xs1g_prompt_crit.cxs"))
+
+    xs_dense = MultiGroupXS()
+    xs_dense.LoadFromOpenSn(os.path.join(here, "xs1g_prompt_density_up.cxs"))
+
+    num_groups = 1
+    groupset = {
+        "groups_from_to": (0, num_groups - 1),
+        "angular_quadrature": GLProductQuadrature1DSlab(n_polar=4, scattering_order=0),
+        "inner_linear_method": "classic_richardson",
+        "l_abs_tol": 1.0e-8,
+        "l_max_its": 200,
+    }
+    reflecting_faces = [{"name": face, "type": "reflecting"} for face in ("zmin", "zmax")]
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        groupsets=[groupset],
+        xs_map=[{"block_ids": [0], "xs": xs_crit}],
+        boundary_conditions=reflecting_faces,
+        options={
+            "save_angular_flux": True,
+            "use_precursors": False,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+
+    # k-eigenvalue solve; the transient below starts from the "existing" state.
+    keigen = PowerIterationKEigenSolver(problem=phys)
+    keigen.Initialize()
+    keigen.Execute()
+
+    phys.SetTimeDependentMode()
+
+    solver = TransientSolver(problem=phys, initial_state="existing")
+    solver.Initialize()
+
+    # Fission production immediately before and after the xs swap.
+    fp_old = phys.ComputeFissionProduction("new")
+    phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_dense}])
+    fp_new = phys.ComputeFissionProduction("new")
+
+    ratio_expected = 1.2
+    ratio_actual = fp_new / fp_old
+
+    dt = 1.0e-2
+    solver.SetTimeStep(dt)
+    solver.SetTheta(1.0)
+
+    # Two transient steps; record the production after each one.
+    fp_steps = []
+    for _ in range(2):
+        solver.Advance()
+        fp_steps.append(phys.ComputeFissionProduction("new"))
+    fp1, fp2 = fp_steps
+
+    # Production has to grow on both steps.
+    growth1 = fp1 / fp_new
+    growth2 = fp2 / fp1
+    transient_ok = int(growth1 > 1.0 and growth2 > 1.0)
+
+    if rank == 0:
+        print(f"FP_RATIO_EXPECTED {ratio_expected:.12e}")
+        print(f"FP_RATIO_ACTUAL {ratio_actual:.12e}")
+        print(f"TRANSIENT_OK {transient_ok}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_theta_precursor_scaling_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_theta_precursor_scaling_cbc.py
new file mode 100644
index 0000000000..c3f6212940
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_theta_precursor_scaling_cbc.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+1D transient check: delayed fission source scales with theta*dt.
+
+Compare delayed-vs-prompt FP ratio deltas at theta=1.0 and theta=0.5 for a
+single step. The transient delayed-fission source should scale with theta,
+so the delta at theta=1 should be roughly twice the delta at theta=0.5.
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.solver import (
+        DiscreteOrdinatesProblem,
+        PowerIterationKEigenSolver,
+        TransientSolver,
+    )
+    from pyopensn.aquad import GLProductQuadrature1DSlab
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.mesh import OrthogonalMeshGenerator
+
+
+def run_case(theta, use_precursors, xs):
+    """Take one transient step after a k-eigenvalue solve on a reflecting slab.
+
+    Returns the fission production after the step divided by its value before.
+    """
+    # 40 uniform cells spanning [0, 8].
+    cell_width = 8.0 / 40
+    node_positions = [i * cell_width for i in range(40 + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[node_positions])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+
+    groupset = {
+        "groups_from_to": (0, 0),
+        "angular_quadrature": GLProductQuadrature1DSlab(n_polar=4, scattering_order=0),
+        "inner_linear_method": "classic_richardson",
+        "l_abs_tol": 1.0e-8,
+        "l_max_its": 200,
+    }
+    reflecting_faces = [{"name": face, "type": "reflecting"} for face in ("zmin", "zmax")]
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=1,
+        groupsets=[groupset],
+        xs_map=[{"block_ids": [0], "xs": xs}],
+        boundary_conditions=reflecting_faces,
+        options={
+            "save_angular_flux": True,
+            "use_precursors": use_precursors,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+
+    keigen = PowerIterationKEigenSolver(problem=phys)
+    keigen.Initialize()
+    keigen.Execute()
+
+    phys.SetTimeDependentMode()
+
+    solver = TransientSolver(problem=phys, initial_state="existing")
+    solver.Initialize()
+    solver.SetTheta(theta)
+    solver.SetTimeStep(1.0e-2)
+
+    fp_before = phys.ComputeFissionProduction("new")
+    solver.Advance()
+    fp_after = phys.ComputeFissionProduction("new")
+
+    return fp_after / fp_before
+
+
+if __name__ == "__main__":
+    xs = MultiGroupXS()
+    xs.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_crit_1p.cxs"))
+
+    # FP ratio minus its prompt-only counterpart, for theta = 1.0 then theta = 0.5.
+    deltas = []
+    for theta in (1.0, 0.5):
+        ratio_delayed = run_case(theta, True, xs)
+        ratio_prompt = run_case(theta, False, xs)
+        deltas.append(ratio_delayed - ratio_prompt)
+    delta_t1, delta_t05 = deltas
+
+    # Report 0.0 unless both deltas are positive and the denominator is not tiny.
+    if delta_t1 > 0.0 and delta_t05 > 1.0e-10:
+        ratio = delta_t1 / delta_t05
+    else:
+        ratio = 0.0
+
+    if rank == 0:
+        print(f"DELAYED_THETA_RATIO {ratio:.12e}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_2g_delayed_step_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_2g_delayed_step_cbc.py
new file mode 100644
index 0000000000..d7b6a2a171
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_2g_delayed_step_cbc.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+2D 2-group delayed transient xs step change.
+
+Confirm delayed-neutron coupling with an xs step in a multi-group
+setting.
+
+2-group, delayed neutrons enabled. A step change scales macroscopic fission
+terms.
+
+FP_RATIO_ACTUAL = 1.2 from scaling sigma_f by 1.2.
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.solver import (
+        DiscreteOrdinatesProblem,
+        PowerIterationKEigenSolver,
+        TransientSolver,
+    )
+    from pyopensn.aquad import GLCProductQuadrature2DXY
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.mesh import OrthogonalMeshGenerator
+
+if __name__ == "__main__":
+    # 6 x 6 uniform cells on [0, 6] x [0, 6].
+    cell_width = 6.0 / 6
+    node_positions = [i * cell_width for i in range(6 + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[node_positions, node_positions])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+
+    here = os.path.dirname(__file__)
+    xs_crit = MultiGroupXS()
+    xs_crit.LoadFromOpenSn(os.path.join(here, "xs2g_delayed_crit_1p.cxs"))
+
+    xs_dense = MultiGroupXS()
+    xs_dense.LoadFromOpenSn(os.path.join(here, "xs2g_delayed_density_up_1p.cxs"))
+
+    pquad = GLCProductQuadrature2DXY(n_polar=2, n_azimuthal=4, scattering_order=0)
+    num_groups = 2
+    groupset = {
+        "groups_from_to": (0, num_groups - 1),
+        "angular_quadrature": pquad,
+        "inner_linear_method": "classic_richardson",
+        "l_abs_tol": 1.0e-8,
+        "l_max_its": 200,
+        "gmres_restart_interval": 50,
+    }
+    reflecting_faces = [
+        {"name": face, "type": "reflecting"} for face in ("xmin", "xmax", "ymin", "ymax")
+    ]
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        groupsets=[groupset],
+        xs_map=[{"block_ids": [0], "xs": xs_crit}],
+        boundary_conditions=reflecting_faces,
+        options={
+            "save_angular_flux": True,
+            "use_precursors": True,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+
+    # k-eigenvalue solve; the transient below starts from the "existing" state.
+    keigen = PowerIterationKEigenSolver(problem=phys)
+    keigen.Initialize()
+    keigen.Execute()
+
+    phys.SetTimeDependentMode()
+
+    solver = TransientSolver(problem=phys, initial_state="existing")
+    solver.Initialize()
+
+    # Fission production immediately before and after the xs swap.
+    fp_old = phys.ComputeFissionProduction("new")
+    phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_dense}])
+    fp_new = phys.ComputeFissionProduction("new")
+
+    dt = 1.0e-2
+    solver.SetTimeStep(dt)
+    solver.SetTheta(1.0)
+
+    # Two transient steps; record the production after each one.
+    fp_steps = []
+    for _ in range(2):
+        solver.Advance()
+        fp_steps.append(phys.ComputeFissionProduction("new"))
+    fp1, fp2 = fp_steps
+
+    # Accept positive production whose step-to-step ratio stays within (0.5, 2.0).
+    r1 = fp1 / fp_new
+    r2 = fp2 / fp1
+    ratios_bounded = 0.5 < r1 < 2.0 and 0.5 < r2 < 2.0
+    transient_ok = int(fp1 > 0.0 and fp2 > 0.0 and ratios_bounded)
+
+    if rank == 0:
+        print(f"FP_RATIO_ACTUAL {fp_new / fp_old:.12e}")
+        print(f"TRANSIENT_OK {transient_ok}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_2g_prompt_combine_velocities_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_2g_prompt_combine_velocities_cbc.py
new file mode 100644
index 0000000000..6ba94b268e
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_2g_prompt_combine_velocities_cbc.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+2D 2-group prompt: Combine xs with group-wise velocities.
+
+2-group prompt-only. Combine forms a composite xs from two macroscopic xs
+inputs.
+
+FP_RATIO_ACTUAL = 2.2
+Combine uses density weights, so with (1.0, 1.0):
+sigma_f_mix = 1.0 * sigma_f_crit + 1.0 * sigma_f_super.
+Given sigma_f_super = 1.2 * sigma_f_crit, the ratio is
+sigma_f_mix / sigma_f_crit = 1.0 + 1.2 = 2.2.
+FP_RATIO_ACTUAL checks Combine behavior with
+mixed group velocities. TRANSIENT_OK ensures the first transient step is
+finite and positive.
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.solver import (
+        DiscreteOrdinatesProblem,
+        PowerIterationKEigenSolver,
+        TransientSolver,
+    )
+    from pyopensn.aquad import GLCProductQuadrature2DXY
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.mesh import OrthogonalMeshGenerator
+
+if __name__ == "__main__":
+    dx = 6.0 / 6
+    nodes = [i * dx for i in range(6 + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+
+    xs_crit = MultiGroupXS()
+    xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs2g_prompt_crit.cxs"))
+    xs_super = MultiGroupXS()
+    xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs2g_prompt_super.cxs"))
+
+    # Combine both inputs with weight 1.0 each.
+    xs_mix = MultiGroupXS.Combine([(xs_crit, 1.0), (xs_super, 1.0)])
+
+    pquad = GLCProductQuadrature2DXY(n_polar=2, n_azimuthal=4, scattering_order=0)
+    num_groups = 2
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        groupsets=[
+            {
+                "groups_from_to": (0, num_groups - 1),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "petsc_gmres",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+                "gmres_restart_interval": 10,
+            },
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs_crit}],
+        boundary_conditions=[
+            {"name": "xmin", "type": "reflecting"},
+            {"name": "xmax", "type": "reflecting"},
+            {"name": "ymin", "type": "reflecting"},
+            {"name": "ymax", "type": "reflecting"},
+        ],
+        options={
+            "save_angular_flux": True,
+            "use_precursors": False,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+
+    keigen = PowerIterationKEigenSolver(problem=phys)
+    keigen.Initialize()
+    keigen.Execute()
+
+    phys.SetTimeDependentMode()
+
+    solver = TransientSolver(problem=phys, initial_state="existing")
+    solver.Initialize()
+
+    # Fission production immediately before and after swapping in the combined xs.
+    fp_old = phys.ComputeFissionProduction("new")
+    phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_mix}])
+    fp_new = phys.ComputeFissionProduction("new")
+
+    dt = 1.0e-2
+    solver.SetTimeStep(dt)
+    solver.SetTheta(1.0)
+
+    solver.Advance()
+    fp1 = phys.ComputeFissionProduction("new")
+
+    # Positive AND finite: NaN fails both comparisons, +inf fails the upper bound.
+    transient_ok = 1 if (0.0 < fp1 < float("inf")) else 0
+
+    if rank == 0:
+        print(f"FP_RATIO_ACTUAL {fp_new / fp_old:.12e}")
+        print(f"TRANSIENT_OK {transient_ok}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_delayed_prke_vs_stk_2p_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_delayed_prke_vs_stk_2p_cbc.py
new file mode 100644
index 0000000000..c56bdc5c4b
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_delayed_prke_vs_stk_2p_cbc.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+2D delayed transient: homogeneous step xs vs PRKE (2 precursors).
+
+1-group, 2 precursors, reflecting boundaries (infinite-medium). A step
+to a supercritical xs is applied at t=0. Space-time kinetics should follow
+PRKE for this homogeneous case.
+
+PRKE_STK_PASS is 1 if the space-time fission-production ratio matches PRKE within 2%
+for t<=0.2.
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.solver import (
+        DiscreteOrdinatesProblem,
+        PowerIterationKEigenSolver,
+        TransientSolver,
+    )
+    from pyopensn.aquad import GLCProductQuadrature2DXY
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.mesh import OrthogonalMeshGenerator
+
+
+def read_block_values(path, block_name):
+    """Collect the second field of every data row inside a named block.
+
+    Raises RuntimeError when no value is collected for block_name.
+    """
+    opening = f"{block_name}_BEGIN"
+    closing = f"{block_name}_END"
+    collected = []
+    inside = False
+    with open(path, "r", encoding="utf-8") as handle:
+        for raw in handle:
+            text = raw.strip()
+            if not text or text.startswith("#"):
+                continue  # blank lines and comments carry no data
+            if text in (opening, closing):
+                inside = text == opening
+                continue
+            fields = text.split()
+            if inside and len(fields) >= 2:
+                collected.append(float(fields[1]))
+    if collected:
+        return collected
+    raise RuntimeError(f"Failed to find {block_name} in {path}")
+
+
+def solve_linear(A, b):
+    """Solve A x = b by Gauss-Jordan elimination with partial pivoting.
+
+    Works on slice copies of A's rows and of b; returns the solution list.
+    Raises RuntimeError when a pivot magnitude falls below 1.0e-14.
+    """
+    n = len(b)
+    mat = [row[:] for row in A]
+    rhs = b[:]
+    for col in range(n):
+        # Bring the largest-magnitude entry at or below the diagonal onto it.
+        best = max(range(col, n), key=lambda r: abs(mat[r][col]))
+        if abs(mat[best][col]) < 1.0e-14:
+            raise RuntimeError("Singular system in PRKE solve")
+        if best != col:
+            mat[col], mat[best] = mat[best], mat[col]
+            rhs[col], rhs[best] = rhs[best], rhs[col]
+        scale = mat[col][col]
+        mat[col][col:n] = [entry / scale for entry in mat[col][col:n]]
+        rhs[col] /= scale
+        # Clear this column from every other row.
+        for r in range(n):
+            factor = mat[r][col]
+            if r == col or factor == 0.0:
+                continue
+            mat[r][col:n] = [v - factor * p for v, p in zip(mat[r][col:n], mat[col][col:n])]
+            rhs[r] -= factor * rhs[col]
+    return rhs
+
+
+def prke_step(phi, C, dt, beta, lambdas, rho, Lambda):
+    """Advance phi and the precursor list C by one implicit step of size dt.
+
+    Returns (phi_new, C_new), obtained by solving a (1 + m) x (1 + m) system.
+    """
+    m = len(lambdas)
+    beta_total = sum(beta)
+    dim = 1 + m
+    A = [[0.0] * dim for _ in range(dim)]
+
+    # Row 0 is the phi equation; row 1 + i is the equation for precursor i.
+    A[0][0] = 1.0 - dt * (rho - beta_total) / Lambda
+    for i, lam in enumerate(lambdas):
+        A[0][1 + i] = -dt * lam
+        A[1 + i][0] = -dt * (beta[i] / Lambda)
+        A[1 + i][1 + i] = 1.0 + dt * lam
+    rhs = [phi] + [C[i] for i in range(m)]
+
+    x = solve_linear(A, rhs)
+    return x[0], x[1:]
+
+
+if __name__ == "__main__":
+    # 6 x 6 uniform cells on [0, 6] x [0, 6].
+    dx = 6.0 / 6
+    nodes = [i * dx for i in range(6 + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+
+    here = os.path.dirname(__file__)
+    xs_super_path = os.path.join(here, "xs1g_delayed_super_2p.cxs")
+
+    xs_crit = MultiGroupXS()
+    xs_crit.LoadFromOpenSn(os.path.join(here, "xs1g_delayed_crit_2p.cxs"))
+
+    xs_super = MultiGroupXS()
+    xs_super.LoadFromOpenSn(xs_super_path)
+
+    pquad = GLCProductQuadrature2DXY(n_polar=2, n_azimuthal=4, scattering_order=0)
+
+    num_groups = 1
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        groupsets=[
+            {
+                "groups_from_to": (0, num_groups - 1),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "classic_richardson",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+            },
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs_crit}],
+        boundary_conditions=[
+            {"name": "xmin", "type": "reflecting"},
+            {"name": "xmax", "type": "reflecting"},
+            {"name": "ymin", "type": "reflecting"},
+            {"name": "ymax", "type": "reflecting"},
+        ],
+        options={
+            "save_angular_flux": True,
+            "use_precursors": True,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+
+    keigen = PowerIterationKEigenSolver(problem=phys, max_iters=200, k_tol=1.0e-10)
+    keigen.Initialize()
+    keigen.Execute()
+
+    phys.SetTimeDependentMode()
+
+    solver = TransientSolver(problem=phys, initial_state="existing")
+    solver.Initialize()
+
+    # Apply homogeneous perturbation at t=0
+    phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}])
+
+    # PRKE parameters from xs
+    sigma_a = xs_super.sigma_a[0]
+    nu_sigma_f = xs_super.nu_sigma_f[0]
+    inv_vel = xs_super.inv_velocity[0]
+    v = 1.0 / inv_vel
+
+    k_eff = nu_sigma_f / sigma_a
+    rho = (k_eff - 1.0) / k_eff
+    Lambda = 1.0 / (v * nu_sigma_f)
+
+    beta = read_block_values(xs_super_path, "PRECURSOR_FRACTIONAL_YIELDS")
+    lambdas = read_block_values(xs_super_path, "PRECURSOR_DECAY_CONSTANTS")
+
+    dt = 1.0e-2
+    solver.SetTimeStep(dt)
+    solver.SetTheta(1.0)
+
+    fp0 = phys.ComputeFissionProduction("new")
+
+    # PRKE initial conditions for steady state
+    phi = 1.0
+    C = [beta[i] / (Lambda * lambdas[i]) for i in range(len(lambdas))]
+
+    t_end = 0.2
+    rel_tol = 2.0e-2
+    ok = True
+    # Stop half a step short of t_end so accumulated round-off in the solver
+    # time cannot trigger an extra step beyond the documented t <= 0.2 window.
+    while phys.GetTime() < t_end - 0.5 * dt:
+        solver.Advance()
+        fp_new = phys.ComputeFissionProduction("new")
+        # March the reference PRKE solution with the same dt.
+        phi, C = prke_step(phi, C, dt, beta, lambdas, rho, Lambda)
+        ratio_num = fp_new / fp0
+        ratio_prke = phi
+        if abs(ratio_num - ratio_prke) > rel_tol * ratio_prke:
+            ok = False
+
+    if rank == 0:
+        print(f"PRKE_STK_PASS {1 if ok else 0}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_prompt_ramp_xs_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_prompt_ramp_xs_cbc.py
new file mode 100644
index 0000000000..867c1992cb
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_prompt_ramp_xs_cbc.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+2D prompt-only transient with a ramped XS.
+
+Prompt-only with a monotonic increase in nu*sigma_f across a discrete xs list.
+With reflecting BCs and no delayed neutrons, the fission production should be
+increasing in time.
+
+TRANSIENT_OK checks finite response and increasing FP over the ramp.
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.solver import (
+        DiscreteOrdinatesProblem,
+        PowerIterationKEigenSolver,
+        TransientSolver,
+    )
+    from pyopensn.aquad import GLCProductQuadrature2DXY
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.mesh import OrthogonalMeshGenerator
+
+if __name__ == "__main__":
+    dx = 8.0 / 8
+    nodes = [i * dx for i in range(8 + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+    # One cross-section set per ramp stage, loaded from xs1g_prompt_ramp_<i>.cxs beside this file.
+    xs_list = []
+    for i in range(5):
+        xs = MultiGroupXS()
+        xs.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), f"xs1g_prompt_ramp_{i}.cxs"))
+        xs_list.append(xs)
+
+    pquad = GLCProductQuadrature2DXY(n_polar=2, n_azimuthal=4, scattering_order=0)
+
+    num_groups = 1
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        groupsets=[
+            {
+                "groups_from_to": (0, num_groups - 1),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "classic_richardson",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+            },
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs_list[0]}],
+        boundary_conditions=[
+            {"name": "xmin", "type": "reflecting"},
+            {"name": "xmax", "type": "reflecting"},
+            {"name": "ymin", "type": "reflecting"},
+            {"name": "ymax", "type": "reflecting"},
+        ],
+        options={
+            "save_angular_flux": True,
+            "use_precursors": False,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+    # k-eigenvalue solve first; the transient solver is created with initial_state="existing".
+    keigen = PowerIterationKEigenSolver(problem=phys)
+    keigen.Initialize()
+    keigen.Execute()
+
+    phys.SetTimeDependentMode()
+
+    solver = TransientSolver(problem=phys, initial_state="existing")
+    solver.Initialize()
+    # Reference fission production before the first cross-section swap.
+    fp_old = phys.ComputeFissionProduction("new")
+
+    dt = 2.0e-2
+    solver.SetTimeStep(dt)
+    solver.SetTheta(1.0)
+
+    growth_ok = True
+    last_fr = fp_old
+    for i in range(1, len(xs_list)):
+        phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_list[i]}])
+
+        solver.Advance()
+        fp = phys.ComputeFissionProduction("new")
+
+        # The chained comparison is False for NaN and +/-inf, so a non-finite FP fails too.
+        if not (last_fr < fp < float("inf")):
+            growth_ok = False
+        last_fr = fp
+
+    transient_ok = 1 if growth_ok else 0
+
+    if rank == 0:
+        print(f"TRANSIENT_OK {transient_ok}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_2g_prompt_step_xs_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_2g_prompt_step_xs_cbc.py
new file mode 100644
index 0000000000..5b8198bac6
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_2g_prompt_step_xs_cbc.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+3D 2-group prompt-only transient: step XS swap.
+
+2-group prompt-only. A step in sigma_f scales the prompt source. Scattering
+couples groups so the transient response is not strictly monotonic.
+
+FP_RATIO_ACTUAL = 1.2 from scaling both groups' sigma_f by 1.2 (0.144/0.120).
+TRANSIENT_OK checks positive response and reasonable step ratios (0.5 < r < 2).
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.solver import (
+        DiscreteOrdinatesProblem,
+        PowerIterationKEigenSolver,
+        TransientSolver,
+    )
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.mesh import OrthogonalMeshGenerator
+
+if __name__ == "__main__":
+    dx = 8.0 / 4
+    nodes = [i * dx for i in range(4 + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+    # xs_crit is the initial material; xs_super is swapped in after the k-eigenvalue solve.
+    xs_crit = MultiGroupXS()
+    xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs2g_prompt_crit.cxs"))
+
+    xs_super = MultiGroupXS()
+    xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs2g_prompt_super.cxs"))
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0)
+
+    num_groups = 2
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        groupsets=[
+            {
+                "groups_from_to": (0, num_groups - 1),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "petsc_richardson",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+            },
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs_crit}],
+        boundary_conditions=[
+            {"name": "xmin", "type": "reflecting"},
+            {"name": "xmax", "type": "reflecting"},
+            {"name": "ymin", "type": "reflecting"},
+            {"name": "ymax", "type": "reflecting"},
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={
+            "save_angular_flux": True,
+            "use_precursors": False,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+    # k-eigenvalue solve first; the transient solver is created with initial_state="existing".
+    keigen = PowerIterationKEigenSolver(problem=phys)
+    keigen.Initialize()
+    keigen.Execute()
+
+    phys.SetTimeDependentMode()
+
+    solver = TransientSolver(problem=phys, initial_state="existing")
+    solver.Initialize()
+    # fp_old and fp_new bracket the cross-section swap; no time step is taken in between.
+    fp_old = phys.ComputeFissionProduction("new")
+
+    phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}])
+    fp_new = phys.ComputeFissionProduction("new")
+
+    dt = 1.0e-2
+    solver.SetTimeStep(dt)
+    solver.SetTheta(1.0)
+    # Two time steps, recording the fission production after each one.
+    solver.Advance()
+    fp1 = phys.ComputeFissionProduction("new")
+
+    solver.Advance()
+    fp2 = phys.ComputeFissionProduction("new")
+    # Pass when both productions are positive and each step ratio lies strictly in (0.5, 2).
+    r1 = fp1 / fp_new
+    r2 = fp2 / fp1
+    transient_ok = 1 if (fp1 > 0.0 and fp2 > 0.0 and 0.5 < r1 < 2.0 and 0.5 < r2 < 2.0) else 0
+
+    if rank == 0:
+        print(f"FP_RATIO_ACTUAL {fp_new / fp_old:.12e}")
+        print(f"TRANSIENT_OK {transient_ok}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_6g_delayed_step_nu_sigma_f_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_6g_delayed_step_nu_sigma_f_cbc.py
new file mode 100644
index 0000000000..985caa7ede
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_6g_delayed_step_nu_sigma_f_cbc.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+3D 6-group, 2-precursor delayed transient step.
+
+6 energy groups, 2 precursor families. A step in nu*sigma_f changes the prompt
+source immediately and the delayed source through precursor evolution.
+
+FP_RATIO_ACTUAL = 1.2 from scaling all sigma_f by 1.2 between crit and super xs.
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.solver import (
+        DiscreteOrdinatesProblem,
+        PowerIterationKEigenSolver,
+        TransientSolver,
+    )
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.mesh import OrthogonalMeshGenerator
+
+if __name__ == "__main__":
+    dx = 8.0 / 4
+    nodes = [i * dx for i in range(4 + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+    # xs_crit is the initial material; xs_super is swapped in after the k-eigenvalue solve.
+    xs_crit = MultiGroupXS()
+    xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs6g_delayed_crit_2p.cxs"))
+
+    xs_super = MultiGroupXS()
+    xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs6g_delayed_super_2p.cxs"))
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0)
+
+    num_groups = 6
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        groupsets=[
+            {
+                "groups_from_to": (0, num_groups - 1),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "petsc_gmres",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+                "gmres_restart_interval": 10,
+            },
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs_crit}],
+        boundary_conditions=[
+            {"name": "xmin", "type": "reflecting"},
+            {"name": "xmax", "type": "reflecting"},
+            {"name": "ymin", "type": "reflecting"},
+            {"name": "ymax", "type": "reflecting"},
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={
+            "save_angular_flux": True,
+            "use_precursors": True,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+    # k-eigenvalue solve first; the transient solver is created with initial_state="existing".
+    keigen = PowerIterationKEigenSolver(problem=phys)
+    keigen.Initialize()
+    keigen.Execute()
+
+    phys.SetTimeDependentMode()
+
+    solver = TransientSolver(problem=phys, initial_state="existing")
+    solver.Initialize()
+    # fp_old and fp_new bracket the cross-section swap; no time step is taken in between.
+    fp_old = phys.ComputeFissionProduction("new")
+
+    phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}])
+    fp_new = phys.ComputeFissionProduction("new")
+
+    dt = 1.0e-2
+    solver.SetTimeStep(dt)
+    solver.SetTheta(1.0)
+    # Two time steps, recording the fission production after each one.
+    solver.Advance()
+    fp1 = phys.ComputeFissionProduction("new")
+
+    solver.Advance()
+    fp2 = phys.ComputeFissionProduction("new")
+    # The only in-script check is positivity; FP1/FP2 are printed for external comparison.
+    transient_ok = 1 if (fp1 > 0.0 and fp2 > 0.0) else 0
+
+    if rank == 0:
+        print(f"FP_RATIO_ACTUAL {fp_new / fp_old:.12e}")
+        print(f"FP1 {fp1:.12e}")
+        print(f"FP2 {fp2:.12e}")
+        print(f"TRANSIENT_OK {transient_ok}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_analytic_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_analytic_cbc.py
new file mode 100644
index 0000000000..1c60e38363
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_analytic_cbc.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+3D delayed transient k-eigen with semi-analytic 1-precursor kinetics.
+
+Validate delayed-neutron coupling and precursor update against the closed-form
+1-precursor point-kinetics solution for a reactivity step.
+
+1-group, 1 precursor. Point kinetics:
+    dphi/dt = ((rho - beta)/Lambda) * phi + lambda * C
+    dC/dt   = (beta/Lambda) * phi - lambda * C
+For a step to rho>0 with phi(0)=1 and C(0)=beta/(Lambda*lambda), the solution is a sum of two
+exponentials with eigenvalues w1, w2. The helper delayed_phi_ratio implements that exact form.
+
+ANALYTIC_PASS is 1 if |FP_ratio - delayed_phi_ratio(t)| < 2% for all steps up to
+t=0.2. Parameters: beta=0.0065, lambda=0.08, k=1.2 => rho=(k-1)/k=0.166666...,
+nu_total=2.0, sigma_f=0.18, Lambda = 1/(v*nu*sigma_f) = 1/0.36 ≈ 2.7778. These
+are the inputs to delayed_phi_ratio. ANALYTIC_PASS validates delayed source and
+precursor updates against the semi-analytic solution.
+"""
+
+import math
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.solver import (
+        DiscreteOrdinatesProblem,
+        PowerIterationKEigenSolver,
+        TransientSolver,
+    )
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.mesh import OrthogonalMeshGenerator
+
+
+def delayed_phi_ratio(t, beta, lam, rho, Lambda):
+    """Evaluate phi(t) for one-precursor point kinetics after a step to reactivity rho.
+
+    Uses phi(0)=1 and C(0)=beta/(Lambda*lam); the result is a two-exponential sum.
+    """
+    prompt_rate = (rho - beta) / Lambda
+    trace = prompt_rate - lam
+    root = math.sqrt((prompt_rate + lam) ** 2 + 4.0 * beta * lam / Lambda)
+    w_plus, w_minus = 0.5 * (trace + root), 0.5 * (trace - root)
+    # For a mode growing as exp(w*t): C = (beta/Lambda)/(w+lam) * phi.
+    source = beta / Lambda
+    k_plus, k_minus = source / (w_plus + lam), source / (w_minus + lam)
+    # Mode weights follow from phi(0)=1 and C(0)=beta/(Lambda*lam).
+    c_minus = (beta / (Lambda * lam) - k_plus) / (k_minus - k_plus)
+    c_plus = 1.0 - c_minus
+    return c_plus * math.exp(w_plus * t) + c_minus * math.exp(w_minus * t)
+
+
+if __name__ == "__main__":
+    dx = 8.0 / 4
+    nodes = [i * dx for i in range(4 + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+    # xs_crit is the initial material; xs_super replaces it via SetXSMap before time stepping.
+    xs_crit = MultiGroupXS()
+    xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_crit_1p.cxs"))
+
+    xs_super = MultiGroupXS()
+    xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_1p.cxs"))
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0)
+
+    num_groups = 1
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        groupsets=[
+            {
+                "groups_from_to": (0, num_groups - 1),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "classic_richardson",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+            },
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs_crit}],
+        boundary_conditions=[
+            {"name": "xmin", "type": "reflecting"},
+            {"name": "xmax", "type": "reflecting"},
+            {"name": "ymin", "type": "reflecting"},
+            {"name": "ymax", "type": "reflecting"},
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={
+            "save_angular_flux": True,
+            "use_precursors": True,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+    # k-eigenvalue solve first; the transient solver is created with initial_state="existing".
+    keigen = PowerIterationKEigenSolver(problem=phys, max_iters=200, k_tol=1.0e-10)
+    keigen.Initialize()
+    keigen.Execute()
+
+    phys.SetTimeDependentMode()
+
+    solver = TransientSolver(problem=phys, verbose=False, initial_state="existing")
+    solver.Initialize()
+
+    # Swap to supercritical XS at t=0
+    phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}])
+
+    # Semi-analytic parameters
+    beta = 0.0065
+    lam = 0.08
+    k = 1.2
+    rho = (k - 1.0) / k
+    # Lambda based on infinite-medium definition (prompt gen time)
+    nu_prompt = 1.987
+    nu_delayed = 0.013
+    sigma_f = 0.180000
+    v = 1.0
+    nu_sigma_f = sigma_f * (nu_prompt + nu_delayed)
+    Lambda = 1.0 / (v * nu_sigma_f)
+
+    dt = 1.0e-2
+    solver.SetTimeStep(dt)
+    solver.SetTheta(1.0)
+    # Reference fission production, taken after the swap and before the first time step.
+    fp0 = phys.ComputeFissionProduction("new")
+
+    t_end = 0.2
+    rel_tol = 2.0e-2
+    ok = True
+    if rank == 0:
+        print("step time ratio_numeric ratio_analytic")
+    step = 0
+    while phys.GetTime() < t_end:
+        step += 1
+        solver.Advance()
+        fp_new = phys.ComputeFissionProduction("new")
+        t_to = phys.GetTime()
+        ratio_num = fp_new / fp0
+        ratio_ana = delayed_phi_ratio(t_to, beta, lam, rho, Lambda)
+        # Every rank evaluates the check; only rank 0 prints the per-step table row.
+        if rank == 0:
+            print(f"{step:4d} {t_to:10.4e} {ratio_num:12.6e} {ratio_ana:12.6e}")
+        if abs(ratio_num - ratio_ana) > rel_tol * ratio_ana:
+            ok = False
+
+    if rank == 0:
+        print(f"ANALYTIC_PASS {1 if ok else 0}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_prke_vs_stk_2p_callbacks_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_prke_vs_stk_2p_callbacks_cbc.py
new file mode 100644
index 0000000000..8e408d7cd0
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_prke_vs_stk_2p_callbacks_cbc.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+3D delayed transient: homogeneous step xs vs PRKE (2 precursors) with callbacks.
+
+Validate space-time kinetics against point-reactor kinetics (PRKE) using the
+transient solver Execute loop and pre/post-advance callbacks.
+
+1-group, 2 precursors, reflecting boundaries (infinite-medium). A step
+to a supercritical xs is applied at t=0. Space-time kinetics should follow
+PRKE for this homogeneous case.
+
+PRKE_STK_PASS is 1 if the space-time fission-production ratio matches PRKE within 2%
+for t<=0.15.
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.solver import (
+        DiscreteOrdinatesProblem,
+        PowerIterationKEigenSolver,
+        TransientSolver,
+    )
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.mesh import OrthogonalMeshGenerator
+
+
+def read_block_values(path, block_name):
+    """Collect the second field of each data row inside ``<block_name>_BEGIN``/``_END``.
+
+    Blank lines and ``#`` comments are skipped; rows with fewer than two fields are
+    ignored. Raises RuntimeError when no value was collected.
+    """
+    start_tag, stop_tag = f"{block_name}_BEGIN", f"{block_name}_END"
+    collected = []
+    inside = False
+    with open(path, "r", encoding="utf-8") as handle:
+        for raw in handle:
+            text = raw.strip()
+            if not text or text.startswith("#"):
+                continue
+            if text in (start_tag, stop_tag):
+                inside = text == start_tag
+                continue
+            fields = text.split() if inside else []
+            if len(fields) >= 2:
+                collected.append(float(fields[1]))
+    if not collected:
+        raise RuntimeError(f"Failed to find {block_name} in {path}")
+    return collected
+
+
+def solve_linear(A, b):
+    """Solve the dense system ``A x = b`` by Gauss-Jordan elimination with partial pivoting.
+
+    ``A`` and ``b`` are copied before elimination, so the caller's lists are not modified.
+    Raises RuntimeError when the largest pivot candidate is below 1.0e-14 in magnitude.
+    """
+    order = len(b)
+    mat = [row[:] for row in A]
+    rhs = b[:]
+    for col in range(order):
+        # max() returns the first maximal row, so ties keep the topmost candidate.
+        best = max(range(col, order), key=lambda row: abs(mat[row][col]))
+        if abs(mat[best][col]) < 1.0e-14:
+            raise RuntimeError("Singular system in PRKE solve")
+        if best != col:
+            mat[col], mat[best] = mat[best], mat[col]
+            rhs[col], rhs[best] = rhs[best], rhs[col]
+        diag = mat[col][col]
+        mat[col][col:order] = [mat[col][j] / diag for j in range(col, order)]
+        rhs[col] /= diag
+        for row in range(order):
+            scale = mat[row][col]
+            if row != col and scale != 0.0:
+                mat[row][col:order] = [
+                    mat[row][j] - scale * mat[col][j] for j in range(col, order)
+                ]
+                rhs[row] -= scale * rhs[col]
+    return rhs
+
+
+def prke_step(phi, C, dt, beta, lambdas, rho, Lambda):
+    """Advance the point-kinetics state (phi, C) by one implicit (backward-Euler) step dt.
+
+    Unknowns are ordered [phi, C_0, ..., C_{m-1}]; returns (phi_new, list of C_new).
+    """
+    n_prec = len(lambdas)
+    dim = n_prec + 1
+    matrix = [[0.0] * dim for _ in range(dim)]
+
+    # Flux row: prompt term on the diagonal, -dt*lambda_j coupling to each precursor.
+    matrix[0][0] = 1.0 - dt * (rho - sum(beta)) / Lambda
+    matrix[0][1:] = [-dt * lambdas[j] for j in range(n_prec)]
+    # Precursor rows: production from phi in column 0, decay on the diagonal.
+    for j in range(n_prec):
+        matrix[j + 1][0] = -dt * (beta[j] / Lambda)
+        matrix[j + 1][j + 1] = 1.0 + dt * lambdas[j]
+    rhs = [phi] + [C[j] for j in range(n_prec)]
+
+    solution = solve_linear(matrix, rhs)
+    return solution[0], solution[1:]
+
+
+if __name__ == "__main__":
+    dx = 8.0 / 4
+    nodes = [i * dx for i in range(4 + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+    # xs_crit is the initial material; xs_super replaces it via SetXSMap before time stepping.
+    xs_crit = MultiGroupXS()
+    xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_crit_2p.cxs"))
+
+    xs_super = MultiGroupXS()
+    xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_2p.cxs"))
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0)
+
+    num_groups = 1
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        groupsets=[
+            {
+                "groups_from_to": (0, num_groups - 1),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "classic_richardson",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+            },
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs_crit}],
+        boundary_conditions=[
+            {"name": "xmin", "type": "reflecting"},
+            {"name": "xmax", "type": "reflecting"},
+            {"name": "ymin", "type": "reflecting"},
+            {"name": "ymax", "type": "reflecting"},
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={
+            "save_angular_flux": True,
+            "use_precursors": True,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+    # t_end and dt are handed to TransientSolver below; solver.Execute() drives the stepping.
+    t_end = 0.15
+    dt = 1.0e-2
+    keigen = PowerIterationKEigenSolver(problem=phys, max_iters=200, k_tol=1.0e-10)
+    keigen.Initialize()
+    keigen.Execute()
+
+    phys.SetTimeDependentMode()
+
+    solver = TransientSolver(
+        problem=phys,
+        stop_time=t_end,
+        dt=dt,
+        initial_state="existing",
+    )
+    solver.Initialize()
+    solver.SetTheta(1.0)
+
+    # Apply homogeneous perturbation at t=0
+    phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}])
+
+    sigma_a = xs_super.sigma_a[0]
+    nu_sigma_f = xs_super.nu_sigma_f[0]
+    inv_vel = xs_super.inv_velocity[0]
+    v = 1.0 / inv_vel
+    # Point-kinetics parameters computed from the group-0 values of the perturbed XS.
+    k_eff = nu_sigma_f / sigma_a
+    rho = (k_eff - 1.0) / k_eff
+    Lambda = 1.0 / (v * nu_sigma_f)
+
+    beta = read_block_values(
+        os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_2p.cxs"),
+        "PRECURSOR_FRACTIONAL_YIELDS",
+    )
+    lambdas = read_block_values(
+        os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_2p.cxs"),
+        "PRECURSOR_DECAY_CONSTANTS",
+    )
+    # Reference fission production, taken after the swap and before the first time step.
+    fp0 = phys.ComputeFissionProduction("new")
+    # Mutable PRKE state shared with the callbacks: phi, precursors C, pass flag, step count.
+    state = {
+        "phi": 1.0,
+        "C": [beta[i] / (Lambda * lambdas[i]) for i in range(len(lambdas))],
+        "ok": True,
+        "steps": 0,
+    }
+    rel_tol = 2.0e-2
+
+    def pre_advance():
+        state["steps"] += 1
+
+    def post_advance():
+        fp_new = phys.ComputeFissionProduction("new")
+        step_dt = phys.GetTimeStep()
+        state["phi"], state["C"] = prke_step(
+            state["phi"], state["C"], step_dt, beta, lambdas, rho, Lambda
+        )
+        ratio_num = fp_new / fp0
+        ratio_prke = state["phi"]
+        if abs(ratio_num - ratio_prke) > rel_tol * ratio_prke:
+            state["ok"] = False
+    # Run the transient with both callbacks attached, then detach them again.
+    solver.SetPreAdvanceCallback(pre_advance)
+    solver.SetPostAdvanceCallback(post_advance)
+    solver.Execute()
+    solver.SetPreAdvanceCallback(None)
+    solver.SetPostAdvanceCallback(None)
+
+    if rank == 0:
+        print(f"PRKE_STK_PASS {1 if state['ok'] else 0}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_prke_vs_stk_2p_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_prke_vs_stk_2p_cbc.py
new file mode 100644
index 0000000000..401dd97d37
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_prke_vs_stk_2p_cbc.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+3D delayed transient: homogeneous step xs vs PRKE (2 precursors).
+
+Validate space-time kinetics against point-reactor kinetics (PRKE) for a
+homogeneous perturbation in a homogeneous system with two precursors.
+
+1-group, 2 precursors, reflecting boundaries (infinite-medium). A step
+to a supercritical xs is applied at t=0. Space-time kinetics should follow
+PRKE for this homogeneous case.
+
+PRKE_STK_PASS is 1 if the space-time fission-production ratio matches PRKE within 2%
+for t<=0.15.
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.solver import (
+        DiscreteOrdinatesProblem,
+        PowerIterationKEigenSolver,
+        TransientSolver,
+    )
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.mesh import OrthogonalMeshGenerator
+
+
+def read_block_values(path, block_name):
+    """Collect the second field of each data row inside ``<block_name>_BEGIN``/``_END``.
+
+    Blank lines and ``#`` comments are skipped; rows with fewer than two fields are
+    ignored. Raises RuntimeError when no value was collected.
+    """
+    start_tag, stop_tag = f"{block_name}_BEGIN", f"{block_name}_END"
+    collected = []
+    inside = False
+    with open(path, "r", encoding="utf-8") as handle:
+        for raw in handle:
+            text = raw.strip()
+            if not text or text.startswith("#"):
+                continue
+            if text in (start_tag, stop_tag):
+                inside = text == start_tag
+                continue
+            fields = text.split() if inside else []
+            if len(fields) >= 2:
+                collected.append(float(fields[1]))
+    if not collected:
+        raise RuntimeError(f"Failed to find {block_name} in {path}")
+    return collected
+
+
+def solve_linear(A, b):
+    """Solve the dense system ``A x = b`` by Gauss-Jordan elimination with partial pivoting.
+
+    ``A`` and ``b`` are copied before elimination, so the caller's lists are not modified.
+    Raises RuntimeError when the largest pivot candidate is below 1.0e-14 in magnitude.
+    """
+    order = len(b)
+    mat = [row[:] for row in A]
+    rhs = b[:]
+    for col in range(order):
+        # max() returns the first maximal row, so ties keep the topmost candidate.
+        best = max(range(col, order), key=lambda row: abs(mat[row][col]))
+        if abs(mat[best][col]) < 1.0e-14:
+            raise RuntimeError("Singular system in PRKE solve")
+        if best != col:
+            mat[col], mat[best] = mat[best], mat[col]
+            rhs[col], rhs[best] = rhs[best], rhs[col]
+        diag = mat[col][col]
+        mat[col][col:order] = [mat[col][j] / diag for j in range(col, order)]
+        rhs[col] /= diag
+        for row in range(order):
+            scale = mat[row][col]
+            if row != col and scale != 0.0:
+                mat[row][col:order] = [
+                    mat[row][j] - scale * mat[col][j] for j in range(col, order)
+                ]
+                rhs[row] -= scale * rhs[col]
+    return rhs
+
+
+def prke_step(phi, C, dt, beta, lambdas, rho, Lambda):
+    """Advance the point-kinetics state (phi, C) by one implicit (backward-Euler) step dt.
+
+    Unknowns are ordered [phi, C_0, ..., C_{m-1}]; returns (phi_new, list of C_new).
+    """
+    n_prec = len(lambdas)
+    dim = n_prec + 1
+    matrix = [[0.0] * dim for _ in range(dim)]
+
+    # Flux row: prompt term on the diagonal, -dt*lambda_j coupling to each precursor.
+    matrix[0][0] = 1.0 - dt * (rho - sum(beta)) / Lambda
+    matrix[0][1:] = [-dt * lambdas[j] for j in range(n_prec)]
+    # Precursor rows: production from phi in column 0, decay on the diagonal.
+    for j in range(n_prec):
+        matrix[j + 1][0] = -dt * (beta[j] / Lambda)
+        matrix[j + 1][j + 1] = 1.0 + dt * lambdas[j]
+    rhs = [phi] + [C[j] for j in range(n_prec)]
+
+    solution = solve_linear(matrix, rhs)
+    return solution[0], solution[1:]
+
+
+if __name__ == "__main__":
+    dx = 8.0 / 4
+    nodes = [i * dx for i in range(4 + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+    # xs_crit is the initial material; xs_super replaces it via SetXSMap before time stepping.
+    xs_crit = MultiGroupXS()
+    xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_crit_2p.cxs"))
+
+    xs_super = MultiGroupXS()
+    xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_2p.cxs"))
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0)
+
+    num_groups = 1
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        groupsets=[
+            {
+                "groups_from_to": (0, num_groups - 1),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "petsc_gmres",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+                "gmres_restart_interval": 10,
+            },
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs_crit}],
+        boundary_conditions=[
+            {"name": "xmin", "type": "reflecting"},
+            {"name": "xmax", "type": "reflecting"},
+            {"name": "ymin", "type": "reflecting"},
+            {"name": "ymax", "type": "reflecting"},
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={
+            "save_angular_flux": True,
+            "use_precursors": True,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+    # k-eigenvalue solve first; the transient solver is created with initial_state="existing".
+    keigen = PowerIterationKEigenSolver(problem=phys, max_iters=200, k_tol=1.0e-10)
+    keigen.Initialize()
+    keigen.Execute()
+
+    phys.SetTimeDependentMode()
+
+    solver = TransientSolver(problem=phys, initial_state="existing")
+    solver.Initialize()
+
+    # Apply homogeneous perturbation at t=0
+    phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}])
+
+    sigma_a = xs_super.sigma_a[0]
+    nu_sigma_f = xs_super.nu_sigma_f[0]
+    inv_vel = xs_super.inv_velocity[0]
+    v = 1.0 / inv_vel
+    # Point-kinetics parameters computed from the group-0 values of the perturbed XS.
+    k_eff = nu_sigma_f / sigma_a
+    rho = (k_eff - 1.0) / k_eff
+    Lambda = 1.0 / (v * nu_sigma_f)
+
+    beta = read_block_values(
+        os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_2p.cxs"),
+        "PRECURSOR_FRACTIONAL_YIELDS",
+    )
+    lambdas = read_block_values(
+        os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_2p.cxs"),
+        "PRECURSOR_DECAY_CONSTANTS",
+    )
+
+    dt = 1.0e-2
+    solver.SetTimeStep(dt)
+    solver.SetTheta(1.0)
+    # Reference fission production, taken after the swap and before the first time step.
+    fp0 = phys.ComputeFissionProduction("new")
+
+    # PRKE initial conditions for steady state
+    phi = 1.0
+    C = [beta[i] / (Lambda * lambdas[i]) for i in range(len(lambdas))]
+
+    t_end = 0.15
+    rel_tol = 2.0e-2
+    ok = True
+    while phys.GetTime() < t_end:
+        solver.Advance()
+        fp_new = phys.ComputeFissionProduction("new")
+        t_to = phys.GetTime()
+        # Step the reference PRKE with the same dt and compare it to the normalized FP.
+        phi, C = prke_step(phi, C, dt, beta, lambdas, rho, Lambda)
+        ratio_num = fp_new / fp0
+        ratio_prke = phi
+        if abs(ratio_num - ratio_prke) > rel_tol * ratio_prke:
+            ok = False
+
+    if rank == 0:
+        print(f"PRKE_STK_PASS {1 if ok else 0}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_ramp_xs_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_ramp_xs_cbc.py
new file mode 100644
index 0000000000..6feb53a688
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_ramp_xs_cbc.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+3D delayed transient with a ramped xs.
+
+Similar to the prompt ramp, but with delayed neutrons enabled.
+
+1-group, 1 precursor. nu*sigma_f ramps upward in time. The fission production
+should grow monotonically for this case with reflecting BCs.
+
+TRANSIENT_OK checks finite response and non-decreasing FP (1.0e-4 slack on the ratio).
+"""
+
+import math
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.solver import (
+        DiscreteOrdinatesProblem,
+        PowerIterationKEigenSolver,
+        TransientSolver,
+    )
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.mesh import OrthogonalMeshGenerator
+
+
+def read_block_value(file_path, block_begin, block_end):
+    """
+    Return the float from the first "0 <value>" line found while inside a
+    block_begin/block_end section of file_path, or None if there is none.
+    """
+    inside = False
+    with open(file_path, "r", encoding="utf-8") as handle:
+        for raw in handle:
+            text = raw.strip()
+            if text in (block_begin, block_end):
+                inside = text == block_begin
+            elif inside:
+                tokens = text.split()
+                if len(tokens) == 2 and tokens[0] == "0":
+                    return float(tokens[1])
+    return None
+
+
+def load_xs_scalar_params(xs_file):
+    """
+    Collect four scalar entries from an xs file via read_block_value.
+
+    The result is ordered (nu_prompt, nu_delayed, decay constant, fractional
+    yield), matching the order in which the blocks are queried.
+    """
+    markers = (
+        ("NU_PROMPT_BEGIN", "NU_PROMPT_END"),
+        ("NU_DELAYED_BEGIN", "NU_DELAYED_END"),
+        ("PRECURSOR_DECAY_CONSTANTS_BEGIN", "PRECURSOR_DECAY_CONSTANTS_END"),
+        ("PRECURSOR_FRACTIONAL_YIELDS_BEGIN", "PRECURSOR_FRACTIONAL_YIELDS_END"),
+    )
+    return tuple(read_block_value(xs_file, begin, end) for begin, end in markers)
+
+
+if __name__ == "__main__":
+    # Uniform 4x4x4 orthogonal mesh spanning [0, 8] along each axis
+    dx = 8.0 / 4
+    nodes = [i * dx for i in range(4 + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+
+    # xs_list[0] is the starting material and xs_list[-1] the last ramp stage
+    xs_dir = os.path.dirname(__file__)
+    xs_list = []
+    for i in range(5):
+        xs = MultiGroupXS()
+        xs.LoadFromOpenSn(os.path.join(xs_dir, f"xs1g_delayed_ramp_{i}.cxs"))
+        xs_list.append(xs)
+    xs_crit, xs_super = xs_list[0], xs_list[-1]
+    xs_scalar_file = os.path.join(xs_dir, "xs1g_delayed_ramp_0.cxs")
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0)
+
+    num_groups = 1
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        groupsets=[
+            {
+                "groups_from_to": (0, num_groups - 1),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "classic_richardson",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+            },
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs_crit}],
+        boundary_conditions=[
+            {"name": face, "type": "reflecting"}
+            for face in ("xmin", "xmax", "ymin", "ymax", "zmin", "zmax")
+        ],
+        options={
+            "save_angular_flux": True,
+            "use_precursors": True,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+
+    # Steady k-eigen solve whose result is the transient's initial state
+    keigen = PowerIterationKEigenSolver(problem=phys, max_iters=200, k_tol=1.0e-10)
+    keigen.Initialize()
+    keigen.Execute()
+
+    phys.SetTimeDependentMode()
+
+    solver = TransientSolver(problem=phys, verbose=False, initial_state="existing")
+    solver.Initialize()
+
+    # XS-based kinetics parameters (1-group, 1-precursor)
+    sigma_a = xs_super.sigma_a[0]
+    nu_prompt, nu_delayed, lam, frac_yield = load_xs_scalar_params(xs_scalar_file)
+    # Use beta from precursor fractional yield and nu_total from nu_prompt + nu_delayed
+    beta = frac_yield
+    v = 1.0 / xs_super.inv_velocity[0]
+
+    def k_from_nu_sigma_f(nu_sigma_f):
+        return nu_sigma_f / sigma_a
+
+    def rho_from_k(k):
+        return (k - 1.0) / k
+
+    # Ramp parameters
+    dt = 1.0e-2
+    solver.SetTimeStep(dt)
+    solver.SetTheta(1.0)
+
+    t_end = 0.2
+    ramp_time = t_end
+
+    def mix_factor(t):
+        # Ramp fraction: 0 up to t = 0, linear in between, 1 from ramp_time on
+        if t <= 0.0:
+            return 0.0
+        return 1.0 if t >= ramp_time else t / ramp_time
+
+    def xs_index(t):
+        # Index into xs_list of the stage selected at time t
+        last = len(xs_list) - 1
+        return min(int(mix_factor(t) * last + 1.0e-12), last)
+
+    def nu_sigma_f_of_t(t):
+        nu_total = nu_prompt + nu_delayed
+        return xs_list[xs_index(t)].sigma_f[0] * nu_total
+
+    def rho_of_t(t):
+        return rho_from_k(k_from_nu_sigma_f(nu_sigma_f_of_t(t)))
+
+    def Lambda_of_t(t):
+        return 1.0 / (v * nu_sigma_f_of_t(t))
+
+    nu_sigma_f_crit = nu_sigma_f_of_t(0.0)
+
+    # Reference fission production before the first transient step
+    fp0 = phys.ComputeFissionProduction("new")
+
+    # rel_tol is not referenced by any check below
+    rel_tol = 2.0e-2
+    ok = True
+    growth_ok = True
+    last_ratio = None
+    if rank == 0:
+        print("step time ratio_numeric ratio_analytic")
+    step = 0
+    while phys.GetTime() < t_end:
+        step += 1
+        # Piecewise-constant xs over the step, chosen from the step's start time
+        t_from = phys.GetTime()
+        step_xs = xs_list[xs_index(t_from)]
+        phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": step_xs}])
+
+        solver.Advance()
+        fp_new = phys.ComputeFissionProduction("new")
+        t_to = phys.GetTime()
+
+        ratio_num = fp_new / fp0
+        # No analytic reference is evaluated here; 1.0 only fills the column
+        ratio_ana = 1.0
+
+        if rank == 0:
+            print(f"{step:4d} {t_to:10.4e} {ratio_num:12.6e} {ratio_ana:12.6e}")
+        if not (math.isfinite(fp_new) and math.isfinite(ratio_num)):
+            ok = False
+        elif last_ratio is not None and ratio_num < (last_ratio - 1.0e-4):
+            growth_ok = False
+        last_ratio = ratio_num
+        ok = ok and growth_ok
+
+    if rank == 0:
+        print(f"TRANSIENT_OK {1 if ok else 0}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_stiff_dt_sensitivity_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_stiff_dt_sensitivity_cbc.py
new file mode 100644
index 0000000000..974d3a35f9
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_stiff_dt_sensitivity_cbc.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+3D delayed transient with stiff precursor.
+
+Stresses the solver with a large decay constant (lambda).
+
+1-group, 1 precursor with large lambda. Two runs (dt_small and dt_large) are
+compared at the same t_end. A correct theta-scheme should produce similar FP
+ratios when dt is sufficiently small.
+
+REL_DIFF < 0.05 is a robustness threshold: the dt_large solution should be
+within 5% of dt_small.
+TRANSIENT_OK requires finite FP ratios greater than 1 (growth), matching t_end,
+and relative difference < 5% between dt_small and dt_large.
+"""
+
+import math
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.solver import (
+        DiscreteOrdinatesProblem,
+        PowerIterationKEigenSolver,
+        TransientSolver,
+    )
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.mesh import OrthogonalMeshGenerator
+
+
+def run_transient(dt, t_end, xs_crit, xs_super):
+    """
+    Run a k-eigen solve with xs_crit, then step a theta=1 transient with xs_super
+    by dt up to t_end; return (FP before the xs swap, final FP, final time).
+    """
+    spacing = 8.0 / 4
+    nodes = [i * spacing for i in range(4 + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0)
+
+    num_groups = 1
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        groupsets=[
+            {
+                "groups_from_to": (0, num_groups - 1),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "classic_richardson",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+            },
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs_crit}],
+        boundary_conditions=[
+            {"name": face, "type": "reflecting"}
+            for face in ("xmin", "xmax", "ymin", "ymax", "zmin", "zmax")
+        ],
+        options={
+            "save_angular_flux": True,
+            "use_precursors": True,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+
+    keigen = PowerIterationKEigenSolver(problem=phys)
+    keigen.Initialize()
+    keigen.Execute()
+
+    phys.SetTimeDependentMode()
+
+    solver = TransientSolver(problem=phys, initial_state="existing")
+    solver.Initialize()
+
+    fp_initial = phys.ComputeFissionProduction("new")
+
+    phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}])
+
+    solver.SetTimeStep(dt)
+    solver.SetTheta(1.0)
+
+    while phys.GetTime() < t_end - 1.0e-12:
+        solver.Advance()
+
+    fp_final = phys.ComputeFissionProduction("new")
+    return fp_initial, fp_final, phys.GetTime()
+
+
+if __name__ == "__main__":
+    here = os.path.dirname(__file__)
+    xs_crit = MultiGroupXS()
+    xs_crit.LoadFromOpenSn(os.path.join(here, "xs1g_delayed_stiff_crit.cxs"))
+
+    xs_super = MultiGroupXS()
+    xs_super.LoadFromOpenSn(os.path.join(here, "xs1g_delayed_stiff_super.cxs"))
+
+    t_end = 0.2
+    dt_small = 5.0e-3
+    dt_large = 2.0e-2
+
+    fp0_s, fp_end_s, t_s = run_transient(dt_small, t_end, xs_crit, xs_super)
+    fp0_l, fp_end_l, t_l = run_transient(dt_large, t_end, xs_crit, xs_super)
+
+    ratio_small = fp_end_s / fp0_s
+    ratio_large = fp_end_l / fp0_l
+
+    rel_diff = abs(ratio_small - ratio_large) / max(abs(ratio_small), 1.0e-14)
+
+    ok = (
+        math.isfinite(ratio_small)
+        and math.isfinite(ratio_large)
+        and min(ratio_small, ratio_large) > 1.0
+        and rel_diff < 5.0e-2
+        and abs(t_s - t_end) < 1.0e-6
+        and abs(t_l - t_end) < 1.0e-6
+    )
+
+    if rank == 0:
+        print(f"DT_SMALL_RATIO {ratio_small:.12e}")
+        print(f"DT_LARGE_RATIO {ratio_large:.12e}")
+        print(f"REL_DIFF {rel_diff:.12e}")
+        print(f"TRANSIENT_OK {1 if ok else 0}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_analytic_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_analytic_cbc.py
new file mode 100644
index 0000000000..eed0fdee39
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_analytic_cbc.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+3D prompt-only transient k-eigen with analytic exponential response.
+
+1-group, reflecting BCs, no delayed neutrons. After a step to supercritical:
+dphi/dt = alpha * phi, with alpha = nu*sigma_f - sigma_a. In discrete time with
+theta=1, the update ratio per step is r = (tau + sigma_s + nu*sigma_f)/(tau +
+sigma_t), where tau = v^{-1}/dt. For small dt, r ≈ exp(alpha dt), giving
+phi(t)/phi(0) = exp(alpha t).
+
+ANALYTIC_PASS is 1 if |FP_ratio - exp(alpha t)| < 0.5% for all steps up to
+t=0.1. With sigma_t=1.0, sigma_s=0.7 => sigma_a=0.3, nu=2, sigma_f=0.18,
+alpha=0.36-0.3=0.06. Thus exp(alpha*0.1)=exp(0.006)≈1.0060 (used implicitly in
+the comparison). ANALYTIC_PASS validates the time term and prompt fission source
+handling against the analytic solution.
+"""
+
+import math
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.solver import (
+        DiscreteOrdinatesProblem,
+        PowerIterationKEigenSolver,
+        TransientSolver,
+    )
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.mesh import OrthogonalMeshGenerator
+
+if __name__ == "__main__":
+    # Uniform 4x4x4 orthogonal mesh spanning [0, 8] along each axis
+    dx = 8.0 / 4
+    nodes = [i * dx for i in range(4 + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+
+    xs_dir = os.path.dirname(__file__)
+    xs_crit = MultiGroupXS()
+    xs_crit.LoadFromOpenSn(os.path.join(xs_dir, "xs1g_prompt_crit.cxs"))
+
+    xs_super = MultiGroupXS()
+    xs_super.LoadFromOpenSn(os.path.join(xs_dir, "xs1g_prompt_super.cxs"))
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0)
+
+    num_groups = 1
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        groupsets=[
+            {
+                "groups_from_to": (0, num_groups - 1),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "classic_richardson",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+            },
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs_crit}],
+        boundary_conditions=[
+            {"name": face, "type": "reflecting"}
+            for face in ("xmin", "xmax", "ymin", "ymax", "zmin", "zmax")
+        ],
+        options={
+            "save_angular_flux": True,
+            "use_precursors": False,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+
+    # Steady k-eigen solve whose result is the transient's initial state
+    keigen = PowerIterationKEigenSolver(problem=phys, max_iters=200, k_tol=1.0e-10)
+    keigen.Initialize()
+    keigen.Execute()
+
+    phys.SetTimeDependentMode()
+
+    solver = TransientSolver(problem=phys, verbose=False, initial_state="existing")
+    solver.Initialize()
+    # Diagnostic output: first local flux entries right after initialization
+    phi_old = phys.GetPhiOldLocal()
+    phi_new = phys.GetPhiNewLocal()
+    if rank == 0:
+        print("phi_old[0]", phi_old[0], "phi_new[0]", phi_new[0])
+    # Swap to supercritical XS at t=0
+    phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}])
+    fp_new = phys.ComputeFissionProduction("new")
+    if rank == 0:
+        print("fp_new (new)", fp_new)
+    # Analytic alpha for prompt-only
+    sigma_t = 1.0
+    sigma_s = 0.7
+    sigma_a = sigma_t - sigma_s
+    nu = 2.0
+    sigma_f = 0.180000
+    alpha = nu * sigma_f - sigma_a
+
+    # Fixed step size with theta = 1
+    dt = 1.0e-2
+    solver.SetTimeStep(dt)
+    solver.SetTheta(1.0)
+
+    # Use the converged flux from the k-eigen solve as the initial state
+    fp0 = phys.ComputeFissionProduction("new")
+
+    # Expected per-step growth factor of the theta=1 update (see module docstring)
+    tau = xs_super.inv_velocity[0] / dt
+    r_expected = (tau + sigma_s + nu * sigma_f) / (tau + sigma_t)
+    if rank == 0:
+        print("inv_velocity", xs_super.inv_velocity)
+        print("dt", dt, "theta", 1.0)
+        print("tau", tau, "r_expected", r_expected, "r_expected^11", r_expected ** 11)
+
+    # March to t_end, flagging any step outside the relative tolerance
+    t_end = 0.1
+    rel_tol = 5.0e-3
+    ok = True
+    if rank == 0:
+        print("step time ratio_numeric ratio_analytic")
+    step = 0
+    while phys.GetTime() < t_end:
+        step += 1
+        # Time at the start of the step (not used further)
+        t_from = phys.GetTime()
+        solver.Advance()
+        fp_step_new = phys.ComputeFissionProduction("new")
+        fp_step_old = phys.ComputeFissionProduction("old")
+        phi_old = phys.GetPhiOldLocal()
+        phi_new = phys.GetPhiNewLocal()
+        if rank == 0:
+            print("fp_new(after step)", fp_step_new, "fp_old(after step)", fp_step_old)
+            print("phi_old[0]", phi_old[0], "phi_new[0]", phi_new[0])
+
+        # Compare the FP growth since t=0 with exp(alpha * t) at the new time
+        fp_new = fp_step_new
+        t_to = phys.GetTime()
+        ratio_num = fp_new / fp0
+        ratio_ana = math.exp(alpha * t_to)
+        deviation = abs(ratio_num - ratio_ana)
+
+        if rank == 0:
+            print(f"{step:4d} {t_to:10.4e} {ratio_num:12.6e} {ratio_ana:12.6e}")
+        if deviation > rel_tol * ratio_ana:
+            ok = False
+
+    if rank == 0:
+        print(f"ANALYTIC_PASS {1 if ok else 0}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_bc_leakage_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_bc_leakage_cbc.py
new file mode 100644
index 0000000000..abbe54e21d
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_bc_leakage_cbc.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+3D prompt transient: boundary leakage (vacuum vs reflecting).
+
+Ensure boundary conditions influence transient response: vacuum boundaries
+should leak neutrons and yield lower growth than reflecting boundaries.
+
+Prompt-only with a step to supercritical material. Leakage reduces effective
+reactivity when vacuum BCs are used.
+
+TRANSIENT_OK requires step growth with vacuum BCs to be less than that under
+reflecting BCs.
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.solver import (
+        DiscreteOrdinatesProblem,
+        PowerIterationKEigenSolver,
+        TransientSolver,
+    )
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.mesh import OrthogonalMeshGenerator
+
+
+def run_case(bc_type, xs_crit, xs_super):
+    """
+    Take one theta=1 step of 1.0e-2 after swapping xs_crit for xs_super, with
+    bc_type on all six faces; return (FP step ratio, FP ratio across the swap).
+    """
+    dx = 8.0 / 4
+    nodes = [i * dx for i in range(4 + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+    pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0)
+
+    bcs = [
+        {"name": face, "type": bc_type}
+        for face in ("xmin", "xmax", "ymin", "ymax", "zmin", "zmax")
+    ]
+
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=1,
+        groupsets=[
+            {
+                "groups_from_to": (0, 0),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "petsc_richardson",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+            },
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs_crit}],
+        boundary_conditions=bcs,
+        options={
+            "save_angular_flux": True,
+            "use_precursors": False,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+
+    keigen = PowerIterationKEigenSolver(problem=phys)
+    keigen.Initialize()
+    keigen.Execute()
+
+    phys.SetTimeDependentMode()
+
+    solver = TransientSolver(problem=phys, initial_state="existing")
+    solver.Initialize()
+
+    fp_before_swap = phys.ComputeFissionProduction("new")
+    phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}])
+    fp_after_swap = phys.ComputeFissionProduction("new")
+
+    solver.SetTimeStep(1.0e-2)
+    solver.SetTheta(1.0)
+
+    solver.Advance()
+    fp_after_step = phys.ComputeFissionProduction("new")
+
+    return fp_after_step / fp_after_swap, fp_after_swap / fp_before_swap
+
+
+if __name__ == "__main__":
+    xs_dir = os.path.dirname(__file__)
+    xs_crit = MultiGroupXS()
+    xs_crit.LoadFromOpenSn(os.path.join(xs_dir, "xs1g_prompt_crit.cxs"))
+    xs_super = MultiGroupXS()
+    xs_super.LoadFromOpenSn(os.path.join(xs_dir, "xs1g_prompt_super.cxs"))
+
+    ratio_reflect, fp_ratio_reflect = run_case("reflecting", xs_crit, xs_super)
+    ratio_vacuum, fp_ratio_vacuum = run_case("vacuum", xs_crit, xs_super)
+
+    # Pass only when the vacuum case shows the smaller one-step growth
+    ok = int(ratio_vacuum < ratio_reflect)
+    if rank == 0:
+        print(f"FP_RATIO_REFLECT {fp_ratio_reflect:.12e}")
+        print(f"FP_RATIO_VACUUM {fp_ratio_vacuum:.12e}")
+        print(f"STEP_RATIO_REFLECT {ratio_reflect:.12e}")
+        print(f"STEP_RATIO_VACUUM {ratio_vacuum:.12e}")
+        print(f"TRANSIENT_OK {ok}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_mid_step_swap_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_mid_step_swap_cbc.py
new file mode 100644
index 0000000000..940af984df
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_mid_step_swap_cbc.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+3D prompt transient: mid-step xs swap time.
+
+Validate swapping xs at a non-integer time, ensuring reported time matches
+the swap and that the fission production reflects the new xs immediately.
+
+Prompt-only. First step to t=0.07, swap xs, then step to t=0.12. The fission
+production computed at the swap time should scale by the xs ratio.
+
+TIME_AT_SWAP = 0.07 because we advance with dt=0.07 before swapping.
+FP_RATIO_AT_SWAP = 1.2 from sigma_f ratio 0.180/0.150 at the swap time.
+TIME_AT_SWAP verifies correct time advance. FP_RATIO_AT_SWAP verifies immediate
+response to XS swap at that time.
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.solver import (
+        DiscreteOrdinatesProblem,
+        PowerIterationKEigenSolver,
+        TransientSolver,
+    )
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.mesh import OrthogonalMeshGenerator
+
+if __name__ == "__main__":
+    dx = 8.0 / 4
+    nodes = [i * dx for i in range(4 + 1)]
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+
+    xs_dir = os.path.dirname(__file__)
+    xs_crit = MultiGroupXS()
+    xs_crit.LoadFromOpenSn(os.path.join(xs_dir, "xs1g_prompt_crit.cxs"))
+
+    xs_super = MultiGroupXS()
+    xs_super.LoadFromOpenSn(os.path.join(xs_dir, "xs1g_prompt_super.cxs"))
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0)
+
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=1,
+        groupsets=[
+            {
+                "groups_from_to": (0, 0),
+                "angular_quadrature": pquad,
+                "inner_linear_method": "petsc_gmres",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+                "gmres_restart_interval": 10,
+            },
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs_crit}],
+        boundary_conditions=[
+            {"name": face, "type": "reflecting"}
+            for face in ("xmin", "xmax", "ymin", "ymax", "zmin", "zmax")
+        ],
+        options={
+            "save_angular_flux": True,
+            "use_precursors": False,
+            "verbose_inner_iterations": False,
+            "verbose_outer_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+
+    keigen = PowerIterationKEigenSolver(problem=phys)
+    keigen.Initialize()
+    keigen.Execute()
+
+    phys.SetTimeDependentMode()
+
+    solver = TransientSolver(problem=phys, initial_state="existing")
+    solver.Initialize()
+
+    # Reference fission production with the initial xs, before any stepping
+    fp0 = phys.ComputeFissionProduction("new")
+
+    solver.SetTheta(1.0)
+
+    # First step to t=0.07
+    solver.SetTimeStep(0.07)
+    solver.Advance()
+
+    time_at_swap = phys.GetTime()
+
+    # Swap XS at non-integer time
+    phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}])
+    fp_swap = phys.ComputeFissionProduction("new")
+
+    # Next step to t=0.12
+    solver.SetTimeStep(0.05)
+    solver.Advance()
+
+    # No pass/fail logic lives in this script: TRANSIENT_OK is printed
+    # unconditionally next to the measured swap time and FP ratio
+    if rank == 0:
+        print(f"TIME_AT_SWAP {time_at_swap:.12e}")
+        print(f"FP_RATIO_AT_SWAP {fp_swap / fp0:.12e}")
+        print("TRANSIENT_OK 1")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_pulse_inf_med_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_pulse_inf_med_cbc.py
new file mode 100644
index 0000000000..1e919a84f0
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_pulse_inf_med_cbc.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+3D 1-group pulse, transient zero-init in a 3.2 cm reflecting cube.
+
+Pure absorber with sigma_t = 1.0 cm^-1, sigma_s = 0, v = 1.0 cm/s, and a
+total Q_tot = 122.58 particles/s on for t=[0, 1] s, then 0 for t=[1, 2] s,
+then 2Q_tot from t=[2, 3] s. V = 3.2^3 cm^3, so volumetric
+Q = Q_tot / V ~= 3.7408 cm^-3 s^-1.
+
+phi1 = phi(1s) = Q * (1 - e^{-1}) ~= 2.365
+phi2 = phi(2s) = phi1 * e^{-1} ~= 0.870
+phi3 = phi(3s) = phi2*e^{-1} + 2*Q*(1-e^{-1}) ~= 5.049
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.mesh import FromFileMeshGenerator
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.source import VolumetricSource
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.solver import (
+        CrankNicolson,
+        DiscreteOrdinatesProblem,
+        TransientSolver,
+    )
+    from pyopensn.fieldfunc import FieldFunctionInterpolationVolume
+    from pyopensn.logvol import RPPLogicalVolume
+
+if __name__ == "__main__":
+
+    meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh")
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+    grid.SetOrthogonalBoundaries()
+
+    num_groups = 1
+    xs_diag = MultiGroupXS()
+    xs_diag.CreateSimpleOneGroup(1.0, 0.0, 1.0)
+
+    # Volumetric strength = total rate divided by the 3.2^3 cube volume
+    Q_tot = 122.58
+    Q_vol = Q_tot / (3.2 * 3.2 * 3.2)
+
+    # Group 0 carries Q_vol in the first pulse and 2 * Q_vol in the second
+    strength1 = [0.0] * num_groups
+    strength2 = [0.0] * num_groups
+    strength1[0] = Q_vol
+    strength2[0] = 2.0 * Q_vol
+
+    src1 = VolumetricSource(
+        block_ids=[0],
+        group_strength=strength1,
+        start_time=0.0,
+        end_time=1.0,
+    )
+
+    src2 = VolumetricSource(
+        block_ids=[0],
+        group_strength=strength2,
+        start_time=2.0,
+        end_time=3.0,
+    )
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0)
+
+    gs0 = [0, num_groups - 1]
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        time_dependent=True,
+        groupsets=[
+            {
+                "groups_from_to": gs0,
+                "angular_quadrature": pquad,
+                "angle_aggregation_type": "single",
+                "angle_aggregation_num_subsets": 1,
+                "inner_linear_method": "petsc_gmres",
+                "l_abs_tol": 1.0e-6,
+                "l_max_its": 500,
+            },
+        ],
+        xs_map=[
+            {"block_ids": [0], "xs": xs_diag},
+        ],
+        volumetric_sources=[src1, src2],
+        boundary_conditions=[
+            {"name": face, "type": "reflecting"}
+            for face in ("xmin", "xmax", "ymin", "ymax", "zmin", "zmax")
+        ],
+        options={"save_angular_flux": True},
+        sweep_type="CBC",
+    )
+
+    # Zero initial state, dt = 0.1 up to stop_time = 3.0
+    solver = TransientSolver(
+        problem=phys,
+        dt=0.1,
+        theta=CrankNicolson,
+        stop_time=3.0,
+        initial_state="zero",
+    )
+    solver.Initialize()
+    solver.Execute()
+
+    # Maximum of the first scalar-flux field function over the RPP volume
+    fflist = phys.GetScalarFluxFieldFunction()
+    monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True)
+    field_interp = FieldFunctionInterpolationVolume()
+    field_interp.SetOperationType("max")
+    field_interp.SetLogicalVolume(monitor_volume)
+    field_interp.AddFieldFunction(fflist[0])
+    field_interp.Execute()
+    flux_max = field_interp.GetValue()
+
+    if rank == 0:
+        print(f"Max phi(3s) = {flux_max:.6f}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_ramp_source_analytic_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_ramp_source_analytic_cbc.py
new file mode 100644
index 0000000000..3a148dd056
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_ramp_source_analytic_cbc.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+1-group infinite-medium transient with a ramped source and analytic solution.
+
+ODE (v=1): d(phi)/dt + sigma_t * phi = Q(t)
+Q(t) ramps linearly from 0 at t=0 to Q0 at t=t_ramp, then stays at Q0.
+"""
+
+import os
+import sys
+import math
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.mesh import FromFileMeshGenerator
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.source import VolumetricSource
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver
+    from pyopensn.fieldfunc import FieldFunctionInterpolationVolume
+    from pyopensn.logvol import RPPLogicalVolume
+
+
+def ramp_q(time_value: float, q0: float, t_ramp: float) -> float:
+    """Ramp source: 0 for t <= 0, q0 * t / t_ramp while t < t_ramp, q0 afterwards."""
+    if time_value <= 0.0:
+        return 0.0
+    still_ramping = time_value < t_ramp
+    return q0 * time_value / t_ramp if still_ramping else q0
+
+
+def analytic_phi(time_value: float, q0: float, t_ramp: float,
+                 sigma_t: float, v: float) -> float:
+    """Exact phi(t) of (1/v) d(phi)/dt + sigma_t * phi = ramp_q(t) with phi(t <= 0) = 0.
+
+    A non-positive t_ramp is treated as a step source of strength q0 switched on at t = 0.
+    """
+    if time_value <= 0.0:
+        return 0.0  # ramp_q is zero up to t = 0 and the flux starts from zero
+    lam = v * sigma_t
+    if t_ramp <= 0.0:
+        return (v * q0 / lam) * (1.0 - math.exp(-lam * time_value))
+    a = q0 / t_ramp
+    # Flux on the ramp segment, evaluated at time_value or, once past it, at t_ramp.
+    t_on_ramp = min(time_value, t_ramp)
+    phi_tr = v * a * (
+        t_on_ramp / lam - 1.0 / (lam * lam) + math.exp(-lam * t_on_ramp) / (lam * lam)
+    )
+    dt = time_value - t_on_ramp
+    # On the ramp dt is 0, so the expression below collapses to phi_tr.
+    return phi_tr * math.exp(-lam * dt) + (v * q0 / lam) * (1.0 - math.exp(-lam * dt))
+
+
+if __name__ == "__main__":  # run the ramp-source transient and compare with analytic_phi
+    meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh")
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+    grid.SetOrthogonalBoundaries()
+    # Model constants; the same values are passed to analytic_phi for the reference flux.
+    sigma_t = 1.0
+    v = 1.0
+    q0 = 1.0
+    t_ramp = 0.5
+
+    xs = MultiGroupXS()
+    xs.CreateSimpleOneGroup(sigma_t, 0.0, v)
+
+    def source_func(group: int, time_value: float) -> float:  # group is not used
+        return ramp_q(time_value, q0, t_ramp)
+    vol_src = VolumetricSource(block_ids=[0], strength_function=source_func)
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0)
+
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=1,
+        time_dependent=True,
+        groupsets=[
+            {
+                "groups_from_to": (0, 0),
+                "angular_quadrature": pquad,
+                "angle_aggregation_type": "single",
+                "inner_linear_method": "petsc_gmres",
+                "l_abs_tol": 1.0e-8,
+                "l_max_its": 200,
+            }
+        ],
+        xs_map=[{"block_ids": [0], "xs": xs}],
+        volumetric_sources=[vol_src],
+        boundary_conditions=[
+            {"name": "xmin", "type": "reflecting"},
+            {"name": "xmax", "type": "reflecting"},
+            {"name": "ymin", "type": "reflecting"},
+            {"name": "ymax", "type": "reflecting"},
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={"save_angular_flux": True, "verbose_inner_iterations": False},
+        sweep_type="CBC",
+    )
+
+    solver = TransientSolver(problem=phys, initial_state="zero")
+    solver.Initialize()
+    solver.SetTheta(0.5)
+
+    dt = 0.01
+    stop_time = 1.0
+    current_time = 0.0
+    # Advance in steps of dt; min() clips the final step so the run never passes stop_time.
+    while current_time < stop_time:
+        target_time = min(current_time + dt, stop_time)
+        solver.SetTimeStep(target_time - current_time)
+        solver.Advance()
+        current_time = target_time
+    # Largest value of the first scalar-flux field function inside monitor_volume.
+    fflist = phys.GetScalarFluxFieldFunction()
+    monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True)
+    field_interp = FieldFunctionInterpolationVolume()
+    field_interp.SetOperationType("max")
+    field_interp.SetLogicalVolume(monitor_volume)
+    field_interp.AddFieldFunction(fflist[0])
+    field_interp.Execute()
+    phi_num = field_interp.GetValue()
+    # Compare with the closed-form flux at stop_time; the check passes below 1% relative error.
+    phi_exact = analytic_phi(stop_time, q0, t_ramp, sigma_t, v)
+    rel_err = abs(phi_num - phi_exact) / phi_exact
+    pass_flag = 1 if rel_err < 0.01 else 0
+
+    if rank == 0:  # only rank 0 reports
+        print(f"RAMP_SOURCE_ANALYTIC phi_num {phi_num:.6f} phi_exact {phi_exact:.6f}")
+        print(f"RAMP_SOURCE_ANALYTIC_PASS {pass_flag}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v0.5_inf_med_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v0.5_inf_med_cbc.py
new file mode 100644
index 0000000000..ad9d90e492
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v0.5_inf_med_cbc.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# 1-group, infinite-medium, transient in a 3.2 cm cube (all reflecting).
+# The cube is a pure absorber with sigma_t = 1.0 cm^-1, sigma_s = 0, v = 0.5 cm/s
+# and a total Q_tot = 122.58 particles/s on for t=[0, 1] s, then 0 for t>1 s.
+# V = 3.2^3 cm^3, so volumetric Q = Q_tot / V ~= 3.7408 cm^-3 s^-1.
+#
+#   For 0 <= t <= 1: phi(t) = Q * (1 - e^{-0.5 t})
+#   For t >= 1:      phi(t) = phi(1) * e^{-0.5 (t - 1)}
+#   phi(1s) ~= 1.472
+#   phi(2s) ~= 0.893
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.mesh import FromFileMeshGenerator
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.source import VolumetricSource
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.solver import BackwardEuler, DiscreteOrdinatesProblem, TransientSolver
+    from pyopensn.fieldfunc import FieldFunctionInterpolationVolume
+    from pyopensn.logvol import RPPLogicalVolume
+
+if __name__ == "__main__":
+    # Build the one-group reflecting-cube model, run to t = 2 s, print the peak scalar flux.
+    meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh")
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+    grid.SetOrthogonalBoundaries()
+
+    num_groups = 1
+    xs_diag = MultiGroupXS()
+    xs_diag.CreateSimpleOneGroup(1.0, 0.0, 0.5)  # header values: sigma_t = 1.0, v = 0.5
+
+    strength = [0.0 for _ in range(num_groups)]
+    strength[0] = 122.58 / (3.2 * 3.2 * 3.2)  # Q_tot divided by the cube volume
+    mg_src = VolumetricSource(block_ids=[0],
+                              group_strength=strength,
+                              start_time=0.0,
+                              end_time=1.0)
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0)
+
+    gs0 = [0, num_groups - 1]  # first and last group index of the single groupset
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        time_dependent=True,
+        groupsets=[
+            {
+                "groups_from_to": gs0,
+                "angular_quadrature": pquad,
+                "angle_aggregation_type": "single",
+                "angle_aggregation_num_subsets": 1,
+                "inner_linear_method": "petsc_richardson",
+                "l_abs_tol": 1.0e-6,
+                "l_max_its": 500,
+            },
+        ],
+        xs_map=[
+            {"block_ids": [0], "xs": xs_diag},
+        ],
+        volumetric_sources=[mg_src],
+        boundary_conditions=[
+            {"name": "xmin", "type": "reflecting"},
+            {"name": "xmax", "type": "reflecting"},
+            {"name": "ymin", "type": "reflecting"},
+            {"name": "ymax", "type": "reflecting"},
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={"save_angular_flux": True},
+        sweep_type="CBC",
+    )
+
+    solver = TransientSolver(
+        problem=phys,
+        dt=0.05,
+        theta=BackwardEuler,
+        stop_time=2.0,
+        initial_state="zero",
+    )
+    solver.Initialize()
+    solver.Execute()
+    # Largest value of the first scalar-flux field function inside monitor_volume.
+    fflist = phys.GetScalarFluxFieldFunction()
+    monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True)
+    field_interp = FieldFunctionInterpolationVolume()
+    field_interp.SetOperationType("max")
+    field_interp.SetLogicalVolume(monitor_volume)
+    field_interp.AddFieldFunction(fflist[0])
+    field_interp.Execute()
+    flux_max = field_interp.GetValue()
+
+    if rank == 0:  # only rank 0 reports
+        print(f"Max phi(2s) = {flux_max:.6f}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v1_inf_med_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v1_inf_med_cbc.py
new file mode 100644
index 0000000000..0b3e907bcc
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v1_inf_med_cbc.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+3D 1-group, v=1.0, transient zero-init in a 3.2 cm reflecting cube.
+
+Pure absorber with sigma_t = 1.0 cm^-1, sigma_s = 0, v = 1.0 cm/s and a
+total Q_tot = 122.58 particles/s on for t=[0, 1] s, then 0 for t>1 s.
+V = 3.2^3 cm^3, so volumetric Q = Q_tot / V ~= 3.7408 cm^-3 s^-1.
+
+For 0 <= t <= 1: phi(t) = Q * (1 - e^{-t})
+For t >= 1:      phi(t) = phi(1) * e^{-(t - 1)}
+phi(1s) ~= 2.365
+phi(2s) ~= 0.870
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.mesh import FromFileMeshGenerator
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.source import VolumetricSource
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver
+    from pyopensn.fieldfunc import FieldFunctionInterpolationVolume
+    from pyopensn.logvol import RPPLogicalVolume
+
+if __name__ == "__main__":
+    # Build the one-group reflecting-cube model, run to t = 1 s, print the peak scalar flux.
+    meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh")
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+    grid.SetOrthogonalBoundaries()
+
+    num_groups = 1
+    xs_diag = MultiGroupXS()
+    xs_diag.CreateSimpleOneGroup(1.0, 0.0, 1.0)  # docstring values: sigma_t = 1.0, v = 1.0
+
+    strength = [0.0 for _ in range(num_groups)]
+    strength[0] = 122.58 / (3.2 * 3.2 * 3.2)  # Q_tot divided by the cube volume
+    mg_src = VolumetricSource(
+        block_ids=[0],
+        group_strength=strength,
+        start_time=0.0,
+        end_time=1.0,
+    )
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0)
+
+    gs0 = [0, num_groups - 1]  # first and last group index of the single groupset
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        time_dependent=True,
+        groupsets=[
+            {
+                "groups_from_to": gs0,
+                "angular_quadrature": pquad,
+                "angle_aggregation_type": "single",
+                "angle_aggregation_num_subsets": 1,
+                "inner_linear_method": "petsc_richardson",
+                "l_abs_tol": 1.0e-6,
+                "l_max_its": 500,
+            },
+        ],
+        xs_map=[
+            {"block_ids": [0], "xs": xs_diag},
+        ],
+        volumetric_sources=[mg_src],
+        boundary_conditions=[
+            {"name": "xmin", "type": "reflecting"},
+            {"name": "xmax", "type": "reflecting"},
+            {"name": "ymin", "type": "reflecting"},
+            {"name": "ymax", "type": "reflecting"},
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={"save_angular_flux": True},
+        sweep_type="CBC",
+    )
+
+    solver = TransientSolver(
+        problem=phys,
+        dt=0.05,
+        theta=1.0,
+        stop_time=1.0,
+        initial_state="zero",
+    )
+    solver.Initialize()
+    solver.Execute()
+    # Largest value of the first scalar-flux field function inside monitor_volume.
+    fflist = phys.GetScalarFluxFieldFunction()
+    monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True)
+    field_interp = FieldFunctionInterpolationVolume()
+    field_interp.SetOperationType("max")
+    field_interp.SetLogicalVolume(monitor_volume)
+    field_interp.AddFieldFunction(fflist[0])
+    field_interp.Execute()
+    flux_max = field_interp.GetValue()
+
+    if rank == 0:  # only rank 0 reports
+        print(f"Max phi(1s) = {flux_max:.6f}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v1_inf_med_swap_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v1_inf_med_swap_cbc.py
new file mode 100644
index 0000000000..c60df9e9cd
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v1_inf_med_swap_cbc.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# 1-group, infinite-medium, transient in a 3.2 cm cube (all reflecting).
+# The cube is a pure absorber with sigma_t = 1.0 cm^-1, sigma_s = 0, v = 1.0 cm/s
+# and a total Q_tot = 122.58 particles/s on for t=[0, 1] s, then 0 for t>1 s.
+# At t = 0.5 s, cross sections are swapped to sigma_t = 2.0 cm^-1.
+# V = 3.2^3 cm^3, so volumetric Q = Q_tot / V ~= 3.7408 cm^-3 s^-1.
+#
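+# For 0 <= t <= 0.5: phi(t) = Q * (1 - e^{-t})
+# For 0.5 <= t <= 1: phi(t) = Q / 2 + (phi(0.5) - Q / 2) * e^{-2 (t - 0.5)}
+# For t >= 1:        phi(t) = phi(1) * e^{-2 (t - 1)}
+# With backward Euler, dt = 0.05: phi(1s) ~= 1.706
+# Analytic: phi(1s) ~= 1.724, phi(2s) ~= 0.233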
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.mesh import FromFileMeshGenerator
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.source import VolumetricSource
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver
+    from pyopensn.fieldfunc import FieldFunctionInterpolationVolume
+    from pyopensn.logvol import RPPLogicalVolume
+
+if __name__ == "__main__":
+    # Manually stepped transient to t = 1 s; the cross sections are replaced at swap_time.
+    meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh")
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+    grid.SetOrthogonalBoundaries()
+
+    num_groups = 1
+    xs_diag = MultiGroupXS()
+    xs_diag.CreateSimpleOneGroup(1.0, 0.0, 1.0)
+    xs_diag_swap = MultiGroupXS()
+    xs_diag_swap.CreateSimpleOneGroup(2.0, 0.0, 1.0)  # same as xs_diag but first argument 2.0
+
+    strength = [0.0 for _ in range(num_groups)]
+    strength[0] = 122.58 / (3.2 * 3.2 * 3.2)  # Q_tot divided by the cube volume
+    mg_src = VolumetricSource(block_ids=[0],
+                              group_strength=strength,
+                              start_time=0.0,
+                              end_time=1.0)
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0)
+
+    gs0 = [0, num_groups - 1]
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        time_dependent=True,
+        groupsets=[
+            {
+                "groups_from_to": gs0,
+                "angular_quadrature": pquad,
+                "angle_aggregation_type": "single",
+                "angle_aggregation_num_subsets": 1,
+                "inner_linear_method": "petsc_richardson",
+                "l_abs_tol": 1.0e-6,
+                "l_max_its": 500,
+            },
+        ],
+        xs_map=[
+            {"block_ids": [0], "xs": xs_diag},
+        ],
+        volumetric_sources=[mg_src],
+        boundary_conditions=[
+            {"name": "xmin", "type": "reflecting"},
+            {"name": "xmax", "type": "reflecting"},
+            {"name": "ymin", "type": "reflecting"},
+            {"name": "ymax", "type": "reflecting"},
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={"save_angular_flux": True},
+        sweep_type="CBC",
+    )
+
+    solver = TransientSolver(problem=phys, verbose=False, initial_state="zero")
+    solver.Initialize()
+
+    dt = 0.05
+    theta = 1.0
+    step = 0
+    stop_time = 1.0
+    swap_time = 0.5
+    current_time = 0.0
+    swapped = False
+    solver.SetTheta(theta)
+
+    while current_time < stop_time:
+        target_time = min(current_time + dt, stop_time)
+        step_dt = target_time - current_time
+        solver.SetTimeStep(step_dt)
+
+        if rank == 0:
+            print("")
+            print(
+                f"*************** Time step #{step:d}  t = {target_time:.6f} "
+                f"(from {current_time:.6f}, dt = {step_dt:.6f}, theta = {theta:.3f}) "
+                f"***************"
+            )
+
+        solver.Advance()
+
+        # NOTE(review): float-summed target_time may fall short of swap_time; confirm swap step.
+        if (not swapped) and target_time >= swap_time:
+            phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_diag_swap}])
+            swapped = True
+
+        current_time = target_time
+        step = step + 1
+
+    fflist = phys.GetScalarFluxFieldFunction()
+    monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True)
+    field_interp = FieldFunctionInterpolationVolume()
+    field_interp.SetOperationType("max")
+    field_interp.SetLogicalVolume(monitor_volume)
+    field_interp.AddFieldFunction(fflist[0])
+    field_interp.Execute()
+    flux_max = field_interp.GetValue()
+
+    if rank == 0:  # only rank 0 reports
+        print(f"Max phi(1s) = {flux_max:.6f}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v2_inf_med_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v2_inf_med_cbc.py
new file mode 100644
index 0000000000..316b9b3409
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v2_inf_med_cbc.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# 1-group, infinite-medium, transient in a 3.2 cm cube (all reflecting).
+# The cube is a pure absorber with sigma_t = 1.0 cm^-1, sigma_s = 0, v = 2.0 cm/s
+# and a total Q_tot = 122.58 particles/s on for t=[0, 1] s, then 0 for t>1 s.
+# V = 3.2^3 cm^3, so volumetric Q = Q_tot / V ~= 3.7408 cm^-3 s^-1.
+#
+# For 0 <= t <= 1: phi(t) = Q * (1 - e^{-2 t})
+# For t >= 1:      phi(t) = phi(1) * e^{-2 (t - 1)}
+# phi(1s) ~= 3.235
+# phi(2s) ~= 0.438
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.mesh import FromFileMeshGenerator
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.source import VolumetricSource
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver
+    from pyopensn.fieldfunc import FieldFunctionInterpolationVolume
+    from pyopensn.logvol import RPPLogicalVolume
+
+if __name__ == "__main__":
+    # Build the one-group reflecting-cube model, run to t = 1 s, print the peak scalar flux.
+    meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh")
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+    grid.SetOrthogonalBoundaries()
+
+    num_groups = 1
+    xs_diag = MultiGroupXS()
+    xs_diag.CreateSimpleOneGroup(1.0, 0.0, 2.0)  # header values: sigma_t = 1.0, v = 2.0
+
+    strength = [0.0 for _ in range(num_groups)]
+    strength[0] = 122.58 / (3.2 * 3.2 * 3.2)  # Q_tot divided by the cube volume
+    mg_src = VolumetricSource(block_ids=[0],
+                              group_strength=strength,
+                              start_time=0.0,
+                              end_time=1.0)
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0)
+
+    gs0 = [0, num_groups - 1]  # first and last group index of the single groupset
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        time_dependent=True,
+        groupsets=[
+            {
+                "groups_from_to": gs0,
+                "angular_quadrature": pquad,
+                "angle_aggregation_type": "single",
+                "angle_aggregation_num_subsets": 1,
+                "inner_linear_method": "petsc_gmres",
+                "l_abs_tol": 1.0e-6,
+                "l_max_its": 500,
+            },
+        ],
+        xs_map=[
+            {"block_ids": [0], "xs": xs_diag},
+        ],
+        volumetric_sources=[mg_src],
+        boundary_conditions=[
+            {"name": "xmin", "type": "reflecting"},
+            {"name": "xmax", "type": "reflecting"},
+            {"name": "ymin", "type": "reflecting"},
+            {"name": "ymax", "type": "reflecting"},
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={"save_angular_flux": True},
+        sweep_type="CBC",
+    )
+
+    solver = TransientSolver(problem=phys, dt=0.05, theta=1.0, stop_time=1.0, initial_state="zero")
+    solver.Initialize()
+    solver.Execute()
+    # Largest value of the first scalar-flux field function inside monitor_volume.
+    fflist = phys.GetScalarFluxFieldFunction()
+    monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True)
+    field_interp = FieldFunctionInterpolationVolume()
+    field_interp.SetOperationType("max")
+    field_interp.SetLogicalVolume(monitor_volume)
+    field_interp.AddFieldFunction(fflist[0])
+    field_interp.Execute()
+    flux_max = field_interp.GetValue()
+
+    if rank == 0:  # only rank 0 reports
+        print(f"Max phi(1s) = {flux_max:.6f}")
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_downscatter_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_downscatter_cbc.py
new file mode 100644
index 0000000000..109c1465c0
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_downscatter_cbc.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+3D 2-group downscatter, transient zero-init in a 3.2 cm reflecting cube.
+
+g0 (fast):    sigma_t0 = 1.0 cm^-1, v0 = 2.0 cm/s
+g1 (thermal): sigma_t1 = 0.8 cm^-1, v1 = 0.5 cm/s
+sigma_s(0 -> 1) = 0.5 cm^-1, all other sigma_s = 0
+Constant in time source in g0 only:
+Q0 = 122.58 / 3.2^3 cm^-3 s^-1, for all t >= 0
+
+(1/v0) d(phi0)/dt + sigma_t0 * phi0 = Q0
+(1/v1) d(phi1)/dt + sigma_t1 * phi1 = sigma_s(0->1) * phi0(t)
+phi0(t) = (Q0 / sigma_t0) * (1 - exp(-v0 * sigma_t0 * t)) = Q0 * (1 - exp(-2 t))
+phi1(t) = exp(-v1 * sigma_t1 * t) *
+          [ v1 * sigma_s(0->1) * integral_0^t exp(v1 * sigma_t1 * s) * phi0(s) ds ]
+phi0(1s) ~= 3.235,  phi1(1s) ~= 0.458
+phi0(2s) ~= 3.672,  phi1(2s) ~= 1.036
+"""
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.mesh import FromFileMeshGenerator
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.source import VolumetricSource
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver
+    from pyopensn.fieldfunc import FieldFunctionInterpolationVolume
+    from pyopensn.logvol import RPPLogicalVolume
+
+if __name__ == "__main__":
+    # Build the two-group downscatter model, run to t = 2 s, print the peak flux per group.
+    meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh")
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+    grid.SetOrthogonalBoundaries()
+
+    xs_diag = MultiGroupXS()
+    xs_diag.LoadFromOpenSn(
+        os.path.join(
+            os.path.dirname(__file__),
+            "simple_2g_downscatter_td.cxs",
+        )
+    )
+    num_groups = xs_diag.num_groups  # group count is taken from the loaded cross-section file
+
+    Q_tot = 122.58
+    Q_vol = Q_tot / (3.2 * 3.2 * 3.2)
+
+    strength = [0.0 for _ in range(num_groups)]
+    strength[0] = Q_vol   # source only in group 0
+    strength[1] = 0.0  # already 0.0 from the initialiser; requires num_groups >= 2
+
+    mg_src = VolumetricSource(
+        block_ids=[0],
+        group_strength=strength,
+        start_time=0.0,
+        end_time=1.0e9,
+    )  # effectively always on
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0)
+
+    gs0 = [0, num_groups - 1]  # one groupset spanning every group
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        time_dependent=True,
+        groupsets=[
+            {
+                "groups_from_to": gs0,
+                "angular_quadrature": pquad,
+                "angle_aggregation_type": "single",
+                "angle_aggregation_num_subsets": 1,
+                "inner_linear_method": "petsc_gmres",
+                "l_abs_tol": 1.0e-6,
+                "l_max_its": 500,
+            },
+        ],
+        xs_map=[
+            {"block_ids": [0], "xs": xs_diag},
+        ],
+        volumetric_sources=[mg_src],
+        boundary_conditions=[
+            {"name": "xmin", "type": "reflecting"},
+            {"name": "xmax", "type": "reflecting"},
+            {"name": "ymin", "type": "reflecting"},
+            {"name": "ymax", "type": "reflecting"},
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={"save_angular_flux": True},
+        sweep_type="CBC",
+    )
+
+    solver = TransientSolver(
+        problem=phys,
+        dt=0.05,
+        theta=1.0,
+        stop_time=2.0,
+        initial_state="zero",
+    )
+    solver.Initialize()
+    solver.Execute()
+    # One "max" interpolation per field function, both over the same monitor_volume.
+    fflist = phys.GetScalarFluxFieldFunction()
+    monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True)
+
+    # Group 0
+    ff_interp_g0 = FieldFunctionInterpolationVolume()
+    ff_interp_g0.SetOperationType("max")
+    ff_interp_g0.SetLogicalVolume(monitor_volume)
+    ff_interp_g0.AddFieldFunction(fflist[0])
+    ff_interp_g0.Execute()
+    flux_max_g0 = ff_interp_g0.GetValue()
+
+    # Group 1
+    ff_interp_g1 = FieldFunctionInterpolationVolume()
+    ff_interp_g1.SetOperationType("max")
+    ff_interp_g1.SetLogicalVolume(monitor_volume)
+    ff_interp_g1.AddFieldFunction(fflist[1])
+    ff_interp_g1.Execute()
+    flux_max_g1 = ff_interp_g1.GetValue()
+
+    if rank == 0:  # only rank 0 reports
+        print("Max phi0(2s) = {:.6f}".format(flux_max_g0))
+        print("Max phi1(2s) = {:.6f}".format(flux_max_g1))
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_downscatter_swap_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_downscatter_swap_cbc.py
new file mode 100644
index 0000000000..982bc600c0
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_downscatter_swap_cbc.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# 2-group, infinite-medium transient with downscatter (group 0 -> group 1).
+# Cross sections are swapped at t=0.5 s:
+#   before: sigma_t0=1.0, sigma_t1=0.8, sigma_s01=0.5
+#   after : sigma_t0=2.0, sigma_t1=1.2, sigma_s01=0.6
+# Velocities remain v0=2.0 cm/s, v1=0.5 cm/s.
+#
+# (1/v0) d(phi0)/dt + sigma_t0 * phi0 = Q0
+# (1/v1) d(phi1)/dt + sigma_t1 * phi1 = sigma_s01 * phi0(t)
+# With backward Euler, dt=0.05: phi0(1s) ~= 1.939573, phi1(1s) ~= 0.384769
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.mesh import FromFileMeshGenerator
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.source import VolumetricSource
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver
+    from pyopensn.fieldfunc import FieldFunctionInterpolationVolume
+    from pyopensn.logvol import RPPLogicalVolume
+
+if __name__ == "__main__":
+    # Manually stepped two-group transient to t = 1 s; cross sections are replaced at swap_time.
+    meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh")
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+    grid.SetOrthogonalBoundaries()
+
+    xs_diag = MultiGroupXS()
+    xs_diag.LoadFromOpenSn(
+        os.path.join(os.path.dirname(__file__), "simple_2g_downscatter_td.cxs")
+    )
+    xs_diag_swap = MultiGroupXS()
+    xs_diag_swap.LoadFromOpenSn(
+        os.path.join(os.path.dirname(__file__), "simple_2g_downscatter_td_swap.cxs")
+    )
+    num_groups = xs_diag.num_groups  # group count is taken from the first cross-section file
+
+    Q_tot = 122.58
+    Q_vol = Q_tot / (3.2 * 3.2 * 3.2)
+
+    strength = [0.0 for _ in range(num_groups)]
+    strength[0] = Q_vol   # source only in group 0
+    strength[1] = 0.0  # already 0.0 from the initialiser; requires num_groups >= 2
+
+    mg_src = VolumetricSource(block_ids=[0],
+                              group_strength=strength,
+                              start_time=0.0,
+                              end_time=1.0e9)  # effectively always on
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0)
+
+    gs0 = [0, num_groups - 1]  # one groupset spanning every group
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        time_dependent=True,
+        groupsets=[
+            {
+                "groups_from_to": gs0,
+                "angular_quadrature": pquad,
+                "angle_aggregation_type": "single",
+                "angle_aggregation_num_subsets": 1,
+                "inner_linear_method": "petsc_gmres",
+                "l_abs_tol": 1.0e-6,
+                "l_max_its": 500,
+            },
+        ],
+        xs_map=[
+            {"block_ids": [0], "xs": xs_diag},
+        ],
+        volumetric_sources=[mg_src],
+        boundary_conditions=[
+            {"name": "xmin", "type": "reflecting"},
+            {"name": "xmax", "type": "reflecting"},
+            {"name": "ymin", "type": "reflecting"},
+            {"name": "ymax", "type": "reflecting"},
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={"save_angular_flux": True},
+        sweep_type="CBC",
+    )
+
+    solver = TransientSolver(problem=phys, verbose=False, initial_state="zero")
+    solver.Initialize()
+
+    dt = 0.05
+    theta = 1.0
+    step = 0
+    stop_time = 1.0
+    swap_time = 0.5
+    current_time = 0.0
+    swapped = False
+    solver.SetTheta(theta)
+
+    while current_time < stop_time:
+        target_time = min(current_time + dt, stop_time)  # clip the last step at stop_time
+        step_dt = target_time - current_time
+        solver.SetTimeStep(step_dt)
+
+        if rank == 0:
+            print("")
+            print(
+                f"*************** Time step #{step:d}  t = {target_time:.6f} "
+                f"(from {current_time:.6f}, dt = {step_dt:.6f}, theta = {theta:.3f}) "
+                f"***************"
+            )
+
+        solver.Advance()
+        # NOTE(review): float-summed target_time may fall short of swap_time; confirm swap step.
+        if (not swapped) and target_time >= swap_time:
+            phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_diag_swap}])
+            swapped = True  # the swap is applied once; later steps keep xs_diag_swap
+
+        current_time = target_time
+        step = step + 1
+    # One "max" interpolation per field function, both over the same monitor_volume.
+    fflist = phys.GetScalarFluxFieldFunction()
+    monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True)
+
+    # Group 0
+    ff_interp_g0 = FieldFunctionInterpolationVolume()
+    ff_interp_g0.SetOperationType("max")
+    ff_interp_g0.SetLogicalVolume(monitor_volume)
+    ff_interp_g0.AddFieldFunction(fflist[0])
+    ff_interp_g0.Execute()
+    flux_max_g0 = ff_interp_g0.GetValue()
+
+    # Group 1
+    ff_interp_g1 = FieldFunctionInterpolationVolume()
+    ff_interp_g1.SetOperationType("max")
+    ff_interp_g1.SetLogicalVolume(monitor_volume)
+    ff_interp_g1.AddFieldFunction(fflist[1])
+    ff_interp_g1.Execute()
+    flux_max_g1 = ff_interp_g1.GetValue()
+
+    if rank == 0:  # only rank 0 reports
+        print("Max phi0(1s) = {:.6f}".format(flux_max_g0))
+        print("Max phi1(1s) = {:.6f}".format(flux_max_g1))
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_pydrvr_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_pydrvr_cbc.py
new file mode 100644
index 0000000000..6c22f36e86
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_pydrvr_cbc.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# 2-group, infinite-medium transient with downscatter (group 0 -> group 1).
+# 3.2 cm reflecting cube (infinite medium) with:
+# g0 (fast):    sigma_t0 = 1.0 cm^-1, v0 = 2.0 cm/s
+# g1 (thermal): sigma_t1 = 0.8 cm^-1, v1 = 0.5 cm/s
+# sigma_s(0 -> 1) = 0.5 cm^-1, all other sigma_s = 0
+# Constant in time source in g0 only:
+# Q0 = 122.58 / 3.2^3 cm^-3 s^-1, for all t >= 0
+#
+# (1/v0) d(phi0)/dt + sigma_t0 * phi0 = Q0
+# (1/v1) d(phi1)/dt + sigma_t1 * phi1 = sigma_s(0->1) * phi0(t)
+# phi0(t) = (Q0 / sigma_t0) * (1 - exp(-v0 * sigma_t0 * t)) = Q0 * (1 - exp(-2 t))
+# phi1(t) = exp(-v1 * sigma_t1 * t) *
+#           [ v1 * sigma_s(0->1) * integral_0^t exp(v1 * sigma_t1 * s) * phi0(s) ds ]
+# phi0(1s) ~= 3.235,  phi1(1s) ~= 0.458
+# phi0(2s) ~= 3.672,  phi1(2s) ~= 1.036
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.mesh import FromFileMeshGenerator
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.source import VolumetricSource
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver
+    from pyopensn.fieldfunc import FieldFunctionInterpolationVolume
+    from pyopensn.logvol import RPPLogicalVolume
+
+if __name__ == "__main__":
+
+    meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh")
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+    grid.SetOrthogonalBoundaries()
+
+    xs_diag = MultiGroupXS()
+    xs_diag.LoadFromOpenSn(
+        os.path.join(os.path.dirname(__file__), "simple_2g_downscatter_td.cxs")
+    )
+    num_groups = xs_diag.num_groups
+
+    # Total source in group 0, converted to volumetric rate
+    Q_tot = 122.58
+    Q_vol = Q_tot / (3.2 * 3.2 * 3.2)
+
+    strength = [0.0 for _ in range(num_groups)]
+    strength[0] = Q_vol  # source only in group 0
+    strength[1] = 0.0
+
+    # Volumetric source is effectively always on
+    mg_src = VolumetricSource(block_ids=[0], group_strength=strength)
+
+    # Angular quadrature
+    pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0)
+
+    gs0 = [0, num_groups - 1]
+
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        time_dependent=True,
+        groupsets=[
+            {
+                "groups_from_to": gs0,
+                "angular_quadrature": pquad,
+                "angle_aggregation_type": "single",
+                "angle_aggregation_num_subsets": 1,
+                "inner_linear_method": "petsc_richardson",
+                "l_abs_tol": 1.0e-6,
+                "l_max_its": 500,
+            },
+        ],
+        xs_map=[
+            {"block_ids": [0], "xs": xs_diag},
+        ],
+        volumetric_sources=[mg_src],
+        boundary_conditions=[
+            {"name": "xmin", "type": "reflecting"},
+            {"name": "xmax", "type": "reflecting"},
+            {"name": "ymin", "type": "reflecting"},
+            {"name": "ymax", "type": "reflecting"},
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={
+            "save_angular_flux": True,
+            "verbose_inner_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+
+    # Create the time-dependent solver without stop_time; the time loop is driven from Python
+    solver = TransientSolver(problem=phys, verbose=False, initial_state="zero")
+    solver.Initialize()
+
+    # Time stepping parameters (constant dt)
+    dt = 0.05
+    theta = 0.5
+    stop_time = 2.0
+    current_time = 0.0
+    step = 0
+    solver.SetTheta(theta)
+
+    monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True)
+    fflist = phys.GetScalarFluxFieldFunction()
+
+    # Group 0
+    ff_interp_g0 = FieldFunctionInterpolationVolume()
+    ff_interp_g0.SetOperationType("max")
+    ff_interp_g0.SetLogicalVolume(monitor_volume)
+    ff_interp_g0.AddFieldFunction(fflist[0])
+
+    # Group 1
+    ff_interp_g1 = FieldFunctionInterpolationVolume()
+    ff_interp_g1.SetOperationType("max")
+    ff_interp_g1.SetLogicalVolume(monitor_volume)
+    ff_interp_g1.AddFieldFunction(fflist[1])
+
+    phi0 = []
+    phi1 = []
+    while current_time < stop_time:
+        target_time = min(current_time + dt, stop_time)
+        step_dt = target_time - current_time
+
+        # dt is constant here with the exception of the last step.
+        # We adjust dt for the last step so that we get the solution
+        # exactly at stop_time
+        solver.SetTimeStep(step_dt)
+
+        if rank == 0:
+            print("")
+            print(
+                f"*************** Time step #{step:d}  t = {target_time:.6f} "
+                f"(from {current_time:.6f}, dt = {step_dt:.6f}, theta = {theta:.3f}) "
+                f"***************"
+            )
+
+        # Advance the solution from current_time to target_time
+        solver.Advance()
+        fflist[0].Update()
+        fflist[1].Update()
+
+        ff_interp_g0.Execute()
+        flux_max_g0 = ff_interp_g0.GetValue()
+        phi0.append(flux_max_g0)
+
+        ff_interp_g1.Execute()
+        flux_max_g1 = ff_interp_g1.GetValue()
+        phi1.append(flux_max_g1)
+
+        current_time = target_time
+        step += 1
+
+    if rank == 0:
+        print("Max phi0 = {:.6f}".format(max(phi0)))
+        print("Max phi1 = {:.6f}".format(max(phi1)))
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_pydrvr_ramp_dt_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_pydrvr_ramp_dt_cbc.py
new file mode 100644
index 0000000000..9c49b34b09
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_pydrvr_ramp_dt_cbc.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# 2-group, infinite-medium transient with downscatter (group 0 -> group 1).
+# 3.2 cm reflecting cube (infinite medium) with:
+# g0 (fast):    sigma_t0 = 1.0 cm^-1, v0 = 2.0 cm/s
+# g1 (thermal): sigma_t1 = 0.8 cm^-1, v1 = 0.5 cm/s
+# sigma_s(0 -> 1) = 0.5 cm^-1, all other sigma_s = 0
+# Constant in time source in g0 only:
+# Q0 = 122.58 / 3.2^3 cm^-3 s^-1, for all t >= 0
+#
+# (1/v0) d(phi0)/dt + sigma_t0 * phi0 = Q0
+# (1/v1) d(phi1)/dt + sigma_t1 * phi1 = sigma_s(0->1) * phi0(t)
+# phi0(t) = (Q0 / sigma_t0) * (1 - exp(-v0 * sigma_t0 * t)) = Q0 * (1 - exp(-2 t))
+# phi1(t) = exp(-v1 * sigma_t1 * t) *
+#           [ v1 * sigma_s(0->1) * integral_0^t exp(v1 * sigma_t1 * s) * phi0(s) ds ]
+# phi0(1s) ~= 3.235,  phi1(1s) ~= 0.458
+# phi0(2s) ~= 3.672,  phi1(2s) ~= 1.036
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.mesh import FromFileMeshGenerator
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.source import VolumetricSource
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver
+    from pyopensn.fieldfunc import FieldFunctionInterpolationVolume
+    from pyopensn.logvol import RPPLogicalVolume
+
+if __name__ == "__main__":
+
+    meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh")
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+    grid.SetOrthogonalBoundaries()
+
+    xs_diag = MultiGroupXS()
+    xs_diag.LoadFromOpenSn(
+        os.path.join(os.path.dirname(__file__), "simple_2g_downscatter_td.cxs")
+    )
+    num_groups = xs_diag.num_groups
+
+    # Total source in group 0, converted to volumetric rate
+    Q_tot = 122.58
+    Q_vol = Q_tot / (3.2 * 3.2 * 3.2)
+
+    strength = [0.0 for _ in range(num_groups)]
+    strength[0] = Q_vol  # source only in group 0
+    strength[1] = 0.0
+
+    # Volumetric source is effectively always on
+    mg_src = VolumetricSource(block_ids=[0], group_strength=strength)
+
+    # Angular quadrature
+    pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0)
+
+    gs0 = [0, num_groups - 1]
+
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        time_dependent=True,
+        groupsets=[
+            {
+                "groups_from_to": gs0,
+                "angular_quadrature": pquad,
+                "angle_aggregation_type": "single",
+                "angle_aggregation_num_subsets": 1,
+                "inner_linear_method": "petsc_richardson",
+                "l_abs_tol": 1.0e-6,
+                "l_max_its": 500,
+            },
+        ],
+        xs_map=[
+            {"block_ids": [0], "xs": xs_diag},
+        ],
+        volumetric_sources=[mg_src],
+        boundary_conditions=[
+            {"name": "xmin", "type": "reflecting"},
+            {"name": "xmax", "type": "reflecting"},
+            {"name": "ymin", "type": "reflecting"},
+            {"name": "ymax", "type": "reflecting"},
+            {"name": "zmin", "type": "reflecting"},
+            {"name": "zmax", "type": "reflecting"},
+        ],
+        options={
+            "save_angular_flux": True,
+            "verbose_inner_iterations": False,
+        },
+        sweep_type="CBC",
+    )
+
+    # Create the time-dependent solver without stop_time; the time loop is driven from Python
+    solver = TransientSolver(problem=phys, verbose=False, initial_state="zero")
+    solver.Initialize()
+
+    monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True)
+    fflist = phys.GetScalarFluxFieldFunction()
+
+    ff_interp_g0 = FieldFunctionInterpolationVolume()
+    ff_interp_g0.SetOperationType("max")
+    ff_interp_g0.SetLogicalVolume(monitor_volume)
+    ff_interp_g0.AddFieldFunction(fflist[0])
+
+    ff_interp_g1 = FieldFunctionInterpolationVolume()
+    ff_interp_g1.SetOperationType("max")
+    ff_interp_g1.SetLogicalVolume(monitor_volume)
+    ff_interp_g1.AddFieldFunction(fflist[1])
+
+    # Time stepping parameters
+    theta = 0.5
+    stop_time = 2.0
+    current_time = 0.0
+    step = 0
+    solver.SetTheta(theta)
+
+    # Parameters for ramp dt
+    dt_min = 0.01
+    dt_max = 0.20
+    ramp_steps = 10
+
+    while current_time < stop_time:
+
+        # Determine dt for this step
+        if step < ramp_steps:
+            # Linear ramp from dt_min to dt_max over ramp_steps
+            frac = step / (ramp_steps - 1) if ramp_steps > 1 else 1.0
+            dt = dt_min + frac * (dt_max - dt_min)
+        else:
+            # Constant dt after ramp up
+            dt = dt_max
+
+        target_time = min(current_time + dt, stop_time)
+        step_dt = target_time - current_time
+
+        # Set the timestep in OpenSn for the Advance call
+        solver.SetTimeStep(step_dt)
+
+        if rank == 0:
+            print("")
+            print(
+                f"*************** Time step #{step:d}  t = {target_time:.6f} "
+                f"(from {current_time:.6f}, dt = {step_dt:.6f}, theta = {theta:.3f}) "
+                f"***************"
+            )
+
+        # Advance the solution
+        solver.Advance()
+        fflist[0].Update()
+        fflist[1].Update()
+
+        ff_interp_g0.Execute()
+        flux_max_g0 = ff_interp_g0.GetValue()
+
+        ff_interp_g1.Execute()
+        flux_max_g1 = ff_interp_g1.GetValue()
+
+        if rank == 0:
+            print("Max phi0 = {:.6f}".format(flux_max_g0))
+            print("Max phi1 = {:.6f}".format(flux_max_g1))
+
+        current_time = target_time
+        step += 1
diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_openmc_xs_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_openmc_xs_cbc.py
new file mode 100644
index 0000000000..880849b51e
--- /dev/null
+++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_openmc_xs_cbc.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Fixed-source time-dependent transport in a homogeneous cube using OpenMC-generated
+# macroscopic, multigroup cross sections. The time-dependent solution is advanced to
+# steady state and compared with the OpenSn steady state solution and the OpenMC
+# steady state solution.
+#
+# OpenSn time-dependent solution: 51.057722
+# OpenSn steady-state solution: 51.057722
+# OpenMC steady-state solution: 50.96678
+
+import os
+import sys
+
+if "opensn_console" not in globals():
+    from mpi4py import MPI
+    size = MPI.COMM_WORLD.size
+    rank = MPI.COMM_WORLD.rank
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../")))
+    from pyopensn.mesh import OrthogonalMeshGenerator
+    from pyopensn.xs import MultiGroupXS
+    from pyopensn.source import VolumetricSource
+    from pyopensn.aquad import GLCProductQuadrature3DXYZ
+    from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver
+    from pyopensn.fieldfunc import FieldFunctionInterpolationVolume
+    from pyopensn.logvol import RPPLogicalVolume
+
+if __name__ == "__main__":
+
+    N = 10
+    L = 10.0
+    xmin = -L / 2.0
+    dx = L / N
+    nodes = [xmin + i * dx for i in range(N + 1)]
+
+    meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes])
+    grid = meshgen.Execute()
+    grid.SetUniformBlockID(0)
+
+    xs_water = MultiGroupXS()
+    xs_water.LoadFromOpenMC(
+        os.path.join(os.path.dirname(__file__), "xs_water.h5"), "set1", 294
+    )
+    num_groups = xs_water.num_groups
+
+    strength = [0.0 for _ in range(num_groups)]
+    strength[3] = 12.285
+    src1 = VolumetricSource(block_ids=[0], group_strength=strength)
+
+    pquad = GLCProductQuadrature3DXYZ(n_polar=8, n_azimuthal=16, scattering_order=1)
+
+    phys = DiscreteOrdinatesProblem(
+        mesh=grid,
+        num_groups=num_groups,
+        time_dependent=True,
+        groupsets=[
+            {
+                "groups_from_to": [0, num_groups - 1],
+                "angular_quadrature": pquad,
+                "inner_linear_method": "petsc_gmres",
+                "l_abs_tol": 1.0e-6,
+                "l_max_its": 300,
+                "gmres_restart_interval": 30,
+            },
+        ],
+        xs_map=[
+            {"block_ids": [0], "xs": xs_water},
+        ],
+        volumetric_sources=[src1],
+        options={"save_angular_flux": True},
+        sweep_type="CBC",
+    )
+
+    monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True)
+    dt = 0.01
+    theta_cn = 0.5
+    theta_be = 1.0
+    be_startup_steps = 2
+    stop_time = 0.1
+
+    solver = TransientSolver(problem=phys, dt=dt, theta=theta_be, initial_state="zero")
+    solver.Initialize()
+
+    current_time = 0.0
+    flux_max = 0.0
+    step = 0
+    fflist = phys.GetScalarFluxFieldFunction()
+    field_interp = FieldFunctionInterpolationVolume()
+    field_interp.SetOperationType("max")
+    field_interp.SetLogicalVolume(monitor_volume)
+    field_interp.AddFieldFunction(fflist[3])
+
+    while current_time < stop_time - 1.0e-14:
+        target_time = min(current_time + dt, stop_time)
+        solver.SetTimeStep(target_time - current_time)
+        theta_step = theta_be if step < be_startup_steps else theta_cn
+        solver.SetTheta(theta_step)
+        solver.Advance()
+        current_time = target_time
+        fflist[3].Update()
+        field_interp.Execute()
+        flux_max = field_interp.GetValue()
+        step += 1
+
+    if rank == 0:
+        print(f"Max phi(0.1s) = {flux_max:.6f}")

From a9b6634ab804539729697ebe0a08d528d0e8b2e9 Mon Sep 17 00:00:00 2001
From: Eappen Nelluvelil <eappen@tamu.edu>
Date: Sun, 12 Apr 2026 00:04:54 -0500
Subject: [PATCH 4/6] CBC_SPDS calculates max number of cell-face slots for
 local psi data storage during sweeps

---
 .../discrete_ordinates_problem.cc             |  88 +++
 .../sweep/scheduler/spmd_threadpool.h         |   2 +-
 .../sweep/spds/cbc.cc                         | 254 +++++--
 .../sweep/spds/cbc.h                          | 127 +++-
 .../sweep/spds/cbc_slot_planner.cc            | 622 ++++++++++++++++++
 .../sweep/spds/cbc_slot_planner.h             |  77 +++
 6 files changed, 1123 insertions(+), 47 deletions(-)
 create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.cc
 create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.h

diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc
index 1a375321e1..dddec78344 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc
@@ -17,6 +17,7 @@
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_chunk_td.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/iterative_methods/sweep_wgs_context.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/io/discrete_ordinates_problem_io.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/iterative_methods/ags_linear_solver.h"
@@ -43,11 +44,13 @@
 #include "framework/runtime.h"
 #include "caliper/cali.h"
 #include <algorithm>
+#include <atomic>
 #include <cassert>
 #include <cmath>
 #include <iomanip>
 #include <sstream>
 #include <stdexcept>
+#include <thread>
 
 namespace opensn
 {
@@ -1402,6 +1405,7 @@ DiscreteOrdinatesProblem::InitializeSweepDataStructures()
   }
   else if (sweep_type_ == "CBC")
   {
+    std::vector<std::shared_ptr<CBC_SPDS>> cbc_spds_list;
     // Build SPDS
     for (const auto& [quadrature, info] : quadrature_unq_so_grouping_map_)
     {
@@ -1416,8 +1420,92 @@ DiscreteOrdinatesProblem::InitializeSweepDataStructures()
         const auto new_swp_order =
           std::make_shared<CBC_SPDS>(omega, this->grid_, quadrature_allow_cycles_map_[quadrature]);
         quadrature_spds_map_[quadrature].push_back(new_swp_order);
+        cbc_spds_list.push_back(new_swp_order);
       }
     }
+
+    if (cbc_spds_list.size() == 1)
+    {
+      auto start_time = std::chrono::steady_clock::now();
+      cbc_spds_list.front()->ComputeMaxNumLocalPsiSlots();
+      auto end_time = std::chrono::steady_clock::now();
+      std::chrono::duration<double> elapsed_seconds = end_time - start_time;
+
+      const auto local_face_slots = cbc_spds_list.front()->GetMaxNumLocalPsiSlots();
+      log.Log() << program_timer.GetTimeString() << "CBC SPDS local cell-face psi slot summary\n"
+                << "   SPDS count  : 1\n"
+                << "   Elapsed     : " << elapsed_seconds.count() << " s\n"
+                << "   Max         : " << local_face_slots << "\n"
+                << "   Min         : " << local_face_slots << "\n"
+                << "   Median      : " << static_cast<double>(local_face_slots) << "\n"
+                << "   Average     : " << static_cast<double>(local_face_slots) << "\n";
+    }
+    else if (not cbc_spds_list.empty())
+    {
+      const auto hardware_threads = std::max<std::size_t>(1, std::thread::hardware_concurrency());
+      const auto num_workers = std::min(cbc_spds_list.size(), hardware_threads);
+
+      SPMD_ThreadPool pool(num_workers);
+      std::atomic<std::size_t> next_index{0};
+
+      log.Log() << program_timer.GetTimeString()
+                << " Compute max num local cell-face psi slots for " << cbc_spds_list.size()
+                << " CBC SPDS using " << num_workers << " worker threads.\n";
+
+      auto start_time = std::chrono::steady_clock::now();
+      pool.ExecuteBatch(
+        [&](std::size_t /* thread ID */)
+        {
+          std::size_t index;
+          // Atomically fetch the next index to work on
+          // std::memory_order_relaxed is sufficient here because we need atomicity only for the
+          // fetch_add operation, and there are no other synchronization requirements between
+          // threads for calculating max num local psi slots.
+          while ((index = next_index.fetch_add(1, std::memory_order_relaxed)) <
+                 cbc_spds_list.size())
+          {
+            cbc_spds_list[index]->ComputeMaxNumLocalPsiSlots();
+          }
+        });
+      auto end_time = std::chrono::steady_clock::now();
+      std::chrono::duration<double> elapsed_seconds = end_time - start_time;
+      double elapsed_time = elapsed_seconds.count();
+
+      size_t max_local_psi_slots = 0;
+      size_t min_local_psi_slots = std::numeric_limits<size_t>::max();
+      std::vector<size_t> local_psi_slot_counts;
+      local_psi_slot_counts.reserve(cbc_spds_list.size());
+
+      for (const auto& spds : cbc_spds_list)
+      {
+        const auto local_psi_slots = spds->GetMaxNumLocalPsiSlots();
+        max_local_psi_slots = std::max(max_local_psi_slots, local_psi_slots);
+        min_local_psi_slots = std::min(min_local_psi_slots, local_psi_slots);
+        local_psi_slot_counts.push_back(local_psi_slots);
+      }
+
+      std::sort(local_psi_slot_counts.begin(), local_psi_slot_counts.end());
+      const auto num_counts = local_psi_slot_counts.size();
+      const double avg_local_psi_slots =
+        static_cast<double>(std::accumulate(
+          local_psi_slot_counts.begin(), local_psi_slot_counts.end(), std::size_t{0})) /
+        num_counts;
+      const double median_local_psi_slots =
+        (num_counts % 2 == 1)
+          ? static_cast<double>(local_psi_slot_counts[num_counts / 2])
+          : 0.5 * static_cast<double>(local_psi_slot_counts[num_counts / 2 - 1] +
+                                      local_psi_slot_counts[num_counts / 2]);
+
+      log.Log() << program_timer.GetTimeString()
+                << " CBC SPDS local cell-face psi slot statistics\n"
+                << "    SPDS count : " << cbc_spds_list.size() << "\n"
+                << "    Workers    : " << num_workers << "\n"
+                << "    Elapsed    : " << elapsed_time << " s\n"
+                << "    Max        : " << max_local_psi_slots << "\n"
+                << "    Min        : " << min_local_psi_slots << "\n"
+                << "    Median     : " << median_local_psi_slots << "\n"
+                << "    Average    : " << avg_local_psi_slots << "\n";
+    }
   }
   else
     OpenSnInvalidArgument("Unsupported sweep type \"" + sweep_type_ + "\"");
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.h
index c0bb26071f..ca87d3e991 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.h
@@ -9,7 +9,6 @@
 #include <memory>
 #include <mutex>
 #include <new>
-#include <semaphore>
 #include <thread>
 #include <vector>
 
@@ -96,6 +95,7 @@ class SPMD_ThreadPool
       for (std::size_t i = 0; i < n; ++i)
         ++epoch_states_[i].request;
     }
+    cv_start_.notify_all();
     WaitAll();
   }
 
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.cc
index b99109a3a9..e7d880e1d7 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.cc
@@ -2,16 +2,168 @@
 // SPDX-License-Identifier: MIT
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h"
-#include "framework/mesh/mesh_continuum/mesh_continuum.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.h"
 #include "framework/logging/log.h"
-#include "framework/utils/timer.h"
+#include "framework/mesh/mesh_continuum/mesh_continuum.h"
 #include "framework/runtime.h"
 #include "caliper/cali.h"
 #include <boost/graph/topological_sort.hpp>
+#include <numeric>
+#include <stdexcept>
 
 namespace opensn
 {
 
+void
+CBC_SPDS::BuildTaskGraph()
+{
+  constexpr auto INCOMING = FaceOrientation::INCOMING;
+  constexpr auto OUTGOING = FaceOrientation::OUTGOING;
+
+  const auto num_loc_cells = grid_->local_cells.size();
+  task_list_.assign(num_loc_cells, Task{});
+  task_successor_rank_offsets_.assign(num_loc_cells + 1, 0);
+  task_successor_ranks_.clear();
+  task_successor_ranks_.reserve(num_loc_cells * 4);
+
+  for (std::size_t rank = 0; rank < topo_order_.size(); ++rank)
+  {
+    const auto& cell = grid_->local_cells[topo_order_[rank]];
+    unsigned int num_dependencies = 0;
+    std::vector<std::uint32_t> successors;
+
+    successors.reserve(cell.faces.size());
+    task_successor_rank_offsets_[rank] = static_cast<std::uint32_t>(task_successor_ranks_.size());
+    for (std::size_t f = 0; f < cell.faces.size(); ++f)
+    {
+      const auto& face = cell.faces[f];
+      const auto& orientation = cell_face_orientations_[cell.local_id][f];
+
+      if (orientation == INCOMING and face.has_neighbor)
+        ++num_dependencies;
+      else if ((orientation == OUTGOING) and (face.has_neighbor) and
+               (face.IsNeighborLocal(grid_.get())))
+      {
+        const auto successor_local_id = grid_->cells[face.neighbor_id].local_id;
+        successors.push_back(successor_local_id);
+        task_successor_ranks_.push_back(topo_rank_by_cell_local_id_[successor_local_id]);
+      }
+    }
+
+    task_list_[cell.local_id] =
+      Task{num_dependencies, std::move(successors), cell.local_id, &cell, false};
+  }
+  task_successor_rank_offsets_.back() = static_cast<std::uint32_t>(task_successor_ranks_.size());
+}
+
+void
+CBC_SPDS::BuildLocalFaceTaskGraph()
+{
+  // Each outgoing local face becomes one directed-face task.
+  // The task is keyed by the producer-cell topological rank, the consumer-cell topological
+  // rank, and the face-node count needed later when CBC/CBCD size the compact slot bank.
+  const auto num_loc_cells = grid_->local_cells.size();
+  cell_face_offsets_.assign(num_loc_cells + 1, 0);
+  size_t total_num_faces = 0;
+  for (const auto& cell : grid_->local_cells)
+  {
+    cell_face_offsets_[cell.local_id] = static_cast<std::uint32_t>(total_num_faces);
+    total_num_faces += cell.faces.size();
+  }
+  cell_face_offsets_.back() = static_cast<std::uint32_t>(total_num_faces);
+  outgoing_local_face_task_ids_.assign(total_num_faces, INVALID_LOCAL_FACE_TASK_ID);
+  incoming_local_face_task_ids_.assign(total_num_faces, INVALID_LOCAL_FACE_TASK_ID);
+
+  producer_cell_face_offsets_.assign(num_loc_cells + 1, 0);
+  local_face_producer_ranks_.clear();
+  local_face_consumer_ranks_.clear();
+  local_face_node_counts_.clear();
+  max_local_face_node_count_ = 0;
+
+  for (std::size_t producer_rank = 0; producer_rank < topo_order_.size(); ++producer_rank)
+  {
+    producer_cell_face_offsets_[producer_rank] =
+      static_cast<std::uint32_t>(local_face_producer_ranks_.size());
+
+    const auto producer_cell_local_id = topo_order_[producer_rank];
+    const auto& cell = grid_->local_cells[producer_cell_local_id];
+    const auto& face_orientations = cell_face_orientations_[producer_cell_local_id];
+
+    for (std::size_t f = 0; f < cell.faces.size(); ++f)
+    {
+      const auto& face = cell.faces[f];
+      const auto& orientation = face_orientations[f];
+      if ((orientation != FaceOrientation::OUTGOING) or (not face.IsNeighborLocal(grid_.get())))
+        continue;
+
+      const auto consumer_cell_local_id = face.GetNeighborLocalID(grid_.get());
+      const auto consumer_face_id =
+        static_cast<std::uint16_t>(face.GetNeighborAdjacentFaceIndex(grid_.get()));
+      const auto num_face_nodes = static_cast<std::uint32_t>(face.vertex_ids.size());
+      max_local_face_node_count_ =
+        std::max(max_local_face_node_count_, static_cast<std::size_t>(num_face_nodes));
+
+      const auto face_task_id = static_cast<std::uint32_t>(local_face_producer_ranks_.size());
+      local_face_producer_ranks_.push_back(static_cast<std::uint32_t>(producer_rank));
+      local_face_consumer_ranks_.push_back(topo_rank_by_cell_local_id_[consumer_cell_local_id]);
+      local_face_node_counts_.push_back(static_cast<std::uint16_t>(num_face_nodes));
+      outgoing_local_face_task_ids_[cell_face_offsets_[producer_cell_local_id] + f] = face_task_id;
+      incoming_local_face_task_ids_[cell_face_offsets_[consumer_cell_local_id] + consumer_face_id] =
+        face_task_id;
+    }
+  }
+
+  producer_cell_face_offsets_.back() =
+    static_cast<std::uint32_t>(local_face_producer_ranks_.size());
+  local_face_slot_ids_.resize(local_face_producer_ranks_.size());
+  std::iota(local_face_slot_ids_.begin(), local_face_slot_ids_.end(), std::uint32_t{0});
+}
+
+void
+CBC_SPDS::UpdateLocalFaceSlotLayout()
+{
+  // The slot planner only decides which faces may share one slot. The physical storage bank is
+  // then sized slot-by-slot by taking the maximum face-node extent over each slot chain.
+  local_face_slot_node_counts_.assign(max_num_local_psi_slots_, std::uint16_t{0});
+  local_face_slot_node_offsets_.assign(max_num_local_psi_slots_ + 1, std::uint32_t{0});
+  total_local_face_slot_nodes_ = 0;
+
+  bool is_identity_layout = max_num_local_psi_slots_ == local_face_slot_ids_.size();
+  for (std::size_t face_task_id = 0;
+       is_identity_layout and face_task_id < local_face_slot_ids_.size();
+       ++face_task_id)
+    is_identity_layout = local_face_slot_ids_[face_task_id] == face_task_id;
+
+  if (is_identity_layout)
+  {
+    for (std::size_t slot_id = 0; slot_id < local_face_node_counts_.size(); ++slot_id)
+    {
+      local_face_slot_node_counts_[slot_id] = local_face_node_counts_[slot_id];
+      local_face_slot_node_offsets_[slot_id] =
+        static_cast<std::uint32_t>(total_local_face_slot_nodes_);
+      total_local_face_slot_nodes_ += local_face_node_counts_[slot_id];
+    }
+    local_face_slot_node_offsets_.back() = static_cast<std::uint32_t>(total_local_face_slot_nodes_);
+    return;
+  }
+
+  for (std::size_t face_task_id = 0; face_task_id < local_face_slot_ids_.size(); ++face_task_id)
+  {
+    const auto slot_id = local_face_slot_ids_[face_task_id];
+    assert(slot_id < local_face_slot_node_counts_.size());
+    local_face_slot_node_counts_[slot_id] =
+      std::max(local_face_slot_node_counts_[slot_id], local_face_node_counts_[face_task_id]);
+  }
+
+  for (std::size_t slot_id = 0; slot_id < local_face_slot_node_counts_.size(); ++slot_id)
+  {
+    local_face_slot_node_offsets_[slot_id] =
+      static_cast<std::uint32_t>(total_local_face_slot_nodes_);
+    total_local_face_slot_nodes_ += local_face_slot_node_counts_[slot_id];
+  }
+  local_face_slot_node_offsets_.back() = static_cast<std::uint32_t>(total_local_face_slot_nodes_);
+}
+
 CBC_SPDS::CBC_SPDS(const Vector3& omega,
                    const std::shared_ptr<MeshContinuum>& grid,
                    bool allow_cycles)
@@ -21,7 +173,6 @@ CBC_SPDS::CBC_SPDS(const Vector3& omega,
 
   size_t num_loc_cells = grid->local_cells.size();
 
-  // Populate Cell Relationships
   std::vector<std::set<std::pair<std::uint32_t, double>>> cell_successors(num_loc_cells);
   std::set<int> location_successors;
   std::set<int> location_dependencies;
@@ -37,10 +188,8 @@ CBC_SPDS::CBC_SPDS(const Vector3& omega,
   for (auto v : location_dependencies)
     location_dependencies_.push_back(v);
 
-  // Build local cell graph
   Graph local_DG(num_loc_cells);
 
-  // Create graph edges
   for (size_t c = 0; c < num_loc_cells; ++c) // NOLINT
     for (const auto& successor : cell_successors[c])
       boost::add_edge(c, successor.first, successor.second, local_DG);
@@ -48,11 +197,10 @@ CBC_SPDS::CBC_SPDS(const Vector3& omega,
   if (allow_cycles) // NOLINT
   {
     auto edges_to_remove = RemoveCyclicDependencies(local_DG);
-    for (auto& edge_to_remove : edges_to_remove)
-      local_sweep_fas_.emplace_back(edge_to_remove.first, edge_to_remove.second);
+    for (const auto& [u, v] : edges_to_remove)
+      local_sweep_fas_.emplace_back(u, v);
   }
 
-  // Generate topological sorting
   spls_.clear();
   boost::topological_sort(local_DG, std::back_inserter(spls_)); // NOLINT
   std::reverse(spls_.begin(), spls_.end());
@@ -62,44 +210,76 @@ CBC_SPDS::CBC_SPDS(const Vector3& omega,
                            "Cycles need to be allowed by the calling application.");
   }
 
-  // Create task list
-  std::vector<std::vector<int>> global_dependencies;
-  global_dependencies.resize(opensn::mpi_comm.size());
+  topo_order_.assign(spls_.begin(), spls_.end());
+  topo_rank_by_cell_local_id_.assign(num_loc_cells, 0);
+  for (std::size_t rank = 0; rank < topo_order_.size(); ++rank)
+    topo_rank_by_cell_local_id_[topo_order_[rank]] = static_cast<std::uint32_t>(rank);
+
+  std::vector<std::vector<int>> global_dependencies(opensn::mpi_comm.size());
   CommunicateLocationDependencies(location_dependencies_, global_dependencies);
+  BuildTaskGraph();
+  BuildLocalFaceTaskGraph();
 
-  constexpr auto INCOMING = FaceOrientation::INCOMING;
-  constexpr auto OUTGOING = FaceOrientation::OUTGOING;
+  max_num_local_psi_slots_ = local_face_producer_ranks_.size();
+  UpdateLocalFaceSlotLayout();
+}
 
-  // For each local cell create a task
-  for (const auto& cell : grid_->local_cells)
-  {
-    const size_t num_faces = cell.faces.size();
-    unsigned int num_dependencies = 0;
-    std::vector<std::uint32_t> successors;
+const std::vector<Task>&
+CBC_SPDS::GetTaskList() const noexcept
+{
+  return task_list_;
+}
 
-    for (size_t f = 0; f < num_faces; ++f)
-    {
-      if (cell_face_orientations_[cell.local_id][f] == INCOMING)
-      {
-        if (cell.faces[f].has_neighbor)
-          ++num_dependencies;
-      }
-      else if (cell_face_orientations_[cell.local_id][f] == OUTGOING)
-      {
-        const auto& face = cell.faces[f];
-        if (face.has_neighbor and grid->IsCellLocal(face.neighbor_id))
-          successors.push_back(grid->cells[face.neighbor_id].local_id);
-      }
-    }
+void
+CBC_SPDS::ComputeMaxNumLocalPsiSlots()
+{
+  CALI_CXX_MARK_SCOPE("CBC_SPDS::ComputeMaxNumLocalPsiSlots");
 
-    task_list_.push_back({num_dependencies, successors, cell.local_id, &cell, false});
+  if (task_list_.empty()) // no local cell tasks: nothing to plan
+  {
+    max_num_local_psi_slots_ = 0;
+    local_face_slot_ids_.clear();
+    UpdateLocalFaceSlotLayout();
+    return;
   }
+
+  if (local_face_producer_ranks_.empty()) // no local directed faces: zero slots
+  {
+    max_num_local_psi_slots_ = 0;
+    local_face_slot_ids_.clear();
+    UpdateLocalFaceSlotLayout();
+    return;
+  }
+
+  // Solve the exact minimum chain cover of the local-face reuse poset, then turn that chain
+  // decomposition into a static slot assignment and compact slot-bank layout.
+  const auto result = detail::ComputeLocalFaceSlotPlan(task_successor_rank_offsets_,
+                                                       task_successor_ranks_,
+                                                       local_face_producer_ranks_,
+                                                       local_face_consumer_ranks_,
+                                                       producer_cell_face_offsets_,
+                                                       local_face_slot_ids_);
+  max_num_local_psi_slots_ = result.slot_count;
+  UpdateLocalFaceSlotLayout();
+  if (result.verifier_rejected)
+    opensn::log.LogAllWarning()
+      << "CBC_SPDS::ComputeMaxNumLocalPsiSlots: local cell-face slot assignment verifier rejected "
+      << "the computed slot count; falling back to the identity assignment "
+      << "(one slot per local directed face, no reuse).";
 }
 
-const std::vector<Task>&
-CBC_SPDS::GetTaskList() const
+std::uint32_t
+CBC_SPDS::GetOutgoingLocalFaceTaskID(const std::uint32_t cell_local_id,
+                                     const unsigned int face_id) const noexcept
 {
-  return task_list_;
+  return outgoing_local_face_task_ids_[cell_face_offsets_[cell_local_id] + face_id];
+}
+
+std::uint32_t
+CBC_SPDS::GetIncomingLocalFaceTaskID(const std::uint32_t cell_local_id,
+                                     const unsigned int face_id) const noexcept
+{
+  return incoming_local_face_task_ids_[cell_face_offsets_[cell_local_id] + face_id];
 }
 
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h
index 09135920f5..b2de98ad50 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h
@@ -5,28 +5,137 @@
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/spds.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/sweep.h"
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <vector>
 
 namespace opensn
 {
 
+/**
+ * Cell-by-cell sweep plane data structure and exact local-face slot metadata.
+ *
+ * This class stores the local CBC task DAG for one sweep direction, its topological ordering,
+ * and the local directed-face metadata consumed by both the host CBC FLUDS and the device
+ * CBCD FLUDS. Each outgoing local face is represented by one directed-face task with a
+ * producer-cell rank, a consumer-cell rank, and a face-node count.
+ *
+ * `ComputeMaxNumLocalPsiSlots()` computes the exact minimum safe number of reusable local-face
+ * slots and the corresponding static face-to-slot map. The result is shared by the host and
+ * device CBC implementations. `UpdateLocalFaceSlotLayout()` converts the face-to-slot mapping
+ * plan into a compact slot bank sized by the maximum face-node extent within each slot.
+ */
 class CBC_SPDS : public SPDS
 {
 public:
+  /// Value returned when a local face does not participate in the requested face-task map.
+  static constexpr std::uint32_t INVALID_LOCAL_FACE_TASK_ID =
+    std::numeric_limits<std::uint32_t>::max();
+
+  /// Construct the CBC sweep plane data structure for one angular direction.
+  CBC_SPDS(const Vector3& omega, const std::shared_ptr<MeshContinuum>& grid, bool allow_cycles);
+
+  /// Return the local CBC task list.
+  const std::vector<Task>& GetTaskList() const noexcept;
+
   /**
-   * Constructs a cell-by-cell sweep-plane data strcture (SPDS) with the given direction and grid.
+   * Compute the exact minimum number of reusable local-face psi slots.
+   *
+   * The local directed faces define a poset under the safe reuse relation. A chain in this
+   * poset is one statically reusable slot. The required slot count is therefore the minimum
+   * chain-cover cardinality, equivalently the maximum antichain cardinality by Dilworth's
+   * theorem.
    *
-   * \param omega The angular direction vector.
-   * \param grid Reference to the grid.
-   * \param allow_cycles Whether cycles are allowed in the local sweep dependency graph.
+   * The planner obtains this value from a maximum cardinality matching on the bipartite
+   * split graph of the reuse relation. The extracted chains define the static slot IDs.
+   * `UpdateLocalFaceSlotLayout()` then sizes each slot by the maximum face-node extent
+   * over the faces assigned to that slot.
    */
-  CBC_SPDS(const Vector3& omega, const std::shared_ptr<MeshContinuum>& grid, bool allow_cycles);
+  void ComputeMaxNumLocalPsiSlots();
+
+  std::size_t GetMaxNumLocalPsiSlots() const noexcept { return max_num_local_psi_slots_; }
+
+  const std::vector<std::uint32_t>& GetLocalFaceSlotIDs() const noexcept
+  {
+    return local_face_slot_ids_;
+  }
+
+  const std::vector<std::uint32_t>& GetLocalFaceSlotNodeOffsets() const noexcept
+  {
+    return local_face_slot_node_offsets_;
+  }
+
+  const std::vector<std::uint16_t>& GetLocalFaceSlotNodeCounts() const noexcept
+  {
+    return local_face_slot_node_counts_;
+  }
 
-  /// Returns the cell-by-cell task list.
-  const std::vector<Task>& GetTaskList() const;
+  std::size_t GetTotalLocalFaceSlotNodes() const noexcept { return total_local_face_slot_nodes_; }
 
-protected:
-  /// Cell-by-cell task list.
+  std::size_t GetMaxLocalFaceNodeCount() const noexcept { return max_local_face_node_count_; }
+
+  /// Return the local directed-face task ID for an outgoing local face.
+  std::uint32_t GetOutgoingLocalFaceTaskID(std::uint32_t cell_local_id,
+                                           unsigned int face_id) const noexcept;
+
+  /// Return the local directed-face task ID for an incoming local face.
+  std::uint32_t GetIncomingLocalFaceTaskID(std::uint32_t cell_local_id,
+                                           unsigned int face_id) const noexcept;
+
+  ~CBC_SPDS() override = default;
+
+private:
+  /// Build the local cell task DAG and its successor-rank adjacency.
+  void BuildTaskGraph();
+
+  /// Enumerate local directed faces and map them to producer and consumer cell ranks.
+  void BuildLocalFaceTaskGraph();
+
+  /// Topological ordering of local cell IDs: topo_order_[rank] = cell_local_id.
+  std::vector<std::uint32_t> topo_order_;
+  /// Topological rank keyed by local cell ID.
+  std::vector<std::uint32_t> topo_rank_by_cell_local_id_;
+  /// Per-cell task descriptors with successor adjacency lists.
   std::vector<Task> task_list_;
+  /// Offsets into the flat successor-rank array indexed by topological task rank.
+  std::vector<std::uint32_t> task_successor_rank_offsets_;
+  /// Flat successor topological ranks grouped by producer task rank.
+  std::vector<std::uint32_t> task_successor_ranks_;
+  /// Flat face-table offsets indexed by cell local IDs.
+  std::vector<std::uint32_t> cell_face_offsets_;
+  /// Flat outgoing local-face task IDs indexed by face storage index.
+  std::vector<std::uint32_t> outgoing_local_face_task_ids_;
+  /// Flat incoming local-face task IDs indexed by face storage index.
+  std::vector<std::uint32_t> incoming_local_face_task_ids_;
+  /// Face-rank offsets grouped by producer-cell topological rank.
+  std::vector<std::uint32_t> producer_cell_face_offsets_;
+  /// Producer-cell topological rank for each local directed face.
+  std::vector<std::uint32_t> local_face_producer_ranks_;
+  /// Consumer-cell topological rank for each local directed face.
+  std::vector<std::uint32_t> local_face_consumer_ranks_;
+  /// Number of nodes for each local directed face task.
+  std::vector<std::uint16_t> local_face_node_counts_;
+  /// Static slot assignment: local_face_slot_ids_[face_task_id] = slot_id.
+  std::vector<std::uint32_t> local_face_slot_ids_;
+  /// Slot-local node extents: local_face_slot_node_counts_[slot_id] = max nodes in that slot.
+  std::vector<std::uint16_t> local_face_slot_node_counts_;
+  /// Prefix offsets into the compact local-face slot bank.
+  std::vector<std::uint32_t> local_face_slot_node_offsets_;
+  /// Minimum number of local-face angular flux storage slots.
+  std::size_t max_num_local_psi_slots_ = 0;
+  /// Total number of local-face nodes in the compact slot bank.
+  std::size_t total_local_face_slot_nodes_ = 0;
+  /// Maximum number of nodes across all local directed faces.
+  std::size_t max_local_face_node_count_ = 0;
+
+  /**
+   * Recompute slot-local node extents and prefix offsets from the current slot assignment.
+   *
+   * Each slot is sized to the maximum face-node extent of the local directed faces assigned
+   * to that slot. This preserves the exact slot count while avoiding one global slot extent.
+   */
+  void UpdateLocalFaceSlotLayout();
 };
 
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.cc
new file mode 100644
index 0000000000..3f8e3b71d3
--- /dev/null
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.cc
@@ -0,0 +1,622 @@
+// SPDX-FileCopyrightText: 2026 The OpenSn Authors <https://open-sn.github.io/opensn/>
+// SPDX-License-Identifier: MIT
+
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.h"
+#include <algorithm>
+#include <bit>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <numeric>
+#include <vector>
+
+namespace opensn::detail
+{
+
+// Planner overview:
+// 1. Build the reflexive transitive closure of the local CBC task DAG.
+// 2. Define a face-poset reuse relation: face u may precede face v in one slot if the
+//    consumer cell of u reaches the producer cell of v in the task DAG.
+// 3. Solve the resulting minimum chain-cover problem exactly through the standard bipartite
+//    maximum-matching reduction.
+// 4. Extract one slot chain per unmatched right-side face and verify the resulting static
+//    handoff sequence before exposing it to CBC_SPDS.
+
+constexpr std::uint32_t INVALID_INDEX = std::numeric_limits<std::uint32_t>::max();
+
+// Bit-packed reachability matrix for the local cell DAG.
+// Rows are padded so the closure builder can copy and OR contiguous word spans efficiently.
+class BitMatrix
+{
+public:
+  void ResizeAndClear(const std::size_t n)
+  {
+    n_ = n;
+    active_words_per_row_ = (n + 63) / 64;
+    padded_words_per_row_ = (active_words_per_row_ + 7) & ~std::size_t{7};
+    const std::size_t required_words = n_ * padded_words_per_row_;
+    if (data_.size() < required_words)
+      data_.resize(required_words);
+    if (row_active_word_counts_.size() < n_)
+      row_active_word_counts_.resize(n_);
+    std::fill_n(data_.begin(), required_words, 0ULL);
+    std::fill_n(row_active_word_counts_.begin(), n_, std::size_t{0});
+  }
+
+  std::uint64_t* Row(const std::size_t i) noexcept
+  {
+    return data_.data() + i * padded_words_per_row_;
+  }
+
+  const std::uint64_t* Row(const std::size_t i) const noexcept
+  {
+    return data_.data() + i * padded_words_per_row_;
+  }
+
+  void SetBit(const std::size_t i, const std::size_t j) noexcept
+  {
+    Row(i)[j / 64] |= (1ULL << (j % 64));
+    row_active_word_counts_[i] = std::max(row_active_word_counts_[i], (j / 64) + 1);
+  }
+
+  bool TestBit(const std::size_t i, const std::size_t j) const noexcept
+  {
+    return (Row(i)[j / 64] & (1ULL << (j % 64))) != 0ULL;
+  }
+
+  void CopyRowFromWord(const std::size_t dst,
+                       const BitMatrix& src_mat,
+                       const std::size_t src_row,
+                       const std::size_t start_word) noexcept
+  {
+    const std::size_t src_active_words = src_mat.row_active_word_counts_[src_row];
+    if (start_word >= src_active_words)
+    {
+      row_active_word_counts_[dst] =
+        std::max(row_active_word_counts_[dst], std::min(start_word, active_words_per_row_));
+      return;
+    }
+
+    std::uint64_t* const d = Row(dst) + start_word;
+    const std::uint64_t* const s = src_mat.Row(src_row) + start_word;
+    const std::size_t words_to_copy = src_active_words - start_word;
+    std::memcpy(d, s, words_to_copy * sizeof(std::uint64_t));
+    row_active_word_counts_[dst] = src_active_words;
+  }
+
+  void OrRowsFromWord(const std::size_t dst,
+                      const BitMatrix& src_mat,
+                      const std::size_t src_row,
+                      const std::size_t start_word) noexcept
+  {
+    const std::size_t src_active_words = src_mat.row_active_word_counts_[src_row];
+    if (start_word >= src_active_words)
+      return;
+
+    std::uint64_t* const d = Row(dst) + start_word;
+    const std::uint64_t* const s = src_mat.Row(src_row) + start_word;
+    const std::size_t words_to_process = src_active_words - start_word;
+    for (std::size_t w = 0; w < words_to_process; ++w)
+      d[w] |= s[w];
+    row_active_word_counts_[dst] = std::max(row_active_word_counts_[dst], src_active_words);
+  }
+
+  std::size_t FindFirstSet(const std::size_t row, const std::size_t start_pos = 0) const noexcept
+  {
+    const std::uint64_t* const r = Row(row);
+    std::size_t w = start_pos / 64;
+    const std::size_t active_words = row_active_word_counts_[row];
+    if (w >= active_words)
+      return n_;
+
+    std::uint64_t masked = r[w] & (~0ULL << (start_pos % 64));
+    if (masked)
+      return w * 64 + static_cast<std::size_t>(std::countr_zero(masked));
+
+    for (++w; w < active_words; ++w)
+    {
+      if (r[w])
+        return w * 64 + static_cast<std::size_t>(std::countr_zero(r[w]));
+    }
+    return n_;
+  }
+
+  std::size_t FindNextSet(const std::size_t row, const std::size_t pos) const noexcept
+  {
+    return FindFirstSet(row, pos + 1);
+  }
+
+private:
+  std::size_t n_ = 0;
+  std::size_t active_words_per_row_ = 0;
+  std::size_t padded_words_per_row_ = 0;
+  std::vector<std::size_t> row_active_word_counts_;
+  std::vector<std::uint64_t> data_;
+};
+
+struct DFSFrame
+{
+  std::uint32_t u_face_rank = INVALID_INDEX;     // left-side face this frame is expanding
+  std::uint32_t via_v_face_rank = INVALID_INDEX; // right-side face that led to this frame
+  std::uint32_t producer_rank_index = 0;         // presumably cursor over candidate producer ranks
+  std::uint32_t producer_rank_end = 0;           // presumably end of that producer-rank range
+  std::uint32_t next_v_face_rank = 0;            // next right-side face to try
+  std::uint32_t v_face_end = 0;                  // presumably end of the current face range
+};
+
+struct ThreadLocalWorkspace
+{
+  BitMatrix reachability;
+  std::vector<std::uint32_t> face_mate_u;
+  std::vector<std::uint32_t> face_mate_v;
+  std::vector<int> face_dist;
+  std::vector<std::uint32_t> face_queue;
+  std::vector<std::uint32_t> consumer_rank_face_offsets;
+  std::vector<std::uint32_t> consumer_rank_face_write_offsets;
+  std::vector<std::uint32_t> faces_by_consumer_rank;
+  std::vector<std::uint32_t> candidate_producer_rank_offsets;
+  std::vector<std::uint32_t> candidate_producer_ranks;
+  std::vector<std::uint32_t> candidate_face_counts_by_consumer_rank;
+  std::vector<std::uint32_t> greedy_consumer_rank_order;
+  std::vector<std::uint32_t> face_last_rank_for_slot;
+  std::vector<DFSFrame> dfs_frames;
+
+  void PrepareMatching(const std::size_t num_consumer_ranks, const std::size_t num_faces)
+  {
+    face_mate_u.assign(num_faces, INVALID_INDEX);
+    face_mate_v.assign(num_faces, INVALID_INDEX);
+    face_dist.assign(num_faces, -1);
+    if (face_queue.size() < num_faces)
+      face_queue.resize(num_faces);
+    if (face_last_rank_for_slot.size() < num_faces)
+      face_last_rank_for_slot.resize(num_faces);
+    if (dfs_frames.capacity() < num_faces)
+      dfs_frames.reserve(num_faces);
+    consumer_rank_face_offsets.assign(num_consumer_ranks + 1, 0);
+    consumer_rank_face_write_offsets.assign(num_consumer_ranks, 0);
+    faces_by_consumer_rank.assign(num_faces, INVALID_INDEX);
+    candidate_producer_rank_offsets.assign(num_consumer_ranks + 1, 0);
+    candidate_face_counts_by_consumer_rank.assign(num_consumer_ranks, 0);
+    greedy_consumer_rank_order.clear();
+    candidate_producer_ranks.clear();
+  }
+};
+
+namespace
+{
+
+// Build the reflexive transitive closure of the local cell DAG in topological-rank space.
+void
+BuildReachability(const std::uint32_t num_tasks,
+                  const std::vector<std::uint32_t>& successor_rank_offsets,
+                  const std::vector<std::uint32_t>& successor_ranks,
+                  ThreadLocalWorkspace& ws)
+{
+  ws.reachability.ResizeAndClear(num_tasks);
+  for (std::uint32_t i = num_tasks; i-- > 0;) // descending; assumes successor ranks > i
+  {
+    const auto successor_begin = successor_ranks.begin() + successor_rank_offsets[i];
+    const auto successor_end = successor_ranks.begin() + successor_rank_offsets[i + 1];
+    const auto start_word = static_cast<std::size_t>(i / 64); // words below rank i stay clear
+
+    ws.reachability.SetBit(i, i);
+    if (successor_begin == successor_end)
+      continue;
+
+    // OR in every finished successor row; a raw row copy would overwrite bit (i, i) set above.
+    for (auto it = successor_begin; it != successor_end; ++it)
+      ws.reachability.OrRowsFromWord(i, ws.reachability, *it, start_word);
+  }
+}
+
+} // namespace
+
+// Exact minimum chain-cover solver for the local-face reuse poset.
+//
+// The bipartite graph is never explicitly materialized. Candidate right vertices are generated
+// on demand from the cached reachability rows and from the producer-face grouping created by
+// CBC_SPDS::BuildLocalFaceTaskGraph(), which avoids the memory cost of an explicit dense
+// face-to-face adjacency structure.
+class LocalFaceHopcroftKarp
+{
+public:
+  LocalFaceHopcroftKarp(const std::vector<std::uint32_t>& face_producer_ranks,
+                        const std::vector<std::uint32_t>& face_consumer_ranks,
+                        const std::vector<std::uint32_t>& producer_cell_face_offsets,
+                        std::vector<std::uint32_t>& face_slot_ids,
+                        ThreadLocalWorkspace& ws)
+    : num_faces_(static_cast<std::uint32_t>(face_producer_ranks.size())),
+      face_producer_ranks_(face_producer_ranks),
+      face_consumer_ranks_(face_consumer_ranks),
+      producer_cell_face_offsets_(producer_cell_face_offsets),
+      face_slot_ids_(face_slot_ids),
+      ws_(ws)
+  {
+    ws_.PrepareMatching(producer_cell_face_offsets_.size() - 1, num_faces_);
+    PrepareConsumerFaceCache();
+    PrepareCandidateProducerRankCache();
+    PrepareGreedyOrder();
+  }
+
+  SlotSolveResult Solve()
+  {
+    if (num_faces_ == 0)
+    {
+      face_slot_ids_.clear();
+      return {};
+    }
+
+    // Greedy seeding increases the initial matching size and reduces the
+    // number of BFS/DFS phases that follow.
+    std::size_t matching_size = GreedyInit();
+    while (BFS())
+    {
+      for (std::uint32_t i = 0; i < num_faces_; ++i)
+      {
+        if (ws_.face_mate_u[i] == INVALID_INDEX and DFS(i))
+          ++matching_size;
+      }
+    }
+
+    ExtractSlotAssignment();
+    const std::size_t slot_count = static_cast<std::size_t>(num_faces_) - matching_size;
+    if (VerifySlotAssignment(slot_count))
+      return {slot_count, false};
+
+    std::iota(face_slot_ids_.begin(), face_slot_ids_.end(), std::uint32_t{0}); // slot i = face i
+    return {static_cast<std::size_t>(num_faces_), true};
+  }
+
+private:
+  template <class F>
+  void ForEachCandidate(const std::uint32_t u_face_rank, const F& fn) const
+  {
+    // The bipartite graph is implicit. For one left-side face u, the admissible right-side
+    // faces are all faces whose producer ranks lie in the cached reachable-producer row of
+    // u's consumer rank.
+    const auto consumer_cell_rank = face_consumer_ranks_[u_face_rank];
+    const auto rank_begin = ws_.candidate_producer_rank_offsets[consumer_cell_rank];
+    const auto rank_end = ws_.candidate_producer_rank_offsets[consumer_cell_rank + 1];
+    for (std::uint32_t rank_index = rank_begin; rank_index < rank_end; ++rank_index)
+    {
+      const auto producer_cell_rank = ws_.candidate_producer_ranks[rank_index];
+      const auto face_begin = producer_cell_face_offsets_[producer_cell_rank];
+      const auto face_end = producer_cell_face_offsets_[producer_cell_rank + 1];
+      for (std::uint32_t v_face_rank = face_begin; v_face_rank < face_end; ++v_face_rank)
+      {
+        if (fn(v_face_rank))
+          return;
+      }
+    }
+  }
+
+  bool ReuseRelationHolds(const std::uint32_t u_face_rank,
+                          const std::uint32_t v_face_rank) const noexcept
+  {
+    return ws_.reachability.TestBit(face_consumer_ranks_[u_face_rank],
+                                    face_producer_ranks_[v_face_rank]);
+  }
+
+  void ExtractSlotAssignment()
+  {
+    // Every unmatched right vertex starts one chain. Following the matched left-to-right
+    // links recovers the full chain, and each chain becomes one reusable slot.
+    face_slot_ids_.assign(num_faces_, INVALID_INDEX);
+    std::uint32_t next_slot_id = 0;
+    for (std::uint32_t i = 0; i < num_faces_; ++i)
+    {
+      if (ws_.face_mate_v[i] != INVALID_INDEX)
+        continue;
+
+      std::uint32_t current = i;
+      while (current != INVALID_INDEX)
+      {
+        face_slot_ids_[current] = next_slot_id;
+        current = ws_.face_mate_u[current];
+      }
+      ++next_slot_id;
+    }
+  }
+
+  bool VerifySlotAssignment(const std::size_t slot_count) const
+  {
+    for (std::uint32_t face = 0; face < num_faces_; ++face)
+    {
+      if (face_slot_ids_[face] >= slot_count)
+        return false;
+    }
+
+    std::fill_n(ws_.face_last_rank_for_slot.begin(), slot_count, INVALID_INDEX);
+    for (std::uint32_t rank = 0; rank < num_faces_; ++rank)
+    {
+      // It is sufficient to check consecutive faces within one extracted chain.
+      // Transitivity of the reuse relation then covers the full chain.
+      const auto slot_id = face_slot_ids_[rank];
+      const auto prev_rank = ws_.face_last_rank_for_slot[slot_id];
+      if ((prev_rank != INVALID_INDEX) and (not ReuseRelationHolds(prev_rank, rank)))
+        return false;
+      ws_.face_last_rank_for_slot[slot_id] = rank;
+    }
+    return true;
+  }
+
+  std::size_t GreedyInit()
+  {
+    std::size_t count = 0;
+    for (const auto consumer_rank : ws_.greedy_consumer_rank_order)
+    {
+      // Process the scarcest consumer rows first.
+      // This preserves exactness while giving the greedy phase a better
+      // chance of seeding a large initial matching.
+      const auto face_begin = ws_.consumer_rank_face_offsets[consumer_rank];
+      const auto face_end = ws_.consumer_rank_face_offsets[consumer_rank + 1];
+      for (std::uint32_t face_index = face_begin; face_index < face_end; ++face_index)
+      {
+        const auto u_face_rank = ws_.faces_by_consumer_rank[face_index];
+        if (ws_.face_mate_u[u_face_rank] != INVALID_INDEX)
+          continue;
+
+        ForEachCandidate(u_face_rank,
+                         [&](const std::uint32_t v_face_rank) -> bool
+                         {
+                           if (ws_.face_mate_v[v_face_rank] != INVALID_INDEX)
+                             return false;
+                           ws_.face_mate_u[u_face_rank] = v_face_rank;
+                           ws_.face_mate_v[v_face_rank] = u_face_rank;
+                           ++count;
+                           return true;
+                         });
+      }
+    }
+    return count;
+  }
+
+  bool BFS()
+  {
+    // Hopcroft-Karp BFS: build distance labels from all unmatched left vertices and
+    // stop at the first layer that reaches the null vertex.
+    std::fill_n(ws_.face_dist.begin(), num_faces_, -1);
+    std::size_t head = 0;
+    std::size_t tail = 0;
+
+    for (std::uint32_t i = 0; i < num_faces_; ++i)
+    {
+      if (ws_.face_mate_u[i] != INVALID_INDEX)
+        continue;
+      ws_.face_dist[i] = 0;
+      ws_.face_queue[tail++] = i;
+    }
+
+    dist_null_ = std::numeric_limits<int>::max();
+    while (head < tail)
+    {
+      const auto u_face_rank = ws_.face_queue[head++];
+      if (ws_.face_dist[u_face_rank] >= dist_null_)
+        continue;
+
+      ForEachCandidate(u_face_rank,
+                       [&](const std::uint32_t v_face_rank) -> bool
+                       {
+                         const auto mate_of_v = ws_.face_mate_v[v_face_rank];
+                         if (mate_of_v == INVALID_INDEX)
+                         {
+                           if (dist_null_ == std::numeric_limits<int>::max())
+                             dist_null_ = ws_.face_dist[u_face_rank] + 1;
+                         }
+                         else if (ws_.face_dist[mate_of_v] == -1)
+                         {
+                           ws_.face_dist[mate_of_v] = ws_.face_dist[u_face_rank] + 1;
+                           ws_.face_queue[tail++] = mate_of_v;
+                         }
+                         return false;
+                       });
+    }
+
+    return dist_null_ != std::numeric_limits<int>::max();
+  }
+
+  bool DFS(const std::uint32_t u_face_rank)
+  {
+    // Hopcroft-Karp DFS, implemented iteratively.
+    // Each frame represents one left vertex together with the current
+    // position in its implicit adjacency row. This avoids recursion while
+    // preserving the same augmenting-path search.
+    ws_.dfs_frames.clear();
+    PushDFSFrame(u_face_rank, INVALID_INDEX);
+
+    while (not ws_.dfs_frames.empty())
+    {
+      auto& frame = ws_.dfs_frames.back();
+      const auto current_u = frame.u_face_rank;
+      const auto current_dist = ws_.face_dist[current_u];
+
+      bool descended = false;
+      while (AdvanceFrame(frame))
+      {
+        const auto v_face_rank = frame.next_v_face_rank++;
+        const auto mate_of_v = ws_.face_mate_v[v_face_rank];
+        if (mate_of_v == INVALID_INDEX)
+        {
+          if (dist_null_ != current_dist + 1)
+            continue;
+
+          // An augmenting path has been found. Walk back through the explicit stack and flip
+          // the matching along the full alternating path.
+          ws_.face_mate_v[v_face_rank] = current_u;
+          ws_.face_mate_u[current_u] = v_face_rank;
+          ws_.face_dist[current_u] = -1;
+          for (std::size_t depth = ws_.dfs_frames.size(); depth-- > 1;)
+          {
+            const auto parent_u = ws_.dfs_frames[depth - 1].u_face_rank;
+            const auto via_v_face_rank = ws_.dfs_frames[depth].via_v_face_rank;
+            ws_.face_mate_v[via_v_face_rank] = parent_u;
+            ws_.face_mate_u[parent_u] = via_v_face_rank;
+            ws_.face_dist[parent_u] = -1;
+          }
+          return true;
+        }
+
+        if (ws_.face_dist[mate_of_v] != current_dist + 1)
+          continue;
+
+        PushDFSFrame(mate_of_v, v_face_rank);
+        descended = true;
+        break;
+      }
+
+      if (descended)
+        continue;
+
+      ws_.face_dist[current_u] = -1;
+      ws_.dfs_frames.pop_back();
+    }
+
+    return false;
+  }
+
+  void PrepareConsumerFaceCache()
+  {
+    // Regroup left-side faces by consumer rank once so both greedy seeding and layered
+    // matching traverse contiguous face ranges instead of repeatedly filtering the face list.
+    for (const auto consumer_rank : face_consumer_ranks_)
+      ++ws_.consumer_rank_face_offsets[consumer_rank + 1];
+
+    std::partial_sum(ws_.consumer_rank_face_offsets.begin(),
+                     ws_.consumer_rank_face_offsets.end(),
+                     ws_.consumer_rank_face_offsets.begin());
+    std::copy_n(ws_.consumer_rank_face_offsets.begin(),
+                ws_.consumer_rank_face_write_offsets.size(),
+                ws_.consumer_rank_face_write_offsets.begin());
+
+    for (std::uint32_t u_face_rank = 0; u_face_rank < num_faces_; ++u_face_rank)
+    {
+      const auto consumer_rank = face_consumer_ranks_[u_face_rank];
+      const auto write_index = ws_.consumer_rank_face_write_offsets[consumer_rank]++;
+      ws_.faces_by_consumer_rank[write_index] = u_face_rank;
+    }
+  }
+
+  void PrepareCandidateProducerRankCache()
+  {
+    // Cache the sparse producer-rank rows of the implicit bipartite graph. All faces with
+    // the same consumer rank share the same reachable producer ranks.
+    const auto num_consumer_ranks = producer_cell_face_offsets_.size() - 1;
+    for (std::size_t consumer_rank = 0; consumer_rank < num_consumer_ranks; ++consumer_rank)
+    {
+      ws_.candidate_producer_rank_offsets[consumer_rank] =
+        static_cast<std::uint32_t>(ws_.candidate_producer_ranks.size());
+      // A consumer rank with no left-side faces keeps an empty row (and no face count is stored).
+      if (ws_.consumer_rank_face_offsets[consumer_rank] ==
+          ws_.consumer_rank_face_offsets[consumer_rank + 1])
+        continue;
+      // Scan this reachability row via FindFirstSet/FindNextSet, starting at consumer_rank itself.
+      std::uint32_t candidate_face_count = 0;
+      for (std::size_t producer_rank = ws_.reachability.FindFirstSet(consumer_rank, consumer_rank);
+           producer_rank < num_consumer_ranks;
+           producer_rank = ws_.reachability.FindNextSet(consumer_rank, producer_rank))
+      {
+        const auto face_begin = producer_cell_face_offsets_[producer_rank];
+        const auto face_end = producer_cell_face_offsets_[producer_rank + 1];
+        if (face_begin == face_end) // producer rank owns no faces, so it adds no right vertices
+          continue;
+
+        ws_.candidate_producer_ranks.push_back(static_cast<std::uint32_t>(producer_rank));
+        candidate_face_count += face_end - face_begin;
+      }
+      ws_.candidate_face_counts_by_consumer_rank[consumer_rank] = candidate_face_count;
+    }
+    // Closing sentinel so that row r spans offsets[r] .. offsets[r + 1].
+    ws_.candidate_producer_rank_offsets.back() =
+      static_cast<std::uint32_t>(ws_.candidate_producer_ranks.size());
+  }
+
+  void PrepareGreedyOrder()
+  {
+    // Collect the consumer ranks that own at least one left-side face, then order them by
+    // increasing right-side candidate count. Only the heuristic seed matching depends on this.
+    const auto num_consumer_ranks = producer_cell_face_offsets_.size() - 1;
+    ws_.greedy_consumer_rank_order.reserve(num_consumer_ranks);
+    for (std::uint32_t consumer_rank = 0; consumer_rank < num_consumer_ranks; ++consumer_rank)
+    {
+      if (ws_.consumer_rank_face_offsets[consumer_rank] ==
+          ws_.consumer_rank_face_offsets[consumer_rank + 1])
+        continue;
+      ws_.greedy_consumer_rank_order.push_back(consumer_rank);
+    }
+    // Fewest candidate faces first; equal counts fall back to rank so the order is unique.
+    std::sort(ws_.greedy_consumer_rank_order.begin(),
+              ws_.greedy_consumer_rank_order.end(),
+              [&](const std::uint32_t lhs, const std::uint32_t rhs)
+              {
+                const auto lhs_count = ws_.candidate_face_counts_by_consumer_rank[lhs];
+                const auto rhs_count = ws_.candidate_face_counts_by_consumer_rank[rhs];
+                if (lhs_count != rhs_count)
+                  return lhs_count < rhs_count;
+                return lhs < rhs;
+              });
+  }
+
+  void PushDFSFrame(const std::uint32_t u_face_rank, const std::uint32_t via_v_face_rank)
+  {
+    // Open a DFS frame for u_face_rank; its row cursor spans the consumer rank's candidate row.
+    const auto consumer_rank = face_consumer_ranks_[u_face_rank];
+    const auto producer_rank_index = ws_.candidate_producer_rank_offsets[consumer_rank];
+    const auto producer_rank_end = ws_.candidate_producer_rank_offsets[consumer_rank + 1];
+    ws_.dfs_frames.push_back(
+      {u_face_rank, via_v_face_rank, producer_rank_index, producer_rank_end, 0, 0}); // trailing zeros: presumably an empty face cursor — confirm DFSFrame field order
+  }
+
+  bool AdvanceFrame(DFSFrame& frame) const
+  {
+    // Position the frame on its next candidate right vertex, loading producer-rank face ranges
+    // as needed. Returns false once the face range and the producer-rank row are both exhausted.
+    while (true)
+    {
+      if (frame.next_v_face_rank < frame.v_face_end)
+        return true;
+      if (frame.producer_rank_index >= frame.producer_rank_end)
+        return false;
+      // Face range is used up: consume the next producer rank in the row and load its faces.
+      const auto producer_rank = ws_.candidate_producer_ranks[frame.producer_rank_index++];
+      frame.next_v_face_rank = producer_cell_face_offsets_[producer_rank];
+      frame.v_face_end = producer_cell_face_offsets_[producer_rank + 1];
+    }
+  }
+
+  std::uint32_t num_faces_ = 0;
+  const std::vector<std::uint32_t>& face_producer_ranks_;
+  const std::vector<std::uint32_t>& face_consumer_ranks_;
+  const std::vector<std::uint32_t>& producer_cell_face_offsets_;
+  std::vector<std::uint32_t>& face_slot_ids_;
+  ThreadLocalWorkspace& ws_;
+  int dist_null_ = 0;
+};
+
+SlotSolveResult
+ComputeLocalFaceSlotPlan(const std::vector<std::uint32_t>& successor_rank_offsets,
+                         const std::vector<std::uint32_t>& successor_ranks,
+                         const std::vector<std::uint32_t>& face_producer_ranks,
+                         const std::vector<std::uint32_t>& face_consumer_ranks,
+                         const std::vector<std::uint32_t>& producer_cell_face_offsets,
+                         std::vector<std::uint32_t>& face_slot_ids)
+{
+  if (face_producer_ranks.empty()) // no local faces: empty assignment, default result
+  {
+    face_slot_ids.clear();
+    return {};
+  }
+  // Scratch storage lives once per thread and is reused by every call on that thread.
+  static thread_local ThreadLocalWorkspace workspace;
+  BuildReachability(static_cast<std::uint32_t>(successor_rank_offsets.size() - 1), // NOTE(review): wraps if successor_rank_offsets is empty
+                    successor_rank_offsets,
+                    successor_ranks,
+                    workspace);
+  // The planner is handed the same workspace that BuildReachability was just given.
+  LocalFaceHopcroftKarp slot_planner(
+    face_producer_ranks, face_consumer_ranks, producer_cell_face_offsets, face_slot_ids, workspace);
+  return slot_planner.Solve();
+}
+
+} // namespace opensn::detail
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.h
new file mode 100644
index 0000000000..83f493225d
--- /dev/null
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.h
@@ -0,0 +1,77 @@
+// SPDX-FileCopyrightText: 2026 The OpenSn Authors <https://open-sn.github.io/opensn/>
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace opensn::detail
+{
+
+/// Result of an exact local-face slot-planning solve; defaults to zero slots, not rejected.
+struct SlotSolveResult
+{
+  /// Exact number of reusable slots required by the computed chain cover.
+  std::size_t slot_count = 0;
+  /// True when the post-solve verifier rejected the computed assignment.
+  bool verifier_rejected = false;
+};
+
+/**
+ * Compute the exact minimum safe local-face slot assignment.
+ *
+ * Let `F` denote the local directed faces, and define `u < v` when the consumer cell of
+ * face `u` reaches the producer cell of face `v` in the local CBC task DAG. This is the
+ * safe reuse relation: if `u < v`, then every admissible CBC or CBCD sweep consumes the
+ * angular flux stored for `u` before `v` may overwrite the same slot.
+ *
+ * Computing the minimum number of reusable cell-face slots is equivalent to
+ * the minimum chain-cover problem for the induced face poset.
+ * A chain is one statically reusable slot. The minimum number of slots equals the poset
+ * width (i.e. the maximum cardinality of any antichain of pairwise incomparable
+ * faces). By Dilworth's theorem, this is the minimum chain-cover cardinality.
+ *
+ * The implementation uses the standard bipartite split-graph reduction. The reuse relation
+ * defines the bipartite edges, Hopcroft-Karp computes a maximum cardinality matching, and
+ * the matching induces a minimum chain cover of size `|F| - |M|`. Koenig's theorem provides
+ * the matching-cover duality for the bipartite graph. Consequently, the returned slot count
+ * is exact.
+ *
+ * Algorithm flow:
+ * 1. Build the reflexive transitive closure of the local CBC task DAG in topological-rank
+ * space.
+ * 2. Group local directed faces by consumer-cell rank and cache the reachable producer-cell
+ * ranks that define the reuse graph rows.
+ * 3. Run Hopcroft-Karp on the implicit bipartite reuse graph:
+ *    a. greedy seeding,
+ *    b. BFS layer construction,
+ *    c. iterative DFS augmentation.
+ * 4. Extract one slot chain per unmatched right-side face.
+ * 5. Verify the extracted assignment and report whether the caller should fall back to the
+ * identity assignment.
+ *
+ * After chain extraction, the assignment is verified by checking each consecutive reuse
+ * handoff in face enumeration order. If the verifier rejects the result, the caller may
+ * conservatively fall back to the identity assignment.
+ *
+ * \param successor_rank_offsets Offsets into the flat successor-rank adjacency list of the
+ * local CBC task DAG.
+ * \param successor_ranks Flat successor-rank adjacency list of the local CBC task DAG.
+ * \param face_producer_ranks Producer-cell topological rank for each local directed face.
+ * \param face_consumer_ranks Consumer-cell topological rank for each local directed face.
+ * \param producer_cell_face_offsets Offsets grouping local faces by producer-cell topological
+ * rank.
+ * \param face_slot_ids Output slot assignment keyed by local face rank; cleared if there are no faces.
+ * \return Exact slot count and verifier status for the computed assignment.
+ */
+SlotSolveResult
+ComputeLocalFaceSlotPlan(const std::vector<std::uint32_t>& successor_rank_offsets,
+                         const std::vector<std::uint32_t>& successor_ranks,
+                         const std::vector<std::uint32_t>& face_producer_ranks,
+                         const std::vector<std::uint32_t>& face_consumer_ranks,
+                         const std::vector<std::uint32_t>& producer_cell_face_offsets,
+                         std::vector<std::uint32_t>& face_slot_ids);
+
+} // namespace opensn::detail

From 2ea48b7d6e08e99e15dca2afb3f6086aaa86b598 Mon Sep 17 00:00:00 2001
From: Eappen Nelluvelil <eappen@tamu.edu>
Date: Sun, 12 Apr 2026 17:39:06 -0500
Subject: [PATCH 5/6] CBC_FLUDS uses minimally sized local psi buffer

---
 .../discrete_ordinates_problem.cc             |  10 +-
 .../sweep/angle_set/cbc_angle_set.cc          |  72 +++--
 .../sweep/angle_set/cbc_angle_set.h           |  68 +++-
 .../sweep/communicators/cbc_async_comm.cc     | 217 ++++++++-----
 .../sweep/communicators/cbc_async_comm.h      |  68 +++-
 .../sweep/fluds/cbc_fluds.cc                  | 165 ++++++----
 .../sweep/fluds/cbc_fluds.h                   | 197 ++++++++----
 .../sweep/fluds/cbc_fluds_common_data.cc      | 101 +++++-
 .../sweep/fluds/cbc_fluds_common_data.h       | 143 ++++++++-
 .../sweep_chunks/cbc_avx_sweep_chunk.cc       | 162 ++++++----
 .../sweep_chunks/cbc_sweep_chunk.cc           |  60 ++--
 .../sweep_chunks/cbc_sweep_chunk.h            |  58 ++--
 .../sweep_chunks/cbc_sweep_chunk_shared.h     | 141 ++++-----
 .../sweep_chunks/cbc_sweep_chunk_td.cc        |  81 ++---
 .../sweep_chunks/cbc_sweep_chunk_td.h         |  25 +-
 .../sweep_chunks/cbc_sweep_kernels.h          | 298 ++++++++++++++----
 16 files changed, 1327 insertions(+), 539 deletions(-)

diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc
index dddec78344..bf9076043a 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc
@@ -1922,12 +1922,10 @@ DiscreteOrdinatesProblem::InitFluxDataStructures(LBSGroupset& groupset)
         }
         else
         {
-          fluds =
-            std::make_shared<CBC_FLUDS>(gs_num_grps,
-                                        angle_indices.size(),
-                                        dynamic_cast<const CBC_FLUDSCommonData&>(fluds_common_data),
-                                        groupset.psi_uk_man_,
-                                        *discretization_);
+          fluds = std::make_shared<CBC_FLUDS>(
+            gs_num_grps,
+            angle_indices.size(),
+            dynamic_cast<const CBC_FLUDSCommonData&>(fluds_common_data));
         }
 
         std::shared_ptr<AngleSet> angle_set;
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.cc
index 999675b039..8237eb543d 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.cc
@@ -3,6 +3,7 @@
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/sweep_chunk.h"
 #include "framework/mesh/mesh_continuum/mesh_continuum.h"
@@ -10,6 +11,7 @@
 #include "framework/logging/log.h"
 #include "framework/runtime.h"
 #include "caliper/cali.h"
+#include <cassert>
 
 namespace opensn
 {
@@ -24,8 +26,27 @@ CBC_AngleSet::CBC_AngleSet(size_t id,
   : AngleSet(id, num_groups, spds, fluds, angle_indices, boundaries),
     cbc_spds_(dynamic_cast<const CBC_SPDS&>(spds_)),
     ready_tasks_(),
-    async_comm_(id, *fluds, comm_set)
+    async_comm_(id, *fluds, comm_set),
+    cbc_fluds_(dynamic_cast<CBC_FLUDS&>(*fluds))
 {
+
+  const auto& task_list = cbc_spds_.GetTaskList();
+  const auto num_tasks = task_list.size();
+  initial_dependencies_.resize(num_tasks);
+  remaining_dependencies_.resize(num_tasks);
+  initial_ready_tasks_.reserve(num_tasks);
+  ready_tasks_.reserve(num_tasks);
+
+  for (std::uint32_t task_idx = 0; task_idx < num_tasks; ++task_idx)
+  {
+    const auto& task = task_list[task_idx];
+    const auto num_dependencies = task.num_dependencies;
+    initial_dependencies_[task_idx] = num_dependencies;
+    if (num_dependencies == 0)
+      initial_ready_tasks_.push_back(task_idx);
+  }
+
+  ResetTaskState();
 }
 
 AsynchronousCommunicator*
@@ -42,24 +63,15 @@ CBC_AngleSet::AngleSetAdvance(SweepChunk& sweep_chunk, AngleSetStatus permission
   if (executed_)
     return AngleSetStatus::FINISHED;
 
-  if (current_task_list_.empty())
-  {
-    current_task_list_ = cbc_spds_.GetTaskList();
-    // Build initial ready queue
-    ready_tasks_.reserve(current_task_list_.size());
-    for (size_t i = 0; i < current_task_list_.size(); ++i)
-      if ((current_task_list_[i].num_dependencies == 0) and (not current_task_list_[i].completed))
-        ready_tasks_.push_back(i);
-  }
-
+  const auto& task_list = cbc_spds_.GetTaskList();
   sweep_chunk.SetAngleSet(*this);
 
-  auto tasks_who_received_data = async_comm_.ReceiveData();
+  const auto tasks_who_received_data = async_comm_.ReceiveData();
 
-  for (const std::uint64_t task_number : tasks_who_received_data)
+  for (const auto& task_number : tasks_who_received_data)
   {
-    if ((--current_task_list_[task_number].num_dependencies == 0) and
-        (not current_task_list_[task_number].completed))
+    assert(remaining_dependencies_[task_number] > 0);
+    if (--remaining_dependencies_[task_number] == 0)
       ready_tasks_.push_back(task_number);
   }
 
@@ -74,24 +86,23 @@ CBC_AngleSet::AngleSetAdvance(SweepChunk& sweep_chunk, AngleSetStatus permission
   {
     const auto task_idx = ready_tasks_.back();
     ready_tasks_.pop_back();
-    auto& cell_task = current_task_list_[task_idx];
+    const auto& cell_task = task_list[task_idx];
 
     sweep_chunk.SetCell(cell_task.cell_ptr, *this);
     sweep_chunk.Sweep(*this);
 
     for (const auto& local_task_num : cell_task.successors)
     {
-      if ((--current_task_list_[local_task_num].num_dependencies == 0) and
-          (not current_task_list_[local_task_num].completed))
+      assert(remaining_dependencies_[local_task_num] > 0);
+      if (--remaining_dependencies_[local_task_num] == 0)
         ready_tasks_.push_back(local_task_num);
     }
 
-    cell_task.completed = true;
-    ++num_completed_tasks;
+    ++num_completed_tasks_;
     async_comm_.SendData();
   }
 
-  const bool all_tasks_completed = (num_completed_tasks == current_task_list_.size());
+  const bool all_tasks_completed = (num_completed_tasks_ == task_list.size());
   const bool all_messages_sent = async_comm_.SendData();
 
   if (all_tasks_completed and all_messages_sent)
@@ -109,14 +120,21 @@ CBC_AngleSet::AngleSetAdvance(SweepChunk& sweep_chunk, AngleSetStatus permission
 void
 CBC_AngleSet::ResetSweepBuffers()
 {
-  current_task_list_.clear();
-  ready_tasks_.clear();
-  num_completed_tasks = 0;
+  ResetTaskState();
   async_comm_.Reset();
   fluds_->ClearLocalAndReceivePsi();
   executed_ = false;
 }
 
+void
+CBC_AngleSet::ResetTaskState()
+{
+  std::copy( // restore every task's remaining-dependency count from the initial counts
+    initial_dependencies_.begin(), initial_dependencies_.end(), remaining_dependencies_.begin());
+  ready_tasks_ = initial_ready_tasks_; // start again from the initially ready task list
+  num_completed_tasks_ = 0;
+}
+
 const double*
 CBC_AngleSet::PsiBoundary(uint64_t boundary_id,
                           unsigned int angle_num,
@@ -126,12 +144,8 @@ CBC_AngleSet::PsiBoundary(uint64_t boundary_id,
                           unsigned int g,
                           bool surface_source_active)
 {
-  if (boundaries_[boundary_id]->IsReflecting())
-    return boundaries_[boundary_id]->PsiIncoming(cell_local_id, face_num, fi, angle_num, g);
-
-  if (not surface_source_active)
+  if ((not boundaries_[boundary_id]->IsReflecting()) and (not surface_source_active))
     return boundaries_[boundary_id]->ZeroFlux(g);
-
   return boundaries_[boundary_id]->PsiIncoming(cell_local_id, face_num, fi, angle_num, g);
 }
 
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h
index ba127849db..266aef7cd8 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h
@@ -9,11 +9,29 @@
 namespace opensn
 {
 
+class CBC_FLUDS;
 class CBC_SPDS;
 
+/**
+ * Host-side CBC angle set.
+ *
+ * Owns the local CBC task state for one angle set and advances the host CBC sweep
+ * by combining local task execution with non-local message progress.
+ */
 class CBC_AngleSet : public AngleSet
 {
 public:
+  /**
+   * Construct the CBC angle set.
+   *
+   * \param id Angle-set ID.
+   * \param num_groups Number of groups in the angle set.
+   * \param spds Sweep plane data structure for this angle set.
+   * \param fluds CBC FLUDS instance for this angle set.
+   * \param angle_indices Global angle indices represented by this angle set.
+   * \param boundaries Sweep-boundary table indexed by boundary ID.
+   * \param comm_set MPI communicator set used for receives.
+   */
   CBC_AngleSet(size_t id,
                unsigned int num_groups,
                const SPDS& spds,
@@ -22,14 +40,19 @@ class CBC_AngleSet : public AngleSet
                std::map<uint64_t, std::shared_ptr<SweepBoundary>>& boundaries,
                const MPICommunicatorSet& comm_set);
 
+  /// Return the delayed-data communicator for this angle set.
   AsynchronousCommunicator* GetCommunicator() override;
 
+  /// Delayed upstream data initialization hook; a no-op for this angle set.
   void InitializeDelayedUpstreamData() override {}
 
+  /// Return the buffered-message limit; always 0 for this angle set.
   int GetMaxBufferMessages() const override { return 0; }
 
-  void SetMaxBufferMessages(int new_max) override {}
+  /// Set the buffered-message limit; the value is ignored by this angle set (no-op).
+  void SetMaxBufferMessages(int max_buffer_messages) override {}
 
+  /// Advance the host CBC angle set by one scheduler step.
   AngleSetStatus AngleSetAdvance(SweepChunk& sweep_chunk, AngleSetStatus permission) override;
 
   AngleSetStatus FlushSendBuffers() override
@@ -40,8 +63,21 @@ class CBC_AngleSet : public AngleSet
 
   void ResetSweepBuffers() override;
 
+  /// Report whether delayed upstream data has been received; always true for this angle set.
   bool ReceiveDelayedData() override { return true; }
 
+  /**
+   * Return the incoming boundary angular flux for one boundary face node.
+   *
+   * \param boundary_id Boundary ID.
+   * \param angle_num Angle index within the angle set.
+   * \param cell_local_id Local cell ID.
+   * \param face_num Face ID on the cell.
+   * \param fi Face-node index.
+   * \param g Group index.
+   * \param surface_source_active Flag if surface source is active.
+   * \return Pointer to the requested incoming boundary value.
+   */
   const double* PsiBoundary(uint64_t boundary_id,
                             unsigned int angle_num,
                             uint64_t cell_local_id,
@@ -50,6 +86,16 @@ class CBC_AngleSet : public AngleSet
                             unsigned int g,
                             bool surface_source_active) override;
 
+  /**
+   * Return the outgoing reflecting-boundary storage for one face node.
+   *
+   * \param boundary_id Boundary ID.
+   * \param angle_num Angle index within the angle set.
+   * \param cell_local_id Local cell ID.
+   * \param face_num Face ID on the cell.
+   * \param fi Face-node index.
+   * \return Pointer to the reflected outgoing storage for the node.
+   */
   double* PsiReflected(uint64_t boundary_id,
                        unsigned int angle_num,
                        uint64_t cell_local_id,
@@ -57,11 +103,25 @@ class CBC_AngleSet : public AngleSet
                        unsigned int fi) override;
 
 protected:
+  /// Reset the mutable local-task state before a new sweep.
+  void ResetTaskState();
+
+  /// CBC sweep plane data structure for this angle set.
   const CBC_SPDS& cbc_spds_;
-  std::vector<Task> current_task_list_;
-  std::vector<std::uint64_t> ready_tasks_;
-  size_t num_completed_tasks = 0;
+  /// Initial predecessor counts per local CBC task.
+  std::vector<unsigned int> initial_dependencies_;
+  /// Local tasks that are ready at the start of a sweep.
+  std::vector<std::uint32_t> initial_ready_tasks_;
+  /// Mutable predecessor counts for the current sweep.
+  std::vector<unsigned int> remaining_dependencies_;
+  /// Local tasks ready to execute.
+  std::vector<std::uint32_t> ready_tasks_;
+  /// Number of completed local tasks.
+  size_t num_completed_tasks_ = 0;
+  /// Asynchronous communicator for this angle set.
   CBC_AsynchronousCommunicator async_comm_;
+  /// CBC FLUDS instance for this angle set.
+  CBC_FLUDS& cbc_fluds_;
 };
 
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cc
index 44c61c777e..86f08029dd 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cc
@@ -2,82 +2,146 @@
 // SPDX-License-Identifier: MIT
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/spds.h"
 #include "framework/mesh/mesh_continuum/mesh_continuum.h"
 #include "framework/mpi/mpi_comm_set.h"
-#include "framework/logging/log.h"
 #include "framework/runtime.h"
 #include "caliper/cali.h"
 #include <cstring>
-#include <memory>
+#include <limits>
+#include <span>
 
 namespace opensn
 {
 
+namespace detail
+{
+
+namespace
+{
+
+template <typename T> // Append the object representation of value to the end of buffer.
+void
+AppendBytes(std::vector<std::byte>& buffer, const T& value)
+{
+  const size_t old_size = buffer.size();
+  buffer.resize(old_size + sizeof(T)); // grow by exactly sizeof(T) bytes
+  std::memcpy(buffer.data() + old_size, &value, sizeof(T)); // raw byte copy; presumably T is trivially copyable
+}
+
+template <typename T> // Read one T from buffer at offset and advance offset by sizeof(T).
+T
+ReadBytes(std::span<const std::byte> buffer, size_t& offset)
+{
+  T value;
+  std::memcpy(&value, buffer.data() + offset, sizeof(T)); // NOTE(review): no check that offset + sizeof(T) <= buffer.size()
+  offset += sizeof(T);
+  return value;
+}
+
+} // namespace
+
+} // namespace detail
+
+CBC_AsynchronousCommunicator::CBC_AsynchronousCommunicator(size_t angle_set_id,
+                                                           FLUDS& fluds,
+                                                           const MPICommunicatorSet& comm_set)
+  : AsynchronousCommunicator(fluds, comm_set),
+    angle_set_id_(angle_set_id),
+    cbc_fluds_(dynamic_cast<CBC_FLUDS&>(fluds)) // reference cast: throws std::bad_cast on mismatch
+{
+  const auto& cbc_common = dynamic_cast<const CBC_FLUDSCommonData&>(cbc_fluds_.GetCommonData());
+  const auto num_deplocs = fluds_.GetSPDS().GetLocationSuccessors().size();
+  // Pre-size per-destination bookkeeping; an index of max() means "no buffer assigned yet".
+  outgoing_message_queue_.reserve(cbc_common.GetNumOutgoingNonlocalFaces());
+  send_buffer_.reserve(num_deplocs);
+  destination_buffer_bytes_.assign(num_deplocs, 0);
+  destination_buffer_indices_.assign(num_deplocs, std::numeric_limits<size_t>::max());
+  // Per-face header size: a std::uint64_t, an unsigned int and a size_t precede each payload.
+  constexpr size_t header_bytes = sizeof(std::uint64_t) + sizeof(unsigned int) + sizeof(size_t);
+  for (size_t deplocI = 0; deplocI < num_deplocs; ++deplocI)
+  {
+    destination_buffer_bytes_[deplocI] = // byte estimate later used to reserve send buffers
+      cbc_common.GetDeplocIFaceNodeCount(deplocI) * cbc_fluds_.GetStrideSize() * sizeof(double) +
+      cbc_common.GetDeplocIFaceCount(deplocI) * header_bytes;
+  }
+}
+
 std::vector<double>&
 CBC_AsynchronousCommunicator::InitGetDownwindMessageData(int location_id,
-                                                         uint64_t cell_global_id,
+                                                         std::uint64_t cell_global_id,
                                                          unsigned int face_id,
-                                                         size_t angle_set_id,
-                                                         size_t data_size)
+                                                         std::size_t angle_set_id,
+                                                         std::size_t data_size)
 {
   MessageKey key{location_id, cell_global_id, face_id};
-  std::vector<double>& data = outgoing_message_queue_[key];
-  if (data.empty())
-    data.assign(data_size, 0.0);
+  auto [it, inserted] = outgoing_message_queue_.try_emplace(key); // existing payload is kept as is
+  std::vector<double>& data = it->second;
+  if (inserted) // size the payload only the first time this key is requested
+    data.resize(data_size); // new elements are value-initialized to 0.0
   return data;
 }
 
-bool
-CBC_AsynchronousCommunicator::SendData()
+void
+CBC_AsynchronousCommunicator::QueueOutgoingMessages()
 {
-  CALI_CXX_MARK_SCOPE("CBC_AsynchronousCommunicator::SendData");
-
-  // First we convert any new outgoing messages from the queue into
-  // buffer messages. We aggregate these messages per location-id
-  // they need to be sent to
-  if (not outgoing_message_queue_.empty())
+  if (outgoing_message_queue_.empty())
+    return;
+  std::fill(destination_buffer_indices_.begin(),
+            destination_buffer_indices_.end(),
+            std::numeric_limits<size_t>::max());
+  for (const auto& [msg_key, data] : outgoing_message_queue_)
   {
-    std::map<int, BufferItem> locI_buffer_map;
-
-    for (const auto& [msg_key, data] : outgoing_message_queue_)
+    const int locI = std::get<0>(msg_key);
+    const std::uint64_t cell_global_id = std::get<1>(msg_key);
+    const unsigned int face_id = std::get<2>(msg_key);
+    const size_t data_size = data.size();
+    const auto deplocI = static_cast<size_t>(fluds_.GetSPDS().MapLocJToDeplocI(locI));
+
+    auto buffer_index = destination_buffer_indices_[deplocI];
+    if (buffer_index == std::numeric_limits<size_t>::max())
     {
-      const int locI = std::get<0>(msg_key);
-      const uint64_t cell_global_id = std::get<1>(msg_key);
-      const unsigned int face_id = std::get<2>(msg_key);
-      const size_t data_size = data.size();
-
-      BufferItem& buffer_item = locI_buffer_map[locI];
-      buffer_item.destination = locI;
-      auto& buffer_array = buffer_item.data_array;
-      buffer_array.Write(cell_global_id);
-      buffer_array.Write(face_id);
-      buffer_array.Write(data_size);
-
-      auto& raw = buffer_array.Data();
-      const size_t old_size = raw.size();
-      const size_t num_bytes = data_size * sizeof(double);
-      raw.resize(old_size + num_bytes);
-      std::memcpy(raw.data() + old_size, data.data(), num_bytes);
+      buffer_index = send_buffer_.size();
+      destination_buffer_indices_[deplocI] = buffer_index;
+      send_buffer_.emplace_back();
+      send_buffer_.back().destination = locI;
+      send_buffer_.back().data.reserve(destination_buffer_bytes_[deplocI]);
     }
 
-    for (auto& [locI, buffer] : locI_buffer_map)
-      send_buffer_.push_back(std::move(buffer));
+    auto& buffer_item = send_buffer_[buffer_index];
+    auto& buffer = buffer_item.data;
+    detail::AppendBytes(buffer, cell_global_id);
+    detail::AppendBytes(buffer, face_id);
+    detail::AppendBytes(buffer, data_size);
+
+    const size_t old_size = buffer.size();
+    const size_t num_bytes = data_size * sizeof(double);
+    buffer.resize(old_size + num_bytes);
+    std::memcpy(buffer.data() + old_size, data.data(), num_bytes);
+  }
+  outgoing_message_queue_.clear();
+}
+
+bool
+CBC_AsynchronousCommunicator::SendData()
+{
+  CALI_CXX_MARK_SCOPE("CBC_AsynchronousCommunicator::SendData");
 
-    outgoing_message_queue_.clear();
-  } // if there are outgoing messages
+  QueueOutgoingMessages();
 
-  // Now we attempt to flush items in the send buffer
   bool all_messages_sent = true;
-  for (auto& buffer_item : send_buffer_)
+  size_t next_open_buffer = 0;
+  for (size_t buffer_idx = 0; buffer_idx < send_buffer_.size(); ++buffer_idx)
   {
+    auto& buffer_item = send_buffer_[buffer_idx];
     if (not buffer_item.send_initiated)
     {
       const int locJ = buffer_item.destination;
       const auto& comm = comm_set_.LocICommunicator(locJ);
       auto dest = comm_set_.MapIonJ(locJ, locJ);
       auto tag = static_cast<int>(angle_set_id_);
-      buffer_item.mpi_request = comm.isend(dest, tag, buffer_item.data_array.Data());
+      buffer_item.mpi_request = comm.isend(dest, tag, buffer_item.data);
       buffer_item.send_initiated = true;
     }
 
@@ -88,8 +152,16 @@ CBC_AsynchronousCommunicator::SendData()
       else
         all_messages_sent = false;
     }
-  } // for item in buffer
 
+    if (not buffer_item.completed)
+    {
+      if (next_open_buffer != buffer_idx)
+        send_buffer_[next_open_buffer] = std::move(buffer_item);
+      ++next_open_buffer;
+    }
+  }
+
+  send_buffer_.resize(next_open_buffer);
   return all_messages_sent;
 }
 
@@ -98,40 +170,33 @@ CBC_AsynchronousCommunicator::ReceiveData()
 {
   CALI_CXX_MARK_SCOPE("CBC_AsynchronousCommunicator::ReceiveData");
 
-  std::unordered_map<FLUDS::CellFaceKey, std::vector<double>, FLUDS::CellFaceKeyHash>
-    received_messages;
-  std::vector<uint64_t> cells_who_received_data;
-  const auto& location_dependencies = fluds_.GetSPDS().GetLocationDependencies();
-  auto& deplocs_outgoing_messages = fluds_.GetDeplocsOutgoingMessages();
-  for (int locJ : location_dependencies)
+  std::vector<std::uint64_t> cells_who_received_data;
+  const auto& comm = comm_set_.LocICommunicator(opensn::mpi_comm.rank());
+  const auto tag = static_cast<int>(angle_set_id_);
+
+  mpi::Status status;
+  while (comm.iprobe(ANY_SOURCE, tag, status))
   {
-    const auto& comm = comm_set_.LocICommunicator(opensn::mpi_comm.rank());
-    auto source_rank = comm_set_.MapIonJ(locJ, opensn::mpi_comm.rank());
-    auto tag = static_cast<int>(angle_set_id_);
-    mpi::Status status;
-    if (comm.iprobe(source_rank, tag, status))
+    const int source_rank = status.source();
+    const int num_items = status.count<std::byte>();
+    receive_buffer_.resize(static_cast<size_t>(num_items));
+    comm.recv(source_rank, status.tag(), receive_buffer_.data(), num_items);
+    size_t offset = 0;
+    const std::span<const std::byte> data_array(receive_buffer_);
+
+    while (offset < data_array.size())
     {
-      int num_items = status.count<std::byte>();
-      std::vector<std::byte> recv_buffer(num_items);
-      comm.recv(source_rank, status.tag(), recv_buffer.data(), num_items);
-      ByteArray data_array(recv_buffer);
-
-      while (not data_array.EndOfBuffer())
-      {
-        const auto cell_global_id = data_array.Read<uint64_t>();
-        const auto face_id = data_array.Read<unsigned int>();
-        const auto data_size = data_array.Read<size_t>();
-
-        std::vector<double> psi_data(data_size);
-        const size_t num_bytes = data_size * sizeof(double);
-        std::memcpy(psi_data.data(), &data_array.Data()[data_array.Offset()], num_bytes);
-        data_array.Seek(data_array.Offset() + num_bytes);
-
-        deplocs_outgoing_messages[{cell_global_id, face_id}] = std::move(psi_data);
-        cells_who_received_data.push_back(
-          fluds_.GetSPDS().GetGrid()->MapCellGlobalID2LocalID(cell_global_id));
-      } // while not at end of buffer
-    } // Process each message embedded in buffer
+      const auto cell_global_id = detail::ReadBytes<std::uint64_t>(data_array, offset);
+      const auto face_id = detail::ReadBytes<unsigned int>(data_array, offset);
+      const auto data_size = detail::ReadBytes<size_t>(data_array, offset);
+
+      const size_t num_bytes = data_size * sizeof(double);
+      const auto cell_local_id = cbc_fluds_.StoreIncomingFaceData(
+        cell_global_id, face_id, data_array.data() + offset, data_size);
+      offset += num_bytes;
+
+      cells_who_received_data.push_back(cell_local_id);
+    }
   }
 
   return cells_who_received_data;
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h
index ead2c03bd9..9e2329d80d 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h
@@ -4,42 +4,63 @@
 #pragma once
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/async_comm.h"
-#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds.h"
-#include "framework/data_types/byte_array.h"
 #include "mpicpp-lite/mpicpp-lite.h"
+#include <cstddef>
+#include <cstdint>
 #include <unordered_map>
 #include <vector>
-#include <cstdint>
-#include <cstddef>
 
 namespace mpi = mpicpp_lite;
 
 namespace opensn
 {
 
+class CBC_FLUDS;
 class MPICommunicatorSet;
-class ByteArray;
 
+/**
+ * Host-side CBC delayed-data communicator.
+ *
+ * Packs outgoing non-local face data by destination locality, performs asynchronous
+ * sends, and receives upwind data needed by the host CBC sweep.
+ */
 class CBC_AsynchronousCommunicator : public AsynchronousCommunicator
 {
 public:
+  /**
+   * Construct the CBC delayed-data communicator.
+   *
+   * \param angle_set_id Owning angle-set ID.
+   * \param fluds CBC FLUDS instance served by this communicator.
+   * \param comm_set MPI communicator set.
+   */
   explicit CBC_AsynchronousCommunicator(size_t angle_set_id,
                                         FLUDS& fluds,
-                                        const MPICommunicatorSet& comm_set)
-    : AsynchronousCommunicator(fluds, comm_set), angle_set_id_(angle_set_id)
-  {
-  }
-
+                                        const MPICommunicatorSet& comm_set);
+
+  /**
+   * Initialize one outgoing message payload and return its writable data vector.
+   *
+   * \param location_id Destination locality ID.
+   * \param cell_global_id Destination cell global ID.
+   * \param face_id Destination face ID.
+   * \param angle_set_id Producing angle-set ID.
+   * \param data_size Number of doubles to store in the payload.
+   * \return Writable payload vector for the outgoing face data.
+   */
   std::vector<double>& InitGetDownwindMessageData(int location_id,
                                                   uint64_t cell_global_id,
                                                   unsigned int face_id,
                                                   size_t angle_set_id,
                                                   size_t data_size);
 
+  /// Pack and send queued outgoing messages; returns true when every send has completed.
   bool SendData();
 
+  /// Receive available upwind messages; returns local IDs of the cells that received data.
   std::vector<uint64_t> ReceiveData();
 
+  /// Clear all queued outgoing state.
   void Reset()
   {
     outgoing_message_queue_.clear();
@@ -47,12 +68,13 @@ class CBC_AsynchronousCommunicator : public AsynchronousCommunicator
   }
 
 protected:
+  /// Owning angle-set ID.
   const size_t angle_set_id_;
 
-  /// location_id, cell_global_id, face_id
+  /// Outgoing message key: `(location_id, cell_global_id, face_id)`.
   using MessageKey = std::tuple<int, std::uint64_t, unsigned int>;
 
-  /// boost::hash_combine hash function for MessageKey.
+  /// Hash for MessageKey.
   struct MessageKeyHash
   {
     std::size_t operator()(const MessageKey& key) const noexcept
@@ -64,17 +86,37 @@ class CBC_AsynchronousCommunicator : public AsynchronousCommunicator
     }
   };
 
+  /// Outgoing face payloads grouped by destination key.
   std::unordered_map<MessageKey, std::vector<double>, MessageKeyHash> outgoing_message_queue_;
 
+  /// In-flight send buffer record.
   struct BufferItem
   {
+    /// Destination locality.
     int destination = 0;
+    /// MPI request for the send.
     mpi::Request mpi_request;
+    /// Flag indicating that the send was posted.
     bool send_initiated = false;
+    /// Flag indicating that the send completed.
     bool completed = false;
-    ByteArray data_array;
+    /// Packed outgoing message bytes.
+    std::vector<std::byte> data;
   };
+  /// In-flight outgoing message buffers.
   std::vector<BufferItem> send_buffer_;
+  /// CBC FLUDS instance served by this communicator.
+  CBC_FLUDS& cbc_fluds_;
+  /// Scratch receive buffer for incoming messages.
+  std::vector<std::byte> receive_buffer_;
+  /// Packed byte counts per destination locality.
+  std::vector<size_t> destination_buffer_bytes_;
+  /// Send-buffer indices grouped by destination locality.
+  std::vector<size_t> destination_buffer_indices_;
+
+private:
+  /// Pack the queued outgoing face payloads into send buffers.
+  void QueueOutgoingMessages();
 };
 
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.cc
index c6e85f9be4..972eb68d70 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.cc
@@ -3,91 +3,130 @@
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/spds.h"
-#include "framework/math/spatial_discretization/spatial_discretization.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h"
+#include "framework/mesh/cell/cell.h"
 #include "framework/mesh/mesh_continuum/mesh_continuum.h"
-#include "caliper/cali.h"
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <new>
 
 namespace opensn
 {
 
-CBC_FLUDS::CBC_FLUDS(unsigned int num_groups,
-                     size_t num_angles,
-                     const CBC_FLUDSCommonData& common_data,
-                     const UnknownManager& psi_uk_man,
-                     const SpatialDiscretization& sdm)
-  : FLUDS(num_groups, num_angles, common_data.GetSPDS()),
-    common_data_(common_data),
-    psi_uk_man_(psi_uk_man),
-    sdm_(sdm),
-    num_angles_in_gs_quadrature_(psi_uk_man_.GetNumberOfUnknowns()),
-    num_quadrature_local_dofs_(sdm_.GetNumLocalDOFs(psi_uk_man_)),
-    num_local_spatial_dofs_(num_quadrature_local_dofs_ / num_angles_in_gs_quadrature_ /
-                            num_groups_),
-    local_psi_data_size_(num_local_spatial_dofs_ * num_groups_and_angles_),
-    local_psi_data_(local_psi_data_size_)
+namespace detail
 {
-  const auto& grid = *spds_.GetGrid();
-  cell_psi_start_.resize(grid.local_cells.size());
-  for (const auto& cell : grid.local_cells)
-  {
-    cell_psi_start_[cell.local_id] =
-      (sdm_.MapDOFLocal(cell, 0, psi_uk_man_, 0, 0) / num_angles_in_gs_quadrature_ / num_groups_) *
-      num_groups_and_angles_;
-  }
 
-  deplocs_outgoing_messages_.reserve(common_data.GetNumIncomingNonlocalFaces());
+namespace
+{
+
+constexpr std::size_t LOCAL_PSI_ALIGNMENT = 64;
+constexpr std::size_t DOUBLES_PER_CACHE_LINE = LOCAL_PSI_ALIGNMENT / sizeof(double);
+std::size_t
+RoundUpToCacheLineMultiple(const std::size_t value)
+{
+  return ((value + DOUBLES_PER_CACHE_LINE - 1) / DOUBLES_PER_CACHE_LINE) * DOUBLES_PER_CACHE_LINE;
 }
 
-const FLUDSCommonData&
-CBC_FLUDS::GetCommonData() const
+} // namespace
+
+} // namespace detail
+
+void
+CBC_FLUDS::AlignedDoubleDeleter::operator()(double* ptr) const noexcept
 {
-  return common_data_;
+  ::operator delete[](ptr, std::align_val_t(detail::LOCAL_PSI_ALIGNMENT));
 }
 
-double*
-CBC_FLUDS::UpwindPsi(const Cell& face_neighbor, unsigned int adj_cell_node, size_t as_ss_idx)
+CBC_FLUDS::AlignedDoubleBuffer
+CBC_FLUDS::AllocateAlignedBuffer(const size_t num_values)
 {
-  const size_t index = cell_psi_start_[face_neighbor.local_id] +
-                       adj_cell_node * num_groups_and_angles_ + as_ss_idx * num_groups_;
-  assert(index < local_psi_data_.size());
-  return &local_psi_data_[index];
+  auto* const ptr = static_cast<double*>(
+    ::operator new[](num_values * sizeof(double), std::align_val_t(detail::LOCAL_PSI_ALIGNMENT)));
+  std::fill_n(ptr, num_values, 0.0);
+  return AlignedDoubleBuffer(ptr);
 }
 
-double*
-CBC_FLUDS::OutgoingPsi(const Cell& cell, unsigned int cell_node, size_t as_ss_idx)
+CBC_FLUDS::CBC_FLUDS(unsigned int num_groups,
+                     size_t num_angles,
+                     const CBC_FLUDSCommonData& common_data)
+  : FLUDS(num_groups, num_angles, common_data.GetSPDS()),
+    common_data_(common_data),
+    cell_face_offsets_(common_data.GetCellFaceOffsets()),
+    num_slots_(common_data.GetNumLocalFaceSlots()),
+    slot_size_(detail::RoundUpToCacheLineMultiple(common_data.GetMaxLocalFaceNodeCount() *
+                                                  num_groups_and_angles_)),
+    local_face_slot_bases_(common_data.GetNumCellFaces(), nullptr),
+    local_psi_buffer_(AllocateAlignedBuffer(num_slots_ * slot_size_)),
+    incoming_nonlocal_face_dof_offsets_(common_data.GetNumCellFaces(), 0),
+    incoming_nonlocal_face_bases_(common_data.GetNumCellFaces(), nullptr),
+    incoming_nonlocal_psi_buffer_(
+      [&]()
+      {
+        size_t incoming_nonlocal_dof_count = 0;
+        for (size_t face_storage_index = 0; face_storage_index < common_data.GetNumCellFaces();
+             ++face_storage_index)
+        {
+          const auto& face_info =
+            common_data.GetIncomingNonlocalFaceInfoByStorageIndex(face_storage_index);
+          if (face_info.num_face_nodes == 0)
+            continue;
+          incoming_nonlocal_face_dof_offsets_[face_storage_index] = incoming_nonlocal_dof_count;
+          incoming_nonlocal_dof_count +=
+            detail::RoundUpToCacheLineMultiple(face_info.num_face_nodes * num_groups_and_angles_);
+        }
+        return AllocateAlignedBuffer(incoming_nonlocal_dof_count);
+      }())
 {
-  const size_t index =
-    cell_psi_start_[cell.local_id] + cell_node * num_groups_and_angles_ + as_ss_idx * num_groups_;
-  assert(index < local_psi_data_.size());
-  return &local_psi_data_[index];
+  for (const auto& cell : common_data.GetSPDS().GetGrid()->local_cells)
+  {
+    const auto face_storage_offset = cell_face_offsets_[cell.local_id];
+    for (std::size_t f = 0; f < cell.faces.size(); ++f)
+    {
+      const auto slot_id =
+        common_data.GetLocalFaceSlotID(cell.local_id, static_cast<unsigned int>(f));
+      if (slot_id == CBC_SPDS::INVALID_LOCAL_FACE_TASK_ID)
+        continue;
+      assert(slot_id < num_slots_);
+      local_face_slot_bases_[face_storage_offset + f] =
+        local_psi_buffer_.get() + static_cast<size_t>(slot_id) * slot_size_;
+    }
+  }
+
+  for (std::size_t face_storage_index = 0; face_storage_index < common_data.GetNumCellFaces();
+       ++face_storage_index)
+  {
+    const auto& face_info =
+      common_data.GetIncomingNonlocalFaceInfoByStorageIndex(face_storage_index);
+    if (face_info.num_face_nodes == 0)
+      continue;
+    incoming_nonlocal_face_bases_[face_storage_index] =
+      incoming_nonlocal_psi_buffer_.get() + incoming_nonlocal_face_dof_offsets_[face_storage_index];
+  }
 }
 
-double*
-CBC_FLUDS::NLUpwindPsi(uint64_t cell_global_id,
-                       unsigned int face_id,
-                       unsigned int face_node_mapped,
-                       size_t as_ss_idx)
+std::uint64_t
+CBC_FLUDS::StoreIncomingFaceData(uint64_t cell_global_id,
+                                 unsigned int face_id,
+                                 const std::byte* psi_data_bytes,
+                                 size_t data_size)
 {
-  auto it = deplocs_outgoing_messages_.find({cell_global_id, face_id});
-  if (it == deplocs_outgoing_messages_.end())
-    return nullptr;
-  auto& psi = it->second;
-  const size_t dof_map =
-    face_node_mapped * num_groups_and_angles_ + //  Offset to start of data for face_node_mapped
-    as_ss_idx * num_groups_;                    // Offset to start of data for angle_set_index
-
-  assert(dof_map < psi.size());
-  return &psi[dof_map];
+  const auto face_storage_index =
+    common_data_.GetIncomingNonlocalFaceStorageIndexByKey(cell_global_id, face_id);
+  const auto& face_info =
+    common_data_.GetIncomingNonlocalFaceInfoByStorageIndex(face_storage_index);
+
+  assert(data_size == static_cast<size_t>(face_info.num_face_nodes) * num_groups_and_angles_);
+
+  const size_t base = incoming_nonlocal_face_dof_offsets_[face_storage_index];
+  std::memcpy(
+    incoming_nonlocal_psi_buffer_.get() + base, psi_data_bytes, data_size * sizeof(double));
+  return face_info.cell_local_id;
 }
 
-double*
-CBC_FLUDS::NLOutgoingPsi(std::vector<double>* psi_nonlocal_outgoing,
-                         size_t face_node,
-                         size_t as_ss_idx)
+void
+CBC_FLUDS::ClearLocalAndReceivePsi()
 {
-  assert(psi_nonlocal_outgoing != nullptr);
-  const size_t addr_offset = face_node * num_groups_and_angles_ + as_ss_idx * num_groups_;
-  return &(*psi_nonlocal_outgoing)[addr_offset];
 }
 
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h
index ba7a6467bf..5d92af368f 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h
@@ -5,75 +5,140 @@
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds.h"
-#include "framework/math/unknown_manager/unknown_manager.h"
-#include "framework/math/spatial_discretization/spatial_discretization.h"
+#include <cassert>
 #include <cstddef>
-#include <unordered_map>
-#include <functional>
+#include <cstdint>
+#include <memory>
 
 namespace opensn
 {
 
-class UnknownManager;
-class SpatialDiscretization;
-class Cell;
-
 /**
- * Flux data structures (FLUDS) specific to the cell-by-cell (CBC) sweep algorithm
- *
- * This class manages the storage and access of angular flux data during a CBC sweep
+ * CBC FLUDS for managing local and non-local psi buffers during sweeps.
  *
- * It provides methods to access:
- * - Upwind angular flux data from local neighbor cells
- * - Storage locations for downwind angular flux data for the current cell
- * - Upwind angular flux data received from remote MPI ranks
+ * Owns the compact local-face slot bank and the receive-side non-local storage used
+ * by one host CBC angle set.
  */
 class CBC_FLUDS : public FLUDS
 {
 public:
-  CBC_FLUDS(unsigned int num_groups,
-            size_t num_angles,
-            const CBC_FLUDSCommonData& common_data,
-            const UnknownManager& psi_uk_man,
-            const SpatialDiscretization& sdm);
-
-  virtual const FLUDSCommonData& GetCommonData() const;
+  /**
+   * Construct the host CBC FLUDS.
+   *
+   * \param num_groups Number of groups in the angle set.
+   * \param num_angles Number of angles in the angle set.
+   * \param common_data Shared CBC FLUDS metadata.
+   */
+  CBC_FLUDS(unsigned int num_groups, size_t num_angles, const CBC_FLUDSCommonData& common_data);
+
+  const FLUDSCommonData& GetCommonData() const noexcept { return common_data_; }
+
+  /// Return the stride in doubles between consecutive face nodes in a face's psi storage.
+  size_t GetStrideSize() const noexcept { return num_groups_and_angles_; }
+
+  /// Return the local psi buffer size in bytes.
+  size_t GetLocalPsiBufferSize() const noexcept { return num_slots_ * slot_size_ * sizeof(double); }
+
+  /// Return the slot base pointer for a local cell face.
+  double* GetLocalFacePsiPointer(std::uint32_t cell_local_id, unsigned int face_id) const noexcept
+  {
+    auto* const slot_base = local_face_slot_bases_[cell_face_offsets_[cell_local_id] + face_id];
+    assert(slot_base != nullptr);
+    return slot_base;
+  }
+
+  /// Return the base pointer for an incoming non-local face.
+  double* GetIncomingNonlocalFacePsiPointer(std::uint32_t cell_local_id,
+                                            unsigned int face_id) const noexcept
+  {
+    auto* const face_base =
+      incoming_nonlocal_face_bases_[cell_face_offsets_[cell_local_id] + face_id];
+    assert(face_base != nullptr);
+    return face_base;
+  }
 
   /**
-   * Given a local upwind neighbor cell, a node index on this cell, and an
-   * angleset subset index, this function returns a pointer to
-   * the start of the group data for the specified node and angle.
+   * Return a pointer to the upwind angular flux for a local incoming face.
+   *
+   * \param cell_local_id Local ID of the cell currently being swept.
+   * \param face_id Local incoming face ID on the current cell.
+   * \param face_node_mapped Mapped node index on the producer's outgoing face.
+   * \param as_ss_idx Angleset subset index within the angleset.
+   * \return Pointer to the start of the group data for the specified face node and angle.
    */
-  double* UpwindPsi(const Cell& face_neighbor, unsigned int adj_cell_node, size_t as_ss_idx);
+  double* UpwindPsi(std::uint32_t cell_local_id,
+                    unsigned int face_id,
+                    unsigned int face_node_mapped,
+                    size_t as_ss_idx) const noexcept
+  {
+    return GetLocalFacePsiPointer(cell_local_id, face_id) +
+           static_cast<size_t>(face_node_mapped) * num_groups_and_angles_ + as_ss_idx * num_groups_;
+  }
 
   /**
-   * Given a local cell, a node index on this cell, and an angleset subset index,
-   * this function returns a pointer to the start of the group data for the specified
-   * node and angle for writing its just solved angular fluxes.
+   * Return a pointer to the outgoing angular flux slot for a local outgoing face.
+   *
+   * \param cell_local_id Local ID of the cell currently being swept.
+   * \param face_id Outgoing face ID on the current cell.
+   * \param face_node Face-local node index on the outgoing face.
+   * \param as_ss_idx Angleset subset index within the angleset.
+   * \return Pointer to the start of the group data for the specified face node and angle
    */
-  double* OutgoingPsi(const Cell& cell, unsigned int cell_node, size_t as_ss_idx);
+  double* OutgoingPsi(std::uint32_t cell_local_id,
+                      unsigned int face_id,
+                      unsigned int face_node,
+                      size_t as_ss_idx) const noexcept
+  {
+    return GetLocalFacePsiPointer(cell_local_id, face_id) +
+           static_cast<size_t>(face_node) * num_groups_and_angles_ + as_ss_idx * num_groups_;
+  }
 
   /**
-   * Given a remote upwind cell's global ID, a face ID on this cell,
-   * a node index on this face, and an angleset subset index,
-   * this function returns a pointer to the start of the group data for the specified
-   * face node and angle.
+   * Return a pointer to received nonlocal upwind angular flux for a face node.
+   *
+   * \param cell_local_id Local ID of the cell owning the face.
+   * \param face_id Face index on the cell.
+   * \param face_node_mapped Mapped face-node index used to offset into the received face data.
+   * \param as_ss_idx Angleset subset index within the angleset.
+   * \return Pointer to the start of the group data for the specified face node and angle.
    */
-  double* NLUpwindPsi(uint64_t cell_global_id,
+  double* NLUpwindPsi(std::uint32_t cell_local_id,
                       unsigned int face_id,
                       unsigned int face_node_mapped,
-                      size_t as_ss_idx);
+                      size_t as_ss_idx) noexcept
+  {
+    return GetIncomingNonlocalFacePsiPointer(cell_local_id, face_id) +
+           static_cast<size_t>(face_node_mapped) * num_groups_and_angles_ + as_ss_idx * num_groups_;
+  }
+
+  /**
+   * Return a pointer to the nonlocal outgoing angular flux for a face node.
+   *
+   * \param psi_nonlocal_outgoing Base pointer to the face's outgoing psi buffer
+   * \param face_node Face node index
+   * \param as_ss_idx Angleset subset index within the angleset
+   * \return Pointer to the start of the group data for the specified face node and angle
+   */
+  double* NLOutgoingPsi(double* psi_nonlocal_outgoing, size_t face_node, size_t as_ss_idx) noexcept
+  {
+    assert(psi_nonlocal_outgoing != nullptr);
+    return psi_nonlocal_outgoing + face_node * num_groups_and_angles_ + as_ss_idx * num_groups_;
+  }
 
   /**
-   * Given a pointer to a vector holding the non-local outgoing psi data for a face,
-   * a node index on this face, and an angleset subset index,
-   * this function returns a pointer to the start of the group data for the specified
-   * face node and angle.
+   * Store received nonlocal face angular flux into the incoming buffer.
+   *
+   * \param cell_global_id Global ID of the local cell that owns the incoming face.
+   * \param face_id Incoming face index on that cell.
+   * \param psi_data_bytes Pointer to the received angular flux payload bytes
+   * \param data_size Number of doubles in the payload
    */
-  double*
-  NLOutgoingPsi(std::vector<double>* psi_nonlocal_outgoing, size_t face_node, size_t as_ss_idx);
+  std::uint64_t StoreIncomingFaceData(uint64_t cell_global_id,
+                                      unsigned int face_id,
+                                      const std::byte* psi_data_bytes,
+                                      size_t data_size);
 
-  void ClearLocalAndReceivePsi() override { deplocs_outgoing_messages_.clear(); }
+  void ClearLocalAndReceivePsi() override;
   void ClearSendPsi() override {}
   void AllocateInternalLocalPsi() override {}
   void AllocateOutgoingPsi() override {}
@@ -83,24 +148,42 @@ class CBC_FLUDS : public FLUDS
   void AllocateDelayedPrelocIOutgoingPsi() override {}
 
 protected:
+  /// Custom deleter for 64-byte aligned double arrays.
+  struct AlignedDoubleDeleter
+  {
+    void operator()(double* ptr) const noexcept;
+  };
+
+  /// Owning pointer to a 64-byte aligned double array.
+  using AlignedDoubleBuffer = std::unique_ptr<double[], AlignedDoubleDeleter>;
+
+  /// Allocate a zero-initialized 64-byte aligned double buffer.
+  static AlignedDoubleBuffer AllocateAlignedBuffer(std::size_t num_values);
+
+  /// Shared face-level indexing metadata.
   const CBC_FLUDSCommonData& common_data_;
-  const UnknownManager& psi_uk_man_;
-  const SpatialDiscretization& sdm_;
-  size_t num_angles_in_gs_quadrature_;
-  size_t num_quadrature_local_dofs_;
-  size_t num_local_spatial_dofs_;
-  size_t local_psi_data_size_;
+  /// Flat face-table offsets cached locally for hot-path indexing.
+  std::vector<size_t> cell_face_offsets_;
+  /// Number of angular flux storage slots.
+  size_t num_slots_;
+  /// Size of each slot in doubles (cache-line aligned).
+  size_t slot_size_;
+  /// Per-face-storage base pointer into the local psi buffer.
+  std::vector<double*> local_face_slot_bases_;
 
   /**
-   * Layout for storage for local angular fluxes:
-   * spatial DOF major -> angle in angleset major -> group in groupset major
+   * Contiguous local angular flux buffer with `num_slots_` slots.
+   *
+   * Layout per slot: node-major, angle-in-angleset-major, group-in-groupset major.
    */
-  std::vector<double> local_psi_data_;
-
-  std::vector<std::vector<double>> boundryI_incoming_psi_;
-
-  /// Pre-computed start index into local_psi_data_ for each local cell
-  std::vector<size_t> cell_psi_start_;
+  AlignedDoubleBuffer local_psi_buffer_;
+
+  /// Per-face-storage index DOF offset into the incoming non-local psi buffer.
+  std::vector<size_t> incoming_nonlocal_face_dof_offsets_;
+  /// Per-face storage-index base pointer into the incoming non-local psi buffer.
+  std::vector<double*> incoming_nonlocal_face_bases_;
+  /// Flat buffer holding received non-local angular fluxes.
+  AlignedDoubleBuffer incoming_nonlocal_psi_buffer_;
 };
 
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.cc
index 354b0fd3a0..db9173b91d 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.cc
@@ -3,8 +3,10 @@
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/spds.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h"
 #include "framework/mesh/cell/cell.h"
 #include "framework/mesh/mesh_continuum/mesh_continuum.h"
+#include <cassert>
 
 namespace opensn
 {
@@ -13,28 +15,125 @@ CBC_FLUDSCommonData::CBC_FLUDSCommonData(
   const SPDS& spds, const std::vector<CellFaceNodalMapping>& grid_nodal_mappings)
   : FLUDSCommonData(spds, grid_nodal_mappings),
     num_incoming_nonlocal_faces_(0),
-    num_outgoing_nonlocal_faces_(0)
+    num_incoming_nonlocal_face_nodes_(0),
+    num_outgoing_nonlocal_faces_(0),
+    num_local_faces_(0),
+    max_local_face_node_count_(0),
+    num_local_face_slots_(dynamic_cast<const CBC_SPDS&>(spds).GetMaxNumLocalPsiSlots())
 {
   // Pre-compute non-local face counts for hash map capacity reservation
   const auto& grid = *spds.GetGrid();
+  const auto& cbc_spds = dynamic_cast<const CBC_SPDS&>(spds);
   const auto& face_orientations = spds.GetCellFaceOrientations();
 
+  outgoing_nonlocal_face_counts_.assign(spds.GetLocationSuccessors().size(), 0);
+  outgoing_nonlocal_face_node_counts_.assign(spds.GetLocationSuccessors().size(), 0);
+  cell_face_offsets_.resize(grid.local_cells.size() + 1, 0);
+  size_t total_num_faces = 0;
+
+  for (const auto& cell : grid.local_cells)
+  {
+    cell_face_offsets_[cell.local_id] = static_cast<std::uint32_t>(total_num_faces);
+    total_num_faces += cell.faces.size();
+  }
+  cell_face_offsets_.back() = static_cast<std::uint32_t>(total_num_faces);
+  local_face_slot_ids_.assign(total_num_faces, CBC_SPDS::INVALID_LOCAL_FACE_TASK_ID);
+  incoming_nonlocal_face_info_.resize(total_num_faces);
+  outgoing_nonlocal_face_info_.resize(total_num_faces);
+
   for (const auto& cell : grid.local_cells)
   {
+    const size_t face_offset = cell_face_offsets_[cell.local_id];
     for (size_t f = 0; f < cell.faces.size(); ++f)
     {
       const auto& face = cell.faces[f];
       const auto orientation = face_orientations[cell.local_id][f];
+      const size_t face_storage_index = face_offset + f;
 
       if ((not face.has_neighbor) or (face.IsNeighborLocal(&grid)))
+      {
+        if (face.has_neighbor)
+        {
+          max_local_face_node_count_ = std::max(max_local_face_node_count_, face.vertex_ids.size());
+          if (orientation == FaceOrientation::OUTGOING)
+          {
+            const auto task_id =
+              cbc_spds.GetOutgoingLocalFaceTaskID(cell.local_id, static_cast<unsigned int>(f));
+            assert(task_id != CBC_SPDS::INVALID_LOCAL_FACE_TASK_ID);
+            local_face_slot_ids_[face_storage_index] = cbc_spds.GetLocalFaceSlotIDs()[task_id];
+            ++num_local_faces_;
+          }
+          else if (orientation == FaceOrientation::INCOMING)
+          {
+            const auto task_id =
+              cbc_spds.GetIncomingLocalFaceTaskID(cell.local_id, static_cast<unsigned int>(f));
+            assert(task_id != CBC_SPDS::INVALID_LOCAL_FACE_TASK_ID);
+            local_face_slot_ids_[face_storage_index] = cbc_spds.GetLocalFaceSlotIDs()[task_id];
+          }
+        }
         continue;
+      }
 
       if (orientation == FaceOrientation::INCOMING)
+      {
         ++num_incoming_nonlocal_faces_;
+        const auto num_face_nodes = static_cast<std::uint32_t>(
+          grid_nodal_mappings[cell.local_id][f].face_node_mapping_.size());
+        IncomingNonlocalFaceInfo info{static_cast<std::uint32_t>(cell.local_id),
+                                      static_cast<std::uint32_t>(num_incoming_nonlocal_face_nodes_),
+                                      num_face_nodes};
+        incoming_nonlocal_face_info_[face_storage_index] = info;
+        incoming_nonlocal_face_info_by_key_.emplace(
+          CellFaceKey{cell.global_id, static_cast<unsigned int>(f)}, face_storage_index);
+        num_incoming_nonlocal_face_nodes_ += num_face_nodes;
+      }
       else if (orientation == FaceOrientation::OUTGOING)
+      {
         ++num_outgoing_nonlocal_faces_;
+        const auto deplocI =
+          static_cast<std::size_t>(spds.MapLocJToDeplocI(face.GetNeighborPartitionID(&grid)));
+        ++outgoing_nonlocal_face_counts_[deplocI];
+        outgoing_nonlocal_face_node_counts_[deplocI] +=
+          grid_nodal_mappings[cell.local_id][f].face_node_mapping_.size();
+        outgoing_nonlocal_face_info_[face_storage_index] = OutgoingNonlocalFaceInfo{
+          face.GetNeighborPartitionID(&grid),
+          face.neighbor_id,
+          static_cast<unsigned int>(grid_nodal_mappings[cell.local_id][f].associated_face_),
+          static_cast<std::uint32_t>(
+            grid_nodal_mappings[cell.local_id][f].face_node_mapping_.size())};
+      }
     }
   }
 }
 
+const CBC_FLUDSCommonData::IncomingNonlocalFaceInfo&
+CBC_FLUDSCommonData::GetIncomingNonlocalFaceInfo(const std::uint32_t cell_local_id,
+                                                 const unsigned int face_id) const noexcept
+{
+  return incoming_nonlocal_face_info_[cell_face_offsets_[cell_local_id] + face_id];
+}
+
+const CBC_FLUDSCommonData::IncomingNonlocalFaceInfo&
+CBC_FLUDSCommonData::GetIncomingNonlocalFaceInfoByStorageIndex(
+  const std::size_t storage_index) const noexcept
+{
+  return incoming_nonlocal_face_info_[storage_index];
+}
+
+std::size_t
+CBC_FLUDSCommonData::GetIncomingNonlocalFaceStorageIndexByKey(
+  const std::uint64_t cell_global_id, const unsigned int face_id) const noexcept
+{
+  const auto it = incoming_nonlocal_face_info_by_key_.find({cell_global_id, face_id});
+  assert(it != incoming_nonlocal_face_info_by_key_.end());
+  return it->second;
+}
+
+const CBC_FLUDSCommonData::OutgoingNonlocalFaceInfo&
+CBC_FLUDSCommonData::GetOutgoingNonlocalFaceInfo(const std::uint32_t cell_local_id,
+                                                 const unsigned int face_id) const noexcept
+{
+  return outgoing_nonlocal_face_info_[cell_face_offsets_[cell_local_id] + face_id];
+}
+
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h
index a1cd93f7ad..020cad200d 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h
@@ -4,25 +4,166 @@
 #pragma once
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds_common_data.h"
-#include <cinttypes>
+#include <cstdint>
 #include <cstddef>
+#include <unordered_map>
 
 namespace opensn
 {
 
+/**
+ * Shared CBC FLUDS metadata.
+ *
+ * Owns the flat face-level lookup tables used to index compact local-face slots and
+ * incoming/outgoing non-local face storage for one CBC sweep plane.
+ */
 class CBC_FLUDSCommonData : public FLUDSCommonData
 {
 public:
+  /// Incoming-face key: `(cell_global_id, face_id)`.
+  using CellFaceKey = std::pair<std::uint64_t, unsigned int>;
+
+  /// Hash functor for CellFaceKey: XOR-mixes the face-id hash into the cell-id hash.
+  struct CellFaceKeyHash
+  {
+    size_t operator()(const CellFaceKey& key) const noexcept
+    {
+      const size_t cell_hash = std::hash<std::uint64_t>{}(key.first);
+      const size_t face_hash = std::hash<unsigned int>{}(key.second);
+      return cell_hash ^ (face_hash + 0x9e3779b9 + (cell_hash << 6) + (cell_hash >> 2));
+    }
+  };
+
+  /// Metadata for one incoming non-local face.
+  struct IncomingNonlocalFaceInfo
+  {
+    /// Local ID of the cell owning this face.
+    std::uint32_t cell_local_id = 0;
+    /// Offset into the incoming non-local psi buffer for this face's node data.
+    std::uint32_t face_node_offset = 0;
+    /// Number of face nodes.
+    std::uint32_t num_face_nodes = 0;
+  };
+
+  /// Metadata for one outgoing non-local face (what is needed to address its downwind message).
+  struct OutgoingNonlocalFaceInfo
+  {
+    /// Locality (partition ID of the neighboring cell) that receives this face's data.
+    int locality = 0;
+    /// Global ID of the destination (neighbor) cell.
+    std::uint64_t cell_global_id = 0;
+    /// Index of the matching face on the destination cell.
+    unsigned int associated_face = 0;
+    /// Number of nodes on this face (size of the face-node mapping).
+    std::uint32_t num_face_nodes = 0;
+  };
+
+  /**
+   * Construct common data from the SPDS and grid nodal mappings.
+   *
+   * \param spds Sweep-plane data structure providing face orientations.
+   * \param grid_nodal_mappings Per-cell-face nodal mapping data.
+   */
   CBC_FLUDSCommonData(const SPDS& spds,
                       const std::vector<CellFaceNodalMapping>& grid_nodal_mappings);
 
+  /// Return the number of incoming non-local faces.
   size_t GetNumIncomingNonlocalFaces() const { return num_incoming_nonlocal_faces_; }
 
+  /// Return the number of incoming non-local face nodes.
+  size_t GetNumIncomingNonlocalFaceNodes() const { return num_incoming_nonlocal_face_nodes_; }
+
+  /// Return the number of outgoing non-local faces.
   size_t GetNumOutgoingNonlocalFaces() const { return num_outgoing_nonlocal_faces_; }
 
+  /// Return the number of local directed faces.
+  size_t GetNumLocalFaces() const { return num_local_faces_; }
+
+  /// Return the maximum local-face node count.
+  size_t GetMaxLocalFaceNodeCount() const { return max_local_face_node_count_; }
+
+  /// Return the number of reusable local-face slots.
+  size_t GetNumLocalFaceSlots() const { return num_local_face_slots_; }
+
+  /// Get number of outgoing non-local faces for dependent locality `deplocI`.
+  size_t GetDeplocIFaceCount(std::size_t deplocI) const noexcept
+  {
+    return outgoing_nonlocal_face_counts_[deplocI];
+  }
+
+  /// Get number of outgoing non-local face nodes for dependent locality `deplocI`.
+  size_t GetDeplocIFaceNodeCount(std::size_t deplocI) const noexcept
+  {
+    return outgoing_nonlocal_face_node_counts_[deplocI];
+  }
+
+  /// Look up incoming nonlocal face info by cell local ID and face index.
+  const IncomingNonlocalFaceInfo& GetIncomingNonlocalFaceInfo(std::uint32_t cell_local_id,
+                                                              unsigned int face_id) const noexcept;
+
+  /// Look up incoming nonlocal face info by flat storage index.
+  const IncomingNonlocalFaceInfo&
+  GetIncomingNonlocalFaceInfoByStorageIndex(std::size_t storage_index) const noexcept;
+
+  /// Resolve a (cell_global_id, face_id) pair to a flat storage index.
+  std::size_t GetIncomingNonlocalFaceStorageIndexByKey(std::uint64_t cell_global_id,
+                                                       unsigned int face_id) const noexcept;
+
+  /// Total cell-face entries, read as the last prefix-sum offset; the table must be non-empty.
+  std::size_t GetNumCellFaces() const noexcept { return cell_face_offsets_.back(); }
+
+  /// Look up outgoing nonlocal face info by cell local ID and face index.
+  const OutgoingNonlocalFaceInfo& GetOutgoingNonlocalFaceInfo(std::uint32_t cell_local_id,
+                                                              unsigned int face_id) const noexcept;
+
+  /// Look up the static local-face slot id by cell local ID and face index.
+  std::uint32_t GetLocalFaceSlotID(std::uint32_t cell_local_id, unsigned int face_id) const noexcept
+  {
+    return local_face_slot_ids_[cell_face_offsets_[cell_local_id] + face_id];
+  }
+
+  /// Flat face-table offset for a given cell.
+  size_t GetCellFaceOffset(std::uint32_t cell_local_id) const noexcept
+  {
+    return cell_face_offsets_[cell_local_id];
+  }
+
+  /// Flat face-table index for a given cell face.
+  size_t GetFaceStorageIndex(std::uint32_t cell_local_id, unsigned int face_id) const noexcept
+  {
+    return cell_face_offsets_[cell_local_id] + face_id;
+  }
+
+  /// Return the flat cell-face offsets table.
+  const std::vector<size_t>& GetCellFaceOffsets() const noexcept { return cell_face_offsets_; }
+
 private:
+  /// Number of incoming non-local faces.
   size_t num_incoming_nonlocal_faces_;
+  /// Number of incoming non-local face nodes.
+  size_t num_incoming_nonlocal_face_nodes_;
+  /// Number of outgoing non-local faces.
   size_t num_outgoing_nonlocal_faces_;
+  /// Number of local directed faces.
+  size_t num_local_faces_;
+  /// Maximum number of nodes on any local directed face.
+  size_t max_local_face_node_count_;
+  /// Number of reusable local-face storage slots.
+  size_t num_local_face_slots_;
+  /// Prefix-sum offsets into the flat face tables, indexed by cell local ID.
+  std::vector<size_t> cell_face_offsets_;
+  /// Flat local-face slot IDs, indexed by face storage index.
+  std::vector<std::uint32_t> local_face_slot_ids_;
+  /// Flat incoming non-local face metadata, indexed by face storage index.
+  std::vector<IncomingNonlocalFaceInfo> incoming_nonlocal_face_info_;
+  /// Flat outgoing non-local face metadata, indexed by face storage index.
+  std::vector<OutgoingNonlocalFaceInfo> outgoing_nonlocal_face_info_;
+  /// Per-dependent locality outgoing face counts.
+  std::vector<size_t> outgoing_nonlocal_face_counts_;
+  /// Per-dependent locality outgoing face node counts.
+  std::vector<size_t> outgoing_nonlocal_face_node_counts_;
+  /// Map from (cell_global_id, face_id) to flat storage index for incoming non-local faces.
+  std::unordered_map<CellFaceKey, std::size_t, CellFaceKeyHash> incoming_nonlocal_face_info_by_key_;
 };
 
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_avx_sweep_chunk.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_avx_sweep_chunk.cc
index 0b541d316c..77104b7e53 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_avx_sweep_chunk.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_avx_sweep_chunk.cc
@@ -20,6 +20,19 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set)
 
   static_assert(NumNodes >= 2 and NumNodes <= 8);
 
+  struct IncomingFaceData
+  {
+    const FaceNodalMapping* face_nodal_mapping = nullptr;
+    double* psi_base = nullptr;
+  };
+
+  struct OutgoingFaceData
+  {
+    bool is_reflecting_boundary_face = false;
+    double* psi_base = nullptr;
+    const CBC_FLUDSCommonData::OutgoingNonlocalFaceInfo* outgoing_nonlocal_face_info = nullptr;
+  };
+
   const auto& groupset = data.groupset;
   const auto& m2d_op = groupset.quadrature->GetMomentToDiscreteOperator();
   const auto& d2m_op = groupset.quadrature->GetDiscreteToMomentOperator();
@@ -28,7 +41,8 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set)
                           "CBC_Sweep_FixedN invoked for an incompatible cell topology.");
 
   const auto& face_orientations = angle_set.GetSPDS().GetCellFaceOrientations()[data.cell_local_id];
-  const auto& sigma_t = data.xs.at(data.cell.block_id)->GetSigmaTotal();
+  const auto& cell_xs = data.cell_transport_view.GetXS();
+  const auto& sigma_t = cell_xs.GetSigmaTotal();
 
   constexpr size_t matrix_size = static_cast<size_t>(NumNodes) * static_cast<size_t>(NumNodes);
   auto idx = [](size_t i, size_t j) -> size_t { return i * NumNodes + j; };
@@ -59,7 +73,7 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set)
   std::vector<double> tau_gsg;
   if constexpr (time_dependent)
   {
-    const auto& inv_velg = data.xs.at(data.cell.block_id)->GetInverseVelocity();
+    const auto& inv_velg = cell_xs.GetInverseVelocity();
     const double theta = data.problem.GetTheta();
     const double inv_theta = 1.0 / theta;
     const double dt = data.problem.GetTimeStep();
@@ -76,6 +90,57 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set)
       : nullptr;
 
   const auto& as_angle_indices = angle_set.GetAngleIndices();
+  const auto& cbc_common = dynamic_cast<const CBC_FLUDSCommonData&>(data.fluds.GetCommonData());
+  auto* const async_comm = dynamic_cast<CBC_AsynchronousCommunicator*>(angle_set.GetCommunicator());
+  std::vector<IncomingFaceData> incoming_face_data(data.cell_num_faces);
+  std::vector<OutgoingFaceData> outgoing_face_data(data.cell_num_faces);
+  for (size_t f = 0; f < data.cell_num_faces; ++f)
+  {
+    const auto& face = data.cell.faces[f];
+    const bool is_local_face = data.cell_transport_view.IsFaceLocal(f);
+    const bool is_boundary_face = not face.has_neighbor;
+    const auto* face_nodal_mapping =
+      &data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f);
+
+    if (face_orientations[f] == FaceOrientation::INCOMING)
+    {
+      auto& face_data = incoming_face_data[f];
+      face_data.face_nodal_mapping = face_nodal_mapping;
+      if (is_local_face)
+        face_data.psi_base =
+          data.fluds.GetLocalFacePsiPointer(data.cell_local_id, static_cast<unsigned int>(f));
+      else if (not is_boundary_face)
+        face_data.psi_base = data.fluds.GetIncomingNonlocalFacePsiPointer(
+          data.cell_local_id, static_cast<unsigned int>(f));
+    }
+
+    if (face_orientations[f] == FaceOrientation::OUTGOING)
+    {
+      auto& face_data = outgoing_face_data[f];
+      face_data.is_reflecting_boundary_face =
+        is_boundary_face and angle_set.GetBoundaries()[face.neighbor_id]->IsReflecting();
+      if (is_local_face)
+        face_data.psi_base =
+          data.fluds.GetLocalFacePsiPointer(data.cell_local_id, static_cast<unsigned int>(f));
+      if (not is_local_face and not is_boundary_face)
+        face_data.outgoing_nonlocal_face_info =
+          &cbc_common.GetOutgoingNonlocalFaceInfo(data.cell_local_id, static_cast<unsigned int>(f));
+    }
+  }
+
+  double* psi_new_base = nullptr;
+  double theta = 1.0;
+  double inv_theta = 1.0;
+  if (data.save_angular_flux)
+  {
+    psi_new_base = &data.destination_psi[data.discretization.MapDOFLocal(
+      data.cell, 0, groupset.psi_uk_man_, 0, 0)];
+    if constexpr (time_dependent)
+    {
+      theta = data.problem.GetTheta();
+      inv_theta = 1.0 / theta;
+    }
+  }
 
   for (size_t as_ss_idx = 0; as_ss_idx < data.num_angles_in_as; ++as_ss_idx)
   {
@@ -102,10 +167,8 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set)
         continue;
 
       const auto& face = data.cell.faces[f];
-      const bool is_local_face = data.cell_transport_view.IsFaceLocal(f);
-      const bool is_boundary_face = not face.has_neighbor;
-      const auto* face_nodal_mapping =
-        &data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f);
+      const auto& face_data = incoming_face_data[f];
+      const auto* face_nodal_mapping = face_data.face_nodal_mapping;
 
       const auto& Ms_f = data.M_surf[f];
       const size_t num_face_nodes = data.cell_mapping.GetNumFaceNodes(f);
@@ -116,13 +179,11 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set)
         const int j = data.cell_mapping.MapFaceNode(f, fj);
 
         const double* psi = nullptr;
-        if (is_local_face)
-          psi = data.fluds.UpwindPsi(*data.cell_transport_view.FaceNeighbor(f),
-                                     face_nodal_mapping->cell_node_mapping_[fj],
-                                     as_ss_idx);
-        else if (not is_boundary_face)
-          psi = data.fluds.NLUpwindPsi(
-            data.cell.global_id, f, face_nodal_mapping->face_node_mapping_[fj], as_ss_idx);
+        if (face_data.psi_base != nullptr)
+          psi = face_data.psi_base +
+                static_cast<size_t>(face_nodal_mapping->face_node_mapping_[fj]) *
+                  data.group_angle_stride +
+                as_ss_idx * data.group_stride;
         else
           psi = angle_set.PsiBoundary(face.neighbor_id,
                                       direction_num,
@@ -271,27 +332,13 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set)
           const double w = d2m_row[m];
           PRAGMA_UNROLL
           for (size_t i = 0; i < NumNodes; ++i)
-          {
-            const size_t dof = data.cell_transport_view.MapDOF(i, m, data.gs_gi);
-            data.destination_phi[dof + gsg] += w * bg[i];
-          }
+            data.destination_phi[moment_dof_map[m][i] + gsg] += w * bg[i];
         }
       }
     }
 
     if (data.save_angular_flux)
     {
-      double* psi_new = &data.destination_psi[data.discretization.MapDOFLocal(
-        data.cell, 0, groupset.psi_uk_man_, 0, 0)];
-
-      double theta = 1.0;
-      double inv_theta = 1.0;
-      if constexpr (time_dependent)
-      {
-        theta = data.problem.GetTheta();
-        inv_theta = 1.0 / theta;
-      }
-
       PRAGMA_UNROLL
       for (size_t i = 0; i < NumNodes; ++i)
       {
@@ -304,10 +351,10 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set)
           if constexpr (time_dependent)
           {
             const double psi_old_val = psi_old ? psi_old[imap + gsg] : 0.0;
-            psi_new[imap + gsg] = inv_theta * (psi_sol + (theta - 1.0) * psi_old_val);
+            psi_new_base[imap + gsg] = inv_theta * (psi_sol + (theta - 1.0) * psi_old_val);
           }
           else
-            psi_new[imap + gsg] = psi_sol;
+            psi_new_base[imap + gsg] = psi_sol;
         }
       }
     }
@@ -318,28 +365,26 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set)
         continue;
 
       const auto& face = data.cell.faces[f];
-      const bool is_local_face = data.cell_transport_view.IsFaceLocal(f);
-      const bool is_boundary_face = not face.has_neighbor;
-      const bool is_reflecting_boundary_face =
-        (is_boundary_face and angle_set.GetBoundaries()[face.neighbor_id]->IsReflecting());
+      const auto& face_data = outgoing_face_data[f];
+      const bool is_reflecting_boundary_face = face_data.is_reflecting_boundary_face;
       const auto& IntF_shapeI = data.IntS_shapeI[f];
 
-      const int locality = data.cell_transport_view.FaceLocality(f);
       const size_t num_face_nodes = data.cell_mapping.GetNumFaceNodes(f);
-      const auto& face_nodal_mapping =
-        data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f);
-      std::vector<double>* psi_nonlocal_outgoing = nullptr;
+      double* psi_nonlocal_outgoing = nullptr;
 
-      if (not is_boundary_face and not is_local_face)
+      if (face_data.outgoing_nonlocal_face_info != nullptr)
       {
-        auto* async_comm = dynamic_cast<CBC_AsynchronousCommunicator*>(angle_set.GetCommunicator());
-        const size_t data_size_for_msg = num_face_nodes * data.group_angle_stride;
+        const auto& outgoing_nonlocal_face_info = *face_data.outgoing_nonlocal_face_info;
+        const size_t data_size_for_msg =
+          static_cast<size_t>(outgoing_nonlocal_face_info.num_face_nodes) * data.group_angle_stride;
         psi_nonlocal_outgoing =
-          &async_comm->InitGetDownwindMessageData(locality,
-                                                  face.neighbor_id,
-                                                  face_nodal_mapping.associated_face_,
-                                                  angle_set.GetID(),
-                                                  data_size_for_msg);
+          async_comm
+            ->InitGetDownwindMessageData(outgoing_nonlocal_face_info.locality,
+                                         outgoing_nonlocal_face_info.cell_global_id,
+                                         outgoing_nonlocal_face_info.associated_face,
+                                         angle_set.GetID(),
+                                         data_size_for_msg)
+            .data();
       }
 
       const double mu_wt_f = wt * face_mu_values[f];
@@ -348,7 +393,7 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set)
       {
         const int i = data.cell_mapping.MapFaceNode(f, fi);
 
-        if (is_boundary_face)
+        if (face_data.outgoing_nonlocal_face_info == nullptr and face_data.psi_base == nullptr)
         {
           const double flux_i = mu_wt_f * IntF_shapeI(i);
           for (size_t gsg = 0; gsg < data.gs_size; ++gsg)
@@ -357,9 +402,9 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set)
         }
 
         double* psi = nullptr;
-        if (is_local_face)
-          psi = data.fluds.OutgoingPsi(data.cell, i, as_ss_idx);
-        else if (not is_boundary_face)
+        if (face_data.psi_base != nullptr)
+          psi = face_data.psi_base + fi * data.group_angle_stride + as_ss_idx * data.group_stride;
+        else if (face_data.outgoing_nonlocal_face_info != nullptr)
           psi = data.fluds.NLOutgoingPsi(psi_nonlocal_outgoing, fi, as_ss_idx);
         else if (is_reflecting_boundary_face)
           psi = angle_set.PsiReflected(face.neighbor_id, direction_num, data.cell_local_id, f, fi);
@@ -380,22 +425,7 @@ CBCSweepChunk::Sweep_FixedN(AngleSet& angle_set)
 {
   CALI_CXX_MARK_SCOPE("CBCSweepChunk::Sweep_FixedN");
 
-  auto data = MakeCBCSweepData(discretization_,
-                               source_moments_,
-                               groupset_,
-                               xs_,
-                               num_moments_,
-                               max_num_cell_dofs_,
-                               SaveAngularFluxEnabled(),
-                               groupset_angle_group_stride_,
-                               groupset_group_stride_,
-                               destination_phi_,
-                               destination_psi_,
-                               include_rhs_time_term_,
-                               problem_,
-                               nullptr,
-                               group_block_size_,
-                               ctx_);
+  auto data = MakeSweepData(nullptr);
 
   CBC_Sweep_FixedN<NumNodes, false>(data, angle_set);
 }
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc
index b40c3d7ed0..8a3f4ce304 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc
@@ -56,6 +56,7 @@ CBCSweepChunk::CBCSweepChunk(DiscreteOrdinatesProblem& problem, LBSGroupset& gro
   }
 
   group_block_size_ = ComputeGroupBlockSize(groupset_.GetNumGroups());
+  generic_scratch_.EnsureCapacity(max_num_cell_dofs_, groupset_.GetNumGroups(), 0);
 }
 
 void
@@ -63,14 +64,50 @@ CBCSweepChunk::SetAngleSet(AngleSet& angle_set)
 {
   CALI_CXX_MARK_SCOPE("CBCSweepChunk::SetAngleSet");
 
-  CBCBindAngleSetContext(ctx_, groupset_, IsSurfaceSourceActive(), angle_set);
+  ctx_.BindAngleSet(groupset_, IsSurfaceSourceActive(), angle_set);
 }
 
 void
 CBCSweepChunk::SetCell(const Cell* cell_ptr, AngleSet& angle_set)
 {
   static_cast<void>(angle_set);
-  CBCBindCellContext(ctx_, discretization_, unit_cell_matrices_, cell_transport_views_, cell_ptr);
+  ctx_.BindCell(discretization_, unit_cell_matrices_, cell_transport_views_, cell_ptr);
+}
+
+CBCSweepData
+CBCSweepChunk::MakeSweepData(const std::vector<double>* psi_old)
+{
+  return CBCSweepData{discretization_,
+                      source_moments_,
+                      groupset_,
+                      num_moments_,
+                      max_num_cell_dofs_,
+                      SaveAngularFluxEnabled(),
+                      groupset_angle_group_stride_,
+                      groupset_group_stride_,
+                      destination_phi_,
+                      destination_psi_,
+                      ctx_.surface_source_active,
+                      include_rhs_time_term_,
+                      problem_,
+                      psi_old,
+                      group_block_size_,
+                      *ctx_.fluds,
+                      *ctx_.cell,
+                      ctx_.cell_local_id,
+                      *ctx_.cell_mapping,
+                      *ctx_.cell_transport_view,
+                      ctx_.cell_num_faces,
+                      ctx_.cell_num_nodes,
+                      ctx_.gs_size,
+                      ctx_.gs_gi,
+                      ctx_.num_angles_in_as,
+                      ctx_.group_stride,
+                      ctx_.group_angle_stride,
+                      *ctx_.G,
+                      *ctx_.M,
+                      *ctx_.M_surf,
+                      *ctx_.IntS_shapeI};
 }
 
 void
@@ -84,24 +121,9 @@ CBCSweepChunk::Sweep_Generic(AngleSet& angle_set)
 {
   CALI_CXX_MARK_SCOPE("CBCSweepChunk::Sweep_Generic");
 
-  auto data = MakeCBCSweepData(discretization_,
-                               source_moments_,
-                               groupset_,
-                               xs_,
-                               num_moments_,
-                               max_num_cell_dofs_,
-                               SaveAngularFluxEnabled(),
-                               groupset_angle_group_stride_,
-                               groupset_group_stride_,
-                               destination_phi_,
-                               destination_psi_,
-                               include_rhs_time_term_,
-                               problem_,
-                               nullptr,
-                               group_block_size_,
-                               ctx_);
+  auto data = MakeSweepData(nullptr);
 
-  CBC_Sweep_Generic<false>(data, angle_set);
+  CBC_Sweep_Generic<false>(data, generic_scratch_, angle_set);
 }
 
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h
index 5d8acaa305..2f7a15a541 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h
@@ -14,54 +14,68 @@ class CellMapping;
 class DiscreteOrdinatesProblem;
 
 /**
- * Implements the core sweep operation for a single cell within the
- * cell-by-cell (CBC) sweep algorithm.
+ * Host CBC sweep chunk.
  *
- * This class is responsible for performing the discrete ordinates transport
- * calculation on a given cell for all angles and groups managed by its
- * current AngleSet
- * It interacts with a CBC_FLUDS object to obtain upwind angular flux data
- * (from local neighbors, MPI remote buffers, or boundaries) and to store
- * outgoing angular flux data (to local neighbors or MPI send buffers)
+ * Dispatches between the generic and fixed-node CBC sweep kernels for the
+ * currently bound angle set and cell.
  */
 class CBCSweepChunk : public SweepChunk
 {
 public:
+  /**
+   * Construct one CBC sweep chunk for a groupset.
+   *
+   * \param problem Owning discrete-ordinates problem.
+   * \param groupset Groupset swept by this chunk.
+   */
   CBCSweepChunk(DiscreteOrdinatesProblem& problem, LBSGroupset& groupset);
 
-  /// Set the current AngleSet
+  /**
+   * Bind the current angle set.
+   *
+   * \param angle_set Angle set to bind.
+   */
   void SetAngleSet(AngleSet& angle_set) override;
 
-  /// Set the current cell to be swept
+  /**
+   * Bind the current cell to be swept.
+   *
+   * \param cell_ptr Cell to bind.
+   * \param angle_set Owning angle set.
+   */
   void SetCell(Cell const* cell_ptr, AngleSet& angle_set) override;
 
   /**
-   * Performs the discrete ordinates sweep calculation for the currently
-   * set cell, for all angles and groups within the provided AngleSet.
+   * Sweep the currently bound cell for the provided angle set.
+   *
+   * Selects the fixed-node kernel when all local cells have the same node count
+   * in the supported range, otherwise falls back to the generic CBC kernel.
    *
-   * It:
-   * - Assembles the local transport equation system for each angle and group
-   * - Retrieves upwind angular fluxes from local neighbors, remote locations
-   *   (via MPI data managed by CBC_FLUDS), or boundaries
-   * - Solves the local system for the outgoing angular fluxes at the cell nodes
-   * - Updates the global scalar flux moments
-   * - If save_angular_flux is true, stores the computed angular fluxes into
-   *   the global angular flux vector
-   * - Propagates outgoing angular fluxes to local downwind neighbors or stages
-   *   them for MPI transmission to remote downwind neighbors
+   * \param angle_set Angle set currently being advanced.
    */
   void Sweep(AngleSet& angle_set) override;
 
 protected:
+  /// Owning discrete-ordinates problem.
   DiscreteOrdinatesProblem& problem_;
+  /// Cached per-cell and per-angleset context.
   CBCSweepChunkContext ctx_;
+  /// Group block size for SIMD batch solves.
   unsigned int group_block_size_ = 0;
+  /// Reusable scratch buffers for generic sweep chunk kernel.
+  CBCGenericSweepScratch generic_scratch_;
 
 private:
+  /// Pointer-to-member for the selected sweep implementation (generic or fixed-node).
   using SweepFunc = void (CBCSweepChunk::*)(AngleSet&);
+  /// Selected sweep function pointer (generic or fixed-node).
   SweepFunc sweep_impl_ = nullptr;
 
+  /// Build the CBCSweepData for the bound angle set and cell; `psi_old` may be nullptr.
+  CBCSweepData MakeSweepData(const std::vector<double>* psi_old);
+  /// Sweep using the generic kernel.
   void Sweep_Generic(AngleSet& angle_set);
+  /// Sweep using the fixed-node kernel.
   template <unsigned int NumNodes>
   void Sweep_FixedN(AngleSet& angle_set);
 };
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h
index 13a8ae1f1b..037f4f8379 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h
@@ -11,116 +11,83 @@
 namespace opensn
 {
 
+/**
+ * Cached per-cell and per-angleset context for CBC sweep chunks.
+ *
+ * Populated in two phases:
+ * 1. BindAngleSet caches angleset-level data (FLUDS, group range, strides)
+ * 2. BindCell caches cell-level data (geometry, transport views, unit cell matrices)
+ */
 struct CBCSweepChunkContext
 {
+  /// FLUDS for current angleset.
   CBC_FLUDS* fluds = nullptr;
-
+  /// Number of groups in the current groupset.
   size_t gs_size = 0;
+  /// First group index in the current groupset.
   unsigned int gs_gi = 0;
+  /// Number of angles in the current angleset.
   size_t num_angles_in_as = 0;
+  /// Per-angle group stride ( = num_groups).
   unsigned int group_stride = 0;
+  /// Per-node angular stride ( = num_angles * num_groups).
   size_t group_angle_stride = 0;
+  /// Whether the surface source BCs are active.
   bool surface_source_active = false;
 
+  /// Pointer to the current cell.
   const Cell* cell = nullptr;
+  /// Local ID of the current cell.
   std::uint32_t cell_local_id = 0;
+  /// Cell mapping for the current cell.
   const CellMapping* cell_mapping = nullptr;
+  /// Transport view for the current cell.
   CellLBSView* cell_transport_view = nullptr;
+  /// Number of faces on the current cell.
   size_t cell_num_faces = 0;
+  /// Number of nodes in the current cell.
   size_t cell_num_nodes = 0;
 
+  /// Volume-integral matrix `intV_shapeI_gradshapeJ` (shape function times shape gradient).
   const DenseMatrix<Vector3>* G = nullptr;
+  /// Mass matrix.
   const DenseMatrix<double>* M = nullptr;
+  /// Per-face surface mass matrices.
   const std::vector<DenseMatrix<double>>* M_surf = nullptr;
+  /// Per-face surface integrals of shape functions.
   const std::vector<Vector<double>>* IntS_shapeI = nullptr;
-};
 
-inline void
-CBCBindAngleSetContext(CBCSweepChunkContext& ctx,
-                       const LBSGroupset& groupset,
-                       bool surface_source_active,
-                       AngleSet& angle_set)
-{
-  ctx.fluds = &dynamic_cast<CBC_FLUDS&>(angle_set.GetFLUDS());
-  ctx.gs_size = groupset.GetNumGroups();
-  ctx.gs_gi = groupset.first_group;
-  ctx.surface_source_active = surface_source_active;
-  ctx.num_angles_in_as = angle_set.GetNumAngles();
-  ctx.group_stride = angle_set.GetNumGroups();
-  ctx.group_angle_stride = ctx.group_stride * ctx.num_angles_in_as;
-}
+  /// Bind angleset-level state: FLUDS handle, groupset range, angle count and strides.
+  void BindAngleSet(const LBSGroupset& groupset, const bool has_surface_source, AngleSet& angle_set)
+  {
+    fluds = &dynamic_cast<CBC_FLUDS&>(angle_set.GetFLUDS());
+    surface_source_active = has_surface_source;
+    gs_gi = groupset.first_group;
+    gs_size = groupset.GetNumGroups();
+    group_stride = angle_set.GetNumGroups();
+    num_angles_in_as = angle_set.GetNumAngles();
+    group_angle_stride = num_angles_in_as * group_stride;
+  }
 
-inline void
-CBCBindCellContext(CBCSweepChunkContext& ctx,
-                   const SpatialDiscretization& discretization,
-                   const std::vector<UnitCellMatrices>& unit_cell_matrices,
-                   std::vector<CellLBSView>& cell_transport_views,
-                   const Cell* cell_ptr)
-{
-  ctx.cell = cell_ptr;
-  ctx.cell_local_id = cell_ptr->local_id;
-  ctx.cell_mapping = &discretization.GetCellMapping(*ctx.cell);
-  ctx.cell_transport_view = &cell_transport_views[ctx.cell->local_id];
-  ctx.cell_num_faces = ctx.cell->faces.size();
-  ctx.cell_num_nodes = ctx.cell_mapping->GetNumNodes();
+  /// Cache cell-level data (mapping, view, unit-cell matrices); cell_ptr must not be null.
+  void BindCell(const SpatialDiscretization& discretization,
+                const std::vector<UnitCellMatrices>& unit_cell_matrices,
+                std::vector<CellLBSView>& cell_transport_views,
+                const Cell* cell_ptr)
+  {
+    cell = cell_ptr;
+    cell_local_id = cell_ptr->local_id;
+    cell_mapping = &discretization.GetCellMapping(*cell);
+    cell_transport_view = &cell_transport_views[cell->local_id];
+    cell_num_faces = cell->faces.size();
+    cell_num_nodes = cell_mapping->GetNumNodes();
 
-  const auto& unit_mats = unit_cell_matrices[ctx.cell_local_id];
-  ctx.G = &unit_mats.intV_shapeI_gradshapeJ;
-  ctx.M = &unit_mats.intV_shapeI_shapeJ;
-  ctx.M_surf = &unit_mats.intS_shapeI_shapeJ;
-  ctx.IntS_shapeI = &unit_mats.intS_shapeI;
-}
-
-inline CBCSweepData
-MakeCBCSweepData(const SpatialDiscretization& discretization,
-                 const std::vector<double>& source_moments,
-                 const LBSGroupset& groupset,
-                 const BlockID2XSMap& xs,
-                 unsigned int num_moments,
-                 unsigned int max_num_cell_dofs,
-                 bool save_angular_flux,
-                 size_t groupset_angle_group_stride,
-                 size_t groupset_group_stride,
-                 std::vector<double>& destination_phi,
-                 std::vector<double>& destination_psi,
-                 bool include_rhs_time_term,
-                 DiscreteOrdinatesProblem& problem,
-                 const std::vector<double>* psi_old,
-                 unsigned int group_block_size,
-                 const CBCSweepChunkContext& ctx)
-{
-  return CBCSweepData{discretization,
-                      source_moments,
-                      groupset,
-                      xs,
-                      num_moments,
-                      max_num_cell_dofs,
-                      save_angular_flux,
-                      groupset_angle_group_stride,
-                      groupset_group_stride,
-                      destination_phi,
-                      destination_psi,
-                      ctx.surface_source_active,
-                      include_rhs_time_term,
-                      problem,
-                      psi_old,
-                      group_block_size,
-                      *ctx.fluds,
-                      *ctx.cell,
-                      ctx.cell_local_id,
-                      *ctx.cell_mapping,
-                      *ctx.cell_transport_view,
-                      ctx.cell_num_faces,
-                      ctx.cell_num_nodes,
-                      ctx.gs_size,
-                      ctx.gs_gi,
-                      ctx.num_angles_in_as,
-                      ctx.group_stride,
-                      ctx.group_angle_stride,
-                      *ctx.G,
-                      *ctx.M,
-                      *ctx.M_surf,
-                      *ctx.IntS_shapeI};
-}
+    const auto& unit_mats = unit_cell_matrices[cell_local_id];
+    G = &unit_mats.intV_shapeI_gradshapeJ;
+    M = &unit_mats.intV_shapeI_shapeJ;
+    M_surf = &unit_mats.intS_shapeI_shapeJ;
+    IntS_shapeI = &unit_mats.intS_shapeI;
+  }
+};
 
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.cc
index 0b261ceb48..78a8086017 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.cc
@@ -63,6 +63,7 @@ CBCSweepChunkTD::CBCSweepChunkTD(DiscreteOrdinatesProblem& problem, LBSGroupset&
   }
 
   group_block_size_ = ComputeGroupBlockSize(groupset_.GetNumGroups());
+  generic_scratch_.EnsureCapacity(max_num_cell_dofs_, groupset_.GetNumGroups(), 0);
 }
 
 void
@@ -70,14 +71,50 @@ CBCSweepChunkTD::SetAngleSet(AngleSet& angle_set)
 {
   CALI_CXX_MARK_SCOPE("CBCSweepChunkTD::SetAngleSet");
 
-  CBCBindAngleSetContext(ctx_, groupset_, IsSurfaceSourceActive(), angle_set);
+  ctx_.BindAngleSet(groupset_, IsSurfaceSourceActive(), angle_set);
 }
 
 void
 CBCSweepChunkTD::SetCell(const Cell* cell_ptr, AngleSet& angle_set)
 {
   static_cast<void>(angle_set);
-  CBCBindCellContext(ctx_, discretization_, unit_cell_matrices_, cell_transport_views_, cell_ptr);
+  ctx_.BindCell(discretization_, unit_cell_matrices_, cell_transport_views_, cell_ptr);
+}
+
+CBCSweepData
+CBCSweepChunkTD::MakeSweepData(const std::vector<double>* psi_old)
+{
+  return CBCSweepData{discretization_,
+                      source_moments_,
+                      groupset_,
+                      num_moments_,
+                      max_num_cell_dofs_,
+                      SaveAngularFluxEnabled(),
+                      groupset_angle_group_stride_,
+                      groupset_group_stride_,
+                      destination_phi_,
+                      destination_psi_,
+                      ctx_.surface_source_active,
+                      include_rhs_time_term_,
+                      problem_,
+                      psi_old,
+                      group_block_size_,
+                      *ctx_.fluds,
+                      *ctx_.cell,
+                      ctx_.cell_local_id,
+                      *ctx_.cell_mapping,
+                      *ctx_.cell_transport_view,
+                      ctx_.cell_num_faces,
+                      ctx_.cell_num_nodes,
+                      ctx_.gs_size,
+                      ctx_.gs_gi,
+                      ctx_.num_angles_in_as,
+                      ctx_.group_stride,
+                      ctx_.group_angle_stride,
+                      *ctx_.G,
+                      *ctx_.M,
+                      *ctx_.M_surf,
+                      *ctx_.IntS_shapeI};
 }
 
 void
@@ -91,24 +128,9 @@ CBCSweepChunkTD::Sweep_Generic(AngleSet& angle_set)
 {
   CALI_CXX_MARK_SCOPE("CBCSweepChunkTD::Sweep_Generic");
 
-  auto data = MakeCBCSweepData(discretization_,
-                               source_moments_,
-                               groupset_,
-                               xs_,
-                               num_moments_,
-                               max_num_cell_dofs_,
-                               SaveAngularFluxEnabled(),
-                               groupset_angle_group_stride_,
-                               groupset_group_stride_,
-                               destination_phi_,
-                               destination_psi_,
-                               include_rhs_time_term_,
-                               problem_,
-                               &psi_old_,
-                               group_block_size_,
-                               ctx_);
-
-  CBC_Sweep_Generic<true>(data, angle_set);
+  auto data = MakeSweepData(&psi_old_);
+
+  CBC_Sweep_Generic<true>(data, generic_scratch_, angle_set);
 }
 
 template <unsigned int NumNodes>
@@ -117,22 +139,7 @@ CBCSweepChunkTD::Sweep_FixedN(AngleSet& angle_set)
 {
   CALI_CXX_MARK_SCOPE("CBCSweepChunkTD::Sweep_FixedN");
 
-  auto data = MakeCBCSweepData(discretization_,
-                               source_moments_,
-                               groupset_,
-                               xs_,
-                               num_moments_,
-                               max_num_cell_dofs_,
-                               SaveAngularFluxEnabled(),
-                               groupset_angle_group_stride_,
-                               groupset_group_stride_,
-                               destination_phi_,
-                               destination_psi_,
-                               include_rhs_time_term_,
-                               problem_,
-                               &psi_old_,
-                               group_block_size_,
-                               ctx_);
+  auto data = MakeSweepData(&psi_old_);
 
   CBC_Sweep_FixedN<NumNodes, true>(data, angle_set);
 }
@@ -145,4 +152,4 @@ template void CBCSweepChunkTD::Sweep_FixedN<6>(AngleSet&);
 template void CBCSweepChunkTD::Sweep_FixedN<7>(AngleSet&);
 template void CBCSweepChunkTD::Sweep_FixedN<8>(AngleSet&);
 
-} // namespace opensn
+} // namespace opensn
\ No newline at end of file
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h
index 5e99bb83ef..e7d870ad11 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h
@@ -10,30 +10,53 @@
 namespace opensn
 {
 
+/**
+ * Time-dependent host-side CBC sweep chunk.
+ *
+ * Identical to CBCSweepChunk but instantiates the Generic and FixedN kernels
+ * with \c time_dependent=true, adding the \f$v_g^{-1}/(\theta\Delta t)\f$
+ * time-absorption term and the previous-time-step angular flux source.
+ */
 class CBCSweepChunkTD : public SweepChunk
 {
 public:
   CBCSweepChunkTD(DiscreteOrdinatesProblem& problem, LBSGroupset& groupset);
   ~CBCSweepChunkTD() override = default;
 
+  /// Cache angle-set-level data in the sweep context for subsequent Sweep calls.
   void SetAngleSet(AngleSet& angle_set) override;
+  /// Cache cell-level data for the next Sweep call.
   void SetCell(const Cell* cell_ptr, AngleSet& angle_set) override;
+  /// Sweep the current cell for all angles and groups (time-dependent).
   void Sweep(AngleSet& angle_set) override;
+  /// Indicate this chunk uses the time-dependent kernel variant.
   bool IsTimeDependent() const override { return true; }
 
 protected:
+  /// Pointer-to-member for the selected sweep implementation.
   using SweepFunc = void (CBCSweepChunkTD::*)(AngleSet&);
+  /// Construct the aggregated sweep data struct for the current cell.
+  CBCSweepData MakeSweepData(const std::vector<double>* psi_old);
+  /// Sweep using the generic (dynamic-size) kernel.
   void Sweep_Generic(AngleSet& angle_set);
+  /// Sweep using the FixedN (compile-time node count) kernel.
   template <unsigned int NumNodes>
   void Sweep_FixedN(AngleSet& angle_set);
 
+  /// Owning discrete ordinates problem.
   DiscreteOrdinatesProblem& problem_;
+  /// Previous-time-step angular flux vector.
   const std::vector<double>& psi_old_;
+  /// Energy group block size for SIMD batch solve.
   unsigned int group_block_size_ = 0;
+  /// Cached per-cell and per-angle-set context.
   CBCSweepChunkContext ctx_;
+  /// Reusable scratch buffers for the Generic kernel.
+  CBCGenericSweepScratch generic_scratch_;
 
 private:
+  /// Selected sweep function pointer (Generic or FixedN).
   SweepFunc sweep_impl_td_ = nullptr;
 };
 
-} // namespace opensn
+} // namespace opensn
\ No newline at end of file
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h
index 7413c44ab7..4504375d9f 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h
@@ -15,71 +15,198 @@
 namespace opensn
 {
 
+/**
+ * Aggregated sweep parameters for one cell in the CBC Generic/FixedN kernels.
+ *
+ * Bundles all data needed by the per-cell sweep kernel into a single struct
+ * to avoid long parameter lists. Includes references to the spatial
+ * discretization, groupset, FLUDS, cell geometry, and unit cell matrices.
+ * Constructed once per cell by CBCSweepChunk::MakeSweepData.
+ */
 struct CBCSweepData
 {
+  /// Spatial discretization providing DOF mapping.
   const SpatialDiscretization& discretization;
+  /// Source moment vector (indexed by DOF mapping).
   const std::vector<double>& source_moments;
+  /// Groupset containing quadrature and group range.
   const LBSGroupset& groupset;
-  const BlockID2XSMap& xs;
+  /// Number of angular moments.
   unsigned int num_moments;
+  /// Maximum number of DOFs (nodes) per cell in the mesh.
   unsigned int max_num_cell_dofs;
+  /// Whether to store solved angular fluxes into destination_psi.
   bool save_angular_flux;
+  /// Stride in the psi array: num_angles_in_quadrature * num_groups.
   size_t groupset_angle_group_stride;
+  /// Stride in the psi array: num_groups.
   size_t groupset_group_stride;
+  /// Output scalar flux moments vector (accumulated across angles).
   std::vector<double>& destination_phi;
+  /// Output angular flux vector (written if save_angular_flux is true).
   std::vector<double>& destination_psi;
+  /// Whether surface source boundary conditions are active.
   bool surface_source_active;
+  /// Whether the RHS time-derivative term is included.
   bool include_rhs_time_term;
+  /// Owning discrete ordinates problem (for time-step and theta access).
   DiscreteOrdinatesProblem& problem;
+  /// Previous-time-step angular flux (nullptr for steady-state sweeps).
   const std::vector<double>* psi_old;
+  /// Energy group block size for FixedN SIMD batch solve.
   unsigned int group_block_size;
 
+  /// FLUDS providing local/nonlocal angular flux access.
   CBC_FLUDS& fluds;
+  /// Current cell being swept.
   const Cell& cell;
+  /// Local ID of the current cell.
   std::uint32_t cell_local_id;
+  /// Cell mapping providing face-node maps and DOF counts.
   const CellMapping& cell_mapping;
+  /// Transport view providing cross-section data and DOF mapping.
   CellLBSView& cell_transport_view;
+  /// Number of faces on the current cell.
   size_t cell_num_faces;
+  /// Number of nodes on the current cell.
   size_t cell_num_nodes;
 
+  /// Number of energy groups in the groupset.
   size_t gs_size;
+  /// First group index in the groupset.
   unsigned int gs_gi;
+  /// Number of angles in the current angle set.
   size_t num_angles_in_as;
+  /// Per-angle group stride (= num_groups).
   unsigned int group_stride;
+  /// Per-node angular stride (= num_angles * num_groups).
   size_t group_angle_stride;
 
+  /// Volume integral (intV_shapeI_gradshapeJ): \f$\int_V \phi_i \nabla\phi_j \, dV\f$.
   const DenseMatrix<Vector3>& G;
+  /// Mass matrix: \f$\int_V \phi_i \phi_j \, dV\f$.
   const DenseMatrix<double>& M;
+  /// Per-face surface mass matrices: \f$\int_S \phi_i \phi_j \, dS\f$.
   const std::vector<DenseMatrix<double>>& M_surf;
+  /// Per-face surface integrals: \f$\int_S \phi_i \, dS\f$.
   const std::vector<Vector<double>>& IntS_shapeI;
 };
 
+/// Pre-resolved metadata for one incoming face of the current cell.
+struct CBCIncomingFaceData
+{
+  /// Nodal mapping for the face; set for every incoming face, nullptr otherwise.
+  const FaceNodalMapping* face_nodal_mapping = nullptr;
+  /// Base pointer for local/nonlocal incoming face psi; null only for boundary faces.
+  double* psi_base = nullptr;
+};
+
+/// Pre-resolved metadata for one outgoing face of the current cell.
+struct CBCOutgoingFaceData
+{
+  /// Whether the face is a reflecting boundary.
+  bool is_reflecting_boundary_face = false;
+  /// Base pointer for local outgoing face psi, when applicable.
+  double* psi_base = nullptr;
+  /// Nonlocal face info for MPI send staging; null for local/boundary faces.
+  const CBC_FLUDSCommonData::OutgoingNonlocalFaceInfo* outgoing_nonlocal_face_info = nullptr;
+};
+
+/**
+ * Reusable scratch buffers for the CBC Generic sweep kernel.
+ *
+ * Allocated once per sweep chunk and resized lazily via EnsureCapacity.
+ * Avoids per-cell heap allocation in the hot path.
+ */
+struct CBCGenericSweepScratch
+{
+  /// Transport matrix: \f$A_{ij} = \hat\Omega \cdot G_{ij} + \text{face terms}\f$.
+  DenseMatrix<double> Amat;
+  /// Temporary copy of A with \f$\sigma_t M\f$ added, consumed by Gauss elimination.
+  DenseMatrix<double> Atemp;
+  /// Per-group RHS vectors.
+  std::vector<Vector<double>> b;
+  /// Per-node source assembly scratch.
+  std::vector<double> source;
+  /// Per-face dot product \f$\hat\Omega \cdot \hat n_f\f$.
+  std::vector<double> face_mu_values;
+  /// Per-group time-absorption coefficient \f$v_g^{-1} / (\theta \Delta t)\f$.
+  std::vector<double> tau_gsg;
+  /// Pre-resolved incoming face metadata (one per cell face).
+  std::vector<CBCIncomingFaceData> incoming_face_data;
+  /// Pre-resolved outgoing face metadata (one per cell face).
+  std::vector<CBCOutgoingFaceData> outgoing_face_data;
+  /// Pre-computed DOF indices: \c moment_dof_map[m * cell_num_nodes + i].
+  std::vector<size_t> moment_dof_map;
+
+  void
+  EnsureCapacity(const size_t max_num_cell_dofs, const size_t gs_size, const size_t cell_num_faces)
+  {
+    if (Amat.Rows() != max_num_cell_dofs or Amat.Columns() != max_num_cell_dofs)
+    {
+      Amat = DenseMatrix<double>(max_num_cell_dofs, max_num_cell_dofs);
+      Atemp = DenseMatrix<double>(max_num_cell_dofs, max_num_cell_dofs);
+    }
+
+    if (b.size() != gs_size)
+      b.assign(gs_size, Vector<double>(max_num_cell_dofs));
+    else
+      for (auto& vec : b)
+        if (vec.Rows() != max_num_cell_dofs)
+          vec = Vector<double>(max_num_cell_dofs);
+
+    if (source.size() != max_num_cell_dofs)
+      source.assign(max_num_cell_dofs, 0.0);
+
+    if (face_mu_values.size() != cell_num_faces)
+      face_mu_values.assign(cell_num_faces, 0.0);
+
+    if (incoming_face_data.size() != cell_num_faces)
+      incoming_face_data.assign(cell_num_faces, CBCIncomingFaceData{});
+
+    if (outgoing_face_data.size() != cell_num_faces)
+      outgoing_face_data.assign(cell_num_faces, CBCOutgoingFaceData{});
+  }
+};
+
+/**
+ * Generic CBC sweep kernel for one cell, parameterized by time dependence.
+ *
+ * Assembles and solves the local transport system for all angles and groups
+ * in the angle set, using dynamic-size matrices and Gauss elimination.
+ * Used when the cell node count does not match a compile-time FixedN
+ * specialization.
+ *
+ * \tparam time_dependent if true, include the time-derivative source term
+ */
 template <bool time_dependent>
 inline void
-CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set)
+CBC_Sweep_Generic(CBCSweepData& data, CBCGenericSweepScratch& scratch, AngleSet& angle_set)
 {
   const auto& groupset = data.groupset;
   const auto& m2d_op = groupset.quadrature->GetMomentToDiscreteOperator();
   const auto& d2m_op = groupset.quadrature->GetDiscreteToMomentOperator();
-
-  DenseMatrix<double> Amat(data.max_num_cell_dofs, data.max_num_cell_dofs);
-  DenseMatrix<double> Atemp(data.max_num_cell_dofs, data.max_num_cell_dofs);
-  std::vector<Vector<double>> b(data.gs_size, Vector<double>(data.max_num_cell_dofs));
-  std::vector<double> source(data.max_num_cell_dofs);
-  std::vector<double> face_mu_values(data.cell_num_faces);
+  scratch.EnsureCapacity(data.max_num_cell_dofs, data.gs_size, data.cell_num_faces);
+  auto& Amat = scratch.Amat;
+  auto& Atemp = scratch.Atemp;
+  auto& b = scratch.b;
+  auto& source = scratch.source;
+  auto& face_mu_values = scratch.face_mu_values;
 
   const auto& face_orientations = angle_set.GetSPDS().GetCellFaceOrientations()[data.cell_local_id];
-  const auto& sigma_t = data.xs.at(data.cell.block_id)->GetSigmaTotal();
+  const auto& cell_xs = data.cell_transport_view.GetXS();
+  const auto& sigma_t = cell_xs.GetSigmaTotal();
 
-  std::vector<double> tau_gsg;
+  scratch.tau_gsg.clear();
   if constexpr (time_dependent)
   {
-    const auto& inv_velg = data.xs.at(data.cell.block_id)->GetInverseVelocity();
+    const auto& inv_velg = cell_xs.GetInverseVelocity();
     const double theta = data.problem.GetTheta();
     const double inv_theta = 1.0 / theta;
     const double dt = data.problem.GetTimeStep();
     const double inv_dt = 1.0 / dt;
 
+    auto& tau_gsg = scratch.tau_gsg;
     tau_gsg.assign(data.gs_size, 0.0);
     for (size_t gsg = 0; gsg < data.gs_size; ++gsg)
       tau_gsg[gsg] = inv_velg[data.gs_gi + gsg] * inv_theta * inv_dt;
@@ -91,6 +218,66 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set)
       : nullptr;
 
   const auto& as_angle_indices = angle_set.GetAngleIndices();
+  const auto& cbc_common = dynamic_cast<const CBC_FLUDSCommonData&>(data.fluds.GetCommonData());
+  auto* const async_comm = dynamic_cast<CBC_AsynchronousCommunicator*>(angle_set.GetCommunicator());
+  auto& incoming_face_data = scratch.incoming_face_data;
+  auto& outgoing_face_data = scratch.outgoing_face_data;
+  for (size_t f = 0; f < data.cell_num_faces; ++f)
+  {
+    incoming_face_data[f] = CBCIncomingFaceData{};
+    outgoing_face_data[f] = CBCOutgoingFaceData{};
+    const auto& face = data.cell.faces[f];
+    const bool is_local_face = data.cell_transport_view.IsFaceLocal(f);
+    const bool is_boundary_face = not face.has_neighbor;
+    const auto* face_nodal_mapping =
+      &data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f);
+
+    if (face_orientations[f] == FaceOrientation::INCOMING)
+    {
+      auto& face_data = incoming_face_data[f];
+      face_data.face_nodal_mapping = face_nodal_mapping;
+      if (is_local_face)
+        face_data.psi_base =
+          data.fluds.GetLocalFacePsiPointer(data.cell_local_id, static_cast<unsigned int>(f));
+      else if (not is_boundary_face)
+        face_data.psi_base = data.fluds.GetIncomingNonlocalFacePsiPointer(
+          data.cell_local_id, static_cast<unsigned int>(f));
+    }
+
+    if (face_orientations[f] == FaceOrientation::OUTGOING)
+    {
+      auto& face_data = outgoing_face_data[f];
+      face_data.is_reflecting_boundary_face =
+        is_boundary_face and angle_set.GetBoundaries()[face.neighbor_id]->IsReflecting();
+      if (is_local_face)
+        face_data.psi_base =
+          data.fluds.GetLocalFacePsiPointer(data.cell_local_id, static_cast<unsigned int>(f));
+      if (not is_local_face and not is_boundary_face)
+        face_data.outgoing_nonlocal_face_info =
+          &cbc_common.GetOutgoingNonlocalFaceInfo(data.cell_local_id, static_cast<unsigned int>(f));
+    }
+  }
+
+  auto& moment_dof_map = scratch.moment_dof_map;
+  moment_dof_map.resize(static_cast<size_t>(data.num_moments) * data.cell_num_nodes);
+  for (unsigned int m = 0; m < data.num_moments; ++m)
+    for (size_t i = 0; i < data.cell_num_nodes; ++i)
+      moment_dof_map[static_cast<size_t>(m) * data.cell_num_nodes + i] =
+        data.cell_transport_view.MapDOF(i, m, data.gs_gi);
+
+  double* psi_new_base = nullptr;
+  double theta = 1.0;
+  double inv_theta = 1.0;
+  if (data.save_angular_flux)
+  {
+    psi_new_base = &data.destination_psi[data.discretization.MapDOFLocal(
+      data.cell, 0, groupset.psi_uk_man_, 0, 0)];
+    if constexpr (time_dependent)
+    {
+      theta = data.problem.GetTheta();
+      inv_theta = 1.0 / theta;
+    }
+  }
 
   for (size_t as_ss_idx = 0; as_ss_idx < data.num_angles_in_as; ++as_ss_idx)
   {
@@ -115,10 +302,8 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set)
         continue;
 
       const auto& face = data.cell.faces[f];
-      const bool is_local_face = data.cell_transport_view.IsFaceLocal(f);
-      const bool is_boundary_face = not face.has_neighbor;
-      const auto* face_nodal_mapping =
-        &data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f);
+      const auto& face_data = incoming_face_data[f];
+      const auto* face_nodal_mapping = face_data.face_nodal_mapping;
 
       const size_t num_face_nodes = data.cell_mapping.GetNumFaceNodes(f);
       for (size_t fi = 0; fi < num_face_nodes; ++fi)
@@ -133,13 +318,11 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set)
 
           const double* psi = nullptr;
 
-          if (is_local_face)
-            psi = data.fluds.UpwindPsi(*data.cell_transport_view.FaceNeighbor(f),
-                                       face_nodal_mapping->cell_node_mapping_[fj],
-                                       as_ss_idx);
-          else if (not is_boundary_face)
-            psi = data.fluds.NLUpwindPsi(
-              data.cell.global_id, f, face_nodal_mapping->face_node_mapping_[fj], as_ss_idx);
+          if (face_data.psi_base != nullptr)
+            psi = face_data.psi_base +
+                  static_cast<size_t>(face_nodal_mapping->face_node_mapping_[fj]) *
+                    data.group_angle_stride +
+                  as_ss_idx * data.group_stride;
           else
             psi = angle_set.PsiBoundary(face.neighbor_id,
                                         direction_num,
@@ -165,19 +348,23 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set)
     {
       double sigma_tg = sigma_t[data.gs_gi + gsg];
       if constexpr (time_dependent)
+      {
+        const auto& tau_gsg = scratch.tau_gsg;
         sigma_tg += tau_gsg[gsg];
+      }
 
       for (size_t i = 0; i < data.cell_num_nodes; ++i)
       {
         double temp_src = 0.0;
         for (unsigned int m = 0; m < data.num_moments; ++m)
         {
-          const auto ir = data.cell_transport_view.MapDOF(i, m, data.gs_gi + gsg);
+          const auto ir = moment_dof_map[static_cast<size_t>(m) * data.cell_num_nodes + i] + gsg;
           temp_src += m2d_row[m] * data.source_moments[ir];
         }
 
         if constexpr (time_dependent)
         {
+          const auto& tau_gsg = scratch.tau_gsg;
           const size_t imap =
             i * data.groupset_angle_group_stride + direction_num * data.groupset_group_stride;
           if (data.include_rhs_time_term and psi_old)
@@ -207,7 +394,7 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set)
       const auto wn_d2m = d2m_row[m];
       for (size_t i = 0; i < data.cell_num_nodes; ++i)
       {
-        const auto ir = data.cell_transport_view.MapDOF(i, m, data.gs_gi);
+        const auto ir = moment_dof_map[static_cast<size_t>(m) * data.cell_num_nodes + i];
         for (size_t gsg = 0; gsg < data.gs_size; ++gsg)
           data.destination_phi[ir + gsg] += wn_d2m * b[gsg](i);
       }
@@ -215,17 +402,6 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set)
 
     if (data.save_angular_flux)
     {
-      double* psi_new = &data.destination_psi[data.discretization.MapDOFLocal(
-        data.cell, 0, groupset.psi_uk_man_, 0, 0)];
-
-      double theta = 1.0;
-      double inv_theta = 1.0;
-      if constexpr (time_dependent)
-      {
-        theta = data.problem.GetTheta();
-        inv_theta = 1.0 / theta;
-      }
-
       for (size_t i = 0; i < data.cell_num_nodes; ++i)
       {
         const size_t imap =
@@ -237,10 +413,10 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set)
           if constexpr (time_dependent)
           {
             const double psi_old_val = psi_old ? psi_old[imap + gsg] : 0.0;
-            psi_new[imap + gsg] = inv_theta * (psi_sol + (theta - 1.0) * psi_old_val);
+            psi_new_base[imap + gsg] = inv_theta * (psi_sol + (theta - 1.0) * psi_old_val);
           }
           else
-            psi_new[imap + gsg] = psi_sol;
+            psi_new_base[imap + gsg] = psi_sol;
         }
       }
     }
@@ -251,35 +427,33 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set)
         continue;
 
       const auto& face = data.cell.faces[f];
-      const bool is_local_face = data.cell_transport_view.IsFaceLocal(f);
-      const bool is_boundary_face = not face.has_neighbor;
-      const bool is_reflecting_boundary_face =
-        (is_boundary_face and angle_set.GetBoundaries()[face.neighbor_id]->IsReflecting());
+      const auto& face_data = outgoing_face_data[f];
+      const bool is_reflecting_boundary_face = face_data.is_reflecting_boundary_face;
       const auto& IntF_shapeI = data.IntS_shapeI[f];
 
-      const int locality = data.cell_transport_view.FaceLocality(f);
       const size_t num_face_nodes = data.cell_mapping.GetNumFaceNodes(f);
-      const auto& face_nodal_mapping =
-        data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f);
-      std::vector<double>* psi_nonlocal_outgoing = nullptr;
+      double* psi_nonlocal_outgoing = nullptr;
 
-      if (not is_boundary_face and not is_local_face)
+      if (face_data.outgoing_nonlocal_face_info != nullptr)
       {
-        auto* async_comm = dynamic_cast<CBC_AsynchronousCommunicator*>(angle_set.GetCommunicator());
-        const size_t data_size_for_msg = num_face_nodes * data.group_angle_stride;
+        const auto& outgoing_nonlocal_face_info = *face_data.outgoing_nonlocal_face_info;
+        const size_t data_size_for_msg =
+          static_cast<size_t>(outgoing_nonlocal_face_info.num_face_nodes) * data.group_angle_stride;
         psi_nonlocal_outgoing =
-          &async_comm->InitGetDownwindMessageData(locality,
-                                                  face.neighbor_id,
-                                                  face_nodal_mapping.associated_face_,
-                                                  angle_set.GetID(),
-                                                  data_size_for_msg);
+          async_comm
+            ->InitGetDownwindMessageData(outgoing_nonlocal_face_info.locality,
+                                         outgoing_nonlocal_face_info.cell_global_id,
+                                         outgoing_nonlocal_face_info.associated_face,
+                                         angle_set.GetID(),
+                                         data_size_for_msg)
+            .data();
       }
 
       for (size_t fi = 0; fi < num_face_nodes; ++fi)
       {
         const int i = data.cell_mapping.MapFaceNode(f, fi);
 
-        if (is_boundary_face)
+        if (face_data.outgoing_nonlocal_face_info == nullptr and face_data.psi_base == nullptr)
         {
           for (size_t gsg = 0; gsg < data.gs_size; ++gsg)
             data.cell_transport_view.AddOutflow(
@@ -287,9 +461,9 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set)
         }
 
         double* psi = nullptr;
-        if (is_local_face)
-          psi = data.fluds.OutgoingPsi(data.cell, i, as_ss_idx);
-        else if (not is_boundary_face)
+        if (face_data.psi_base != nullptr)
+          psi = face_data.psi_base + fi * data.group_angle_stride + as_ss_idx * data.group_stride;
+        else if (face_data.outgoing_nonlocal_face_info != nullptr)
           psi = data.fluds.NLOutgoingPsi(psi_nonlocal_outgoing, fi, as_ss_idx);
         else if (is_reflecting_boundary_face)
           psi = angle_set.PsiReflected(face.neighbor_id, direction_num, data.cell_local_id, f, fi);
@@ -302,6 +476,16 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set)
   }
 }
 
+/**
+ * Fixed-node-count CBC sweep kernel with AVX/AVX512 SIMD batch solve.
+ *
+ * Specialized in cbc_avx_sweep_chunk.cc for compile-time-known node counts
+ * (4, 8, etc.), enabling stack-allocated matrices, loop unrolling, and SIMD
+ * batch Gauss elimination across multiple energy groups simultaneously.
+ *
+ * \tparam NumNodes compile-time number of cell nodes
+ * \tparam time_dependent if true, include the time-derivative source term
+ */
 template <unsigned int NumNodes, bool time_dependent>
 void CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set);
 

From d38db2ee1162c681e06a047e6777d3ea618cf3fe Mon Sep 17 00:00:00 2001
From: Eappen Nelluvelil <eappen@tamu.edu>
Date: Mon, 13 Apr 2026 14:26:12 -0500
Subject: [PATCH 6/6] CBCD V2 with aggregated communicator, multithreaded
 sweeps, and minimally sized FLUDs

---
 .../discrete_ordinates_problem.cc             | 162 +++++--
 .../sweep/angle_set/cbcd_angle_set.cu         | 390 ++++++++++++++-
 .../sweep/angle_set/cbcd_angle_set.h          | 214 ++++++++-
 .../sweep/communicators/cbcd_async_comm.cu    | 443 ++++++++++++++++++
 .../sweep/communicators/cbcd_async_comm.h     | 246 ++++++++++
 .../sweep/communicators/lock_free_queues.h    | 170 +++++++
 .../sweep/fluds/cbcd_fluds.cu                 | 320 +++++++------
 .../sweep/fluds/cbcd_fluds.h                  | 179 ++++---
 .../sweep/fluds/cbcd_fluds_common_data.cc     |  20 +-
 .../sweep/fluds/cbcd_fluds_common_data.cu     | 251 ++++++++--
 .../sweep/fluds/cbcd_fluds_common_data.h      | 123 ++++-
 .../sweep/fluds/cbcd_structs.h                |  96 +++-
 .../sweep/scheduler/sweep_scheduler.cc        |  11 +-
 .../sweep/scheduler/sweep_scheduler.cu        | 204 ++------
 .../sweep_chunks/cbc_sweep_kernels.h          |   4 +-
 .../sweep_chunks/cbcd_sweep_chunk.cu          | 180 +++++--
 .../sweep_chunks/cbcd_sweep_chunk.h           |  69 ++-
 .../sweep_chunks/gpu_kernel/solver.h          |  48 +-
 18 files changed, 2523 insertions(+), 607 deletions(-)
 create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.cu
 create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h
 create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/lock_free_queues.h

diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc
index bf9076043a..ea0240b32d 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc
@@ -8,6 +8,7 @@
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/boundary/isotropic_boundary.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/boundary/arbitrary_boundary.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/aah.h"
@@ -48,6 +49,7 @@
 #include <cassert>
 #include <cmath>
 #include <iomanip>
+#include <numeric>
 #include <sstream>
 #include <stdexcept>
 #include <thread>
@@ -1432,13 +1434,10 @@ DiscreteOrdinatesProblem::InitializeSweepDataStructures()
       std::chrono::duration<double> elapsed_seconds = end_time - start_time;
 
       const auto local_face_slots = cbc_spds_list.front()->GetMaxNumLocalPsiSlots();
-      log.Log() << program_timer.GetTimeString() << "CBC SPDS local cell-face psi slot summary\n"
-                << "   SPDS count  : 1\n"
-                << "   Elapsed     : " << elapsed_seconds.count() << " s\n"
-                << "   Max         : " << local_face_slots << "\n"
-                << "   Min         : " << local_face_slots << "\n"
-                << "   Median      : " << static_cast<double>(local_face_slots) << "\n"
-                << "   Average     : " << static_cast<double>(local_face_slots) << "\n";
+      log.Log() << "CBC SPDS cell-face slot plan calculated in " << elapsed_seconds.count()
+                << " s with 1 thread.\n"
+                << "  (max, min, avg) = (" << local_face_slots << ", " << local_face_slots << ", "
+                << static_cast<double>(local_face_slots) << ").\n";
     }
     else if (not cbc_spds_list.empty())
     {
@@ -1448,15 +1447,14 @@ DiscreteOrdinatesProblem::InitializeSweepDataStructures()
       SPMD_ThreadPool pool(num_workers);
       std::atomic<std::size_t> next_index{0};
 
-      log.Log() << program_timer.GetTimeString()
-                << " Compute max num local cell-face psi slots for " << cbc_spds_list.size()
-                << " CBC SPDS using " << num_workers << " worker threads.\n";
+      log.Log() << "Computing cell-face slot plans for " << cbc_spds_list.size()
+                << " CBC SPDS with " << num_workers << " threads.\n";
 
       auto start_time = std::chrono::steady_clock::now();
       pool.ExecuteBatch(
         [&](std::size_t /* thread ID */)
         {
-          std::size_t index;
+          std::size_t index = 0;
           // Atomically fetch the next index to work on
           // std::memory_order_relaxed is sufficient here because we need atomicity only for the
           // fetch_add operation, and there are no other synchronization requirements between
@@ -1469,42 +1467,26 @@ DiscreteOrdinatesProblem::InitializeSweepDataStructures()
         });
       auto end_time = std::chrono::steady_clock::now();
       std::chrono::duration<double> elapsed_seconds = end_time - start_time;
-      double elapsed_time = elapsed_seconds.count();
 
       size_t max_local_psi_slots = 0;
       size_t min_local_psi_slots = std::numeric_limits<size_t>::max();
-      std::vector<size_t> local_psi_slot_counts;
-      local_psi_slot_counts.reserve(cbc_spds_list.size());
+      std::uint64_t total_local_psi_slots = 0;
 
       for (const auto& spds : cbc_spds_list)
       {
         const auto local_psi_slots = spds->GetMaxNumLocalPsiSlots();
         max_local_psi_slots = std::max(max_local_psi_slots, local_psi_slots);
         min_local_psi_slots = std::min(min_local_psi_slots, local_psi_slots);
-        local_psi_slot_counts.push_back(local_psi_slots);
+        total_local_psi_slots += local_psi_slots;
       }
 
-      std::sort(local_psi_slot_counts.begin(), local_psi_slot_counts.end());
-      const auto num_counts = local_psi_slot_counts.size();
       const double avg_local_psi_slots =
-        static_cast<double>(std::accumulate(
-          local_psi_slot_counts.begin(), local_psi_slot_counts.end(), std::size_t{0})) /
-        num_counts;
-      const double median_local_psi_slots =
-        (num_counts % 2 == 1)
-          ? static_cast<double>(local_psi_slot_counts[num_counts / 2])
-          : 0.5 * static_cast<double>(local_psi_slot_counts[num_counts / 2 - 1] +
-                                      local_psi_slot_counts[num_counts / 2]);
-
-      log.Log() << program_timer.GetTimeString()
-                << " CBC SPDS local cell-face psi slot statistics\n"
-                << "    SPDS count : " << cbc_spds_list.size() << "\n"
-                << "    Workers    : " << num_workers << "\n"
-                << "    Elapsed    : " << elapsed_time << " s\n"
-                << "    Max        : " << max_local_psi_slots << "\n"
-                << "    Min        : " << min_local_psi_slots << "\n"
-                << "    Median     : " << median_local_psi_slots << "\n"
-                << "    Average    : " << avg_local_psi_slots << "\n";
+        static_cast<double>(total_local_psi_slots) / static_cast<double>(cbc_spds_list.size());
+
+      log.Log() << "CBC SPDS cell-face slot plans calculated in " << elapsed_seconds.count()
+                << " s.\n"
+                << "  (avg, max, min) = (" << avg_local_psi_slots << " slots, "
+                << max_local_psi_slots << " slots, " << min_local_psi_slots << " slots).";
     }
   }
   else
@@ -1842,6 +1824,12 @@ DiscreteOrdinatesProblem::InitFluxDataStructures(LBSGroupset& groupset)
   groupset.angle_agg =
     std::make_shared<AngleAggregation>(sweep_boundaries_, groupset.quadrature, grid_);
 
+  std::vector<std::size_t> cbc_fluds_local_psi_bytes;
+  std::vector<std::size_t> cbc_fluds_boundary_nonlocal_bytes;
+  const auto num_local_spatial_dofs = discretization_->GetNumLocalDOFs(groupset.psi_uk_man_) /
+                                      groupset.psi_uk_man_.GetNumberOfUnknowns() / gs_num_grps;
+  std::uint64_t full_local_psi_storage_bytes = 0;
+
   size_t angle_set_id = 0;
   for (const auto& so_grouping : unique_so_groupings)
   {
@@ -1910,8 +1898,11 @@ DiscreteOrdinatesProblem::InitFluxDataStructures(LBSGroupset& groupset)
       else if (sweep_type_ == "CBC")
       {
         std::shared_ptr<FLUDS> fluds;
+        std::size_t boundary_nonlocal_bytes = 0;
         if (use_gpus_)
         {
+          const auto& cbcd_common_data =
+            dynamic_cast<const CBCD_FLUDSCommonData&>(fluds_common_data);
           fluds = CreateCBCD_FLUDS(gs_num_grps,
                                    angle_indices.size(),
                                    grid_->local_cells.size(),
@@ -1919,6 +1910,13 @@ DiscreteOrdinatesProblem::InitFluxDataStructures(LBSGroupset& groupset)
                                    groupset.psi_uk_man_,
                                    *discretization_,
                                    (not GetPsiNewLocal()[groupset.id].empty()));
+
+          const auto num_groups_and_angles = gs_num_grps * angle_indices.size();
+          boundary_nonlocal_bytes = (cbcd_common_data.GetNumIncomingBoundaryNodes() +
+                                     cbcd_common_data.GetNumOutgoingBoundaryNodes() +
+                                     cbcd_common_data.GetNumIncomingNonlocalNodes() +
+                                     cbcd_common_data.GetNumOutgoingNonlocalNodes()) *
+                                    num_groups_and_angles * sizeof(double);
         }
         else
         {
@@ -1926,7 +1924,46 @@ DiscreteOrdinatesProblem::InitFluxDataStructures(LBSGroupset& groupset)
             gs_num_grps,
             angle_indices.size(),
             dynamic_cast<const CBC_FLUDSCommonData&>(fluds_common_data));
+
+          const auto& cbc_common_data = dynamic_cast<const CBC_FLUDSCommonData&>(fluds_common_data);
+          const auto num_groups_and_angles = gs_num_grps * angle_indices.size();
+          constexpr std::size_t local_psi_alignment = 64;
+          constexpr std::size_t doubles_per_cache_line = local_psi_alignment / sizeof(double);
+          const auto round_up_to_cache_line_multiple = [](std::size_t value)
+          {
+            return ((value + doubles_per_cache_line - 1) / doubles_per_cache_line) *
+                   doubles_per_cache_line;
+          };
+
+          for (std::size_t face_storage_index = 0;
+               face_storage_index < cbc_common_data.GetNumCellFaces();
+               ++face_storage_index)
+          {
+            const auto& face_info =
+              cbc_common_data.GetIncomingNonlocalFaceInfoByStorageIndex(face_storage_index);
+            if (face_info.num_face_nodes == 0)
+              continue;
+            boundary_nonlocal_bytes +=
+              round_up_to_cache_line_multiple(static_cast<std::size_t>(face_info.num_face_nodes) *
+                                              num_groups_and_angles) *
+              sizeof(double);
+          }
+        }
+
+        if (use_gpus_)
+        {
+          const auto& cbc_spds = dynamic_cast<const CBC_SPDS&>(fluds_common_data.GetSPDS());
+          cbc_fluds_local_psi_bytes.push_back(cbc_spds.GetMaxNumLocalPsiSlots() *
+                                              cbc_spds.GetMaxLocalFaceNodeCount() * gs_num_grps *
+                                              angle_indices.size() * sizeof(double));
         }
+        else
+          cbc_fluds_local_psi_bytes.push_back(
+            dynamic_cast<const CBC_FLUDS&>(*fluds).GetLocalPsiBufferSize());
+        cbc_fluds_boundary_nonlocal_bytes.push_back(boundary_nonlocal_bytes);
+
+        full_local_psi_storage_bytes +=
+          num_local_spatial_dofs * gs_num_grps * angle_indices.size() * sizeof(double);
 
         std::shared_ptr<AngleSet> angle_set;
         if (use_gpus_)
@@ -1957,6 +1994,61 @@ DiscreteOrdinatesProblem::InitFluxDataStructures(LBSGroupset& groupset)
     } // for an_ss
   } // for so_grouping
 
+  if (sweep_type_ == "CBC" and not cbc_fluds_local_psi_bytes.empty())
+  {
+    // Report aggregate psi storage for the FLUDS instances created above. Only totals are logged,
+    // so no per-instance min/max is computed. "Savings" compares the local psi buffers against
+    // full_local_psi_storage_bytes, i.e. one double per local spatial DOF, group and angle summed
+    // over the same angle sets.
+    const auto total_local_psi_storage = std::accumulate(
+      cbc_fluds_local_psi_bytes.begin(), cbc_fluds_local_psi_bytes.end(), std::uint64_t{0});
+    const auto total_boundary_nonlocal_storage =
+      std::accumulate(cbc_fluds_boundary_nonlocal_bytes.begin(),
+                      cbc_fluds_boundary_nonlocal_bytes.end(),
+                      std::uint64_t{0});
+    const auto total_managed_psi_storage =
+      total_local_psi_storage + total_boundary_nonlocal_storage;
+    std::ostringstream savings_out;
+    if (full_local_psi_storage_bytes > 0) // avoid dividing by zero when nothing was counted
+      savings_out << 100.0 * (1.0 - (static_cast<double>(total_local_psi_storage) /
+                                     static_cast<double>(full_local_psi_storage_bytes)))
+                  << "%.";
+    else
+      savings_out << "N/A.";
+    const auto format_bytes = [](const std::uint64_t bytes)
+    {
+      constexpr std::pair<double, const char*> units[] = {
+        {1024.0 * 1024.0 * 1024.0, "GiB"}, {1024.0 * 1024.0, "MiB"}, {1024.0, "KiB"}, {1.0, "B"}};
+      const auto bytes_as_double = static_cast<double>(bytes);
+
+      for (const auto& [scale, suffix] : units) // largest unit first
+      {
+        if (bytes_as_double >= scale or scale == 1.0) // the 1 B entry always matches
+        {
+          std::ostringstream out;
+          const double value = bytes_as_double / scale;
+          const int precision = (scale == 1.0 or value >= 100.0) ? 0 : (value >= 10.0 ? 1 : 2);
+          out << std::fixed << std::setprecision(precision) << value << ' ' << suffix;
+          return out.str();
+        }
+      }
+
+      return std::string("0 B"); // not reached; keeps a return on every path
+    };
+
+    log.Log() << (use_gpus_ ? "CBCD FLUDS" : "CBC FLUDS") << " psi storage usage across "
+              << cbc_fluds_local_psi_bytes.size() << " FLUDS instances.\n"
+              << "  Total local psi storage and savings: (" << format_bytes(total_local_psi_storage)
+              << ", " << savings_out.str() << ")\n"
+              << "  Total boundary/non-local storage: "
+              << format_bytes(total_boundary_nonlocal_storage) << ".\n"
+              << "  Total managed local/boundary/non-local psi storage: "
+              << format_bytes(total_managed_psi_storage) << ".\n";
+  }
+
+  if (options_.verbose_inner_iterations)
+    log.Log() << program_timer.GetTimeString() << " Initialized angle aggregation.";
+
   opensn::mpi_comm.barrier();
 }
 
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.cu
index 6ea54c785e..0b0a6f1bb3 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.cu
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.cu
@@ -2,15 +2,17 @@
 // SPDX-License-Identifier: MIT
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.h"
-#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h"
-#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/sweep_chunk.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/boundary/sweep_boundary.h"
 #include "framework/mesh/mesh_continuum/mesh_continuum.h"
-#include "framework/data_types/range.h"
-#include "framework/logging/log.h"
-#include "framework/runtime.h"
 #include "caliper/cali.h"
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <thread>
 
 namespace opensn
 {
@@ -24,16 +26,22 @@ CBCD_AngleSet::CBCD_AngleSet(size_t id,
                              const MPICommunicatorSet& comm_set)
   : AngleSet(id, num_groups, spds, fluds, angle_indices, boundaries),
     cbc_spds_(dynamic_cast<const CBC_SPDS&>(spds)),
-    async_comm_(id, *fluds, comm_set),
+    comm_set_(comm_set),
+    cbcd_fluds_(static_cast<CBCD_FLUDS&>(*fluds_)),
     stream_(),
     device_angle_indices_(angles_.size())
 {
+  boundary_ptrs_.reserve(boundaries_.size());
+  for (auto& [bid, bndry] : boundaries_)
+    boundary_ptrs_.emplace(bid, bndry.get());
+
   crb::MemoryPinningManager angle_indices_pinner_(angles_);
   crb::copy(device_angle_indices_, angle_indices_pinner_, angles_.size(), 0, 0, stream_);
-  // Set CBCD_FLUDS stream and asynchronously allocate storage for local psi
-  auto* cbcd_fluds = std::static_pointer_cast<CBCD_FLUDS>(fluds_).get();
-  cbcd_fluds->GetStream() = stream_;
-  cbcd_fluds->AllocateLocalAndSavedPsi();
+  cbcd_fluds_.GetStream() = stream_;
+  cbcd_fluds_.AllocateLocalAndSavedPsi();
+  cbcd_fluds_.InitializeReflectingBoundaryNodes(boundaries_);
+  InitializeTaskGraphData();
+  InitializeReflectingTaskMask();
 }
 
 CBCD_AngleSet::~CBCD_AngleSet()
@@ -43,22 +51,353 @@ CBCD_AngleSet::~CBCD_AngleSet()
 AsynchronousCommunicator*
 CBCD_AngleSet::GetCommunicator()
 {
-  return static_cast<AsynchronousCommunicator*>(&async_comm_);
+  return nullptr; // async_comm_ is used directly by this class and is not exposed here
+}
+
+void
+CBCD_AngleSet::UpdateSweepDependencies(std::set<AngleSet*>& following_angle_sets)
+{
+  for (AngleSet* downstream : following_angle_sets)
+  {
+    auto* const follower = static_cast<CBCD_AngleSet*>(downstream); // assumed CBCD_AngleSet
+    ++(follower->num_dependencies_); // this set is one more upstream dependency of the follower
+    following_angle_sets_.push_back(follower);
+  }
+}
+
+void
+CBCD_AngleSet::ResetDependencyCounter()
+{
+  dependency_counter_.store(num_dependencies_, std::memory_order_relaxed); // full upstream count
+}
+
+bool
+CBCD_AngleSet::IsOutgoingReflectingFace(const CellFace& face,
+                                        const std::uint64_t cell_local_id,
+                                        const std::size_t face_id) const
+{
+  if ((face.has_neighbor) or // a face with a neighbor cell is never looked up as a boundary
+      (cbc_spds_.GetCellFaceOrientations()[cell_local_id][face_id] != FaceOrientation::OUTGOING))
+    return false;
+  const auto boundary_it = boundary_ptrs_.find(face.neighbor_id); // looked up as a boundary ID
+  return ((boundary_it != boundary_ptrs_.end()) and (boundary_it->second->IsReflecting()));
+}
+
+void
+CBCD_AngleSet::InitializeReflectingTaskMask()
+{
+  // Rebuild all reflecting-boundary bookkeeping from scratch: the per-task mask, the list of
+  // reflecting boundaries, and the count of tasks that write to one. The count is reset along
+  // with the other two so that a repeated call cannot double count tasks.
+  const auto& task_list = cbc_spds_.GetTaskList();
+  cell_has_outgoing_reflecting_boundary_.assign(task_list.size(), 0);
+  initial_reflecting_task_count_ = 0;
+  reflecting_boundaries_.clear();
+  reflecting_boundaries_.reserve(boundaries_.size());
+  for (auto& [_, bndry] : boundaries_)
+    if (bndry->IsReflecting())
+      reflecting_boundaries_.push_back(bndry.get());
+
+  // Flag every task whose cell has at least one outgoing face on a reflecting boundary.
+  for (std::size_t task_idx = 0; task_idx < task_list.size(); ++task_idx)
+  {
+    const auto& cell = *task_list[task_idx].cell_ptr;
+    bool has_outgoing_reflecting_face = false;
+    // Stop at the first qualifying face; one is enough to mark the task.
+    for (std::size_t f = 0; (f < cell.faces.size()) and (not has_outgoing_reflecting_face); ++f)
+      has_outgoing_reflecting_face = IsOutgoingReflectingFace(cell.faces[f], cell.local_id, f);
+
+    if (has_outgoing_reflecting_face)
+    {
+      cell_has_outgoing_reflecting_boundary_[task_idx] = 1;
+      ++initial_reflecting_task_count_;
+    }
+  }
+}
+
+void
+CBCD_AngleSet::InitializeTaskGraphData()
+{
+  if (not initial_deps_.empty()) // already built on an earlier call
+    return;
+
+  const auto& task_list = cbc_spds_.GetTaskList();
+  num_tasks_ = task_list.size();
+
+  initial_deps_.resize(num_tasks_);
+  remaining_deps_.resize(num_tasks_);
+  successor_offsets_.assign(num_tasks_ + 1, 0); // successors of task i: [offsets[i], offsets[i+1])
+  initial_ready_cell_ids_.clear();
+  initial_ready_cell_ids_.reserve(num_tasks_);
+
+  for (std::size_t task_idx = 0; task_idx < task_list.size(); ++task_idx)
+  {
+    const auto& task = task_list[task_idx];
+    initial_deps_[task_idx] = static_cast<int>(task.num_dependencies);
+    successor_offsets_[task_idx + 1] = static_cast<std::uint32_t>(task.successors.size());
+    if (task.num_dependencies == 0) // nothing to wait for: part of the initial ready set
+      initial_ready_cell_ids_.push_back(static_cast<std::uint32_t>(task_idx));
+  }
+
+  for (std::size_t task_idx = 0; task_idx < num_tasks_; ++task_idx) // counts -> running offsets
+    successor_offsets_[task_idx + 1] += successor_offsets_[task_idx];
+
+  successor_data_.resize(successor_offsets_.back()); // total number of successor entries
+  for (std::size_t task_idx = 0; task_idx < task_list.size(); ++task_idx)
+  {
+    const auto& task = task_list[task_idx];
+    std::copy(task.successors.begin(),
+              task.successors.end(),
+              successor_data_.begin() + successor_offsets_[task_idx]);
+  }
+}
+
+void
+CBCD_AngleSet::InitializeTaskState()
+{
+  std::copy(initial_deps_.begin(), initial_deps_.end(), remaining_deps_.begin()); // fresh counts
+  batch_state_.Reset(); // ready buffer index returns to 0 before it is refilled below
+  auto& ready_cell_ids = cbcd_fluds_.GetLocalCellIDs(batch_state_.ready_buffer_index);
+  ready_cell_ids.clear();
+  ready_cell_ids.insert(
+    ready_cell_ids.end(), initial_ready_cell_ids_.begin(), initial_ready_cell_ids_.end());
+  num_completed_tasks_ = 0; // next line: nothing is tracked when no angle set follows this one
+  pending_reflecting_tasks_ = following_angle_sets_.empty() ? 0 : initial_reflecting_task_count_;
+}
+
+bool
+CBCD_AngleSet::TryRetireCompletedBatch()
+{
+  if ((not batch_state_.kernel_in_flight) or (not stream_.is_completed()))
+    return false; // nothing launched, or the stream has not finished yet
+
+  auto& completed_cell_ids = cbcd_fluds_.GetLocalCellIDs(batch_state_.launch_buffer_index);
+  for (std::uint32_t i = 0; i < batch_state_.launch_count; ++i)
+  {
+    const auto cell_local_id = completed_cell_ids[i];
+    const auto succ_begin = successor_offsets_[cell_local_id];
+    const auto succ_end = successor_offsets_[cell_local_id + 1];
+    for (auto succ_i = succ_begin; succ_i < succ_end; ++succ_i)
+    {
+      if (--remaining_deps_[successor_data_[succ_i]] == 0) // last dependency met: queue successor
+        cbcd_fluds_.GetLocalCellIDs(batch_state_.ready_buffer_index)
+          .push_back(successor_data_[succ_i]);
+    }
+
+    if ((not following_angle_sets_.empty()) and (not following_angle_sets_notified_) and
+        (cell_has_outgoing_reflecting_boundary_[cell_local_id] != 0))
+    {
+      assert(pending_reflecting_tasks_ > 0);
+      --pending_reflecting_tasks_; // followers can be notified once this reaches zero
+    }
+  }
+
+  num_completed_tasks_ += batch_state_.launch_count;
+  batch_state_.completed_buffer_index = batch_state_.launch_buffer_index; // hand over for flushing
+  batch_state_.completed_count = batch_state_.launch_count;
+  batch_state_.completed_batch_pending = true;
+  batch_state_.launch_count = 0;
+  batch_state_.kernel_in_flight = false;
+  return true;
+}
+
+bool
+CBCD_AngleSet::TryLaunchReadyBatch(CBCDSweepChunk& sweep_chunk)
+{
+  auto& ready_cell_ids = cbcd_fluds_.GetLocalCellIDs(batch_state_.ready_buffer_index);
+  if (batch_state_.kernel_in_flight or ready_cell_ids.empty())
+    return false; // at most one launch outstanding, and never an empty launch
+
+  const auto launch_count = static_cast<std::uint32_t>(ready_cell_ids.size());
+  batch_state_.launch_buffer_index = batch_state_.ready_buffer_index;
+  batch_state_.launch_count = launch_count;
+  batch_state_.ready_buffer_index = batch_state_.AcquireFreeBuffer(); // new arrivals go elsewhere
+  cbcd_fluds_.GetLocalCellIDs(batch_state_.ready_buffer_index).clear();
+  sweep_chunk.Sweep(launch_count, GetID(), ready_cell_ids.data()); // still the launch buffer
+  batch_state_.kernel_in_flight = true;
+  return true;
+}
+
+void
+CBCD_AngleSet::FlushCompletedBatch(CBCDSweepChunk& sweep_chunk)
+{
+  if (not batch_state_.completed_batch_pending)
+    return;
+
+  auto& completed_cell_ids = cbcd_fluds_.GetLocalCellIDs(batch_state_.completed_buffer_index);
+  cbcd_fluds_.CopyOutgoingPsiBackToHost(
+    sweep_chunk,
+    *async_comm_, // NOTE(review): requires a prior SetCommunicator() call; not checked here
+    GetID(),
+    GetAngleIndices(),
+    {completed_cell_ids.data(), static_cast<std::size_t>(batch_state_.completed_count)});
+  completed_cell_ids.clear();
+  batch_state_.ReleaseBuffer(batch_state_.completed_buffer_index); // buffer can be reused
+  batch_state_.completed_buffer_index = 0;
+  batch_state_.completed_count = 0;
+  batch_state_.completed_batch_pending = false;
+  TryNotifyFollowingAngleSets(); // returns early while reflecting tasks are still pending
+}
+
+void
+CBCD_AngleSet::TryNotifyFollowingAngleSets()
+{
+  if (following_angle_sets_notified_)
+    return;
+
+  if (following_angle_sets_.empty())
+  {
+    following_angle_sets_notified_ = true; // nothing to notify
+    return;
+  }
+
+  if (pending_reflecting_tasks_ != 0) // reflecting-boundary tasks are still outstanding
+    return;
+  // Mark angles ready first; the release below pairs with the acquire load in TryInitialize.
+  for (auto* boundary : reflecting_boundaries_)
+    boundary->UpdateAnglesReadyStatus(angles_);
+  for (auto* following_angle_set : following_angle_sets_)
+  {
+    [[maybe_unused]] const auto old_value = // read only by the assert, which NDEBUG removes
+      following_angle_set->dependency_counter_.fetch_sub(1, std::memory_order_release);
+    assert(old_value > 0);
+  }
+  following_angle_sets_notified_ = true;
+}
+
+bool
+CBCD_AngleSet::TryInitialize(CBCDSweepChunk& sweep_chunk)
+{
+  // Do nothing if already initialized or if an upstream angle set has not released us yet.
+  if (boundary_data_initialized_ or (dependency_counter_.load(std::memory_order_acquire) != 0))
+    return false;
+
+  CALI_CXX_MARK_SCOPE("CBCD_AngleSet::TryInitialize");
+
+  // Copy incoming boundary psi to the device, then reset the per-sweep task state.
+  cbcd_fluds_.CopyIncomingBoundaryPsiToDevice(sweep_chunk, this);
+  InitializeTaskState();
+  boundary_data_initialized_ = true;
+  return true;
+}
+
+bool
+CBCD_AngleSet::TryAdvanceOneStep(CBCDSweepChunk& cbcd_sweep_chunk)
+{
+  CALI_CXX_MARK_SCOPE("CBCD_AngleSet::TryAdvanceOneStep");
+
+  if (executed_ or (not boundary_data_initialized_))
+    return false; // already finalized, or TryInitialize has not succeeded yet
+  // Bound to the buffer that is "ready" on entry; it is only read before a launch swaps buffers.
+  auto& ready_cell_ids = cbcd_fluds_.GetLocalCellIDs(batch_state_.ready_buffer_index);
+  const bool kernel_completed = batch_state_.kernel_in_flight and stream_.is_completed();
+  const bool has_incoming = async_comm_->HasIncoming(GetID());
+  const bool can_finalize = (num_completed_tasks_ == num_tasks_) and
+                            (not batch_state_.kernel_in_flight) and
+                            (not batch_state_.completed_batch_pending);
+
+  if ((not kernel_completed) and (not batch_state_.completed_batch_pending) and
+      ready_cell_ids.empty() and (not has_incoming) and (not can_finalize))
+    return false; // no stage below has anything to do
+
+  bool work_done = false;
+
+  // Retire a completed kernel batch before processing new arrivals.
+  if (kernel_completed)
+  {
+    CALI_CXX_MARK_SCOPE("CBCD_AngleSet::RetireBatch");
+    work_done |= TryRetireCompletedBatch();
+  }
+
+  // Consume any newly received non-local face data and release newly ready cells.
+  if (has_incoming)
+  {
+    CALI_CXX_MARK_SCOPE("CBCD_AngleSet::ProcessIncoming");
+    work_done |= async_comm_->ProcessIncoming(
+      GetID(),
+      [this](const IncomingFaceBatch& batch)
+      {
+        const auto* psi_base = batch.psi_data.data();
+        for (const auto& entry : batch.entries)
+        {
+          const auto cell_local_id = cbcd_fluds_.ScatterReceivedFaceData(
+            batch.source_slot, entry.source_face_index, psi_base + entry.payload_offset);
+          if (--remaining_deps_[cell_local_id] == 0) // that face was the cell's last dependency
+            cbcd_fluds_.GetLocalCellIDs(batch_state_.ready_buffer_index)
+              .push_back(static_cast<std::uint32_t>(cell_local_id));
+        }
+      });
+  }
+
+  // Launch the next batch once the stream is idle.
+  if ((not batch_state_.kernel_in_flight) and (not ready_cell_ids.empty()))
+  {
+    CALI_CXX_MARK_SCOPE("CBCD_AngleSet::LaunchBatch");
+    work_done |= TryLaunchReadyBatch(cbcd_sweep_chunk);
+  }
+
+  // Flush the completed batch after launching the next one so host packing
+  // overlaps with device execution when another batch is ready.
+  if (batch_state_.completed_batch_pending)
+  {
+    CALI_CXX_MARK_SCOPE("CBCD_AngleSet::FlushBatch");
+    FlushCompletedBatch(cbcd_sweep_chunk);
+    work_done = true;
+  }
+
+  // Finalize once all tasks are done and no kernel is in flight.
+  if (num_completed_tasks_ == num_tasks_ and (not batch_state_.kernel_in_flight) and
+      (not batch_state_.completed_batch_pending))
+  {
+    CALI_CXX_MARK_SCOPE("CBCD_AngleSet::FinalizeCompletion");
+    async_comm_->SignalAngleSetComplete(GetID());
+    TryNotifyFollowingAngleSets(); // no-op if followers were already notified
+    executed_ = true;
+    cbcd_fluds_.CopySavedPsiFromDevice();
+    cbcd_fluds_.CopySavedPsiToDestinationPsi(cbcd_sweep_chunk, this);
+    return true;
+  }
+
+  return work_done;
 }
 
 AngleSetStatus
 CBCD_AngleSet::AngleSetAdvance(SweepChunk& sweep_chunk, AngleSetStatus permission)
 {
-  OpenSnLogicalError("CBCD_AngleSet::AngleSetAdvance should not be called. Routine is handled by "
-                     "SweepScheduler::ScheduleAlgoAsyncFIFO.");
+  CALI_CXX_MARK_SCOPE("CBCD_AngleSet::AngleSetAdvance");
+
+  // A finished angle set stays finished until ResetSweepBuffers() clears executed_.
+  if (executed_)
+    return AngleSetStatus::FINISHED;
+
+  auto& chunk = static_cast<CBCDSweepChunk&>(sweep_chunk);
+
+  // Initialization is attempted only while it is still outstanding; TryInitialize declines while
+  // upstream angle sets are pending, and NOT_FINISHED is reported in that case.
+  if ((not boundary_data_initialized_) and (not TryInitialize(chunk)))
+    return AngleSetStatus::NOT_FINISHED;
+
+  // Step the batch state machine until it finalizes, yielding the thread whenever a step
+  // reports that it made no progress.
+  while (not executed_)
+    if (not TryAdvanceOneStep(chunk))
+      std::this_thread::yield();
+
+  return AngleSetStatus::FINISHED;
 }
 
 void
 CBCD_AngleSet::ResetSweepBuffers()
 {
-  current_task_list_.clear();
-  async_comm_.Reset();
-  fluds_->ClearLocalAndReceivePsi();
+  batch_state_.Reset(); // back to: buffer 0 ready, the other buffers free
+  for (std::size_t i = 0; i < batch_state_.free_buffer_indices.size(); ++i) // every ID buffer
+    cbcd_fluds_.GetLocalCellIDs(i).clear();
+  cbcd_fluds_.ClearLocalAndReceivePsi();
+  num_completed_tasks_ = 0;
+  pending_reflecting_tasks_ = 0;
+  boundary_data_initialized_ = false; // forces TryInitialize to run again
+  following_angle_sets_notified_ = false;
+  ResetDependencyCounter(); // wait for all upstream angle sets again
   executed_ = false;
 }
 
@@ -71,13 +410,12 @@ CBCD_AngleSet::PsiBoundary(uint64_t boundary_id,
                            unsigned int g,
                            bool surface_source_active)
 {
-  if (boundaries_[boundary_id]->IsReflecting())
-    return boundaries_[boundary_id]->PsiIncoming(cell_local_id, face_num, fi, angle_num, g);
-
-  if (not surface_source_active)
-    return boundaries_[boundary_id]->ZeroFlux(g);
-
-  return boundaries_[boundary_id]->PsiIncoming(cell_local_id, face_num, fi, angle_num, g);
+  const auto boundary_it = boundary_ptrs_.find(boundary_id);
+  assert(boundary_it != boundary_ptrs_.end());
+  auto* boundary = boundary_it->second;
+  if (not boundary->IsReflecting() and (not surface_source_active))
+    return boundary->ZeroFlux(g);
+  return boundary->PsiIncoming(cell_local_id, face_num, fi, angle_num, g);
 }
 
 double*
@@ -87,7 +425,9 @@ CBCD_AngleSet::PsiReflected(uint64_t boundary_id,
                             unsigned int face_num,
                             unsigned int fi)
 {
-  return boundaries_[boundary_id]->PsiOutgoing(cell_local_id, face_num, fi, angle_num);
+  const auto boundary_it = boundary_ptrs_.find(boundary_id);
+  assert(boundary_it != boundary_ptrs_.end());
+  return boundary_it->second->PsiOutgoing(cell_local_id, face_num, fi, angle_num);
 }
 
-} // namespace opensn
\ No newline at end of file
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.h
index 9842d4e21a..4a04046eda 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.h
@@ -4,21 +4,100 @@
 #pragma once
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/angle_set.h"
-#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h"
 #include "caribou/main.hpp"
-#include <memory>
+#include <array>
+#include <atomic>
+#include <set>
+#include <unordered_map>
 
 namespace crb = caribou;
 
 namespace opensn
 {
 
+class CBCD_FLUDS;
 class CBC_SPDS;
+class CBCDSweepChunk;
+class CellFace;
 
-/// CBC angle set for device.
+/**
+ * CBCD angle set with task-graph-driven batched execution.
+ *
+ * Manages the host-side state machine for one device-resident CBCD angle set.
+ * The angle set waits for upstream dependencies, launches ready-cell batches on
+ * its stream, drains received non-local face data, and flushes completed outgoing
+ * data through the aggregated communicator.
+ */
 class CBCD_AngleSet : public AngleSet
 {
 public:
+  /// Host-side bookkeeping for the three cell-ID buffers cycled through ready/launch/completed.
+  struct BatchState
+  {
+    /// Index of the buffer that collects cells which have become ready to sweep.
+    std::uint8_t ready_buffer_index = 0;
+    /// Index of the buffer handed to the kernel launch that is currently outstanding.
+    std::uint8_t launch_buffer_index = 0;
+    /// Index of the buffer whose batch has completed but has not been flushed yet.
+    std::uint8_t completed_buffer_index = 0;
+    /// Free-list storage used as a stack; only the first num_free_buffers entries are meaningful.
+    std::array<std::uint8_t, 3> free_buffer_indices = {1, 2, 0};
+    /// Number of valid entries at the front of free_buffer_indices.
+    std::uint8_t num_free_buffers = 2;
+    /// Cell count of the launch that is currently outstanding.
+    std::uint32_t launch_count = 0;
+    /// Cell count of the completed batch that is waiting to be flushed.
+    std::uint32_t completed_count = 0;
+    /// True while a kernel launch is outstanding.
+    bool kernel_in_flight = false;
+    /// True while a completed batch is waiting to be flushed.
+    bool completed_batch_pending = false;
+
+    /**
+     * Restore every field to its declared default.
+     *
+     * Afterwards buffer 0 is the ready buffer and buffers 1 and 2 are on the free list, exactly
+     * as in a freshly constructed BatchState.
+     */
+    void Reset() { *this = BatchState{}; }
+
+    /**
+     * Pop the top buffer index off the free list.
+     *
+     * \return Index of a buffer that is no longer on the free list.
+     */
+    std::uint8_t AcquireFreeBuffer()
+    {
+      assert(num_free_buffers > 0);
+      --num_free_buffers;
+      return free_buffer_indices[num_free_buffers];
+    }
+
+    /**
+     * Push a buffer index back onto the free list.
+     *
+     * \param buffer_index Buffer index to release.
+     */
+    void ReleaseBuffer(const std::uint8_t buffer_index)
+    {
+      assert(num_free_buffers < free_buffer_indices.size());
+      free_buffer_indices[num_free_buffers] = buffer_index;
+      ++num_free_buffers;
+    }
+  };
+
+  /**
+   * Construct one CBCD angle set.
+   *
+   * \param id Angle-set ID.
+   * \param num_groups Number of groups in the angle set.
+   * \param spds Sweep plane data structure for this angle set.
+   * \param fluds Device FLUDS for this angle set.
+   * \param angle_indices Global angle indices represented by this angle set.
+   * \param boundaries Sweep-boundary table indexed by boundary ID.
+   * \param comm_set MPI communicator set used to build the aggregated communicator.
+   */
   CBCD_AngleSet(size_t id,
                 size_t num_groups,
                 const SPDS& spds,
@@ -29,24 +108,60 @@ class CBCD_AngleSet : public AngleSet
 
   ~CBCD_AngleSet();
 
+  /// Register following angle sets and initialize their startup dependency counts.
+  void UpdateSweepDependencies(std::set<AngleSet*>& following_angle_sets) override;
+
+  /// Reset the unresolved angle-set dependency counter before a sweep.
+  void ResetDependencyCounter();
+
+  /// Return the delayed-data communicator for this angle set.
   AsynchronousCommunicator* GetCommunicator() override;
 
+  /// Bind the angle set to the sweep-chunk-owned aggregated communicator.
+  void SetCommunicator(CBCD_AsynchronousCommunicator& async_comm) { async_comm_ = &async_comm; }
+
+  /// Return the communicator set used to construct the aggregated communicator.
+  const MPICommunicatorSet& GetCommunicatorSet() const { return comm_set_; }
+
   void InitializeDelayedUpstreamData() override {}
 
+  /// Return the buffered-message limit, which is always zero for this angle set.
   int GetMaxBufferMessages() const override { return 0; }
 
-  void SetMaxBufferMessages(int new_max) override {}
+  /// Ignore the requested buffered-message limit; GetMaxBufferMessages() always returns 0.
+  void SetMaxBufferMessages(int) override {}
+
+  /**
+   * Initialize the angle set once all upstream angle-set dependencies are resolved.
+   *
+   * Copies incoming boundary data to the device, resets per-sweep task state, and
+   * marks the angle set ready for batched execution.
+   *
+   * \param sweep_chunk Owning CBCD sweep chunk.
+   * \return True when initialization was performed on this call.
+   */
+  bool TryInitialize(CBCDSweepChunk& sweep_chunk);
+
+  /**
+   * Advance the angle set by at most one scheduler step.
+   *
+   * One step may retire a completed batch, drain newly received faces, launch the
+   * next ready batch, flush completed outgoing data, or finalize the angle set.
+   *
+   * \param sweep_chunk Owning CBCD sweep chunk.
+   * \return True when any forward progress was made.
+   */
+  bool TryAdvanceOneStep(CBCDSweepChunk& sweep_chunk);
 
   AngleSetStatus AngleSetAdvance(SweepChunk& sweep_chunk, AngleSetStatus permission) override;
 
-  AngleSetStatus FlushSendBuffers() override
-  {
-    const bool all_messages_sent = async_comm_.SendData();
-    return all_messages_sent ? AngleSetStatus::MESSAGES_SENT : AngleSetStatus::MESSAGES_PENDING;
-  }
+  /// Always report MESSAGES_SENT; this override performs no send work itself.
+  AngleSetStatus FlushSendBuffers() override { return AngleSetStatus::MESSAGES_SENT; }
 
+  /// Reset per-sweep state and buffers.
   void ResetSweepBuffers() override;
 
+  /// Always report delayed upstream data as received.
   bool ReceiveDelayedData() override { return true; }
 
   const double* PsiBoundary(uint64_t boundary_id,
@@ -63,20 +178,91 @@ class CBCD_AngleSet : public AngleSet
                        unsigned int face_num,
                        unsigned int fi) override;
 
+  /// Return the stream associated with this angle set.
   crb::Stream& GetStream() { return stream_; }
 
+  /// Return the device pointer to the angle-index table.
   std::uint32_t* GetDeviceAngleIndices() { return device_angle_indices_.get(); }
 
-  std::vector<Task>& GetCurrentTaskList() { return current_task_list_; }
+  /// Check whether the angle set has completed its sweep.
+  bool IsExecuted() const { return executed_; }
+  /// Check whether the angle set has been initialized for the current sweep.
+  bool IsInitialized() const { return boundary_data_initialized_; }
 
-protected:
+private:
   const CBC_SPDS& cbc_spds_;
-  std::vector<Task> current_task_list_;
-  CBC_AsynchronousCommunicator async_comm_;
+  /// Communicator-set metadata for aggregated communicator construction.
+  const MPICommunicatorSet& comm_set_;
+  /// Per-angle-set FLUDS.
+  CBCD_FLUDS& cbcd_fluds_;
+  /// Sweep chunk-owned aggregated communicator.
+  CBCD_AsynchronousCommunicator* async_comm_ = nullptr;
   /// Associated crb::Stream.
   crb::Stream stream_;
   /// Angle indices on GPU.
   crb::DeviceMemory<std::uint32_t> device_angle_indices_;
+  /// Successor offsets indexed by local cell ID.
+  std::vector<std::uint32_t> successor_offsets_;
+  /// Successor local cell IDs stored in CSR order.
+  std::vector<std::uint32_t> successor_data_;
+  /// Initial dependency counts per local cell.
+  std::vector<int> initial_deps_;
+  /// Per-sweep dependency counts per local cell.
+  std::vector<int> remaining_deps_;
+  /// Local cell IDs with zero initial dependencies.
+  std::vector<std::uint32_t> initial_ready_cell_ids_;
+  /// Cached total number of local cells/tasks in task graph.
+  std::size_t num_tasks_ = 0;
+  /// Number of unresolved angleset dependencies at startup.
+  std::size_t num_dependencies_ = 0;
+  /// Atomic count of unresolved angleset dependencies; starts at zero like its sibling counters.
+  std::atomic<std::size_t> dependency_counter_{0};
+  /// Following anglesets that depend on this angleset.
+  std::vector<CBCD_AngleSet*> following_angle_sets_;
+  /// Cached boundary lookup table.
+  std::unordered_map<std::uint64_t, SweepBoundary*> boundary_ptrs_;
+  /// Reflecting boundaries touched by this angleset.
+  std::vector<SweepBoundary*> reflecting_boundaries_;
+  /// Explicit launch/completion state for the current sweep batch.
+  BatchState batch_state_;
+  /// Cached reflecting-boundary producer mask by local cell ID.
+  std::vector<std::uint8_t> cell_has_outgoing_reflecting_boundary_;
+  /// Number of completed local tasks.
+  std::size_t num_completed_tasks_ = 0;
+  /// Initial number of local cells that produce reflecting boundary data.
+  std::size_t initial_reflecting_task_count_ = 0;
+  /// Remaining number of local cells that still need to produce reflecting boundary data.
+  std::size_t pending_reflecting_tasks_ = 0;
+  /// Flag indicating if incoming boundary data has been copied to the device.
+  bool boundary_data_initialized_ = false;
+  /// Flag indicating if following anglesets have been notified of completion.
+  bool following_angle_sets_notified_ = false;
+
+  /// Build the reflecting-boundary producer mask from the CBC task graph.
+  void InitializeReflectingTaskMask();
+
+  /// Flatten the CBC task graph into lookup tables.
+  void InitializeTaskGraphData();
+
+  /// Check whether a cell face is an outgoing reflecting boundary face.
+  bool IsOutgoingReflectingFace(const CellFace& face,
+                                std::uint64_t cell_local_id,
+                                std::size_t face_id) const;
+
+  /// Reset mutable task state for a new sweep.
+  void InitializeTaskState();
+
+  /// Retire the completed kernel batch and update successor dependency state.
+  bool TryRetireCompletedBatch();
+
+  /// Launch the next ready-cell batch when the current stream is idle.
+  bool TryLaunchReadyBatch(CBCDSweepChunk& sweep_chunk);
+
+  /// Pack and send deferred outgoing data for the completed batch.
+  void FlushCompletedBatch(CBCDSweepChunk& sweep_chunk);
+
+  /// Notify following angle sets once all reflecting-boundary producers have completed.
+  void TryNotifyFollowingAngleSets();
 };
 
-} // namespace opensn
\ No newline at end of file
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.cu
new file mode 100644
index 0000000000..6fe07012cc
--- /dev/null
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.cu
@@ -0,0 +1,443 @@
+// SPDX-FileCopyrightText: 2026 The OpenSn Authors <https://open-sn.github.io/opensn/>
+// SPDX-License-Identifier: MIT
+
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/angle_set.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/spds.h"
+#include "framework/mpi/mpi_comm_set.h"
+#include "framework/runtime.h"
+#include "caliper/cali.h"
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <set>
+#include <cstddef>
+
+namespace opensn
+{
+
+namespace detail
+{
+
+// Bounds-asserted cursor over a received byte buffer, used to decode communicator payloads.
+struct BufferReader
+{
+  const std::byte* ptr = nullptr;
+  std::size_t remaining_bytes = 0;
+
+  // Copy one fixed-size value out of the buffer and advance the cursor past it.
+  template <typename Value>
+  Value Load()
+  {
+    assert(remaining_bytes >= sizeof(Value));
+    Value decoded{};
+    std::memcpy(&decoded, ptr, sizeof(Value));
+    ptr += sizeof(Value);
+    remaining_bytes -= sizeof(Value);
+    return decoded;
+  }
+
+  // Decode one std::size_t field.
+  std::size_t LoadSize() { return Load<std::size_t>(); }
+
+  // Decode one 32-bit face index.
+  std::uint32_t LoadFaceIndex() { return Load<std::uint32_t>(); }
+
+  // Advance the cursor by num_bytes without decoding anything.
+  void SkipBytes(const std::size_t num_bytes)
+  {
+    assert(remaining_bytes >= num_bytes);
+    ptr += num_bytes;
+    remaining_bytes -= num_bytes;
+  }
+
+  // Current read position.
+  const std::byte* Data() const noexcept { return ptr; }
+};
+
+} // namespace detail
+
+CBCD_AsynchronousCommunicator::CBCD_AsynchronousCommunicator(
+  const std::vector<AngleSet*>& angle_sets,
+  const MPICommunicatorSet& comm_set,
+  const std::vector<std::vector<int>>& incoming_source_partitions,
+  const std::size_t max_message_bytes,
+  const std::vector<AngleSetCapacity>& capacities)
+  : comm_set_(comm_set),
+    num_angle_sets_(angle_sets.size()),
+    mpi_tag_(static_cast<int>(angle_sets.size())), // tag value is the angle-set count
+    max_message_bytes_(max_message_bytes),
+    angle_set_done_(angle_sets.size())
+{
+  assert(incoming_source_partitions.size() == angle_sets.size());
+  assert(capacities.size() == angle_sets.size());
+
+  std::set<int> sources; // union of location dependencies over all angle sets
+  std::set<int> destinations; // union of location successors over all angle sets
+  std::size_t total_outgoing_faces = 0; // summed over angle sets; sizes each outgoing queue
+  std::size_t max_outgoing_face_values = 0; // largest per-face payload over all angle sets
+
+  for (std::size_t i = 0; i < angle_sets.size(); ++i)
+  {
+    const auto* angle_set = angle_sets[i];
+    const auto& spds = angle_set->GetSPDS();
+    for (const int dep : spds.GetLocationDependencies())
+      sources.insert(dep);
+    for (const int succ : spds.GetLocationSuccessors())
+      destinations.insert(succ);
+
+    total_outgoing_faces += capacities[i].outgoing_faces;
+    max_outgoing_face_values =
+      std::max(max_outgoing_face_values, capacities[i].max_outgoing_face_values);
+    if (capacities[i].incoming_faces > 0)
+    {
+      // Each mailbox slot holds one incoming batch for this angle set. The ring is given
+      // incoming_faces + 1 slots, and every slot's vectors are reserved up front for reuse.
+      auto mailbox = std::make_unique<LockFreeRingBuffer<IncomingFaceBatch>>();
+      mailbox->Preallocate(capacities[i].incoming_faces + 1);
+      mailbox->InitializeSlots(
+        [&](IncomingFaceBatch& batch)
+        {
+          batch.entries.reserve(capacities[i].max_incoming_batch_entries);
+          batch.psi_data.reserve(capacities[i].max_incoming_batch_values);
+          batch.entries.clear();
+          batch.psi_data.clear();
+          batch.source_slot = 0;
+        });
+      incoming_mailboxes_.push_back(std::move(mailbox));
+    }
+    else // no incoming faces: push a mailbox without Preallocate so indices match angle sets
+    {
+      incoming_mailboxes_.push_back(std::make_unique<LockFreeRingBuffer<IncomingFaceBatch>>());
+    }
+  }
+
+  my_rank_ = opensn::mpi_comm.rank();
+  source_partitions_.assign(sources.begin(), sources.end());
+  source_ranks_.reserve(source_partitions_.size());
+  for (const int source_partition : source_partitions_) // keeps source_ranks_ index-aligned
+    source_ranks_.push_back(comm_set_.MapIonJ(source_partition, my_rank_));
+
+  source_partition_to_slot_by_angle_set_.resize(angle_sets.size());
+  for (std::size_t angle_set_id = 0; angle_set_id < angle_sets.size(); ++angle_set_id)
+  {
+    auto& source_to_slot = source_partition_to_slot_by_angle_set_[angle_set_id];
+    const auto& source_partitions = incoming_source_partitions[angle_set_id];
+    source_to_slot.reserve(source_partitions.size()); // slot = position in this list
+    for (std::size_t source_slot = 0; source_slot < source_partitions.size(); ++source_slot)
+      source_to_slot.emplace(source_partitions[source_slot],
+                             static_cast<std::uint32_t>(source_slot));
+  }
+
+  outgoing_queues_.reserve(destinations.size());
+  dest_to_queue_index_.reserve(destinations.size());
+  int queue_index = 0;
+  for (const int dest_rank : destinations)
+  {
+    // Each destination rank gets one bounded queue sized for every outgoing face of every
+    // angle set; slot payload vectors are reserved once for the largest single-face payload.
+    auto queue = std::make_unique<DestinationQueue>();
+    queue->dest_rank = dest_rank;
+    queue->queue = std::make_unique<LockFreeRingBuffer<OutgoingFaceData>>();
+    if (total_outgoing_faces > 0)
+      queue->queue->Preallocate(total_outgoing_faces + 1);
+    queue->queue->InitializeSlots([max_outgoing_face_values](OutgoingFaceData& payload)
+                                  { payload.psi_data.reserve(max_outgoing_face_values); });
+    outgoing_queues_.push_back(std::move(queue));
+    dest_to_queue_index_[dest_rank] = queue_index++;
+  }
+
+  send_batch_by_angle_set_.resize(num_angle_sets_);
+  for (auto& done : angle_set_done_)
+    done.store(false, std::memory_order_relaxed);
+
+  if (max_message_bytes_ > 0) // zero means no message-size limit, so nothing to reserve
+    recv_buffer_.Data().reserve(max_message_bytes_);
+}
+
+CBCD_AsynchronousCommunicator::~CBCD_AsynchronousCommunicator()
+{
+  if (comm_thread_.joinable())
+    Stop(); // joins; the loop only exits once every angle set has signalled completion
+}
+
+void
+CBCD_AsynchronousCommunicator::SignalAngleSetComplete(const std::size_t angle_set_id)
+{
+  assert(angle_set_id < num_angle_sets_);
+  angle_set_done_[angle_set_id].store(true, std::memory_order_release); // read with acquire
+}
+
+void
+CBCD_AsynchronousCommunicator::Start()
+{
+  stop_requested_.store(false, std::memory_order_relaxed); // ordered by the thread launch below
+  for (auto& done : angle_set_done_)
+    done.store(false, std::memory_order_relaxed);
+  in_flight_sends_.clear(); // start with no outstanding sends
+  comm_thread_ = std::thread(&CBCD_AsynchronousCommunicator::CommThreadLoop, this);
+}
+
+void
+CBCD_AsynchronousCommunicator::Stop()
+{
+  stop_requested_.store(true, std::memory_order_release); // loop reads this with acquire
+  if (comm_thread_.joinable())
+    comm_thread_.join(); // returns once the progress loop has exited
+}
+
+void
+CBCD_AsynchronousCommunicator::CommThreadLoop()
+{
+  CALI_CXX_MARK_SCOPE("CBCD_AsynchronousCommunicator::CommThreadLoop");
+
+  // Body of the communication thread. Each iteration advances three phases in order:
+  // sending published outgoing payloads, receiving and routing incoming messages to
+  // angle-set mailboxes, and retiring completed nonblocking sends.
+  while (true)
+  {
+    bool work_done = SerializeAndSend();
+    work_done |= ProbeAndReceive();
+    work_done |= PollInFlightSends();
+
+    if (stop_requested_.load(std::memory_order_acquire) and AllAngleSetsComplete())
+    {
+      SerializeAndSend(); // final drain of anything still published
+      while (not in_flight_sends_.empty()) // wait for every posted send before exiting
+      {
+        PollInFlightSends();
+        if (not in_flight_sends_.empty())
+          std::this_thread::yield();
+      }
+      break;
+    }
+
+    if (not work_done) // idle iteration: give up the time slice
+      std::this_thread::yield();
+  }
+}
+
+bool
+CBCD_AsynchronousCommunicator::SerializeAndSend()
+{
+  CALI_CXX_MARK_SCOPE("CBCD_AsynchronousCommunicator::SerializeAndSend");
+
+  bool sent_any = false;
+
+  for (auto& destination_queue : outgoing_queues_)
+  {
+    // Gather the currently published outgoing face payloads for this destination. Slots are
+    // visited in the order returned by GetReadySlots, but the serialized message groups
+    // them by angle set so the receiver can publish one mailbox payload per angle set.
+    destination_queue->queue->GetReadySlots(slot_cache_);
+    if (slot_cache_.empty())
+      continue;
+
+    std::size_t current_payload_bytes = sizeof(std::size_t); // leading angle-set count
+    std::size_t active_angle_sets = 0;
+    std::size_t slots_processed = 0;
+
+    const auto send_batch = [&]()
+    {
+      // Wire format:
+      // [num_active_angle_sets]
+      //   repeated:
+      //   [angle_set_id][num_entries]
+      //     repeated:
+      //     [remote_face_index][payload_size][payload doubles...]
+      InFlightSend in_flight;
+      in_flight.data.Data().resize(current_payload_bytes);
+      std::size_t offset = 0;
+
+      const auto write_bytes = [&](const void* ptr, const std::size_t size)
+      {
+        std::memcpy(in_flight.data.Data().data() + offset, ptr, size);
+        offset += size;
+      };
+
+      write_bytes(&active_angle_sets, sizeof(std::size_t));
+      for (std::size_t angle_set_id = 0; angle_set_id < num_angle_sets_; ++angle_set_id)
+      {
+        auto& entries = send_batch_by_angle_set_[angle_set_id];
+        if (entries.empty())
+          continue;
+
+        write_bytes(&angle_set_id, sizeof(std::size_t));
+        const auto num_entries = entries.size();
+        write_bytes(&num_entries, sizeof(std::size_t));
+        for (const auto* entry : entries)
+        {
+          write_bytes(&entry->remote_face_index, sizeof(std::uint32_t));
+          const auto data_size = entry->psi_data.size();
+          write_bytes(&data_size, sizeof(std::size_t));
+          write_bytes(entry->psi_data.data(), data_size * sizeof(double));
+        }
+        entries.clear(); // leave the scratch list empty for the next batch
+      }
+
+      const auto& comm = comm_set_.LocICommunicator(destination_queue->dest_rank);
+      const auto mapped_rank =
+        comm_set_.MapIonJ(destination_queue->dest_rank, destination_queue->dest_rank);
+      in_flight.request = comm.isend(mapped_rank, mpi_tag_, in_flight.data.Data());
+      in_flight_sends_.push_back(std::move(in_flight)); // owns the bytes until retired
+    };
+
+    for (std::size_t slot_index = 0; slot_index < slot_cache_.size(); ++slot_index)
+    {
+      const auto* slot = slot_cache_[slot_index];
+      const auto& entry = slot->payload;
+      const auto entry_bytes =
+        sizeof(std::uint32_t) + sizeof(std::size_t) + entry.psi_data.size() * sizeof(double);
+
+      // Best-effort message-size limit: when adding the next entry would exceed it, flush
+      // the current batch first. A batch always holds at least one entry, so a single
+      // entry larger than the limit is still sent on its own.
+      if (max_message_bytes_ > 0 and current_payload_bytes + entry_bytes > max_message_bytes_ and
+          active_angle_sets > 0)
+      {
+        send_batch();
+        destination_queue->queue->FreeSlots(slots_processed); // payloads were copied above
+        current_payload_bytes = sizeof(std::size_t);
+        active_angle_sets = 0;
+        slots_processed = 0;
+      }
+
+      auto& entries = send_batch_by_angle_set_[entry.angle_set_id];
+      if (entries.empty())
+      {
+        ++active_angle_sets;
+        current_payload_bytes += 2 * sizeof(std::size_t); // section header: id + entry count
+      }
+      entries.push_back(&entry);
+      current_payload_bytes += entry_bytes;
+      ++slots_processed;
+    }
+
+    if (active_angle_sets > 0)
+    {
+      send_batch();
+      destination_queue->queue->FreeSlots(slots_processed);
+    }
+
+    sent_any = true; // reached only when this queue had ready slots
+  }
+
+  return sent_any;
+}
+
+bool
+CBCD_AsynchronousCommunicator::ProbeAndReceive()
+{
+  CALI_CXX_MARK_SCOPE("CBCD_AsynchronousCommunicator::ProbeAndReceive");
+
+  bool received_any = false;
+  const auto& recv_comm = comm_set_.LocICommunicator(my_rank_);
+
+  for (std::size_t source_index = 0; source_index < source_ranks_.size(); ++source_index)
+  {
+    const int source_partition = source_partitions_[source_index];
+    const int source_rank = source_ranks_[source_index];
+    mpi::Status status;
+
+    while (recv_comm.iprobe(source_rank, mpi_tag_, status)) // drain all pending messages
+    {
+      received_any = true;
+      const auto num_bytes = status.count<std::byte>();
+      recv_buffer_.Data().resize(static_cast<std::size_t>(num_bytes));
+      recv_comm.recv(source_rank, status.tag(), recv_buffer_.Data().data(), num_bytes);
+
+      detail::BufferReader reader{reinterpret_cast<const std::byte*>(recv_buffer_.Data().data()),
+                                  recv_buffer_.Data().size()};
+
+      // First pass over each angle-set section: find its source slot, entry count, and
+      // total number of doubles, so the mailbox payload can be sized once before the
+      // values are copied.
+      const auto num_active_angle_sets = reader.LoadSize();
+      for (std::size_t as_batch = 0; as_batch < num_active_angle_sets; ++as_batch)
+      {
+        const auto angle_set_id = reader.LoadSize();
+        const auto num_entries = reader.LoadSize();
+        assert(angle_set_id < num_angle_sets_);
+
+        const auto slot_it =
+          source_partition_to_slot_by_angle_set_[angle_set_id].find(source_partition);
+        assert(slot_it != source_partition_to_slot_by_angle_set_[angle_set_id].end());
+        const auto source_slot = slot_it->second;
+
+        const auto* const section_ptr = reader.Data(); // start of this section's entries
+        std::size_t total_values = 0;
+        for (std::size_t entry_index = 0; entry_index < num_entries; ++entry_index)
+        {
+          reader.LoadFaceIndex(); // value is re-read in the second pass
+          const auto data_size = reader.LoadSize();
+          reader.SkipBytes(data_size * sizeof(double));
+          total_values += data_size;
+        }
+        const auto section_num_bytes = static_cast<std::size_t>(reader.Data() - section_ptr);
+
+        auto& slot = incoming_mailboxes_[angle_set_id]->ReserveSlot();
+        auto& batch = slot.payload;
+        batch.source_slot = source_slot;
+        batch.entries.resize(num_entries);
+        batch.psi_data.resize(total_values);
+        detail::BufferReader section_reader{section_ptr, section_num_bytes};
+        std::size_t value_offset = 0;
+        // Second pass: record per-face metadata and copy every face's values into one
+        // contiguous `psi_data` block, with offsets counted in doubles.
+        for (std::size_t entry_index = 0; entry_index < num_entries; ++entry_index)
+        {
+          auto& entry = batch.entries[entry_index];
+          entry.source_face_index = section_reader.LoadFaceIndex();
+          entry.payload_offset = value_offset;
+          entry.payload_size = section_reader.LoadSize();
+          std::memcpy(batch.psi_data.data() + value_offset,
+                      section_reader.Data(),
+                      entry.payload_size * sizeof(double));
+          section_reader.SkipBytes(entry.payload_size * sizeof(double));
+          value_offset += entry.payload_size;
+        }
+
+        incoming_mailboxes_[angle_set_id]->PublishSlot(slot);
+      }
+    }
+  }
+
+  return received_any;
+}
+
+bool
+CBCD_AsynchronousCommunicator::PollInFlightSends()
+{
+  CALI_CXX_MARK_SCOPE("CBCD_AsynchronousCommunicator::PollInFlightSends");
+  // Retire completed sends by moving the back element into the vacated position.
+  bool completed_any = false;
+  for (std::size_t i = 0; i < in_flight_sends_.size();)
+  {
+    if (mpi::test(in_flight_sends_[i].request))
+    {
+      completed_any = true;
+      if (i + 1 < in_flight_sends_.size()) // retiring the back element would be a self-move
+        in_flight_sends_[i] = std::move(in_flight_sends_.back());
+      in_flight_sends_.pop_back();
+    }
+    else
+      ++i; // advance only when nothing new was moved into position i
+  }
+  return completed_any; // true when at least one send was retired
+}
+
+bool
+CBCD_AsynchronousCommunicator::AllAngleSetsComplete() const
+{
+  // Every angle set must have signalled completion ...
+  const bool all_signalled =
+    std::all_of(angle_set_done_.begin(),
+                angle_set_done_.end(),
+                [](const auto& done) { return done.load(std::memory_order_acquire); });
+  // ... and every outgoing queue must be empty (queues are inspected only once all are done).
+  return all_signalled and std::all_of(outgoing_queues_.begin(),
+                                       outgoing_queues_.end(),
+                                       [](const auto& q) { return q->queue->Empty(); });
+}
+
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h
new file mode 100644
index 0000000000..3ce3a7bf4b
--- /dev/null
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h
@@ -0,0 +1,246 @@
+// SPDX-FileCopyrightText: 2026 The OpenSn Authors <https://open-sn.github.io/opensn/>
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "framework/data_types/byte_array.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/lock_free_queues.h"
+#include "mpicpp-lite/mpicpp-lite.h"
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+namespace mpi = mpicpp_lite;
+
+namespace opensn
+{
+
+class AngleSet;
+class MPICommunicatorSet;
+
+/// Metadata for one received non-local face payload inside an incoming batch.
+struct IncomingFaceBatchEntry
+{
+  /// Source-slot-local face index carried on the wire.
+  std::uint32_t source_face_index = 0;
+  /// Offset of this payload within `IncomingFaceBatch::psi_data`, counted in doubles.
+  std::size_t payload_offset = 0;
+  /// Number of doubles in this payload.
+  std::size_t payload_size = 0;
+};
+
+/// One received mailbox payload grouped by sending source slot and angle set.
+struct IncomingFaceBatch
+{
+  /// Source-locality slot for the sending partition.
+  std::uint32_t source_slot = 0;
+  /// Per-face metadata; each entry locates its values inside `psi_data`.
+  std::vector<IncomingFaceBatchEntry> entries;
+  /// Packed received doubles for all faces in the batch, stored back to back.
+  std::vector<double> psi_data;
+};
+
+/// One outgoing non-local face payload published by a sweep worker.
+struct OutgoingFaceData
+{
+  /// ID of the angle set that produced this payload; used to group payloads when sending.
+  std::size_t angle_set_id = 0;
+  /// Receiver-local face index understood by the destination rank.
+  std::uint32_t remote_face_index = 0;
+  /// Packed outgoing doubles for one non-local face.
+  std::vector<double> psi_data;
+};
+
+/// Queue-capacity summary for one angle set, used to size communicator storage up front.
+struct AngleSetCapacity
+{
+  /// Number of outgoing non-local faces produced by this angle set.
+  std::size_t outgoing_faces = 0;
+  /// Number of incoming non-local faces consumed by this angle set.
+  std::size_t incoming_faces = 0;
+  /// Maximum number of doubles in one outgoing face payload.
+  std::size_t max_outgoing_face_values = 0;
+  /// Maximum number of face entries in one received batch.
+  std::size_t max_incoming_batch_entries = 0;
+  /// Maximum number of doubles in one received batch (summed over its face entries).
+  std::size_t max_incoming_batch_values = 0;
+};
+
+/**
+ * Aggregated CBCD communicator with one dedicated progress thread.
+ *
+ * Sweep worker threads publish outgoing non-local face payloads into per-destination MPSC queues.
+ * The communication thread drains those queues, batches payloads by angle set subject to
+ * the configured message-size limit, serializes them into MPI messages, and posts nonblocking
+ * sends. The communication thread also probes for incoming messages, deserializes them into compact
+ * `IncomingFaceBatch` payloads, and publishes those batches into per-angle-set incoming
+ * mailboxes.
+ *
+ * The aggregated communicator assumes the following communication patterns and sweep worker
+ * thread interactions:
+ * - sweep worker threads only write outgoing queue slots,
+ * - the communication thread handles only the draining of outgoing queues and routing of
+ *   incoming batches to angle-set mailboxes,
+ * - each angle-set owner thread only drains its own incoming mailbox.
+ *
+ * Aggregated communicator flow:
+ * 1. A sweep worker publishes one completed non-local face payload into the ring buffer
+ *    associated with the destination rank.
+ * 2. The communication thread gathers ready slots, groups them by angle set, and serializes
+ *    one or more MPI messages subject to the configured byte limit.
+ * 3. The destination rank probes for those messages, maps the sending partition to its local
+ *    source slot, and reconstructs one compact `IncomingFaceBatch` per angle-set section.
+ * 4. The communication thread publishes each reconstructed batch into the mailbox owned by
+ *    that angle set.
+ * 5. The angle-set owner thread drains its mailbox and copies the received face data into
+ *    the corresponding non-local FLUDS storage.
+ */
+class CBCD_AsynchronousCommunicator
+{
+public:
+  /**
+   * Construct the CBCD asynchronous communicator.
+   *
+   * \param angle_sets Angle sets served by the communicator.
+   * \param comm_set MPI communicator set used for point-to-point exchanges.
+   * \param incoming_source_partitions Incoming source partitions grouped by angle set.
+   * \param max_message_bytes Maximum serialized MPI payload size. A value of zero disables
+   * message-size splitting.
+   * \param capacities Queue-capacity summary for each angle set.
+   */
+  CBCD_AsynchronousCommunicator(const std::vector<AngleSet*>& angle_sets,
+                                const MPICommunicatorSet& comm_set,
+                                const std::vector<std::vector<int>>& incoming_source_partitions,
+                                std::size_t max_message_bytes,
+                                const std::vector<AngleSetCapacity>& capacities);
+
+  ~CBCD_AsynchronousCommunicator();
+
+  /**
+   * Publish one outgoing non-local face payload.
+   *
+   * \param dest_rank Destination rank; must have an outgoing queue (checked by assert).
+   * \param angle_set_id Producing angle-set ID.
+   * \param remote_face_index Receiver-local face index.
+   * \param data_size Number of doubles in the payload.
+   * \param fill Callback invoked as `fill(double*)` on a buffer of `data_size` doubles.
+   */
+  template <typename FillCallback>
+  void EnqueueOutgoing(int dest_rank,
+                       std::size_t angle_set_id,
+                       std::uint32_t remote_face_index,
+                       std::size_t data_size,
+                       FillCallback&& fill)
+  {
+    const auto it = dest_to_queue_index_.find(dest_rank);
+    assert(it != dest_to_queue_index_.end());
+    auto& queue = *outgoing_queues_[it->second]->queue;
+    auto& slot = queue.ReserveSlot();
+    slot.payload.angle_set_id = angle_set_id;
+    slot.payload.remote_face_index = remote_face_index;
+    slot.payload.psi_data.resize(data_size);
+    fill(slot.payload.psi_data.data()); // payload is written in place before publication
+    queue.PublishSlot(slot);
+  }
+
+  /**
+   * Drain all currently ready incoming batches for one angle set.
+   *
+   * \param angle_set_id Angle-set ID.
+   * \param callback Callback invoked for each incoming batch payload.
+   * \return `true` if at least one batch was consumed.
+   */
+  template <typename Callback>
+  bool ProcessIncoming(std::size_t angle_set_id, Callback&& callback)
+  {
+    assert(angle_set_id < num_angle_sets_);
+    return incoming_mailboxes_[angle_set_id]->ProcessReady(std::forward<Callback>(callback)) > 0;
+  }
+
+  /// Report whether the mailbox of the specified angle set is currently non-empty.
+  bool HasIncoming(std::size_t angle_set_id) const
+  {
+    assert(angle_set_id < num_angle_sets_);
+    return not incoming_mailboxes_[angle_set_id]->Empty();
+  }
+
+  /// Mark one angle set as locally complete.
+  void SignalAngleSetComplete(std::size_t angle_set_id);
+  /// Reset completion state and launch the communication thread; call only while stopped.
+  void Start();
+  /// Request termination and join the communication thread (blocks until it exits).
+  void Stop();
+
+private:
+  /// Outgoing queue for one destination rank.
+  struct DestinationQueue
+  {
+    /// Destination rank.
+    int dest_rank = 0;
+    /// Outgoing MPSC queue drained by the communication thread.
+    std::unique_ptr<LockFreeRingBuffer<OutgoingFaceData>> queue;
+  };
+
+  /// One in-flight nonblocking MPI send and its owned serialized bytes.
+  struct InFlightSend
+  {
+    /// Nonblocking MPI request.
+    mpi::Request request;
+    /// Owned serialized payload storage; kept until the request tests complete.
+    ByteArray data;
+  };
+
+  /// Run the communication-thread progress loop.
+  void CommThreadLoop();
+  /// Drain outgoing queues, serialize batches, and post MPI sends.
+  bool SerializeAndSend();
+  /// Probe for incoming MPI messages, deserialize them, and publish mailbox batches.
+  bool ProbeAndReceive();
+  /// Retire completed nonblocking sends.
+  bool PollInFlightSends();
+  /// Report whether all angle sets are complete and no local outgoing work remains.
+  bool AllAngleSetsComplete() const;
+
+  /// Communicator set used for all CBCD point-to-point exchanges.
+  const MPICommunicatorSet& comm_set_;
+  /// Number of managed angle sets.
+  std::size_t num_angle_sets_;
+  /// MPI tag shared by all communicator messages in this instance.
+  int mpi_tag_;
+  /// Maximum serialized MPI payload size; zero disables message-size splitting.
+  std::size_t max_message_bytes_;
+  /// Local MPI rank.
+  int my_rank_ = 0;
+  /// Source partitions that can send to this rank.
+  std::vector<int> source_partitions_;
+  /// Source ranks mapped into the local communicator, index-aligned with source_partitions_.
+  std::vector<int> source_ranks_;
+  /// Source-partition to source-slot map grouped by angle set.
+  std::vector<std::unordered_map<int, std::uint32_t>> source_partition_to_slot_by_angle_set_;
+  /// Outgoing destination queues.
+  std::vector<std::unique_ptr<DestinationQueue>> outgoing_queues_;
+  /// Destination-rank to outgoing-queue index map.
+  std::unordered_map<int, int> dest_to_queue_index_;
+  /// Per-angle-set incoming mailboxes.
+  std::vector<std::unique_ptr<LockFreeRingBuffer<IncomingFaceBatch>>> incoming_mailboxes_;
+  /// Per-angle-set transient send batches assembled by the communication thread.
+  std::vector<std::vector<const OutgoingFaceData*>> send_batch_by_angle_set_;
+  /// Reusable receive buffer for one incoming MPI payload.
+  ByteArray recv_buffer_;
+  /// Outstanding nonblocking sends owned by the communication thread.
+  std::vector<InFlightSend> in_flight_sends_;
+  /// Termination flag for the communication thread.
+  std::atomic<bool> stop_requested_{false};
+  /// Per-angle-set local completion flags.
+  std::vector<std::atomic<bool>> angle_set_done_;
+  /// Dedicated communication thread.
+  std::thread comm_thread_;
+  /// Scratch vector used while gathering ready outgoing queue slots.
+  std::vector<LockFreeRingBuffer<OutgoingFaceData>::Slot*> slot_cache_;
+};
+
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/lock_free_queues.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/lock_free_queues.h
new file mode 100644
index 0000000000..337ca574a7
--- /dev/null
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/lock_free_queues.h
@@ -0,0 +1,170 @@
+// SPDX-FileCopyrightText: 2026 The OpenSn Authors <https://open-sn.github.io/opensn/>
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <atomic>
+#include <cstddef>
+#include <new>
+#include <thread>
+#include <vector>
+
+namespace opensn
+{
+
+/**
+ * Bounded multi-producer, single-consumer ring buffer built on atomics.
+ *
+ * Producers take a ticket from the atomic head counter, wait until the consumer has
+ * released the slot the ticket maps to, write the payload in place, and publish it with a
+ * per-slot ready flag. The single consumer drains in FIFO order through the tail index.
+ * The queue is bounded and reuses the slots allocated by Preallocate().
+ *
+ * In the CBCD aggregated communicator, LockFreeRingBuffer serves two roles:
+ * 1. an outgoing per-destination queue written by sweep worker threads and drained by the
+ *    communication thread,
+ * 2. an incoming per-angle-set queue written by the communication thread and drained by the
+ *    owning angleset worker thread.
+ *
+ * LockFreeRingBuffer works under the following assumptions:
+ * - Preallocate() is called with a nonzero capacity before any slot is reserved,
+ * - producers reserve one slot, write the payload in place, and publish it exactly once,
+ * - one consumer thread drains published slots in FIFO order and returns them for reuse.
+ */
+template <typename T>
+class LockFreeRingBuffer
+{
+public:
+  /// Slot payload with a publication flag.
+  struct Slot
+  {
+    /// Stored payload.
+    T payload;
+    /// Publication flag visible to the single consumer.
+    std::atomic<bool> ready{false};
+  };
+
+  /**
+   * Allocate storage for the requested number of slots.
+   *
+   * \param capacity Number of ring-buffer slots.
+   */
+  void Preallocate(const std::size_t capacity) { buffer_ = std::vector<Slot>(capacity); }
+
+  /**
+   * Initialize every slot payload in place.
+   *
+   * \tparam Callback Callable invoked once per slot payload.
+   * \param cb Initialization callback.
+   */
+  template <typename Callback>
+  void InitializeSlots(Callback&& cb)
+  {
+    for (auto& slot : buffer_)
+      cb(slot.payload);
+  }
+
+  /**
+   * Reserve one slot for a producer.
+   *
+   * Yields until the consumer has released the slot's previous use, so a producer that laps
+   * the ring never shares a slot that another producer has reserved but not yet published.
+   *
+   * \return Writable slot reference.
+   */
+  Slot& ReserveSlot()
+  {
+    const auto capacity = buffer_.size();
+    const auto ticket = head_.fetch_add(1, std::memory_order_relaxed);
+    auto& slot = buffer_[ticket % capacity];
+    while (ticket - tail_.load(std::memory_order_acquire) >= capacity)
+      std::this_thread::yield();
+    return slot;
+  }
+
+  /**
+   * Publish one reserved slot to the consumer.
+   *
+   * \param slot Slot to publish.
+   */
+  void PublishSlot(Slot& slot) { slot.ready.store(true, std::memory_order_release); }
+
+  /**
+   * Gather currently ready slots, scanning at most one lap, without consuming them.
+   *
+   * \param out Output vector of ready slot pointers.
+   */
+  void GetReadySlots(std::vector<Slot*>& out)
+  {
+    out.clear();
+    const auto capacity = buffer_.size();
+    const auto tail = tail_.load(std::memory_order_relaxed);
+    // Bounded by capacity: a completely full ring would otherwise be scanned forever.
+    for (std::size_t i = 0; i < capacity; ++i)
+    {
+      auto& slot = buffer_[(tail + i) % capacity];
+      if (not slot.ready.load(std::memory_order_acquire))
+        break;
+      out.push_back(&slot);
+    }
+  }
+
+  /**
+   * Release the next `count` ready slots after they have been consumed.
+   *
+   * \param count Number of slots to free.
+   */
+  void FreeSlots(const std::size_t count)
+  {
+    const auto capacity = buffer_.size();
+    auto tail = tail_.load(std::memory_order_relaxed);
+    for (std::size_t i = 0; i < count; ++i, ++tail)
+      buffer_[tail % capacity].ready.store(false, std::memory_order_relaxed);
+    // A single release store hands every freed slot back to waiting producers.
+    tail_.store(tail, std::memory_order_release);
+  }
+
+  /**
+   * Consume all ready slots in FIFO order.
+   *
+   * \tparam Callback Callable invoked with each slot payload.
+   * \param cb Consumer callback.
+   * \return Number of consumed slots.
+   */
+  template <typename Callback>
+  std::size_t ProcessReady(Callback&& cb)
+  {
+    if (buffer_.empty())
+      return 0;
+    const auto capacity = buffer_.size();
+    std::size_t count = 0;
+    for (auto tail = tail_.load(std::memory_order_relaxed);; ++count)
+    {
+      auto& slot = buffer_[tail % capacity];
+      if (not slot.ready.load(std::memory_order_acquire))
+        break;
+      cb(slot.payload);
+      slot.ready.store(false, std::memory_order_relaxed);
+      tail_.store(++tail, std::memory_order_release);
+    }
+    return count;
+  }
+
+  /// Check whether the queue currently has no published slots.
+  bool Empty() const
+  {
+    const auto capacity = buffer_.size();
+    const auto tail = tail_.load(std::memory_order_acquire);
+    return capacity == 0 or not buffer_[tail % capacity].ready.load(std::memory_order_acquire);
+  }
+
+private:
+  /// Ring-buffer storage.
+  std::vector<Slot> buffer_;
+  /// Producer reservation index.
+  alignas(std::hardware_destructive_interference_size) std::atomic<std::size_t> head_{0};
+  /// Consumer drain index; atomic because producers and Empty() read it.
+  alignas(std::hardware_destructive_interference_size) std::atomic<std::size_t> tail_{0};
+};
+
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.cu
index dde99da1f6..0bf592d5e3 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.cu
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.cu
@@ -3,55 +3,67 @@
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h"
-#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h"
 #include "modules/linear_boltzmann_solvers/lbs_problem/device/carrier/mesh_carrier.h"
+#include "framework/mesh/mesh_continuum/mesh_continuum.h"
 #include "framework/math/unknown_manager/unknown_manager.h"
 #include "framework/math/spatial_discretization/spatial_discretization.h"
 #include "framework/logging/log.h"
 #include "framework/runtime.h"
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <memory>
+#include <unordered_map>
 #include <utility>
+#include "caliper/cali.h"
 
 namespace opensn
 {
 
-CBCD_FLUDS::CBCD_FLUDS(size_t num_groups,
-                       size_t num_angles,
-                       size_t num_local_cells,
+CBCD_FLUDS::CBCD_FLUDS(std::size_t num_groups,
+                       std::size_t num_angles,
+                       std::size_t num_local_cells,
                        const CBCD_FLUDSCommonData& common_data,
                        const UnknownManager& psi_uk_man,
                        const SpatialDiscretization& sdm,
                        bool save_angular_flux)
   : FLUDS(num_groups, num_angles, common_data.GetSPDS()),
     common_data_(common_data),
+    cbc_spds_(static_cast<const CBC_SPDS&>(common_data.GetSPDS())),
     psi_uk_man_(psi_uk_man),
     sdm_(sdm),
-    num_angles_in_gs_quadrature_(psi_uk_man_.GetNumberOfUnknowns()),
-    num_quadrature_local_dofs_(sdm_.GetNumLocalDOFs(psi_uk_man_)),
-    num_local_spatial_dofs_(num_quadrature_local_dofs_ / num_angles_in_gs_quadrature_ /
+    num_local_spatial_dofs_(sdm_.GetNumLocalDOFs(psi_uk_man_) / psi_uk_man_.GetNumberOfUnknowns() /
                             num_groups_),
-    local_psi_data_size_(num_local_spatial_dofs_ * num_groups_and_angles_),
-    incoming_boundary_node_map_(common_data_.GetIncomingBoundaryNodeMap()),
-    cell_to_outgoing_boundary_nodes_(common_data_.GetOutgoingBoundaryNodeMap()),
-    cell_to_incoming_nonlocal_nodes_(common_data_.GetIncomingNonlocalNodeMap()),
-    cell_to_outgoing_nonlocal_nodes_(common_data_.GetOutgoingNonlocalNodeMap()),
-    local_psi_(local_psi_data_size_),
+    local_psi_data_size_(cbc_spds_.GetTotalLocalFaceSlotNodes() * num_groups_and_angles_),
+    saved_psi_data_size_(num_local_spatial_dofs_ * num_groups_and_angles_),
     incoming_boundary_psi_(common_data_.GetNumIncomingBoundaryNodes() * num_groups_and_angles_),
     outgoing_boundary_psi_(common_data_.GetNumOutgoingBoundaryNodes() * num_groups_and_angles_),
     incoming_nonlocal_psi_(common_data_.GetNumIncomingNonlocalNodes() * num_groups_and_angles_),
     outgoing_nonlocal_psi_(common_data_.GetNumOutgoingNonlocalNodes() * num_groups_and_angles_),
-    local_cell_ids_(num_local_cells),
     save_angular_flux_(save_angular_flux)
 {
-  if (save_angular_flux_ and host_saved_psi_.empty())
+  grid_ptr_ = GetSPDS().GetGrid().get();
+  for (auto& local_cell_ids : local_cell_ids_)
+    local_cell_ids.reserve(num_local_cells);
+
+  outgoing_node_memcpy_plan_.reserve(common_data_.GetNumOutgoingNonlocalNodes());
+  for (std::size_t cell_local_id = 0; cell_local_id < common_data_.GetNumLocalCells();
+       ++cell_local_id)
   {
-    host_saved_psi_ = crb::HostVector<double>(local_psi_data_size_);
-    device_saved_psi_ = crb::DeviceMemory<double>(local_psi_data_size_);
+    for (const auto& face_info : common_data_.GetOutgoingNonlocalFaces(cell_local_id))
+    {
+      for (const auto& node : common_data_.GetOutgoingNodeCopies(face_info))
+      {
+        outgoing_node_memcpy_plan_.push_back(
+          {static_cast<std::size_t>(node.storage_index) * num_groups_and_angles_,
+           static_cast<std::size_t>(node.face_node) * num_groups_and_angles_});
+      }
+    }
   }
-  CreatePointerSet();
-  deplocs_outgoing_messages_.reserve(common_data.GetNumIncomingNonlocalFaces());
 }
 
 CBCD_FLUDS::~CBCD_FLUDS()
@@ -60,7 +72,8 @@ CBCD_FLUDS::~CBCD_FLUDS()
   {
     host_saved_psi_.clear();
   }
-  local_cell_ids_.clear();
+  for (auto& local_cell_ids : local_cell_ids_)
+    local_cell_ids.clear();
   incoming_boundary_psi_.clear();
   outgoing_boundary_psi_.clear();
   incoming_nonlocal_psi_.clear();
@@ -70,6 +83,65 @@ CBCD_FLUDS::~CBCD_FLUDS()
 void
 CBCD_FLUDS::AllocateLocalAndSavedPsi()
 {
+  local_psi_ = crb::DeviceMemory<double>(local_psi_data_size_);
+  if (save_angular_flux_ and host_saved_psi_.empty())
+  {
+    host_saved_psi_ = crb::HostVector<double>(saved_psi_data_size_);
+    device_saved_psi_ = crb::DeviceMemory<double>(saved_psi_data_size_);
+  }
+  CreatePointerSet();
+}
+
+void
+CBCD_FLUDS::InitializeReflectingBoundaryNodes(
+  const std::map<std::uint64_t, std::shared_ptr<SweepBoundary>>& boundaries)
+{
+  const auto num_local_cells = common_data_.GetNumLocalCells();
+  reflecting_outgoing_boundary_face_offsets_.assign(num_local_cells + 1, 0);
+  reflecting_boundary_face_plans_.clear();
+  reflecting_boundary_face_plans_.reserve(common_data_.GetNumOutgoingBoundaryNodes());
+  // Each cell gets the [begin, end) range of its plans in the offsets array (one extra entry).
+  for (std::size_t cell_local_id = 0; cell_local_id < num_local_cells; ++cell_local_id)
+  {
+    reflecting_outgoing_boundary_face_offsets_[cell_local_id] =
+      static_cast<std::uint32_t>(reflecting_boundary_face_plans_.size());
+
+    const auto boundary_nodes = common_data_.GetOutgoingBoundaryNodes(cell_local_id);
+    for (std::size_t i = 0; i < boundary_nodes.size();) // i advances inside the body
+    {
+      const auto& first_node = boundary_nodes[i];
+      const auto boundary_it = boundaries.find(first_node.boundary_id);
+      if (boundary_it == boundaries.end() or not boundary_it->second->IsReflecting())
+      {
+        ++i; // unknown or non-reflecting boundary: no plan is recorded for this node
+        continue;
+      }
+      // Grow the run while boundary, cell, and face match and both indices stay consecutive.
+      std::size_t num_nodes = 1;
+      while (i + num_nodes < boundary_nodes.size())
+      {
+        const auto& node = boundary_nodes[i + num_nodes];
+        if (node.boundary_id != first_node.boundary_id or
+            node.cell_local_id != first_node.cell_local_id or node.face_id != first_node.face_id or
+            node.storage_index != first_node.storage_index + num_nodes or
+            node.face_node != first_node.face_node + num_nodes)
+          break;
+        ++num_nodes;
+      }
+      // One plan covers the whole run; the source offset is pre-scaled by the per-node stride.
+      reflecting_boundary_face_plans_.push_back(
+        {boundary_it->second.get(),
+         static_cast<std::uint32_t>(first_node.cell_local_id),
+         first_node.face_id,
+         static_cast<std::uint16_t>(first_node.face_node),
+         static_cast<std::size_t>(first_node.storage_index) * num_groups_and_angles_,
+         static_cast<std::uint16_t>(num_nodes)});
+      i += num_nodes; // continue after the run just recorded
+    }
+
+    reflecting_outgoing_boundary_face_offsets_[cell_local_id + 1] =
+      static_cast<std::uint32_t>(reflecting_boundary_face_plans_.size());
+  }
 }
 
 void
@@ -101,116 +173,101 @@ CBCD_FLUDS::CreatePointerSet()
 void
 CBCD_FLUDS::CopyIncomingBoundaryPsiToDevice(CBCDSweepChunk& sweep_chunk, CBCD_AngleSet* angle_set)
 {
-  const auto& angle_indices = angle_set->GetAngleIndices();
-  const auto& num_angles = angle_indices.size();
+  CALI_CXX_MARK_SCOPE("CBCD_FLUDS::CopyIncomingBoundaryPsiToDevice");
 
-  for (const auto& node : incoming_boundary_node_map_)
-  {
-    for (size_t as_ss_idx = 0; as_ss_idx < num_angles; ++as_ss_idx)
-    {
-      auto direction_num = angle_indices[as_ss_idx];
-      double* dst_psi = incoming_boundary_psi_.data() +
-                        node.storage_index * num_groups_and_angles_ + as_ss_idx * num_groups_;
-      const double* src_psi = angle_set->PsiBoundary(node.boundary_id,
-                                                     direction_num,
-                                                     node.cell_local_id,
-                                                     node.face_id,
-                                                     node.face_node,
-                                                     sweep_chunk.GetGroupsetGroupIndex(),
-                                                     sweep_chunk.IsSurfaceSourceActive());
-      std::copy(src_psi, src_psi + num_groups_, dst_psi);
-    }
-  }
-}
-
-void
-CBCD_FLUDS::CopyIncomingNonlocalPsiToDevice(CBCD_AngleSet* angle_set,
-                                            const std::vector<std::uint32_t>& cell_local_ids)
-{
-  if (cell_to_incoming_nonlocal_nodes_.empty())
-    return;
   const auto& angle_indices = angle_set->GetAngleIndices();
-  const auto& num_angles = angle_indices.size();
-  for (const auto& cell_local_id : cell_local_ids)
+  const auto num_angles = angle_indices.size();
+  const std::size_t groups_bytes = num_groups_ * sizeof(double);
+  const auto gs_gi = sweep_chunk.GetGroupsetGroupIndex();
+  const bool surface_source_active = sweep_chunk.IsSurfaceSourceActive();
+
+  for (const auto& face_plan : common_data_.GetIncomingBoundaryFaces())
   {
-    auto incoming_boundary_it = cell_to_incoming_nonlocal_nodes_.find(cell_local_id);
-    if (incoming_boundary_it == cell_to_incoming_nonlocal_nodes_.end())
-      continue;
-    for (const auto& node : incoming_boundary_it->second)
+    for (std::size_t as_ss_idx = 0; as_ss_idx < num_angles; ++as_ss_idx)
     {
-      for (size_t as_ss_idx = 0; as_ss_idx < num_angles; ++as_ss_idx)
+      const auto direction_num = angle_indices[as_ss_idx];
+      double* dst_face =
+        incoming_boundary_psi_.data() +
+        static_cast<std::size_t>(face_plan.base_storage_index) * num_groups_and_angles_ +
+        as_ss_idx * num_groups_;
+      for (std::size_t node = 0; node < face_plan.num_nodes; ++node)
       {
-        double* dst_psi = incoming_nonlocal_psi_.data() +
-                          node.storage_index * num_groups_and_angles_ + as_ss_idx * num_groups_;
+        double* dst_psi = dst_face + node * num_groups_and_angles_;
         const double* src_psi =
-          NLUpwindPsi(node.cell_global_id, node.face_id, node.face_node_mapped, as_ss_idx);
-        std::copy(src_psi, src_psi + num_groups_, dst_psi);
+          angle_set->PsiBoundary(face_plan.boundary_id,
+                                 direction_num,
+                                 face_plan.cell_local_id,
+                                 face_plan.face_id,
+                                 static_cast<unsigned int>(face_plan.first_face_node + node),
+                                 gs_gi,
+                                 surface_source_active);
+        std::memcpy(dst_psi, src_psi, groups_bytes);
       }
     }
   }
 }
 
 void
-CBCD_FLUDS::CopyOutgoingPsiBackToHost(CBCDSweepChunk& sweep_chunk,
-                                      CBCD_AngleSet* angle_set,
-                                      const std::vector<std::uint32_t>& cell_local_ids)
+CBCD_FLUDS::CopyOutgoingPsiBackToHost(CBCDSweepChunk&,
+                                      CBCD_AsynchronousCommunicator& async_comm,
+                                      const std::size_t angle_set_id,
+                                      const std::vector<std::uint32_t>& angle_indices,
+                                      std::span<const std::uint32_t> cell_local_ids)
 {
-  if (cell_to_outgoing_boundary_nodes_.empty() and cell_to_outgoing_nonlocal_nodes_.empty())
+  if (common_data_.GetNumOutgoingBoundaryNodes() == 0 and
+      common_data_.GetNumOutgoingNonlocalFaces() == 0)
     return;
-  const auto& angle_indices = angle_set->GetAngleIndices();
-  const auto& num_angles = angle_indices.size();
+
+  CALI_CXX_MARK_SCOPE("CBCD_FLUDS::CopyOutgoingPsiBackToHost");
+
+  const auto num_angles = angle_indices.size();
   const auto& grid = *(GetSPDS().GetGrid());
+  const std::size_t groups_bytes = num_groups_ * sizeof(double);
+  const std::size_t stride_bytes = num_groups_and_angles_ * sizeof(double);
   for (const auto& cell_local_id : cell_local_ids)
   {
-    const auto& cell = grid.local_cells[cell_local_id];
-    auto outgoing_boundary_it = cell_to_outgoing_boundary_nodes_.find(cell_local_id);
-    if (outgoing_boundary_it != cell_to_outgoing_boundary_nodes_.end())
-      for (const auto& node : outgoing_boundary_it->second)
+    const auto reflecting_faces = GetReflectingOutgoingBoundaryFaces(cell_local_id);
+    for (const auto& face_plan : reflecting_faces)
+    {
+      for (std::size_t as_ss_idx = 0; as_ss_idx < num_angles; ++as_ss_idx)
       {
-        const auto& face = cell.faces[node.face_id];
-        if (angle_set->GetBoundaries().at(face.neighbor_id)->IsReflecting())
+        const auto direction_num = static_cast<unsigned int>(angle_indices[as_ss_idx]);
+        const double* src_face =
+          outgoing_boundary_psi_.data() + face_plan.src_base_offset + as_ss_idx * num_groups_;
+        for (std::size_t n = 0; n < face_plan.num_nodes; ++n)
         {
-          for (size_t as_ss_idx = 0; as_ss_idx < num_angles; ++as_ss_idx)
-          {
-            auto direction_num = angle_indices[as_ss_idx];
-            double* dst_psi = angle_set->PsiReflected(
-              face.neighbor_id, direction_num, node.cell_local_id, node.face_id, node.face_node);
-            const double* src_psi = outgoing_boundary_psi_.data() +
-                                    node.storage_index * num_groups_and_angles_ +
-                                    as_ss_idx * num_groups_;
-            std::copy(src_psi, src_psi + num_groups_, dst_psi);
-          }
+          double* dst = face_plan.boundary->PsiOutgoing(
+            face_plan.cell_local_id,
+            face_plan.face_id,
+            static_cast<unsigned int>(face_plan.first_face_node + n),
+            direction_num);
+          std::memcpy(dst, src_face + n * num_groups_and_angles_, groups_bytes);
         }
       }
-    auto outgoing_nonlocal_it = cell_to_outgoing_nonlocal_nodes_.find(cell_local_id);
-    if (outgoing_nonlocal_it != cell_to_outgoing_nonlocal_nodes_.end())
-      for (const auto& node : outgoing_nonlocal_it->second)
-      {
-        const auto& face = cell.faces[node.face_id];
-        const auto& cell_mapping = sdm_.GetCellMapping(cell);
-        const auto& face_nodal_mapping =
-          common_data_.GetFaceNodalMapping(node.cell_local_id, node.face_id);
-        const auto& num_face_nodes = cell_mapping.GetNumFaceNodes(node.face_id);
-        const auto& face_data_size = num_face_nodes * num_groups_and_angles_;
-        const int locality =
-          sweep_chunk.GetCellTransportView(node.cell_local_id).FaceLocality(node.face_id);
-        auto& async_comm =
-          static_cast<CBC_AsynchronousCommunicator&>(*angle_set->GetCommunicator());
-        std::vector<double>* psi_nonlocal_outgoing =
-          &async_comm.InitGetDownwindMessageData(locality,
-                                                 face.neighbor_id,
-                                                 face_nodal_mapping.associated_face_,
-                                                 angle_set->GetID(),
-                                                 face_data_size);
-        for (size_t as_ss_idx = 0; as_ss_idx < num_angles; ++as_ss_idx)
+    }
+
+    for (const auto& face_info : common_data_.GetOutgoingNonlocalFaces(cell_local_id))
+    {
+      const std::size_t face_data_size =
+        static_cast<std::size_t>(face_info.num_face_nodes) * num_groups_and_angles_;
+      const int dest_rank = common_data_.GetOutgoingLocalities()[face_info.dest_slot];
+      async_comm.EnqueueOutgoing(
+        dest_rank,
+        angle_set_id,
+        face_info.remote_face_index,
+        face_data_size,
+        [this, &face_info, stride_bytes](double* dst_base)
         {
-          auto* dst_psi = NLOutgoingPsi(psi_nonlocal_outgoing, node.face_node, as_ss_idx);
-          const double* src_psi = outgoing_nonlocal_psi_.data() +
-                                  node.storage_index * num_groups_and_angles_ +
-                                  as_ss_idx * num_groups_;
-          std::copy(src_psi, src_psi + num_groups_, dst_psi);
-        }
-      }
+          const auto* node_plan = outgoing_node_memcpy_plan_.data() + face_info.node_copy_offset;
+          const auto* node_plan_end = node_plan + face_info.num_node_copies;
+          for (; node_plan != node_plan_end; ++node_plan)
+          {
+            auto* dst = dst_base + node_plan->dest_offset;
+            const double* src = outgoing_nonlocal_psi_.data() + node_plan->src_offset;
+            std::memcpy(dst, src, stride_bytes);
+          }
+        });
+    }
   }
 }
 
@@ -219,6 +276,7 @@ CBCD_FLUDS::CopySavedPsiFromDevice()
 {
   if (not save_angular_flux_)
     return;
+  CALI_CXX_MARK_SCOPE("CBCD_FLUDS::CopySavedPsiFromDevice");
   crb::copy(host_saved_psi_, device_saved_psi_, host_saved_psi_.size(), 0, 0, stream_);
 }
 
@@ -227,9 +285,13 @@ CBCD_FLUDS::CopySavedPsiToDestinationPsi(CBCDSweepChunk& sweep_chunk, CBCD_Angle
 {
   if (not save_angular_flux_)
     return;
+
+  CALI_CXX_MARK_SCOPE("CBCD_FLUDS::CopySavedPsiToDestinationPsi");
+
+  stream_.synchronize();
+
   DiscreteOrdinatesProblem& problem = sweep_chunk.GetProblem();
   auto* mesh = problem.GetMeshCarrier();
-  auto grid = problem.GetGrid();
   auto& groupset = sweep_chunk.GetGroupset();
   auto& destination_psi = problem.GetPsiNewLocal()[groupset.id];
   const auto& discretization = problem.GetSpatialDiscretization();
@@ -237,7 +299,8 @@ CBCD_FLUDS::CopySavedPsiToDestinationPsi(CBCDSweepChunk& sweep_chunk, CBCD_Angle
     groupset.psi_uk_man_.GetNumberOfUnknowns() * groupset.GetNumGroups();
   const auto& angle_indices = angle_set->GetAngleIndices();
   const auto& num_angles = angle_set->GetNumAngles();
-  for (const auto& cell : grid->local_cells)
+  const std::size_t groups_bytes = num_groups_ * sizeof(double);
+  for (const auto& cell : grid_ptr_->local_cells)
   {
     double* dst_psi = &destination_psi[discretization.MapDOFLocal(cell, 0, psi_uk_man_, 0, 0)];
     double* src_psi =
@@ -250,7 +313,7 @@ CBCD_FLUDS::CopySavedPsiToDestinationPsi(CBCDSweepChunk& sweep_chunk, CBCD_Angle
         auto direction_num = angle_indices[as_ss_idx];
         double* dst = dst_psi + direction_num * num_groups_;
         double* src = src_psi + as_ss_idx * num_groups_;
-        std::copy(src, src + num_groups_, dst);
+        std::memcpy(dst, src, groups_bytes);
       }
       dst_psi += groupset_angle_group_stride;
       src_psi += num_groups_and_angles_;
@@ -258,32 +321,23 @@ CBCD_FLUDS::CopySavedPsiToDestinationPsi(CBCDSweepChunk& sweep_chunk, CBCD_Angle
   }
 }
 
-double*
-CBCD_FLUDS::NLUpwindPsi(uint64_t cell_global_id,
-                        unsigned int face_id,
-                        unsigned int face_node_mapped,
-                        size_t as_ss_idx)
+std::uint32_t
+CBCD_FLUDS::ScatterReceivedFaceData(const std::uint32_t source_slot,
+                                    const std::uint32_t source_face_index,
+                                    const double* psi_data)
 {
-  auto it = deplocs_outgoing_messages_.find({cell_global_id, face_id});
-  if (it == deplocs_outgoing_messages_.end())
-    return nullptr;
-  auto& psi = it->second;
-  const size_t dof_map =
-    face_node_mapped * num_groups_and_angles_ + //  Offset to start of data for face_node_mapped
-    as_ss_idx * num_groups_;                    // Offset to start of data for angle_set_index
-
-  assert(dof_map < psi.size());
-  return &psi[dof_map];
+  const auto& face_info = common_data_.GetIncomingNonlocalFace(source_slot, source_face_index);
+  double* dst = incoming_nonlocal_psi_.data() +
+                static_cast<std::size_t>(face_info.base_storage_index) * num_groups_and_angles_;
+  const std::size_t face_values =
+    static_cast<std::size_t>(face_info.num_nodes) * num_groups_and_angles_;
+  std::memcpy(dst, psi_data, face_values * sizeof(double));
+  return face_info.cell_local_id;
 }
 
-double*
-CBCD_FLUDS::NLOutgoingPsi(std::vector<double>* psi_nonlocal_outgoing,
-                          size_t face_node,
-                          size_t as_ss_idx)
+void
+CBCD_FLUDS::ClearLocalAndReceivePsi()
 {
-  assert(psi_nonlocal_outgoing != nullptr);
-  const size_t addr_offset = face_node * num_groups_and_angles_ + as_ss_idx * num_groups_;
-  return &(*psi_nonlocal_outgoing)[addr_offset];
 }
 
 } // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h
index f466af2052..9577ae6d29 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h
@@ -8,28 +8,48 @@
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds.h"
 #include "modules/linear_boltzmann_solvers/lbs_problem/device/storage.h"
 #include "caribou/main.hpp"
+#include <array>
 #include <cstddef>
-#include <functional>
-#include <unordered_map>
+#include <span>
 
 namespace crb = caribou;
 
 namespace opensn
 {
 
+class CBC_SPDS;
 class CBCD_AngleSet;
+class CBCD_AsynchronousCommunicator;
+class CBCDSweepChunk;
 class UnknownManager;
 class SpatialDiscretization;
-class Cell;
-class CBCDSweepChunk;
-
-/// CBC FLUDS for device.
+class SweepBoundary;
+class MeshContinuum;
+
+/**
+ * CBCD FLUDS for managing boundary, local, and non-local psi buffers during sweeps.
+ *
+ * Owns the device and mapped-host angular-flux buffers used by one CBCD angle set.
+ * Local face data is stored in a compact slot bank sized from the static CBC slot
+ * assignment.
+ */
 class CBCD_FLUDS : public FLUDS
 {
 public:
-  CBCD_FLUDS(size_t num_groups,
-             size_t num_angles,
-             size_t num_local_cells,
+  /**
+   * Construct the CBCD FLUDS for one angle set.
+   *
+   * \param num_groups Number of groups in angleset's groupset.
+   * \param num_angles Number of angles in the angleset.
+   * \param num_local_cells Number of local cells assigned to the angle set.
+   * \param common_data Shared CBCD FLUDS metadata.
+   * \param psi_uk_man Unknown manager for angular flux storage.
+   * \param sdm Spatial discretization.
+   * \param save_angular_flux Save angular fluxes when true.
+   */
+  CBCD_FLUDS(std::size_t num_groups,
+             std::size_t num_angles,
+             std::size_t num_local_cells,
              const CBCD_FLUDSCommonData& common_data,
              const UnknownManager& psi_uk_man,
              const SpatialDiscretization& sdm,
@@ -37,54 +57,91 @@ class CBCD_FLUDS : public FLUDS
 
   ~CBCD_FLUDS();
 
-  /// Get reference to the common data.
+  /// Return the shared CBCD FLUDS metadata.
   const CBCD_FLUDSCommonData& GetCommonData() const { return common_data_; }
 
-  /// Get reference to stream.
+  /// Return the stream associated with this angle set.
   crb::Stream& GetStream() { return stream_; }
 
+  /// Bytes in the local psi backing buffer for this FLUDS instance.
+  std::size_t GetLocalPsiBytes() const noexcept { return local_psi_data_size_ * sizeof(double); }
+
   /// Allocate buffers asynchronously on the associated stream.
   void AllocateLocalAndSavedPsi();
 
+  /**
+   * Build reflecting-boundary copy plans for this angle set.
+   *
+   * \param boundaries Sweep-boundary table indexed by boundary ID.
+   */
+  void InitializeReflectingBoundaryNodes(
+    const std::map<std::uint64_t, std::shared_ptr<SweepBoundary>>& boundaries);
+
   /// Get the stride size for each face node's angular flux data.
   inline std::size_t GetStrideSize() const { return num_groups_and_angles_; }
 
-  /// Get vector of local cells to be swept.
-  crb::MappedHostVector<std::uint32_t>& GetLocalCellIDs() { return local_cell_ids_; }
+  /// Return one mapped host vector of local cells used by the CBCD launch pipeline.
+  crb::MappedHostVector<std::uint32_t>& GetLocalCellIDs(const std::size_t buffer_index)
+  {
+    return local_cell_ids_[buffer_index];
+  }
 
-  /// Get saved angular flux device pointer.
+  /// Return the device pointer to the saved angular flux buffer.
   double* GetSavedAngularFluxDevicePointer() { return device_saved_psi_.get(); }
 
-  /// Copy saved psi from device to host.
+  /// Copy saved angular fluxes from the device staging buffer to the host staging buffer.
   void CopySavedPsiFromDevice();
 
-  /// Copy saved psi from host to destination psi host buffer.
+  /**
+   * Copy saved angular fluxes into the destination psi vector.
+   *
+   * \param sweep_chunk Owning CBCD sweep chunk.
+   * \param angle_set Angle set owning these saved angular fluxes.
+   */
   void CopySavedPsiToDestinationPsi(CBCDSweepChunk& sweep_chunk, CBCD_AngleSet* angle_set);
 
-  /// Gets pointer set to device angular flux data.
+  /// Return the device pointer set used by the CBCD sweep kernel.
   CBCD_FLUDSPointerSet& GetDevicePointerSet() { return pointer_set_; }
 
-  /// Copies incoming boundary psi from host to device.
+  /**
+   * Copy incoming boundary angular flux data from the host buffers to the device buffers.
+   *
+   * \param sweep_chunk Owning CBCD sweep chunk.
+   * \param angle_set Angle set supplying boundary angular flux values.
+   */
   void CopyIncomingBoundaryPsiToDevice(CBCDSweepChunk& sweep_chunk, CBCD_AngleSet* angle_set);
 
-  /// Copies incoming non-local psi from host to device.
-  void CopyIncomingNonlocalPsiToDevice(CBCD_AngleSet* angle_set,
-                                       const std::vector<std::uint32_t>& cell_local_ids);
-
-  /// Copy outgoing psi on host after D2H copy is done.
+  /**
+   * Copy completed outgoing angular flux data into host-visible destinations.
+   *
+   * Reflecting boundary data is written back to the owning boundary objects. Outgoing
+   * non-local face data is enqueued directly into the aggregated communicator.
+   *
+   * \param sweep_chunk Owning CBCD sweep chunk.
+   * \param async_comm Aggregated communicator used to enqueue non-local face payloads.
+   * \param angle_set_id Producing angle-set ID.
+   * \param angle_indices Global angle indices carried by this angle set.
+   * \param cell_local_ids Local cells in the just-completed batch.
+   */
   void CopyOutgoingPsiBackToHost(CBCDSweepChunk& sweep_chunk,
-                                 CBCD_AngleSet* angle_set,
-                                 const std::vector<std::uint32_t>& cell_local_ids);
-
-  double* NLUpwindPsi(uint64_t cell_global_id,
-                      unsigned int face_id,
-                      unsigned int face_node_mapped,
-                      size_t as_ss_idx);
-
-  double*
-  NLOutgoingPsi(std::vector<double>* psi_nonlocal_outgoing, size_t face_node, size_t as_ss_idx);
-
-  void ClearLocalAndReceivePsi() override { deplocs_outgoing_messages_.clear(); }
+                                 CBCD_AsynchronousCommunicator& async_comm,
+                                 std::size_t angle_set_id,
+                                 const std::vector<std::uint32_t>& angle_indices,
+                                 std::span<const std::uint32_t> cell_local_ids);
+
+  /**
+   * Scatter one received non-local face payload into the mapped incoming buffer.
+   *
+   * \param source_slot Source-locality slot for the sending partition.
+   * \param source_face_index Source-slot-local face index carried on the wire.
+   * \param psi_data Packed payload doubles.
+   * \return Local cell ID whose dependency count should be updated.
+   */
+  std::uint32_t ScatterReceivedFaceData(std::uint32_t source_slot,
+                                        std::uint32_t source_face_index,
+                                        const double* psi_data);
+
+  void ClearLocalAndReceivePsi() override;
   void ClearSendPsi() override {}
   void AllocateInternalLocalPsi() override {}
   void AllocateOutgoingPsi() override {}
@@ -93,23 +150,31 @@ class CBCD_FLUDS : public FLUDS
   void AllocatePrelocIOutgoingPsi() override {}
   void AllocateDelayedPrelocIOutgoingPsi() override {}
 
+  std::span<const ReflectingBoundaryFacePlan>
+  GetReflectingOutgoingBoundaryFaces(const std::uint64_t cell_local_id) const
+  {
+    const auto begin = reflecting_outgoing_boundary_face_offsets_[cell_local_id];
+    const auto end = reflecting_outgoing_boundary_face_offsets_[cell_local_id + 1];
+    return {reflecting_boundary_face_plans_.data() + begin, end - begin};
+  }
+
 private:
   /// Reference to the common data.
   const CBCD_FLUDSCommonData& common_data_;
+  /// CBC sweep plane data structure for this angle set.
+  const CBC_SPDS& cbc_spds_;
+  /// Unknown manager for angular flux storage.
   const UnknownManager& psi_uk_man_;
+  /// Spatial discretization used for saved-psi layout.
   const SpatialDiscretization& sdm_;
-  size_t num_angles_in_gs_quadrature_;
-  size_t num_quadrature_local_dofs_;
-  size_t num_local_spatial_dofs_;
-  size_t local_psi_data_size_;
-  /// Map from incoming face boundary node to indexing metadata
-  std::vector<BoundaryNodeInfo> incoming_boundary_node_map_;
-  /// Map from cell to outgoing boundary node indexing metadata.
-  std::map<std::uint64_t, std::vector<BoundaryNodeInfo>> cell_to_outgoing_boundary_nodes_;
-  /// Map from cell to incoming nonlocal nodes indexing metadata.
-  std::map<std::uint64_t, std::vector<NonlocalNodeInfo>> cell_to_incoming_nonlocal_nodes_;
-  /// Map from cell to outgoing nonlocal node indexing metadata.
-  std::map<std::uint64_t, std::vector<NonlocalNodeInfo>> cell_to_outgoing_nonlocal_nodes_;
+  /// Number of local spatial degrees of freedom.
+  std::size_t num_local_spatial_dofs_;
+  /// Number of doubles in the local psi backing buffer.
+  std::size_t local_psi_data_size_;
+  /// Number of doubles in the saved angular-flux buffer.
+  std::size_t saved_psi_data_size_;
+  /// Owning grid pointer for cell-view access.
+  const MeshContinuum* grid_ptr_ = nullptr;
   /// Mapped host vectors for boundary and non-local angular fluxes.
   crb::MappedHostVector<double> incoming_boundary_psi_;
   crb::MappedHostVector<double> outgoing_boundary_psi_;
@@ -117,20 +182,26 @@ class CBCD_FLUDS : public FLUDS
   crb::MappedHostVector<double> outgoing_nonlocal_psi_;
   /// Associated angleset's stream.
   crb::Stream stream_;
-  crb::MappedHostVector<std::uint32_t> local_cell_ids_;
+  /// Mapped host launch buffers that hold ready local cell IDs.
+  std::array<crb::MappedHostVector<std::uint32_t>, 3> local_cell_ids_;
+  /// Flag indicating whether angular fluxes are saved after the sweep.
   bool save_angular_flux_;
   /// Device storage for local angular fluxes.
   crb::DeviceMemory<double> local_psi_;
   /// Host and device buffers for saved angular fluxes.
   crb::DeviceMemory<double> device_saved_psi_;
   crb::HostVector<double> host_saved_psi_;
-  /// Pointer set to device angular flux data
+  /// Pointer set used by the CBCD sweep kernel.
   CBCD_FLUDSPointerSet pointer_set_;
-
-  /// Creates device pointer set to the local, boundary, and non-local angular flux buffers.
+  /// Cell-to-reflecting-face offset table.
+  std::vector<std::uint32_t> reflecting_outgoing_boundary_face_offsets_;
+  /// Flat reflecting-boundary face plans.
+  std::vector<ReflectingBoundaryFacePlan> reflecting_boundary_face_plans_;
+  /// Flat byte-level memcpy descriptors referenced by outgoing faces.
+  std::vector<OutgoingNodeMemcpy> outgoing_node_memcpy_plan_;
+
+  /// Build the device pointer set exposed to the CBCD sweep kernel.
   void CreatePointerSet();
-
-  std::vector<std::vector<double>> boundaryI_incoming_psi_;
 };
 
-} // namespace opensn
\ No newline at end of file
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cc
index 411bcebd7f..c160dc9f20 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cc
@@ -2,6 +2,9 @@
 // SPDX-License-Identifier: MIT
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h"
+#include "framework/utils/error.h"
+#include <cassert>
+#include <algorithm>
 
 namespace opensn
 {
@@ -19,11 +22,7 @@ CBCD_FLUDSCommonData::CBCD_FLUDSCommonData(
     num_incoming_nonlocal_nodes_(0),
     num_outgoing_nonlocal_faces_(0),
     num_outgoing_nonlocal_nodes_(0),
-    device_cell_face_node_map_(nullptr),
-    incoming_boundary_node_map_(),
-    cell_to_outgoing_boundary_nodes_(),
-    cell_to_incoming_nonlocal_nodes_(),
-    cell_to_outgoing_nonlocal_nodes_()
+    device_cell_face_node_map_(nullptr)
 {
   CopyFlattenedNodeIndexToDevice(sdm);
 }
@@ -45,4 +44,13 @@ CBCD_FLUDSCommonData::DeallocateDeviceMemory()
 }
 #endif
 
-} // namespace opensn
\ No newline at end of file
+const GroupedIncomingNonlocalFace&
+CBCD_FLUDSCommonData::GetIncomingNonlocalFace(const std::uint32_t source_slot,
+                                              const std::uint32_t source_face_index) const
+{
+  const auto begin = source_to_incoming_face_offsets_[source_slot];
+  assert(begin + source_face_index < source_to_incoming_face_offsets_[source_slot + 1]);
+  return incoming_nonlocal_faces_[incoming_face_indices_by_source_[begin + source_face_index]];
+}
+
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cu
index 98d2294a72..4b8869a19b 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cu
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cu
@@ -2,11 +2,15 @@
 // SPDX-License-Identifier: MIT
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/spds.h"
 #include "framework/math/spatial_discretization/spatial_discretization.h"
 #include "framework/mesh/mesh_continuum/mesh_continuum.h"
 #include "caribou/main.hpp"
-#include <cinttypes>
+#include <algorithm>
+#include <cstring>
+#include <tuple>
+#include <unordered_map>
 
 namespace crb = caribou;
 
@@ -17,18 +21,16 @@ void
 CBCD_FLUDSCommonData::CopyFlattenedNodeIndexToDevice(const SpatialDiscretization& sdm)
 {
   const MeshContinuum& grid = *(spds_.GetGrid());
+  const auto& cbc_spds = static_cast<const CBC_SPDS&>(spds_);
   const size_t num_local_cells = grid.local_cells.size();
+  const auto& face_orientations = spds_.GetCellFaceOrientations();
+  const auto local_face_slot_ids = cbc_spds.GetLocalFaceSlotIDs();
+  const auto local_face_slot_node_offsets = cbc_spds.GetLocalFaceSlotNodeOffsets();
   std::uint64_t total_face_nodes = 0;
   for (const auto& cell : grid.local_cells)
     for (std::uint32_t f = 0; f < cell.faces.size(); ++f)
       total_face_nodes += sdm.GetCellMapping(cell).GetNumFaceNodes(f);
-  std::vector<size_t> cell_spatial_dof_offsets(num_local_cells);
-  size_t current_dof_offset = 0;
-  for (const auto& cell : grid.local_cells)
-  {
-    cell_spatial_dof_offsets[cell.local_id] = current_dof_offset;
-    current_dof_offset += sdm.GetCellMapping(cell).GetNumNodes();
-  }
+
   const size_t offsets_size = 2 * num_local_cells;
   const size_t total_size = offsets_size + total_face_nodes;
   std::vector<std::uint64_t> local_map(total_size);
@@ -36,15 +38,58 @@ CBCD_FLUDSCommonData::CopyFlattenedNodeIndexToDevice(const SpatialDiscretization
   std::uint64_t* indices_ptr = local_map.data() + offsets_size;
   std::uint64_t current_index_offset = offsets_size;
   std::uint64_t local_indices_filled = 0;
-  // Iterate over cells to fill the map and populate metadata structures
+
+  cell_to_outgoing_boundary_node_offsets_.assign(num_local_cells + 1, 0);
+  cell_to_incoming_nonlocal_face_offsets_.assign(num_local_cells + 1, 0);
+  cell_to_outgoing_nonlocal_face_offsets_.assign(num_local_cells + 1, 0);
+
+  std::unordered_map<int, std::uint32_t> locality_to_dest_slot;
+  std::unordered_map<int, std::uint32_t> source_partition_to_slot;
+  outgoing_localities_.reserve(num_local_cells);
+  incoming_source_partitions_.reserve(num_local_cells);
+  outgoing_boundary_nodes_.reserve(total_face_nodes);
+  outgoing_nonlocal_face_node_copies_.reserve(total_face_nodes);
+  struct OrderedIncomingFaceBuild
+  {
+    std::uint32_t source_slot = 0;
+    std::uint64_t cell_global_id = 0;
+    unsigned int face_id = 0;
+    std::uint32_t face_index = 0;
+  };
+  struct OrderedOutgoingFaceBuild
+  {
+    std::uint32_t dest_slot = 0;
+    std::uint64_t cell_global_id = 0;
+    unsigned int face_id = 0;
+    std::uint32_t face_index = 0;
+  };
+  std::vector<OrderedIncomingFaceBuild> incoming_face_order;
+  std::vector<OrderedOutgoingFaceBuild> outgoing_face_order;
+  incoming_face_order.reserve(total_face_nodes);
+  outgoing_face_order.reserve(total_face_nodes);
+
+  const auto update_cell_offsets = [this](const std::uint64_t cell_local_id)
+  {
+    cell_to_outgoing_boundary_node_offsets_[cell_local_id] =
+      static_cast<std::uint32_t>(outgoing_boundary_nodes_.size());
+    cell_to_incoming_nonlocal_face_offsets_[cell_local_id] =
+      static_cast<std::uint32_t>(incoming_nonlocal_faces_.size());
+    cell_to_outgoing_nonlocal_face_offsets_[cell_local_id] =
+      static_cast<std::uint32_t>(outgoing_nonlocal_faces_.size());
+  };
+
   for (const auto& cell : grid.local_cells)
   {
+    update_cell_offsets(cell.local_id);
+
     cell_offsets_ptr[2 * cell.local_id] = current_index_offset;
     std::uint64_t num_cell_nodes = 0;
+    std::vector<int> incoming_face_to_grouped_index(cell.faces.size(), -1);
+    std::vector<int> outgoing_face_to_grouped_index(cell.faces.size(), -1);
     for (size_t f = 0; f < cell.faces.size(); ++f)
     {
       const CellFace& face = cell.faces[f];
-      const FaceOrientation& orientation = spds_.GetCellFaceOrientations()[cell.local_id][f];
+      const FaceOrientation& orientation = face_orientations[cell.local_id][f];
       const FaceNodalMapping& face_nodal_mapping = grid_nodal_mappings_[cell.local_id][f];
       const size_t num_face_nodes = sdm.GetCellMapping(cell).GetNumFaceNodes(f);
       const bool is_outgoing_face = (orientation == FaceOrientation::OUTGOING);
@@ -52,14 +97,6 @@ CBCD_FLUDSCommonData::CopyFlattenedNodeIndexToDevice(const SpatialDiscretization
       const bool is_local_face = face.IsNeighborLocal(&grid);
       const bool is_boundary_face = not face.has_neighbor;
 
-      if ((not is_local_face) and (not is_boundary_face))
-      {
-        if (is_incoming_face)
-          ++num_incoming_nonlocal_faces_;
-        else if (is_outgoing_face)
-          ++num_outgoing_nonlocal_faces_;
-      }
-
       for (size_t fn = 0; fn < num_face_nodes; ++fn)
       {
         CBCD_NodeIndex node_index;
@@ -68,33 +105,63 @@ CBCD_FLUDSCommonData::CopyFlattenedNodeIndexToDevice(const SpatialDiscretization
         {
           if (is_local_face)
           {
-            std::uint32_t nbr_local_idx = face.GetNeighborLocalID(&grid);
-            std::uint32_t adj_cell_node = face_nodal_mapping.cell_node_mapping_[fn];
-            const std::uint64_t index = cell_spatial_dof_offsets[nbr_local_idx] + adj_cell_node;
-            node_index = CBCD_NodeIndex(index, is_outgoing_face, is_local_face);
+            const auto task_id = cbc_spds.GetIncomingLocalFaceTaskID(
+              static_cast<std::uint32_t>(cell.local_id), static_cast<unsigned int>(f));
+            const auto slot_id = local_face_slot_ids[task_id];
+            const auto local_face_node =
+              static_cast<std::uint64_t>(face_nodal_mapping.face_node_mapping_[fn]);
+            node_index = CBCD_NodeIndex(
+              static_cast<std::uint64_t>(local_face_slot_node_offsets[slot_id]) + local_face_node,
+              is_outgoing_face,
+              true);
           }
           else if (not is_boundary_face)
           {
             node_index =
               CBCD_NodeIndex(num_incoming_nonlocal_nodes_, is_outgoing_face, is_local_face);
-            cell_to_incoming_nonlocal_nodes_[cell.local_id].emplace_back(
-              NonlocalNodeInfo{cell.local_id,
-                               cell.global_id,
-                               static_cast<unsigned int>(f),
-                               fn,
-                               face_nodal_mapping.face_node_mapping_[fn],
-                               static_cast<std::uint64_t>(num_incoming_nonlocal_nodes_)});
+            int& grouped_face_index = incoming_face_to_grouped_index[f];
+            if (grouped_face_index < 0)
+            {
+              grouped_face_index =
+                static_cast<int>(incoming_nonlocal_faces_.size() -
+                                 cell_to_incoming_nonlocal_face_offsets_[cell.local_id]);
+              auto& grouped_face = incoming_nonlocal_faces_.emplace_back();
+              const int source_partition = grid.cells[face.neighbor_id].partition_id;
+              auto [source_it, inserted] = source_partition_to_slot.try_emplace(
+                source_partition, static_cast<std::uint32_t>(incoming_source_partitions_.size()));
+              if (inserted)
+                incoming_source_partitions_.push_back(source_partition);
+              grouped_face.cell_local_id = static_cast<std::uint32_t>(cell.local_id);
+              grouped_face.base_storage_index =
+                static_cast<std::uint32_t>(num_incoming_nonlocal_nodes_);
+              grouped_face.source_slot = source_it->second;
+              incoming_face_order.push_back(
+                {grouped_face.source_slot,
+                 cell.global_id,
+                 static_cast<unsigned int>(f),
+                 static_cast<std::uint32_t>(incoming_nonlocal_faces_.size() - 1)});
+              ++num_incoming_nonlocal_faces_;
+            }
+
+            auto& grouped_face =
+              incoming_nonlocal_faces_[cell_to_incoming_nonlocal_face_offsets_[cell.local_id] +
+                                       grouped_face_index];
+            ++grouped_face.num_nodes;
             ++num_incoming_nonlocal_nodes_;
           }
           else
           {
             node_index = CBCD_NodeIndex(num_incoming_boundary_nodes_, is_outgoing_face);
-            incoming_boundary_node_map_.emplace_back(
-              BoundaryNodeInfo{cell.local_id,
-                               static_cast<unsigned int>(f),
-                               fn,
-                               static_cast<std::uint64_t>(num_incoming_boundary_nodes_),
-                               face.neighbor_id});
+            if (fn == 0)
+            {
+              incoming_boundary_face_plans_.push_back(
+                {face.neighbor_id,
+                 static_cast<std::uint32_t>(cell.local_id),
+                 static_cast<unsigned int>(f),
+                 0,
+                 static_cast<std::uint32_t>(num_incoming_boundary_nodes_),
+                 static_cast<std::uint16_t>(num_face_nodes)});
+            }
             ++num_incoming_boundary_nodes_;
           }
         }
@@ -102,32 +169,71 @@ CBCD_FLUDSCommonData::CopyFlattenedNodeIndexToDevice(const SpatialDiscretization
         {
           if (is_local_face)
           {
-            const int cell_node = sdm.GetCellMapping(cell).MapFaceNode(f, fn);
-            const std::uint64_t index = cell_spatial_dof_offsets[cell.local_id] + cell_node;
-            node_index = CBCD_NodeIndex(index, is_outgoing_face, is_local_face);
+            const auto task_id = cbc_spds.GetOutgoingLocalFaceTaskID(
+              static_cast<std::uint32_t>(cell.local_id), static_cast<unsigned int>(f));
+            const auto slot_id = local_face_slot_ids[task_id];
+            node_index =
+              CBCD_NodeIndex(static_cast<std::uint64_t>(local_face_slot_node_offsets[slot_id]) +
+                               static_cast<std::uint64_t>(fn),
+                             is_outgoing_face,
+                             true);
           }
           else if (not is_boundary_face)
           {
             node_index =
               CBCD_NodeIndex(num_outgoing_nonlocal_nodes_, is_outgoing_face, is_local_face);
-            cell_to_outgoing_nonlocal_nodes_[cell.local_id].emplace_back(
-              NonlocalNodeInfo{cell.local_id,
-                               cell.global_id,
-                               static_cast<unsigned int>(f),
-                               fn,
-                               face_nodal_mapping.face_node_mapping_[fn],
-                               static_cast<std::uint64_t>(num_outgoing_nonlocal_nodes_)});
+            int& grouped_face_index = outgoing_face_to_grouped_index[f];
+            if (grouped_face_index < 0)
+            {
+              const int locality = grid.cells[face.neighbor_id].partition_id;
+              auto dest_slot_it = locality_to_dest_slot.find(locality);
+              std::uint32_t dest_slot = 0;
+              if (dest_slot_it == locality_to_dest_slot.end())
+              {
+                dest_slot = static_cast<std::uint32_t>(outgoing_localities_.size());
+                locality_to_dest_slot.emplace(locality, dest_slot);
+                outgoing_localities_.push_back(locality);
+              }
+              else
+                dest_slot = dest_slot_it->second;
+
+              const auto dest_cell_global_id = face.neighbor_id;
+              const auto dest_face_id =
+                static_cast<unsigned int>(face_nodal_mapping.associated_face_);
+              grouped_face_index =
+                static_cast<int>(outgoing_nonlocal_faces_.size() -
+                                 cell_to_outgoing_nonlocal_face_offsets_[cell.local_id]);
+              auto& grouped_face = outgoing_nonlocal_faces_.emplace_back();
+              grouped_face.dest_slot = dest_slot;
+              grouped_face.num_face_nodes = static_cast<std::uint16_t>(num_face_nodes);
+              grouped_face.node_copy_offset =
+                static_cast<std::uint32_t>(outgoing_nonlocal_face_node_copies_.size());
+              outgoing_face_order.push_back(
+                {dest_slot,
+                 dest_cell_global_id,
+                 dest_face_id,
+                 static_cast<std::uint32_t>(outgoing_nonlocal_faces_.size() - 1)});
+              ++num_outgoing_nonlocal_faces_;
+            }
+
+            auto& grouped_face =
+              outgoing_nonlocal_faces_[cell_to_outgoing_nonlocal_face_offsets_[cell.local_id] +
+                                       grouped_face_index];
+            outgoing_nonlocal_face_node_copies_.push_back(
+              {static_cast<std::uint32_t>(num_outgoing_nonlocal_nodes_),
+               static_cast<std::uint16_t>(face_nodal_mapping.face_node_mapping_[fn])});
+            ++grouped_face.num_node_copies;
             ++num_outgoing_nonlocal_nodes_;
           }
           else
           {
             node_index = CBCD_NodeIndex(num_outgoing_boundary_nodes_, is_outgoing_face);
-            cell_to_outgoing_boundary_nodes_[cell.local_id].emplace_back(
-              BoundaryNodeInfo{cell.local_id,
+            outgoing_boundary_nodes_.emplace_back(
+              BoundaryNodeInfo{face.neighbor_id,
+                               static_cast<std::uint32_t>(cell.local_id),
                                static_cast<unsigned int>(f),
-                               fn,
-                               static_cast<std::uint64_t>(num_outgoing_boundary_nodes_),
-                               face.neighbor_id});
+                               static_cast<std::uint32_t>(num_outgoing_boundary_nodes_),
+                               static_cast<std::uint16_t>(fn)});
             ++num_outgoing_boundary_nodes_;
           }
         }
@@ -139,9 +245,52 @@ CBCD_FLUDSCommonData::CopyFlattenedNodeIndexToDevice(const SpatialDiscretization
       }
       num_cell_nodes += num_face_nodes;
     }
+    update_cell_offsets(cell.local_id + 1);
     cell_offsets_ptr[2 * cell.local_id + 1] = num_cell_nodes;
     current_index_offset += num_cell_nodes;
   }
+
+  std::sort(incoming_face_order.begin(),
+            incoming_face_order.end(),
+            [](const OrderedIncomingFaceBuild& lhs, const OrderedIncomingFaceBuild& rhs)
+            {
+              return std::tuple(lhs.source_slot, lhs.cell_global_id, lhs.face_id) <
+                     std::tuple(rhs.source_slot, rhs.cell_global_id, rhs.face_id);
+            });
+
+  source_to_incoming_face_offsets_.assign(incoming_source_partitions_.size() + 1, 0);
+  for (const auto& build : incoming_face_order)
+    ++source_to_incoming_face_offsets_[build.source_slot + 1];
+  for (std::size_t i = 0; i < incoming_source_partitions_.size(); ++i)
+    source_to_incoming_face_offsets_[i + 1] += source_to_incoming_face_offsets_[i];
+
+  incoming_face_indices_by_source_.resize(incoming_face_order.size());
+  auto source_write_offsets = source_to_incoming_face_offsets_;
+  for (const auto& build : incoming_face_order)
+    incoming_face_indices_by_source_[source_write_offsets[build.source_slot]++] = build.face_index;
+
+  std::sort(outgoing_face_order.begin(),
+            outgoing_face_order.end(),
+            [](const OrderedOutgoingFaceBuild& lhs, const OrderedOutgoingFaceBuild& rhs)
+            {
+              return std::tuple(lhs.dest_slot, lhs.cell_global_id, lhs.face_id) <
+                     std::tuple(rhs.dest_slot, rhs.cell_global_id, rhs.face_id);
+            });
+
+  std::uint32_t current_dest_slot = 0;
+  std::uint32_t remote_face_index = 0;
+  bool first_outgoing_face = true;
+  for (const auto& build : outgoing_face_order)
+  {
+    if (first_outgoing_face or (build.dest_slot != current_dest_slot))
+    {
+      current_dest_slot = build.dest_slot;
+      remote_face_index = 0;
+      first_outgoing_face = false;
+    }
+    outgoing_nonlocal_faces_[build.face_index].remote_face_index = remote_face_index++;
+  }
+
   if (local_map.empty())
     return;
   crb::HostVector<std::uint64_t> host_mem(local_map.begin(), local_map.end());
@@ -160,4 +309,4 @@ CBCD_FLUDSCommonData::DeallocateDeviceMemory()
     device_cell_face_node_map_ = nullptr;
   }
 }
-} // namespace opensn
\ No newline at end of file
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h
index 1d61b5201e..c77f56d975 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h
@@ -6,17 +6,31 @@
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_structs.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds_common_data.h"
 #include <cstdint>
-#include <map>
+#include <span>
+#include <vector>
 
 namespace opensn
 {
 
 class SpatialDiscretization;
 
-/// Common data for CBCD_FLUDS
+/**
+ * Shared CBCD FLUDS metadata.
+ *
+ * Builds and owns the flattened indexing tables used by every CBCD FLUDS instance
+ * associated with one SPDS. The tables translate cell-face-node accesses into
+ * compact local, boundary, and non-local storage indices on both the host and device.
+ */
 class CBCD_FLUDSCommonData : public FLUDSCommonData
 {
 public:
+  /**
+   * Construct the shared CBCD FLUDS metadata for one SPDS.
+   *
+   * \param spds Sweep plane data structure providing the CBC cell and face ordering.
+   * \param grid_nodal_mappings Per-cell face-node mappings from the spatial discretization.
+   * \param sdm Spatial discretization used to enumerate face nodes.
+   */
   CBCD_FLUDSCommonData(const SPDS& spds,
                        const std::vector<CellFaceNodalMapping>& grid_nodal_mappings,
                        const SpatialDiscretization& sdm);
@@ -41,28 +55,70 @@ class CBCD_FLUDSCommonData : public FLUDSCommonData
   /// Get number of outgoing non-local faces.
   std::size_t GetNumOutgoingNonlocalFaces() const { return num_outgoing_nonlocal_faces_; }
 
-  /// Get incoming boundary node map.
-  const std::vector<BoundaryNodeInfo>& GetIncomingBoundaryNodeMap() const
+  /// Return grouped incoming-boundary faces.
+  const std::vector<IncomingBoundaryFacePlan>& GetIncomingBoundaryFaces() const
+  {
+    return incoming_boundary_face_plans_;
+  }
+
+  /// Return the number of grouped incoming non-local faces from one source locality slot.
+  std::size_t GetNumIncomingFacesFromSource(const std::size_t source_slot) const
+  {
+    return source_to_incoming_face_offsets_[source_slot + 1] -
+           source_to_incoming_face_offsets_[source_slot];
+  }
+
+  /// Return outgoing-boundary nodes for one cell.
+  std::span<const BoundaryNodeInfo> GetOutgoingBoundaryNodes(std::uint64_t cell_local_id) const
   {
-    return incoming_boundary_node_map_;
+    const auto begin = cell_to_outgoing_boundary_node_offsets_[cell_local_id];
+    const auto end = cell_to_outgoing_boundary_node_offsets_[cell_local_id + 1];
+    return {outgoing_boundary_nodes_.data() + begin, end - begin};
   }
 
-  /// Get outgoing boundary node map.
-  const std::map<std::uint64_t, std::vector<BoundaryNodeInfo>>& GetOutgoingBoundaryNodeMap() const
+  /// Return grouped outgoing non-local faces for one cell.
+  std::span<const GroupedOutgoingNonlocalFace>
+  GetOutgoingNonlocalFaces(std::uint64_t cell_local_id) const
   {
-    return cell_to_outgoing_boundary_nodes_;
+    const auto begin = cell_to_outgoing_nonlocal_face_offsets_[cell_local_id];
+    const auto end = cell_to_outgoing_nonlocal_face_offsets_[cell_local_id + 1];
+    return {outgoing_nonlocal_faces_.data() + begin, end - begin};
   }
 
-  /// Get incoming nonlocal node map.
-  const std::map<std::uint64_t, std::vector<NonlocalNodeInfo>>& GetIncomingNonlocalNodeMap() const
+  /// Return grouped incoming non-local faces for one cell.
+  std::span<const GroupedIncomingNonlocalFace>
+  GetIncomingNonlocalFaces(std::uint64_t cell_local_id) const
   {
-    return cell_to_incoming_nonlocal_nodes_;
+    const auto begin = cell_to_incoming_nonlocal_face_offsets_[cell_local_id];
+    const auto end = cell_to_incoming_nonlocal_face_offsets_[cell_local_id + 1];
+    return {incoming_nonlocal_faces_.data() + begin, end - begin};
   }
 
-  /// Get outgoing nonlocal node map.
-  const std::map<std::uint64_t, std::vector<NonlocalNodeInfo>>& GetOutgoingNonlocalNodeMap() const
+  /// Return the number of local cells represented in the grouped-face tables.
+  std::size_t GetNumLocalCells() const
+  {
+    return cell_to_outgoing_nonlocal_face_offsets_.size() - 1;
+  }
+
+  /// Return the ordered outgoing-locality table used to build communicator queue indices.
+  const std::vector<int>& GetOutgoingLocalities() const { return outgoing_localities_; }
+
+  /// Return the ordered incoming source-locality table.
+  const std::vector<int>& GetIncomingSourcePartitions() const
+  {
+    return incoming_source_partitions_;
+  }
+
+  /// Resolve one grouped incoming non-local face by source-slot-local face index.
+  const GroupedIncomingNonlocalFace& GetIncomingNonlocalFace(std::uint32_t source_slot,
+                                                             std::uint32_t source_face_index) const;
+
+  /// Return the outgoing-node-copy descriptors for one grouped outgoing face.
+  std::span<const OutgoingNodeCopy>
+  GetOutgoingNodeCopies(const GroupedOutgoingNonlocalFace& face) const
   {
-    return cell_to_outgoing_nonlocal_nodes_;
+    return {outgoing_nonlocal_face_node_copies_.data() + face.node_copy_offset,
+            face.num_node_copies};
   }
 
   /// Get pointer to cell-face-node map on device.
@@ -83,22 +139,39 @@ class CBCD_FLUDSCommonData : public FLUDSCommonData
   size_t num_outgoing_nonlocal_nodes_;
   /// Device pointer to cell-face-node map for angular flux buffer access.
   std::uint64_t* device_cell_face_node_map_;
-  /// Map from incoming face boundary node to indexing metadata.
-  std::vector<BoundaryNodeInfo> incoming_boundary_node_map_;
-  /// Map from cell to outgoing boundary nodes.
-  std::map<std::uint64_t, std::vector<BoundaryNodeInfo>> cell_to_outgoing_boundary_nodes_;
-  /// Map from cell to incoming nonlocal nodes.
-  std::map<std::uint64_t, std::vector<NonlocalNodeInfo>> cell_to_incoming_nonlocal_nodes_;
-  /// Map from cell to outgoing nonlocal nodes.
-  std::map<std::uint64_t, std::vector<NonlocalNodeInfo>> cell_to_outgoing_nonlocal_nodes_;
+  /// Flat grouped incoming-boundary face copy plans.
+  std::vector<IncomingBoundaryFacePlan> incoming_boundary_face_plans_;
+  /// Cell-to-outgoing-boundary-node offset table.
+  std::vector<std::uint32_t> cell_to_outgoing_boundary_node_offsets_;
+  /// Flat outgoing-boundary node list.
+  std::vector<BoundaryNodeInfo> outgoing_boundary_nodes_;
+  /// Cell-to-incoming-face offset table.
+  std::vector<std::uint32_t> cell_to_incoming_nonlocal_face_offsets_;
+  /// Cell-to-outgoing-face offset table.
+  std::vector<std::uint32_t> cell_to_outgoing_nonlocal_face_offsets_;
+  /// Flat grouped incoming nonlocal faces.
+  std::vector<GroupedIncomingNonlocalFace> incoming_nonlocal_faces_;
+  /// Flat grouped outgoing nonlocal faces.
+  std::vector<GroupedOutgoingNonlocalFace> outgoing_nonlocal_faces_;
+  /// Flat outgoing-node-copy metadata referenced by grouped outgoing faces.
+  std::vector<OutgoingNodeCopy> outgoing_nonlocal_face_node_copies_;
+  /// Ordered table of distinct outgoing localities.
+  std::vector<int> outgoing_localities_;
+  /// Ordered table of incoming source localities.
+  std::vector<int> incoming_source_partitions_;
+  /// Per-source-slot offsets into incoming_face_indices_by_source_ (one extra end entry).
+  std::vector<std::uint32_t> source_to_incoming_face_offsets_;
+  /// Indices into incoming_nonlocal_faces_, source-major, sorted by (cell global ID, face ID).
+  std::vector<std::uint32_t> incoming_face_indices_by_source_;
 
   /**
-   * Compute cell-face-node map for device angular flux buffer access, and
-   * create auxiliary indexing maps for boundary and non-local nodes for host access.
+   * Build and upload the flattened cell-face-node index map.
+   *
+   * \param sdm Spatial discretization used to enumerate face nodes.
    */
   void CopyFlattenedNodeIndexToDevice(const SpatialDiscretization& sdm);
   /// Deallocate device memory for cell-face-node map.
   void DeallocateDeviceMemory();
 };
 
-} // namespace opensn
\ No newline at end of file
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_structs.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_structs.h
index 1e959958ba..e406dccdd5 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_structs.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_structs.h
@@ -4,16 +4,30 @@
 #pragma once
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds_structs.h"
+#include <array>
+#include <cstddef>
+#include <functional>
 
 namespace opensn
 {
 
+class SweepBoundary;
+
 /**
- * Node index specific to CBCD FLUDS.
+ * Packed 64-bit angular flux buffer index for CBCD FLUDS.
+ *
+ * Encodes the buffer type (local/boundary/non-local, incoming/outgoing) and
+ * address into a single 64-bit value.
  * Does not support delayed nodes. Reclaims the delayed bit for indices.
- * - Bit 63: Incoming/outgoing bit.
- * - Bit 62: Boundary bit.
- * - Bit 61: Local bit.
+ *
+ * Bit layout:
+ * - Bit 63: incoming (0) / outgoing (1).
+ * - Bit 62: boundary (1) / non-boundary (0).
+ * - Bit 61: local (1) / non-local (0).
+ * - For local non-boundary nodes:
+ *   - Bits 0-60: flat local-face-slot node bank index.
+ * - For boundary or non-local nodes:
+ *   - Bits 0-60: flat bank index.
  * - Bits 0-60: Index bits (capacity ~2.3e18).
  */
 class CBCD_NodeIndex : public NodeIndex
@@ -162,24 +176,66 @@ struct CBCD_FLUDSPointerSet : public FLUDSPointerSet
  */
 struct BoundaryNodeInfo
 {
-  std::uint64_t cell_local_id;
-  unsigned int face_id;
-  size_t face_node;
-  std::uint64_t storage_index;
-  std::uint64_t boundary_id;
+  std::uint64_t boundary_id = 0;
+  std::uint32_t cell_local_id = 0;
+  unsigned int face_id = 0;
+  std::uint32_t storage_index = 0;
+  std::uint16_t face_node = 0;
 };
 
-/**
- * Metadata for non-local face nodes.
- */
-struct NonlocalNodeInfo
+/// Grouped incoming-boundary face copy plan.
+struct IncomingBoundaryFacePlan
+{
+  std::uint64_t boundary_id = 0;
+  std::uint32_t cell_local_id = 0;
+  unsigned int face_id = 0;
+  std::uint16_t first_face_node = 0;
+  std::uint32_t base_storage_index = 0;
+  std::uint16_t num_nodes = 0;
+};
+
+/// Grouped incoming non-local face.
+struct GroupedIncomingNonlocalFace
+{
+  std::uint32_t cell_local_id = 0;
+  std::uint32_t base_storage_index = 0;
+  std::uint32_t source_slot = 0;
+  std::uint16_t num_nodes = 0;
+};
+
+/// Outgoing node-copy descriptor: pairs a storage index with a face node.
+struct OutgoingNodeCopy
+{
+  std::uint32_t storage_index = 0;
+  std::uint16_t face_node = 0;
+};
+
+/// Grouped outgoing non-local face.
+struct GroupedOutgoingNonlocalFace
+{
+  std::uint32_t dest_slot = 0;
+  std::uint32_t remote_face_index = 0;
+  std::uint32_t node_copy_offset = 0;
+  std::uint16_t num_face_nodes = 0;
+  std::uint16_t num_node_copies = 0;
+};
+
+/// Reflecting-boundary face copy plan.
+struct ReflectingBoundaryFacePlan
+{
+  SweepBoundary* boundary = nullptr;
+  std::uint32_t cell_local_id = 0;
+  unsigned int face_id = 0;
+  std::uint16_t first_face_node = 0;
+  std::size_t src_base_offset = 0;
+  std::uint16_t num_nodes = 0;
+};
+
+/// Outgoing node-copy plan entry.
+struct OutgoingNodeMemcpy
 {
-  std::uint64_t cell_local_id;
-  std::uint64_t cell_global_id;
-  unsigned int face_id;
-  size_t face_node;
-  short face_node_mapped;
-  std::uint64_t storage_index;
+  std::size_t src_offset = 0;
+  std::size_t dest_offset = 0;
 };
 
-} // namespace opensn
\ No newline at end of file
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cc
index af76cb45ec..a082ea62dd 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cc
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cc
@@ -29,7 +29,8 @@ SweepScheduler::SweepScheduler(SchedulingAlgorithm scheduler_type,
     InitializeAlgoDOG();
 
   if (scheduler_type_ == SchedulingAlgorithm::ALL_AT_ONCE ||
-      scheduler_type_ == SchedulingAlgorithm::DEPTH_OF_GRAPH)
+      scheduler_type_ == SchedulingAlgorithm::DEPTH_OF_GRAPH ||
+      scheduler_type_ == SchedulingAlgorithm::ASYNC_FIFO)
   {
     angle_agg_.SetupAngleSetDependencies();
   }
@@ -39,6 +40,14 @@ SweepScheduler::SweepScheduler(SchedulingAlgorithm scheduler_type,
     pool_.Resize(angle_agg_.GetNumAngleSets());
     execution_order_.reserve(angle_agg_.GetNumAngleSets());
   }
+  else if (scheduler_type_ == SchedulingAlgorithm::ASYNC_FIFO)
+  {
+    const std::size_t hardware_concurrency = std::thread::hardware_concurrency();
+    const std::size_t num_workers = std::max<std::size_t>(
+      1,
+      std::min(angle_agg_.GetNumAngleSets(), hardware_concurrency == 0 ? 1 : hardware_concurrency));
+    pool_.Resize(num_workers);
+  }
 
   // Initialize delayed upstream data
   for (auto& angset : angle_agg_)
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cu
index 861aeb305f..36ccade153 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cu
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cu
@@ -4,14 +4,10 @@
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/aahd_angle_set.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aahd_sweep_chunk.h"
-#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h"
-#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.h"
 #include "caribou/main.hpp"
 #include "caliper/cali.h"
-#include <thread>
-#include <vector>
 
 namespace opensn
 {
@@ -85,194 +81,52 @@ SweepScheduler::ScheduleAlgoAsyncFIFO(SweepChunk& sweep_chunk)
   CALI_CXX_MARK_SCOPE("SweepScheduler::ScheduleAlgoAsyncFIFO");
 
   auto& cbcd_sweep_chunk = static_cast<CBCDSweepChunk&>(sweep_chunk);
-  // Copy phi and source moments to device
   cbcd_sweep_chunk.GetProblem().CopyPhiAndSrcToDevice();
+  cbcd_sweep_chunk.RefreshCachedKernelArgs();
 
   auto& angle_sets = cbcd_sweep_chunk.GetAngleSets();
-  auto& fluds_list = cbcd_sweep_chunk.GetFLUDS();
-  auto& streams_list = cbcd_sweep_chunk.GetStreams();
-
-  const size_t num_angle_sets = angle_sets.size();
-  std::vector<bool> executed(num_angle_sets, 0);
-  std::vector<bool> boundary_data_set(num_angle_sets, 0);
-  std::vector<bool> kernel_in_flight(num_angle_sets, 0);
-  std::vector<std::vector<Task*>> ready_queues(num_angle_sets);
-  std::vector<size_t> num_completed_tasks(num_angle_sets, 0);
-  std::vector<std::vector<Task*>> ready_tasks(num_angle_sets);
-  std::vector<std::vector<std::uint32_t>> ready_cell_ids(num_angle_sets);
-  std::vector<std::vector<Task*>> in_flight_tasks(num_angle_sets);
-  std::vector<std::vector<std::uint32_t>> in_flight_cell_ids(num_angle_sets);
-
+  const auto num_angle_sets = angle_sets.size();
   for (auto* angle_set : angle_sets)
-  {
-    auto& current_task_list = angle_set->GetCurrentTaskList();
-    if (current_task_list.empty())
-      current_task_list = static_cast<const CBC_SPDS&>(angle_set->GetSPDS()).GetTaskList();
-  }
+    angle_set->ResetDependencyCounter();
 
-  size_t executed_anglesets = 0;
-  while (executed_anglesets < num_angle_sets)
-  {
-    bool any_work_done = false;
+  cbcd_sweep_chunk.StartCommunicator();
 
-    // Poll completed kernels
-    for (size_t i = 0; i < num_angle_sets; ++i)
+  const auto num_workers = pool_.GetSize();
+  pool_.ExecuteBatch(
+    [num_workers, num_angle_sets, &angle_sets, &cbcd_sweep_chunk](std::size_t worker_id)
     {
-      if (not kernel_in_flight[i])
-        continue;
-      // Check if the kernel is done
-      if (streams_list[i].is_completed())
+      const auto chunk_size = (num_angle_sets + num_workers - 1) / num_workers;
+      const auto begin = worker_id * chunk_size;
+      const auto end = std::min(begin + chunk_size, num_angle_sets);
+
+      bool all_done = false;
+      while (not all_done)
       {
-        // Copy back outgoing (reflecting) boundary and non-local psi
-        fluds_list[i]->CopyOutgoingPsiBackToHost(
-          cbcd_sweep_chunk, angle_sets[i], in_flight_cell_ids[i]);
-        // Update task dependencies
-        auto& current_task_list = angle_sets[i]->GetCurrentTaskList();
-        for (auto* task : in_flight_tasks[i])
+        all_done = true;
+        bool any_work_done = false;
+        for (std::size_t i = begin; i < end; ++i)
         {
-          for (uint64_t succ : task->successors)
+          auto* angle_set = angle_sets[i];
+          if (angle_set->IsExecuted())
+            continue;
+          all_done = false;
+          if (not angle_set->IsInitialized())
           {
-            --current_task_list[succ].num_dependencies;
-            if (current_task_list[succ].num_dependencies == 0 and boundary_data_set[i])
-              ready_queues[i].push_back(&current_task_list[succ]);
+            any_work_done |= angle_set->TryInitialize(cbcd_sweep_chunk);
+            continue;
           }
-          task->completed = true;
-        }
-        num_completed_tasks[i] += in_flight_tasks[i].size();
-        // Send MPI data
-        auto* comm = static_cast<CBC_AsynchronousCommunicator*>(angle_sets[i]->GetCommunicator());
-        comm->SendData();
-        in_flight_tasks[i].clear();
-        in_flight_cell_ids[i].clear();
-        kernel_in_flight[i] = false;
-        any_work_done = true;
-      }
-    }
-
-    // Receive and send MPI data
-    for (size_t i = 0; i < num_angle_sets; ++i)
-    {
-      if (executed[i])
-        continue;
-      auto* comm = static_cast<CBC_AsynchronousCommunicator*>(angle_sets[i]->GetCommunicator());
-      auto& current_task_list = angle_sets[i]->GetCurrentTaskList();
-      auto received = comm->ReceiveData();
-      if (not received.empty())
-      {
-        for (uint64_t t : received)
-        {
-          --current_task_list[t].num_dependencies;
-          if (current_task_list[t].num_dependencies == 0 and boundary_data_set[i])
-            ready_queues[i].push_back(&current_task_list[t]);
-        }
-        any_work_done = true;
-      }
-      comm->SendData();
-    }
-
-    // Set boundary data
-    for (size_t i = 0; i < num_angle_sets; ++i)
-    {
-      if (executed[i] or boundary_data_set[i] or kernel_in_flight[i])
-        continue;
-      auto* as = angle_sets[i];
-      bool boundaries_ready = true;
-      for (auto& [bid, boundary] : as->GetBoundaries())
-      {
-        if (not boundary->CheckAnglesReadyStatus(as->GetAngleIndices()))
-        {
-          boundaries_ready = false;
-          break;
-        }
-      }
-      if (boundaries_ready)
-      {
-        fluds_list[i]->CopyIncomingBoundaryPsiToDevice(cbcd_sweep_chunk, angle_sets[i]);
-        boundary_data_set[i] = true;
-        any_work_done = true;
-
-        auto& current_task_list = angle_sets[i]->GetCurrentTaskList();
-        for (auto& task : current_task_list)
-        {
-          if (task.num_dependencies == 0 and not task.completed)
-            ready_queues[i].push_back(&task);
+          any_work_done |= angle_set->TryAdvanceOneStep(cbcd_sweep_chunk);
         }
+        if ((not all_done) and (not any_work_done))
+          std::this_thread::yield();
       }
-    }
-
-    // Collect ready tasks and launch kernels (only if task dependencies changed)
-    if (any_work_done)
-    {
-      for (size_t i = 0; i < num_angle_sets; ++i)
-      {
-        if (executed[i] or (not boundary_data_set[i]) or kernel_in_flight[i])
-          continue;
-
-        if (ready_queues[i].empty())
-          continue;
-
-        ready_tasks[i] = std::move(ready_queues[i]);
-        ready_queues[i].clear();
-
-        ready_cell_ids[i].clear();
-        for (auto* task : ready_tasks[i])
-          ready_cell_ids[i].push_back(task->reference_id);
-
-        fluds_list[i]->CopyIncomingNonlocalPsiToDevice(angle_sets[i], ready_cell_ids[i]);
-        cbcd_sweep_chunk.Sweep(ready_cell_ids[i], i);
-        in_flight_tasks[i] = std::move(ready_tasks[i]);
-        in_flight_cell_ids[i] = std::move(ready_cell_ids[i]);
-        kernel_in_flight[i] = true;
-      }
-    }
+    });
 
-    // Check angleset completion
-    for (size_t i = 0; i < num_angle_sets; ++i)
-    {
-      if (executed[i] or (not boundary_data_set[i]) or kernel_in_flight[i])
-        continue;
-      auto& current_task_list = angle_sets[i]->GetCurrentTaskList();
-      auto* comm = static_cast<CBC_AsynchronousCommunicator*>(angle_sets[i]->GetCommunicator());
-      bool all_done = (num_completed_tasks[i] == current_task_list.size());
-      if (all_done and comm->SendData())
-      {
-        for (auto& [bid, boundary] : angle_sets[i]->GetBoundaries())
-          boundary->UpdateAnglesReadyStatus(angle_sets[i]->GetAngleIndices());
-        executed[i] = true;
-        ++executed_anglesets;
-        fluds_list[i]->CopySavedPsiFromDevice();
-        auto* fluds = fluds_list[i];
-        auto* as = angle_sets[i];
-        // Cast away constness to add a callback
-        streams_list[i].add_callback(
-          [fluds, &cbcd_sweep_chunk, as]()
-          { fluds->CopySavedPsiToDestinationPsi(cbcd_sweep_chunk, as); });
-      }
-    }
-  }
+  cbcd_sweep_chunk.StopCommunicator();
 
-  /// Copy phi and outflow data back to host
   cbcd_sweep_chunk.GetProblem().CopyPhiAndOutflowBackToHost();
 
-  // Receive delayed data
-  opensn::mpi_comm.barrier();
-  bool received_delayed_data = false;
-  while (not received_delayed_data)
-  {
-    received_delayed_data = true;
-
-    for (auto& angle_set : angle_sets)
-    {
-      if (angle_set->FlushSendBuffers() == AngleSetStatus::MESSAGES_PENDING)
-        received_delayed_data = false;
-
-      if (not angle_set->ReceiveDelayedData())
-        received_delayed_data = false;
-    }
-  }
-
-  // Reset all
-  for (auto& angle_set : angle_sets)
+  for (auto* angle_set : angle_sets)
     angle_set->ResetSweepBuffers();
 
   for (const auto& [bid, bndry] : angle_agg_.GetSimBoundaries())
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h
index 4504375d9f..71abd6eb1c 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h
@@ -71,7 +71,7 @@ struct CBCSweepData
   /// Number of nodes on the current cell.
   size_t cell_num_nodes;
 
-  /// Number of energy groups in the groupset.
+  /// Number of groups in the groupset.
   size_t gs_size;
   /// First group index in the groupset.
   unsigned int gs_gi;
@@ -481,7 +481,7 @@ CBC_Sweep_Generic(CBCSweepData& data, CBCGenericSweepScratch& scratch, AngleSet&
  *
  * Specialized in cbc_avx_sweep_chunk.cc for compile-time-known node counts
  * (4, 8, etc.), enabling stack-allocated matrices, loop unrolling, and SIMD
- * batch Gauss elimination across multiple energy groups simultaneously.
+ * batch Gauss elimination across multiple groups simultaneously.
  *
  * \tparam NumNodes compile-time number of cell nodes
  * \tparam time_dependent if true, include the time-derivative source term
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.cu
index 4bd0a8bb16..316a0ea323 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.cu
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.cu
@@ -8,6 +8,8 @@
 #include "modules/linear_boltzmann_solvers/lbs_problem/device/carrier/mesh_carrier.h"
 #include "caliper/cali.h"
 #include <algorithm>
+#include <set>
+#include <unordered_map>
 
 namespace opensn
 {
@@ -27,54 +29,168 @@ CBCDSweepChunk::CBCDSweepChunk(DiscreteOrdinatesProblem& problem, LBSGroupset& g
                problem.GetMinCellDOFCount()),
     problem_(problem)
 {
+  std::vector<CBCD_FLUDS*> fluds_list;
   for (auto& as : *(groupset.angle_agg))
   {
     auto* angle_set = static_cast<CBCD_AngleSet*>(as.get());
     auto* fluds = static_cast<CBCD_FLUDS*>(&(angle_set->GetFLUDS()));
     angle_sets_.push_back(angle_set);
-    fluds_list_.push_back(fluds);
-    streams_list_.push_back(angle_set->GetStream());
+    fluds_list.push_back(fluds);
+
     gpu_kernel::Arguments<gpu_kernel::SweepType::CBC> args(problem_, groupset_, *angle_set, *fluds);
-    kernel_args_list_.push_back(args);
-    unsigned int stride_size =
+    const auto stride_size =
       gpu_kernel::RoundUp(static_cast<unsigned int>(args.flud_data.stride_size));
-    unsigned int block_size_x = std::min(stride_size, gpu_kernel::threshold);
-    unsigned int block_size_y = gpu_kernel::threshold / block_size_x;
-    unsigned int grid_size_x = (stride_size + gpu_kernel::threshold - 1) / gpu_kernel::threshold;
-    block_sizes_.push_back(crb::Dim3(block_size_x, block_size_y));
-    grid_size_x_list_.push_back(grid_size_x);
+    const auto block_size_x = std::min(stride_size, gpu_kernel::threshold);
+    const auto block_size_y = gpu_kernel::threshold / block_size_x;
+    const auto grid_size_x = (stride_size + gpu_kernel::threshold - 1) / gpu_kernel::threshold;
+    cached_params_.push_back({args,
+                              crb::Dim3(block_size_x, block_size_y),
+                              grid_size_x,
+                              fluds,
+                              fluds->GetSavedAngularFluxDevicePointer()});
+  }
+
+  if (not angle_sets_.empty())
+  {
+    std::vector<std::vector<int>> incoming_source_partitions_by_angle_set;
+    incoming_source_partitions_by_angle_set.reserve(angle_sets_.size());
+    std::unordered_map<int, std::vector<std::size_t>> source_as_section_bytes;
+    std::vector<AngleSetCapacity> capacities(angle_sets_.size());
+    for (std::size_t as_ss_idx = 0; as_ss_idx < angle_sets_.size(); ++as_ss_idx)
+    {
+      const auto stride = fluds_list[as_ss_idx]->GetStrideSize();
+      const auto& common_data = fluds_list[as_ss_idx]->GetCommonData();
+      incoming_source_partitions_by_angle_set.push_back(common_data.GetIncomingSourcePartitions());
+      capacities[as_ss_idx].outgoing_faces = common_data.GetNumOutgoingNonlocalFaces();
+      capacities[as_ss_idx].incoming_faces = common_data.GetNumIncomingNonlocalFaces();
+      for (std::size_t cell_local_id = 0; cell_local_id < common_data.GetNumLocalCells();
+           ++cell_local_id)
+      {
+        for (const auto& face_info : common_data.GetOutgoingNonlocalFaces(cell_local_id))
+        {
+          capacities[as_ss_idx].max_outgoing_face_values =
+            std::max(capacities[as_ss_idx].max_outgoing_face_values,
+                     static_cast<std::size_t>(face_info.num_face_nodes) * stride);
+        }
+      }
+
+      std::unordered_map<std::uint32_t, std::size_t> incoming_entries_by_source_slot;
+      std::unordered_map<std::uint32_t, std::size_t> incoming_values_by_source_slot;
+      for (std::size_t cell_local_id = 0; cell_local_id < common_data.GetNumLocalCells();
+           ++cell_local_id)
+      {
+        for (const auto& face_info : common_data.GetIncomingNonlocalFaces(cell_local_id))
+        {
+          if (face_info.num_nodes == 0)
+            continue;
+          ++incoming_entries_by_source_slot[face_info.source_slot];
+          incoming_values_by_source_slot[face_info.source_slot] +=
+            static_cast<std::size_t>(face_info.num_nodes) * stride;
+          const auto source_partition =
+            common_data.GetIncomingSourcePartitions()[face_info.source_slot];
+          auto& per_as_bytes = source_as_section_bytes[source_partition];
+          if (per_as_bytes.empty())
+            per_as_bytes.assign(angle_sets_.size(), 0);
+          per_as_bytes[as_ss_idx] +=
+            sizeof(std::uint32_t) + sizeof(std::size_t) +
+            static_cast<std::size_t>(face_info.num_nodes) * stride * sizeof(double);
+        }
+      }
+      for (const auto& [_, count] : incoming_entries_by_source_slot)
+        capacities[as_ss_idx].max_incoming_batch_entries =
+          std::max(capacities[as_ss_idx].max_incoming_batch_entries, count);
+      for (const auto& [_, values] : incoming_values_by_source_slot)
+        capacities[as_ss_idx].max_incoming_batch_values =
+          std::max(capacities[as_ss_idx].max_incoming_batch_values, values);
+    }
+
+    std::size_t max_message_bytes = 0;
+    for (const auto& [_, per_as_bytes] : source_as_section_bytes)
+    {
+      std::size_t msg_size_in_bytes = sizeof(std::size_t);
+      for (const auto& section_bytes : per_as_bytes)
+      {
+        if (section_bytes == 0)
+          continue;
+        msg_size_in_bytes += 2 * sizeof(std::size_t) + section_bytes;
+      }
+      max_message_bytes = std::max(max_message_bytes, msg_size_in_bytes);
+    }
+
+    std::vector<AngleSet*> base_angle_sets(angle_sets_.begin(), angle_sets_.end());
+    async_comm_ =
+      std::make_unique<CBCD_AsynchronousCommunicator>(base_angle_sets,
+                                                      angle_sets_.front()->GetCommunicatorSet(),
+                                                      incoming_source_partitions_by_angle_set,
+                                                      max_message_bytes,
+                                                      capacities);
+    for (auto* angle_set : angle_sets_)
+      angle_set->SetCommunicator(*async_comm_);
   }
 }
 
+CBCDSweepChunk::~CBCDSweepChunk()
+{
+  StopCommunicator();
+}
+
+void
+CBCDSweepChunk::StartCommunicator()
+{
+  if (async_comm_)
+    async_comm_->Start();
+}
+
 void
-CBCDSweepChunk::Sweep(const std::vector<std::uint32_t>& cell_local_ids, size_t angle_set_id)
+CBCDSweepChunk::StopCommunicator()
+{
+  if (async_comm_)
+    async_comm_->Stop();
+}
+
+void
+CBCDSweepChunk::RefreshCachedKernelArgs()
+{
+  CALI_CXX_MARK_SCOPE("CBCDSweepChunk::RefreshCachedKernelArgs");
+
+  for (std::size_t angle_set_id = 0; angle_set_id < angle_sets_.size(); ++angle_set_id)
+  {
+    auto& ck = cached_params_[angle_set_id];
+    {
+      CALI_CXX_MARK_SCOPE("CBCDSweepChunk::Sweep::ArgsRefresh");
+      ck.args = gpu_kernel::Arguments<gpu_kernel::SweepType::CBC>(
+        problem_, groupset_, *angle_sets_[angle_set_id], *ck.fluds);
+      ck.device_saved_psi = ck.fluds->GetSavedAngularFluxDevicePointer();
+    }
+  }
+}
+
+void
+CBCDSweepChunk::Sweep(std::uint32_t num_ready_cells,
+                      std::size_t angle_set_id,
+                      const std::uint32_t* local_cell_ids)
 {
   CALI_CXX_MARK_SCOPE("CBCDSweepChunk::Sweep");
 
-  auto* fluds = fluds_list_[angle_set_id];
-  auto* device_saved_psi = fluds->GetSavedAngularFluxDevicePointer();
-  auto& stream = streams_list_[angle_set_id];
-  auto& host_cell_local_ids = fluds->GetLocalCellIDs();
-  std::copy(cell_local_ids.begin(), cell_local_ids.end(), host_cell_local_ids.begin());
-  const auto& args = kernel_args_list_[angle_set_id];
-  crb::Dim3 block_size = block_sizes_[angle_set_id];
-  unsigned int num_ready_cells = static_cast<unsigned int>(cell_local_ids.size());
-  unsigned int grid_size_x = grid_size_x_list_[angle_set_id];
-  unsigned int grid_size_y = (num_ready_cells + block_size.y - 1) / block_size.y;
-  crb::Dim3 grid_size(grid_size_x, grid_size_y);
-  auto* host_cell_local_ids_data = host_cell_local_ids.data();
+  auto& ck = cached_params_[angle_set_id];
+  auto& stream = angle_sets_[angle_set_id]->GetStream();
+  const auto grid_size_y = (num_ready_cells + ck.block_size.y - 1) / ck.block_size.y;
+  crb::Dim3 grid_size(ck.grid_size_x, grid_size_y);
+  {
+    CALI_CXX_MARK_SCOPE("CBCDSweepChunk::Sweep::KernelLaunch");
 #if defined(__NVCC__) || defined(__HIPCC__)
-  gpu_kernel::SweepKernel<gpu_kernel::SweepType::CBC><<<grid_size, block_size, 0, stream>>>(
-    args, host_cell_local_ids_data, num_ready_cells, device_saved_psi);
+    gpu_kernel::SweepKernel<gpu_kernel::SweepType::CBC><<<grid_size, ck.block_size, 0, stream>>>(
+      ck.args, local_cell_ids, num_ready_cells, ck.device_saved_psi);
 #elif defined(SYCL_LANGUAGE_VERSION) && defined(__INTEL_LLVM_COMPILER)
-  stream.synchronize();
-  stream.parallel_for(sycl::nd_range<3>(grid_size * block_size, block_size),
-                      [=](sycl::nd_item<3> work_index)
-                      {
-                        gpu_kernel::SweepKernel<gpu_kernel::SweepType::CBC>(
-                          args, host_cell_local_ids_data, num_ready_cells, device_saved_psi);
-                      });
+    stream.synchronize();
+    stream.parallel_for(sycl::nd_range<3>(grid_size * ck.block_size, ck.block_size),
+                        [=](sycl::nd_item<3> work_index)
+                        {
+                          gpu_kernel::SweepKernel<gpu_kernel::SweepType::CBC>(
+                            ck.args, local_cell_ids, num_ready_cells, ck.device_saved_psi);
+                        });
 #endif
+  }
 }
 
-} // namespace opensn
\ No newline at end of file
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h
index e1194b26a5..6950096f1f 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.h"
+#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/sweep_chunk.h"
 #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.h"
@@ -15,40 +16,88 @@ namespace crb = caribou;
 namespace opensn
 {
 
-/// CBC sweep chunk for device.
+/**
+ * CBCD sweep chunk.
+ *
+ * Owns the shared CBCD communicator for one groupset, caches per-angle-set kernel
+ * launch parameters, and coordinates the transfer boundaries between the device sweep
+ * kernels and the host-side CBCD scheduler.
+ */
 class CBCDSweepChunk : public SweepChunk
 {
 public:
+  /**
+   * Construct the CBCD sweep chunk for one groupset.
+   *
+   * \param problem Discrete ordinates problem owning the sweep state.
+   * \param groupset Groupset served by this sweep chunk.
+   */
   CBCDSweepChunk(DiscreteOrdinatesProblem& problem, LBSGroupset& groupset);
 
+  ~CBCDSweepChunk() override;
+
+  /// Return the discrete ordinates problem owning this sweep chunk.
   DiscreteOrdinatesProblem& GetProblem() const { return problem_; }
 
+  /// Return the groupset served by this sweep chunk.
   const LBSGroupset& GetGroupset() const { return groupset_; }
 
+  /// Return the first group index of the groupset.
   unsigned int GetGroupsetGroupIndex() const { return groupset_.first_group; }
 
+  /// Return the cell transport view for one local cell.
   const CellLBSView& GetCellTransportView(std::uint64_t cell_local_id) const
   {
     return cell_transport_views_[cell_local_id];
   }
 
+  /// Return the CBCD angle sets coordinated by this sweep chunk.
   const std::vector<CBCD_AngleSet*>& GetAngleSets() const { return angle_sets_; }
 
-  const std::vector<CBCD_FLUDS*>& GetFLUDS() const { return fluds_list_; }
+  /// Start the aggregated communicator thread.
+  void StartCommunicator();
+
+  /// Stop the aggregated communicator thread.
+  void StopCommunicator();
 
-  std::vector<crb::Stream>& GetStreams() { return streams_list_; }
+  /// Refresh cached kernel arguments once at the start of a sweep.
+  void RefreshCachedKernelArgs();
 
   using SweepChunk::Sweep;
-  void Sweep(const std::vector<std::uint32_t>& cell_local_ids, size_t angle_set_id);
+  /**
+   * Launch the CBC sweep kernel for one angle set.
+   *
+   * \param num_ready_cells Number of local cells in the batch.
+   * \param angle_set_id Index of the angle set whose cached parameters and stream are used.
+   * \param local_cell_ids Pointer to the mapped host cell-ID buffer for the batch.
+   */
+  void Sweep(std::uint32_t num_ready_cells,
+             std::size_t angle_set_id,
+             const std::uint32_t* local_cell_ids);
 
 private:
+  /// Cached launch data for one angle set.
+  struct CachedKernelParams
+  {
+    /// Packed kernel arguments.
+    gpu_kernel::Arguments<gpu_kernel::SweepType::CBC> args;
+    /// Device block size for the launch.
+    crb::Dim3 block_size;
+    /// Device grid size in x.
+    unsigned int grid_size_x;
+    /// FLUDS instance bound to the angle set.
+    CBCD_FLUDS* fluds;
+    /// Device pointer to saved angular fluxes.
+    double* device_saved_psi;
+  };
+  /// Reference to the discrete ordinates problem owning this sweep chunk.
   DiscreteOrdinatesProblem& problem_;
+  /// Aggregated communicator owned by this sweep chunk.
+  std::unique_ptr<CBCD_AsynchronousCommunicator> async_comm_;
+  /// Anglesets managed by this sweep chunk.
   std::vector<CBCD_AngleSet*> angle_sets_;
-  std::vector<CBCD_FLUDS*> fluds_list_;
-  std::vector<crb::Stream> streams_list_;
-  std::vector<gpu_kernel::Arguments<gpu_kernel::SweepType::CBC>> kernel_args_list_;
-  std::vector<crb::Dim3> block_sizes_;
-  std::vector<unsigned int> grid_size_x_list_;
+  /// Per-angleset cached kernel launch params.
+  std::vector<CachedKernelParams> cached_params_;
 };
 
-} // namespace opensn
\ No newline at end of file
+} // namespace opensn
diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/gpu_kernel/solver.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/gpu_kernel/solver.h
index e5894e2e00..2b912d8009 100644
--- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/gpu_kernel/solver.h
+++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/gpu_kernel/solver.h
@@ -26,9 +26,9 @@ ComputeGMS(double* sweep_matrix,
            const std::uint32_t& num_moments,
            const Arguments<t>& args)
 {
-  // get sigmaT
+  // Get sigmaT
   double sigma_t = cell.total_xs[args.groupset_start + group_idx];
-  // compute source term
+  // Compute source term
   const double* src_moment = args.src_moment + cell.phi_address + args.groupset_start + group_idx;
   _Pragma("unroll") for (std::uint32_t i = 0; i < ndofs; ++i)
   {
@@ -40,7 +40,7 @@ ComputeGMS(double* sweep_matrix,
     }
     s[i] = src_per_moment;
   }
-  // add source, transfer and mass contribution
+  // Add source, transfer and mass contribution
   double* A = sweep_matrix;
   const std::array<double, 4>* GM_data =
     reinterpret_cast<const std::array<double, 4>*>(cell.GM_data);
@@ -49,10 +49,10 @@ ComputeGMS(double* sweep_matrix,
     _Pragma("unroll") for (std::uint32_t j = 0; j < ndofs; ++j)
     {
       std::array<double, 4> GM = *(GM_data++);
-      // compute A += G * Omega + M * sigma_t
+      // Compute A += G * Omega + M * sigma_t
       A[j] += direction.omega[0] * GM[0] + direction.omega[1] * GM[1] + direction.omega[2] * GM[2] +
               sigma_t * GM[3];
-      // compute psi += M @ s
+      // Compute psi += M @ s
       psi[i] += GM[3] * s[j];
     }
     A += ndofs;
@@ -70,14 +70,14 @@ ComputeSurfaceIntegral(double* sweep_matrix,
                        const unsigned int& angle_group_idx,
                        const Arguments<t>& args)
 {
-  // loop over each face
+  // Loop over each face
   std::uint32_t face_node_counter = 0;
   for (std::uint32_t f = 0; f < cell.num_faces; ++f)
   {
-    // get face view
+    // Get face view
     FaceView face;
     cell.GetFaceView(face, f);
-    // determine if this face is incoming
+    // Determine if this face is incoming
     NodeIndexType<t> idx(cell_edge_data[face_node_counter]);
     if (idx.IsUndefined() || idx.IsOutgoing())
     {
@@ -86,7 +86,7 @@ ComputeSurfaceIntegral(double* sweep_matrix,
     }
     double mu = direction.omega[0] * face.normal[0] + direction.omega[1] * face.normal[1] +
                 direction.omega[2] * face.normal[2];
-    // compute surface integral
+    // Compute surface integral
     for (std::uint32_t fi = 0; fi < face.num_face_nodes; ++fi)
     {
       std::uint32_t i = face.cell_mapping_data[fi];
@@ -101,7 +101,7 @@ ComputeSurfaceIntegral(double* sweep_matrix,
         psi[i] += upwind_psi[angle_group_idx] * mu_Nij;
       }
     }
-    // update face node counter
+    // Update face node counter
     face_node_counter += face.num_face_nodes;
   }
 }
@@ -111,18 +111,18 @@ template <std::size_t ndofs>
 __CRB_DEVICE_FUNC__ void
 GaussianElimination(double* sweep_matrix, double* psi)
 {
-  // forward elimination
+  // Forward elimination
   double* A_i = sweep_matrix;
   _Pragma("unroll") for (std::uint32_t i = 0; i < ndofs; ++i)
   {
     double inv_diag = 1.0 / A_i[i];
-    // normalize the pivot row
+    // Normalize the pivot row
     _Pragma("unroll") for (std::uint32_t j = i; j < ndofs; ++j)
     {
       A_i[j] *= inv_diag;
     }
     psi[i] *= inv_diag;
-    // eliminate rows below
+    // Eliminate rows below
     double* A_k = A_i + ndofs;
     _Pragma("unroll") for (std::uint32_t k = i + 1; k < ndofs; ++k)
     {
@@ -136,7 +136,7 @@ GaussianElimination(double* sweep_matrix, double* psi)
     }
     A_i += ndofs;
   }
-  // back substitution — row-wise access
+  // Back substitution — row-wise access
   if constexpr (ndofs >= 2)
   {
     _Pragma("unroll") for (std::int32_t j = ndofs - 2; j >= 0; --j)
@@ -161,14 +161,14 @@ WritePsiToFludsAndOutflow(double* psi,
                           const unsigned int& group_idx,
                           const Arguments<t>& args)
 {
-  // loop over each face
+  // Loop over each face
   std::uint32_t face_node_counter = 0;
   for (std::uint32_t f = 0; f < cell.num_faces; ++f)
   {
-    // get face view
+    // Get face view
     FaceView face;
     cell.GetFaceView(face, f);
-    // determine if this face is outgoing
+    // Determine if this face is outgoing
     NodeIndexType<t> idx(cell_edge_data[face_node_counter]);
     if (idx.IsUndefined() || !idx.IsOutgoing())
     {
@@ -177,15 +177,15 @@ WritePsiToFludsAndOutflow(double* psi,
     }
     double mu = direction.omega[0] * face.normal[0] + direction.omega[1] * face.normal[1] +
                 direction.omega[2] * face.normal[2];
-    // loop over each face node
+    // Loop over each face node
     for (std::uint32_t fi = 0; fi < face.num_face_nodes; ++fi)
     {
       std::uint32_t i = face.cell_mapping_data[fi];
-      // put copy psi to FLUDS
+      // Copy psi to FLUDS
       double* downwind_psi =
         args.flud_data.GetOutgoingFluxPointer(cell_edge_data[face_node_counter + fi]);
       downwind_psi[angle_group_idx] = psi[i];
-      // compute ouflow for boundary face
+      // Compute outflow for boundary face
       if (face.outflow != nullptr)
       {
         double outflow = direction.weight * mu * face.IntS_shapeI_data[fi] * psi[i];
@@ -246,16 +246,16 @@ Sweep(const Arguments<t>& args,
       const std::uint32_t& num_moments,
       double* saved_psi)
 {
-  // initialize buffer
+  // Initialize buffer
   Buffer<ndofs> buffer;
-  // prepare linear system to solve
+  // Prepare linear system to solve
   ComputeGMS<ndofs, t>(
     buffer.A(), buffer.b(), buffer.s(), cell, direction, group_idx, num_moments, args);
   ComputeSurfaceIntegral<ndofs, t>(
     buffer.A(), buffer.b(), cell, direction, cell_edge_data, angle_group_idx, args);
-  // solve for the angular flux
+  // Solve for the angular flux
   GaussianElimination<ndofs>(buffer.A(), buffer.b());
-  // save the result
+  // Save the result
   WritePsiToFludsAndOutflow<t>(
     buffer.b(), cell, direction, cell_edge_data, angle_group_idx, group_idx, args);
   ComputePhi<ndofs, t>(buffer.b(), cell, direction, group_idx, num_moments, args);