From 8cd33cdb6d96bf3da5f1e7343de6b6cb62ac62d1 Mon Sep 17 00:00:00 2001 From: Eappen Nelluvelil Date: Tue, 31 Mar 2026 23:30:15 -0500 Subject: [PATCH 1/6] AVX optimizations for CBC sweep kernels --- .../sweep/angle_set/cbc_angle_set.cc | 53 ++- .../sweep/angle_set/cbc_angle_set.h | 2 + .../sweep/communicators/async_comm.h | 9 - .../sweep/communicators/cbc_async_comm.cc | 39 +- .../sweep/communicators/cbc_async_comm.cu | 17 - .../sweep/communicators/cbc_async_comm.h | 28 +- .../sweep/fluds/cbc_fluds.cc | 52 +-- .../sweep/fluds/cbc_fluds.h | 13 +- .../sweep/fluds/cbc_fluds_common_data.cc | 26 +- .../sweep/fluds/cbc_fluds_common_data.h | 9 + .../sweep/fluds/cbcd_fluds.cu | 14 +- .../sweep/fluds/cbcd_fluds.h | 11 +- .../sweep/fluds/cbcd_fluds_common_data.cc | 2 + .../sweep/fluds/cbcd_fluds_common_data.cu | 9 + .../sweep/fluds/cbcd_fluds_common_data.h | 10 + .../sweep/fluds/fluds.h | 25 + .../sweep_chunks/aah_avx_sweep_chunk.cc | 204 --------- .../sweep_chunks/aah_sweep_chunk.h | 11 +- .../sweep_chunks/aah_sweep_kernels.h | 20 - .../sweep_chunks/avx_sweep_chunk_utils.h | 241 ++++++++++ .../sweep_chunks/cbc_avx_sweep_chunk.cc | 427 ++++++++++++++++++ .../sweep_chunks/cbc_sweep_chunk.cc | 59 ++- .../sweep_chunks/cbc_sweep_chunk.h | 20 +- 23 files changed, 919 insertions(+), 382 deletions(-) delete mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cu create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/avx_sweep_chunk_utils.h create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_avx_sweep_chunk.cc diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.cc index e9229a94dc..999675b039 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.cc +++ 
b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.cc @@ -23,6 +23,7 @@ CBC_AngleSet::CBC_AngleSet(size_t id, const MPICommunicatorSet& comm_set) : AngleSet(id, num_groups, spds, fluds, angle_indices, boundaries), cbc_spds_(dynamic_cast(spds_)), + ready_tasks_(), async_comm_(id, *fluds, comm_set) { } @@ -42,14 +43,25 @@ CBC_AngleSet::AngleSetAdvance(SweepChunk& sweep_chunk, AngleSetStatus permission return AngleSetStatus::FINISHED; if (current_task_list_.empty()) + { current_task_list_ = cbc_spds_.GetTaskList(); + // Build initial ready queue + ready_tasks_.reserve(current_task_list_.size()); + for (size_t i = 0; i < current_task_list_.size(); ++i) + if ((current_task_list_[i].num_dependencies == 0) and (not current_task_list_[i].completed)) + ready_tasks_.push_back(i); + } sweep_chunk.SetAngleSet(*this); auto tasks_who_received_data = async_comm_.ReceiveData(); for (const std::uint64_t task_number : tasks_who_received_data) - --current_task_list_[task_number].num_dependencies; + { + if ((--current_task_list_[task_number].num_dependencies == 0) and + (not current_task_list_[task_number].completed)) + ready_tasks_.push_back(task_number); + } async_comm_.SendData(); @@ -58,31 +70,28 @@ CBC_AngleSet::AngleSetAdvance(SweepChunk& sweep_chunk, AngleSetStatus permission if (not boundary->CheckAnglesReadyStatus(angles_)) return AngleSetStatus::NOT_FINISHED; - bool all_tasks_completed = true; - bool a_task_executed = true; - while (a_task_executed) + while (not ready_tasks_.empty()) { - a_task_executed = false; - for (auto& cell_task : current_task_list_) + const auto task_idx = ready_tasks_.back(); + ready_tasks_.pop_back(); + auto& cell_task = current_task_list_[task_idx]; + + sweep_chunk.SetCell(cell_task.cell_ptr, *this); + sweep_chunk.Sweep(*this); + + for (const auto& local_task_num : cell_task.successors) { - if (not cell_task.completed) - all_tasks_completed = false; - if (cell_task.num_dependencies == 0 and not 
cell_task.completed) - { - sweep_chunk.SetCell(cell_task.cell_ptr, *this); - sweep_chunk.Sweep(*this); - - for (std::uint64_t local_task_num : cell_task.successors) - --current_task_list_[local_task_num].num_dependencies; - - cell_task.completed = true; - a_task_executed = true; - async_comm_.SendData(); - } - } // for cell_task + if ((--current_task_list_[local_task_num].num_dependencies == 0) and + (not current_task_list_[local_task_num].completed)) + ready_tasks_.push_back(local_task_num); + } + + cell_task.completed = true; + ++num_completed_tasks; async_comm_.SendData(); } + const bool all_tasks_completed = (num_completed_tasks == current_task_list_.size()); const bool all_messages_sent = async_comm_.SendData(); if (all_tasks_completed and all_messages_sent) @@ -101,6 +110,8 @@ void CBC_AngleSet::ResetSweepBuffers() { current_task_list_.clear(); + ready_tasks_.clear(); + num_completed_tasks = 0; async_comm_.Reset(); fluds_->ClearLocalAndReceivePsi(); executed_ = false; diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h index 36da1250e9..ba127849db 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h @@ -59,6 +59,8 @@ class CBC_AngleSet : public AngleSet protected: const CBC_SPDS& cbc_spds_; std::vector current_task_list_; + std::vector ready_tasks_; + size_t num_completed_tasks = 0; CBC_AsynchronousCommunicator async_comm_; }; diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/async_comm.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/async_comm.h index f9988256be..9d4f0a0da1 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/async_comm.h +++ 
b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/async_comm.h @@ -24,15 +24,6 @@ class AsynchronousCommunicator virtual ~AsynchronousCommunicator() = default; - virtual std::vector& InitGetDownwindMessageData(int location_id, - uint64_t cell_global_id, - unsigned int face_id, - size_t angle_set_id, - size_t data_size) - { - OpenSnLogicalError("Method not implemented"); - } - protected: FLUDS& fluds_; const MPICommunicatorSet& comm_set_; diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cc index 1cb29d434d..44c61c777e 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cc @@ -3,12 +3,12 @@ #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/spds.h" -#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h" #include "framework/mesh/mesh_continuum/mesh_continuum.h" #include "framework/mpi/mpi_comm_set.h" #include "framework/logging/log.h" #include "framework/runtime.h" #include "caliper/cali.h" +#include #include namespace opensn @@ -53,8 +53,12 @@ CBC_AsynchronousCommunicator::SendData() buffer_array.Write(cell_global_id); buffer_array.Write(face_id); buffer_array.Write(data_size); - for (const double value : data) // actual psi_data - buffer_array.Write(value); + + auto& raw = buffer_array.Data(); + const size_t old_size = raw.size(); + const size_t num_bytes = data_size * sizeof(double); + raw.resize(old_size + num_bytes); + std::memcpy(raw.data() + old_size, data.data(), num_bytes); } for (auto& [locI, buffer] : locI_buffer_map) @@ -94,10 +98,11 @@ 
CBC_AsynchronousCommunicator::ReceiveData() { CALI_CXX_MARK_SCOPE("CBC_AsynchronousCommunicator::ReceiveData"); - using CellFaceKey = std::pair; // cell_gid + face_id - std::map> received_messages; + std::unordered_map, FLUDS::CellFaceKeyHash> + received_messages; std::vector cells_who_received_data; const auto& location_dependencies = fluds_.GetSPDS().GetLocationDependencies(); + auto& deplocs_outgoing_messages = fluds_.GetDeplocsOutgoingMessages(); for (int locJ : location_dependencies) { const auto& comm = comm_set_.LocICommunicator(opensn::mpi_comm.rank()); @@ -117,33 +122,19 @@ CBC_AsynchronousCommunicator::ReceiveData() const auto face_id = data_array.Read(); const auto data_size = data_array.Read(); - std::vector psi_data; - psi_data.reserve(data_size); - for (size_t k = 0; k < data_size; ++k) - psi_data.push_back(data_array.Read()); + std::vector psi_data(data_size); + const size_t num_bytes = data_size * sizeof(double); + std::memcpy(psi_data.data(), &data_array.Data()[data_array.Offset()], num_bytes); + data_array.Seek(data_array.Offset() + num_bytes); - received_messages[{cell_global_id, face_id}] = std::move(psi_data); + deplocs_outgoing_messages[{cell_global_id, face_id}] = std::move(psi_data); cells_who_received_data.push_back( fluds_.GetSPDS().GetGrid()->MapCellGlobalID2LocalID(cell_global_id)); } // while not at end of buffer } // Process each message embedded in buffer } - auto* cbc_fluds = dynamic_cast(&fluds_); - if (cbc_fluds != nullptr) - cbc_fluds->GetDeplocsOutgoingMessages().merge(received_messages); - else - MergeDeplocsOutgoingMessages(received_messages); - return cells_who_received_data; } -#ifndef __OPENSN_WITH_GPU__ -void -CBC_AsynchronousCommunicator::MergeDeplocsOutgoingMessages( - std::map>& received_messages) -{ -} -#endif - } // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cu 
b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cu deleted file mode 100644 index e997a209d8..0000000000 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cu +++ /dev/null @@ -1,17 +0,0 @@ -// SPDX-FileCopyrightText: 2026 The OpenSn Authors -// SPDX-License-Identifier: MIT - -#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h" -#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h" - -namespace opensn -{ - -void -CBC_AsynchronousCommunicator::MergeDeplocsOutgoingMessages( - std::map>& received_messages) -{ - dynamic_cast(fluds_).GetDeplocsOutgoingMessages().merge(received_messages); -} - -} // namespace opensn \ No newline at end of file diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h index dbd8735157..ead2c03bd9 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h @@ -4,9 +4,10 @@ #pragma once #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/async_comm.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds.h" #include "framework/data_types/byte_array.h" #include "mpicpp-lite/mpicpp-lite.h" -#include +#include #include #include #include @@ -33,7 +34,7 @@ class CBC_AsynchronousCommunicator : public AsynchronousCommunicator uint64_t cell_global_id, unsigned int face_id, size_t angle_set_id, - size_t data_size) override; + size_t data_size); bool SendData(); @@ -48,9 +49,22 @@ class CBC_AsynchronousCommunicator : public AsynchronousCommunicator protected: const size_t angle_set_id_; 
- // location_id, cell_global_id, face_id - using MessageKey = std::tuple; - std::map> outgoing_message_queue_; + /// location_id, cell_global_id, face_id + using MessageKey = std::tuple; + + /// boost::hash_combine hash function for MessageKey. + struct MessageKeyHash + { + std::size_t operator()(const MessageKey& key) const noexcept + { + size_t h = std::hash{}(std::get<0>(key)); + h ^= std::hash{}(std::get<1>(key)) + 0x9e3779b9 + (h << 6) + (h >> 2); + h ^= std::hash{}(std::get<2>(key)) + 0x9e3779b9 + (h << 6) + (h >> 2); + return h; + } + }; + + std::unordered_map, MessageKeyHash> outgoing_message_queue_; struct BufferItem { @@ -61,10 +75,6 @@ class CBC_AsynchronousCommunicator : public AsynchronousCommunicator ByteArray data_array; }; std::vector send_buffer_; - - // cell_global_id, face_id - using CellFaceKey = std::pair; - void MergeDeplocsOutgoingMessages(std::map>& received_messages); }; } // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.cc index 362d1c4570..c6e85f9be4 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.cc @@ -26,6 +26,16 @@ CBC_FLUDS::CBC_FLUDS(unsigned int num_groups, local_psi_data_size_(num_local_spatial_dofs_ * num_groups_and_angles_), local_psi_data_(local_psi_data_size_) { + const auto& grid = *spds_.GetGrid(); + cell_psi_start_.resize(grid.local_cells.size()); + for (const auto& cell : grid.local_cells) + { + cell_psi_start_[cell.local_id] = + (sdm_.MapDOFLocal(cell, 0, psi_uk_man_, 0, 0) / num_angles_in_gs_quadrature_ / num_groups_) * + num_groups_and_angles_; + } + + deplocs_outgoing_messages_.reserve(common_data.GetNumIncomingNonlocalFaces()); } const FLUDSCommonData& @@ -37,38 +47,19 @@ CBC_FLUDS::GetCommonData() const double* 
CBC_FLUDS::UpwindPsi(const Cell& face_neighbor, unsigned int adj_cell_node, size_t as_ss_idx) { - // Map to face neighbor cell's first spatial DOF index - // (0 to (num_local_spatial_dofs_ - 1)) - const size_t face_nbr_spatial_dof_0_index = - (sdm_.MapDOFLocal(face_neighbor, 0, psi_uk_man_, 0, 0) / num_angles_in_gs_quadrature_ / - num_groups_); - - // Index to start of neighbor cell's data block in local_psi_data_ - const size_t face_nbr_data_start_index = face_nbr_spatial_dof_0_index * num_groups_and_angles_; - const size_t addr_offset = adj_cell_node * num_groups_and_angles_ + as_ss_idx * num_groups_; - const size_t face_nbr_data_index = face_nbr_data_start_index + addr_offset; - - assert((face_nbr_data_index >= 0) and (face_nbr_data_index < local_psi_data_.size())); - - return &local_psi_data_[face_nbr_data_index]; + const size_t index = cell_psi_start_[face_neighbor.local_id] + + adj_cell_node * num_groups_and_angles_ + as_ss_idx * num_groups_; + assert(index < local_psi_data_.size()); + return &local_psi_data_[index]; } double* CBC_FLUDS::OutgoingPsi(const Cell& cell, unsigned int cell_node, size_t as_ss_idx) { - // Map to current cell's first spatial DOF index - // (0 to (num_local_spatial_dofs_ - 1)) - const size_t cur_cell_spatial_dof_0_index = - (sdm_.MapDOFLocal(cell, 0, psi_uk_man_, 0, 0) / num_angles_in_gs_quadrature_ / num_groups_); - - // Index to start of current cell's data block in local_psi_data_ - const size_t cur_cell_data_start_index = cur_cell_spatial_dof_0_index * num_groups_and_angles_; - const size_t addr_offset = cell_node * num_groups_and_angles_ + as_ss_idx * num_groups_; - const size_t cur_cell_data_index = cur_cell_data_start_index + addr_offset; - - assert((cur_cell_data_index >= 0) and (cur_cell_data_index < local_psi_data_.size())); - - return &local_psi_data_[cur_cell_data_index]; + const size_t index = + cell_psi_start_[cell.local_id] + cell_node * num_groups_and_angles_ + as_ss_idx * num_groups_; + assert(index < 
local_psi_data_.size()); + return &local_psi_data_[index]; } double* @@ -77,12 +68,15 @@ CBC_FLUDS::NLUpwindPsi(uint64_t cell_global_id, unsigned int face_node_mapped, size_t as_ss_idx) { - std::vector& psi = deplocs_outgoing_messages_.at({cell_global_id, face_id}); + auto it = deplocs_outgoing_messages_.find({cell_global_id, face_id}); + if (it == deplocs_outgoing_messages_.end()) + return nullptr; + auto& psi = it->second; const size_t dof_map = face_node_mapped * num_groups_and_angles_ + // Offset to start of data for face_node_mapped as_ss_idx * num_groups_; // Offset to start of data for angle_set_index - assert((dof_map >= 0) and (dof_map < psi.size())); + assert(dof_map < psi.size()); return &psi[dof_map]; } diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h index dc232ba52a..ba7a6467bf 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h @@ -8,7 +8,7 @@ #include "framework/math/unknown_manager/unknown_manager.h" #include "framework/math/spatial_discretization/spatial_discretization.h" #include -#include +#include #include namespace opensn @@ -82,14 +82,6 @@ class CBC_FLUDS : public FLUDS void AllocatePrelocIOutgoingPsi() override {} void AllocateDelayedPrelocIOutgoingPsi() override {} - // cell_global_id, face_id - using CellFaceKey = std::pair; - - std::map>& GetDeplocsOutgoingMessages() - { - return deplocs_outgoing_messages_; - } - protected: const CBC_FLUDSCommonData& common_data_; const UnknownManager& psi_uk_man_; @@ -107,7 +99,8 @@ class CBC_FLUDS : public FLUDS std::vector> boundryI_incoming_psi_; - std::map> deplocs_outgoing_messages_; + /// Pre-computed start index into local_psi_data_ for each local cell + std::vector cell_psi_start_; }; } // namespace opensn diff --git 
a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.cc index 3fe986500f..354b0fd3a0 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.cc @@ -3,14 +3,38 @@ #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/spds.h" +#include "framework/mesh/cell/cell.h" +#include "framework/mesh/mesh_continuum/mesh_continuum.h" namespace opensn { CBC_FLUDSCommonData::CBC_FLUDSCommonData( const SPDS& spds, const std::vector& grid_nodal_mappings) - : FLUDSCommonData(spds, grid_nodal_mappings) + : FLUDSCommonData(spds, grid_nodal_mappings), + num_incoming_nonlocal_faces_(0), + num_outgoing_nonlocal_faces_(0) { + // Pre-compute non-local face counts for hash map capacity reservation + const auto& grid = *spds.GetGrid(); + const auto& face_orientations = spds.GetCellFaceOrientations(); + + for (const auto& cell : grid.local_cells) + { + for (size_t f = 0; f < cell.faces.size(); ++f) + { + const auto& face = cell.faces[f]; + const auto orientation = face_orientations[cell.local_id][f]; + + if ((not face.has_neighbor) or (face.IsNeighborLocal(&grid))) + continue; + + if (orientation == FaceOrientation::INCOMING) + ++num_incoming_nonlocal_faces_; + else if (orientation == FaceOrientation::OUTGOING) + ++num_outgoing_nonlocal_faces_; + } + } } } // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h index 62f1a461f7..a1cd93f7ad 100644 --- 
a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h @@ -5,6 +5,7 @@ #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds_common_data.h" #include +#include namespace opensn { @@ -14,6 +15,14 @@ class CBC_FLUDSCommonData : public FLUDSCommonData public: CBC_FLUDSCommonData(const SPDS& spds, const std::vector& grid_nodal_mappings); + + size_t GetNumIncomingNonlocalFaces() const { return num_incoming_nonlocal_faces_; } + + size_t GetNumOutgoingNonlocalFaces() const { return num_outgoing_nonlocal_faces_; } + +private: + size_t num_incoming_nonlocal_faces_; + size_t num_outgoing_nonlocal_faces_; }; } // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.cu index 1007396682..dde99da1f6 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.cu +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.cu @@ -3,6 +3,7 @@ #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h" @@ -50,6 +51,7 @@ CBCD_FLUDS::CBCD_FLUDS(size_t num_groups, device_saved_psi_ = crb::DeviceMemory(local_psi_data_size_); } CreatePointerSet(); + 
deplocs_outgoing_messages_.reserve(common_data.GetNumIncomingNonlocalFaces()); } CBCD_FLUDS::~CBCD_FLUDS() @@ -192,7 +194,8 @@ CBCD_FLUDS::CopyOutgoingPsiBackToHost(CBCDSweepChunk& sweep_chunk, const auto& face_data_size = num_face_nodes * num_groups_and_angles_; const int locality = sweep_chunk.GetCellTransportView(node.cell_local_id).FaceLocality(node.face_id); - auto& async_comm = *angle_set->GetCommunicator(); + auto& async_comm = + static_cast(*angle_set->GetCommunicator()); std::vector* psi_nonlocal_outgoing = &async_comm.InitGetDownwindMessageData(locality, face.neighbor_id, @@ -261,12 +264,15 @@ CBCD_FLUDS::NLUpwindPsi(uint64_t cell_global_id, unsigned int face_node_mapped, size_t as_ss_idx) { - std::vector& psi = deplocs_outgoing_messages_.at({cell_global_id, face_id}); + auto it = deplocs_outgoing_messages_.find({cell_global_id, face_id}); + if (it == deplocs_outgoing_messages_.end()) + return nullptr; + auto& psi = it->second; const size_t dof_map = face_node_mapped * num_groups_and_angles_ + // Offset to start of data for face_node_mapped as_ss_idx * num_groups_; // Offset to start of data for angle_set_index - assert((dof_map >= 0) and (dof_map < psi.size())); + assert(dof_map < psi.size()); return &psi[dof_map]; } @@ -280,4 +286,4 @@ CBCD_FLUDS::NLOutgoingPsi(std::vector* psi_nonlocal_outgoing, return &(*psi_nonlocal_outgoing)[addr_offset]; } -} // namespace opensn \ No newline at end of file +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h index 9090fe2bd8..f466af2052 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h @@ -10,7 +10,7 @@ #include "caribou/main.hpp" #include #include -#include +#include namespace crb = caribou; @@ -93,13 +93,6 @@ class 
CBCD_FLUDS : public FLUDS void AllocatePrelocIOutgoingPsi() override {} void AllocateDelayedPrelocIOutgoingPsi() override {} - // cell_global_id, face_id - using CellFaceKey = std::pair; - std::map>& GetDeplocsOutgoingMessages() - { - return deplocs_outgoing_messages_; - } - private: /// Reference to the common data. const CBCD_FLUDSCommonData& common_data_; @@ -138,8 +131,6 @@ class CBCD_FLUDS : public FLUDS void CreatePointerSet(); std::vector> boundaryI_incoming_psi_; - - std::map> deplocs_outgoing_messages_; }; } // namespace opensn \ No newline at end of file diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cc index ecdcd4023c..411bcebd7f 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cc @@ -15,7 +15,9 @@ CBCD_FLUDSCommonData::CBCD_FLUDSCommonData( : FLUDSCommonData(spds, grid_nodal_mappings), num_incoming_boundary_nodes_(0), num_outgoing_boundary_nodes_(0), + num_incoming_nonlocal_faces_(0), num_incoming_nonlocal_nodes_(0), + num_outgoing_nonlocal_faces_(0), num_outgoing_nonlocal_nodes_(0), device_cell_face_node_map_(nullptr), incoming_boundary_node_map_(), diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cu index ee527e3bac..98d2294a72 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cu +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cu @@ -51,6 +51,15 @@ CBCD_FLUDSCommonData::CopyFlattenedNodeIndexToDevice(const SpatialDiscretization const bool is_incoming_face = 
(orientation == FaceOrientation::INCOMING); const bool is_local_face = face.IsNeighborLocal(&grid); const bool is_boundary_face = not face.has_neighbor; + + if ((not is_local_face) and (not is_boundary_face)) + { + if (is_incoming_face) + ++num_incoming_nonlocal_faces_; + else if (is_outgoing_face) + ++num_outgoing_nonlocal_faces_; + } + for (size_t fn = 0; fn < num_face_nodes; ++fn) { CBCD_NodeIndex node_index; diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h index ad193c07e7..1d61b5201e 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h @@ -35,6 +35,12 @@ class CBCD_FLUDSCommonData : public FLUDSCommonData /// Get number of outgoing non-local face nodes. std::size_t GetNumOutgoingNonlocalNodes() const { return num_outgoing_nonlocal_nodes_; } + /// Get number of incoming non-local faces. + std::size_t GetNumIncomingNonlocalFaces() const { return num_incoming_nonlocal_faces_; } + + /// Get number of outgoing non-local faces. + std::size_t GetNumOutgoingNonlocalFaces() const { return num_outgoing_nonlocal_faces_; } + /// Get incoming boundary node map. const std::vector& GetIncomingBoundaryNodeMap() const { @@ -67,8 +73,12 @@ class CBCD_FLUDSCommonData : public FLUDSCommonData size_t num_incoming_boundary_nodes_; /// Number of outgoing boundary face nodes. size_t num_outgoing_boundary_nodes_; + /// Number of incoming non-local faces. + size_t num_incoming_nonlocal_faces_; /// Number of incoming non-local face nodes. size_t num_incoming_nonlocal_nodes_; + /// Number of outgoing non-local faces. + size_t num_outgoing_nonlocal_faces_; /// Number of outgoing non-local face nodes. 
size_t num_outgoing_nonlocal_nodes_; /// Device pointer to cell-face-node map for angular flux buffer access. diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds.h index 3177c4c4bc..1097113a74 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds.h @@ -10,6 +10,8 @@ #include #include #include +#include +#include namespace opensn { @@ -59,6 +61,27 @@ class FLUDS virtual ~FLUDS() = default; + /// cell_global_id, face_id + using CellFaceKey = std::pair; + + /// boost::hash_combine hash function for CellFaceKey. + struct CellFaceKeyHash + { + size_t operator()(const CellFaceKey& key) const noexcept + { + size_t h = std::hash{}(key.first); + h ^= + std::hash{}(key.second) + 0x9e3779b9 + (h << 6) + (h >> 2); // Combine hashes + return h; + } + }; + + std::unordered_map, CellFaceKeyHash>& + GetDeplocsOutgoingMessages() + { + return deplocs_outgoing_messages_; + } + protected: const unsigned int num_groups_; const size_t num_angles_; @@ -71,6 +94,8 @@ class FLUDS std::vector> prelocI_outgoing_psi_view_; std::vector> delayed_prelocI_outgoing_psi_view_; std::vector> delayed_prelocI_outgoing_psi_old_view_; + + std::unordered_map, CellFaceKeyHash> deplocs_outgoing_messages_; }; } // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_avx_sweep_chunk.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_avx_sweep_chunk.cc index 2c9ce44dc5..e369c8d505 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_avx_sweep_chunk.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_avx_sweep_chunk.cc @@ -11,213 +11,9 @@ #include #include -#if __AVX512F__ || __AVX2__ -#include -#endif - -#if __clang__ 
|| __INTEL_COMPILER -#define PRAGMA_UNROLL _Pragma("unroll") -#elif __GNUC__ -#define PRAGMA_UNROLL _Pragma("GCC unroll 8") -#else -#define PRAGMA_UNROLL -#endif - namespace opensn { -namespace detail -{ - -#if __AVX512F__ -struct AVX512Ops -{ - using avx_vec = __m512d; - using avx_index = __m512i; - - static inline avx_vec LoadSigma(const double* sigma) { return _mm512_loadu_pd(sigma); } - static inline avx_vec Set1(double x) { return _mm512_set1_pd(x); } - static inline avx_vec Add(const avx_vec& a, const avx_vec& b) { return _mm512_add_pd(a, b); } - static inline avx_vec Sub(const avx_vec& a, const avx_vec& b) { return _mm512_sub_pd(a, b); } - static inline avx_vec Mul(const avx_vec& a, const avx_vec& b) { return _mm512_mul_pd(a, b); } - static inline avx_vec Div(const avx_vec& a, const avx_vec& b) { return _mm512_div_pd(a, b); } - static inline avx_vec Fmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c) - { - // a + b * c - return _mm512_fmadd_pd(b, c, a); - } - static inline avx_vec Fnmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c) - { - // c - a * b - return _mm512_fnmadd_pd(a, b, c); - } - static inline avx_vec Reciprocal(const avx_vec& v) { return Div(Set1(1.0), v); } - static inline avx_vec Gather(const avx_index& idx, const double* base) - { - return _mm512_i64gather_pd(idx, base, sizeof(double)); - } - static inline void Scatter(const avx_index& idx, double* base, const avx_vec& value) - { - _mm512_i64scatter_pd(base, idx, value, sizeof(double)); - } -}; -#elif __AVX2__ -struct AVX2Ops -{ - using avx_vec = __m256d; - using avx_index = __m128i; - - static inline avx_vec LoadSigma(const double* sigma) { return _mm256_loadu_pd(sigma); } - static inline avx_vec Set1(double x) { return _mm256_set1_pd(x); } - static inline avx_vec Add(const avx_vec& a, const avx_vec& b) { return _mm256_add_pd(a, b); } - static inline avx_vec Sub(const avx_vec& a, const avx_vec& b) { return _mm256_sub_pd(a, b); } - static inline avx_vec Mul(const avx_vec& a, 
const avx_vec& b) { return _mm256_mul_pd(a, b); } - static inline avx_vec Div(const avx_vec& a, const avx_vec& b) { return _mm256_div_pd(a, b); } - -#if __FMA__ - static inline avx_vec Fmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c) - { - return _mm256_fmadd_pd(b, c, a); - } - static inline avx_vec Fnmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c) - { - return _mm256_fnmadd_pd(a, b, c); - } -#else - static inline avx_vec Fmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c) - { - return Add(a, Mul(b, c)); - } - static inline avx_vec Fnmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c) - { - return Sub(c, Mul(a, b)); - } -#endif - - static inline avx_vec Reciprocal(const avx_vec& v) { return Div(Set1(1.0), v); } - static inline avx_vec Gather(const avx_index& idx, const double* base) - { - return _mm256_i32gather_pd(base, idx, sizeof(double)); - } - static inline void Scatter(const avx_index& idx, double* base, const avx_vec& value) - { - alignas(32) double buffer[simd_width]; - _mm256_store_pd(buffer, value); - alignas(16) int offsets[simd_width]; - _mm_store_si128(reinterpret_cast<__m128i*>(offsets), idx); - for (int lane = 0; lane < simd_width; ++lane) - base[offsets[lane]] = buffer[lane]; - } -}; -#endif - -template -struct GatherIndexBuilder -{ - static typename Ops::avx_index Build(int /*unused*/) - { - static_assert(sizeof(Ops) == 0, "SIMD gather index helper not implemented for this Ops type."); - return typename Ops::avx_index{}; - } -}; - -#if __AVX512F__ -template -struct GatherIndexBuilder -{ - static AVX512Ops::avx_index Build(int row) - { - long long vals[simd_width]; - for (int lane = 0; lane < simd_width; ++lane) - vals[lane] = static_cast(lane * N + row); - return _mm512_setr_epi64( - vals[0], vals[1], vals[2], vals[3], vals[4], vals[5], vals[6], vals[7]); - } -}; -#elif __AVX2__ -template -struct GatherIndexBuilder -{ - static AVX2Ops::avx_index Build(int row) - { - int vals[simd_width]; - for (int lane = 0; 
lane < simd_width; ++lane) - vals[lane] = lane * N + row; - return _mm_setr_epi32(vals[0], vals[1], vals[2], vals[3]); - } -}; -#endif - -namespace -{ - -template -inline typename Ops::avx_index -MakeGatherIndex(int row) -{ - return GatherIndexBuilder::Build(row); -} - -template -inline void -SimdBatchSolve(const double* Am, const double* Mm, const double* sigma_t, double* __restrict b) -{ - using avx_vec = typename Ops::avx_vec; - - avx_vec rhs[N]; - PRAGMA_UNROLL - for (int row = 0; row < N; ++row) - rhs[row] = Ops::Gather(MakeGatherIndex(row), b); - - const avx_vec sigma = Ops::LoadSigma(sigma_t); - avx_vec A[N * N]; - PRAGMA_UNROLL - for (int i = 0; i < N; ++i) - { - PRAGMA_UNROLL - for (int j = 0; j < N; ++j) - { - const avx_vec Amij = Ops::Set1(Am[i * N + j]); - const avx_vec Mmij = Ops::Set1(Mm[i * N + j]); - A[i * N + j] = Ops::Fmadd(Amij, sigma, Mmij); - } - } - - auto entry = [&](int i, int j) -> avx_vec& { return A[i * N + j]; }; - PRAGMA_UNROLL - for (int pivot = 0; pivot < N; ++pivot) - { - const avx_vec inv = Ops::Reciprocal(entry(pivot, pivot)); - PRAGMA_UNROLL - for (int row = pivot + 1; row < N; ++row) - { - const avx_vec factor = Ops::Mul(entry(row, pivot), inv); - rhs[row] = Ops::Fnmadd(factor, rhs[pivot], rhs[row]); - PRAGMA_UNROLL - for (int col = pivot + 1; col < N; ++col) - entry(row, col) = Ops::Fnmadd(factor, entry(pivot, col), entry(row, col)); - } - } - - PRAGMA_UNROLL - for (int pivot = N - 1; pivot >= 0; --pivot) - { - avx_vec rhs_vec = rhs[pivot]; - PRAGMA_UNROLL - for (int col = pivot + 1; col < N; ++col) - rhs_vec = Ops::Fnmadd(entry(pivot, col), rhs[col], rhs_vec); - rhs[pivot] = Ops::Mul(rhs_vec, Ops::Reciprocal(entry(pivot, pivot))); - } - - PRAGMA_UNROLL - for (int row = 0; row < N; ++row) - Ops::Scatter(MakeGatherIndex(row), b, rhs[row]); -} - -} // namespace - -} // namespace detail - template void AAH_Sweep_FixedN(AAHSweepData& data, AngleSet& angle_set) diff --git 
a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_chunk.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_chunk.h index c3f04c4cd3..f84baa93b3 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_chunk.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_chunk.h @@ -4,6 +4,7 @@ #pragma once #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/sweep_chunk.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/avx_sweep_chunk_utils.h" #include "modules/linear_boltzmann_solvers/lbs_problem/groupset/lbs_groupset.h" #include "framework/math/spatial_discretization/spatial_discretization.h" #include @@ -13,16 +14,6 @@ namespace opensn { -// experimental, to be moved to a higher level header file -static constexpr size_t simd_width = -#if __AVX512F__ - 8; // 8 lanes (512-bit, doubles) -#elif __AVX2__ - 4; // 4 lanes (256-bit, doubles) -#else - 1; // scalar -#endif - class DiscreteOrdinatesProblem; class AAHSweepChunk : public SweepChunk diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_kernels.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_kernels.h index c6a327678f..4c1e3da001 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_kernels.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_kernels.h @@ -42,26 +42,6 @@ struct AAHSweepData unsigned int group_block_size; // used by fixed-N/AVX path }; -inline size_t -ComputeGroupBlockSize(size_t gs_size) -{ - if (gs_size <= simd_width) - return gs_size; - - size_t target = 0; - if (gs_size >= 16 * simd_width) - target = 4 * simd_width; - else if (gs_size >= 4 * simd_width) - target = 2 * simd_width; - else - target = 1 * simd_width; - - target = 
std::min(target, gs_size); - if (target >= simd_width) - target = (target / simd_width) * simd_width; - return target; -} - /// Generic sweep kernel (scalar), parameterized by time dependence. template inline void diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/avx_sweep_chunk_utils.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/avx_sweep_chunk_utils.h new file mode 100644 index 0000000000..dc1af04119 --- /dev/null +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/avx_sweep_chunk_utils.h @@ -0,0 +1,241 @@ +// SPDX-FileCopyrightText: 2026 The OpenSn Authors +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#if __AVX512F__ || __AVX2__ +#include +#endif + +#if __clang__ || __INTEL_COMPILER +#define PRAGMA_UNROLL _Pragma("unroll") +#elif __GNUC__ +#define PRAGMA_UNROLL _Pragma("GCC unroll 8") +#else +#define PRAGMA_UNROLL +#endif + +namespace opensn +{ + +static constexpr size_t simd_width = +#if __AVX512F__ + 8; // 8 lanes (512-bit, doubles) +#elif __AVX2__ + 4; // 4 lanes (256-bit, doubles) +#else + 1; // scalar +#endif + +inline size_t +ComputeGroupBlockSize(size_t gs_size) +{ + if (gs_size <= simd_width) + return gs_size; + + size_t target = 0; + if (gs_size >= 16 * simd_width) + target = 4 * simd_width; + else if (gs_size >= 4 * simd_width) + target = 2 * simd_width; + else + target = 1 * simd_width; + + target = std::min(target, gs_size); + if (target >= simd_width) + target = (target / simd_width) * simd_width; + return target; +} + +namespace detail +{ + +#if __AVX512F__ +struct AVX512Ops +{ + using avx_vec = __m512d; + using avx_index = __m512i; + + static inline avx_vec LoadSigma(const double* sigma) { return _mm512_loadu_pd(sigma); } + static inline avx_vec Set1(double x) { return _mm512_set1_pd(x); } + static inline avx_vec Add(const avx_vec& a, const avx_vec& b) { return _mm512_add_pd(a, b); } + static inline avx_vec Sub(const avx_vec& 
a, const avx_vec& b) { return _mm512_sub_pd(a, b); } + static inline avx_vec Mul(const avx_vec& a, const avx_vec& b) { return _mm512_mul_pd(a, b); } + static inline avx_vec Div(const avx_vec& a, const avx_vec& b) { return _mm512_div_pd(a, b); } + static inline avx_vec Fmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c) + { + // a + b * c + return _mm512_fmadd_pd(b, c, a); + } + static inline avx_vec Fnmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c) + { + // c - a * b + return _mm512_fnmadd_pd(a, b, c); + } + static inline avx_vec Reciprocal(const avx_vec& v) { return Div(Set1(1.0), v); } + static inline avx_vec Gather(const avx_index& idx, const double* base) + { + return _mm512_i64gather_pd(idx, base, sizeof(double)); + } + static inline void Scatter(const avx_index& idx, double* base, const avx_vec& value) + { + _mm512_i64scatter_pd(base, idx, value, sizeof(double)); + } +}; +#elif __AVX2__ +struct AVX2Ops +{ + using avx_vec = __m256d; + using avx_index = __m128i; + + static inline avx_vec LoadSigma(const double* sigma) { return _mm256_loadu_pd(sigma); } + static inline avx_vec Set1(double x) { return _mm256_set1_pd(x); } + static inline avx_vec Add(const avx_vec& a, const avx_vec& b) { return _mm256_add_pd(a, b); } + static inline avx_vec Sub(const avx_vec& a, const avx_vec& b) { return _mm256_sub_pd(a, b); } + static inline avx_vec Mul(const avx_vec& a, const avx_vec& b) { return _mm256_mul_pd(a, b); } + static inline avx_vec Div(const avx_vec& a, const avx_vec& b) { return _mm256_div_pd(a, b); } + +#if __FMA__ + static inline avx_vec Fmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c) + { + return _mm256_fmadd_pd(b, c, a); + } + static inline avx_vec Fnmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c) + { + return _mm256_fnmadd_pd(a, b, c); + } +#else + static inline avx_vec Fmadd(const avx_vec& a, const avx_vec& b, const avx_vec& c) + { + return Add(a, Mul(b, c)); + } + static inline avx_vec Fnmadd(const avx_vec& a, const 
avx_vec& b, const avx_vec& c) + { + return Sub(c, Mul(a, b)); + } +#endif + + static inline avx_vec Reciprocal(const avx_vec& v) { return Div(Set1(1.0), v); } + static inline avx_vec Gather(const avx_index& idx, const double* base) + { + return _mm256_i32gather_pd(base, idx, sizeof(double)); + } + static inline void Scatter(const avx_index& idx, double* base, const avx_vec& value) + { + alignas(32) double buffer[simd_width]; + _mm256_store_pd(buffer, value); + alignas(16) int offsets[simd_width]; + _mm_store_si128(reinterpret_cast<__m128i*>(offsets), idx); + for (int lane = 0; lane < static_cast(simd_width); ++lane) + base[offsets[lane]] = buffer[lane]; + } +}; +#endif + +template +struct GatherIndexBuilder +{ + static typename Ops::avx_index Build(int /*unused*/) + { + static_assert(sizeof(Ops) == 0, "SIMD gather index helper not implemented for this Ops type."); + return typename Ops::avx_index{}; + } +}; + +#if __AVX512F__ +template +struct GatherIndexBuilder +{ + static AVX512Ops::avx_index Build(int row) + { + long long vals[simd_width]; + for (int lane = 0; lane < static_cast(simd_width); ++lane) + vals[lane] = static_cast(lane * N + row); + return _mm512_setr_epi64( + vals[0], vals[1], vals[2], vals[3], vals[4], vals[5], vals[6], vals[7]); + } +}; +#elif __AVX2__ +template +struct GatherIndexBuilder +{ + static AVX2Ops::avx_index Build(int row) + { + int vals[simd_width]; + for (int lane = 0; lane < static_cast(simd_width); ++lane) + vals[lane] = lane * N + row; + return _mm_setr_epi32(vals[0], vals[1], vals[2], vals[3]); + } +}; +#endif + +template +inline typename Ops::avx_index static MakeGatherIndex(int row) +{ + return GatherIndexBuilder::Build(row); +} + +template +inline void static SimdBatchSolve(const double* Am, + const double* Mm, + const double* sigma_t, + double* __restrict b) +{ + using avx_vec = typename Ops::avx_vec; + + avx_vec rhs[N]; + PRAGMA_UNROLL + for (int row = 0; row < N; ++row) + rhs[row] = Ops::Gather(MakeGatherIndex(row), b); + + 
const avx_vec sigma = Ops::LoadSigma(sigma_t); + avx_vec A[N * N]; + PRAGMA_UNROLL + for (int i = 0; i < N; ++i) + { + PRAGMA_UNROLL + for (int j = 0; j < N; ++j) + { + const avx_vec Amij = Ops::Set1(Am[i * N + j]); + const avx_vec Mmij = Ops::Set1(Mm[i * N + j]); + A[i * N + j] = Ops::Fmadd(Amij, sigma, Mmij); + } + } + + auto entry = [&](int i, int j) -> avx_vec& { return A[i * N + j]; }; + PRAGMA_UNROLL + for (int pivot = 0; pivot < N; ++pivot) + { + const avx_vec inv = Ops::Reciprocal(entry(pivot, pivot)); + PRAGMA_UNROLL + for (int row = pivot + 1; row < N; ++row) + { + const avx_vec factor = Ops::Mul(entry(row, pivot), inv); + rhs[row] = Ops::Fnmadd(factor, rhs[pivot], rhs[row]); + PRAGMA_UNROLL + for (int col = pivot + 1; col < N; ++col) + entry(row, col) = Ops::Fnmadd(factor, entry(pivot, col), entry(row, col)); + } + } + + PRAGMA_UNROLL + for (int pivot = N - 1; pivot >= 0; --pivot) + { + avx_vec rhs_vec = rhs[pivot]; + PRAGMA_UNROLL + for (int col = pivot + 1; col < N; ++col) + rhs_vec = Ops::Fnmadd(entry(pivot, col), rhs[col], rhs_vec); + rhs[pivot] = Ops::Mul(rhs_vec, Ops::Reciprocal(entry(pivot, pivot))); + } + + PRAGMA_UNROLL + for (int row = 0; row < N; ++row) + Ops::Scatter(MakeGatherIndex(row), b, rhs[row]); +} + +} // namespace detail + +} // namespace opensn \ No newline at end of file diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_avx_sweep_chunk.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_avx_sweep_chunk.cc new file mode 100644 index 0000000000..0b541d316c --- /dev/null +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_avx_sweep_chunk.cc @@ -0,0 +1,427 @@ +// SPDX-FileCopyrightText: 2026 The OpenSn Authors +// SPDX-License-Identifier: MIT + +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h" +#include 
"modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h" +#include "framework/utils/error.h" +#include "caliper/cali.h" +#include +#include +#include + +namespace opensn +{ + +template +void +CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set) +{ + CALI_CXX_MARK_SCOPE("CBC_Sweep_FixedN"); + + static_assert(NumNodes >= 2 and NumNodes <= 8); + + const auto& groupset = data.groupset; + const auto& m2d_op = groupset.quadrature->GetMomentToDiscreteOperator(); + const auto& d2m_op = groupset.quadrature->GetDiscreteToMomentOperator(); + + OpenSnInvalidArgumentIf(data.cell_num_nodes != static_cast(NumNodes), + "CBC_Sweep_FixedN invoked for an incompatible cell topology."); + + const auto& face_orientations = angle_set.GetSPDS().GetCellFaceOrientations()[data.cell_local_id]; + const auto& sigma_t = data.xs.at(data.cell.block_id)->GetSigmaTotal(); + + constexpr size_t matrix_size = static_cast(NumNodes) * static_cast(NumNodes); + auto idx = [](size_t i, size_t j) -> size_t { return i * NumNodes + j; }; + + std::array mass_matrix{}; + PRAGMA_UNROLL + for (size_t i = 0; i < NumNodes; ++i) + { + PRAGMA_UNROLL + for (size_t j = 0; j < NumNodes; ++j) + mass_matrix[idx(i, j)] = data.M(i, j); + } + + std::vector> moment_dof_map(data.num_moments); + for (unsigned int m = 0; m < data.num_moments; ++m) + { + PRAGMA_UNROLL + for (size_t i = 0; i < NumNodes; ++i) + moment_dof_map[m][i] = data.cell_transport_view.MapDOF(i, m, data.gs_gi); + } + + std::array Amat{}; + std::vector b(static_cast(data.gs_size) * NumNodes, 0.0); + std::vector sigma_block; + sigma_block.reserve(data.group_block_size); + std::vector face_mu_values(data.cell_num_faces); + + std::vector tau_gsg; + if constexpr (time_dependent) + { + const auto& inv_velg = data.xs.at(data.cell.block_id)->GetInverseVelocity(); + const double theta = data.problem.GetTheta(); + const double inv_theta = 1.0 / theta; + const double dt = data.problem.GetTimeStep(); + const double inv_dt = 1.0 
/ dt; + + tau_gsg.assign(data.gs_size, 0.0); + for (size_t gsg = 0; gsg < data.gs_size; ++gsg) + tau_gsg[gsg] = inv_velg[data.gs_gi + gsg] * inv_theta * inv_dt; + } + + const double* psi_old = + (time_dependent and data.psi_old) + ? &(*data.psi_old)[data.discretization.MapDOFLocal(data.cell, 0, groupset.psi_uk_man_, 0, 0)] + : nullptr; + + const auto& as_angle_indices = angle_set.GetAngleIndices(); + + for (size_t as_ss_idx = 0; as_ss_idx < data.num_angles_in_as; ++as_ss_idx) + { + const auto direction_num = as_angle_indices[as_ss_idx]; + const auto omega = groupset.quadrature->omegas[direction_num]; + const auto wt = groupset.quadrature->weights[direction_num]; + + std::fill(b.begin(), b.end(), 0.0); + + PRAGMA_UNROLL + for (size_t i = 0; i < NumNodes; ++i) + { + PRAGMA_UNROLL + for (size_t j = 0; j < NumNodes; ++j) + Amat[idx(i, j)] = omega.Dot(data.G(i, j)); + } + + for (size_t f = 0; f < data.cell_num_faces; ++f) + face_mu_values[f] = omega.Dot(data.cell.faces[f].normal); + + for (size_t f = 0; f < data.cell_num_faces; ++f) + { + if (face_orientations[f] != FaceOrientation::INCOMING) + continue; + + const auto& face = data.cell.faces[f]; + const bool is_local_face = data.cell_transport_view.IsFaceLocal(f); + const bool is_boundary_face = not face.has_neighbor; + const auto* face_nodal_mapping = + &data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f); + + const auto& Ms_f = data.M_surf[f]; + const size_t num_face_nodes = data.cell_mapping.GetNumFaceNodes(f); + const double mu_f = -face_mu_values[f]; + + for (size_t fj = 0; fj < num_face_nodes; ++fj) + { + const int j = data.cell_mapping.MapFaceNode(f, fj); + + const double* psi = nullptr; + if (is_local_face) + psi = data.fluds.UpwindPsi(*data.cell_transport_view.FaceNeighbor(f), + face_nodal_mapping->cell_node_mapping_[fj], + as_ss_idx); + else if (not is_boundary_face) + psi = data.fluds.NLUpwindPsi( + data.cell.global_id, f, face_nodal_mapping->face_node_mapping_[fj], as_ss_idx); + else + psi 
= angle_set.PsiBoundary(face.neighbor_id, + direction_num, + data.cell_local_id, + f, + fj, + data.gs_gi, + data.surface_source_active); + + for (size_t fi = 0; fi < num_face_nodes; ++fi) + { + const int i = data.cell_mapping.MapFaceNode(f, fi); + const double mu_Nij = mu_f * Ms_f(i, j); + Amat[idx(i, j)] += mu_Nij; + + if (not psi) + continue; + + for (size_t gsg = 0; gsg < data.gs_size; ++gsg) + b[gsg * NumNodes + i] += psi[gsg] * mu_Nij; + } + } + } + + const auto dir_moment_offset = + static_cast(direction_num) * static_cast(data.num_moments); + const double* __restrict m2d_row = m2d_op.data() + dir_moment_offset; + const double* __restrict d2m_row = d2m_op.data() + dir_moment_offset; + + for (unsigned int g0 = 0; g0 < data.gs_size; g0 += data.group_block_size) + { + const auto g1 = std::min(g0 + data.group_block_size, static_cast(data.gs_size)); + const auto block_len = g1 - g0; + sigma_block.resize(block_len); + + for (unsigned int gsg = g0; gsg < g1; ++gsg) + { + const size_t rel = gsg - g0; + double sigma_tg = sigma_t[data.gs_gi + gsg]; + if constexpr (time_dependent) + sigma_tg += tau_gsg[gsg]; + sigma_block[rel] = sigma_tg; + + double* __restrict bg = &b[static_cast(gsg) * NumNodes]; + for (unsigned int m = 0; m < data.num_moments; ++m) + { + const double w = m2d_row[m]; + std::array nodal_source{}; + for (size_t i = 0; i < NumNodes; ++i) + nodal_source[i] = w * data.source_moments[moment_dof_map[m][i] + gsg]; + + for (size_t i = 0; i < NumNodes; ++i) + { + double value = 0.0; + const double* row = &mass_matrix[idx(i, 0)]; + PRAGMA_UNROLL + for (size_t j = 0; j < NumNodes; ++j) + value += row[j] * nodal_source[j]; + bg[i] += value; + } + } + } + + if constexpr (time_dependent) + { + if (data.include_rhs_time_term and psi_old) + { + for (size_t gsg = g0; gsg < g1; ++gsg) + { + const double tau = tau_gsg[gsg]; + double* __restrict bg = &b[gsg * NumNodes]; + + for (size_t i = 0; i < NumNodes; ++i) + { + double value = 0.0; + const double* row = 
&mass_matrix[idx(i, 0)]; + PRAGMA_UNROLL + for (size_t j = 0; j < NumNodes; ++j) + { + const size_t imap = + j * data.groupset_angle_group_stride + direction_num * data.groupset_group_stride; + const double psi_old_val = psi_old[imap + gsg]; + value += row[j] * psi_old_val; + } + bg[i] += tau * value; + } + } + } + } + + size_t k = 0; + +#if __AVX512F__ + for (; k + simd_width <= block_len; k += simd_width) + detail::SimdBatchSolve( + Amat.data(), mass_matrix.data(), &sigma_block[k], &b[(g0 + k) * NumNodes]); +#elif __AVX2__ + for (; k + simd_width <= block_len; k += simd_width) + detail::SimdBatchSolve( + Amat.data(), mass_matrix.data(), &sigma_block[k], &b[(g0 + k) * NumNodes]); +#endif + + for (; k < block_len; ++k) + { + const size_t gsg = g0 + k; + const double sigma_tg = sigma_block[k]; + + std::array A{}; + PRAGMA_UNROLL + for (size_t i = 0; i < NumNodes; ++i) + { + PRAGMA_UNROLL + for (size_t j = 0; j < NumNodes; ++j) + A[idx(i, j)] = Amat[idx(i, j)] + sigma_tg * mass_matrix[idx(i, j)]; + } + + double* __restrict bg = &b[gsg * NumNodes]; + + for (size_t pivot = 0; pivot < NumNodes; ++pivot) + { + const double inv = 1.0 / A[idx(pivot, pivot)]; + for (size_t row = pivot + 1; row < NumNodes; ++row) + { + const double factor = A[idx(row, pivot)] * inv; + bg[row] -= factor * bg[pivot]; + PRAGMA_UNROLL + for (size_t col = pivot + 1; col < NumNodes; ++col) + A[idx(row, col)] -= factor * A[idx(pivot, col)]; + } + } + + for (size_t pivot = NumNodes; pivot-- > 0;) + { + PRAGMA_UNROLL + for (size_t col = pivot + 1; col < NumNodes; ++col) + bg[pivot] -= A[idx(pivot, col)] * bg[col]; + bg[pivot] /= A[idx(pivot, pivot)]; + } + } + + for (size_t gsg = g0; gsg < g1; ++gsg) + { + const double* __restrict bg = &b[gsg * NumNodes]; + for (unsigned int m = 0; m < data.num_moments; ++m) + { + const double w = d2m_row[m]; + PRAGMA_UNROLL + for (size_t i = 0; i < NumNodes; ++i) + { + const size_t dof = data.cell_transport_view.MapDOF(i, m, data.gs_gi); + data.destination_phi[dof + 
gsg] += w * bg[i]; + } + } + } + } + + if (data.save_angular_flux) + { + double* psi_new = &data.destination_psi[data.discretization.MapDOFLocal( + data.cell, 0, groupset.psi_uk_man_, 0, 0)]; + + double theta = 1.0; + double inv_theta = 1.0; + if constexpr (time_dependent) + { + theta = data.problem.GetTheta(); + inv_theta = 1.0 / theta; + } + + PRAGMA_UNROLL + for (size_t i = 0; i < NumNodes; ++i) + { + const size_t imap = + i * data.groupset_angle_group_stride + direction_num * data.groupset_group_stride; + + for (size_t gsg = 0; gsg < data.gs_size; ++gsg) + { + const double psi_sol = b[gsg * NumNodes + i]; + if constexpr (time_dependent) + { + const double psi_old_val = psi_old ? psi_old[imap + gsg] : 0.0; + psi_new[imap + gsg] = inv_theta * (psi_sol + (theta - 1.0) * psi_old_val); + } + else + psi_new[imap + gsg] = psi_sol; + } + } + } + + for (size_t f = 0; f < data.cell_num_faces; ++f) + { + if (face_orientations[f] != FaceOrientation::OUTGOING) + continue; + + const auto& face = data.cell.faces[f]; + const bool is_local_face = data.cell_transport_view.IsFaceLocal(f); + const bool is_boundary_face = not face.has_neighbor; + const bool is_reflecting_boundary_face = + (is_boundary_face and angle_set.GetBoundaries()[face.neighbor_id]->IsReflecting()); + const auto& IntF_shapeI = data.IntS_shapeI[f]; + + const int locality = data.cell_transport_view.FaceLocality(f); + const size_t num_face_nodes = data.cell_mapping.GetNumFaceNodes(f); + const auto& face_nodal_mapping = + data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f); + std::vector* psi_nonlocal_outgoing = nullptr; + + if (not is_boundary_face and not is_local_face) + { + auto* async_comm = dynamic_cast(angle_set.GetCommunicator()); + const size_t data_size_for_msg = num_face_nodes * data.group_angle_stride; + psi_nonlocal_outgoing = + &async_comm->InitGetDownwindMessageData(locality, + face.neighbor_id, + face_nodal_mapping.associated_face_, + angle_set.GetID(), + data_size_for_msg); + } + 
+ const double mu_wt_f = wt * face_mu_values[f]; + + for (size_t fi = 0; fi < num_face_nodes; ++fi) + { + const int i = data.cell_mapping.MapFaceNode(f, fi); + + if (is_boundary_face) + { + const double flux_i = mu_wt_f * IntF_shapeI(i); + for (size_t gsg = 0; gsg < data.gs_size; ++gsg) + data.cell_transport_view.AddOutflow( + f, data.gs_gi + gsg, flux_i * b[gsg * NumNodes + i]); + } + + double* psi = nullptr; + if (is_local_face) + psi = data.fluds.OutgoingPsi(data.cell, i, as_ss_idx); + else if (not is_boundary_face) + psi = data.fluds.NLOutgoingPsi(psi_nonlocal_outgoing, fi, as_ss_idx); + else if (is_reflecting_boundary_face) + psi = angle_set.PsiReflected(face.neighbor_id, direction_num, data.cell_local_id, f, fi); + + if (psi != nullptr) + { + for (size_t gsg = 0; gsg < data.gs_size; ++gsg) + psi[gsg] = b[gsg * NumNodes + i]; + } + } + } + } +} + +template +void +CBCSweepChunk::Sweep_FixedN(AngleSet& angle_set) +{ + CALI_CXX_MARK_SCOPE("CBCSweepChunk::Sweep_FixedN"); + + auto data = MakeCBCSweepData(discretization_, + source_moments_, + groupset_, + xs_, + num_moments_, + max_num_cell_dofs_, + SaveAngularFluxEnabled(), + groupset_angle_group_stride_, + groupset_group_stride_, + destination_phi_, + destination_psi_, + include_rhs_time_term_, + problem_, + nullptr, + group_block_size_, + ctx_); + + CBC_Sweep_FixedN(data, angle_set); +} + +template void CBC_Sweep_FixedN<2, false>(CBCSweepData&, AngleSet&); +template void CBC_Sweep_FixedN<3, false>(CBCSweepData&, AngleSet&); +template void CBC_Sweep_FixedN<4, false>(CBCSweepData&, AngleSet&); +template void CBC_Sweep_FixedN<5, false>(CBCSweepData&, AngleSet&); +template void CBC_Sweep_FixedN<6, false>(CBCSweepData&, AngleSet&); +template void CBC_Sweep_FixedN<7, false>(CBCSweepData&, AngleSet&); +template void CBC_Sweep_FixedN<8, false>(CBCSweepData&, AngleSet&); + +template void CBC_Sweep_FixedN<2, true>(CBCSweepData&, AngleSet&); +template void CBC_Sweep_FixedN<3, true>(CBCSweepData&, AngleSet&); +template void 
CBC_Sweep_FixedN<4, true>(CBCSweepData&, AngleSet&); +template void CBC_Sweep_FixedN<5, true>(CBCSweepData&, AngleSet&); +template void CBC_Sweep_FixedN<6, true>(CBCSweepData&, AngleSet&); +template void CBC_Sweep_FixedN<7, true>(CBCSweepData&, AngleSet&); +template void CBC_Sweep_FixedN<8, true>(CBCSweepData&, AngleSet&); + +template void CBCSweepChunk::Sweep_FixedN<2>(AngleSet&); +template void CBCSweepChunk::Sweep_FixedN<3>(AngleSet&); +template void CBCSweepChunk::Sweep_FixedN<4>(AngleSet&); +template void CBCSweepChunk::Sweep_FixedN<5>(AngleSet&); +template void CBCSweepChunk::Sweep_FixedN<6>(AngleSet&); +template void CBCSweepChunk::Sweep_FixedN<7>(AngleSet&); +template void CBCSweepChunk::Sweep_FixedN<8>(AngleSet&); + +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc index 24bbf8ef5f..70ee7c82d3 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc @@ -2,6 +2,7 @@ // SPDX-License-Identifier: MIT #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h" #include "modules/linear_boltzmann_solvers/lbs_problem/groupset/lbs_groupset.h" #include "framework/math/spatial_discretization/spatial_discretization.h" @@ -26,6 +27,8 @@ CBCSweepChunk::CBCSweepChunk(DiscreteOrdinatesProblem& problem, LBSGroupset& gro problem.GetNumMoments(), problem.GetMaxCellDOFCount(), problem.GetMinCellDOFCount()), + sweep_impl_(&CBCSweepChunk::Sweep_Generic), + group_block_size_(0), fluds_(nullptr), gs_size_(0), gs_gi_(0), 
@@ -40,6 +43,37 @@ CBCSweepChunk::CBCSweepChunk(DiscreteOrdinatesProblem& problem, LBSGroupset& gro cell_num_faces_(0), cell_num_nodes_(0) { + if ((min_num_cell_dofs_ == max_num_cell_dofs_) and (min_num_cell_dofs_ >= 2) + and (min_num_cell_dofs_ <= 8)) + { + switch (min_num_cell_dofs_) + { + case 2: + sweep_impl_ = &CBCSweepChunk::Sweep_FixedN<2>; + break; + case 3: + sweep_impl_ = &CBCSweepChunk::Sweep_FixedN<3>; + break; + case 4: + sweep_impl_ = &CBCSweepChunk::Sweep_FixedN<4>; + break; + case 5: + sweep_impl_ = &CBCSweepChunk::Sweep_FixedN<5>; + break; + case 6: + sweep_impl_ = &CBCSweepChunk::Sweep_FixedN<6>; + break; + case 7: + sweep_impl_ = &CBCSweepChunk::Sweep_FixedN<7>; + break; + case 8: + sweep_impl_ = &CBCSweepChunk::Sweep_FixedN<8>; + break; + default: + break; + } + } + group_block_size_ = ComputeGroupBlockSize(groupset_.GetNumGroups()); } void @@ -69,14 +103,21 @@ CBCSweepChunk::SetCell(const Cell* cell_ptr, AngleSet& angle_set) cell_num_nodes_ = cell_mapping_->GetNumNodes(); // Get cell matrices - G_ = unit_cell_matrices_[cell_local_id_].intV_shapeI_gradshapeJ; - M_ = unit_cell_matrices_[cell_local_id_].intV_shapeI_shapeJ; - M_surf_ = unit_cell_matrices_[cell_local_id_].intS_shapeI_shapeJ; - IntS_shapeI_ = unit_cell_matrices_[cell_local_id_].intS_shapeI; + const auto& unit_mats = unit_cell_matrices_[cell_local_id_]; + G_ = &unit_mats.intV_shapeI_gradshapeJ; + M_ = &unit_mats.intV_shapeI_shapeJ; + M_surf_ = &unit_mats.intS_shapeI_shapeJ; + IntS_shapeI_ = &unit_mats.intS_shapeI; } void CBCSweepChunk::Sweep(AngleSet& angle_set) +{ + (this->*sweep_impl_)(angle_set); +} + +void +CBCSweepChunk::Sweep_Generic(AngleSet& angle_set) { const auto& m2d_op = groupset_.quadrature->GetMomentToDiscreteOperator(); const auto& d2m_op = groupset_.quadrature->GetDiscreteToMomentOperator(); @@ -108,7 +149,7 @@ CBCSweepChunk::Sweep(AngleSet& angle_set) for (size_t i = 0; i < cell_num_nodes_; ++i) for (size_t j = 0; j < cell_num_nodes_; ++j) - Amat(i, j) = 
omega.Dot(G_(i, j)); + Amat(i, j) = omega.Dot((*G_)(i, j)); // Update face orientations for (size_t f = 0; f < cell_num_faces_; ++f) @@ -136,7 +177,7 @@ CBCSweepChunk::Sweep(AngleSet& angle_set) { const int j = cell_mapping_->MapFaceNode(f, fj); - const double mu_Nij = -face_mu_values[f] * M_surf_[f](i, j); + const double mu_Nij = -face_mu_values[f] * (*M_surf_)[f](i, j); Amat(i, j) += mu_Nij; const double* psi = nullptr; @@ -194,7 +235,7 @@ CBCSweepChunk::Sweep(AngleSet& angle_set) double temp = 0.0; for (size_t j = 0; j < cell_num_nodes_; ++j) { - const double Mij = M_(i, j); + const double Mij = (*M_)(i, j); Atemp(i, j) = Amat(i, j) + Mij * sigma_tg; temp += Mij * source[j]; } @@ -244,7 +285,7 @@ CBCSweepChunk::Sweep(AngleSet& angle_set) const bool is_boundary_face = not face.has_neighbor; const bool is_reflecting_boundary_face = (is_boundary_face and angle_set.GetBoundaries()[face.neighbor_id]->IsReflecting()); - const auto& IntF_shapeI = IntS_shapeI_[f]; + const auto& IntF_shapeI = (*IntS_shapeI_)[f]; const int locality = cell_transport_view_->FaceLocality(f); const size_t num_face_nodes = cell_mapping_->GetNumFaceNodes(f); @@ -254,7 +295,7 @@ CBCSweepChunk::Sweep(AngleSet& angle_set) if (not is_boundary_face and not is_local_face) { - auto& async_comm = *angle_set.GetCommunicator(); + auto& async_comm = static_cast(*angle_set.GetCommunicator()); const size_t data_size_for_msg = num_face_nodes * group_angle_stride_; psi_nonlocal_outgoing = &async_comm.InitGetDownwindMessageData(locality, diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h index 0e8bf6fbfc..a07b78daf2 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h @@ -5,6 +5,7 @@ #include 
"modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/sweep_chunk.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/avx_sweep_chunk_utils.h" namespace opensn { @@ -51,7 +52,16 @@ class CBCSweepChunk : public SweepChunk */ void Sweep(AngleSet& angle_set) override; -protected: +private: + using SweepFunc = void (CBCSweepChunk::*)(AngleSet&); + SweepFunc sweep_impl_ = nullptr; + + void Sweep_Generic(AngleSet& angle_set); + template + void Sweep_FixedN(AngleSet& angle_set); + + unsigned int group_block_size_; + CBC_FLUDS* fluds_; size_t gs_size_; unsigned int gs_gi_; @@ -67,10 +77,10 @@ class CBCSweepChunk : public SweepChunk size_t cell_num_faces_; size_t cell_num_nodes_; - DenseMatrix G_; - DenseMatrix M_; - std::vector> M_surf_; - std::vector> IntS_shapeI_; + const DenseMatrix* G_; + const DenseMatrix* M_; + const std::vector>* M_surf_; + const std::vector>* IntS_shapeI_; }; } // namespace opensn From a0444eea18e470165fbcef9c248a80e92e137efb Mon Sep 17 00:00:00 2001 From: Eappen Nelluvelil Date: Tue, 31 Mar 2026 23:30:58 -0500 Subject: [PATCH 2/6] Time-dependent CBC sweep chunk --- .../discrete_ordinates_problem.cc | 7 +- .../sweep_chunks/cbc_sweep_chunk.cc | 291 ++--------------- .../sweep_chunks/cbc_sweep_chunk.h | 31 +- .../sweep_chunks/cbc_sweep_chunk_shared.h | 126 +++++++ .../sweep_chunks/cbc_sweep_chunk_td.cc | 148 +++++++++ .../sweep_chunks/cbc_sweep_chunk_td.h | 39 +++ .../sweep_chunks/cbc_sweep_kernels.h | 308 ++++++++++++++++++ 7 files changed, 661 insertions(+), 289 deletions(-) create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.cc create mode 100644 
modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc index 74f1919c60..1a375321e1 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc @@ -16,6 +16,7 @@ #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_chunk.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_chunk_td.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/iterative_methods/sweep_wgs_context.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/io/discrete_ordinates_problem_io.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/iterative_methods/ags_linear_solver.h" @@ -1882,10 +1883,6 @@ DiscreteOrdinatesProblem::SetSweepChunk(LBSGroupset& groupset) const bool use_time_dependent_chunk = (mode == SweepChunkMode::TIME_DEPENDENT); - if (use_time_dependent_chunk && sweep_type_ != "AAH") - throw std::invalid_argument(GetName() + - ": Time dependent is only supported with sweep_type='AAH'."); - if (sweep_type_ == "AAH") { if (use_time_dependent_chunk) @@ -1896,6 +1893,8 @@ DiscreteOrdinatesProblem::SetSweepChunk(LBSGroupset& groupset) } else if (sweep_type_ == "CBC") { + if (use_time_dependent_chunk) + return std::make_shared(*this, groupset); if (use_gpus_) return 
CreateCBCDSweepChunk(groupset); return std::make_shared(*this, groupset); diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc index 70ee7c82d3..b40c3d7ed0 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc @@ -2,13 +2,7 @@ // SPDX-License-Identifier: MIT #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.h" -#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h" -#include "modules/linear_boltzmann_solvers/lbs_problem/groupset/lbs_groupset.h" -#include "framework/math/spatial_discretization/spatial_discretization.h" -#include "framework/mesh/mesh_continuum/mesh_continuum.h" -#include "framework/mesh/cell/cell.h" -#include "framework/logging/log.h" #include "caliper/cali.h" namespace opensn @@ -27,24 +21,11 @@ CBCSweepChunk::CBCSweepChunk(DiscreteOrdinatesProblem& problem, LBSGroupset& gro problem.GetNumMoments(), problem.GetMaxCellDOFCount(), problem.GetMinCellDOFCount()), - sweep_impl_(&CBCSweepChunk::Sweep_Generic), - group_block_size_(0), - fluds_(nullptr), - gs_size_(0), - gs_gi_(0), - num_angles_in_as_(0), - group_stride_(0), - group_angle_stride_(0), - surface_source_active_(false), - cell_(nullptr), - cell_local_id_(0), - cell_mapping_(nullptr), - cell_transport_view_(nullptr), - cell_num_faces_(0), - cell_num_nodes_(0) + problem_(problem), + sweep_impl_(&CBCSweepChunk::Sweep_Generic) { - if ((min_num_cell_dofs_ == max_num_cell_dofs_) and (min_num_cell_dofs_ >= 2) - and (min_num_cell_dofs_ <= 8)) + if ((min_num_cell_dofs_ == max_num_cell_dofs_) and (min_num_cell_dofs_ 
>= 2) and + (min_num_cell_dofs_ <= 8)) { switch (min_num_cell_dofs_) { @@ -73,41 +54,23 @@ CBCSweepChunk::CBCSweepChunk(DiscreteOrdinatesProblem& problem, LBSGroupset& gro break; } } + group_block_size_ = ComputeGroupBlockSize(groupset_.GetNumGroups()); } void CBCSweepChunk::SetAngleSet(AngleSet& angle_set) { - CALI_CXX_MARK_SCOPE("CbcSweepChunk::SetAngleSet"); - - fluds_ = &dynamic_cast(angle_set.GetFLUDS()); - - gs_size_ = groupset_.GetNumGroups(); - gs_gi_ = groupset_.first_group; + CALI_CXX_MARK_SCOPE("CBCSweepChunk::SetAngleSet"); - surface_source_active_ = IsSurfaceSourceActive(); - num_angles_in_as_ = angle_set.GetNumAngles(); - group_stride_ = angle_set.GetNumGroups(); - group_angle_stride_ = group_stride_ * num_angles_in_as_; + CBCBindAngleSetContext(ctx_, groupset_, IsSurfaceSourceActive(), angle_set); } void CBCSweepChunk::SetCell(const Cell* cell_ptr, AngleSet& angle_set) { - cell_ = cell_ptr; - cell_local_id_ = cell_ptr->local_id; - cell_mapping_ = &discretization_.GetCellMapping(*cell_); - cell_transport_view_ = &cell_transport_views_[cell_->local_id]; - cell_num_faces_ = cell_->faces.size(); - cell_num_nodes_ = cell_mapping_->GetNumNodes(); - - // Get cell matrices - const auto& unit_mats = unit_cell_matrices_[cell_local_id_]; - G_ = &unit_mats.intV_shapeI_gradshapeJ; - M_ = &unit_mats.intV_shapeI_shapeJ; - M_surf_ = &unit_mats.intS_shapeI_shapeJ; - IntS_shapeI_ = &unit_mats.intS_shapeI; + static_cast(angle_set); + CBCBindCellContext(ctx_, discretization_, unit_cell_matrices_, cell_transport_views_, cell_ptr); } void @@ -119,220 +82,26 @@ CBCSweepChunk::Sweep(AngleSet& angle_set) void CBCSweepChunk::Sweep_Generic(AngleSet& angle_set) { - const auto& m2d_op = groupset_.quadrature->GetMomentToDiscreteOperator(); - const auto& d2m_op = groupset_.quadrature->GetDiscreteToMomentOperator(); - - DenseMatrix Amat(max_num_cell_dofs_, max_num_cell_dofs_); - DenseMatrix Atemp(max_num_cell_dofs_, max_num_cell_dofs_); - std::vector> b(gs_size_, 
Vector(max_num_cell_dofs_)); - std::vector source(max_num_cell_dofs_); - - const auto& face_orientations = angle_set.GetSPDS().GetCellFaceOrientations()[cell_local_id_]; - std::vector face_mu_values(cell_num_faces_); - - const auto& sigma_t = xs_.at(cell_->block_id)->GetSigmaTotal(); - - // as = angle set - // ss = subset - const std::vector& as_angle_indices = angle_set.GetAngleIndices(); - - for (size_t as_ss_idx = 0; as_ss_idx < num_angles_in_as_; ++as_ss_idx) - { - auto direction_num = as_angle_indices[as_ss_idx]; - auto omega = groupset_.quadrature->omegas[direction_num]; - auto wt = groupset_.quadrature->weights[direction_num]; - - // Reset right-hand side - for (size_t gsg = 0; gsg < gs_size_; ++gsg) - for (size_t i = 0; i < cell_num_nodes_; ++i) - b[gsg](i) = 0.0; - - for (size_t i = 0; i < cell_num_nodes_; ++i) - for (size_t j = 0; j < cell_num_nodes_; ++j) - Amat(i, j) = omega.Dot((*G_)(i, j)); - - // Update face orientations - for (size_t f = 0; f < cell_num_faces_; ++f) - face_mu_values[f] = omega.Dot(cell_->faces[f].normal); - - // Surface integrals - for (size_t f = 0; f < cell_num_faces_; ++f) - { - if (face_orientations[f] != FaceOrientation::INCOMING) - continue; - - const auto& face = cell_->faces[f]; - const bool is_local_face = cell_transport_view_->IsFaceLocal(f); - const bool is_boundary_face = not face.has_neighbor; - const auto* face_nodal_mapping = - &fluds_->GetCommonData().GetFaceNodalMapping(cell_local_id_, f); - - // IntSf_mu_psi_Mij_dA - const size_t num_face_nodes = cell_mapping_->GetNumFaceNodes(f); - for (size_t fi = 0; fi < num_face_nodes; ++fi) - { - const int i = cell_mapping_->MapFaceNode(f, fi); - - for (size_t fj = 0; fj < num_face_nodes; ++fj) - { - const int j = cell_mapping_->MapFaceNode(f, fj); - - const double mu_Nij = -face_mu_values[f] * (*M_surf_)[f](i, j); - Amat(i, j) += mu_Nij; - - const double* psi = nullptr; - - if (is_local_face) - psi = fluds_->UpwindPsi(*cell_transport_view_->FaceNeighbor(f), - 
face_nodal_mapping->cell_node_mapping_[fj], - as_ss_idx); - else if (not is_boundary_face) - psi = fluds_->NLUpwindPsi( - cell_->global_id, f, face_nodal_mapping->face_node_mapping_[fj], as_ss_idx); - else - psi = angle_set.PsiBoundary(face.neighbor_id, - direction_num, - cell_local_id_, - f, - fj, - gs_gi_, - surface_source_active_); - - if (psi != nullptr) - for (size_t gsg = 0; gsg < gs_size_; ++gsg) - b[gsg](i) += psi[gsg] * mu_Nij; - } // for face node j - } // for face node i - } // for f - - const auto dir_moment_offset = - static_cast(direction_num) * static_cast(num_moments_); - const double* m2d_row = m2d_op.data() + dir_moment_offset; - const double* d2m_row = d2m_op.data() + dir_moment_offset; - - // Looping over groups, assembling mass terms - for (unsigned int gsg = 0; gsg < gs_size_; ++gsg) - { - double sigma_tg = sigma_t[gs_gi_ + gsg]; - - // Contribute source moments q = M_n^T * q_moms - for (size_t i = 0; i < cell_num_nodes_; ++i) - { - double temp_src = 0.0; - for (unsigned int m = 0; m < num_moments_; ++m) - { - const auto ir = cell_transport_view_->MapDOF(i, m, gs_gi_ + gsg); - temp_src += m2d_row[m] * source_moments_[ir]; - } - source[i] = temp_src; - } - - // Mass matrix and source - // Atemp = Amat + sigma_tgr * M - // b += M * q - for (size_t i = 0; i < cell_num_nodes_; ++i) - { - double temp = 0.0; - for (size_t j = 0; j < cell_num_nodes_; ++j) - { - const double Mij = (*M_)(i, j); - Atemp(i, j) = Amat(i, j) + Mij * sigma_tg; - temp += Mij * source[j]; - } - b[gsg](i) += temp; - } - - // Solve system - GaussElimination(Atemp, b[gsg], static_cast(cell_num_nodes_)); - } // for gsg - - // Update phi - for (unsigned int m = 0; m < num_moments_; ++m) - { - const auto wn_d2m = d2m_row[m]; - for (size_t i = 0; i < cell_num_nodes_; ++i) - { - const auto ir = cell_transport_view_->MapDOF(i, m, gs_gi_); - for (size_t gsg = 0; gsg < gs_size_; ++gsg) - destination_phi_[ir + gsg] += wn_d2m * b[gsg](i); - } - } - - // If requested, save angular fluxes 
during sweep - if (SaveAngularFluxEnabled()) - { - double* cell_psi = - &destination_psi_[discretization_.MapDOFLocal(*cell_, 0, groupset_.psi_uk_man_, 0, 0)]; - - for (size_t i = 0; i < cell_num_nodes_; ++i) - { - const size_t addr_offset = - i * groupset_angle_group_stride_ + direction_num * groupset_group_stride_; - - for (size_t gsg = 0; gsg < gs_size_; ++gsg) - cell_psi[addr_offset + gsg] = b[gsg](i); - } - } - - // Perform outgoing surface operations - for (size_t f = 0; f < cell_num_faces_; ++f) - { - if (face_orientations[f] != FaceOrientation::OUTGOING) - continue; - - const auto& face = cell_->faces[f]; - const bool is_local_face = cell_transport_view_->IsFaceLocal(f); - const bool is_boundary_face = not face.has_neighbor; - const bool is_reflecting_boundary_face = - (is_boundary_face and angle_set.GetBoundaries()[face.neighbor_id]->IsReflecting()); - const auto& IntF_shapeI = (*IntS_shapeI_)[f]; - - const int locality = cell_transport_view_->FaceLocality(f); - const size_t num_face_nodes = cell_mapping_->GetNumFaceNodes(f); - const auto& face_nodal_mapping = - fluds_->GetCommonData().GetFaceNodalMapping(cell_local_id_, f); - std::vector* psi_nonlocal_outgoing = nullptr; - - if (not is_boundary_face and not is_local_face) - { - auto& async_comm = static_cast(*angle_set.GetCommunicator()); - const size_t data_size_for_msg = num_face_nodes * group_angle_stride_; - psi_nonlocal_outgoing = - &async_comm.InitGetDownwindMessageData(locality, - face.neighbor_id, - face_nodal_mapping.associated_face_, - angle_set.GetID(), - data_size_for_msg); - } - - for (size_t fi = 0; fi < num_face_nodes; ++fi) - { - const int i = cell_mapping_->MapFaceNode(f, fi); - - // Tally outflow for particle balance - if (is_boundary_face) - { - for (size_t gsg = 0; gsg < gs_size_; ++gsg) - cell_transport_view_->AddOutflow( - f, gs_gi_ + gsg, wt * face_mu_values[f] * b[gsg](i) * IntF_shapeI(i)); - } - - double* psi = nullptr; - - if (is_local_face) - psi = fluds_->OutgoingPsi(*cell_, i, 
as_ss_idx); - else if (not is_boundary_face) - psi = fluds_->NLOutgoingPsi(psi_nonlocal_outgoing, fi, as_ss_idx); - else if (is_reflecting_boundary_face) - psi = angle_set.PsiReflected(face.neighbor_id, direction_num, cell_local_id_, f, fi); - - // Write the solved angular flux to the determined location - if (psi != nullptr) - for (size_t gsg = 0; gsg < gs_size_; ++gsg) - psi[gsg] = b[gsg](i); - } // for fi - } // for face - } // for angleset/subset + CALI_CXX_MARK_SCOPE("CBCSweepChunk::Sweep_Generic"); + + auto data = MakeCBCSweepData(discretization_, + source_moments_, + groupset_, + xs_, + num_moments_, + max_num_cell_dofs_, + SaveAngularFluxEnabled(), + groupset_angle_group_stride_, + groupset_group_stride_, + destination_phi_, + destination_psi_, + include_rhs_time_term_, + problem_, + nullptr, + group_block_size_, + ctx_); + + CBC_Sweep_Generic(data, angle_set); } -} // namespace opensn \ No newline at end of file +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h index a07b78daf2..5d8acaa305 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h @@ -3,9 +3,9 @@ #pragma once -#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h" -#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/sweep_chunk.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/avx_sweep_chunk_utils.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/sweep_chunk.h" namespace opensn { @@ -52,6 +52,11 @@ class CBCSweepChunk : public 
SweepChunk */ void Sweep(AngleSet& angle_set) override; +protected: + DiscreteOrdinatesProblem& problem_; + CBCSweepChunkContext ctx_; + unsigned int group_block_size_ = 0; + private: using SweepFunc = void (CBCSweepChunk::*)(AngleSet&); SweepFunc sweep_impl_ = nullptr; @@ -59,28 +64,6 @@ class CBCSweepChunk : public SweepChunk void Sweep_Generic(AngleSet& angle_set); template void Sweep_FixedN(AngleSet& angle_set); - - unsigned int group_block_size_; - - CBC_FLUDS* fluds_; - size_t gs_size_; - unsigned int gs_gi_; - size_t num_angles_in_as_; - unsigned int group_stride_; // Stride for consecutive angles - size_t group_angle_stride_; // Stride for consecutive spatial DOFs - bool surface_source_active_; - - const Cell* cell_; - std::uint32_t cell_local_id_; - const CellMapping* cell_mapping_; - CellLBSView* cell_transport_view_; - size_t cell_num_faces_; - size_t cell_num_nodes_; - - const DenseMatrix* G_; - const DenseMatrix* M_; - const std::vector>* M_surf_; - const std::vector>* IntS_shapeI_; }; } // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h new file mode 100644 index 0000000000..13a8ae1f1b --- /dev/null +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h @@ -0,0 +1,126 @@ +// SPDX-FileCopyrightText: 2026 The OpenSn Authors +// SPDX-License-Identifier: MIT + +#pragma once + +#include "framework/math/spatial_discretization/spatial_discretization.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h" + +namespace opensn +{ + +struct CBCSweepChunkContext +{ + CBC_FLUDS* fluds = nullptr; 
+ + size_t gs_size = 0; + unsigned int gs_gi = 0; + size_t num_angles_in_as = 0; + unsigned int group_stride = 0; + size_t group_angle_stride = 0; + bool surface_source_active = false; + + const Cell* cell = nullptr; + std::uint32_t cell_local_id = 0; + const CellMapping* cell_mapping = nullptr; + CellLBSView* cell_transport_view = nullptr; + size_t cell_num_faces = 0; + size_t cell_num_nodes = 0; + + const DenseMatrix* G = nullptr; + const DenseMatrix* M = nullptr; + const std::vector>* M_surf = nullptr; + const std::vector>* IntS_shapeI = nullptr; +}; + +inline void +CBCBindAngleSetContext(CBCSweepChunkContext& ctx, + const LBSGroupset& groupset, + bool surface_source_active, + AngleSet& angle_set) +{ + ctx.fluds = &dynamic_cast(angle_set.GetFLUDS()); + ctx.gs_size = groupset.GetNumGroups(); + ctx.gs_gi = groupset.first_group; + ctx.surface_source_active = surface_source_active; + ctx.num_angles_in_as = angle_set.GetNumAngles(); + ctx.group_stride = angle_set.GetNumGroups(); + ctx.group_angle_stride = ctx.group_stride * ctx.num_angles_in_as; +} + +inline void +CBCBindCellContext(CBCSweepChunkContext& ctx, + const SpatialDiscretization& discretization, + const std::vector& unit_cell_matrices, + std::vector& cell_transport_views, + const Cell* cell_ptr) +{ + ctx.cell = cell_ptr; + ctx.cell_local_id = cell_ptr->local_id; + ctx.cell_mapping = &discretization.GetCellMapping(*ctx.cell); + ctx.cell_transport_view = &cell_transport_views[ctx.cell->local_id]; + ctx.cell_num_faces = ctx.cell->faces.size(); + ctx.cell_num_nodes = ctx.cell_mapping->GetNumNodes(); + + const auto& unit_mats = unit_cell_matrices[ctx.cell_local_id]; + ctx.G = &unit_mats.intV_shapeI_gradshapeJ; + ctx.M = &unit_mats.intV_shapeI_shapeJ; + ctx.M_surf = &unit_mats.intS_shapeI_shapeJ; + ctx.IntS_shapeI = &unit_mats.intS_shapeI; +} + +inline CBCSweepData +MakeCBCSweepData(const SpatialDiscretization& discretization, + const std::vector& source_moments, + const LBSGroupset& groupset, + const 
BlockID2XSMap& xs, + unsigned int num_moments, + unsigned int max_num_cell_dofs, + bool save_angular_flux, + size_t groupset_angle_group_stride, + size_t groupset_group_stride, + std::vector& destination_phi, + std::vector& destination_psi, + bool include_rhs_time_term, + DiscreteOrdinatesProblem& problem, + const std::vector* psi_old, + unsigned int group_block_size, + const CBCSweepChunkContext& ctx) +{ + return CBCSweepData{discretization, + source_moments, + groupset, + xs, + num_moments, + max_num_cell_dofs, + save_angular_flux, + groupset_angle_group_stride, + groupset_group_stride, + destination_phi, + destination_psi, + ctx.surface_source_active, + include_rhs_time_term, + problem, + psi_old, + group_block_size, + *ctx.fluds, + *ctx.cell, + ctx.cell_local_id, + *ctx.cell_mapping, + *ctx.cell_transport_view, + ctx.cell_num_faces, + ctx.cell_num_nodes, + ctx.gs_size, + ctx.gs_gi, + ctx.num_angles_in_as, + ctx.group_stride, + ctx.group_angle_stride, + *ctx.G, + *ctx.M, + *ctx.M_surf, + *ctx.IntS_shapeI}; +} + +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.cc new file mode 100644 index 0000000000..0b261ceb48 --- /dev/null +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.cc @@ -0,0 +1,148 @@ +// SPDX-FileCopyrightText: 2026 The OpenSn Authors +// SPDX-License-Identifier: MIT + +#include "framework/utils/error.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h" +#include "caliper/cali.h" +#include + +namespace opensn +{ + +CBCSweepChunkTD::CBCSweepChunkTD(DiscreteOrdinatesProblem& 
problem, LBSGroupset& groupset) + : SweepChunk(problem.GetPhiNewLocal(), + problem.GetPsiNewLocal()[groupset.id], + problem.GetGrid(), + problem.GetSpatialDiscretization(), + problem.GetUnitCellMatrices(), + problem.GetCellTransportViews(), + problem.GetQMomentsLocal(), + groupset, + problem.GetBlockID2XSMap(), + problem.GetNumMoments(), + problem.GetMaxCellDOFCount(), + problem.GetMinCellDOFCount()), + problem_(problem), + psi_old_(problem.GetPsiOldLocal()[groupset.id]), + sweep_impl_td_(&CBCSweepChunkTD::Sweep_Generic) +{ + if (problem.UseGPUs()) + throw std::runtime_error("Time-dependent calculations do not yet support GPUs.\n"); + + if ((min_num_cell_dofs_ == max_num_cell_dofs_) and (min_num_cell_dofs_ >= 2) and + (min_num_cell_dofs_ <= 8)) + { + switch (min_num_cell_dofs_) + { + case 2: + sweep_impl_td_ = &CBCSweepChunkTD::Sweep_FixedN<2>; + break; + case 3: + sweep_impl_td_ = &CBCSweepChunkTD::Sweep_FixedN<3>; + break; + case 4: + sweep_impl_td_ = &CBCSweepChunkTD::Sweep_FixedN<4>; + break; + case 5: + sweep_impl_td_ = &CBCSweepChunkTD::Sweep_FixedN<5>; + break; + case 6: + sweep_impl_td_ = &CBCSweepChunkTD::Sweep_FixedN<6>; + break; + case 7: + sweep_impl_td_ = &CBCSweepChunkTD::Sweep_FixedN<7>; + break; + case 8: + sweep_impl_td_ = &CBCSweepChunkTD::Sweep_FixedN<8>; + break; + default: + break; + } + } + + group_block_size_ = ComputeGroupBlockSize(groupset_.GetNumGroups()); +} + +void +CBCSweepChunkTD::SetAngleSet(AngleSet& angle_set) +{ + CALI_CXX_MARK_SCOPE("CBCSweepChunkTD::SetAngleSet"); + + CBCBindAngleSetContext(ctx_, groupset_, IsSurfaceSourceActive(), angle_set); +} + +void +CBCSweepChunkTD::SetCell(const Cell* cell_ptr, AngleSet& angle_set) +{ + static_cast(angle_set); + CBCBindCellContext(ctx_, discretization_, unit_cell_matrices_, cell_transport_views_, cell_ptr); +} + +void +CBCSweepChunkTD::Sweep(AngleSet& angle_set) +{ + (this->*sweep_impl_td_)(angle_set); +} + +void +CBCSweepChunkTD::Sweep_Generic(AngleSet& angle_set) +{ + 
CALI_CXX_MARK_SCOPE("CBCSweepChunkTD::Sweep_Generic"); + + auto data = MakeCBCSweepData(discretization_, + source_moments_, + groupset_, + xs_, + num_moments_, + max_num_cell_dofs_, + SaveAngularFluxEnabled(), + groupset_angle_group_stride_, + groupset_group_stride_, + destination_phi_, + destination_psi_, + include_rhs_time_term_, + problem_, + &psi_old_, + group_block_size_, + ctx_); + + CBC_Sweep_Generic(data, angle_set); +} + +template +void +CBCSweepChunkTD::Sweep_FixedN(AngleSet& angle_set) +{ + CALI_CXX_MARK_SCOPE("CBCSweepChunkTD::Sweep_FixedN"); + + auto data = MakeCBCSweepData(discretization_, + source_moments_, + groupset_, + xs_, + num_moments_, + max_num_cell_dofs_, + SaveAngularFluxEnabled(), + groupset_angle_group_stride_, + groupset_group_stride_, + destination_phi_, + destination_psi_, + include_rhs_time_term_, + problem_, + &psi_old_, + group_block_size_, + ctx_); + + CBC_Sweep_FixedN(data, angle_set); +} + +template void CBCSweepChunkTD::Sweep_FixedN<2>(AngleSet&); +template void CBCSweepChunkTD::Sweep_FixedN<3>(AngleSet&); +template void CBCSweepChunkTD::Sweep_FixedN<4>(AngleSet&); +template void CBCSweepChunkTD::Sweep_FixedN<5>(AngleSet&); +template void CBCSweepChunkTD::Sweep_FixedN<6>(AngleSet&); +template void CBCSweepChunkTD::Sweep_FixedN<7>(AngleSet&); +template void CBCSweepChunkTD::Sweep_FixedN<8>(AngleSet&); + +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h new file mode 100644 index 0000000000..5e99bb83ef --- /dev/null +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: 2026 The OpenSn Authors +// SPDX-License-Identifier: MIT + +#pragma once + +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/avx_sweep_chunk_utils.h" +#include 
"modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/sweep_chunk.h" + +namespace opensn +{ + +class CBCSweepChunkTD : public SweepChunk +{ +public: + CBCSweepChunkTD(DiscreteOrdinatesProblem& problem, LBSGroupset& groupset); + ~CBCSweepChunkTD() override = default; + + void SetAngleSet(AngleSet& angle_set) override; + void SetCell(const Cell* cell_ptr, AngleSet& angle_set) override; + void Sweep(AngleSet& angle_set) override; + bool IsTimeDependent() const override { return true; } + +protected: + using SweepFunc = void (CBCSweepChunkTD::*)(AngleSet&); + void Sweep_Generic(AngleSet& angle_set); + template + void Sweep_FixedN(AngleSet& angle_set); + + DiscreteOrdinatesProblem& problem_; + const std::vector& psi_old_; + unsigned int group_block_size_ = 0; + CBCSweepChunkContext ctx_; + +private: + SweepFunc sweep_impl_td_ = nullptr; +}; + +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h new file mode 100644 index 0000000000..7413c44ab7 --- /dev/null +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h @@ -0,0 +1,308 @@ +// SPDX-FileCopyrightText: 2026 The OpenSn Authors +// SPDX-License-Identifier: MIT + +#pragma once + +#include "framework/data_types/dense_matrix.h" +#include "framework/data_types/vector.h" +#include "framework/mesh/cell/cell.h" +#include "framework/math/spatial_discretization/spatial_discretization.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h" 
+#include + +namespace opensn +{ + +struct CBCSweepData +{ + const SpatialDiscretization& discretization; + const std::vector& source_moments; + const LBSGroupset& groupset; + const BlockID2XSMap& xs; + unsigned int num_moments; + unsigned int max_num_cell_dofs; + bool save_angular_flux; + size_t groupset_angle_group_stride; + size_t groupset_group_stride; + std::vector& destination_phi; + std::vector& destination_psi; + bool surface_source_active; + bool include_rhs_time_term; + DiscreteOrdinatesProblem& problem; + const std::vector* psi_old; + unsigned int group_block_size; + + CBC_FLUDS& fluds; + const Cell& cell; + std::uint32_t cell_local_id; + const CellMapping& cell_mapping; + CellLBSView& cell_transport_view; + size_t cell_num_faces; + size_t cell_num_nodes; + + size_t gs_size; + unsigned int gs_gi; + size_t num_angles_in_as; + unsigned int group_stride; + size_t group_angle_stride; + + const DenseMatrix& G; + const DenseMatrix& M; + const std::vector>& M_surf; + const std::vector>& IntS_shapeI; +}; + +template +inline void +CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set) +{ + const auto& groupset = data.groupset; + const auto& m2d_op = groupset.quadrature->GetMomentToDiscreteOperator(); + const auto& d2m_op = groupset.quadrature->GetDiscreteToMomentOperator(); + + DenseMatrix Amat(data.max_num_cell_dofs, data.max_num_cell_dofs); + DenseMatrix Atemp(data.max_num_cell_dofs, data.max_num_cell_dofs); + std::vector> b(data.gs_size, Vector(data.max_num_cell_dofs)); + std::vector source(data.max_num_cell_dofs); + std::vector face_mu_values(data.cell_num_faces); + + const auto& face_orientations = angle_set.GetSPDS().GetCellFaceOrientations()[data.cell_local_id]; + const auto& sigma_t = data.xs.at(data.cell.block_id)->GetSigmaTotal(); + + std::vector tau_gsg; + if constexpr (time_dependent) + { + const auto& inv_velg = data.xs.at(data.cell.block_id)->GetInverseVelocity(); + const double theta = data.problem.GetTheta(); + const double inv_theta = 1.0 / 
theta; + const double dt = data.problem.GetTimeStep(); + const double inv_dt = 1.0 / dt; + + tau_gsg.assign(data.gs_size, 0.0); + for (size_t gsg = 0; gsg < data.gs_size; ++gsg) + tau_gsg[gsg] = inv_velg[data.gs_gi + gsg] * inv_theta * inv_dt; + } + + const double* psi_old = + (time_dependent and data.psi_old) + ? &(*data.psi_old)[data.discretization.MapDOFLocal(data.cell, 0, groupset.psi_uk_man_, 0, 0)] + : nullptr; + + const auto& as_angle_indices = angle_set.GetAngleIndices(); + + for (size_t as_ss_idx = 0; as_ss_idx < data.num_angles_in_as; ++as_ss_idx) + { + const auto direction_num = as_angle_indices[as_ss_idx]; + const auto omega = groupset.quadrature->omegas[direction_num]; + const auto wt = groupset.quadrature->weights[direction_num]; + + for (size_t gsg = 0; gsg < data.gs_size; ++gsg) + for (size_t i = 0; i < data.cell_num_nodes; ++i) + b[gsg](i) = 0.0; + + for (size_t i = 0; i < data.cell_num_nodes; ++i) + for (size_t j = 0; j < data.cell_num_nodes; ++j) + Amat(i, j) = omega.Dot(data.G(i, j)); + + for (size_t f = 0; f < data.cell_num_faces; ++f) + face_mu_values[f] = omega.Dot(data.cell.faces[f].normal); + + for (size_t f = 0; f < data.cell_num_faces; ++f) + { + if (face_orientations[f] != FaceOrientation::INCOMING) + continue; + + const auto& face = data.cell.faces[f]; + const bool is_local_face = data.cell_transport_view.IsFaceLocal(f); + const bool is_boundary_face = not face.has_neighbor; + const auto* face_nodal_mapping = + &data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f); + + const size_t num_face_nodes = data.cell_mapping.GetNumFaceNodes(f); + for (size_t fi = 0; fi < num_face_nodes; ++fi) + { + const int i = data.cell_mapping.MapFaceNode(f, fi); + + for (size_t fj = 0; fj < num_face_nodes; ++fj) + { + const int j = data.cell_mapping.MapFaceNode(f, fj); + const double mu_Nij = -face_mu_values[f] * data.M_surf[f](i, j); + Amat(i, j) += mu_Nij; + + const double* psi = nullptr; + + if (is_local_face) + psi = 
data.fluds.UpwindPsi(*data.cell_transport_view.FaceNeighbor(f), + face_nodal_mapping->cell_node_mapping_[fj], + as_ss_idx); + else if (not is_boundary_face) + psi = data.fluds.NLUpwindPsi( + data.cell.global_id, f, face_nodal_mapping->face_node_mapping_[fj], as_ss_idx); + else + psi = angle_set.PsiBoundary(face.neighbor_id, + direction_num, + data.cell_local_id, + f, + fj, + data.gs_gi, + data.surface_source_active); + + if (psi != nullptr) + for (size_t gsg = 0; gsg < data.gs_size; ++gsg) + b[gsg](i) += psi[gsg] * mu_Nij; + } + } + } + + const auto dir_moment_offset = + static_cast(direction_num) * static_cast(data.num_moments); + const double* m2d_row = m2d_op.data() + dir_moment_offset; + const double* d2m_row = d2m_op.data() + dir_moment_offset; + + for (unsigned int gsg = 0; gsg < data.gs_size; ++gsg) + { + double sigma_tg = sigma_t[data.gs_gi + gsg]; + if constexpr (time_dependent) + sigma_tg += tau_gsg[gsg]; + + for (size_t i = 0; i < data.cell_num_nodes; ++i) + { + double temp_src = 0.0; + for (unsigned int m = 0; m < data.num_moments; ++m) + { + const auto ir = data.cell_transport_view.MapDOF(i, m, data.gs_gi + gsg); + temp_src += m2d_row[m] * data.source_moments[ir]; + } + + if constexpr (time_dependent) + { + const size_t imap = + i * data.groupset_angle_group_stride + direction_num * data.groupset_group_stride; + if (data.include_rhs_time_term and psi_old) + temp_src += tau_gsg[gsg] * psi_old[imap + gsg]; + } + + source[i] = temp_src; + } + + for (size_t i = 0; i < data.cell_num_nodes; ++i) + { + double temp = 0.0; + for (size_t j = 0; j < data.cell_num_nodes; ++j) + { + const double Mij = data.M(i, j); + Atemp(i, j) = Amat(i, j) + Mij * sigma_tg; + temp += Mij * source[j]; + } + b[gsg](i) += temp; + } + + GaussElimination(Atemp, b[gsg], static_cast(data.cell_num_nodes)); + } + + for (unsigned int m = 0; m < data.num_moments; ++m) + { + const auto wn_d2m = d2m_row[m]; + for (size_t i = 0; i < data.cell_num_nodes; ++i) + { + const auto ir = 
data.cell_transport_view.MapDOF(i, m, data.gs_gi); + for (size_t gsg = 0; gsg < data.gs_size; ++gsg) + data.destination_phi[ir + gsg] += wn_d2m * b[gsg](i); + } + } + + if (data.save_angular_flux) + { + double* psi_new = &data.destination_psi[data.discretization.MapDOFLocal( + data.cell, 0, groupset.psi_uk_man_, 0, 0)]; + + double theta = 1.0; + double inv_theta = 1.0; + if constexpr (time_dependent) + { + theta = data.problem.GetTheta(); + inv_theta = 1.0 / theta; + } + + for (size_t i = 0; i < data.cell_num_nodes; ++i) + { + const size_t imap = + i * data.groupset_angle_group_stride + direction_num * data.groupset_group_stride; + + for (size_t gsg = 0; gsg < data.gs_size; ++gsg) + { + const double psi_sol = b[gsg](i); + if constexpr (time_dependent) + { + const double psi_old_val = psi_old ? psi_old[imap + gsg] : 0.0; + psi_new[imap + gsg] = inv_theta * (psi_sol + (theta - 1.0) * psi_old_val); + } + else + psi_new[imap + gsg] = psi_sol; + } + } + } + + for (size_t f = 0; f < data.cell_num_faces; ++f) + { + if (face_orientations[f] != FaceOrientation::OUTGOING) + continue; + + const auto& face = data.cell.faces[f]; + const bool is_local_face = data.cell_transport_view.IsFaceLocal(f); + const bool is_boundary_face = not face.has_neighbor; + const bool is_reflecting_boundary_face = + (is_boundary_face and angle_set.GetBoundaries()[face.neighbor_id]->IsReflecting()); + const auto& IntF_shapeI = data.IntS_shapeI[f]; + + const int locality = data.cell_transport_view.FaceLocality(f); + const size_t num_face_nodes = data.cell_mapping.GetNumFaceNodes(f); + const auto& face_nodal_mapping = + data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f); + std::vector* psi_nonlocal_outgoing = nullptr; + + if (not is_boundary_face and not is_local_face) + { + auto* async_comm = dynamic_cast(angle_set.GetCommunicator()); + const size_t data_size_for_msg = num_face_nodes * data.group_angle_stride; + psi_nonlocal_outgoing = + 
&async_comm->InitGetDownwindMessageData(locality, + face.neighbor_id, + face_nodal_mapping.associated_face_, + angle_set.GetID(), + data_size_for_msg); + } + + for (size_t fi = 0; fi < num_face_nodes; ++fi) + { + const int i = data.cell_mapping.MapFaceNode(f, fi); + + if (is_boundary_face) + { + for (size_t gsg = 0; gsg < data.gs_size; ++gsg) + data.cell_transport_view.AddOutflow( + f, data.gs_gi + gsg, wt * face_mu_values[f] * b[gsg](i) * IntF_shapeI(i)); + } + + double* psi = nullptr; + if (is_local_face) + psi = data.fluds.OutgoingPsi(data.cell, i, as_ss_idx); + else if (not is_boundary_face) + psi = data.fluds.NLOutgoingPsi(psi_nonlocal_outgoing, fi, as_ss_idx); + else if (is_reflecting_boundary_face) + psi = angle_set.PsiReflected(face.neighbor_id, direction_num, data.cell_local_id, f, fi); + + if (psi != nullptr) + for (size_t gsg = 0; gsg < data.gs_size; ++gsg) + psi[gsg] = b[gsg](i); + } + } + } +} + +template +void CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set); + +} // namespace opensn From 26def95e381c7d237ed4f33a5b71a4158b57fe90 Mon Sep 17 00:00:00 2001 From: Eappen Nelluvelil Date: Tue, 31 Mar 2026 23:31:38 -0500 Subject: [PATCH 3/6] Transport transient tests for CBC --- .../transport_transient/tests.json | 568 +++++++++++++++++- .../transient_init_leakage_pulse_decay_cbc.py | 102 ++++ .../transient_init_precursor_decay_cbc.py | 131 ++++ .../transient_init_steady_state_source_cbc.py | 100 +++ ...ransient_init_time_dependent_source_cbc.py | 102 ++++ ...transient_init_zero_absorber_source_cbc.py | 95 +++ ...eigen_1d_delayed_fission_prod_count_cbc.py | 88 +++ ...sient_keigen_1d_delayed_prke_vs_stk_cbc.py | 171 ++++++ .../transient_keigen_1d_prompt_step_cbc.py | 106 ++++ ...t_keigen_1d_theta_precursor_scaling_cbc.py | 102 ++++ ...transient_keigen_2d_2g_delayed_step_cbc.py | 110 ++++ ...gen_2d_2g_prompt_combine_velocities_cbc.py | 109 ++++ ...nt_keigen_2d_delayed_prke_vs_stk_2p_cbc.py | 209 +++++++ .../transient_keigen_2d_prompt_ramp_xs_cbc.py | 
106 ++++ ...ansient_keigen_3d_2g_prompt_step_xs_cbc.py | 107 ++++ ...eigen_3d_6g_delayed_step_nu_sigma_f_cbc.py | 107 ++++ ...ransient_keigen_3d_delayed_analytic_cbc.py | 157 +++++ ...3d_delayed_prke_vs_stk_2p_callbacks_cbc.py | 228 +++++++ ...nt_keigen_3d_delayed_prke_vs_stk_2p_cbc.py | 215 +++++++ ...transient_keigen_3d_delayed_ramp_xs_cbc.py | 205 +++++++ ...gen_3d_delayed_stiff_dt_sensitivity_cbc.py | 134 +++++ ...transient_keigen_3d_prompt_analytic_cbc.py | 158 +++++ ...ansient_keigen_3d_prompt_bc_leakage_cbc.py | 114 ++++ ...ient_keigen_3d_prompt_mid_step_swap_cbc.py | 112 ++++ .../transient_zero_3d_1g_pulse_inf_med_cbc.py | 124 ++++ ...ent_zero_3d_1g_ramp_source_analytic_cbc.py | 136 +++++ .../transient_zero_3d_1g_v0.5_inf_med_cbc.py | 102 ++++ .../transient_zero_3d_1g_v1_inf_med_cbc.py | 107 ++++ ...ransient_zero_3d_1g_v1_inf_med_swap_cbc.py | 131 ++++ .../transient_zero_3d_1g_v2_inf_med_cbc.py | 96 +++ ...ient_zero_3d_2g_inf_med_downscatter_cbc.py | 132 ++++ ...zero_3d_2g_inf_med_downscatter_swap_cbc.py | 148 +++++ ...transient_zero_3d_2g_inf_med_pydrvr_cbc.py | 163 +++++ ...t_zero_3d_2g_inf_med_pydrvr_ramp_dt_cbc.py | 169 ++++++ .../transient_zero_3d_openmc_xs_cbc.py | 107 ++++ 35 files changed, 5033 insertions(+), 18 deletions(-) create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_leakage_pulse_decay_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_precursor_decay_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_steady_state_source_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_time_dependent_source_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_zero_absorber_source_cbc.py create mode 100644 
test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_delayed_fission_prod_count_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_delayed_prke_vs_stk_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_prompt_step_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_theta_precursor_scaling_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_2g_delayed_step_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_2g_prompt_combine_velocities_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_delayed_prke_vs_stk_2p_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_prompt_ramp_xs_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_2g_prompt_step_xs_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_6g_delayed_step_nu_sigma_f_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_analytic_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_prke_vs_stk_2p_callbacks_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_prke_vs_stk_2p_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_ramp_xs_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_stiff_dt_sensitivity_cbc.py create mode 100644 
test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_analytic_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_bc_leakage_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_mid_step_swap_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_pulse_inf_med_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_ramp_source_analytic_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v0.5_inf_med_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v1_inf_med_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v1_inf_med_swap_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v2_inf_med_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_downscatter_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_downscatter_swap_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_pydrvr_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_pydrvr_ramp_dt_cbc.py create mode 100644 test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_openmc_xs_cbc.py diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/tests.json b/test/python/modules/linear_boltzmann_solvers/transport_transient/tests.json index 92a5ff5ea8..9514ef6b63 100644 --- 
a/test/python/modules/linear_boltzmann_solvers/transport_transient/tests.json +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/tests.json @@ -106,7 +106,7 @@ "type": "KeyValuePair", "key": "Max phi(3s) = ", "goldvalue": 5.104477, - "abs_tol": 0.000001 + "abs_tol": 1e-06 } ] }, @@ -119,7 +119,7 @@ "type": "KeyValuePair", "key": "Max phi(1s) = ", "goldvalue": 2.330956, - "abs_tol": 0.000001 + "abs_tol": 1e-06 } ] }, @@ -132,13 +132,13 @@ "type": "KeyValuePair", "key": "Max phi0(2s) = ", "goldvalue": 3.658193, - "abs_tol": 0.000001 + "abs_tol": 1e-06 }, { "type": "KeyValuePair", "key": "Max phi1(2s) = ", "goldvalue": 1.027354, - "abs_tol": 0.000001 + "abs_tol": 1e-06 } ] }, @@ -151,7 +151,7 @@ "type": "KeyValuePair", "key": "Max phi(2s) = ", "goldvalue": 0.889725, - "abs_tol": 0.000001 + "abs_tol": 1e-06 } ] }, @@ -164,7 +164,7 @@ "type": "KeyValuePair", "key": "Max phi(1s) = ", "goldvalue": 1.736077, - "abs_tol": 0.000001 + "abs_tol": 1e-06 } ] }, @@ -177,7 +177,7 @@ "type": "KeyValuePair", "key": "Max phi(1s) = ", "goldvalue": 3.184793, - "abs_tol": 0.000001 + "abs_tol": 1e-06 } ] }, @@ -190,13 +190,13 @@ "type": "KeyValuePair", "key": "Max phi0(1s) = ", "goldvalue": 1.978816, - "abs_tol": 0.000001 + "abs_tol": 1e-06 }, { "type": "KeyValuePair", "key": "Max phi1(1s) = ", "goldvalue": 0.394171, - "abs_tol": 0.000001 + "abs_tol": 1e-06 } ] }, @@ -209,13 +209,32 @@ "type": "KeyValuePair", "key": "Max phi0 = ", "goldvalue": 3.672537, - "abs_tol": 0.000001 + "abs_tol": 1e-06 }, { "type": "KeyValuePair", "key": "Max phi1 = ", "goldvalue": 1.035548, - "abs_tol": 0.000001 + "abs_tol": 1e-06 + } + ] + }, + { + "file": "transient_zero_3d_2g_inf_med_pydrvr_cbc.py", + "comment": "3D unstructured mesh, 2 group, downscatter, zero-init transient, Python time step loop (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "KeyValuePair", + "key": "Max phi0 = ", + "goldvalue": 3.672537, + "abs_tol": 5e-06 + }, + { + "type": "KeyValuePair", + "key": "Max phi1 = 
", + "goldvalue": 1.035548, + "abs_tol": 1e-06 } ] }, @@ -228,13 +247,13 @@ "type": "KeyValuePair", "key": "Max phi0 = ", "goldvalue": 3.674989, - "abs_tol": 0.000001 + "abs_tol": 1e-06 }, { "type": "KeyValuePair", "key": "Max phi1 = ", "goldvalue": 1.035541, - "abs_tol": 0.000001 + "abs_tol": 1e-06 } ] }, @@ -251,6 +270,19 @@ } ] }, + { + "file": "transient_zero_3d_openmc_xs_cbc.py", + "comment": "3D orthogonal mesh, 30 group, zero-init transient, OpenMC cross sections (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "KeyValuePair", + "key": "Max phi(0.1s) = ", + "goldvalue": 51.057722, + "abs_tol": 0.0001 + } + ] + }, { "file": "transient_keigen_3d_prompt_analytic.py", "comment": "3D prompt-only analytic exponential check", @@ -387,7 +419,7 @@ "key": "FP_RATIO_ACTUAL", "wordnum": 1, "gold": 1.2, - "abs_tol": 0.000001 + "abs_tol": 1e-06 }, { "type": "FloatCompare", @@ -408,7 +440,7 @@ "key": "FP_RATIO_ACTUAL", "wordnum": 1, "gold": 1.2, - "abs_tol": 0.000001 + "abs_tol": 1e-06 }, { "type": "FloatCompare", @@ -429,7 +461,7 @@ "key": "FP_RATIO_ACTUAL", "wordnum": 1, "gold": 2.2, - "abs_tol": 0.000001 + "abs_tol": 1e-06 }, { "type": "FloatCompare", @@ -492,7 +524,7 @@ "key": "FP_RATIO_ACTUAL", "wordnum": 1, "gold": 1.2, - "abs_tol": 0.000001 + "abs_tol": 1e-06 }, { "type": "FloatCompare", @@ -513,7 +545,7 @@ "key": "FP_RATIO_ACTUAL", "wordnum": 1, "gold": 1.2, - "abs_tol": 0.000001 + "abs_tol": 1e-06 } ] }, @@ -572,5 +604,505 @@ "abs_tol": 1e-12 } ] + }, + { + "file": "transient_init_leakage_pulse_decay_cbc.py", + "comment": "Leakage sanity: vacuum boundaries with source removal (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "LEAKAGE_DECAY_PASS", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_init_precursor_decay_cbc.py", + "comment": "Delayed precursor decay check with source removal (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "PRECURSOR_DECAY_PASS", + 
"wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_init_steady_state_source_cbc.py", + "comment": "Steady-state source init: 1D absorber consistency (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "STEADY_INIT_PASS", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_init_time_dependent_source_cbc.py", + "comment": "Time-dependent source init: 1D absorber step consistency (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "TD_INIT_PASS", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_init_zero_absorber_source_cbc.py", + "comment": "Zero-init transient: 1D absorber with constant source (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "ZERO_INIT_PASS", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_keigen_1d_delayed_fission_prod_count_cbc.py", + "comment": "1D delayed fission production invariant to precursor count (1p vs 2p) (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "K_PRECURSOR_FPROD_PASS", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_keigen_1d_delayed_prke_vs_stk_cbc.py", + "comment": "1D delayed homogeneous step: PRKE vs space-time kinetics (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "PRKE_STK_PASS", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_keigen_1d_prompt_step_cbc.py", + "comment": "1D prompt-only xs step (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "FP_RATIO_ACTUAL", + "wordnum": 1, + "gold": 1.2, + "abs_tol": 1e-06 + } + ] + }, + { + "file": "transient_keigen_1d_theta_precursor_scaling_cbc.py", + "comment": "1D delayed fission source scales with theta (TransientSourceFunction check) (CBC)", + "num_procs": 4, + "checks": [ + { + "type": 
"FloatCompare", + "key": "DELAYED_THETA_RATIO", + "wordnum": 1, + "gold": 2, + "abs_tol": 0.01 + } + ] + }, + { + "file": "transient_keigen_2d_2g_delayed_step_cbc.py", + "comment": "2D 2g delayed xs step (ratio) (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "FP_RATIO_ACTUAL", + "wordnum": 1, + "gold": 1.2, + "abs_tol": 1e-06 + }, + { + "type": "FloatCompare", + "key": "TRANSIENT_OK", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_keigen_2d_2g_prompt_combine_velocities_cbc.py", + "comment": "2D 2g prompt combine xs with group-wise velocities (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "FP_RATIO_ACTUAL", + "wordnum": 1, + "gold": 2.2, + "abs_tol": 1e-06 + }, + { + "type": "FloatCompare", + "key": "TRANSIENT_OK", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_keigen_2d_delayed_prke_vs_stk_2p_cbc.py", + "comment": "2D delayed homogeneous step: PRKE vs space-time kinetics (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "PRKE_STK_PASS", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_keigen_2d_prompt_ramp_xs_cbc.py", + "comment": "2D prompt-only ramp xs (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "TRANSIENT_OK", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_keigen_3d_2g_prompt_step_xs_cbc.py", + "comment": "3D 2g prompt-only step xs swap (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "FP_RATIO_ACTUAL", + "wordnum": 1, + "gold": 1.2, + "abs_tol": 1e-06 + }, + { + "type": "FloatCompare", + "key": "TRANSIENT_OK", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_keigen_3d_6g_delayed_step_nu_sigma_f_cbc.py", + "comment": "3D 6g, 2-precursor delayed step nu_sigma_f (CBC)", + "num_procs": 4, + "checks": [ + { + "type": 
"FloatCompare", + "key": "FP_RATIO_ACTUAL", + "wordnum": 1, + "gold": 1.2, + "abs_tol": 1e-06 + }, + { + "type": "FloatCompare", + "key": "TRANSIENT_OK", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_keigen_3d_delayed_analytic_cbc.py", + "comment": "3D delayed analytic point-kinetics check (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "ANALYTIC_PASS", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_keigen_3d_delayed_prke_vs_stk_2p_callbacks_cbc.py", + "comment": "3D delayed homogeneous step: PRKE vs space-time kinetics (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "PRKE_STK_PASS", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_keigen_3d_delayed_prke_vs_stk_2p_cbc.py", + "comment": "3D delayed homogeneous step: PRKE vs space-time kinetics (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "PRKE_STK_PASS", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_keigen_3d_delayed_ramp_xs_cbc.py", + "comment": "3D delayed ramp xs with point-kinetics reference (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "TRANSIENT_OK", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_keigen_3d_delayed_stiff_dt_sensitivity_cbc.py", + "comment": "3D delayed stiff precursor dt (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "TRANSIENT_OK", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_keigen_3d_prompt_analytic_cbc.py", + "comment": "3D prompt-only analytic exponential check (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "ANALYTIC_PASS", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_keigen_3d_prompt_bc_leakage_cbc.py", + "comment": "3D prompt 
leakage vs reflecting boundary (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "TRANSIENT_OK", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_keigen_3d_prompt_mid_step_swap_cbc.py", + "comment": "3D prompt mid-step XS swap (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "TIME_AT_SWAP", + "wordnum": 1, + "gold": 0.07, + "abs_tol": 1e-10 + }, + { + "type": "FloatCompare", + "key": "FP_RATIO_AT_SWAP", + "wordnum": 1, + "gold": 1.2, + "abs_tol": 0.0001 + }, + { + "type": "FloatCompare", + "key": "TRANSIENT_OK", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_zero_3d_1g_pulse_inf_med_cbc.py", + "comment": "3D unstructured mesh, 1 group, vel = 1.0, zero-init transient pulse (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "KeyValuePair", + "key": "Max phi(3s) = ", + "goldvalue": 5.104477, + "abs_tol": 5e-06 + } + ] + }, + { + "file": "transient_zero_3d_1g_ramp_source_analytic_cbc.py", + "comment": "Zero-mode transient with ramped source and analytic check (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "FloatCompare", + "key": "RAMP_SOURCE_ANALYTIC_PASS", + "wordnum": 1, + "gold": 1, + "abs_tol": 1e-12 + } + ] + }, + { + "file": "transient_zero_3d_1g_v0.5_inf_med_cbc.py", + "comment": "3D unstructured mesh, 1 group, vel = 0.5, zero-init transient (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "KeyValuePair", + "key": "Max phi(2s) = ", + "goldvalue": 0.889725, + "abs_tol": 5e-06 + } + ] + }, + { + "file": "transient_zero_3d_1g_v1_inf_med_cbc.py", + "comment": "3D unstructured mesh, 1 group, vel = 1.0, zero-init transient (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "KeyValuePair", + "key": "Max phi(1s) = ", + "goldvalue": 2.330956, + "abs_tol": 5e-06 + } + ] + }, + { + "file": "transient_zero_3d_1g_v1_inf_med_swap_cbc.py", + "comment": "3D unstructured mesh, 1 group, vel = 1.0, zero-init transient, swap xs at 
0.5s (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "KeyValuePair", + "key": "Max phi(1s) = ", + "goldvalue": 1.736077, + "abs_tol": 5e-06 + } + ] + }, + { + "file": "transient_zero_3d_1g_v2_inf_med_cbc.py", + "comment": "3D unstructured mesh, 1 group, vel = 2.0, zero-init transient (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "KeyValuePair", + "key": "Max phi(1s) = ", + "goldvalue": 3.184793, + "abs_tol": 5e-06 + } + ] + }, + { + "file": "transient_zero_3d_2g_inf_med_downscatter_cbc.py", + "comment": "3D unstructured mesh, 2 group, downscatter, zero-init transient (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "KeyValuePair", + "key": "Max phi0(2s) = ", + "goldvalue": 3.658193, + "abs_tol": 5e-06 + }, + { + "type": "KeyValuePair", + "key": "Max phi1(2s) = ", + "goldvalue": 1.027354, + "abs_tol": 1e-06 + } + ] + }, + { + "file": "transient_zero_3d_2g_inf_med_downscatter_swap_cbc.py", + "comment": "3D unstructured mesh, 2 group, downscatter, zero-init transient, swap xs at 0.5s (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "KeyValuePair", + "key": "Max phi0(1s) = ", + "goldvalue": 1.978816, + "abs_tol": 1e-06 + }, + { + "type": "KeyValuePair", + "key": "Max phi1(1s) = ", + "goldvalue": 0.394171, + "abs_tol": 1e-06 + } + ] + }, + { + "file": "transient_zero_3d_2g_inf_med_pydrvr_ramp_dt_cbc.py", + "comment": "3D unstructured mesh, 2 group, downscatter, zero-init transient, Python time step loop, ramp dt (CBC)", + "num_procs": 4, + "checks": [ + { + "type": "KeyValuePair", + "key": "Max phi0 = ", + "goldvalue": 3.674989, + "abs_tol": 5e-06 + }, + { + "type": "KeyValuePair", + "key": "Max phi1 = ", + "goldvalue": 1.035541, + "abs_tol": 1e-06 + } + ] } ] diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_leakage_pulse_decay_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_leakage_pulse_decay_cbc.py new file mode 100644 index 0000000000..09b1d27ab7 --- 
/dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_leakage_pulse_decay_cbc.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Leakage sanity: 1D vacuum boundaries with source removal. + +Initialize from a steady-state source solve, then remove the source and +advance one transient step. The scalar flux should decrease with leakage. +LEAKAGE_DECAY_PASS is 1 if phi decreases and remains non-negative. +""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.mesh import OrthogonalMeshGenerator + from pyopensn.xs import MultiGroupXS + from pyopensn.source import VolumetricSource + from pyopensn.aquad import GLProductQuadrature1DSlab + from pyopensn.solver import DiscreteOrdinatesProblem, SteadyStateSourceSolver, TransientSolver + from pyopensn.fieldfunc import FieldFunctionInterpolationVolume + from pyopensn.logvol import RPPLogicalVolume + + +def max_phi(phys): + fflist = phys.GetScalarFluxFieldFunction() + monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True) + field_interp = FieldFunctionInterpolationVolume() + field_interp.SetOperationType("max") + field_interp.SetLogicalVolume(monitor_volume) + field_interp.AddFieldFunction(fflist[0]) + field_interp.Execute() + return field_interp.GetValue() + + +if __name__ == "__main__": + dx = 2.0 / 40 + nodes = [i * dx for i in range(40 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + sigma_t = 1.0 + Q = 2.0 + dt = 0.05 + + xs = MultiGroupXS() + xs.CreateSimpleOneGroup(sigma_t, 0.0, 1.0) + + source = VolumetricSource(block_ids=[0], group_strength=[Q], start_time=0.0, end_time=10.0) + + pquad = GLProductQuadrature1DSlab(n_polar=4, scattering_order=0) + + phys = 
DiscreteOrdinatesProblem( + mesh=grid, + num_groups=1, + groupsets=[ + { + "groups_from_to": (0, 0), + "angular_quadrature": pquad, + "inner_linear_method": "petsc_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + } + ], + xs_map=[{"block_ids": [0], "xs": xs}], + volumetric_sources=[source], + boundary_conditions=[ + {"name": "zmin", "type": "vacuum"}, + {"name": "zmax", "type": "vacuum"}, + ], + options={ + "save_angular_flux": True, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + steady = SteadyStateSourceSolver(problem=phys) + steady.Initialize() + steady.Execute() + + phi0 = max_phi(phys) + + phys.SetVolumetricSources(clear_volumetric_sources=True) + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, dt=dt, theta=1.0, stop_time=dt, initial_state="existing") + solver.Initialize() + solver.Execute() + + phi1 = max_phi(phys) + pass_flag = 1 if (phi1 >= 0.0 and phi1 < phi0) else 0 + + if rank == 0: + print(f"LEAKAGE_DECAY_PASS {pass_flag}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_precursor_decay_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_precursor_decay_cbc.py new file mode 100644 index 0000000000..04b7e41a49 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_precursor_decay_cbc.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Delayed precursor decay: 1D delayed system with source removal. + +Compute a steady-state with precursors, remove the external source, and +advance one transient step. The flux ratio should roughly follow +exp(-lambda*dt) for the single precursor group. PRECURSOR_DECAY_PASS is 1 +if the ratio matches within 20%. 
+""" + +import math +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.mesh import OrthogonalMeshGenerator + from pyopensn.xs import MultiGroupXS + from pyopensn.source import VolumetricSource + from pyopensn.aquad import GLProductQuadrature1DSlab + from pyopensn.solver import DiscreteOrdinatesProblem, SteadyStateSourceSolver, TransientSolver + from pyopensn.fieldfunc import FieldFunctionInterpolationVolume + from pyopensn.logvol import RPPLogicalVolume + + +def read_precursor_value(path, block_name): + begin = f"{block_name}_BEGIN" + end = f"{block_name}_END" + in_block = False + with open(path, "r", encoding="utf-8") as handle: + for line in handle: + line = line.strip() + if not line or line.startswith("#"): + continue + if line == begin: + in_block = True + continue + if line == end: + in_block = False + continue + if in_block: + parts = line.split() + if len(parts) >= 2: + return float(parts[1]) + raise RuntimeError(f"Failed to find {block_name} in {path}") + + +def max_phi(phys): + fflist = phys.GetScalarFluxFieldFunction() + monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True) + field_interp = FieldFunctionInterpolationVolume() + field_interp.SetOperationType("max") + field_interp.SetLogicalVolume(monitor_volume) + field_interp.AddFieldFunction(fflist[0]) + field_interp.Execute() + return field_interp.GetValue() + + +if __name__ == "__main__": + dx = 1.0 / 10 + nodes = [i * dx for i in range(10 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + dt = 0.05 + Q = 0.5 + + xs_path = os.path.join(os.path.dirname(__file__), "xs1g_delayed_crit_1p.cxs") + xs = MultiGroupXS() + xs.LoadFromOpenSn(xs_path) + + source = VolumetricSource(block_ids=[0], group_strength=[Q], start_time=0.0, 
end_time=10.0) + + pquad = GLProductQuadrature1DSlab(n_polar=4, scattering_order=0) + + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=1, + groupsets=[ + { + "groups_from_to": (0, 0), + "angular_quadrature": pquad, + "inner_linear_method": "petsc_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + } + ], + xs_map=[{"block_ids": [0], "xs": xs}], + volumetric_sources=[source], + boundary_conditions=[ + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "use_precursors": True, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + steady = SteadyStateSourceSolver(problem=phys) + steady.Initialize() + steady.Execute() + + phi0 = max_phi(phys) + + phys.SetVolumetricSources(clear_volumetric_sources=True) + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, dt=dt, theta=1.0, stop_time=dt, initial_state="existing") + solver.Initialize() + solver.Execute() + + phi1 = max_phi(phys) + ratio = phi1 / phi0 if phi0 > 0.0 else 0.0 + + lam = read_precursor_value(xs_path, "PRECURSOR_DECAY_CONSTANTS") + expected = math.exp(-lam * dt) + pass_flag = 1 if abs(ratio - expected) < 0.2 else 0 + + if rank == 0: + print(f"PRECURSOR_DECAY_PASS {pass_flag}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_steady_state_source_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_steady_state_source_cbc.py new file mode 100644 index 0000000000..d5c620936d --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_steady_state_source_cbc.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Steady-state source initialization: 1D pure absorber with constant source. + +Initialize with a steady-state source solve, then advance one BE step with +TransientSolver. 
The flux should remain at phi_ss = Q/sigma_t. +STEADY_INIT_PASS is 1 if the transient step preserves the steady state. +""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.mesh import OrthogonalMeshGenerator + from pyopensn.xs import MultiGroupXS + from pyopensn.source import VolumetricSource + from pyopensn.aquad import GLProductQuadrature1DSlab + from pyopensn.solver import DiscreteOrdinatesProblem, SteadyStateSourceSolver, TransientSolver + from pyopensn.fieldfunc import FieldFunctionInterpolationVolume + from pyopensn.logvol import RPPLogicalVolume + + +def max_phi(phys): + fflist = phys.GetScalarFluxFieldFunction() + monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True) + field_interp = FieldFunctionInterpolationVolume() + field_interp.SetOperationType("max") + field_interp.SetLogicalVolume(monitor_volume) + field_interp.AddFieldFunction(fflist[0]) + field_interp.Execute() + return field_interp.GetValue() + + +if __name__ == "__main__": + dx = 1.0 / 10 + nodes = [i * dx for i in range(10 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + sigma_t = 1.0 + Q = 2.0 + dt = 0.1 + + xs = MultiGroupXS() + xs.CreateSimpleOneGroup(sigma_t, 0.0, 1.0) + + source = VolumetricSource(block_ids=[0], group_strength=[Q], start_time=0.0, end_time=10.0) + + pquad = GLProductQuadrature1DSlab(n_polar=4, scattering_order=0) + + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=1, + groupsets=[ + { + "groups_from_to": (0, 0), + "angular_quadrature": pquad, + "inner_linear_method": "petsc_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + } + ], + xs_map=[{"block_ids": [0], "xs": xs}], + volumetric_sources=[source], + boundary_conditions=[ + {"name": "zmin", "type": "reflecting"}, 
+ {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + steady = SteadyStateSourceSolver(problem=phys) + steady.Initialize() + steady.Execute() + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, dt=dt, theta=1.0, stop_time=dt, initial_state="existing") + solver.Initialize() + solver.Execute() + + phi_num = max_phi(phys) + phi_ss = Q / sigma_t + rel_err = abs(phi_num - phi_ss) / phi_ss + pass_flag = 1 if rel_err < 1.0e-3 else 0 + + if rank == 0: + print(f"STEADY_INIT_PASS {pass_flag}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_time_dependent_source_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_time_dependent_source_cbc.py new file mode 100644 index 0000000000..69c5add538 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_time_dependent_source_cbc.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Time-dependent source initialization: 1D pure absorber with constant source. + +Initialize with a transient step from zero state, then advance one BE step with +TransientSolver using the existing state. The next step should satisfy the analytic update +phi^{n+1} = (phi^n + dt*Q)/(1 + sigma_t*dt). +TD_INIT_PASS is 1 if the transient step matches the update within 2%. 
+""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.mesh import OrthogonalMeshGenerator + from pyopensn.xs import MultiGroupXS + from pyopensn.source import VolumetricSource + from pyopensn.aquad import GLProductQuadrature1DSlab + from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver + from pyopensn.fieldfunc import FieldFunctionInterpolationVolume + from pyopensn.logvol import RPPLogicalVolume + + +def max_phi(phys): + fflist = phys.GetScalarFluxFieldFunction() + monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True) + field_interp = FieldFunctionInterpolationVolume() + field_interp.SetOperationType("max") + field_interp.SetLogicalVolume(monitor_volume) + field_interp.AddFieldFunction(fflist[0]) + field_interp.Execute() + return field_interp.GetValue() + + +if __name__ == "__main__": + dx = 1.0 / 10 + nodes = [i * dx for i in range(10 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + sigma_t = 1.0 + Q = 1.2 + dt = 0.1 + + xs = MultiGroupXS() + xs.CreateSimpleOneGroup(sigma_t, 0.0, 1.0) + + source = VolumetricSource(block_ids=[0], group_strength=[Q], start_time=0.0, end_time=10.0) + + pquad = GLProductQuadrature1DSlab(n_polar=4, scattering_order=0) + + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=1, + time_dependent=True, + groupsets=[ + { + "groups_from_to": (0, 0), + "angular_quadrature": pquad, + "inner_linear_method": "petsc_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + } + ], + xs_map=[{"block_ids": [0], "xs": xs}], + volumetric_sources=[source], + boundary_conditions=[ + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "verbose_inner_iterations": 
False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + td_solver = TransientSolver(problem=phys, dt=dt, theta=1.0, stop_time=dt, initial_state="zero") + td_solver.Initialize() + td_solver.Execute() + + phi_n = max_phi(phys) + + solver = TransientSolver(problem=phys, dt=dt, theta=1.0, stop_time=dt, initial_state="existing") + solver.Initialize() + solver.Execute() + + phi_np1 = max_phi(phys) + phi_expected = (phi_n + dt * Q) / (1.0 + sigma_t * dt) + rel_err = abs(phi_np1 - phi_expected) / phi_expected + pass_flag = 1 if rel_err < 0.02 else 0 + + if rank == 0: + print(f"TD_INIT_PASS {pass_flag}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_zero_absorber_source_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_zero_absorber_source_cbc.py new file mode 100644 index 0000000000..7ffa74b25f --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_init_zero_absorber_source_cbc.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Zero-init transient: 1D pure absorber with a constant volumetric source. + +A single Backward Euler step from zero should match the analytic +phi = Q*dt/(1 + sigma_t*dt) for a homogeneous infinite medium. +ZERO_INIT_PASS is 1 if the numeric and analytic values agree within 2%. 
+""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.mesh import OrthogonalMeshGenerator + from pyopensn.xs import MultiGroupXS + from pyopensn.source import VolumetricSource + from pyopensn.aquad import GLProductQuadrature1DSlab + from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver + from pyopensn.fieldfunc import FieldFunctionInterpolationVolume + from pyopensn.logvol import RPPLogicalVolume + + +def max_phi(phys): + fflist = phys.GetScalarFluxFieldFunction() + monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True) + field_interp = FieldFunctionInterpolationVolume() + field_interp.SetOperationType("max") + field_interp.SetLogicalVolume(monitor_volume) + field_interp.AddFieldFunction(fflist[0]) + field_interp.Execute() + return field_interp.GetValue() + + +if __name__ == "__main__": + dx = 1.0 / 10 + nodes = [i * dx for i in range(10 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + sigma_t = 1.0 + Q = 1.5 + dt = 0.1 + + xs = MultiGroupXS() + xs.CreateSimpleOneGroup(sigma_t, 0.0, 1.0) + + source = VolumetricSource(block_ids=[0], group_strength=[Q], start_time=0.0, end_time=1.0) + + pquad = GLProductQuadrature1DSlab(n_polar=4, scattering_order=0) + + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=1, + time_dependent=True, + groupsets=[ + { + "groups_from_to": (0, 0), + "angular_quadrature": pquad, + "inner_linear_method": "petsc_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + } + ], + xs_map=[{"block_ids": [0], "xs": xs}], + volumetric_sources=[source], + boundary_conditions=[ + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "verbose_inner_iterations": 
False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + solver = TransientSolver(problem=phys, dt=dt, theta=1.0, stop_time=dt, initial_state="zero") + solver.Initialize() + solver.Execute() + + phi_num = max_phi(phys) + phi_analytic = Q * dt / (1.0 + sigma_t * dt) + rel_err = abs(phi_num - phi_analytic) / phi_analytic + pass_flag = 1 if rel_err < 0.02 else 0 + + if rank == 0: + print(f"ZERO_INIT_PASS {pass_flag}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_delayed_fission_prod_count_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_delayed_fission_prod_count_cbc.py new file mode 100644 index 0000000000..d8c1fcbdbe --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_delayed_fission_prod_count_cbc.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +1D delayed fission production consistency across precursor counts. + +Same physics (prompt + delayed) but with 1 vs 2 precursors should +yield the same steady-state total fission production. This test +fails if delayed production is over-counted per precursor. 
+""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.mesh import OrthogonalMeshGenerator + from pyopensn.xs import MultiGroupXS + from pyopensn.aquad import GLProductQuadrature1DSlab + from pyopensn.solver import DiscreteOrdinatesProblem, PowerIterationKEigenSolver + + +def solve_and_get_fission_prod(xs_path): + n_cells = 40 + L = 8.0 + dx = L / n_cells + nodes = [i * dx for i in range(n_cells + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + xs = MultiGroupXS() + xs.LoadFromOpenSn(xs_path) + + pquad = GLProductQuadrature1DSlab(n_polar=4, scattering_order=0) + + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=1, + groupsets=[ + { + "groups_from_to": (0, 0), + "angular_quadrature": pquad, + "inner_linear_method": "petsc_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + } + ], + xs_map=[{"block_ids": [0], "xs": xs}], + boundary_conditions=[ + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "use_precursors": True, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + ksolver = PowerIterationKEigenSolver(problem=phys, max_iters=200, k_tol=1.0e-12) + ksolver.Initialize() + ksolver.Execute() + + # Use the steady-state flux to compute total fission production. 
+ fprod = phys.ComputeFissionProduction("new") + return fprod + + +if __name__ == "__main__": + base_dir = os.path.dirname(__file__) + xs_1p = os.path.join(base_dir, "xs1g_delayed_crit_1p.cxs") + xs_2p = os.path.join(base_dir, "xs1g_delayed_crit_2p.cxs") + + fp_1p = solve_and_get_fission_prod(xs_1p) + fp_2p = solve_and_get_fission_prod(xs_2p) + + rel_diff = abs(fp_1p - fp_2p) / max(fp_1p, fp_2p, 1.0) + tol = 1.0e-6 + pass_flag = 1 if rel_diff < tol else 0 + + if rank == 0: + print(f"FP_1P {fp_1p:.8e} FP_2P {fp_2p:.8e}") + print(f"K_PRECURSOR_FPROD_PASS {pass_flag}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_delayed_prke_vs_stk_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_delayed_prke_vs_stk_cbc.py new file mode 100644 index 0000000000..be9fcae998 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_delayed_prke_vs_stk_cbc.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +1D delayed transient: homogeneous step xs vs PRKE. + +Validate space-time kinetics against point-reactor kinetics (PRKE) for a +homogeneous perturbation in a homogeneous system. + +1-group, 1 precursor, reflecting boundaries (infinite-medium). A step +to a supercritical xs is applied at t=0. Space-time kinetics should follow +PRKE for this homogeneous case. + +PRKE_STK_PASS is 1 if the space-time fission-production ratio matches PRKE within 2% +for t<=0.2. 
+""" + +import math +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.solver import ( + DiscreteOrdinatesProblem, + PowerIterationKEigenSolver, + TransientSolver, + ) + from pyopensn.aquad import GLProductQuadrature1DSlab + from pyopensn.xs import MultiGroupXS + from pyopensn.mesh import OrthogonalMeshGenerator + + +def read_precursor_value(path, block_name): + begin = f"{block_name}_BEGIN" + end = f"{block_name}_END" + in_block = False + with open(path, "r", encoding="utf-8") as handle: + for line in handle: + line = line.strip() + if not line or line.startswith("#"): + continue + if line == begin: + in_block = True + continue + if line == end: + in_block = False + continue + if in_block: + parts = line.split() + if len(parts) >= 2: + return float(parts[1]) + raise RuntimeError(f"Failed to find {block_name} in {path}") + + +def prke_phi_ratio(t, beta, lam, rho, Lambda): + # Point-kinetics 1-precursor step solution with phi(0)=1, C(0)=beta/(Lambda*lam) + a = (rho - beta) / Lambda - lam + b = math.sqrt(((rho - beta) / Lambda + lam) ** 2 + 4.0 * beta * lam / Lambda) + w1 = 0.5 * (a + b) + w2 = 0.5 * (a - b) + + k1 = (beta / Lambda) / (w1 + lam) + k2 = (beta / Lambda) / (w2 + lam) + + c2 = (beta / (Lambda * lam) - k1) / (k2 - k1) + c1 = 1.0 - c2 + + return c1 * math.exp(w1 * t) + c2 * math.exp(w2 * t) + + +if __name__ == "__main__": + dx = 8.0 / 40 + nodes = [i * dx for i in range(40 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + xs_crit = MultiGroupXS() + xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_crit_1p.cxs")) + + xs_super = MultiGroupXS() + xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_1p.cxs")) + + pquad = 
GLProductQuadrature1DSlab(n_polar=4, scattering_order=0) + + num_groups = 1 + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + groupsets=[ + { + "groups_from_to": (0, num_groups - 1), + "angular_quadrature": pquad, + "inner_linear_method": "petsc_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + }, + ], + xs_map=[{"block_ids": [0], "xs": xs_crit}], + boundary_conditions=[ + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "use_precursors": True, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + keigen = PowerIterationKEigenSolver(problem=phys, max_iters=200, k_tol=1.0e-10) + keigen.Initialize() + keigen.Execute() + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, initial_state="existing") + solver.Initialize() + + # Apply homogeneous perturbation at t=0 + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}]) + + # PRKE parameters from xs + sigma_a = xs_super.sigma_a[0] + nu_sigma_f = xs_super.nu_sigma_f[0] + inv_vel = xs_super.inv_velocity[0] + v = 1.0 / inv_vel + + # k_inf = nu_sigma_f / sigma_a for 1-group infinite medium + k_eff = nu_sigma_f / sigma_a + rho = (k_eff - 1.0) / k_eff + + beta = read_precursor_value( + os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_1p.cxs"), + "PRECURSOR_FRACTIONAL_YIELDS", + ) + lam = read_precursor_value( + os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_1p.cxs"), + "PRECURSOR_DECAY_CONSTANTS", + ) + Lambda = 1.0 / (v * nu_sigma_f) + + dt = 1.0e-2 + solver.SetTimeStep(dt) + solver.SetTheta(1.0) + + fp0 = phys.ComputeFissionProduction("new") + + t_end = 0.2 + rel_tol = 2.0e-2 + ok = True + step = 0 + while phys.GetTime() < t_end: + step += 1 + solver.Advance() + fp_new = phys.ComputeFissionProduction("new") + t_to = phys.GetTime() + + ratio_num = fp_new / fp0 + ratio_prke = prke_phi_ratio(t_to, beta, lam, rho, 
Lambda) + if abs(ratio_num - ratio_prke) > rel_tol * ratio_prke: + ok = False + + if rank == 0: + print(f"PRKE_STK_PASS {1 if ok else 0}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_prompt_step_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_prompt_step_cbc.py new file mode 100644 index 0000000000..156ce6b217 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_prompt_step_cbc.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +1D prompt-only transient: xs step change. + +1-group prompt-only. A step change scales macroscopic fission terms. With +reflecting BCs, the FP ratio should match the scaling. + +FP_RATIO_ACTUAL = 1.2 from scaling sigma_f by 1.2 +""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.solver import ( + DiscreteOrdinatesProblem, + PowerIterationKEigenSolver, + TransientSolver, + ) + from pyopensn.aquad import GLProductQuadrature1DSlab + from pyopensn.xs import MultiGroupXS + from pyopensn.mesh import OrthogonalMeshGenerator + +if __name__ == "__main__": + dx = 8.0 / 40 + nodes = [i * dx for i in range(40 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + xs_crit = MultiGroupXS() + xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_prompt_crit.cxs")) + + xs_dense = MultiGroupXS() + xs_dense.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_prompt_density_up.cxs")) + + pquad = GLProductQuadrature1DSlab(n_polar=4, scattering_order=0) + + num_groups = 1 + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + groupsets=[ + { + "groups_from_to": (0, 
num_groups - 1), + "angular_quadrature": pquad, + "inner_linear_method": "classic_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + }, + ], + xs_map=[{"block_ids": [0], "xs": xs_crit}], + boundary_conditions=[ + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "use_precursors": False, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + keigen = PowerIterationKEigenSolver(problem=phys) + keigen.Initialize() + keigen.Execute() + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, initial_state="existing") + solver.Initialize() + + fp_old = phys.ComputeFissionProduction("new") + + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_dense}]) + fp_new = phys.ComputeFissionProduction("new") + + ratio_expected = 1.2 + ratio_actual = fp_new / fp_old + + dt = 1.0e-2 + solver.SetTimeStep(dt) + solver.SetTheta(1.0) + + solver.Advance() + fp1 = phys.ComputeFissionProduction("new") + + solver.Advance() + fp2 = phys.ComputeFissionProduction("new") + + growth1 = fp1 / fp_new + growth2 = fp2 / fp1 + transient_ok = 1 if (growth1 > 1.0 and growth2 > 1.0) else 0 + + if rank == 0: + print(f"FP_RATIO_EXPECTED {ratio_expected:.12e}") + print(f"FP_RATIO_ACTUAL {ratio_actual:.12e}") + print(f"TRANSIENT_OK {transient_ok}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_theta_precursor_scaling_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_theta_precursor_scaling_cbc.py new file mode 100644 index 0000000000..c3f6212940 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_1d_theta_precursor_scaling_cbc.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +1D transient check: delayed fission source scales with theta*dt. 
+ +Compare delayed-vs-prompt FP ratio deltas at theta=1.0 and theta=0.5 for a +single step. The transient delayed-fission source should scale with theta, +so the delta at theta=1 should be roughly twice the delta at theta=0.5. +""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.solver import ( + DiscreteOrdinatesProblem, + PowerIterationKEigenSolver, + TransientSolver, + ) + from pyopensn.aquad import GLProductQuadrature1DSlab + from pyopensn.xs import MultiGroupXS + from pyopensn.mesh import OrthogonalMeshGenerator + + +def run_case(theta, use_precursors, xs): + dx = 8.0 / 40 + nodes = [i * dx for i in range(40 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + pquad = GLProductQuadrature1DSlab(n_polar=4, scattering_order=0) + + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=1, + groupsets=[ + { + "groups_from_to": (0, 0), + "angular_quadrature": pquad, + "inner_linear_method": "classic_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + }, + ], + xs_map=[{"block_ids": [0], "xs": xs}], + boundary_conditions=[ + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "use_precursors": use_precursors, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + keigen = PowerIterationKEigenSolver(problem=phys) + keigen.Initialize() + keigen.Execute() + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, initial_state="existing") + solver.Initialize() + solver.SetTheta(theta) + solver.SetTimeStep(1.0e-2) + + fp0 = phys.ComputeFissionProduction("new") + solver.Advance() + fp1 = phys.ComputeFissionProduction("new") + + return fp1 / fp0 
+ + +if __name__ == "__main__": + xs = MultiGroupXS() + xs.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_crit_1p.cxs")) + + ratio_delayed_t1 = run_case(1.0, True, xs) + ratio_prompt_t1 = run_case(1.0, False, xs) + ratio_delayed_t05 = run_case(0.5, True, xs) + ratio_prompt_t05 = run_case(0.5, False, xs) + + delta_t1 = ratio_delayed_t1 - ratio_prompt_t1 + delta_t05 = ratio_delayed_t05 - ratio_prompt_t05 + + ok = delta_t1 > 0.0 and delta_t05 > 0.0 and delta_t05 > 1.0e-10 + if ok: + ratio = delta_t1 / delta_t05 + else: + ratio = 0.0 + + if rank == 0: + print(f"DELAYED_THETA_RATIO {ratio:.12e}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_2g_delayed_step_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_2g_delayed_step_cbc.py new file mode 100644 index 0000000000..d7b6a2a171 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_2g_delayed_step_cbc.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +2D 2-group delayed transient xs step change. + +Confirm delayed-neutron coupling with an xs step in a multi-group +setting. + +2-group, delayed neutrons enabled. A step change scales macroscopic fission +terms. + +FP_RATIO_ACTUAL = 1.2 from scaling sigma_f by 1.2. 
+""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.solver import ( + DiscreteOrdinatesProblem, + PowerIterationKEigenSolver, + TransientSolver, + ) + from pyopensn.aquad import GLCProductQuadrature2DXY + from pyopensn.xs import MultiGroupXS + from pyopensn.mesh import OrthogonalMeshGenerator + +if __name__ == "__main__": + dx = 6.0 / 6 + nodes = [i * dx for i in range(6 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + xs_crit = MultiGroupXS() + xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs2g_delayed_crit_1p.cxs")) + + xs_dense = MultiGroupXS() + xs_dense.LoadFromOpenSn( + os.path.join(os.path.dirname(__file__), "xs2g_delayed_density_up_1p.cxs") + ) + + pquad = GLCProductQuadrature2DXY(n_polar=2, n_azimuthal=4, scattering_order=0) + + num_groups = 2 + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + groupsets=[ + { + "groups_from_to": (0, num_groups - 1), + "angular_quadrature": pquad, + "inner_linear_method": "classic_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + "gmres_restart_interval": 50, + }, + ], + xs_map=[{"block_ids": [0], "xs": xs_crit}], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "use_precursors": True, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + keigen = PowerIterationKEigenSolver(problem=phys) + keigen.Initialize() + keigen.Execute() + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, initial_state="existing") + solver.Initialize() + 
+ fp_old = phys.ComputeFissionProduction("new") + + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_dense}]) + fp_new = phys.ComputeFissionProduction("new") + + dt = 1.0e-2 + solver.SetTimeStep(dt) + solver.SetTheta(1.0) + + solver.Advance() + fp1 = phys.ComputeFissionProduction("new") + + solver.Advance() + fp2 = phys.ComputeFissionProduction("new") + + r1 = fp1 / fp_new + r2 = fp2 / fp1 + transient_ok = 1 if (fp1 > 0.0 and fp2 > 0.0 and 0.5 < r1 < 2.0 and 0.5 < r2 < 2.0) else 0 + + if rank == 0: + print(f"FP_RATIO_ACTUAL {fp_new / fp_old:.12e}") + print(f"TRANSIENT_OK {transient_ok}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_2g_prompt_combine_velocities_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_2g_prompt_combine_velocities_cbc.py new file mode 100644 index 0000000000..6ba94b268e --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_2g_prompt_combine_velocities_cbc.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +2D 2-group prompt: Combine xs with group-wise velocities. + +2-group prompt-only. Combine forms a composite xs from two macroscopic xs +inputs. + +FP_RATIO_ACTUAL = 2.2 +Combine uses density weights, so with (1.0, 1.0): +sigma_f_mix = 1.0 * sigma_f_crit + 1.0 * sigma_f_super. +Given sigma_f_super = 1.2 * sigma_f_crit, the ratio is +sigma_f_mix / sigma_f_crit = 1.0 + 1.2 = 2.2. +FP_RATIO_ACTUAL checks Combine behavior with +mixed group velocities. TRANSIENT_OK ensures the first transient step is +finite and positive. 
+""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.solver import ( + DiscreteOrdinatesProblem, + PowerIterationKEigenSolver, + TransientSolver, + ) + from pyopensn.aquad import GLCProductQuadrature2DXY + from pyopensn.xs import MultiGroupXS + from pyopensn.mesh import OrthogonalMeshGenerator + +if __name__ == "__main__": + dx = 6.0 / 6 + nodes = [i * dx for i in range(6 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + xs_crit = MultiGroupXS() + xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs2g_prompt_crit.cxs")) + + xs_super = MultiGroupXS() + xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs2g_prompt_super.cxs")) + + xs_mix = MultiGroupXS.Combine([(xs_crit, 1.0), (xs_super, 1.0)]) + + pquad = GLCProductQuadrature2DXY(n_polar=2, n_azimuthal=4, scattering_order=0) + + num_groups = 2 + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + groupsets=[ + { + "groups_from_to": (0, num_groups - 1), + "angular_quadrature": pquad, + "inner_linear_method": "petsc_gmres", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + "gmres_restart_interval": 10, + }, + ], + xs_map=[{"block_ids": [0], "xs": xs_crit}], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "use_precursors": False, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + keigen = PowerIterationKEigenSolver(problem=phys) + keigen.Initialize() + keigen.Execute() + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, 
initial_state="existing") + solver.Initialize() + + fp_old = phys.ComputeFissionProduction("new") + + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_mix}]) + fp_new = phys.ComputeFissionProduction("new") + + dt = 1.0e-2 + solver.SetTimeStep(dt) + solver.SetTheta(1.0) + + solver.Advance() + fp1 = phys.ComputeFissionProduction("new") + + transient_ok = 1 if (fp1 > 0.0) else 0 + + if rank == 0: + print(f"FP_RATIO_ACTUAL {fp_new / fp_old:.12e}") + print(f"TRANSIENT_OK {transient_ok}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_delayed_prke_vs_stk_2p_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_delayed_prke_vs_stk_2p_cbc.py new file mode 100644 index 0000000000..c56bdc5c4b --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_delayed_prke_vs_stk_2p_cbc.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +2D delayed transient: homogeneous step xs vs PRKE (2 precursors). + +1-group, 2 precursors, reflecting boundaries (infinite-medium). A step +to a supercritical xs is applied at t=0. Space-time kinetics should follow +PRKE for this homogeneous case. + +PRKE_STK_PASS is 1 if the space-time fission-production ratio matches PRKE within 2% +for t<=0.2. 
+""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.solver import ( + DiscreteOrdinatesProblem, + PowerIterationKEigenSolver, + TransientSolver, + ) + from pyopensn.aquad import GLCProductQuadrature2DXY + from pyopensn.xs import MultiGroupXS + from pyopensn.mesh import OrthogonalMeshGenerator + + +def read_block_values(path, block_name): + begin = f"{block_name}_BEGIN" + end = f"{block_name}_END" + values = [] + in_block = False + with open(path, "r", encoding="utf-8") as handle: + for line in handle: + line = line.strip() + if not line or line.startswith("#"): + continue + if line == begin: + in_block = True + continue + if line == end: + in_block = False + continue + if in_block: + parts = line.split() + if len(parts) >= 2: + values.append(float(parts[1])) + if not values: + raise RuntimeError(f"Failed to find {block_name} in {path}") + return values + + +def solve_linear(A, b): + n = len(b) + a = [row[:] for row in A] + x = b[:] + for i in range(n): + pivot = i + for r in range(i + 1, n): + if abs(a[r][i]) > abs(a[pivot][i]): + pivot = r + if abs(a[pivot][i]) < 1.0e-14: + raise RuntimeError("Singular system in PRKE solve") + if pivot != i: + a[i], a[pivot] = a[pivot], a[i] + x[i], x[pivot] = x[pivot], x[i] + piv = a[i][i] + for j in range(i, n): + a[i][j] /= piv + x[i] /= piv + for r in range(n): + if r == i: + continue + factor = a[r][i] + if factor == 0.0: + continue + for j in range(i, n): + a[r][j] -= factor * a[i][j] + x[r] -= factor * x[i] + return x + + +def prke_step(phi, C, dt, beta, lambdas, rho, Lambda): + m = len(lambdas) + beta_total = sum(beta) + size = 1 + m + A = [[0.0 for _ in range(size)] for _ in range(size)] + b = [0.0 for _ in range(size)] + + A[0][0] = 1.0 - dt * (rho - beta_total) / Lambda + for i in range(m): + A[0][1 + i] = -dt 
* lambdas[i] + + b[0] = phi + + for i in range(m): + A[1 + i][0] = -dt * (beta[i] / Lambda) + A[1 + i][1 + i] = 1.0 + dt * lambdas[i] + b[1 + i] = C[i] + + x = solve_linear(A, b) + return x[0], x[1:] + + +if __name__ == "__main__": + dx = 6.0 / 6 + nodes = [i * dx for i in range(6 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + xs_crit = MultiGroupXS() + xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_crit_2p.cxs")) + + xs_super = MultiGroupXS() + xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_2p.cxs")) + + pquad = GLCProductQuadrature2DXY(n_polar=2, n_azimuthal=4, scattering_order=0) + + num_groups = 1 + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + groupsets=[ + { + "groups_from_to": (0, num_groups - 1), + "angular_quadrature": pquad, + "inner_linear_method": "classic_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + }, + ], + xs_map=[{"block_ids": [0], "xs": xs_crit}], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "use_precursors": True, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + keigen = PowerIterationKEigenSolver(problem=phys, max_iters=200, k_tol=1.0e-10) + keigen.Initialize() + keigen.Execute() + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, initial_state="existing") + solver.Initialize() + + # Apply homogeneous perturbation at t=0 + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}]) + + sigma_a = xs_super.sigma_a[0] + nu_sigma_f = xs_super.nu_sigma_f[0] + inv_vel = xs_super.inv_velocity[0] + v = 1.0 / inv_vel + + k_eff = nu_sigma_f / sigma_a + rho = (k_eff - 1.0) / k_eff + Lambda = 1.0 / (v * 
nu_sigma_f) + + beta = read_block_values( + os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_2p.cxs"), + "PRECURSOR_FRACTIONAL_YIELDS", + ) + lambdas = read_block_values( + os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_2p.cxs"), + "PRECURSOR_DECAY_CONSTANTS", + ) + + dt = 1.0e-2 + solver.SetTimeStep(dt) + solver.SetTheta(1.0) + + fp0 = phys.ComputeFissionProduction("new") + + # PRKE initial conditions for steady state + phi = 1.0 + C = [beta[i] / (Lambda * lambdas[i]) for i in range(len(lambdas))] + + t_end = 0.2 + rel_tol = 2.0e-2 + ok = True + while phys.GetTime() < t_end: + solver.Advance() + fp_new = phys.ComputeFissionProduction("new") + t_to = phys.GetTime() + + phi, C = prke_step(phi, C, dt, beta, lambdas, rho, Lambda) + ratio_num = fp_new / fp0 + ratio_prke = phi + if abs(ratio_num - ratio_prke) > rel_tol * ratio_prke: + ok = False + + if rank == 0: + print(f"PRKE_STK_PASS {1 if ok else 0}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_prompt_ramp_xs_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_prompt_ramp_xs_cbc.py new file mode 100644 index 0000000000..867c1992cb --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_2d_prompt_ramp_xs_cbc.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +2D prompt-only transient with a ramped XS. + +Prompt-only with a monotonic increase in nu*sigma_f across a discrete xs list. +With reflecting BCs and no delayed neutrons, the fission production should be +increasing in time. + +TRANSIENT_OK checks finite response and increasing FP over the ramp. 
+""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.solver import ( + DiscreteOrdinatesProblem, + PowerIterationKEigenSolver, + TransientSolver, + ) + from pyopensn.aquad import GLCProductQuadrature2DXY + from pyopensn.xs import MultiGroupXS + from pyopensn.mesh import OrthogonalMeshGenerator + +if __name__ == "__main__": + dx = 8.0 / 8 + nodes = [i * dx for i in range(8 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + xs_list = [] + for i in range(5): + xs = MultiGroupXS() + xs.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), f"xs1g_prompt_ramp_{i}.cxs")) + xs_list.append(xs) + + pquad = GLCProductQuadrature2DXY(n_polar=2, n_azimuthal=4, scattering_order=0) + + num_groups = 1 + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + groupsets=[ + { + "groups_from_to": (0, num_groups - 1), + "angular_quadrature": pquad, + "inner_linear_method": "classic_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + }, + ], + xs_map=[{"block_ids": [0], "xs": xs_list[0]}], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "use_precursors": False, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + keigen = PowerIterationKEigenSolver(problem=phys) + keigen.Initialize() + keigen.Execute() + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, initial_state="existing") + solver.Initialize() + + fp_old = phys.ComputeFissionProduction("new") + + sigma_f_vals = [0.150000, 0.157500, 0.165000, 0.172500, 
0.180000] + dt = 2.0e-2 + solver.SetTimeStep(dt) + solver.SetTheta(1.0) + + growth_ok = True + last_fr = fp_old + for i in range(1, len(xs_list)): + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_list[i]}]) + + solver.Advance() + fp = phys.ComputeFissionProduction("new") + + if fp <= last_fr: + growth_ok = False + last_fr = fp + + transient_ok = 1 if growth_ok else 0 + + if rank == 0: + print(f"TRANSIENT_OK {transient_ok}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_2g_prompt_step_xs_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_2g_prompt_step_xs_cbc.py new file mode 100644 index 0000000000..5b8198bac6 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_2g_prompt_step_xs_cbc.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +3D 2-group prompt-only transient: step XS swap. + +2-group prompt-only. A step in sigma_f scales the prompt source. Scattering +couples groups so the transient response is not strictly monotonic. + +FP_RATIO_ACTUAL = 1.2 from scaling both groups' sigma_f by 1.2 (0.144/0.120). +TRANSIENT_OK checks positive response and reasonable step ratios (0.5 < r < 2). 
+""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.solver import ( + DiscreteOrdinatesProblem, + PowerIterationKEigenSolver, + TransientSolver, + ) + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.xs import MultiGroupXS + from pyopensn.mesh import OrthogonalMeshGenerator + +if __name__ == "__main__": + dx = 8.0 / 4 + nodes = [i * dx for i in range(4 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + xs_crit = MultiGroupXS() + xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs2g_prompt_crit.cxs")) + + xs_super = MultiGroupXS() + xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs2g_prompt_super.cxs")) + + pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0) + + num_groups = 2 + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + groupsets=[ + { + "groups_from_to": (0, num_groups - 1), + "angular_quadrature": pquad, + "inner_linear_method": "petsc_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + }, + ], + xs_map=[{"block_ids": [0], "xs": xs_crit}], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "use_precursors": False, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + keigen = PowerIterationKEigenSolver(problem=phys) + keigen.Initialize() + keigen.Execute() + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, 
initial_state="existing") + solver.Initialize() + + fp_old = phys.ComputeFissionProduction("new") + + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}]) + fp_new = phys.ComputeFissionProduction("new") + + dt = 1.0e-2 + solver.SetTimeStep(dt) + solver.SetTheta(1.0) + + solver.Advance() + fp1 = phys.ComputeFissionProduction("new") + + solver.Advance() + fp2 = phys.ComputeFissionProduction("new") + + r1 = fp1 / fp_new + r2 = fp2 / fp1 + transient_ok = 1 if (fp1 > 0.0 and fp2 > 0.0 and 0.5 < r1 < 2.0 and 0.5 < r2 < 2.0) else 0 + + if rank == 0: + print(f"FP_RATIO_ACTUAL {fp_new / fp_old:.12e}") + print(f"TRANSIENT_OK {transient_ok}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_6g_delayed_step_nu_sigma_f_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_6g_delayed_step_nu_sigma_f_cbc.py new file mode 100644 index 0000000000..985caa7ede --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_6g_delayed_step_nu_sigma_f_cbc.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +3D 6-group, 2-precursor delayed transient step. + +6 energy groups, 2 precursor families. A step in nu*sigma_f changes the prompt +source immediately and the delayed source through precursor evolution. + +FP_RATIO_ACTUAL = 1.2 from scaling all sigma_f by 1.2 between crit and super xs. 
+""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.solver import ( + DiscreteOrdinatesProblem, + PowerIterationKEigenSolver, + TransientSolver, + ) + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.xs import MultiGroupXS + from pyopensn.mesh import OrthogonalMeshGenerator + +if __name__ == "__main__": + dx = 8.0 / 4 + nodes = [i * dx for i in range(4 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + xs_crit = MultiGroupXS() + xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs6g_delayed_crit_2p.cxs")) + + xs_super = MultiGroupXS() + xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs6g_delayed_super_2p.cxs")) + + pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0) + + num_groups = 6 + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + groupsets=[ + { + "groups_from_to": (0, num_groups - 1), + "angular_quadrature": pquad, + "inner_linear_method": "petsc_gmres", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + "gmres_restart_interval": 10, + }, + ], + xs_map=[{"block_ids": [0], "xs": xs_crit}], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "use_precursors": True, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + keigen = PowerIterationKEigenSolver(problem=phys) + keigen.Initialize() + keigen.Execute() + + phys.SetTimeDependentMode() + + solver = 
TransientSolver(problem=phys, initial_state="existing") + solver.Initialize() + + fp_old = phys.ComputeFissionProduction("new") + + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}]) + fp_new = phys.ComputeFissionProduction("new") + + dt = 1.0e-2 + solver.SetTimeStep(dt) + solver.SetTheta(1.0) + + solver.Advance() + fp1 = phys.ComputeFissionProduction("new") + + solver.Advance() + fp2 = phys.ComputeFissionProduction("new") + + transient_ok = 1 if (fp1 > 0.0 and fp2 > 0.0) else 0 + + if rank == 0: + print(f"FP_RATIO_ACTUAL {fp_new / fp_old:.12e}") + print(f"FP1 {fp1:.12e}") + print(f"FP2 {fp2:.12e}") + print(f"TRANSIENT_OK {transient_ok}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_analytic_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_analytic_cbc.py new file mode 100644 index 0000000000..1c60e38363 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_analytic_cbc.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +3D delayed transient k-eigen with semi-analytic 1-precursor kinetics. + +Validate delayed-neutron coupling and precursor update against the closed-form +1-precursor point-kinetics solution for a reactivity step. + +1-group, 1 precursor. Point kinetics: dphi/dt = ((rho - beta)/Lambda) * phi + +lambda * C; dC/dt = (beta/Lambda) * phi - lambda * C. For a step to rho>0 with +phi(0)=1 and C(0)=beta/(Lambda*lambda), the solution is a sum of two +exponentials with eigenvalues w1,w2. The helper delayed_phi_ratio implements +that exact form. + +ANALYTIC_PASS is 1 if |FP_ratio - delayed_phi_ratio(t)| < 2% for all steps up to +t=0.2. Parameters: beta=0.0065, lambda=0.08, k=1.2 => rho=(k-1)/k=0.166666..., +nu_total=2.0, sigma_f=0.18, Lambda = 1/(v*nu*sigma_f) = 1/0.36 ≈ 2.7778. These +are the inputs to delayed_phi_ratio.
ANALYTIC_PASS validates delayed source and +precursor updates against the semi-analytic solution. +""" + +import math +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.solver import ( + DiscreteOrdinatesProblem, + PowerIterationKEigenSolver, + TransientSolver, + ) + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.xs import MultiGroupXS + from pyopensn.mesh import OrthogonalMeshGenerator + + +def delayed_phi_ratio(t, beta, lam, rho, Lambda): + # Point-kinetics 1-precursor step solution with phi(0)=1, C(0)=beta/(Lambda*lam) + a = (rho - beta) / Lambda - lam + b = math.sqrt(((rho - beta) / Lambda + lam) ** 2 + 4.0 * beta * lam / Lambda) + w1 = 0.5 * (a + b) + w2 = 0.5 * (a - b) + + # C = (beta/Lambda)/(w+lam) * phi for each mode + k1 = (beta / Lambda) / (w1 + lam) + k2 = (beta / Lambda) / (w2 + lam) + + # Solve for coefficients c1, c2 from phi(0)=1 and C(0)=beta/(Lambda*lam) + c2 = (beta / (Lambda * lam) - k1) / (k2 - k1) + c1 = 1.0 - c2 + + return c1 * math.exp(w1 * t) + c2 * math.exp(w2 * t) + + +if __name__ == "__main__": + dx = 8.0 / 4 + nodes = [i * dx for i in range(4 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + xs_crit = MultiGroupXS() + xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_crit_1p.cxs")) + + xs_super = MultiGroupXS() + xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_1p.cxs")) + + pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0) + + num_groups = 1 + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + groupsets=[ + { + "groups_from_to": (0, num_groups - 1), + "angular_quadrature": pquad, + "inner_linear_method": 
"classic_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + }, + ], + xs_map=[{"block_ids": [0], "xs": xs_crit}], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "use_precursors": True, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + keigen = PowerIterationKEigenSolver(problem=phys, max_iters=200, k_tol=1.0e-10) + keigen.Initialize() + keigen.Execute() + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, verbose=False, initial_state="existing") + solver.Initialize() + + # Swap to supercritical XS at t=0 + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}]) + + # Semi-analytic parameters + beta = 0.0065 + lam = 0.08 + k = 1.2 + rho = (k - 1.0) / k + # Lambda based on infinite-medium definition (prompt gen time) + nu_prompt = 1.987 + nu_delayed = 0.013 + sigma_f = 0.180000 + v = 1.0 + nu_sigma_f = sigma_f * (nu_prompt + nu_delayed) + Lambda = 1.0 / (v * nu_sigma_f) + + dt = 1.0e-2 + solver.SetTimeStep(dt) + solver.SetTheta(1.0) + + fp0 = phys.ComputeFissionProduction("new") + + t_end = 0.2 + rel_tol = 2.0e-2 + ok = True + if rank == 0: + print("step time ratio_numeric ratio_analytic") + step = 0 + while phys.GetTime() < t_end: + step += 1 + solver.Advance() + fp_new = phys.ComputeFissionProduction("new") + t_to = phys.GetTime() + ratio_num = fp_new / fp0 + ratio_ana = delayed_phi_ratio(t_to, beta, lam, rho, Lambda) + + if rank == 0: + print(f"{step:4d} {t_to:10.4e} {ratio_num:12.6e} {ratio_ana:12.6e}") + if abs(ratio_num - ratio_ana) > rel_tol * ratio_ana: + ok = False + + if rank == 0: + print(f"ANALYTIC_PASS {1 if ok else 0}") diff --git 
a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_prke_vs_stk_2p_callbacks_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_prke_vs_stk_2p_callbacks_cbc.py new file mode 100644 index 0000000000..8e408d7cd0 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_prke_vs_stk_2p_callbacks_cbc.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +3D delayed transient: homogeneous step xs vs PRKE (2 precursors) with callbacks. + +Validate space-time kinetics against point-reactor kinetics (PRKE) using the +transient solver Execute loop and pre/post-advance callbacks. + +1-group, 2 precursors, reflecting boundaries (infinite-medium). A step +to a supercritical xs is applied at t=0. Space-time kinetics should follow +PRKE for this homogeneous case. + +PRKE_STK_PASS is 1 if the space-time fission-production ratio matches PRKE within 2% +for t<=0.15. 
+""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.solver import ( + DiscreteOrdinatesProblem, + PowerIterationKEigenSolver, + TransientSolver, + ) + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.xs import MultiGroupXS + from pyopensn.mesh import OrthogonalMeshGenerator + + +def read_block_values(path, block_name): + begin = f"{block_name}_BEGIN" + end = f"{block_name}_END" + values = [] + in_block = False + with open(path, "r", encoding="utf-8") as handle: + for line in handle: + line = line.strip() + if not line or line.startswith("#"): + continue + if line == begin: + in_block = True + continue + if line == end: + in_block = False + continue + if in_block: + parts = line.split() + if len(parts) >= 2: + values.append(float(parts[1])) + if not values: + raise RuntimeError(f"Failed to find {block_name} in {path}") + return values + + +def solve_linear(A, b): + n = len(b) + a = [row[:] for row in A] + x = b[:] + for i in range(n): + pivot = i + for r in range(i + 1, n): + if abs(a[r][i]) > abs(a[pivot][i]): + pivot = r + if abs(a[pivot][i]) < 1.0e-14: + raise RuntimeError("Singular system in PRKE solve") + if pivot != i: + a[i], a[pivot] = a[pivot], a[i] + x[i], x[pivot] = x[pivot], x[i] + piv = a[i][i] + for j in range(i, n): + a[i][j] /= piv + x[i] /= piv + for r in range(n): + if r == i: + continue + factor = a[r][i] + if factor == 0.0: + continue + for j in range(i, n): + a[r][j] -= factor * a[i][j] + x[r] -= factor * x[i] + return x + + +def prke_step(phi, C, dt, beta, lambdas, rho, Lambda): + m = len(lambdas) + beta_total = sum(beta) + size = 1 + m + A = [[0.0 for _ in range(size)] for _ in range(size)] + b = [0.0 for _ in range(size)] + + A[0][0] = 1.0 - dt * (rho - beta_total) / Lambda + for i in range(m): + A[0][1 + i] = -dt 
* lambdas[i] + + b[0] = phi + + for i in range(m): + A[1 + i][0] = -dt * (beta[i] / Lambda) + A[1 + i][1 + i] = 1.0 + dt * lambdas[i] + b[1 + i] = C[i] + + x = solve_linear(A, b) + return x[0], x[1:] + + +if __name__ == "__main__": + dx = 8.0 / 4 + nodes = [i * dx for i in range(4 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + xs_crit = MultiGroupXS() + xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_crit_2p.cxs")) + + xs_super = MultiGroupXS() + xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_2p.cxs")) + + pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0) + + num_groups = 1 + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + groupsets=[ + { + "groups_from_to": (0, num_groups - 1), + "angular_quadrature": pquad, + "inner_linear_method": "classic_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + }, + ], + xs_map=[{"block_ids": [0], "xs": xs_crit}], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "use_precursors": True, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + t_end = 0.15 + dt = 1.0e-2 + keigen = PowerIterationKEigenSolver(problem=phys, max_iters=200, k_tol=1.0e-10) + keigen.Initialize() + keigen.Execute() + + phys.SetTimeDependentMode() + + solver = TransientSolver( + problem=phys, + stop_time=t_end, + dt=dt, + initial_state="existing", + ) + solver.Initialize() + solver.SetTheta(1.0) + + # Apply homogeneous perturbation at t=0 + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}]) + + sigma_a = xs_super.sigma_a[0] 
+ nu_sigma_f = xs_super.nu_sigma_f[0] + inv_vel = xs_super.inv_velocity[0] + v = 1.0 / inv_vel + + k_eff = nu_sigma_f / sigma_a + rho = (k_eff - 1.0) / k_eff + Lambda = 1.0 / (v * nu_sigma_f) + + beta = read_block_values( + os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_2p.cxs"), + "PRECURSOR_FRACTIONAL_YIELDS", + ) + lambdas = read_block_values( + os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_2p.cxs"), + "PRECURSOR_DECAY_CONSTANTS", + ) + + fp0 = phys.ComputeFissionProduction("new") + + state = { + "phi": 1.0, + "C": [beta[i] / (Lambda * lambdas[i]) for i in range(len(lambdas))], + "ok": True, + "steps": 0, + } + rel_tol = 2.0e-2 + + def pre_advance(): + state["steps"] += 1 + + def post_advance(): + fp_new = phys.ComputeFissionProduction("new") + step_dt = phys.GetTimeStep() + state["phi"], state["C"] = prke_step( + state["phi"], state["C"], step_dt, beta, lambdas, rho, Lambda + ) + ratio_num = fp_new / fp0 + ratio_prke = state["phi"] + if abs(ratio_num - ratio_prke) > rel_tol * ratio_prke: + state["ok"] = False + + solver.SetPreAdvanceCallback(pre_advance) + solver.SetPostAdvanceCallback(post_advance) + solver.Execute() + solver.SetPreAdvanceCallback(None) + solver.SetPostAdvanceCallback(None) + + if rank == 0: + print(f"PRKE_STK_PASS {1 if state['ok'] else 0}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_prke_vs_stk_2p_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_prke_vs_stk_2p_cbc.py new file mode 100644 index 0000000000..401dd97d37 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_prke_vs_stk_2p_cbc.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +3D delayed transient: homogeneous step xs vs PRKE (2 precursors). 
+ +Validate space-time kinetics against point-reactor kinetics (PRKE) for a +homogeneous perturbation in a homogeneous system with two precursors. + +1-group, 2 precursors, reflecting boundaries (infinite-medium). A step +to a supercritical xs is applied at t=0. Space-time kinetics should follow +PRKE for this homogeneous case. + +PRKE_STK_PASS is 1 if the space-time fission-production ratio matches PRKE within 2% +for t<=0.15. +""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.solver import ( + DiscreteOrdinatesProblem, + PowerIterationKEigenSolver, + TransientSolver, + ) + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.xs import MultiGroupXS + from pyopensn.mesh import OrthogonalMeshGenerator + + +def read_block_values(path, block_name): + begin = f"{block_name}_BEGIN" + end = f"{block_name}_END" + values = [] + in_block = False + with open(path, "r", encoding="utf-8") as handle: + for line in handle: + line = line.strip() + if not line or line.startswith("#"): + continue + if line == begin: + in_block = True + continue + if line == end: + in_block = False + continue + if in_block: + parts = line.split() + if len(parts) >= 2: + values.append(float(parts[1])) + if not values: + raise RuntimeError(f"Failed to find {block_name} in {path}") + return values + + +def solve_linear(A, b): + n = len(b) + a = [row[:] for row in A] + x = b[:] + for i in range(n): + pivot = i + for r in range(i + 1, n): + if abs(a[r][i]) > abs(a[pivot][i]): + pivot = r + if abs(a[pivot][i]) < 1.0e-14: + raise RuntimeError("Singular system in PRKE solve") + if pivot != i: + a[i], a[pivot] = a[pivot], a[i] + x[i], x[pivot] = x[pivot], x[i] + piv = a[i][i] + for j in range(i, n): + a[i][j] /= piv + x[i] /= piv + for r in range(n): + if r == i: + continue + factor 
= a[r][i] + if factor == 0.0: + continue + for j in range(i, n): + a[r][j] -= factor * a[i][j] + x[r] -= factor * x[i] + return x + + +def prke_step(phi, C, dt, beta, lambdas, rho, Lambda): + m = len(lambdas) + beta_total = sum(beta) + size = 1 + m + A = [[0.0 for _ in range(size)] for _ in range(size)] + b = [0.0 for _ in range(size)] + + A[0][0] = 1.0 - dt * (rho - beta_total) / Lambda + for i in range(m): + A[0][1 + i] = -dt * lambdas[i] + + b[0] = phi + + for i in range(m): + A[1 + i][0] = -dt * (beta[i] / Lambda) + A[1 + i][1 + i] = 1.0 + dt * lambdas[i] + b[1 + i] = C[i] + + x = solve_linear(A, b) + return x[0], x[1:] + + +if __name__ == "__main__": + dx = 8.0 / 4 + nodes = [i * dx for i in range(4 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + xs_crit = MultiGroupXS() + xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_crit_2p.cxs")) + + xs_super = MultiGroupXS() + xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_2p.cxs")) + + pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0) + + num_groups = 1 + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + groupsets=[ + { + "groups_from_to": (0, num_groups - 1), + "angular_quadrature": pquad, + "inner_linear_method": "petsc_gmres", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + "gmres_restart_interval": 10, + }, + ], + xs_map=[{"block_ids": [0], "xs": xs_crit}], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "use_precursors": True, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + keigen = 
PowerIterationKEigenSolver(problem=phys, max_iters=200, k_tol=1.0e-10) + keigen.Initialize() + keigen.Execute() + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, initial_state="existing") + solver.Initialize() + + # Apply homogeneous perturbation at t=0 + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}]) + + sigma_a = xs_super.sigma_a[0] + nu_sigma_f = xs_super.nu_sigma_f[0] + inv_vel = xs_super.inv_velocity[0] + v = 1.0 / inv_vel + + k_eff = nu_sigma_f / sigma_a + rho = (k_eff - 1.0) / k_eff + Lambda = 1.0 / (v * nu_sigma_f) + + beta = read_block_values( + os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_2p.cxs"), + "PRECURSOR_FRACTIONAL_YIELDS", + ) + lambdas = read_block_values( + os.path.join(os.path.dirname(__file__), "xs1g_delayed_super_2p.cxs"), + "PRECURSOR_DECAY_CONSTANTS", + ) + + dt = 1.0e-2 + solver.SetTimeStep(dt) + solver.SetTheta(1.0) + + fp0 = phys.ComputeFissionProduction("new") + + # PRKE initial conditions for steady state + phi = 1.0 + C = [beta[i] / (Lambda * lambdas[i]) for i in range(len(lambdas))] + + t_end = 0.15 + rel_tol = 2.0e-2 + ok = True + while phys.GetTime() < t_end: + solver.Advance() + fp_new = phys.ComputeFissionProduction("new") + t_to = phys.GetTime() + + phi, C = prke_step(phi, C, dt, beta, lambdas, rho, Lambda) + ratio_num = fp_new / fp0 + ratio_prke = phi + if abs(ratio_num - ratio_prke) > rel_tol * ratio_prke: + ok = False + + if rank == 0: + print(f"PRKE_STK_PASS {1 if ok else 0}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_ramp_xs_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_ramp_xs_cbc.py new file mode 100644 index 0000000000..6feb53a688 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_ramp_xs_cbc.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +3D delayed transient 
with a ramped xs. + +Similar to the prompt ramp, but with delayed neutrons enabled. + +1-group, 1 precursor. nu*sigma_f ramps upward in time. The fission production +should grow monotonically for this case with reflecting BCs. + +TRANSIENT_OK checks finite response and non-decreasing FP. +""" + +import math +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.solver import ( + DiscreteOrdinatesProblem, + PowerIterationKEigenSolver, + TransientSolver, + ) + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.xs import MultiGroupXS + from pyopensn.mesh import OrthogonalMeshGenerator + + +def read_block_value(file_path, block_begin, block_end): + in_block = False + with open(file_path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line == block_begin: + in_block = True + continue + if line == block_end: + in_block = False + continue + if in_block: + parts = line.split() + if len(parts) == 2 and parts[0] == "0": + return float(parts[1]) + return None + + +def load_xs_scalar_params(xs_file): + nu_prompt = read_block_value(xs_file, "NU_PROMPT_BEGIN", "NU_PROMPT_END") + nu_delayed = read_block_value(xs_file, "NU_DELAYED_BEGIN", "NU_DELAYED_END") + lam = read_block_value( + xs_file, + "PRECURSOR_DECAY_CONSTANTS_BEGIN", + "PRECURSOR_DECAY_CONSTANTS_END", + ) + frac_yield = read_block_value( + xs_file, + "PRECURSOR_FRACTIONAL_YIELDS_BEGIN", + "PRECURSOR_FRACTIONAL_YIELDS_END", + ) + return nu_prompt, nu_delayed, lam, frac_yield + + +if __name__ == "__main__": + dx = 8.0 / 4 + nodes = [i * dx for i in range(4 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + xs_list = [] + for i in range(5): + xs = MultiGroupXS() + 
xs.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), f"xs1g_delayed_ramp_{i}.cxs")) + xs_list.append(xs) + xs_crit = xs_list[0] + xs_super = xs_list[-1] + xs_scalar_file = os.path.join(os.path.dirname(__file__), "xs1g_delayed_ramp_0.cxs") + + pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0) + + num_groups = 1 + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + groupsets=[ + { + "groups_from_to": (0, num_groups - 1), + "angular_quadrature": pquad, + "inner_linear_method": "classic_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + }, + ], + xs_map=[{"block_ids": [0], "xs": xs_crit}], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "use_precursors": True, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + keigen = PowerIterationKEigenSolver(problem=phys, max_iters=200, k_tol=1.0e-10) + keigen.Initialize() + keigen.Execute() + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, verbose=False, initial_state="existing") + solver.Initialize() + + # XS-based kinetics parameters (1-group, 1-precursor) + sigma_a = xs_super.sigma_a[0] + nu_prompt, nu_delayed, lam, frac_yield = load_xs_scalar_params(xs_scalar_file) + # Use beta from precursor fractional yield and nu_total from nu_prompt + nu_delayed + beta = frac_yield + v = 1.0 / xs_super.inv_velocity[0] + + def k_from_nu_sigma_f(nu_sigma_f): + return nu_sigma_f / sigma_a + + def rho_from_k(k): + return (k - 1.0) / k + + # Ramp parameters + dt = 1.0e-2 + solver.SetTimeStep(dt) + solver.SetTheta(1.0) + + t_end = 0.2 + ramp_time = t_end + + def mix_factor(t): + if t <= 0.0: + return 0.0 + if t >= ramp_time: + return 
1.0 + return t / ramp_time + + def xs_index(t): + f = mix_factor(t) + return min(int(f * (len(xs_list) - 1) + 1.0e-12), len(xs_list) - 1) + + def nu_sigma_f_of_t(t): + sigma_f = xs_list[xs_index(t)].sigma_f[0] + nu_total = nu_prompt + nu_delayed + return sigma_f * nu_total + + def rho_of_t(t): + k = k_from_nu_sigma_f(nu_sigma_f_of_t(t)) + return rho_from_k(k) + + def Lambda_of_t(t): + return 1.0 / (v * nu_sigma_f_of_t(t)) + + nu_sigma_f_crit = nu_sigma_f_of_t(0.0) + + fp0 = phys.ComputeFissionProduction("new") + + rel_tol = 2.0e-2 + ok = True + growth_ok = True + last_ratio = None + if rank == 0: + print("step time ratio_numeric ratio_analytic") + step = 0 + while phys.GetTime() < t_end: + step += 1 + t_from = phys.GetTime() + # Update XS mix for current step (piecewise-constant over dt) + idx = xs_index(t_from) + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_list[idx]}]) + + solver.Advance() + fp_new = phys.ComputeFissionProduction("new") + t_to = phys.GetTime() + + ratio_num = fp_new / fp0 + ratio_ana = 1.0 + + if rank == 0: + print(f"{step:4d} {t_to:10.4e} {ratio_num:12.6e} {ratio_ana:12.6e}") + if (not math.isfinite(fp_new)) or (not math.isfinite(ratio_num)): + ok = False + elif last_ratio is not None and ratio_num < (last_ratio - 1.0e-4): + growth_ok = False + last_ratio = ratio_num + ok = ok and growth_ok + + if rank == 0: + print(f"TRANSIENT_OK {1 if ok else 0}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_stiff_dt_sensitivity_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_stiff_dt_sensitivity_cbc.py new file mode 100644 index 0000000000..974d3a35f9 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_delayed_stiff_dt_sensitivity_cbc.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +3D delayed transient with stiff precursor. 
+ +Stresses the solver with a large decay constant (lambda). + +1-group, 1 precursor with large lambda. Two runs (dt_small and dt_large) are +compared at the same t_end. A correct theta-scheme should produce similar FP +ratios when dt is sufficiently small. + +REL_DIFF < 0.05 is a robustness threshold: the dt_large solution should be +within 5% of dt_small. +TRANSIENT_OK requires positive finite response, matching t_end, and relative +difference < 5% between dt_small and dt_large. +""" + +import math +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.solver import ( + DiscreteOrdinatesProblem, + PowerIterationKEigenSolver, + TransientSolver, + ) + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.xs import MultiGroupXS + from pyopensn.mesh import OrthogonalMeshGenerator + + +def run_transient(dt, t_end, xs_crit, xs_super): + dx = 8.0 / 4 + nodes = [i * dx for i in range(4 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0) + + num_groups = 1 + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + groupsets=[ + { + "groups_from_to": (0, num_groups - 1), + "angular_quadrature": pquad, + "inner_linear_method": "classic_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + }, + ], + xs_map=[{"block_ids": [0], "xs": xs_crit}], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "use_precursors": 
True, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + keigen = PowerIterationKEigenSolver(problem=phys) + keigen.Initialize() + keigen.Execute() + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, initial_state="existing") + solver.Initialize() + + fp0 = phys.ComputeFissionProduction("new") + + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}]) + + solver.SetTimeStep(dt) + solver.SetTheta(1.0) + + while phys.GetTime() < t_end - 1.0e-12: + solver.Advance() + + fp_end = phys.ComputeFissionProduction("new") + return fp0, fp_end, phys.GetTime() + + +if __name__ == "__main__": + xs_crit = MultiGroupXS() + xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_stiff_crit.cxs")) + + xs_super = MultiGroupXS() + xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_delayed_stiff_super.cxs")) + + t_end = 0.2 + dt_small = 5.0e-3 + dt_large = 2.0e-2 + + fr0_s, fp_end_s, t_s = run_transient(dt_small, t_end, xs_crit, xs_super) + fr0_l, fp_end_l, t_l = run_transient(dt_large, t_end, xs_crit, xs_super) + + ratio_small = fp_end_s / fr0_s + ratio_large = fp_end_l / fr0_l + + rel_diff = abs(ratio_small - ratio_large) / max(abs(ratio_small), 1.0e-14) + + ok = ( + math.isfinite(ratio_small) + and math.isfinite(ratio_large) + and ratio_small > 1.0 + and ratio_large > 1.0 + and rel_diff < 5.0e-2 + and abs(t_s - t_end) < 1.0e-6 + and abs(t_l - t_end) < 1.0e-6 + ) + + if rank == 0: + print(f"DT_SMALL_RATIO {ratio_small:.12e}") + print(f"DT_LARGE_RATIO {ratio_large:.12e}") + print(f"REL_DIFF {rel_diff:.12e}") + print(f"TRANSIENT_OK {1 if ok else 0}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_analytic_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_analytic_cbc.py new file mode 100644 index 0000000000..eed0fdee39 --- /dev/null +++ 
b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_analytic_cbc.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +3D prompt-only transient k-eigen with analytic exponential response. + +1-group, reflecting BCs, no delayed neutrons. After a step to supercritical: +dphi/dt = alpha * phi, with alpha = nu*sigma_f - sigma_a. In discrete time with +theta=1, the update ratio per step is r = (tau + sigma_s + nu*sigma_f)/(tau + +sigma_t), where tau = v^{-1}/dt. For small dt, r ≈ exp(alpha dt), giving +phi(t)/phi(0) = exp(alpha t). + +ANALYTIC_PASS is 1 if |FP_ratio - exp(alpha t)| < 0.5% for all steps up to +t=0.1. With sigma_t=1.0, sigma_s=0.7 => sigma_a=0.3, nu=2, sigma_f=0.18, +alpha=0.36-0.3=0.06. Thus exp(alpha*0.1)=exp(0.006)≈1.0060 (used implicitly in +the comparison). ANALYTIC_PASS validates the time term and prompt fission source +handling against the analytic solution. +""" + +import math +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.solver import ( + DiscreteOrdinatesProblem, + PowerIterationKEigenSolver, + TransientSolver, + ) + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.xs import MultiGroupXS + from pyopensn.mesh import OrthogonalMeshGenerator + +if __name__ == "__main__": + dx = 8.0 / 4 + nodes = [i * dx for i in range(4 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + xs_crit = MultiGroupXS() + xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_prompt_crit.cxs")) + + xs_super = MultiGroupXS() + xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_prompt_super.cxs")) + + pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, 
scattering_order=0) + + num_groups = 1 + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + groupsets=[ + { + "groups_from_to": (0, num_groups - 1), + "angular_quadrature": pquad, + "inner_linear_method": "classic_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + }, + ], + xs_map=[{"block_ids": [0], "xs": xs_crit}], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "use_precursors": False, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + keigen = PowerIterationKEigenSolver(problem=phys, max_iters=200, k_tol=1.0e-10) + keigen.Initialize() + keigen.Execute() + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, verbose=False, initial_state="existing") + solver.Initialize() + phi_old = phys.GetPhiOldLocal() + phi_new = phys.GetPhiNewLocal() + if rank == 0: + print("phi_old[0]", phi_old[0], "phi_new[0]", phi_new[0]) + # Swap to supercritical XS at t=0 + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}]) + fp_new = phys.ComputeFissionProduction("new") + if rank == 0: + print("fp_new (new)", fp_new) + # Analytic alpha for prompt-only + sigma_t = 1.0 + sigma_s = 0.7 + sigma_a = sigma_t - sigma_s + nu = 2.0 + sigma_f = 0.180000 + alpha = nu * sigma_f - sigma_a + + dt = 1.0e-2 + solver.SetTimeStep(dt) + solver.SetTheta(1.0) + + # Use the converged flux from the k-eigen solve as the initial state + fp0 = phys.ComputeFissionProduction("new") + + if rank == 0: + print("inv_velocity", xs_super.inv_velocity) + print("dt", dt, "theta", 1.0) + tau = xs_super.inv_velocity[0] / dt + r_expected = (tau + sigma_s + nu * sigma_f) / (tau + sigma_t) + if rank == 0: + print("tau", tau, "r_expected", 
r_expected, "r_expected^11", r_expected ** 11) + + t_end = 0.1 + rel_tol = 5.0e-3 + ok = True + if rank == 0: + print("step time ratio_numeric ratio_analytic") + step = 0 + while phys.GetTime() < t_end: + step += 1 + t_from = phys.GetTime() + solver.Advance() + fp_step_new = phys.ComputeFissionProduction("new") + fp_step_old = phys.ComputeFissionProduction("old") + if rank == 0: + print( + "fp_new(after step)", + fp_step_new, + "fp_old(after step)", + fp_step_old, + ) + phi_old = phys.GetPhiOldLocal() + phi_new = phys.GetPhiNewLocal() + if rank == 0: + print("phi_old[0]", phi_old[0], "phi_new[0]", phi_new[0]) + fp_new = fp_step_new + t_to = phys.GetTime() + ratio_num = fp_new / fp0 + ratio_ana = math.exp(alpha * t_to) + + if rank == 0: + print(f"{step:4d} {t_to:10.4e} {ratio_num:12.6e} {ratio_ana:12.6e}") + if abs(ratio_num - ratio_ana) > rel_tol * ratio_ana: + ok = False + + if rank == 0: + print(f"ANALYTIC_PASS {1 if ok else 0}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_bc_leakage_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_bc_leakage_cbc.py new file mode 100644 index 0000000000..abbe54e21d --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_bc_leakage_cbc.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +3D prompt transient: boundary leakage (vacuum vs reflecting). + +Ensure boundary conditions influence transient response: vacuum boundaries +should leak neutrons and yield lower growth than reflecting boundaries. + +Prompt-only with a step to supercritical material. Leakage reduces effective +reactivity when vacuum BCs are used. + +TRANSIENT_OK requires step growth with vacuum BCs to be less than that under +reflecting BCs. 
+""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.solver import ( + DiscreteOrdinatesProblem, + PowerIterationKEigenSolver, + TransientSolver, + ) + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.xs import MultiGroupXS + from pyopensn.mesh import OrthogonalMeshGenerator + + +def run_case(bc_type, xs_crit, xs_super): + dx = 8.0 / 4 + nodes = [i * dx for i in range(4 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0) + + bcs = [ + {"name": "xmin", "type": bc_type}, + {"name": "xmax", "type": bc_type}, + {"name": "ymin", "type": bc_type}, + {"name": "ymax", "type": bc_type}, + {"name": "zmin", "type": bc_type}, + {"name": "zmax", "type": bc_type}, + ] + + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=1, + groupsets=[ + { + "groups_from_to": (0, 0), + "angular_quadrature": pquad, + "inner_linear_method": "petsc_richardson", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + }, + ], + xs_map=[{"block_ids": [0], "xs": xs_crit}], + boundary_conditions=bcs, + options={ + "save_angular_flux": True, + "use_precursors": False, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + keigen = PowerIterationKEigenSolver(problem=phys) + keigen.Initialize() + keigen.Execute() + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, initial_state="existing") + solver.Initialize() + + fp_old = phys.ComputeFissionProduction("new") + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}]) + fp_new = phys.ComputeFissionProduction("new") + + solver.SetTimeStep(1.0e-2) + solver.SetTheta(1.0) + + solver.Advance() + fp1 = 
phys.ComputeFissionProduction("new") + + return fp1 / fp_new, fp_new / fp_old + + +if __name__ == "__main__": + xs_crit = MultiGroupXS() + xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_prompt_crit.cxs")) + + xs_super = MultiGroupXS() + xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_prompt_super.cxs")) + + ratio_reflect, fp_ratio_reflect = run_case("reflecting", xs_crit, xs_super) + ratio_vacuum, fp_ratio_vacuum = run_case("vacuum", xs_crit, xs_super) + + ok = 1 if ratio_vacuum < ratio_reflect else 0 + + if rank == 0: + print(f"FP_RATIO_REFLECT {fp_ratio_reflect:.12e}") + print(f"FP_RATIO_VACUUM {fp_ratio_vacuum:.12e}") + print(f"STEP_RATIO_REFLECT {ratio_reflect:.12e}") + print(f"STEP_RATIO_VACUUM {ratio_vacuum:.12e}") + print(f"TRANSIENT_OK {ok}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_mid_step_swap_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_mid_step_swap_cbc.py new file mode 100644 index 0000000000..940af984df --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_keigen_3d_prompt_mid_step_swap_cbc.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +3D prompt transient: mid-step xs swap time. + +Validate swapping xs at a non-integer time, ensuring reported time matches +the swap and that the fission production reflects the new xs immediately. + +Prompt-only. First step to t=0.07, swap xs, then step to t=0.12. The fission +production computed at the swap time should scale by the xs ratio. + +TIME_AT_SWAP = 0.07 because we advance with dt=0.07 before swapping. +FP_RATIO_AT_SWAP = 1.2 from sigma_f ratio 0.180/0.150 at the swap time. +TIME_AT_SWAP verifies correct time advance. FP_RATIO_AT_SWAP verifies immediate +response to XS swap at that time. 
+""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.solver import ( + DiscreteOrdinatesProblem, + PowerIterationKEigenSolver, + TransientSolver, + ) + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.xs import MultiGroupXS + from pyopensn.mesh import OrthogonalMeshGenerator + +if __name__ == "__main__": + dx = 8.0 / 4 + nodes = [i * dx for i in range(4 + 1)] + meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + xs_crit = MultiGroupXS() + xs_crit.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_prompt_crit.cxs")) + + xs_super = MultiGroupXS() + xs_super.LoadFromOpenSn(os.path.join(os.path.dirname(__file__), "xs1g_prompt_super.cxs")) + + pquad = GLCProductQuadrature3DXYZ(n_polar=2, n_azimuthal=4, scattering_order=0) + + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=1, + groupsets=[ + { + "groups_from_to": (0, 0), + "angular_quadrature": pquad, + "inner_linear_method": "petsc_gmres", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + "gmres_restart_interval": 10, + }, + ], + xs_map=[{"block_ids": [0], "xs": xs_crit}], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "use_precursors": False, + "verbose_inner_iterations": False, + "verbose_outer_iterations": False, + }, + sweep_type="CBC", + ) + + keigen = PowerIterationKEigenSolver(problem=phys) + keigen.Initialize() + keigen.Execute() + + phys.SetTimeDependentMode() + + solver = TransientSolver(problem=phys, 
initial_state="existing") + solver.Initialize() + + fp0 = phys.ComputeFissionProduction("new") + + solver.SetTheta(1.0) + + # First step to t=0.07 + solver.SetTimeStep(0.07) + solver.Advance() + + time_at_swap = phys.GetTime() + + # Swap XS at non-integer time + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_super}]) + fp_swap = phys.ComputeFissionProduction("new") + + # Next step to t=0.12 + solver.SetTimeStep(0.05) + solver.Advance() + + if rank == 0: + print(f"TIME_AT_SWAP {time_at_swap:.12e}") + print(f"FP_RATIO_AT_SWAP {fp_swap / fp0:.12e}") + print("TRANSIENT_OK 1") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_pulse_inf_med_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_pulse_inf_med_cbc.py new file mode 100644 index 0000000000..1e919a84f0 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_pulse_inf_med_cbc.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +3D 1-group pulse, transient zero-init in a 3.2 cm reflecting cube. + +Pure absorber with sigma_t = 1.0 cm^-1, sigma_s = 0, v = 1.0 cm/s, and a +total Q_tot = 122.58 particles/s on for t=[0, 1] s, then 0 for t=[1, 2] s, +then 2Q_tot from t=[2, 3] s. V = 3.2^3 cm^3, so volumetric +Q = Q_tot / V ~= 3.7408 cm^-3 s^-1. 
+ +phi1 = phi(1s) = Q * (1 - e^{-1}) ~= 2.365 +phi2 = phi(2s) = phi1 * e^{-1} ~=0.870 +phi3 = phi(3s) = phi2*e^{-1} + 2*Q*(1-e^{-1}) ~= 5.049 +""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.mesh import FromFileMeshGenerator + from pyopensn.xs import MultiGroupXS + from pyopensn.source import VolumetricSource + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.solver import ( + CrankNicolson, + DiscreteOrdinatesProblem, + TransientSolver, + ) + from pyopensn.fieldfunc import FieldFunctionInterpolationVolume + from pyopensn.logvol import RPPLogicalVolume + +if __name__ == "__main__": + + meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh") + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + grid.SetOrthogonalBoundaries() + + num_groups = 1 + xs_diag = MultiGroupXS() + xs_diag.CreateSimpleOneGroup(1.0, 0.0, 1.0) + + Q_tot = 122.58 + Q_vol = Q_tot / (3.2 * 3.2 * 3.2) + + strength1 = [0.0 for _ in range(num_groups)] + strength2 = [0.0 for _ in range(num_groups)] + strength1[0] = Q_vol + strength2[0] = 2.0 * Q_vol + + src1 = VolumetricSource( + block_ids=[0], + group_strength=strength1, + start_time=0.0, + end_time=1.0, + ) + + src2 = VolumetricSource( + block_ids=[0], + group_strength=strength2, + start_time=2.0, + end_time=3.0, + ) + + pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0) + + gs0 = [0, num_groups - 1] + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + time_dependent=True, + groupsets=[ + { + "groups_from_to": gs0, + "angular_quadrature": pquad, + "angle_aggregation_type": "single", + "angle_aggregation_num_subsets": 1, + "inner_linear_method": "petsc_gmres", + "l_abs_tol": 1.0e-6, + "l_max_its": 500, + }, + ], + xs_map=[ + {"block_ids": 
[0], "xs": xs_diag}, + ], + volumetric_sources=[src1, src2], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={"save_angular_flux": True}, + sweep_type="CBC", + ) + + solver = TransientSolver( + problem=phys, + dt=0.1, + theta=CrankNicolson, + stop_time=3.0, + initial_state="zero", + ) + solver.Initialize() + solver.Execute() + + fflist = phys.GetScalarFluxFieldFunction() + monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True) + field_interp = FieldFunctionInterpolationVolume() + field_interp.SetOperationType("max") + field_interp.SetLogicalVolume(monitor_volume) + field_interp.AddFieldFunction(fflist[0]) + field_interp.Execute() + flux_max = field_interp.GetValue() + + if rank == 0: + print(f"Max phi(3s) = {flux_max:.6f}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_ramp_source_analytic_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_ramp_source_analytic_cbc.py new file mode 100644 index 0000000000..3a148dd056 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_ramp_source_analytic_cbc.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +1-group infinite-medium transient with a ramped source and analytic solution. + +ODE (v=1): d(phi)/dt + sigma_t * phi = Q(t) +Q(t) ramps linearly from 0 at t=0 to Q0 at t=t_ramp, then stays at Q0. 
+""" + +import os +import sys +import math + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.mesh import FromFileMeshGenerator + from pyopensn.xs import MultiGroupXS + from pyopensn.source import VolumetricSource + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver + from pyopensn.fieldfunc import FieldFunctionInterpolationVolume + from pyopensn.logvol import RPPLogicalVolume + + +def ramp_q(time_value: float, q0: float, t_ramp: float) -> float: + if time_value <= 0.0: + return 0.0 + if time_value < t_ramp: + return q0 * time_value / t_ramp + return q0 + + +def analytic_phi(time_value: float, + q0: float, + t_ramp: float, + sigma_t: float, + v: float) -> float: + lam = v * sigma_t + if time_value <= t_ramp: + a = q0 / t_ramp + return v * a * ( + time_value / lam + - 1.0 / (lam * lam) + + math.exp(-lam * time_value) / (lam * lam) + ) + # value at t_ramp + a = q0 / t_ramp + phi_tr = v * a * ( + t_ramp / lam - 1.0 / (lam * lam) + math.exp(-lam * t_ramp) / (lam * lam) + ) + dt = time_value - t_ramp + return phi_tr * math.exp(-lam * dt) + (v * q0 / lam) * (1.0 - math.exp(-lam * dt)) + + +if __name__ == "__main__": + meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh") + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + grid.SetOrthogonalBoundaries() + + sigma_t = 1.0 + v = 1.0 + q0 = 1.0 + t_ramp = 0.5 + + xs = MultiGroupXS() + xs.CreateSimpleOneGroup(sigma_t, 0.0, v) + + def source_func(group: int, time_value: float) -> float: + return ramp_q(time_value, q0, t_ramp) + vol_src = VolumetricSource(block_ids=[0], strength_function=source_func) + + pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0) + + phys = DiscreteOrdinatesProblem( + mesh=grid, + 
num_groups=1, + time_dependent=True, + groupsets=[ + { + "groups_from_to": (0, 0), + "angular_quadrature": pquad, + "angle_aggregation_type": "single", + "inner_linear_method": "petsc_gmres", + "l_abs_tol": 1.0e-8, + "l_max_its": 200, + } + ], + xs_map=[{"block_ids": [0], "xs": xs}], + volumetric_sources=[vol_src], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={"save_angular_flux": True, "verbose_inner_iterations": False}, + sweep_type="CBC", + ) + + solver = TransientSolver(problem=phys, initial_state="zero") + solver.Initialize() + solver.SetTheta(0.5) + + dt = 0.01 + stop_time = 1.0 + current_time = 0.0 + + while current_time < stop_time: + target_time = min(current_time + dt, stop_time) + solver.SetTimeStep(target_time - current_time) + solver.Advance() + current_time = target_time + + fflist = phys.GetScalarFluxFieldFunction() + monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True) + field_interp = FieldFunctionInterpolationVolume() + field_interp.SetOperationType("max") + field_interp.SetLogicalVolume(monitor_volume) + field_interp.AddFieldFunction(fflist[0]) + field_interp.Execute() + phi_num = field_interp.GetValue() + + phi_exact = analytic_phi(stop_time, q0, t_ramp, sigma_t, v) + rel_err = abs(phi_num - phi_exact) / phi_exact + pass_flag = 1 if rel_err < 0.01 else 0 + + if rank == 0: + print(f"RAMP_SOURCE_ANALYTIC phi_num {phi_num:.6f} phi_exact {phi_exact:.6f}") + print(f"RAMP_SOURCE_ANALYTIC_PASS {pass_flag}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v0.5_inf_med_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v0.5_inf_med_cbc.py new file mode 100644 index 0000000000..ad9d90e492 --- /dev/null +++ 
b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v0.5_inf_med_cbc.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# 1-group, infinite-medium, transient in a 3.2 cm cube (all reflecting). +# The cube is a pure absorber with sigma_t = 1.0 cm^-1, sigma_s = 0, v = 0.5 cm/s +# and a total Q_tot = 122.58 particles/s on for t=[0, 1] s, then 0 for t>1 s. +# V = 3.2^3 cm^3, so volumetric Q = Q_tot / V ~= 3.7408 cm^-3 s^-1. +# +# For 0 <= t <= 1: phi(t) = Q * (1 - e^{-0.5 t}) +# For t >= 1: phi(t) = phi(1) * e^{-0.5 (t - 1)} +# phi(1s) ~= 1.472 +# phi(2s) ~= 0.893 + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.mesh import FromFileMeshGenerator + from pyopensn.xs import MultiGroupXS + from pyopensn.source import VolumetricSource + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.solver import BackwardEuler, DiscreteOrdinatesProblem, TransientSolver + from pyopensn.fieldfunc import FieldFunctionInterpolationVolume + from pyopensn.logvol import RPPLogicalVolume + +if __name__ == "__main__": + + meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh") + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + grid.SetOrthogonalBoundaries() + + num_groups = 1 + xs_diag = MultiGroupXS() + xs_diag.CreateSimpleOneGroup(1.0, 0.0, 0.5) + + strength = [0.0 for _ in range(num_groups)] + strength[0] = 122.58 / (3.2 * 3.2 * 3.2) + mg_src = VolumetricSource(block_ids=[0], + group_strength=strength, + start_time=0.0, + end_time=1.0) + + pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0) + + gs0 = [0, num_groups - 1] + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + time_dependent=True, + groupsets=[ + { + "groups_from_to": gs0, + 
"angular_quadrature": pquad, + "angle_aggregation_type": "single", + "angle_aggregation_num_subsets": 1, + "inner_linear_method": "petsc_richardson", + "l_abs_tol": 1.0e-6, + "l_max_its": 500, + }, + ], + xs_map=[ + {"block_ids": [0], "xs": xs_diag}, + ], + volumetric_sources=[mg_src], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={"save_angular_flux": True}, + sweep_type="CBC", + ) + + solver = TransientSolver( + problem=phys, + dt=0.05, + theta=BackwardEuler, + stop_time=2.0, + initial_state="zero", + ) + solver.Initialize() + solver.Execute() + + fflist = phys.GetScalarFluxFieldFunction() + monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True) + field_interp = FieldFunctionInterpolationVolume() + field_interp.SetOperationType("max") + field_interp.SetLogicalVolume(monitor_volume) + field_interp.AddFieldFunction(fflist[0]) + field_interp.Execute() + flux_max = field_interp.GetValue() + + if rank == 0: + print(f"Max phi(2s) = {flux_max:.6f}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v1_inf_med_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v1_inf_med_cbc.py new file mode 100644 index 0000000000..0b3e907bcc --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v1_inf_med_cbc.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +3D 1-group, v=1.0, transient zero-init in a 3.2 cm reflecting cube. + +Pure absorber with sigma_t = 1.0 cm^-1, sigma_s = 0, v = 1.0 cm/s and a +total Q_tot = 122.58 particles/s on for t=[0, 1] s, then 0 for t>1 s. +V = 3.2^3 cm^3, so volumetric Q = Q_tot / V ~= 3.7408 cm^-3 s^-1. 
+ +For 0 <= t <= 1: phi(t) = Q * (1 - e^{-t}) +For t >= 1: phi(t) = phi(1) * e^{-(t - 1)} +phi(1s) ~= 2.365 +phi(2s) ~= 0.870 +""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.mesh import FromFileMeshGenerator + from pyopensn.xs import MultiGroupXS + from pyopensn.source import VolumetricSource + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver + from pyopensn.fieldfunc import FieldFunctionInterpolationVolume + from pyopensn.logvol import RPPLogicalVolume + +if __name__ == "__main__": + + meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh") + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + grid.SetOrthogonalBoundaries() + + num_groups = 1 + xs_diag = MultiGroupXS() + xs_diag.CreateSimpleOneGroup(1.0, 0.0, 1.0) + + strength = [0.0 for _ in range(num_groups)] + strength[0] = 122.58 / (3.2 * 3.2 * 3.2) + mg_src = VolumetricSource( + block_ids=[0], + group_strength=strength, + start_time=0.0, + end_time=1.0, + ) + + pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0) + + gs0 = [0, num_groups - 1] + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + time_dependent=True, + groupsets=[ + { + "groups_from_to": gs0, + "angular_quadrature": pquad, + "angle_aggregation_type": "single", + "angle_aggregation_num_subsets": 1, + "inner_linear_method": "petsc_richardson", + "l_abs_tol": 1.0e-6, + "l_max_its": 500, + }, + ], + xs_map=[ + {"block_ids": [0], "xs": xs_diag}, + ], + volumetric_sources=[mg_src], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": 
"zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={"save_angular_flux": True}, + sweep_type="CBC", + ) + + solver = TransientSolver( + problem=phys, + dt=0.05, + theta=1.0, + stop_time=1.0, + initial_state="zero", + ) + solver.Initialize() + solver.Execute() + + fflist = phys.GetScalarFluxFieldFunction() + monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True) + field_interp = FieldFunctionInterpolationVolume() + field_interp.SetOperationType("max") + field_interp.SetLogicalVolume(monitor_volume) + field_interp.AddFieldFunction(fflist[0]) + field_interp.Execute() + flux_max = field_interp.GetValue() + + if rank == 0: + print(f"Max phi(1s) = {flux_max:.6f}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v1_inf_med_swap_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v1_inf_med_swap_cbc.py new file mode 100644 index 0000000000..c60df9e9cd --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v1_inf_med_swap_cbc.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# 1-group, infinite-medium, transient in a 3.2 cm cube (all reflecting). +# The cube is a pure absorber with sigma_t = 1.0 cm^-1, sigma_s = 0, v = 1.0 cm/s +# and a total Q_tot = 122.58 particles/s on for t=[0, 1] s, then 0 for t>1 s. +# At t = 0.5 s, cross sections are swapped to sigma_t = 2.0 cm^-1. +# V = 3.2^3 cm^3, so volumetric Q = Q_tot / V ~= 3.7408 cm^-3 s^-1. 
+# +# For 0 <= t <= 1: phi(t) = Q * (1 - e^{-t}) +# For t >= 1: phi(t) = phi(1) * e^{-2 (t - 1)} +# With backward Euler, dt = 0.05: +# phi(1s) ~= 1.706 +# phi(2s) ~= 0.233 + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.mesh import FromFileMeshGenerator + from pyopensn.xs import MultiGroupXS + from pyopensn.source import VolumetricSource + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver + from pyopensn.fieldfunc import FieldFunctionInterpolationVolume + from pyopensn.logvol import RPPLogicalVolume + +if __name__ == "__main__": + + meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh") + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + grid.SetOrthogonalBoundaries() + + num_groups = 1 + xs_diag = MultiGroupXS() + xs_diag.CreateSimpleOneGroup(1.0, 0.0, 1.0) + xs_diag_swap = MultiGroupXS() + xs_diag_swap.CreateSimpleOneGroup(2.0, 0.0, 1.0) + + strength = [0.0 for _ in range(num_groups)] + strength[0] = 122.58 / (3.2 * 3.2 * 3.2) + mg_src = VolumetricSource(block_ids=[0], + group_strength=strength, + start_time=0.0, + end_time=1.0) + + pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0) + + gs0 = [0, num_groups - 1] + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + time_dependent=True, + groupsets=[ + { + "groups_from_to": gs0, + "angular_quadrature": pquad, + "angle_aggregation_type": "single", + "angle_aggregation_num_subsets": 1, + "inner_linear_method": "petsc_richardson", + "l_abs_tol": 1.0e-6, + "l_max_its": 500, + }, + ], + xs_map=[ + {"block_ids": [0], "xs": xs_diag}, + ], + volumetric_sources=[mg_src], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": 
"reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={"save_angular_flux": True}, + sweep_type="CBC", + ) + + solver = TransientSolver(problem=phys, verbose=False, initial_state="zero") + solver.Initialize() + + dt = 0.05 + theta = 1.0 + step = 0 + stop = 0 + stop_time = 1.0 + swap_time = 0.5 + current_time = 0.0 + swapped = False + solver.SetTheta(theta) + + while current_time < stop_time: + target_time = min(current_time + dt, stop_time) + step_dt = target_time - current_time + solver.SetTimeStep(step_dt) + + if rank == 0: + print("") + print( + f"*************** Time step #{step:d} t = {target_time:.6f} " + f"(from {current_time:.6f}, dt = {step_dt:.6f}, theta = {theta:.3f}) " + f"***************" + ) + + solver.Advance() + + if (not swapped) and target_time >= swap_time: + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_diag_swap}]) + swapped = True + + current_time = target_time + step = step + 1 + + fflist = phys.GetScalarFluxFieldFunction() + monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True) + field_interp = FieldFunctionInterpolationVolume() + field_interp.SetOperationType("max") + field_interp.SetLogicalVolume(monitor_volume) + field_interp.AddFieldFunction(fflist[0]) + field_interp.Execute() + flux_max = field_interp.GetValue() + + if rank == 0: + print(f"Max phi(1s) = {flux_max:.6f}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v2_inf_med_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v2_inf_med_cbc.py new file mode 100644 index 0000000000..316b9b3409 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_1g_v2_inf_med_cbc.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# 1-group, infinite-medium, transient in a 3.2 cm 
cube (all reflecting). +# The cube is a pure absorber with sigma_t = 1.0 cm^-1, sigma_s = 0, v = 2.0 cm/s +# and a total Q_tot = 122.58 particles/s on for t=[0, 1] s, then 0 for t>1 s. +# V = 3.2^3 cm^3, so volumetric Q = Q_tot / V ~= 3.7408 cm^-3 s^-1. +# +# For 0 <= t <= 1: phi(t) = Q * (1 - e^{-2 t}) +# For t >= 1: phi(t) = phi(1) * e^{-2 (t - 1)} +# phi(1s) ~= 3.235 +# phi(2s) ~= 0.438 + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.mesh import FromFileMeshGenerator + from pyopensn.xs import MultiGroupXS + from pyopensn.source import VolumetricSource + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver + from pyopensn.fieldfunc import FieldFunctionInterpolationVolume + from pyopensn.logvol import RPPLogicalVolume + +if __name__ == "__main__": + + meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh") + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + grid.SetOrthogonalBoundaries() + + num_groups = 1 + xs_diag = MultiGroupXS() + xs_diag.CreateSimpleOneGroup(1.0, 0.0, 2.0) + + strength = [0.0 for _ in range(num_groups)] + strength[0] = 122.58 / (3.2 * 3.2 * 3.2) + mg_src = VolumetricSource(block_ids=[0], + group_strength=strength, + start_time=0.0, + end_time=1.0) + + pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0) + + gs0 = [0, num_groups - 1] + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + time_dependent=True, + groupsets=[ + { + "groups_from_to": gs0, + "angular_quadrature": pquad, + "angle_aggregation_type": "single", + "angle_aggregation_num_subsets": 1, + "inner_linear_method": "petsc_gmres", + "l_abs_tol": 1.0e-6, + "l_max_its": 500, + }, + ], + xs_map=[ + {"block_ids": [0], "xs": xs_diag}, 
+ ], + volumetric_sources=[mg_src], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={"save_angular_flux": True}, + sweep_type="CBC", + ) + + solver = TransientSolver(problem=phys, dt=0.05, theta=1.0, stop_time=1.0, initial_state="zero") + solver.Initialize() + solver.Execute() + + fflist = phys.GetScalarFluxFieldFunction() + monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True) + field_interp = FieldFunctionInterpolationVolume() + field_interp.SetOperationType("max") + field_interp.SetLogicalVolume(monitor_volume) + field_interp.AddFieldFunction(fflist[0]) + field_interp.Execute() + flux_max = field_interp.GetValue() + + if rank == 0: + print(f"Max phi(1s) = {flux_max:.6f}") diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_downscatter_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_downscatter_cbc.py new file mode 100644 index 0000000000..109c1465c0 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_downscatter_cbc.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +3D 2-group downscatter, transient zero-init in a 3.2 cm reflecting cube. 
+ +g0 (fast): sigma_t0 = 1.0 cm^-1, v0 = 2.0 cm/s +g1 (thermal): sigma_t1 = 0.8 cm^-1, v1 = 0.5 cm/s +sigma_s(0 -> 1) = 0.5 cm^-1, all other sigma_s = 0 +Constant in time source in g0 only: +Q0 = 122.58 / 3.2^3 cm^-3 s^-1, for all t >= 0 + +(1/v0) d(phi0)/dt + sigma_t0 * phi0 = Q0 +(1/v1) d(phi1)/dt + sigma_t1 * phi1 = sigma_s(0->1) * phi0(t) +phi0(t) = (Q0 / sigma_t0) * (1 - exp(-v0 * sigma_t0 * t)) = Q0 * (1 - exp(-2 t)) +phi1(t) = exp(-v1 * sigma_t1 * t) * + [ v1 * sigma_s(0->1) * integral_0^t exp(v1 * sigma_t1 * s) * phi0(s) ds ] +phi0(1s) ~= 3.235, phi1(1s) ~= 0.458 +phi0(2s) ~= 3.672, phi1(2s) ~= 1.036 +""" + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.mesh import FromFileMeshGenerator + from pyopensn.xs import MultiGroupXS + from pyopensn.source import VolumetricSource + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver + from pyopensn.fieldfunc import FieldFunctionInterpolationVolume + from pyopensn.logvol import RPPLogicalVolume + +if __name__ == "__main__": + + meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh") + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + grid.SetOrthogonalBoundaries() + + xs_diag = MultiGroupXS() + xs_diag.LoadFromOpenSn( + os.path.join( + os.path.dirname(__file__), + "simple_2g_downscatter_td.cxs", + ) + ) + num_groups = xs_diag.num_groups + + Q_tot = 122.58 + Q_vol = Q_tot / (3.2 * 3.2 * 3.2) + + strength = [0.0 for _ in range(num_groups)] + strength[0] = Q_vol # source only in group 0 + strength[1] = 0.0 + + mg_src = VolumetricSource( + block_ids=[0], + group_strength=strength, + start_time=0.0, + end_time=1.0e9, + ) # effectively always on + + pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, 
scattering_order=0) + + gs0 = [0, num_groups - 1] + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + time_dependent=True, + groupsets=[ + { + "groups_from_to": gs0, + "angular_quadrature": pquad, + "angle_aggregation_type": "single", + "angle_aggregation_num_subsets": 1, + "inner_linear_method": "petsc_gmres", + "l_abs_tol": 1.0e-6, + "l_max_its": 500, + }, + ], + xs_map=[ + {"block_ids": [0], "xs": xs_diag}, + ], + volumetric_sources=[mg_src], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={"save_angular_flux": True}, + sweep_type="CBC", + ) + + solver = TransientSolver( + problem=phys, + dt=0.05, + theta=1.0, + stop_time=2.0, + initial_state="zero", + ) + solver.Initialize() + solver.Execute() + + fflist = phys.GetScalarFluxFieldFunction() + monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True) + + # Group 0 + ff_interp_g0 = FieldFunctionInterpolationVolume() + ff_interp_g0.SetOperationType("max") + ff_interp_g0.SetLogicalVolume(monitor_volume) + ff_interp_g0.AddFieldFunction(fflist[0]) + ff_interp_g0.Execute() + flux_max_g0 = ff_interp_g0.GetValue() + + # Group 1 + ff_interp_g1 = FieldFunctionInterpolationVolume() + ff_interp_g1.SetOperationType("max") + ff_interp_g1.SetLogicalVolume(monitor_volume) + ff_interp_g1.AddFieldFunction(fflist[1]) + ff_interp_g1.Execute() + flux_max_g1 = ff_interp_g1.GetValue() + + if rank == 0: + print("Max phi0(2s) = {:.6f}".format(flux_max_g0)) + print("Max phi1(2s) = {:.6f}".format(flux_max_g1)) diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_downscatter_swap_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_downscatter_swap_cbc.py new file mode 
100644 index 0000000000..982bc600c0 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_downscatter_swap_cbc.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# 2-group, infinite-medium transient with downscatter (group 0 -> group 1). +# Cross sections are swapped at t=0.5 s: +# before: sigma_t0=1.0, sigma_t1=0.8, sigma_s01=0.5 +# after : sigma_t0=2.0, sigma_t1=1.2, sigma_s01=0.6 +# Velocities remain v0=2.0 cm/s, v1=0.5 cm/s. +# +# (1/v0) d(phi0)/dt + sigma_t0 * phi0 = Q0 +# (1/v1) d(phi1)/dt + sigma_t1 * phi1 = sigma_s01 * phi0(t) +# With backward Euler, dt=0.05: phi0(1s) ~= 1.939573, phi1(1s) ~= 0.384769 + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.mesh import FromFileMeshGenerator + from pyopensn.xs import MultiGroupXS + from pyopensn.source import VolumetricSource + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver + from pyopensn.fieldfunc import FieldFunctionInterpolationVolume + from pyopensn.logvol import RPPLogicalVolume + +if __name__ == "__main__": + + meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh") + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + grid.SetOrthogonalBoundaries() + + xs_diag = MultiGroupXS() + xs_diag.LoadFromOpenSn( + os.path.join(os.path.dirname(__file__), "simple_2g_downscatter_td.cxs") + ) + xs_diag_swap = MultiGroupXS() + xs_diag_swap.LoadFromOpenSn( + os.path.join(os.path.dirname(__file__), "simple_2g_downscatter_td_swap.cxs") + ) + num_groups = xs_diag.num_groups + + Q_tot = 122.58 + Q_vol = Q_tot / (3.2 * 3.2 * 3.2) + + strength = [0.0 for _ in range(num_groups)] + strength[0] = Q_vol # source only in group 0 + strength[1] 
= 0.0 + + mg_src = VolumetricSource(block_ids=[0], + group_strength=strength, + start_time=0.0, + end_time=1.0e9) # effectively always on + + pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0) + + gs0 = [0, num_groups - 1] + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + time_dependent=True, + groupsets=[ + { + "groups_from_to": gs0, + "angular_quadrature": pquad, + "angle_aggregation_type": "single", + "angle_aggregation_num_subsets": 1, + "inner_linear_method": "petsc_gmres", + "l_abs_tol": 1.0e-6, + "l_max_its": 500, + }, + ], + xs_map=[ + {"block_ids": [0], "xs": xs_diag}, + ], + volumetric_sources=[mg_src], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={"save_angular_flux": True}, + sweep_type="CBC", + ) + + solver = TransientSolver(problem=phys, verbose=False, initial_state="zero") + solver.Initialize() + + dt = 0.05 + theta = 1.0 + step = 0 + stop_time = 1.0 + swap_time = 0.5 + current_time = 0.0 + swapped = False + solver.SetTheta(theta) + + while current_time < stop_time: + target_time = min(current_time + dt, stop_time) + step_dt = target_time - current_time + solver.SetTimeStep(step_dt) + + if rank == 0: + print("") + print( + f"*************** Time step #{step:d} t = {target_time:.6f} " + f"(from {current_time:.6f}, dt = {step_dt:.6f}, theta = {theta:.3f}) " + f"***************" + ) + + solver.Advance() + + if (not swapped) and target_time >= swap_time: + phys.SetXSMap(xs_map=[{"block_ids": [0], "xs": xs_diag_swap}]) + swapped = True + + current_time = target_time + step = step + 1 + + fflist = phys.GetScalarFluxFieldFunction() + monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True) + + # Group 0 + ff_interp_g0 = 
FieldFunctionInterpolationVolume() + ff_interp_g0.SetOperationType("max") + ff_interp_g0.SetLogicalVolume(monitor_volume) + ff_interp_g0.AddFieldFunction(fflist[0]) + ff_interp_g0.Execute() + flux_max_g0 = ff_interp_g0.GetValue() + + # Group 1 + ff_interp_g1 = FieldFunctionInterpolationVolume() + ff_interp_g1.SetOperationType("max") + ff_interp_g1.SetLogicalVolume(monitor_volume) + ff_interp_g1.AddFieldFunction(fflist[1]) + ff_interp_g1.Execute() + flux_max_g1 = ff_interp_g1.GetValue() + + if rank == 0: + print("Max phi0(1s) = {:.6f}".format(flux_max_g0)) + print("Max phi1(1s) = {:.6f}".format(flux_max_g1)) diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_pydrvr_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_pydrvr_cbc.py new file mode 100644 index 0000000000..6c22f36e86 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_pydrvr_cbc.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# 2-group, infinite-medium transient with downscatter (group 0 -> group 1). 
+# 3.2 cm reflecting cube (infinite medium) with: +# g0 (fast): sigma_t0 = 1.0 cm^-1, v0 = 2.0 cm/s +# g1 (thermal): sigma_t1 = 0.8 cm^-1, v1 = 0.5 cm/s +# sigma_s(0 -> 1) = 0.5 cm^-1, all other sigma_s = 0 +# Constant in time source in g0 only: +# Q0 = 122.58 / 3.2^3 cm^-3 s^-1, for all t >= 0 +# +# (1/v0) d(phi0)/dt + sigma_t0 * phi0 = Q0 +# (1/v1) d(phi1)/dt + sigma_t1 * phi1 = sigma_s(0->1) * phi0(t) +# phi0(t) = (Q0 / sigma_t0) * (1 - exp(-v0 * sigma_t0 * t)) = Q0 * (1 - exp(-2 t)) +# phi1(t) = exp(-v1 * sigma_t1 * t) * +# [ v1 * sigma_s(0->1) * integral_0^t exp(v1 * sigma_t1 * s) * phi0(s) ds ] +# phi0(1s) ~= 3.235, phi1(1s) ~= 0.458 +# phi0(2s) ~= 3.672, phi1(2s) ~= 1.036 + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.mesh import FromFileMeshGenerator + from pyopensn.xs import MultiGroupXS + from pyopensn.source import VolumetricSource + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver + from pyopensn.fieldfunc import FieldFunctionInterpolationVolume + from pyopensn.logvol import RPPLogicalVolume + +if __name__ == "__main__": + + meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh") + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + grid.SetOrthogonalBoundaries() + + xs_diag = MultiGroupXS() + xs_diag.LoadFromOpenSn( + os.path.join(os.path.dirname(__file__), "simple_2g_downscatter_td.cxs") + ) + num_groups = xs_diag.num_groups + + # Total source in group 0, converted to volumetric rate + Q_tot = 122.58 + Q_vol = Q_tot / (3.2 * 3.2 * 3.2) + + strength = [0.0 for _ in range(num_groups)] + strength[0] = Q_vol # source only in group 0 + strength[1] = 0.0 + + # Volumetric source is effectively always on + mg_src = VolumetricSource(block_ids=[0], 
group_strength=strength) + + # Angular quadrature + pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0) + + gs0 = [0, num_groups - 1] + + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + time_dependent=True, + groupsets=[ + { + "groups_from_to": gs0, + "angular_quadrature": pquad, + "angle_aggregation_type": "single", + "angle_aggregation_num_subsets": 1, + "inner_linear_method": "petsc_richardson", + "l_abs_tol": 1.0e-6, + "l_max_its": 500, + }, + ], + xs_map=[ + {"block_ids": [0], "xs": xs_diag}, + ], + volumetric_sources=[mg_src], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "verbose_inner_iterations": False, + }, + sweep_type="CBC", + ) + + # Create the time-dependent solver without stop_time, we will loop in Python + solver = TransientSolver(problem=phys, verbose=False, initial_state="zero") + solver.Initialize() + + # Time stepping parameters (constant dt) + dt = 0.05 + theta = 0.5 + stop_time = 2.0 + current_time = 0.0 + step = 0 + solver.SetTheta(theta) + + monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True) + fflist = phys.GetScalarFluxFieldFunction() + + # Group 0 + ff_interp_g0 = FieldFunctionInterpolationVolume() + ff_interp_g0.SetOperationType("max") + ff_interp_g0.SetLogicalVolume(monitor_volume) + ff_interp_g0.AddFieldFunction(fflist[0]) + + # Group 1 + ff_interp_g1 = FieldFunctionInterpolationVolume() + ff_interp_g1.SetOperationType("max") + ff_interp_g1.SetLogicalVolume(monitor_volume) + ff_interp_g1.AddFieldFunction(fflist[1]) + + phi0 = [] + phi1 = [] + while current_time < stop_time: + target_time = min(current_time + dt, stop_time) + step_dt = target_time - current_time + + # dt is constant here with the 
exception of the last step. + # We adjust dt for the last step so that we get the solution + # exactly at stop_time + solver.SetTimeStep(step_dt) + + if rank == 0: + print("") + print( + f"*************** Time step #{step:d} t = {target_time:.6f} " + f"(from {current_time:.6f}, dt = {step_dt:.6f}, theta = {theta:.3f}) " + f"***************" + ) + + # Advance the solution from current_time to target_time + solver.Advance() + fflist[0].Update() + fflist[1].Update() + + ff_interp_g0.Execute() + flux_max_g0 = ff_interp_g0.GetValue() + phi0.append(flux_max_g0) + + ff_interp_g1.Execute() + flux_max_g1 = ff_interp_g1.GetValue() + phi1.append(flux_max_g1) + + current_time = target_time + step += 1 + + if rank == 0: + print("Max phi0 = {:.6f}".format(max(phi0))) + print("Max phi1 = {:.6f}".format(max(phi1))) diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_pydrvr_ramp_dt_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_pydrvr_ramp_dt_cbc.py new file mode 100644 index 0000000000..9c49b34b09 --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_2g_inf_med_pydrvr_ramp_dt_cbc.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# 2-group, infinite-medium transient with downscatter (group 0 -> group 1). 
+# 3.2 cm reflecting cube (infinite medium) with: +# g0 (fast): sigma_t0 = 1.0 cm^-1, v0 = 2.0 cm/s +# g1 (thermal): sigma_t1 = 0.8 cm^-1, v1 = 0.5 cm/s +# sigma_s(0 -> 1) = 0.5 cm^-1, all other sigma_s = 0 +# Constant in time source in g0 only: +# Q0 = 122.58 / 3.2^3 cm^-3 s^-1, for all t >= 0 +# +# (1/v0) d(phi0)/dt + sigma_t0 * phi0 = Q0 +# (1/v1) d(phi1)/dt + sigma_t1 * phi1 = sigma_s(0->1) * phi0(t) +# phi0(t) = (Q0 / sigma_t0) * (1 - exp(-v0 * sigma_t0 * t)) = Q0 * (1 - exp(-2 t)) +# phi1(t) = exp(-v1 * sigma_t1 * t) * +# [ v1 * sigma_s(0->1) * integral_0^t exp(v1 * sigma_t1 * s) * phi0(s) ds ] +# phi0(1s) ~= 3.235, phi1(1s) ~= 0.458 +# phi0(2s) ~= 3.672, phi1(2s) ~= 1.036 + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.mesh import FromFileMeshGenerator + from pyopensn.xs import MultiGroupXS + from pyopensn.source import VolumetricSource + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver + from pyopensn.fieldfunc import FieldFunctionInterpolationVolume + from pyopensn.logvol import RPPLogicalVolume + +if __name__ == "__main__": + + meshgen = FromFileMeshGenerator(filename="../../../../assets/mesh/cube3.2.msh") + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + grid.SetOrthogonalBoundaries() + + xs_diag = MultiGroupXS() + xs_diag.LoadFromOpenSn( + os.path.join(os.path.dirname(__file__), "simple_2g_downscatter_td.cxs") + ) + num_groups = xs_diag.num_groups + + # Total source in group 0, converted to volumetric rate + Q_tot = 122.58 + Q_vol = Q_tot / (3.2 * 3.2 * 3.2) + + strength = [0.0 for _ in range(num_groups)] + strength[0] = Q_vol # source only in group 0 + strength[1] = 0.0 + + # Volumetric source is effectively always on + mg_src = VolumetricSource(block_ids=[0], 
group_strength=strength) + + # Angular quadrature + pquad = GLCProductQuadrature3DXYZ(n_polar=4, n_azimuthal=16, scattering_order=0) + + gs0 = [0, num_groups - 1] + + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + time_dependent=True, + groupsets=[ + { + "groups_from_to": gs0, + "angular_quadrature": pquad, + "angle_aggregation_type": "single", + "angle_aggregation_num_subsets": 1, + "inner_linear_method": "petsc_richardson", + "l_abs_tol": 1.0e-6, + "l_max_its": 500, + }, + ], + xs_map=[ + {"block_ids": [0], "xs": xs_diag}, + ], + volumetric_sources=[mg_src], + boundary_conditions=[ + {"name": "xmin", "type": "reflecting"}, + {"name": "xmax", "type": "reflecting"}, + {"name": "ymin", "type": "reflecting"}, + {"name": "ymax", "type": "reflecting"}, + {"name": "zmin", "type": "reflecting"}, + {"name": "zmax", "type": "reflecting"}, + ], + options={ + "save_angular_flux": True, + "verbose_inner_iterations": False, + }, + sweep_type="CBC", + ) + + # Create the time-dependent solver without stop_time, we will loop in Python + solver = TransientSolver(problem=phys, verbose=False, initial_state="zero") + solver.Initialize() + + monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True) + fflist = phys.GetScalarFluxFieldFunction() + + ff_interp_g0 = FieldFunctionInterpolationVolume() + ff_interp_g0.SetOperationType("max") + ff_interp_g0.SetLogicalVolume(monitor_volume) + ff_interp_g0.AddFieldFunction(fflist[0]) + + ff_interp_g1 = FieldFunctionInterpolationVolume() + ff_interp_g1.SetOperationType("max") + ff_interp_g1.SetLogicalVolume(monitor_volume) + ff_interp_g1.AddFieldFunction(fflist[1]) + + # Time stepping parameters + theta = 0.5 + stop_time = 2.0 + current_time = 0.0 + step = 0 + solver.SetTheta(theta) + + # Parameters for ramp dt + dt_min = 0.01 + dt_max = 0.20 + ramp_steps = 10 + + while current_time < stop_time: + + # Determine dt for this step + if step < ramp_steps: + # Linear ramp from dt_min to dt_max over ramp_steps + frac = 
step / (ramp_steps - 1) if ramp_steps > 1 else 1.0 + dt = dt_min + frac * (dt_max - dt_min) + else: + # Constant dt after ramp up + dt = dt_max + + target_time = min(current_time + dt, stop_time) + step_dt = target_time - current_time + + # Set the timestep in OpenSn for the Advance call + solver.SetTimeStep(step_dt) + + if rank == 0: + print("") + print( + f"*************** Time step #{step:d} t = {target_time:.6f} " + f"(from {current_time:.6f}, dt = {step_dt:.6f}, theta = {theta:.3f}) " + f"***************" + ) + + # Advance the solution + solver.Advance() + fflist[0].Update() + fflist[1].Update() + + ff_interp_g0.Execute() + flux_max_g0 = ff_interp_g0.GetValue() + + ff_interp_g1.Execute() + flux_max_g1 = ff_interp_g1.GetValue() + + if rank == 0: + print("Max phi0 = {:.6f}".format(flux_max_g0)) + print("Max phi1 = {:.6f}".format(flux_max_g1)) + + current_time = target_time + step += 1 diff --git a/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_openmc_xs_cbc.py b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_openmc_xs_cbc.py new file mode 100644 index 0000000000..880849b51e --- /dev/null +++ b/test/python/modules/linear_boltzmann_solvers/transport_transient/transient_zero_3d_openmc_xs_cbc.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Fixed-source time-dependent transport in a homogeneous cube using OpenMC-generated +# macroscopic, multigroup cross sections. The time-dependent solution is advanced to +# steady state and compared with the OpenSn steady state solution and the OpenMC +# steady state solution. 
+# +# OpenSn time-dependent solution: 51.057722 +# OpenSn steady-state solution: 51.057722 +# OpenMC steady-state solution: 50.96678 + +import os +import sys + +if "opensn_console" not in globals(): + from mpi4py import MPI + size = MPI.COMM_WORLD.size + rank = MPI.COMM_WORLD.rank + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../"))) + from pyopensn.mesh import OrthogonalMeshGenerator + from pyopensn.xs import MultiGroupXS + from pyopensn.source import VolumetricSource + from pyopensn.aquad import GLCProductQuadrature3DXYZ + from pyopensn.solver import DiscreteOrdinatesProblem, TransientSolver + from pyopensn.fieldfunc import FieldFunctionInterpolationVolume + from pyopensn.logvol import RPPLogicalVolume + +if __name__ == "__main__": + + N = 10 + L = 10.0 + xmin = -L / 2.0 + dx = L / N + nodes = [xmin + i * dx for i in range(N + 1)] + + meshgen = OrthogonalMeshGenerator(node_sets=[nodes, nodes, nodes]) + grid = meshgen.Execute() + grid.SetUniformBlockID(0) + + xs_water = MultiGroupXS() + xs_water.LoadFromOpenMC( + os.path.join(os.path.dirname(__file__), "xs_water.h5"), "set1", 294 + ) + num_groups = xs_water.num_groups + + strength = [0.0 for _ in range(num_groups)] + strength[3] = 12.285 + src1 = VolumetricSource(block_ids=[0], group_strength=strength) + + pquad = GLCProductQuadrature3DXYZ(n_polar=8, n_azimuthal=16, scattering_order=1) + + phys = DiscreteOrdinatesProblem( + mesh=grid, + num_groups=num_groups, + time_dependent=True, + groupsets=[ + { + "groups_from_to": [0, num_groups - 1], + "angular_quadrature": pquad, + "inner_linear_method": "petsc_gmres", + "l_abs_tol": 1.0e-6, + "l_max_its": 300, + "gmres_restart_interval": 30, + }, + ], + xs_map=[ + {"block_ids": [0], "xs": xs_water}, + ], + volumetric_sources=[src1], + options={"save_angular_flux": True}, + sweep_type="CBC", + ) + + monitor_volume = RPPLogicalVolume(infx=True, infy=True, infz=True) + dt = 0.01 + theta_cn = 0.5 + theta_be = 1.0 + be_startup_steps = 2 
+ stop_time = 0.1 + + solver = TransientSolver(problem=phys, dt=dt, theta=theta_be, initial_state="zero") + solver.Initialize() + + current_time = 0.0 + flux_max = 0.0 + step = 0 + fflist = phys.GetScalarFluxFieldFunction() + field_interp = FieldFunctionInterpolationVolume() + field_interp.SetOperationType("max") + field_interp.SetLogicalVolume(monitor_volume) + field_interp.AddFieldFunction(fflist[3]) + + while current_time < stop_time - 1.0e-14: + target_time = min(current_time + dt, stop_time) + solver.SetTimeStep(target_time - current_time) + theta_step = theta_be if step < be_startup_steps else theta_cn + solver.SetTheta(theta_step) + solver.Advance() + current_time = target_time + fflist[3].Update() + field_interp.Execute() + flux_max = field_interp.GetValue() + step += 1 + + if rank == 0: + print(f"Max phi(0.1s) = {flux_max:.6f}") From a9b6634ab804539729697ebe0a08d528d0e8b2e9 Mon Sep 17 00:00:00 2001 From: Eappen Nelluvelil Date: Sun, 12 Apr 2026 00:04:54 -0500 Subject: [PATCH 4/6] CBC_SPDS calculates max number of cell-face slots for local psi data storage during sweeps --- .../discrete_ordinates_problem.cc | 88 +++ .../sweep/scheduler/spmd_threadpool.h | 2 +- .../sweep/spds/cbc.cc | 254 +++++-- .../sweep/spds/cbc.h | 127 +++- .../sweep/spds/cbc_slot_planner.cc | 622 ++++++++++++++++++ .../sweep/spds/cbc_slot_planner.h | 77 +++ 6 files changed, 1123 insertions(+), 47 deletions(-) create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.cc create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.h diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc index 1a375321e1..dddec78344 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc +++ 
b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc @@ -17,6 +17,7 @@ #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_chunk_td.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/iterative_methods/sweep_wgs_context.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/io/discrete_ordinates_problem_io.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/iterative_methods/ags_linear_solver.h" @@ -43,11 +44,13 @@ #include "framework/runtime.h" #include "caliper/cali.h" #include +#include #include #include #include #include #include +#include namespace opensn { @@ -1402,6 +1405,7 @@ DiscreteOrdinatesProblem::InitializeSweepDataStructures() } else if (sweep_type_ == "CBC") { + std::vector> cbc_spds_list; // Build SPDS for (const auto& [quadrature, info] : quadrature_unq_so_grouping_map_) { @@ -1416,8 +1420,92 @@ DiscreteOrdinatesProblem::InitializeSweepDataStructures() const auto new_swp_order = std::make_shared(omega, this->grid_, quadrature_allow_cycles_map_[quadrature]); quadrature_spds_map_[quadrature].push_back(new_swp_order); + cbc_spds_list.push_back(new_swp_order); } } + + if (cbc_spds_list.size() == 1) + { + auto start_time = std::chrono::steady_clock::now(); + cbc_spds_list.front()->ComputeMaxNumLocalPsiSlots(); + auto end_time = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_seconds = end_time - start_time; + + const auto local_face_slots = cbc_spds_list.front()->GetMaxNumLocalPsiSlots(); + log.Log() << program_timer.GetTimeString() << "CBC SPDS local cell-face psi slot summary\n" + << " 
SPDS count : 1\n" + << " Elapsed : " << elapsed_seconds.count() << " s\n" + << " Max : " << local_face_slots << "\n" + << " Min : " << local_face_slots << "\n" + << " Median : " << static_cast(local_face_slots) << "\n" + << " Average : " << static_cast(local_face_slots) << "\n"; + } + else if (not cbc_spds_list.empty()) + { + const auto hardware_threads = std::max(1, std::thread::hardware_concurrency()); + const auto num_workers = std::min(cbc_spds_list.size(), hardware_threads); + + SPMD_ThreadPool pool(num_workers); + std::atomic next_index{0}; + + log.Log() << program_timer.GetTimeString() + << " Compute max num local cell-face psi slots for " << cbc_spds_list.size() + << " CBC SPDS using " << num_workers << " worker threads.\n"; + + auto start_time = std::chrono::steady_clock::now(); + pool.ExecuteBatch( + [&](std::size_t /* thread ID */) + { + std::size_t index; + // Atomically fetch the next index to work on + // std::memory_order_relaxed is sufficient here because we need atomicity only for the + // fetch_add operation, and there are no other synchronization requirements between + // threads for calculating max num local psi slots. 
+ while ((index = next_index.fetch_add(1, std::memory_order_relaxed)) < + cbc_spds_list.size()) + { + cbc_spds_list[index]->ComputeMaxNumLocalPsiSlots(); + } + }); + auto end_time = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_seconds = end_time - start_time; + double elapsed_time = elapsed_seconds.count(); + + size_t max_local_psi_slots = 0; + size_t min_local_psi_slots = std::numeric_limits::max(); + std::vector local_psi_slot_counts; + local_psi_slot_counts.reserve(cbc_spds_list.size()); + + for (const auto& spds : cbc_spds_list) + { + const auto local_psi_slots = spds->GetMaxNumLocalPsiSlots(); + max_local_psi_slots = std::max(max_local_psi_slots, local_psi_slots); + min_local_psi_slots = std::min(min_local_psi_slots, local_psi_slots); + local_psi_slot_counts.push_back(local_psi_slots); + } + + std::sort(local_psi_slot_counts.begin(), local_psi_slot_counts.end()); + const auto num_counts = local_psi_slot_counts.size(); + const double avg_local_psi_slots = + static_cast(std::accumulate( + local_psi_slot_counts.begin(), local_psi_slot_counts.end(), std::size_t{0})) / + num_counts; + const double median_local_psi_slots = + (num_counts % 2 == 1) + ? 
static_cast(local_psi_slot_counts[num_counts / 2]) + : 0.5 * static_cast(local_psi_slot_counts[num_counts / 2 - 1] + + local_psi_slot_counts[num_counts / 2]); + + log.Log() << program_timer.GetTimeString() + << " CBC SPDS local cell-face psi slot statistics\n" + << " SPDS count : " << cbc_spds_list.size() << "\n" + << " Workers : " << num_workers << "\n" + << " Elapsed : " << elapsed_time << " s\n" + << " Max : " << max_local_psi_slots << "\n" + << " Min : " << min_local_psi_slots << "\n" + << " Median : " << median_local_psi_slots << "\n" + << " Average : " << avg_local_psi_slots << "\n"; + } } else OpenSnInvalidArgument("Unsupported sweep type \"" + sweep_type_ + "\""); diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.h index c0bb26071f..ca87d3e991 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.h @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -96,6 +95,7 @@ class SPMD_ThreadPool for (std::size_t i = 0; i < n; ++i) ++epoch_states_[i].request; } + cv_start_.notify_all(); WaitAll(); } diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.cc index b99109a3a9..e7d880e1d7 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.cc @@ -2,16 +2,168 @@ // SPDX-License-Identifier: MIT #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h" -#include "framework/mesh/mesh_continuum/mesh_continuum.h" +#include 
"modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.h" #include "framework/logging/log.h" -#include "framework/utils/timer.h" +#include "framework/mesh/mesh_continuum/mesh_continuum.h" #include "framework/runtime.h" #include "caliper/cali.h" #include +#include +#include namespace opensn { +void +CBC_SPDS::BuildTaskGraph() +{ + constexpr auto INCOMING = FaceOrientation::INCOMING; + constexpr auto OUTGOING = FaceOrientation::OUTGOING; + + const auto num_loc_cells = grid_->local_cells.size(); + task_list_.assign(num_loc_cells, Task{}); + task_successor_rank_offsets_.assign(num_loc_cells + 1, 0); + task_successor_ranks_.clear(); + task_successor_ranks_.reserve(num_loc_cells * 4); + + for (std::size_t rank = 0; rank < topo_order_.size(); ++rank) + { + const auto& cell = grid_->local_cells[topo_order_[rank]]; + unsigned int num_dependencies = 0; + std::vector successors; + + successors.reserve(cell.faces.size()); + task_successor_rank_offsets_[rank] = static_cast(task_successor_ranks_.size()); + for (std::size_t f = 0; f < cell.faces.size(); ++f) + { + const auto& face = cell.faces[f]; + const auto& orientation = cell_face_orientations_[cell.local_id][f]; + + if (orientation == INCOMING and face.has_neighbor) + ++num_dependencies; + else if ((orientation == OUTGOING) and (face.has_neighbor) and + (face.IsNeighborLocal(grid_.get()))) + { + const auto successor_local_id = grid_->cells[face.neighbor_id].local_id; + successors.push_back(successor_local_id); + task_successor_ranks_.push_back(topo_rank_by_cell_local_id_[successor_local_id]); + } + } + + task_list_[cell.local_id] = + Task{num_dependencies, std::move(successors), cell.local_id, &cell, false}; + } + task_successor_rank_offsets_.back() = static_cast(task_successor_ranks_.size()); +} + +void +CBC_SPDS::BuildLocalFaceTaskGraph() +{ + // Each outgoing local face becomes one directed-face task. 
+ // The task is keyed by the producer-cell topological rank, the consumer-cell topological + // rank, and the face-node count needed later when CBC/CBCD size the compact slot bank. + const auto num_loc_cells = grid_->local_cells.size(); + cell_face_offsets_.assign(num_loc_cells + 1, 0); + size_t total_num_faces = 0; + for (const auto& cell : grid_->local_cells) + { + cell_face_offsets_[cell.local_id] = static_cast(total_num_faces); + total_num_faces += cell.faces.size(); + } + cell_face_offsets_.back() = static_cast(total_num_faces); + outgoing_local_face_task_ids_.assign(total_num_faces, INVALID_LOCAL_FACE_TASK_ID); + incoming_local_face_task_ids_.assign(total_num_faces, INVALID_LOCAL_FACE_TASK_ID); + + producer_cell_face_offsets_.assign(num_loc_cells + 1, 0); + local_face_producer_ranks_.clear(); + local_face_consumer_ranks_.clear(); + local_face_node_counts_.clear(); + max_local_face_node_count_ = 0; + + for (std::size_t producer_rank = 0; producer_rank < topo_order_.size(); ++producer_rank) + { + producer_cell_face_offsets_[producer_rank] = + static_cast(local_face_producer_ranks_.size()); + + const auto producer_cell_local_id = topo_order_[producer_rank]; + const auto& cell = grid_->local_cells[producer_cell_local_id]; + const auto& face_orientations = cell_face_orientations_[producer_cell_local_id]; + + for (std::size_t f = 0; f < cell.faces.size(); ++f) + { + const auto& face = cell.faces[f]; + const auto& orientation = face_orientations[f]; + if ((orientation != FaceOrientation::OUTGOING) or (not face.IsNeighborLocal(grid_.get()))) + continue; + + const auto consumer_cell_local_id = face.GetNeighborLocalID(grid_.get()); + const auto consumer_face_id = + static_cast(face.GetNeighborAdjacentFaceIndex(grid_.get())); + const auto num_face_nodes = static_cast(face.vertex_ids.size()); + max_local_face_node_count_ = + std::max(max_local_face_node_count_, static_cast(num_face_nodes)); + + const auto face_task_id = static_cast(local_face_producer_ranks_.size()); + 
local_face_producer_ranks_.push_back(static_cast(producer_rank)); + local_face_consumer_ranks_.push_back(topo_rank_by_cell_local_id_[consumer_cell_local_id]); + local_face_node_counts_.push_back(static_cast(num_face_nodes)); + outgoing_local_face_task_ids_[cell_face_offsets_[producer_cell_local_id] + f] = face_task_id; + incoming_local_face_task_ids_[cell_face_offsets_[consumer_cell_local_id] + consumer_face_id] = + face_task_id; + } + } + + producer_cell_face_offsets_.back() = + static_cast(local_face_producer_ranks_.size()); + local_face_slot_ids_.resize(local_face_producer_ranks_.size()); + std::iota(local_face_slot_ids_.begin(), local_face_slot_ids_.end(), std::uint32_t{0}); +} + +void +CBC_SPDS::UpdateLocalFaceSlotLayout() +{ + // The slot planner only decides which faces may share one slot. The physical storage bank is + // then sized slot-by-slot by taking the maximum face-node extent over each slot chain. + local_face_slot_node_counts_.assign(max_num_local_psi_slots_, std::uint16_t{0}); + local_face_slot_node_offsets_.assign(max_num_local_psi_slots_ + 1, std::uint32_t{0}); + total_local_face_slot_nodes_ = 0; + + bool is_identity_layout = max_num_local_psi_slots_ == local_face_slot_ids_.size(); + for (std::size_t face_task_id = 0; + is_identity_layout and face_task_id < local_face_slot_ids_.size(); + ++face_task_id) + is_identity_layout = local_face_slot_ids_[face_task_id] == face_task_id; + + if (is_identity_layout) + { + for (std::size_t slot_id = 0; slot_id < local_face_node_counts_.size(); ++slot_id) + { + local_face_slot_node_counts_[slot_id] = local_face_node_counts_[slot_id]; + local_face_slot_node_offsets_[slot_id] = + static_cast(total_local_face_slot_nodes_); + total_local_face_slot_nodes_ += local_face_node_counts_[slot_id]; + } + local_face_slot_node_offsets_.back() = static_cast(total_local_face_slot_nodes_); + return; + } + + for (std::size_t face_task_id = 0; face_task_id < local_face_slot_ids_.size(); ++face_task_id) + { + const auto slot_id 
= local_face_slot_ids_[face_task_id]; + assert(slot_id < local_face_slot_node_counts_.size()); + local_face_slot_node_counts_[slot_id] = + std::max(local_face_slot_node_counts_[slot_id], local_face_node_counts_[face_task_id]); + } + + for (std::size_t slot_id = 0; slot_id < local_face_slot_node_counts_.size(); ++slot_id) + { + local_face_slot_node_offsets_[slot_id] = + static_cast(total_local_face_slot_nodes_); + total_local_face_slot_nodes_ += local_face_slot_node_counts_[slot_id]; + } + local_face_slot_node_offsets_.back() = static_cast(total_local_face_slot_nodes_); +} + CBC_SPDS::CBC_SPDS(const Vector3& omega, const std::shared_ptr& grid, bool allow_cycles) @@ -21,7 +173,6 @@ CBC_SPDS::CBC_SPDS(const Vector3& omega, size_t num_loc_cells = grid->local_cells.size(); - // Populate Cell Relationships std::vector>> cell_successors(num_loc_cells); std::set location_successors; std::set location_dependencies; @@ -37,10 +188,8 @@ CBC_SPDS::CBC_SPDS(const Vector3& omega, for (auto v : location_dependencies) location_dependencies_.push_back(v); - // Build local cell graph Graph local_DG(num_loc_cells); - // Create graph edges for (size_t c = 0; c < num_loc_cells; ++c) // NOLINT for (const auto& successor : cell_successors[c]) boost::add_edge(c, successor.first, successor.second, local_DG); @@ -48,11 +197,10 @@ CBC_SPDS::CBC_SPDS(const Vector3& omega, if (allow_cycles) // NOLINT { auto edges_to_remove = RemoveCyclicDependencies(local_DG); - for (auto& edge_to_remove : edges_to_remove) - local_sweep_fas_.emplace_back(edge_to_remove.first, edge_to_remove.second); + for (const auto& [u, v] : edges_to_remove) + local_sweep_fas_.emplace_back(u, v); } - // Generate topological sorting spls_.clear(); boost::topological_sort(local_DG, std::back_inserter(spls_)); // NOLINT std::reverse(spls_.begin(), spls_.end()); @@ -62,44 +210,76 @@ CBC_SPDS::CBC_SPDS(const Vector3& omega, "Cycles need to be allowed by the calling application."); } - // Create task list - std::vector> 
global_dependencies; - global_dependencies.resize(opensn::mpi_comm.size()); + topo_order_.assign(spls_.begin(), spls_.end()); + topo_rank_by_cell_local_id_.assign(num_loc_cells, 0); + for (std::size_t rank = 0; rank < topo_order_.size(); ++rank) + topo_rank_by_cell_local_id_[topo_order_[rank]] = static_cast(rank); + + std::vector> global_dependencies(opensn::mpi_comm.size()); CommunicateLocationDependencies(location_dependencies_, global_dependencies); + BuildTaskGraph(); + BuildLocalFaceTaskGraph(); - constexpr auto INCOMING = FaceOrientation::INCOMING; - constexpr auto OUTGOING = FaceOrientation::OUTGOING; + max_num_local_psi_slots_ = local_face_producer_ranks_.size(); + UpdateLocalFaceSlotLayout(); +} - // For each local cell create a task - for (const auto& cell : grid_->local_cells) - { - const size_t num_faces = cell.faces.size(); - unsigned int num_dependencies = 0; - std::vector successors; +const std::vector& +CBC_SPDS::GetTaskList() const noexcept +{ + return task_list_; +} - for (size_t f = 0; f < num_faces; ++f) - { - if (cell_face_orientations_[cell.local_id][f] == INCOMING) - { - if (cell.faces[f].has_neighbor) - ++num_dependencies; - } - else if (cell_face_orientations_[cell.local_id][f] == OUTGOING) - { - const auto& face = cell.faces[f]; - if (face.has_neighbor and grid->IsCellLocal(face.neighbor_id)) - successors.push_back(grid->cells[face.neighbor_id].local_id); - } - } +void +CBC_SPDS::ComputeMaxNumLocalPsiSlots() +{ + CALI_CXX_MARK_SCOPE("CBC_SPDS::ComputeMaxNumLocalPsiSlots"); - task_list_.push_back({num_dependencies, successors, cell.local_id, &cell, false}); + if (task_list_.empty()) + { + max_num_local_psi_slots_ = 0; + local_face_slot_ids_.clear(); + UpdateLocalFaceSlotLayout(); + return; } + + if (local_face_producer_ranks_.empty()) + { + max_num_local_psi_slots_ = 0; + local_face_slot_ids_.clear(); + UpdateLocalFaceSlotLayout(); + return; + } + + // Solve the exact minimum chain cover of the local-face reuse poset, then turn that chain + 
// decomposition into a static slot assignment and compact slot-bank layout. + const auto result = detail::ComputeLocalFaceSlotPlan(task_successor_rank_offsets_, + task_successor_ranks_, + local_face_producer_ranks_, + local_face_consumer_ranks_, + producer_cell_face_offsets_, + local_face_slot_ids_); + max_num_local_psi_slots_ = result.slot_count; + UpdateLocalFaceSlotLayout(); + if (result.verifier_rejected) + opensn::log.LogAllWarning() + << "CBC_SPDS::ComputeMaxNumLocalPsiSlots: local cell-face slot assignment verifier rejected " + << " the computed slot count; falling back to the identity assignment " + << " (one slot per local directed face, no reuse)."; } -const std::vector& -CBC_SPDS::GetTaskList() const +std::uint32_t +CBC_SPDS::GetOutgoingLocalFaceTaskID(const std::uint32_t cell_local_id, + const unsigned int face_id) const noexcept { - return task_list_; + return outgoing_local_face_task_ids_[cell_face_offsets_[cell_local_id] + face_id]; +} + +std::uint32_t +CBC_SPDS::GetIncomingLocalFaceTaskID(const std::uint32_t cell_local_id, + const unsigned int face_id) const noexcept +{ + return incoming_local_face_task_ids_[cell_face_offsets_[cell_local_id] + face_id]; } } // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h index 09135920f5..b2de98ad50 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h @@ -5,28 +5,137 @@ #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/spds.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/sweep.h" +#include +#include +#include +#include namespace opensn { +/** + * Cell-by-cell sweep plane data structure and exact local-face slot metadata. 
+ * + * This class stores the local CBC task DAG for one sweep direction, its topological ordering, + * and the local directed-face metadata consumed by both the host CBC FLUDS and the device + * CBCD FLUDS. Each outgoing local face is represented by one directed-face task with a + * producer-cell rank, a consumer-cell rank, and a face-node count. + * + * `ComputeMaxNumLocalPsiSlots()` computes the exact minimum safe number of reusable local-face + * slots and the corresponding static face-to-slot map. The result is shared by the host and + * device CBC implementations. `UpdateLocalFaceSlotLayout()` converts the face-to-slot mapping + * plan into a compact slot bank sized by the maximum face-node extent within each slot. + */ class CBC_SPDS : public SPDS { public: + /// Value returned when a local face does not participate in the requested face-task map. + static constexpr std::uint32_t INVALID_LOCAL_FACE_TASK_ID = + std::numeric_limits::max(); + + /// Construct the CBC sweep plane data structure for one angular direction. + CBC_SPDS(const Vector3& omega, const std::shared_ptr& grid, bool allow_cycles); + + /// Return the local CBC task list. + const std::vector& GetTaskList() const noexcept; + /** - * Constructs a cell-by-cell sweep-plane data strcture (SPDS) with the given direction and grid. + * Compute the exact minimum number of reusable local-face psi slots. + * + * The local directed faces define a poset under the safe reuse relation. A chain in this + * poset is one statically reusable slot. The required slot count is therefore the minimum + * chain-cover cardinality, equivalently the maximum antichain cardinality by Dilworth's + * theorem. * - * \param omega The angular direction vector. - * \param grid Reference to the grid. - * \param allow_cycles Whether cycles are allowed in the local sweep dependency graph. + * The planner obtains this value from a maximum cardinality matching on the bipartite + * split graph of the reuse relation. 
The extracted chains define the static slot IDs. + * `UpdateLocalFaceSlotLayout()` then sizes each slot by the maximum face-node extent + * over the faces assigned to that slot. */ - CBC_SPDS(const Vector3& omega, const std::shared_ptr& grid, bool allow_cycles); + void ComputeMaxNumLocalPsiSlots(); + + std::size_t GetMaxNumLocalPsiSlots() const noexcept { return max_num_local_psi_slots_; } + + const std::vector& GetLocalFaceSlotIDs() const noexcept + { + return local_face_slot_ids_; + } + + const std::vector& GetLocalFaceSlotNodeOffsets() const noexcept + { + return local_face_slot_node_offsets_; + } + + const std::vector& GetLocalFaceSlotNodeCounts() const noexcept + { + return local_face_slot_node_counts_; + } - /// Returns the cell-by-cell task list. - const std::vector& GetTaskList() const; + std::size_t GetTotalLocalFaceSlotNodes() const noexcept { return total_local_face_slot_nodes_; } -protected: - /// Cell-by-cell task list. + std::size_t GetMaxLocalFaceNodeCount() const noexcept { return max_local_face_node_count_; } + + /// Return the local directed-face task ID for an outgoing local face. + std::uint32_t GetOutgoingLocalFaceTaskID(std::uint32_t cell_local_id, + unsigned int face_id) const noexcept; + + /// Return the local directed-face task ID for an incoming local face. + std::uint32_t GetIncomingLocalFaceTaskID(std::uint32_t cell_local_id, + unsigned int face_id) const noexcept; + + ~CBC_SPDS() override = default; + +private: + /// Build the local cell task DAG and its successor-rank adjacency. + void BuildTaskGraph(); + + /// Enumerate local directed faces and map them to producer and consumer cell ranks. + void BuildLocalFaceTaskGraph(); + + /// Topological ordering of local cell IDs: topo_order_[rank] = cell_local_id. + std::vector topo_order_; + /// Topological rank keyed by local cell ID. + std::vector topo_rank_by_cell_local_id_; + /// Per-cell task descriptors with successor adjacency lists. 
std::vector task_list_; + /// Offsets into the flat successor-rank array indexed by topological task rank. + std::vector task_successor_rank_offsets_; + /// Flat successor topological ranks grouped by producer task rank. + std::vector task_successor_ranks_; + /// Flat face-table offsets indexed by cell local IDs. + std::vector cell_face_offsets_; + /// Flat outgoing local-face task IDs indexed by face storage index. + std::vector outgoing_local_face_task_ids_; + /// Flat incoming local-face task IDs indexed by face storage index. + std::vector incoming_local_face_task_ids_; + /// Face-rank offsets grouped by producer-cell topological rank. + std::vector producer_cell_face_offsets_; + /// Producer-cell topological rank for each local directed face. + std::vector local_face_producer_ranks_; + /// Consumer-cell topological rank for each local directed face. + std::vector local_face_consumer_ranks_; + /// Number of nodes for each local directed face task. + std::vector local_face_node_counts_; + /// Static slot assignment: local_face_slot_ids_[face_task_id] = slot_id. + std::vector local_face_slot_ids_; + /// Slot-local node extents: local_face_slot_node_counts_[slot_id] = max nodes in that slot. + std::vector local_face_slot_node_counts_; + /// Prefix offsets into the compact local-face slot bank. + std::vector local_face_slot_node_offsets_; + /// Minimum number of local-face angular flux storage slots. + std::size_t max_num_local_psi_slots_ = 0; + /// Total number of local-face nodes in the compact slot bank. + std::size_t total_local_face_slot_nodes_ = 0; + /// Maximum number of nodes across all local directed faces. + std::size_t max_local_face_node_count_ = 0; + + /** + * Recompute slot-local node extents and prefix offsets from the current slot assignment. + * + * Each slot is sized to the maximum face-node extent of the local directed faces assigned + * to that slot. This preserves the exact slot count while avoiding one global slot extent. 
+ */ + void UpdateLocalFaceSlotLayout(); }; } // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.cc new file mode 100644 index 0000000000..3f8e3b71d3 --- /dev/null +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.cc @@ -0,0 +1,622 @@ +// SPDX-FileCopyrightText: 2026 The OpenSn Authors +// SPDX-License-Identifier: MIT + +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace opensn::detail +{ + +// Planner overview: +// 1. Build the reflexive transitive closure of the local CBC task DAG. +// 2. Define a face-poset reuse relation: face u may precede face v in one slot if the +// consumer cell of u reaches the producer cell of v in the task DAG. +// 3. Solve the resulting minimum chain-cover problem exactly through the standard bipartite +// maximum-matching reduction. +// 4. Extract one slot chain per unmatched right-side face and verify the resulting static +// handoff sequence before exposing it to CBC_SPDS. + +constexpr std::uint32_t INVALID_INDEX = std::numeric_limits::max(); + +// Bit-packed reachability matrix for the local cell DAG. +// Rows are padded so the closure builder can copy and OR contiguous word spans efficiently. 
+class BitMatrix +{ +public: + void ResizeAndClear(const std::size_t n) + { + n_ = n; + active_words_per_row_ = (n + 63) / 64; + padded_words_per_row_ = (active_words_per_row_ + 7) & ~std::size_t{7}; + const std::size_t required_words = n_ * padded_words_per_row_; + if (data_.size() < required_words) + data_.resize(required_words); + if (row_active_word_counts_.size() < n_) + row_active_word_counts_.resize(n_); + std::fill_n(data_.begin(), required_words, 0ULL); + std::fill_n(row_active_word_counts_.begin(), n_, std::size_t{0}); + } + + std::uint64_t* Row(const std::size_t i) noexcept + { + return data_.data() + i * padded_words_per_row_; + } + + const std::uint64_t* Row(const std::size_t i) const noexcept + { + return data_.data() + i * padded_words_per_row_; + } + + void SetBit(const std::size_t i, const std::size_t j) noexcept + { + Row(i)[j / 64] |= (1ULL << (j % 64)); + row_active_word_counts_[i] = std::max(row_active_word_counts_[i], (j / 64) + 1); + } + + bool TestBit(const std::size_t i, const std::size_t j) const noexcept + { + return (Row(i)[j / 64] & (1ULL << (j % 64))) != 0ULL; + } + + void CopyRowFromWord(const std::size_t dst, + const BitMatrix& src_mat, + const std::size_t src_row, + const std::size_t start_word) noexcept + { + const std::size_t src_active_words = src_mat.row_active_word_counts_[src_row]; + if (start_word >= src_active_words) + { + row_active_word_counts_[dst] = + std::max(row_active_word_counts_[dst], std::min(start_word, active_words_per_row_)); + return; + } + + std::uint64_t* const d = Row(dst) + start_word; + const std::uint64_t* const s = src_mat.Row(src_row) + start_word; + const std::size_t words_to_copy = src_active_words - start_word; + std::memcpy(d, s, words_to_copy * sizeof(std::uint64_t)); + row_active_word_counts_[dst] = src_active_words; + } + + void OrRowsFromWord(const std::size_t dst, + const BitMatrix& src_mat, + const std::size_t src_row, + const std::size_t start_word) noexcept + { + const std::size_t 
src_active_words = src_mat.row_active_word_counts_[src_row]; + if (start_word >= src_active_words) + return; + + std::uint64_t* const d = Row(dst) + start_word; + const std::uint64_t* const s = src_mat.Row(src_row) + start_word; + const std::size_t words_to_process = src_active_words - start_word; + for (std::size_t w = 0; w < words_to_process; ++w) + d[w] |= s[w]; + row_active_word_counts_[dst] = std::max(row_active_word_counts_[dst], src_active_words); + } + + std::size_t FindFirstSet(const std::size_t row, const std::size_t start_pos = 0) const noexcept + { + const std::uint64_t* const r = Row(row); + std::size_t w = start_pos / 64; + const std::size_t active_words = row_active_word_counts_[row]; + if (w >= active_words) + return n_; + + std::uint64_t masked = r[w] & (~0ULL << (start_pos % 64)); + if (masked) + return w * 64 + static_cast(std::countr_zero(masked)); + + for (++w; w < active_words; ++w) + { + if (r[w]) + return w * 64 + static_cast(std::countr_zero(r[w])); + } + return n_; + } + + std::size_t FindNextSet(const std::size_t row, const std::size_t pos) const noexcept + { + return FindFirstSet(row, pos + 1); + } + +private: + std::size_t n_ = 0; + std::size_t active_words_per_row_ = 0; + std::size_t padded_words_per_row_ = 0; + std::vector row_active_word_counts_; + std::vector data_; +}; + +struct DFSFrame +{ + std::uint32_t u_face_rank = INVALID_INDEX; + std::uint32_t via_v_face_rank = INVALID_INDEX; + std::uint32_t producer_rank_index = 0; + std::uint32_t producer_rank_end = 0; + std::uint32_t next_v_face_rank = 0; + std::uint32_t v_face_end = 0; +}; + +struct ThreadLocalWorkspace +{ + BitMatrix reachability; + std::vector face_mate_u; + std::vector face_mate_v; + std::vector face_dist; + std::vector face_queue; + std::vector consumer_rank_face_offsets; + std::vector consumer_rank_face_write_offsets; + std::vector faces_by_consumer_rank; + std::vector candidate_producer_rank_offsets; + std::vector candidate_producer_ranks; + std::vector 
candidate_face_counts_by_consumer_rank; + std::vector greedy_consumer_rank_order; + std::vector face_last_rank_for_slot; + std::vector dfs_frames; + + void PrepareMatching(const std::size_t num_consumer_ranks, const std::size_t num_faces) + { + face_mate_u.assign(num_faces, INVALID_INDEX); + face_mate_v.assign(num_faces, INVALID_INDEX); + face_dist.assign(num_faces, -1); + if (face_queue.size() < num_faces) + face_queue.resize(num_faces); + if (face_last_rank_for_slot.size() < num_faces) + face_last_rank_for_slot.resize(num_faces); + if (dfs_frames.capacity() < num_faces) + dfs_frames.reserve(num_faces); + consumer_rank_face_offsets.assign(num_consumer_ranks + 1, 0); + consumer_rank_face_write_offsets.assign(num_consumer_ranks, 0); + faces_by_consumer_rank.assign(num_faces, INVALID_INDEX); + candidate_producer_rank_offsets.assign(num_consumer_ranks + 1, 0); + candidate_face_counts_by_consumer_rank.assign(num_consumer_ranks, 0); + greedy_consumer_rank_order.clear(); + candidate_producer_ranks.clear(); + } +}; + +namespace +{ + +// Build the reflexive transitive closure of the local cell DAG in topological-rank space. 
+void +BuildReachability(const std::uint32_t num_tasks, + const std::vector& successor_rank_offsets, + const std::vector& successor_ranks, + ThreadLocalWorkspace& ws) +{ + ws.reachability.ResizeAndClear(num_tasks); + for (std::uint32_t i = 0; i < num_tasks; ++i) + { + const auto successor_begin = successor_ranks.begin() + successor_rank_offsets[i]; + const auto successor_end = successor_ranks.begin() + successor_rank_offsets[i + 1]; + const auto start_word = static_cast(i / 64); + + ws.reachability.SetBit(i, i); + if (successor_begin == successor_end) + continue; + + ws.reachability.CopyRowFromWord(i, ws.reachability, *successor_begin, start_word); + for (auto it = successor_begin + 1; it != successor_end; ++it) + ws.reachability.OrRowsFromWord(i, ws.reachability, *it, start_word); + } +} + +} // namespace + +// Exact minimum chain-cover solver for the local-face reuse poset. +// +// The bipartite graph is never explicity materialized. Candidate right vertices are generated +// on demand from the cached reachability rows and from the producer-face grouping created by +// CBC_SPDS::BuildLocalFaceTaskGraph(), which avoids the memory cost of an explicit dense +// face-to-face adjacency structure. 
+class LocalFaceHopcroftKarp +{ +public: + LocalFaceHopcroftKarp(const std::vector& face_producer_ranks, + const std::vector& face_consumer_ranks, + const std::vector& producer_cell_face_offsets, + std::vector& face_slot_ids, + ThreadLocalWorkspace& ws) + : num_faces_(static_cast(face_producer_ranks.size())), + face_producer_ranks_(face_producer_ranks), + face_consumer_ranks_(face_consumer_ranks), + producer_cell_face_offsets_(producer_cell_face_offsets), + face_slot_ids_(face_slot_ids), + ws_(ws) + { + ws_.PrepareMatching(producer_cell_face_offsets_.size() - 1, num_faces_); + PrepareConsumerFaceCache(); + PrepareCandidateProducerRankCache(); + PrepareGreedyOrder(); + } + + SlotSolveResult Solve() + { + if (num_faces_ == 0) + { + face_slot_ids_.clear(); + return {}; + } + + // Greedy seeding to increase the initial matching size and reduces the + // number of BFS/DFS phases that follow. + std::size_t matching_size = GreedyInit(); + while (BFS()) + { + for (std::uint32_t i = 0; i < num_faces_; ++i) + { + if (ws_.face_mate_u[i] == INVALID_INDEX and DFS(i)) + ++matching_size; + } + } + + ExtractSlotAssignment(); + const std::size_t slot_count = static_cast(num_faces_) - matching_size; + if (VerifySlotAssignment(slot_count)) + return {slot_count, false}; + + std::iota(face_slot_ids_.begin(), face_slot_ids_.end(), std::uint32_t{0}); + return {static_cast(num_faces_), true}; + } + +private: + template + void ForEachCandidate(const std::uint32_t u_face_rank, const F& fn) const + { + // The bipartite graph is implicit. For one left-side face u, the admissible right-side + // faces are all faces whose producer ranks lie in the cached reachable-producer row of + // u's consumer rank. 
+ const auto consumer_cell_rank = face_consumer_ranks_[u_face_rank]; + const auto rank_begin = ws_.candidate_producer_rank_offsets[consumer_cell_rank]; + const auto rank_end = ws_.candidate_producer_rank_offsets[consumer_cell_rank + 1]; + for (std::uint32_t rank_index = rank_begin; rank_index < rank_end; ++rank_index) + { + const auto producer_cell_rank = ws_.candidate_producer_ranks[rank_index]; + const auto face_begin = producer_cell_face_offsets_[producer_cell_rank]; + const auto face_end = producer_cell_face_offsets_[producer_cell_rank + 1]; + for (std::uint32_t v_face_rank = face_begin; v_face_rank < face_end; ++v_face_rank) + { + if (fn(v_face_rank)) + return; + } + } + } + + bool ReuseRelationHolds(const std::uint32_t u_face_rank, + const std::uint32_t v_face_rank) const noexcept + { + return ws_.reachability.TestBit(face_consumer_ranks_[u_face_rank], + face_producer_ranks_[v_face_rank]); + } + + void ExtractSlotAssignment() + { + // Every unmatched right vertex starts one chain. Following the matched left-to-right + // links recovers the full chain, and each chain becomes one reusable slot. + face_slot_ids_.assign(num_faces_, INVALID_INDEX); + std::uint32_t next_slot_id = 0; + for (std::uint32_t i = 0; i < num_faces_; ++i) + { + if (ws_.face_mate_v[i] != INVALID_INDEX) + continue; + + std::uint32_t current = i; + while (current != INVALID_INDEX) + { + face_slot_ids_[current] = next_slot_id; + current = ws_.face_mate_u[current]; + } + ++next_slot_id; + } + } + + bool VerifySlotAssignment(const std::size_t slot_count) const + { + for (std::uint32_t face = 0; face < num_faces_; ++face) + { + if (face_slot_ids_[face] >= slot_count) + return false; + } + + std::fill_n(ws_.face_last_rank_for_slot.begin(), slot_count, INVALID_INDEX); + for (std::uint32_t rank = 0; rank < num_faces_; ++rank) + { + // It is sufficient to check consecutive faces within one extracted chain. + // Transitivity of the reuse relation then covers the full chain. 
+ const auto slot_id = face_slot_ids_[rank]; + const auto prev_rank = ws_.face_last_rank_for_slot[slot_id]; + if ((prev_rank != INVALID_INDEX) and (not ReuseRelationHolds(prev_rank, rank))) + return false; + ws_.face_last_rank_for_slot[slot_id] = rank; + } + return true; + } + + std::size_t GreedyInit() + { + std::size_t count = 0; + for (const auto consumer_rank : ws_.greedy_consumer_rank_order) + { + // Process the scarcest consumer rows first. + // This preserves exactness while giving the greedy phase a better + // chance of seeding a large initial matching. + const auto face_begin = ws_.consumer_rank_face_offsets[consumer_rank]; + const auto face_end = ws_.consumer_rank_face_offsets[consumer_rank + 1]; + for (std::uint32_t face_index = face_begin; face_index < face_end; ++face_index) + { + const auto u_face_rank = ws_.faces_by_consumer_rank[face_index]; + if (ws_.face_mate_u[u_face_rank] != INVALID_INDEX) + continue; + + ForEachCandidate(u_face_rank, + [&](const std::uint32_t v_face_rank) -> bool + { + if (ws_.face_mate_v[v_face_rank] != INVALID_INDEX) + return false; + ws_.face_mate_u[u_face_rank] = v_face_rank; + ws_.face_mate_v[v_face_rank] = u_face_rank; + ++count; + return true; + }); + } + } + return count; + } + + bool BFS() + { + // Hopcroft-Karp BFS: build distance labels from all unmatched left vertices and + // stop at the first layer that reaches the null vertex. 
+ std::fill_n(ws_.face_dist.begin(), num_faces_, -1); + std::size_t head = 0; + std::size_t tail = 0; + + for (std::uint32_t i = 0; i < num_faces_; ++i) + { + if (ws_.face_mate_u[i] != INVALID_INDEX) + continue; + ws_.face_dist[i] = 0; + ws_.face_queue[tail++] = i; + } + + dist_null_ = std::numeric_limits::max(); + while (head < tail) + { + const auto u_face_rank = ws_.face_queue[head++]; + if (ws_.face_dist[u_face_rank] >= dist_null_) + continue; + + ForEachCandidate(u_face_rank, + [&](const std::uint32_t v_face_rank) -> bool + { + const auto mate_of_v = ws_.face_mate_v[v_face_rank]; + if (mate_of_v == INVALID_INDEX) + { + if (dist_null_ == std::numeric_limits::max()) + dist_null_ = ws_.face_dist[u_face_rank] + 1; + } + else if (ws_.face_dist[mate_of_v] == -1) + { + ws_.face_dist[mate_of_v] = ws_.face_dist[u_face_rank] + 1; + ws_.face_queue[tail++] = mate_of_v; + } + return false; + }); + } + + return dist_null_ != std::numeric_limits::max(); + } + + bool DFS(const std::uint32_t u_face_rank) + { + // Hopcroft-Karp DFS, implemented iteratively. + // Each frame represents one left vertex together with the current + // position in its implicit adjacency row. This avoids recursion while + // preserving the same augmenting-path search. + ws_.dfs_frames.clear(); + PushDFSFrame(u_face_rank, INVALID_INDEX); + + while (not ws_.dfs_frames.empty()) + { + auto& frame = ws_.dfs_frames.back(); + const auto current_u = frame.u_face_rank; + const auto current_dist = ws_.face_dist[current_u]; + + bool descended = false; + while (AdvanceFrame(frame)) + { + const auto v_face_rank = frame.next_v_face_rank++; + const auto mate_of_v = ws_.face_mate_v[v_face_rank]; + if (mate_of_v == INVALID_INDEX) + { + if (dist_null_ != current_dist + 1) + continue; + + // An augmenting path has been found. Walk back through the explicit stack and flip + // the matching along the full alternating path. 
+ ws_.face_mate_v[v_face_rank] = current_u; + ws_.face_mate_u[current_u] = v_face_rank; + ws_.face_dist[current_u] = -1; + for (std::size_t depth = ws_.dfs_frames.size(); depth-- > 1;) + { + const auto parent_u = ws_.dfs_frames[depth - 1].u_face_rank; + const auto via_v_face_rank = ws_.dfs_frames[depth].via_v_face_rank; + ws_.face_mate_v[via_v_face_rank] = parent_u; + ws_.face_mate_u[parent_u] = via_v_face_rank; + ws_.face_dist[parent_u] = -1; + } + return true; + } + + if (ws_.face_dist[mate_of_v] != current_dist + 1) + continue; + + PushDFSFrame(mate_of_v, v_face_rank); + descended = true; + break; + } + + if (descended) + continue; + + ws_.face_dist[current_u] = -1; + ws_.dfs_frames.pop_back(); + } + + return false; + } + + void PrepareConsumerFaceCache() + { + // Regroup left-side faces by consumer rank once so both greedy seeding and layered + // matching traverse contiguous face ranges instead of repeatedly filtering the face list. + for (const auto consumer_rank : face_consumer_ranks_) + ++ws_.consumer_rank_face_offsets[consumer_rank + 1]; + + std::partial_sum(ws_.consumer_rank_face_offsets.begin(), + ws_.consumer_rank_face_offsets.end(), + ws_.consumer_rank_face_offsets.begin()); + std::copy_n(ws_.consumer_rank_face_offsets.begin(), + ws_.consumer_rank_face_write_offsets.size(), + ws_.consumer_rank_face_write_offsets.begin()); + + for (std::uint32_t u_face_rank = 0; u_face_rank < num_faces_; ++u_face_rank) + { + const auto consumer_rank = face_consumer_ranks_[u_face_rank]; + const auto write_index = ws_.consumer_rank_face_write_offsets[consumer_rank]++; + ws_.faces_by_consumer_rank[write_index] = u_face_rank; + } + } + + void PrepareCandidateProducerRankCache() + { + // Cache the sparse producer-rank rows of the implicit bipartite graph. All faces with + // the same consumer rank share the same reachable producer ranks. 
+ const auto num_consumer_ranks = producer_cell_face_offsets_.size() - 1; + for (std::size_t consumer_rank = 0; consumer_rank < num_consumer_ranks; ++consumer_rank) + { + ws_.candidate_producer_rank_offsets[consumer_rank] = + static_cast(ws_.candidate_producer_ranks.size()); + + if (ws_.consumer_rank_face_offsets[consumer_rank] == + ws_.consumer_rank_face_offsets[consumer_rank + 1]) + continue; + + std::uint32_t candidate_face_count = 0; + for (std::size_t producer_rank = ws_.reachability.FindFirstSet(consumer_rank, consumer_rank); + producer_rank < num_consumer_ranks; + producer_rank = ws_.reachability.FindNextSet(consumer_rank, producer_rank)) + { + const auto face_begin = producer_cell_face_offsets_[producer_rank]; + const auto face_end = producer_cell_face_offsets_[producer_rank + 1]; + if (face_begin == face_end) + continue; + + ws_.candidate_producer_ranks.push_back(static_cast(producer_rank)); + candidate_face_count += face_end - face_begin; + } + ws_.candidate_face_counts_by_consumer_rank[consumer_rank] = candidate_face_count; + } + + ws_.candidate_producer_rank_offsets.back() = + static_cast(ws_.candidate_producer_ranks.size()); + } + + void PrepareGreedyOrder() + { + // Order nonempty consumer rows by increasing right-side candidate count. + // This affects only the heuristic seed matching, not the final result. 
+ const auto num_consumer_ranks = producer_cell_face_offsets_.size() - 1; + ws_.greedy_consumer_rank_order.reserve(num_consumer_ranks); + for (std::uint32_t consumer_rank = 0; consumer_rank < num_consumer_ranks; ++consumer_rank) + { + if (ws_.consumer_rank_face_offsets[consumer_rank] == + ws_.consumer_rank_face_offsets[consumer_rank + 1]) + continue; + ws_.greedy_consumer_rank_order.push_back(consumer_rank); + } + + std::sort(ws_.greedy_consumer_rank_order.begin(), + ws_.greedy_consumer_rank_order.end(), + [&](const std::uint32_t lhs, const std::uint32_t rhs) + { + const auto lhs_count = ws_.candidate_face_counts_by_consumer_rank[lhs]; + const auto rhs_count = ws_.candidate_face_counts_by_consumer_rank[rhs]; + if (lhs_count != rhs_count) + return lhs_count < rhs_count; + return lhs < rhs; + }); + } + + void PushDFSFrame(const std::uint32_t u_face_rank, const std::uint32_t via_v_face_rank) + { + // Materialize the current state of one implicit adjacency-row scan on the DFS stack. + const auto consumer_rank = face_consumer_ranks_[u_face_rank]; + const auto producer_rank_index = ws_.candidate_producer_rank_offsets[consumer_rank]; + const auto producer_rank_end = ws_.candidate_producer_rank_offsets[consumer_rank + 1]; + ws_.dfs_frames.push_back( + {u_face_rank, via_v_face_rank, producer_rank_index, producer_rank_end, 0, 0}); + } + + bool AdvanceFrame(DFSFrame& frame) const + { + // Advance the current DFS frame to the next candidate right vertex. The frame stores + // both the producer-rank row cursor and the face-range cursor within that row. 
+ while (true) + { + if (frame.next_v_face_rank < frame.v_face_end) + return true; + if (frame.producer_rank_index >= frame.producer_rank_end) + return false; + + const auto producer_rank = ws_.candidate_producer_ranks[frame.producer_rank_index++]; + frame.next_v_face_rank = producer_cell_face_offsets_[producer_rank]; + frame.v_face_end = producer_cell_face_offsets_[producer_rank + 1]; + } + } + + std::uint32_t num_faces_ = 0; + const std::vector& face_producer_ranks_; + const std::vector& face_consumer_ranks_; + const std::vector& producer_cell_face_offsets_; + std::vector& face_slot_ids_; + ThreadLocalWorkspace& ws_; + int dist_null_ = 0; +}; + +SlotSolveResult +ComputeLocalFaceSlotPlan(const std::vector& successor_rank_offsets, + const std::vector& successor_ranks, + const std::vector& face_producer_ranks, + const std::vector& face_consumer_ranks, + const std::vector& producer_cell_face_offsets, + std::vector& face_slot_ids) +{ + if (face_producer_ranks.empty()) + { + face_slot_ids.clear(); + return {}; + } + + static thread_local ThreadLocalWorkspace workspace; + BuildReachability(static_cast(successor_rank_offsets.size() - 1), + successor_rank_offsets, + successor_ranks, + workspace); + + LocalFaceHopcroftKarp slot_planner( + face_producer_ranks, face_consumer_ranks, producer_cell_face_offsets, face_slot_ids, workspace); + return slot_planner.Solve(); +} + +} // namespace opensn::detail diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.h new file mode 100644 index 0000000000..83f493225d --- /dev/null +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc_slot_planner.h @@ -0,0 +1,77 @@ +// SPDX-FileCopyrightText: 2026 The OpenSn Authors +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include + +namespace opensn::detail +{ + +/// Result of an exact local-face 
slot-planning solve. +struct SlotSolveResult +{ + /// Exact number of reusable slots required by the computed chain cover. + std::size_t slot_count = 0; + /// Flag indicating that the post-solve verifier rejected the computed assignment. + bool verifier_rejected = false; +}; + +/** + * Compute the exact minimum safe local-face slot assignment. + * + * Let `F` denote the local directed faces, and define `u < v` when the consumer cell of + * face `u` reaches the producer cell of face `v` in the local CBC task DAG. This is the + * safe reuse relation: if `u < v`, then every admissible CBC or CBCD sweep consumes the + * angular flux stored for `u` before `v` may overwrite the same slot. + * + * Computing the minimum number of reusable cell-face slots is equivalent to + * the minimum chain-cover problem for the induced face poset. + * A chain is one statically reusable slot. The minimum number of slots equals the poset + * width (i.e. the maximum cardinality of any antichain of pairwise incomparable + * faces). By Dilworth's theorem, this is the minimum chain-cover cardinality. + * + * The implementation uses the standard bipartite split-graph reduction. The reuse relation + * defines the bipartite edges, Hopcroft-Karp computes a maximum cardinality matching, and + * the matching induces a minimum chain cover of size `|F| - |M|`. Koenig's theorem provides + * the matching-cover duality for the bipartite graph. Consequently, the returned slot count + * is exact. + * + * Algorithm flow: + * 1. Build the reflexive transitive closure of the local CBC task DAG in topological-rank + * space. + * 2. Group local directed faces by consumer-cell rank and cache the reachable producer-cell + * ranks that define the reuse graph rows. + * 3. Run Hopcroft-Karp on the implicit bipartite reuse graph: + * a. greedy seeding, + * b. BFS layer construction, + * c. iterative DFS augmentation. + * 4. Extract one slot chain per unmatched right-side face. + * 5. 
Verify the extracted assignment and report whether the caller should fall back to the + * identity assignment. + * + * After chain extraction, the assignment is verified by checking each consecutive reuse + * handoff in face enumeration order. If the verifier rejects the result, the caller may + * conservatively fall back to the identity assignment. + * + * \param successor_rank_offsets Offsets into the flat successor-rank adjacency list of the + * local CBC task DAG. + * \param successor_ranks Flat successor-rank adjacency list of the local CBC task DAG. + * \param face_producer_ranks Producer-cell topological rank for each local directed face. + * \param face_consumer_ranks Consumer-cell topological rank for each local directed face. + * \param producer_cell_face_offsets Offsets grouping local faces by producer-cell topological + * rank. + * \param face_slot_ids Output slot assignment keyed by local face rank. + * \return Exact slot count and verifier status for the computed assignment. 
+ */ +SlotSolveResult +ComputeLocalFaceSlotPlan(const std::vector& successor_rank_offsets, + const std::vector& successor_ranks, + const std::vector& face_producer_ranks, + const std::vector& face_consumer_ranks, + const std::vector& producer_cell_face_offsets, + std::vector& face_slot_ids); + +} // namespace opensn::detail From 2ea48b7d6e08e99e15dca2afb3f6086aaa86b598 Mon Sep 17 00:00:00 2001 From: Eappen Nelluvelil Date: Sun, 12 Apr 2026 17:39:06 -0500 Subject: [PATCH 5/6] CBC_FLUDS uses minimally sized local psi buffer --- .../discrete_ordinates_problem.cc | 10 +- .../sweep/angle_set/cbc_angle_set.cc | 72 +++-- .../sweep/angle_set/cbc_angle_set.h | 68 +++- .../sweep/communicators/cbc_async_comm.cc | 217 ++++++++----- .../sweep/communicators/cbc_async_comm.h | 68 +++- .../sweep/fluds/cbc_fluds.cc | 165 ++++++---- .../sweep/fluds/cbc_fluds.h | 197 ++++++++---- .../sweep/fluds/cbc_fluds_common_data.cc | 101 +++++- .../sweep/fluds/cbc_fluds_common_data.h | 143 ++++++++- .../sweep_chunks/cbc_avx_sweep_chunk.cc | 162 ++++++---- .../sweep_chunks/cbc_sweep_chunk.cc | 60 ++-- .../sweep_chunks/cbc_sweep_chunk.h | 58 ++-- .../sweep_chunks/cbc_sweep_chunk_shared.h | 141 ++++----- .../sweep_chunks/cbc_sweep_chunk_td.cc | 81 ++--- .../sweep_chunks/cbc_sweep_chunk_td.h | 25 +- .../sweep_chunks/cbc_sweep_kernels.h | 298 ++++++++++++++---- 16 files changed, 1327 insertions(+), 539 deletions(-) diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc index dddec78344..bf9076043a 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc @@ -1922,12 +1922,10 @@ DiscreteOrdinatesProblem::InitFluxDataStructures(LBSGroupset& groupset) } else { - fluds = - std::make_shared(gs_num_grps, - angle_indices.size(), - 
dynamic_cast(fluds_common_data), - groupset.psi_uk_man_, - *discretization_); + fluds = std::make_shared( + gs_num_grps, + angle_indices.size(), + dynamic_cast(fluds_common_data)); } std::shared_ptr angle_set; diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.cc index 999675b039..8237eb543d 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.cc @@ -3,6 +3,7 @@ #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/sweep_chunk.h" #include "framework/mesh/mesh_continuum/mesh_continuum.h" @@ -10,6 +11,7 @@ #include "framework/logging/log.h" #include "framework/runtime.h" #include "caliper/cali.h" +#include namespace opensn { @@ -24,8 +26,27 @@ CBC_AngleSet::CBC_AngleSet(size_t id, : AngleSet(id, num_groups, spds, fluds, angle_indices, boundaries), cbc_spds_(dynamic_cast(spds_)), ready_tasks_(), - async_comm_(id, *fluds, comm_set) + async_comm_(id, *fluds, comm_set), + cbc_fluds_(dynamic_cast(*fluds)) { + + const auto& task_list = cbc_spds_.GetTaskList(); + const auto num_tasks = task_list.size(); + initial_dependencies_.resize(num_tasks); + remaining_dependencies_.resize(num_tasks); + initial_ready_tasks_.reserve(num_tasks); + ready_tasks_.reserve(num_tasks); + + for (std::uint32_t task_idx = 0; task_idx < num_tasks; ++task_idx) + { + const auto& task = 
task_list[task_idx]; + const auto num_dependencies = task.num_dependencies; + initial_dependencies_[task_idx] = num_dependencies; + if (num_dependencies == 0) + initial_ready_tasks_.push_back(task_idx); + } + + ResetTaskState(); } AsynchronousCommunicator* @@ -42,24 +63,15 @@ CBC_AngleSet::AngleSetAdvance(SweepChunk& sweep_chunk, AngleSetStatus permission if (executed_) return AngleSetStatus::FINISHED; - if (current_task_list_.empty()) - { - current_task_list_ = cbc_spds_.GetTaskList(); - // Build initial ready queue - ready_tasks_.reserve(current_task_list_.size()); - for (size_t i = 0; i < current_task_list_.size(); ++i) - if ((current_task_list_[i].num_dependencies == 0) and (not current_task_list_[i].completed)) - ready_tasks_.push_back(i); - } - + const auto& task_list = cbc_spds_.GetTaskList(); sweep_chunk.SetAngleSet(*this); - auto tasks_who_received_data = async_comm_.ReceiveData(); + const auto tasks_who_received_data = async_comm_.ReceiveData(); - for (const std::uint64_t task_number : tasks_who_received_data) + for (const auto& task_number : tasks_who_received_data) { - if ((--current_task_list_[task_number].num_dependencies == 0) and - (not current_task_list_[task_number].completed)) + assert(remaining_dependencies_[task_number] > 0); + if (--remaining_dependencies_[task_number] == 0) ready_tasks_.push_back(task_number); } @@ -74,24 +86,23 @@ CBC_AngleSet::AngleSetAdvance(SweepChunk& sweep_chunk, AngleSetStatus permission { const auto task_idx = ready_tasks_.back(); ready_tasks_.pop_back(); - auto& cell_task = current_task_list_[task_idx]; + const auto& cell_task = task_list[task_idx]; sweep_chunk.SetCell(cell_task.cell_ptr, *this); sweep_chunk.Sweep(*this); for (const auto& local_task_num : cell_task.successors) { - if ((--current_task_list_[local_task_num].num_dependencies == 0) and - (not current_task_list_[local_task_num].completed)) + assert(remaining_dependencies_[local_task_num] > 0); + if (--remaining_dependencies_[local_task_num] == 0) 
ready_tasks_.push_back(local_task_num); } - cell_task.completed = true; - ++num_completed_tasks; + ++num_completed_tasks_; async_comm_.SendData(); } - const bool all_tasks_completed = (num_completed_tasks == current_task_list_.size()); + const bool all_tasks_completed = (num_completed_tasks_ == task_list.size()); const bool all_messages_sent = async_comm_.SendData(); if (all_tasks_completed and all_messages_sent) @@ -109,14 +120,21 @@ CBC_AngleSet::AngleSetAdvance(SweepChunk& sweep_chunk, AngleSetStatus permission void CBC_AngleSet::ResetSweepBuffers() { - current_task_list_.clear(); - ready_tasks_.clear(); - num_completed_tasks = 0; + ResetTaskState(); async_comm_.Reset(); fluds_->ClearLocalAndReceivePsi(); executed_ = false; } +void +CBC_AngleSet::ResetTaskState() +{ + std::copy( + initial_dependencies_.begin(), initial_dependencies_.end(), remaining_dependencies_.begin()); + ready_tasks_ = initial_ready_tasks_; + num_completed_tasks_ = 0; +} + const double* CBC_AngleSet::PsiBoundary(uint64_t boundary_id, unsigned int angle_num, @@ -126,12 +144,8 @@ CBC_AngleSet::PsiBoundary(uint64_t boundary_id, unsigned int g, bool surface_source_active) { - if (boundaries_[boundary_id]->IsReflecting()) - return boundaries_[boundary_id]->PsiIncoming(cell_local_id, face_num, fi, angle_num, g); - - if (not surface_source_active) + if ((not boundaries_[boundary_id]->IsReflecting()) and (not surface_source_active)) return boundaries_[boundary_id]->ZeroFlux(g); - return boundaries_[boundary_id]->PsiIncoming(cell_local_id, face_num, fi, angle_num, g); } diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h index ba127849db..266aef7cd8 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h @@ 
-9,11 +9,29 @@ namespace opensn { +class CBC_FLUDS; class CBC_SPDS; +/** + * Host-side CBC angle set. + * + * Owns the local CBC task state for one angle set and advances the host CBC sweep + * by combining local task execution with non-local message progress. + */ class CBC_AngleSet : public AngleSet { public: + /** + * Construct the CBC angle set. + * + * \param id Angle-set ID. + * \param num_groups Number of groups in the angle set. + * \param spds Sweep plane data structure for this angle set. + * \param fluds CBC FLUDS instance for this angle set. + * \param angle_indices Global angle indices represented by this angle set. + * \param boundaries Sweep-boundary table indexed by boundary ID. + * \param comm_set MPI communicator set used for receives. + */ CBC_AngleSet(size_t id, unsigned int num_groups, const SPDS& spds, @@ -22,14 +40,19 @@ class CBC_AngleSet : public AngleSet std::map>& boundaries, const MPICommunicatorSet& comm_set); + /// Return the delayed-data communicator for this angle set. AsynchronousCommunicator* GetCommunicator() override; + /// Initialize delayed upstream data before the sweep starts. void InitializeDelayedUpstreamData() override {} + /// Return the buffered-message limit used by the scheduler. int GetMaxBufferMessages() const override { return 0; } - void SetMaxBufferMessages(int new_max) override {} + /// Set the buffered-message limit used by the scheduler. + void SetMaxBufferMessages(int max_buffer_messages) override {} + /// Advance the host CBC angle set by one scheduler step. AngleSetStatus AngleSetAdvance(SweepChunk& sweep_chunk, AngleSetStatus permission) override; AngleSetStatus FlushSendBuffers() override @@ -40,8 +63,21 @@ class CBC_AngleSet : public AngleSet void ResetSweepBuffers() override; + /// Report whether delayed upstream data has been received. bool ReceiveDelayedData() override { return true; } + /** + * Return the incoming boundary angular flux for one boundary face node. 
+ * + * \param boundary_id Boundary ID. + * \param angle_num Angle index within the angle set. + * \param cell_local_id Local cell ID. + * \param face_num Face ID on the cell. + * \param fi Face-node index. + * \param g Group index. + * \param surface_source_active Flag if surface source is active. + * \return Pointer to the requested incoming boundary value. + */ const double* PsiBoundary(uint64_t boundary_id, unsigned int angle_num, uint64_t cell_local_id, @@ -50,6 +86,16 @@ class CBC_AngleSet : public AngleSet unsigned int g, bool surface_source_active) override; + /** + * Return the outgoing reflecting-boundary storage for one face node. + * + * \param boundary_id Boundary ID. + * \param angle_num Angle index within the angle set. + * \param cell_local_id Local cell ID. + * \param face_num Face ID on the cell. + * \param fi Face-node index. + * \return Pointer to the reflected outgoing storage for the node. + */ double* PsiReflected(uint64_t boundary_id, unsigned int angle_num, uint64_t cell_local_id, @@ -57,11 +103,25 @@ class CBC_AngleSet : public AngleSet unsigned int fi) override; protected: + /// Reset the mutable local-task state before a new sweep. + void ResetTaskState(); + + /// CBC sweep plane data structure for this angle set. const CBC_SPDS& cbc_spds_; - std::vector current_task_list_; - std::vector ready_tasks_; - size_t num_completed_tasks = 0; + /// Initial predecessor counts per local CBC task. + std::vector initial_dependencies_; + /// Local tasks that are ready at the start of a sweep. + std::vector initial_ready_tasks_; + /// Mutable predecessor counts for the current sweep. + std::vector remaining_dependencies_; + /// Local tasks ready to execute. + std::vector ready_tasks_; + /// Number of completed local tasks. + size_t num_completed_tasks_ = 0; + /// Asynchronous communicator for this angle set. CBC_AsynchronousCommunicator async_comm_; + /// CBC FLUDS instance for this angle set. 
+ CBC_FLUDS& cbc_fluds_; }; } // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cc index 44c61c777e..86f08029dd 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.cc @@ -2,82 +2,146 @@ // SPDX-License-Identifier: MIT #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/spds.h" #include "framework/mesh/mesh_continuum/mesh_continuum.h" #include "framework/mpi/mpi_comm_set.h" -#include "framework/logging/log.h" #include "framework/runtime.h" #include "caliper/cali.h" #include -#include +#include +#include namespace opensn { +namespace detail +{ + +namespace +{ + +template +void +AppendBytes(std::vector& buffer, const T& value) +{ + const size_t old_size = buffer.size(); + buffer.resize(old_size + sizeof(T)); + std::memcpy(buffer.data() + old_size, &value, sizeof(T)); +} + +template +T +ReadBytes(std::span buffer, size_t& offset) +{ + T value; + std::memcpy(&value, buffer.data() + offset, sizeof(T)); + offset += sizeof(T); + return value; +} + +} // namespace + +} // namespace detail + +CBC_AsynchronousCommunicator::CBC_AsynchronousCommunicator(size_t angle_set_id, + FLUDS& fluds, + const MPICommunicatorSet& comm_set) + : AsynchronousCommunicator(fluds, comm_set), + angle_set_id_(angle_set_id), + cbc_fluds_(dynamic_cast(fluds)) +{ + const auto& cbc_common = dynamic_cast(cbc_fluds_.GetCommonData()); + const auto num_deplocs = fluds_.GetSPDS().GetLocationSuccessors().size(); + + 
outgoing_message_queue_.reserve(cbc_common.GetNumOutgoingNonlocalFaces()); + send_buffer_.reserve(num_deplocs); + destination_buffer_bytes_.assign(num_deplocs, 0); + destination_buffer_indices_.assign(num_deplocs, std::numeric_limits::max()); + + constexpr size_t header_bytes = sizeof(std::uint64_t) + sizeof(unsigned int) + sizeof(size_t); + for (size_t deplocI = 0; deplocI < num_deplocs; ++deplocI) + { + destination_buffer_bytes_[deplocI] = + cbc_common.GetDeplocIFaceNodeCount(deplocI) * cbc_fluds_.GetStrideSize() * sizeof(double) + + cbc_common.GetDeplocIFaceCount(deplocI) * header_bytes; + } +} + std::vector& CBC_AsynchronousCommunicator::InitGetDownwindMessageData(int location_id, - uint64_t cell_global_id, + std::uint64_t cell_global_id, unsigned int face_id, - size_t angle_set_id, - size_t data_size) + std::size_t angle_set_id, + std::size_t data_size) { MessageKey key{location_id, cell_global_id, face_id}; - std::vector& data = outgoing_message_queue_[key]; - if (data.empty()) - data.assign(data_size, 0.0); + auto [it, inserted] = outgoing_message_queue_.try_emplace(key); + std::vector& data = it->second; + if (inserted) + data.resize(data_size); return data; } -bool -CBC_AsynchronousCommunicator::SendData() +void +CBC_AsynchronousCommunicator::QueueOutgoingMessages() { - CALI_CXX_MARK_SCOPE("CBC_AsynchronousCommunicator::SendData"); - - // First we convert any new outgoing messages from the queue into - // buffer messages. 
We aggregate these messages per location-id - // they need to be sent to - if (not outgoing_message_queue_.empty()) + if (outgoing_message_queue_.empty()) + return; + std::fill(destination_buffer_indices_.begin(), + destination_buffer_indices_.end(), + std::numeric_limits::max()); + for (const auto& [msg_key, data] : outgoing_message_queue_) { - std::map locI_buffer_map; - - for (const auto& [msg_key, data] : outgoing_message_queue_) + const int locI = std::get<0>(msg_key); + const std::uint64_t cell_global_id = std::get<1>(msg_key); + const unsigned int face_id = std::get<2>(msg_key); + const size_t data_size = data.size(); + const auto deplocI = static_cast(fluds_.GetSPDS().MapLocJToDeplocI(locI)); + + auto buffer_index = destination_buffer_indices_[deplocI]; + if (buffer_index == std::numeric_limits::max()) { - const int locI = std::get<0>(msg_key); - const uint64_t cell_global_id = std::get<1>(msg_key); - const unsigned int face_id = std::get<2>(msg_key); - const size_t data_size = data.size(); - - BufferItem& buffer_item = locI_buffer_map[locI]; - buffer_item.destination = locI; - auto& buffer_array = buffer_item.data_array; - buffer_array.Write(cell_global_id); - buffer_array.Write(face_id); - buffer_array.Write(data_size); - - auto& raw = buffer_array.Data(); - const size_t old_size = raw.size(); - const size_t num_bytes = data_size * sizeof(double); - raw.resize(old_size + num_bytes); - std::memcpy(raw.data() + old_size, data.data(), num_bytes); + buffer_index = send_buffer_.size(); + destination_buffer_indices_[deplocI] = buffer_index; + send_buffer_.emplace_back(); + send_buffer_.back().destination = locI; + send_buffer_.back().data.reserve(destination_buffer_bytes_[deplocI]); } - for (auto& [locI, buffer] : locI_buffer_map) - send_buffer_.push_back(std::move(buffer)); + auto& buffer_item = send_buffer_[buffer_index]; + auto& buffer = buffer_item.data; + detail::AppendBytes(buffer, cell_global_id); + detail::AppendBytes(buffer, face_id); + 
detail::AppendBytes(buffer, data_size); + + const size_t old_size = buffer.size(); + const size_t num_bytes = data_size * sizeof(double); + buffer.resize(old_size + num_bytes); + std::memcpy(buffer.data() + old_size, data.data(), num_bytes); + } + outgoing_message_queue_.clear(); +} + +bool +CBC_AsynchronousCommunicator::SendData() +{ + CALI_CXX_MARK_SCOPE("CBC_AsynchronousCommunicator::SendData"); - outgoing_message_queue_.clear(); - } // if there are outgoing messages + QueueOutgoingMessages(); - // Now we attempt to flush items in the send buffer bool all_messages_sent = true; - for (auto& buffer_item : send_buffer_) + size_t next_open_buffer = 0; + for (size_t buffer_idx = 0; buffer_idx < send_buffer_.size(); ++buffer_idx) { + auto& buffer_item = send_buffer_[buffer_idx]; if (not buffer_item.send_initiated) { const int locJ = buffer_item.destination; const auto& comm = comm_set_.LocICommunicator(locJ); auto dest = comm_set_.MapIonJ(locJ, locJ); auto tag = static_cast(angle_set_id_); - buffer_item.mpi_request = comm.isend(dest, tag, buffer_item.data_array.Data()); + buffer_item.mpi_request = comm.isend(dest, tag, buffer_item.data); buffer_item.send_initiated = true; } @@ -88,8 +152,16 @@ CBC_AsynchronousCommunicator::SendData() else all_messages_sent = false; } - } // for item in buffer + if (not buffer_item.completed) + { + if (next_open_buffer != buffer_idx) + send_buffer_[next_open_buffer] = std::move(buffer_item); + ++next_open_buffer; + } + } + + send_buffer_.resize(next_open_buffer); return all_messages_sent; } @@ -98,40 +170,33 @@ CBC_AsynchronousCommunicator::ReceiveData() { CALI_CXX_MARK_SCOPE("CBC_AsynchronousCommunicator::ReceiveData"); - std::unordered_map, FLUDS::CellFaceKeyHash> - received_messages; - std::vector cells_who_received_data; - const auto& location_dependencies = fluds_.GetSPDS().GetLocationDependencies(); - auto& deplocs_outgoing_messages = fluds_.GetDeplocsOutgoingMessages(); - for (int locJ : location_dependencies) + std::vector 
cells_who_received_data; + const auto& comm = comm_set_.LocICommunicator(opensn::mpi_comm.rank()); + const auto tag = static_cast(angle_set_id_); + + mpi::Status status; + while (comm.iprobe(ANY_SOURCE, tag, status)) { - const auto& comm = comm_set_.LocICommunicator(opensn::mpi_comm.rank()); - auto source_rank = comm_set_.MapIonJ(locJ, opensn::mpi_comm.rank()); - auto tag = static_cast(angle_set_id_); - mpi::Status status; - if (comm.iprobe(source_rank, tag, status)) + const int source_rank = status.source(); + const int num_items = status.count(); + receive_buffer_.resize(static_cast(num_items)); + comm.recv(source_rank, status.tag(), receive_buffer_.data(), num_items); + size_t offset = 0; + const std::span data_array(receive_buffer_); + + while (offset < data_array.size()) { - int num_items = status.count(); - std::vector recv_buffer(num_items); - comm.recv(source_rank, status.tag(), recv_buffer.data(), num_items); - ByteArray data_array(recv_buffer); - - while (not data_array.EndOfBuffer()) - { - const auto cell_global_id = data_array.Read(); - const auto face_id = data_array.Read(); - const auto data_size = data_array.Read(); - - std::vector psi_data(data_size); - const size_t num_bytes = data_size * sizeof(double); - std::memcpy(psi_data.data(), &data_array.Data()[data_array.Offset()], num_bytes); - data_array.Seek(data_array.Offset() + num_bytes); - - deplocs_outgoing_messages[{cell_global_id, face_id}] = std::move(psi_data); - cells_who_received_data.push_back( - fluds_.GetSPDS().GetGrid()->MapCellGlobalID2LocalID(cell_global_id)); - } // while not at end of buffer - } // Process each message embedded in buffer + const auto cell_global_id = detail::ReadBytes(data_array, offset); + const auto face_id = detail::ReadBytes(data_array, offset); + const auto data_size = detail::ReadBytes(data_array, offset); + + const size_t num_bytes = data_size * sizeof(double); + const auto cell_local_id = cbc_fluds_.StoreIncomingFaceData( + cell_global_id, face_id, 
data_array.data() + offset, data_size); + offset += num_bytes; + + cells_who_received_data.push_back(cell_local_id); + } } return cells_who_received_data; diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h index ead2c03bd9..9e2329d80d 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h @@ -4,42 +4,63 @@ #pragma once #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/async_comm.h" -#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds.h" -#include "framework/data_types/byte_array.h" #include "mpicpp-lite/mpicpp-lite.h" +#include +#include #include #include -#include -#include namespace mpi = mpicpp_lite; namespace opensn { +class CBC_FLUDS; class MPICommunicatorSet; -class ByteArray; +/** + * Host-side CBC delayed-data communicator. + * + * Packs outgoing non-local face data by destination locality, performs asynchronous + * sends, and receives upwind data needed by the host CBC sweep. + */ class CBC_AsynchronousCommunicator : public AsynchronousCommunicator { public: + /** + * Construct the CBC delayed-data communicator. + * + * \param angle_set_id Owning angle-set ID. + * \param fluds CBC FLUDS instance served by this communicator. + * \param comm_set MPI communicator set. + */ explicit CBC_AsynchronousCommunicator(size_t angle_set_id, FLUDS& fluds, - const MPICommunicatorSet& comm_set) - : AsynchronousCommunicator(fluds, comm_set), angle_set_id_(angle_set_id) - { - } - + const MPICommunicatorSet& comm_set); + + /** + * Initialize one outgoing message payload and return its writable data vector. + * + * \param location_id Destination locality ID. 
+ * \param cell_global_id Destination cell global ID. + * \param face_id Destination face ID. + * \param angle_set_id Producing angle-set ID. + * \param data_size Number of doubles to store in the payload. + * \return Writable payload vector for the outgoing face data. + */ std::vector& InitGetDownwindMessageData(int location_id, uint64_t cell_global_id, unsigned int face_id, size_t angle_set_id, size_t data_size); + /// Send all currently queued outgoing messages. bool SendData(); + /// Receive all currently available upwind messages. std::vector ReceiveData(); + /// Clear all queued outgoing state. void Reset() { outgoing_message_queue_.clear(); @@ -47,12 +68,13 @@ class CBC_AsynchronousCommunicator : public AsynchronousCommunicator } protected: + /// Owning angle-set ID. const size_t angle_set_id_; - /// location_id, cell_global_id, face_id + /// Outgoing message key: `(location_id, cell_global_id, face_id)`. using MessageKey = std::tuple; - /// boost::hash_combine hash function for MessageKey. + /// Hash for MessageKey. struct MessageKeyHash { std::size_t operator()(const MessageKey& key) const noexcept @@ -64,17 +86,37 @@ class CBC_AsynchronousCommunicator : public AsynchronousCommunicator } }; + /// Outgoing face payloads grouped by destination key. std::unordered_map, MessageKeyHash> outgoing_message_queue_; + /// In-flight send buffer record. struct BufferItem { + /// Destination locality. int destination = 0; + /// MPI request for the send. mpi::Request mpi_request; + /// Flag indicating that the send was posted. bool send_initiated = false; + /// Flag indicating that the send completed. bool completed = false; - ByteArray data_array; + /// Packed outgoing message bytes. + std::vector data; }; + /// In-flight outgoing message buffers. std::vector send_buffer_; + /// CBC FLUDS instance served by this communicator. + CBC_FLUDS& cbc_fluds_; + /// Scratch receive buffer for incoming messages. 
+ std::vector receive_buffer_; + /// Packed byte counts per destination locality. + std::vector destination_buffer_bytes_; + /// Send-buffer indices grouped by destination locality. + std::vector destination_buffer_indices_; + +private: + /// Pack the queued outgoing face payloads into send buffers. + void QueueOutgoingMessages(); }; } // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.cc index c6e85f9be4..972eb68d70 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.cc @@ -3,91 +3,130 @@ #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/spds.h" -#include "framework/math/spatial_discretization/spatial_discretization.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h" +#include "framework/mesh/cell/cell.h" #include "framework/mesh/mesh_continuum/mesh_continuum.h" -#include "caliper/cali.h" +#include +#include +#include +#include namespace opensn { -CBC_FLUDS::CBC_FLUDS(unsigned int num_groups, - size_t num_angles, - const CBC_FLUDSCommonData& common_data, - const UnknownManager& psi_uk_man, - const SpatialDiscretization& sdm) - : FLUDS(num_groups, num_angles, common_data.GetSPDS()), - common_data_(common_data), - psi_uk_man_(psi_uk_man), - sdm_(sdm), - num_angles_in_gs_quadrature_(psi_uk_man_.GetNumberOfUnknowns()), - num_quadrature_local_dofs_(sdm_.GetNumLocalDOFs(psi_uk_man_)), - num_local_spatial_dofs_(num_quadrature_local_dofs_ / num_angles_in_gs_quadrature_ / - num_groups_), - local_psi_data_size_(num_local_spatial_dofs_ * num_groups_and_angles_), - local_psi_data_(local_psi_data_size_) +namespace detail { - const 
auto& grid = *spds_.GetGrid(); - cell_psi_start_.resize(grid.local_cells.size()); - for (const auto& cell : grid.local_cells) - { - cell_psi_start_[cell.local_id] = - (sdm_.MapDOFLocal(cell, 0, psi_uk_man_, 0, 0) / num_angles_in_gs_quadrature_ / num_groups_) * - num_groups_and_angles_; - } - deplocs_outgoing_messages_.reserve(common_data.GetNumIncomingNonlocalFaces()); +namespace +{ + +constexpr std::size_t LOCAL_PSI_ALIGNMENT = 64; +constexpr std::size_t DOUBLES_PER_CACHE_LINE = LOCAL_PSI_ALIGNMENT / sizeof(double); +std::size_t +RoundUpToCacheLineMultiple(const std::size_t value) +{ + return ((value + DOUBLES_PER_CACHE_LINE - 1) / DOUBLES_PER_CACHE_LINE) * DOUBLES_PER_CACHE_LINE; } -const FLUDSCommonData& -CBC_FLUDS::GetCommonData() const +} // namespace + +} // namespace detail + +void +CBC_FLUDS::AlignedDoubleDeleter::operator()(double* ptr) const noexcept { - return common_data_; + ::operator delete[](ptr, std::align_val_t(detail::LOCAL_PSI_ALIGNMENT)); } -double* -CBC_FLUDS::UpwindPsi(const Cell& face_neighbor, unsigned int adj_cell_node, size_t as_ss_idx) +CBC_FLUDS::AlignedDoubleBuffer +CBC_FLUDS::AllocateAlignedBuffer(const size_t num_values) { - const size_t index = cell_psi_start_[face_neighbor.local_id] + - adj_cell_node * num_groups_and_angles_ + as_ss_idx * num_groups_; - assert(index < local_psi_data_.size()); - return &local_psi_data_[index]; + auto* const ptr = static_cast( + ::operator new[](num_values * sizeof(double), std::align_val_t(detail::LOCAL_PSI_ALIGNMENT))); + std::fill_n(ptr, num_values, 0.0); + return AlignedDoubleBuffer(ptr); } -double* -CBC_FLUDS::OutgoingPsi(const Cell& cell, unsigned int cell_node, size_t as_ss_idx) +CBC_FLUDS::CBC_FLUDS(unsigned int num_groups, + size_t num_angles, + const CBC_FLUDSCommonData& common_data) + : FLUDS(num_groups, num_angles, common_data.GetSPDS()), + common_data_(common_data), + cell_face_offsets_(common_data.GetCellFaceOffsets()), + num_slots_(common_data.GetNumLocalFaceSlots()), + 
slot_size_(detail::RoundUpToCacheLineMultiple(common_data.GetMaxLocalFaceNodeCount() * + num_groups_and_angles_)), + local_face_slot_bases_(common_data.GetNumCellFaces(), nullptr), + local_psi_buffer_(AllocateAlignedBuffer(num_slots_ * slot_size_)), + incoming_nonlocal_face_dof_offsets_(common_data.GetNumCellFaces(), 0), + incoming_nonlocal_face_bases_(common_data.GetNumCellFaces(), nullptr), + incoming_nonlocal_psi_buffer_( + [&]() + { + size_t incoming_nonlocal_dof_count = 0; + for (size_t face_storage_index = 0; face_storage_index < common_data.GetNumCellFaces(); + ++face_storage_index) + { + const auto& face_info = + common_data.GetIncomingNonlocalFaceInfoByStorageIndex(face_storage_index); + if (face_info.num_face_nodes == 0) + continue; + incoming_nonlocal_face_dof_offsets_[face_storage_index] = incoming_nonlocal_dof_count; + incoming_nonlocal_dof_count += + detail::RoundUpToCacheLineMultiple(face_info.num_face_nodes * num_groups_and_angles_); + } + return AllocateAlignedBuffer(incoming_nonlocal_dof_count); + }()) { - const size_t index = - cell_psi_start_[cell.local_id] + cell_node * num_groups_and_angles_ + as_ss_idx * num_groups_; - assert(index < local_psi_data_.size()); - return &local_psi_data_[index]; + for (const auto& cell : common_data.GetSPDS().GetGrid()->local_cells) + { + const auto face_storage_offset = cell_face_offsets_[cell.local_id]; + for (std::size_t f = 0; f < cell.faces.size(); ++f) + { + const auto slot_id = + common_data.GetLocalFaceSlotID(cell.local_id, static_cast(f)); + if (slot_id == CBC_SPDS::INVALID_LOCAL_FACE_TASK_ID) + continue; + assert(slot_id < num_slots_); + local_face_slot_bases_[face_storage_offset + f] = + local_psi_buffer_.get() + static_cast(slot_id) * slot_size_; + } + } + + for (std::size_t face_storage_index = 0; face_storage_index < common_data.GetNumCellFaces(); + ++face_storage_index) + { + const auto& face_info = + common_data.GetIncomingNonlocalFaceInfoByStorageIndex(face_storage_index); + if 
(face_info.num_face_nodes == 0) + continue; + incoming_nonlocal_face_bases_[face_storage_index] = + incoming_nonlocal_psi_buffer_.get() + incoming_nonlocal_face_dof_offsets_[face_storage_index]; + } } -double* -CBC_FLUDS::NLUpwindPsi(uint64_t cell_global_id, - unsigned int face_id, - unsigned int face_node_mapped, - size_t as_ss_idx) +std::uint64_t +CBC_FLUDS::StoreIncomingFaceData(uint64_t cell_global_id, + unsigned int face_id, + const std::byte* psi_data_bytes, + size_t data_size) { - auto it = deplocs_outgoing_messages_.find({cell_global_id, face_id}); - if (it == deplocs_outgoing_messages_.end()) - return nullptr; - auto& psi = it->second; - const size_t dof_map = - face_node_mapped * num_groups_and_angles_ + // Offset to start of data for face_node_mapped - as_ss_idx * num_groups_; // Offset to start of data for angle_set_index - - assert(dof_map < psi.size()); - return &psi[dof_map]; + const auto face_storage_index = + common_data_.GetIncomingNonlocalFaceStorageIndexByKey(cell_global_id, face_id); + const auto& face_info = + common_data_.GetIncomingNonlocalFaceInfoByStorageIndex(face_storage_index); + + assert(data_size == static_cast(face_info.num_face_nodes) * num_groups_and_angles_); + + const size_t base = incoming_nonlocal_face_dof_offsets_[face_storage_index]; + std::memcpy( + incoming_nonlocal_psi_buffer_.get() + base, psi_data_bytes, data_size * sizeof(double)); + return face_info.cell_local_id; } -double* -CBC_FLUDS::NLOutgoingPsi(std::vector* psi_nonlocal_outgoing, - size_t face_node, - size_t as_ss_idx) +void +CBC_FLUDS::ClearLocalAndReceivePsi() { - assert(psi_nonlocal_outgoing != nullptr); - const size_t addr_offset = face_node * num_groups_and_angles_ + as_ss_idx * num_groups_; - return &(*psi_nonlocal_outgoing)[addr_offset]; } } // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h index 
ba7a6467bf..5d92af368f 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h @@ -5,75 +5,140 @@ #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds.h" -#include "framework/math/unknown_manager/unknown_manager.h" -#include "framework/math/spatial_discretization/spatial_discretization.h" +#include #include -#include -#include +#include +#include namespace opensn { -class UnknownManager; -class SpatialDiscretization; -class Cell; - /** - * Flux data structures (FLUDS) specific to the cell-by-cell (CBC) sweep algorithm - * - * This class manages the storage and access of angular flux data during a CBC sweep + * CBC FLUDS for managing local and non-local psi buffers during sweeps. * - * It provides methods to access: - * - Upwind angular flux data from local neighbor cells - * - Storage locations for downwind angular flux data for the current cell - * - Upwind angular flux data received from remote MPI ranks + * Owns the compact local-face slot bank and the receive-side non-local storage used + * by one host CBC angle set. */ class CBC_FLUDS : public FLUDS { public: - CBC_FLUDS(unsigned int num_groups, - size_t num_angles, - const CBC_FLUDSCommonData& common_data, - const UnknownManager& psi_uk_man, - const SpatialDiscretization& sdm); - - virtual const FLUDSCommonData& GetCommonData() const; + /** + * Construct the host CBC FLUDS. + * + * \param num_groups Number of groups in the angle set. + * \param num_angles Number of angles in the angle set. + * \param common_data Shared CBC FLUDS metadata. 
+ */ + CBC_FLUDS(unsigned int num_groups, size_t num_angles, const CBC_FLUDSCommonData& common_data); + + const FLUDSCommonData& GetCommonData() const noexcept { return common_data_; } + + /// Return the stride in doubles between consecutive angle slots. + size_t GetStrideSize() const noexcept { return num_groups_and_angles_; } + + /// Return the local psi buffer size in bytes. + size_t GetLocalPsiBufferSize() const noexcept { return num_slots_ * slot_size_ * sizeof(double); } + + /// Return the slot base pointer for a local cell face. + double* GetLocalFacePsiPointer(std::uint32_t cell_local_id, unsigned int face_id) const noexcept + { + auto* const slot_base = local_face_slot_bases_[cell_face_offsets_[cell_local_id] + face_id]; + assert(slot_base != nullptr); + return slot_base; + } + + /// Return the base pointer for an incoming non-local face. + double* GetIncomingNonlocalFacePsiPointer(std::uint32_t cell_local_id, + unsigned int face_id) const noexcept + { + auto* const face_base = + incoming_nonlocal_face_bases_[cell_face_offsets_[cell_local_id] + face_id]; + assert(face_base != nullptr); + return face_base; + } /** - * Given a local upwind neighbor cell, a node index on this cell, and an - * angleset subset index, this function returns a pointer to - * the start of the group data for the specified node and angle. + * Return a pointer to the upwind angular flux for a local incoming face. + * + * \param cell_local_id Local ID of the cell currently being swept. + * \param face_id Local incoming face ID on the current cell. + * \param face_node_mapped Mapped node index on the producer's outgoing face. + * \param as_ss_idx Angleset subset index within the angleset. + * \return Pointer to the start of the group data for the specified face node and angle. 
*/ - double* UpwindPsi(const Cell& face_neighbor, unsigned int adj_cell_node, size_t as_ss_idx); + double* UpwindPsi(std::uint32_t cell_local_id, + unsigned int face_id, + unsigned int face_node_mapped, + size_t as_ss_idx) const noexcept + { + return GetLocalFacePsiPointer(cell_local_id, face_id) + + static_cast(face_node_mapped) * num_groups_and_angles_ + as_ss_idx * num_groups_; + } /** - * Given a local cell, a node index on this cell, and an angleset subset index, - * this function returns a pointer to the start of the group data for the specified - * node and angle for writing its just solved angular fluxes. + * Return a pointer to the outgoing angular flux slot for a local outgoing face. + * + * \param cell_local_id Local ID of the cell currently being swept. + * \param face_id Outgoing face ID on the current cell. + * \param face_node Face-local node index on the outgoing face. + * \param as_ss_idx Angleset subset index within the angleset. + * \return Pointer to the start of the group data for the specified face node and angle */ - double* OutgoingPsi(const Cell& cell, unsigned int cell_node, size_t as_ss_idx); + double* OutgoingPsi(std::uint32_t cell_local_id, + unsigned int face_id, + unsigned int face_node, + size_t as_ss_idx) const noexcept + { + return GetLocalFacePsiPointer(cell_local_id, face_id) + + static_cast(face_node) * num_groups_and_angles_ + as_ss_idx * num_groups_; + } /** - * Given a remote upwind cell's global ID, a face ID on this cell, - * a node index on this face, and an angleset subset index, - * this function returns a pointer to the start of the group data for the specified - * face node and angle. + * Return a pointer to received nonlocal upwind angular flux for a face node. + * + * \param cell_local_id Local ID of the cell owning the face + * \param face_id Face index on the cell + * \param face_node_mapped Face index on the cell. 
+ * \param as_ss_idx Angleset subset index within the angleset + * \return Pointer to the start of the group data for the specified face node and angle */ - double* NLUpwindPsi(uint64_t cell_global_id, + double* NLUpwindPsi(std::uint32_t cell_local_id, unsigned int face_id, unsigned int face_node_mapped, - size_t as_ss_idx); + size_t as_ss_idx) noexcept + { + return GetIncomingNonlocalFacePsiPointer(cell_local_id, face_id) + + static_cast(face_node_mapped) * num_groups_and_angles_ + as_ss_idx * num_groups_; + } + + /** + * Return a pointer to the nonlocal outgoing angular flux for a face node. + * + * \param psi_nonlocal_outgoing Base pointer to the face's outgoing psi buffer + * \param face_node Face node index + * \param as_ss_idx Angleset subset index within the angleset + * \return Pointer to the start of the group data for the specified face node and angle + */ + double* NLOutgoingPsi(double* psi_nonlocal_outgoing, size_t face_node, size_t as_ss_idx) noexcept + { + assert(psi_nonlocal_outgoing != nullptr); + return psi_nonlocal_outgoing + face_node * num_groups_and_angles_ + as_ss_idx * num_groups_; + } /** - * Given a pointer to a vector holding the non-local outgoing psi data for a face, - * a node index on this face, and an angleset subset index, - * this function returns a pointer to the start of the group data for the specified - * face node and angle. + * Store received nonlocal face angular flux into the incoming buffer. 
+ * + * \param cell_global_id Global ID of the neighbor cell that produced the data + * \param face_id Face index on the neighbor cell + * \param psi_data_bytes Pointer to the received angular flux payload bytes + * \param data_size Number of doubles in the payload */ - double* - NLOutgoingPsi(std::vector* psi_nonlocal_outgoing, size_t face_node, size_t as_ss_idx); + std::uint64_t StoreIncomingFaceData(uint64_t cell_global_id, + unsigned int face_id, + const std::byte* psi_data_bytes, + size_t data_size); - void ClearLocalAndReceivePsi() override { deplocs_outgoing_messages_.clear(); } + void ClearLocalAndReceivePsi() override; void ClearSendPsi() override {} void AllocateInternalLocalPsi() override {} void AllocateOutgoingPsi() override {} @@ -83,24 +148,42 @@ class CBC_FLUDS : public FLUDS void AllocateDelayedPrelocIOutgoingPsi() override {} protected: + /// Custom deleter for 64-byte aligned double arrays. + struct AlignedDoubleDeleter + { + void operator()(double* ptr) const noexcept; + }; + + /// Owning pointer to a 64-byte aligned double array. + using AlignedDoubleBuffer = std::unique_ptr; + + /// Allocate a zero-initialized 64-byte aligned double buffer. + static AlignedDoubleBuffer AllocateAlignedBuffer(std::size_t num_values); + + /// Shared face-level indexing metadata. const CBC_FLUDSCommonData& common_data_; - const UnknownManager& psi_uk_man_; - const SpatialDiscretization& sdm_; - size_t num_angles_in_gs_quadrature_; - size_t num_quadrature_local_dofs_; - size_t num_local_spatial_dofs_; - size_t local_psi_data_size_; + /// Flat face-table offsets cached locally for hot-path indexing. + std::vector cell_face_offsets_; + /// Number of angular flux storage slots. + size_t num_slots_; + /// Size of each slot in doubles (cache-line aligned). + size_t slot_size_; + /// Per-face-storage base pointer into the local psi buffer. 
+ std::vector local_face_slot_bases_; /** - * Layout for storage for local angular fluxes: - * spatial DOF major -> angle in angleset major -> group in groupset major + * Contiguous local angular flux buffer with `num_slots_` slots. + * + * Layout per slot: node-major, angle-in-angleset-major, group-in-groupset major. */ - std::vector local_psi_data_; - - std::vector> boundryI_incoming_psi_; - - /// Pre-computed start index into local_psi_data_ for each local cell - std::vector cell_psi_start_; + AlignedDoubleBuffer local_psi_buffer_; + + /// Per-face-storage index DOF offset into the incoming non-local psi buffer. + std::vector incoming_nonlocal_face_dof_offsets_; + /// Per-face storage-index base pointer into the incoming non-local psi buffer. + std::vector incoming_nonlocal_face_bases_; + /// Flat buffer holding received non-local angular fluxes. + AlignedDoubleBuffer incoming_nonlocal_psi_buffer_; }; } // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.cc index 354b0fd3a0..db9173b91d 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.cc @@ -3,8 +3,10 @@ #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/spds.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h" #include "framework/mesh/cell/cell.h" #include "framework/mesh/mesh_continuum/mesh_continuum.h" +#include namespace opensn { @@ -13,28 +15,125 @@ CBC_FLUDSCommonData::CBC_FLUDSCommonData( const SPDS& spds, const std::vector& grid_nodal_mappings) : FLUDSCommonData(spds, grid_nodal_mappings), 
num_incoming_nonlocal_faces_(0), - num_outgoing_nonlocal_faces_(0) + num_incoming_nonlocal_face_nodes_(0), + num_outgoing_nonlocal_faces_(0), + num_local_faces_(0), + max_local_face_node_count_(0), + num_local_face_slots_(dynamic_cast(spds).GetMaxNumLocalPsiSlots()) { // Pre-compute non-local face counts for hash map capacity reservation const auto& grid = *spds.GetGrid(); + const auto& cbc_spds = dynamic_cast(spds); const auto& face_orientations = spds.GetCellFaceOrientations(); + outgoing_nonlocal_face_counts_.assign(spds.GetLocationSuccessors().size(), 0); + outgoing_nonlocal_face_node_counts_.assign(spds.GetLocationSuccessors().size(), 0); + cell_face_offsets_.resize(grid.local_cells.size() + 1, 0); + size_t total_num_faces = 0; + + for (const auto& cell : grid.local_cells) + { + cell_face_offsets_[cell.local_id] = static_cast(total_num_faces); + total_num_faces += cell.faces.size(); + } + cell_face_offsets_.back() = static_cast(total_num_faces); + local_face_slot_ids_.assign(total_num_faces, CBC_SPDS::INVALID_LOCAL_FACE_TASK_ID); + incoming_nonlocal_face_info_.resize(total_num_faces); + outgoing_nonlocal_face_info_.resize(total_num_faces); + for (const auto& cell : grid.local_cells) { + const size_t face_offset = cell_face_offsets_[cell.local_id]; for (size_t f = 0; f < cell.faces.size(); ++f) { const auto& face = cell.faces[f]; const auto orientation = face_orientations[cell.local_id][f]; + const size_t face_storage_index = face_offset + f; if ((not face.has_neighbor) or (face.IsNeighborLocal(&grid))) + { + if (face.has_neighbor) + { + max_local_face_node_count_ = std::max(max_local_face_node_count_, face.vertex_ids.size()); + if (orientation == FaceOrientation::OUTGOING) + { + const auto task_id = + cbc_spds.GetOutgoingLocalFaceTaskID(cell.local_id, static_cast(f)); + assert(task_id != CBC_SPDS::INVALID_LOCAL_FACE_TASK_ID); + local_face_slot_ids_[face_storage_index] = cbc_spds.GetLocalFaceSlotIDs()[task_id]; + ++num_local_faces_; + } + else if (orientation 
== FaceOrientation::INCOMING) + { + const auto task_id = + cbc_spds.GetIncomingLocalFaceTaskID(cell.local_id, static_cast(f)); + assert(task_id != CBC_SPDS::INVALID_LOCAL_FACE_TASK_ID); + local_face_slot_ids_[face_storage_index] = cbc_spds.GetLocalFaceSlotIDs()[task_id]; + } + } continue; + } if (orientation == FaceOrientation::INCOMING) + { ++num_incoming_nonlocal_faces_; + const auto num_face_nodes = static_cast( + grid_nodal_mappings[cell.local_id][f].face_node_mapping_.size()); + IncomingNonlocalFaceInfo info{static_cast(cell.local_id), + static_cast(num_incoming_nonlocal_face_nodes_), + num_face_nodes}; + incoming_nonlocal_face_info_[face_storage_index] = info; + incoming_nonlocal_face_info_by_key_.emplace( + CellFaceKey{cell.global_id, static_cast(f)}, face_storage_index); + num_incoming_nonlocal_face_nodes_ += num_face_nodes; + } else if (orientation == FaceOrientation::OUTGOING) + { ++num_outgoing_nonlocal_faces_; + const auto deplocI = + static_cast(spds.MapLocJToDeplocI(face.GetNeighborPartitionID(&grid))); + ++outgoing_nonlocal_face_counts_[deplocI]; + outgoing_nonlocal_face_node_counts_[deplocI] += + grid_nodal_mappings[cell.local_id][f].face_node_mapping_.size(); + outgoing_nonlocal_face_info_[face_storage_index] = OutgoingNonlocalFaceInfo{ + face.GetNeighborPartitionID(&grid), + face.neighbor_id, + static_cast(grid_nodal_mappings[cell.local_id][f].associated_face_), + static_cast( + grid_nodal_mappings[cell.local_id][f].face_node_mapping_.size())}; + } } } } +const CBC_FLUDSCommonData::IncomingNonlocalFaceInfo& +CBC_FLUDSCommonData::GetIncomingNonlocalFaceInfo(const std::uint32_t cell_local_id, + const unsigned int face_id) const noexcept +{ + return incoming_nonlocal_face_info_[cell_face_offsets_[cell_local_id] + face_id]; +} + +const CBC_FLUDSCommonData::IncomingNonlocalFaceInfo& +CBC_FLUDSCommonData::GetIncomingNonlocalFaceInfoByStorageIndex( + const std::size_t storage_index) const noexcept +{ + return incoming_nonlocal_face_info_[storage_index]; 
+} + +std::size_t +CBC_FLUDSCommonData::GetIncomingNonlocalFaceStorageIndexByKey( + const std::uint64_t cell_global_id, const unsigned int face_id) const noexcept +{ + const auto it = incoming_nonlocal_face_info_by_key_.find({cell_global_id, face_id}); + assert(it != incoming_nonlocal_face_info_by_key_.end()); + return it->second; +} + +const CBC_FLUDSCommonData::OutgoingNonlocalFaceInfo& +CBC_FLUDSCommonData::GetOutgoingNonlocalFaceInfo(const std::uint32_t cell_local_id, + const unsigned int face_id) const noexcept +{ + return outgoing_nonlocal_face_info_[cell_face_offsets_[cell_local_id] + face_id]; +} + } // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h index a1cd93f7ad..020cad200d 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds_common_data.h @@ -4,25 +4,166 @@ #pragma once #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds_common_data.h" -#include +#include #include +#include namespace opensn { +/** + * Shared CBC FLUDS metadata. + * + * Owns the flat face-level lookup tables used to index compact local-face slots and + * incoming/outgoing non-local face storage for one CBC sweep plane. + */ class CBC_FLUDSCommonData : public FLUDSCommonData { public: + /// Incoming-face key: `(cell_global_id, face_id)`. + using CellFaceKey = std::pair; + + /// Hash for CellFaceKey. + struct CellFaceKeyHash + { + size_t operator()(const CellFaceKey& key) const noexcept + { + size_t h = std::hash{}(key.first); + h ^= std::hash{}(key.second) + 0x9e3779b9 + (h << 6) + (h >> 2); + return h; + } + }; + + /// Metadata for one incoming non-local face. 
+ struct IncomingNonlocalFaceInfo + { + /// Local ID of the cell owning this face. + std::uint32_t cell_local_id = 0; + /// Offset into the incoming non-local psi buffer for this face's node data. + std::uint32_t face_node_offset = 0; + /// Number of face nodes. + std::uint32_t num_face_nodes = 0; + }; + + /// Metadata for one outgoing non-local face. + struct OutgoingNonlocalFaceInfo + { + /// Destination MPI rank locality index. + int locality = 0; + /// Global ID of the destination cell. + std::uint64_t cell_global_id = 0; + /// Face index on the destination cell. + unsigned int associated_face = 0; + /// Number of face nodes. + std::uint32_t num_face_nodes = 0; + }; + + /** + * Construct common data from the SPDS and grid nodal mappings. + * + * \param spds Sweep-plane data structure providing face orientations. + * \param grid_nodal_mappings Per-cell-face nodal mapping data. + */ CBC_FLUDSCommonData(const SPDS& spds, const std::vector& grid_nodal_mappings); + /// Return the number of incoming non-local faces. size_t GetNumIncomingNonlocalFaces() const { return num_incoming_nonlocal_faces_; } + /// Return the number of incoming non-local face nodes. + size_t GetNumIncomingNonlocalFaceNodes() const { return num_incoming_nonlocal_face_nodes_; } + + /// Return the number of outgoing non-local faces. size_t GetNumOutgoingNonlocalFaces() const { return num_outgoing_nonlocal_faces_; } + /// Return the number of local directed faces. + size_t GetNumLocalFaces() const { return num_local_faces_; } + + /// Return the maximum local-face node count. + size_t GetMaxLocalFaceNodeCount() const { return max_local_face_node_count_; } + + /// Return the number of reusable local-face slots. + size_t GetNumLocalFaceSlots() const { return num_local_face_slots_; } + + /// Get number of outgoing non-local faces for dependent locality `deplocI`. 
+ size_t GetDeplocIFaceCount(std::size_t deplocI) const noexcept + { + return outgoing_nonlocal_face_counts_[deplocI]; + } + + /// Get number of outgoing non-local face nodes for dependent locality `deplocI`. + size_t GetDeplocIFaceNodeCount(std::size_t deplocI) const noexcept + { + return outgoing_nonlocal_face_node_counts_[deplocI]; + } + + /// Look up incoming nonlocal face info by cell local ID and face index. + const IncomingNonlocalFaceInfo& GetIncomingNonlocalFaceInfo(std::uint32_t cell_local_id, + unsigned int face_id) const noexcept; + + /// Look up incoming nonlocal face info by flat storage index. + const IncomingNonlocalFaceInfo& + GetIncomingNonlocalFaceInfoByStorageIndex(std::size_t storage_index) const noexcept; + + /// Resolve a (cell_global_id, face_id) pair to a flat storage index. + std::size_t GetIncomingNonlocalFaceStorageIndexByKey(std::uint64_t cell_global_id, + unsigned int face_id) const noexcept; + + /// Total number of cell-face entries in the flat face table. + std::size_t GetNumCellFaces() const noexcept { return cell_face_offsets_.back(); } + + /// Look up outgoing nonlocal face info by cell local ID and face index. + const OutgoingNonlocalFaceInfo& GetOutgoingNonlocalFaceInfo(std::uint32_t cell_local_id, + unsigned int face_id) const noexcept; + + /// Look up the static local-face slot id by cell local ID and face index. + std::uint32_t GetLocalFaceSlotID(std::uint32_t cell_local_id, unsigned int face_id) const noexcept + { + return local_face_slot_ids_[cell_face_offsets_[cell_local_id] + face_id]; + } + + /// Flat face-table offset for a given cell. + size_t GetCellFaceOffset(std::uint32_t cell_local_id) const noexcept + { + return cell_face_offsets_[cell_local_id]; + } + + /// Flat face-table index for a given cell face. + size_t GetFaceStorageIndex(std::uint32_t cell_local_id, unsigned int face_id) const noexcept + { + return cell_face_offsets_[cell_local_id] + face_id; + } + + /// Return the flat cell-face offsets table. 
+ const std::vector& GetCellFaceOffsets() const noexcept { return cell_face_offsets_; } + private: + /// Number of incoming non-local faces. size_t num_incoming_nonlocal_faces_; + /// Number of incoming non-local face nodes. + size_t num_incoming_nonlocal_face_nodes_; + /// Number of outgoing non-local faces. size_t num_outgoing_nonlocal_faces_; + /// Number of local directed faces. + size_t num_local_faces_; + /// Maximum number of nodes on any local directed face. + size_t max_local_face_node_count_; + /// Number of reusable local-face storage slots. + size_t num_local_face_slots_; + /// Prefix-sum offsets into the flat face tables, indexed by cell local ID. + std::vector cell_face_offsets_; + /// Flat local-face slot IDs, indexed by face storage index. + std::vector local_face_slot_ids_; + /// Flat incoming non-local face metadata, indexed by face storage index. + std::vector incoming_nonlocal_face_info_; + /// Flat outgoing non-local face metadata, indexed by face storage index. + std::vector outgoing_nonlocal_face_info_; + /// Per-dependent locality outgoing face counts. + std::vector outgoing_nonlocal_face_counts_; + /// Per-dependent locality outgoing face node counts. + std::vector outgoing_nonlocal_face_node_counts_; + /// Map from (cell_global_id, face_id) to flat storage index for incoming non-local faces. 
+ std::unordered_map incoming_nonlocal_face_info_by_key_; }; } // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_avx_sweep_chunk.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_avx_sweep_chunk.cc index 0b541d316c..77104b7e53 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_avx_sweep_chunk.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_avx_sweep_chunk.cc @@ -20,6 +20,19 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set) static_assert(NumNodes >= 2 and NumNodes <= 8); + struct IncomingFaceData + { + const FaceNodalMapping* face_nodal_mapping = nullptr; + double* psi_base = nullptr; + }; + + struct OutgoingFaceData + { + bool is_reflecting_boundary_face = false; + double* psi_base = nullptr; + const CBC_FLUDSCommonData::OutgoingNonlocalFaceInfo* outgoing_nonlocal_face_info = nullptr; + }; + const auto& groupset = data.groupset; const auto& m2d_op = groupset.quadrature->GetMomentToDiscreteOperator(); const auto& d2m_op = groupset.quadrature->GetDiscreteToMomentOperator(); @@ -28,7 +41,8 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set) "CBC_Sweep_FixedN invoked for an incompatible cell topology."); const auto& face_orientations = angle_set.GetSPDS().GetCellFaceOrientations()[data.cell_local_id]; - const auto& sigma_t = data.xs.at(data.cell.block_id)->GetSigmaTotal(); + const auto& cell_xs = data.cell_transport_view.GetXS(); + const auto& sigma_t = cell_xs.GetSigmaTotal(); constexpr size_t matrix_size = static_cast(NumNodes) * static_cast(NumNodes); auto idx = [](size_t i, size_t j) -> size_t { return i * NumNodes + j; }; @@ -59,7 +73,7 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set) std::vector tau_gsg; if constexpr (time_dependent) { - const auto& inv_velg = data.xs.at(data.cell.block_id)->GetInverseVelocity(); + const auto& inv_velg = 
cell_xs.GetInverseVelocity(); const double theta = data.problem.GetTheta(); const double inv_theta = 1.0 / theta; const double dt = data.problem.GetTimeStep(); @@ -76,6 +90,57 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set) : nullptr; const auto& as_angle_indices = angle_set.GetAngleIndices(); + const auto& cbc_common = dynamic_cast(data.fluds.GetCommonData()); + auto* const async_comm = dynamic_cast(angle_set.GetCommunicator()); + std::vector incoming_face_data(data.cell_num_faces); + std::vector outgoing_face_data(data.cell_num_faces); + for (size_t f = 0; f < data.cell_num_faces; ++f) + { + const auto& face = data.cell.faces[f]; + const bool is_local_face = data.cell_transport_view.IsFaceLocal(f); + const bool is_boundary_face = not face.has_neighbor; + const auto* face_nodal_mapping = + &data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f); + + if (face_orientations[f] == FaceOrientation::INCOMING) + { + auto& face_data = incoming_face_data[f]; + face_data.face_nodal_mapping = face_nodal_mapping; + if (is_local_face) + face_data.psi_base = + data.fluds.GetLocalFacePsiPointer(data.cell_local_id, static_cast(f)); + else if (not is_boundary_face) + face_data.psi_base = data.fluds.GetIncomingNonlocalFacePsiPointer( + data.cell_local_id, static_cast(f)); + } + + if (face_orientations[f] == FaceOrientation::OUTGOING) + { + auto& face_data = outgoing_face_data[f]; + face_data.is_reflecting_boundary_face = + is_boundary_face and angle_set.GetBoundaries()[face.neighbor_id]->IsReflecting(); + if (is_local_face) + face_data.psi_base = + data.fluds.GetLocalFacePsiPointer(data.cell_local_id, static_cast(f)); + if (not is_local_face and not is_boundary_face) + face_data.outgoing_nonlocal_face_info = + &cbc_common.GetOutgoingNonlocalFaceInfo(data.cell_local_id, static_cast(f)); + } + } + + double* psi_new_base = nullptr; + double theta = 1.0; + double inv_theta = 1.0; + if (data.save_angular_flux) + { + psi_new_base = 
&data.destination_psi[data.discretization.MapDOFLocal( + data.cell, 0, groupset.psi_uk_man_, 0, 0)]; + if constexpr (time_dependent) + { + theta = data.problem.GetTheta(); + inv_theta = 1.0 / theta; + } + } for (size_t as_ss_idx = 0; as_ss_idx < data.num_angles_in_as; ++as_ss_idx) { @@ -102,10 +167,8 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set) continue; const auto& face = data.cell.faces[f]; - const bool is_local_face = data.cell_transport_view.IsFaceLocal(f); - const bool is_boundary_face = not face.has_neighbor; - const auto* face_nodal_mapping = - &data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f); + const auto& face_data = incoming_face_data[f]; + const auto* face_nodal_mapping = face_data.face_nodal_mapping; const auto& Ms_f = data.M_surf[f]; const size_t num_face_nodes = data.cell_mapping.GetNumFaceNodes(f); @@ -116,13 +179,11 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set) const int j = data.cell_mapping.MapFaceNode(f, fj); const double* psi = nullptr; - if (is_local_face) - psi = data.fluds.UpwindPsi(*data.cell_transport_view.FaceNeighbor(f), - face_nodal_mapping->cell_node_mapping_[fj], - as_ss_idx); - else if (not is_boundary_face) - psi = data.fluds.NLUpwindPsi( - data.cell.global_id, f, face_nodal_mapping->face_node_mapping_[fj], as_ss_idx); + if (face_data.psi_base != nullptr) + psi = face_data.psi_base + + static_cast(face_nodal_mapping->face_node_mapping_[fj]) * + data.group_angle_stride + + as_ss_idx * data.group_stride; else psi = angle_set.PsiBoundary(face.neighbor_id, direction_num, @@ -271,27 +332,13 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set) const double w = d2m_row[m]; PRAGMA_UNROLL for (size_t i = 0; i < NumNodes; ++i) - { - const size_t dof = data.cell_transport_view.MapDOF(i, m, data.gs_gi); - data.destination_phi[dof + gsg] += w * bg[i]; - } + data.destination_phi[moment_dof_map[m][i] + gsg] += w * bg[i]; } } } if (data.save_angular_flux) { - double* psi_new = 
&data.destination_psi[data.discretization.MapDOFLocal( - data.cell, 0, groupset.psi_uk_man_, 0, 0)]; - - double theta = 1.0; - double inv_theta = 1.0; - if constexpr (time_dependent) - { - theta = data.problem.GetTheta(); - inv_theta = 1.0 / theta; - } - PRAGMA_UNROLL for (size_t i = 0; i < NumNodes; ++i) { @@ -304,10 +351,10 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set) if constexpr (time_dependent) { const double psi_old_val = psi_old ? psi_old[imap + gsg] : 0.0; - psi_new[imap + gsg] = inv_theta * (psi_sol + (theta - 1.0) * psi_old_val); + psi_new_base[imap + gsg] = inv_theta * (psi_sol + (theta - 1.0) * psi_old_val); } else - psi_new[imap + gsg] = psi_sol; + psi_new_base[imap + gsg] = psi_sol; } } } @@ -318,28 +365,26 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set) continue; const auto& face = data.cell.faces[f]; - const bool is_local_face = data.cell_transport_view.IsFaceLocal(f); - const bool is_boundary_face = not face.has_neighbor; - const bool is_reflecting_boundary_face = - (is_boundary_face and angle_set.GetBoundaries()[face.neighbor_id]->IsReflecting()); + const auto& face_data = outgoing_face_data[f]; + const bool is_reflecting_boundary_face = face_data.is_reflecting_boundary_face; const auto& IntF_shapeI = data.IntS_shapeI[f]; - const int locality = data.cell_transport_view.FaceLocality(f); const size_t num_face_nodes = data.cell_mapping.GetNumFaceNodes(f); - const auto& face_nodal_mapping = - data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f); - std::vector* psi_nonlocal_outgoing = nullptr; + double* psi_nonlocal_outgoing = nullptr; - if (not is_boundary_face and not is_local_face) + if (face_data.outgoing_nonlocal_face_info != nullptr) { - auto* async_comm = dynamic_cast(angle_set.GetCommunicator()); - const size_t data_size_for_msg = num_face_nodes * data.group_angle_stride; + const auto& outgoing_nonlocal_face_info = *face_data.outgoing_nonlocal_face_info; + const size_t data_size_for_msg = + 
static_cast(outgoing_nonlocal_face_info.num_face_nodes) * data.group_angle_stride; psi_nonlocal_outgoing = - &async_comm->InitGetDownwindMessageData(locality, - face.neighbor_id, - face_nodal_mapping.associated_face_, - angle_set.GetID(), - data_size_for_msg); + async_comm + ->InitGetDownwindMessageData(outgoing_nonlocal_face_info.locality, + outgoing_nonlocal_face_info.cell_global_id, + outgoing_nonlocal_face_info.associated_face, + angle_set.GetID(), + data_size_for_msg) + .data(); } const double mu_wt_f = wt * face_mu_values[f]; @@ -348,7 +393,7 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set) { const int i = data.cell_mapping.MapFaceNode(f, fi); - if (is_boundary_face) + if (face_data.outgoing_nonlocal_face_info == nullptr and face_data.psi_base == nullptr) { const double flux_i = mu_wt_f * IntF_shapeI(i); for (size_t gsg = 0; gsg < data.gs_size; ++gsg) @@ -357,9 +402,9 @@ CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set) } double* psi = nullptr; - if (is_local_face) - psi = data.fluds.OutgoingPsi(data.cell, i, as_ss_idx); - else if (not is_boundary_face) + if (face_data.psi_base != nullptr) + psi = face_data.psi_base + fi * data.group_angle_stride + as_ss_idx * data.group_stride; + else if (face_data.outgoing_nonlocal_face_info != nullptr) psi = data.fluds.NLOutgoingPsi(psi_nonlocal_outgoing, fi, as_ss_idx); else if (is_reflecting_boundary_face) psi = angle_set.PsiReflected(face.neighbor_id, direction_num, data.cell_local_id, f, fi); @@ -380,22 +425,7 @@ CBCSweepChunk::Sweep_FixedN(AngleSet& angle_set) { CALI_CXX_MARK_SCOPE("CBCSweepChunk::Sweep_FixedN"); - auto data = MakeCBCSweepData(discretization_, - source_moments_, - groupset_, - xs_, - num_moments_, - max_num_cell_dofs_, - SaveAngularFluxEnabled(), - groupset_angle_group_stride_, - groupset_group_stride_, - destination_phi_, - destination_psi_, - include_rhs_time_term_, - problem_, - nullptr, - group_block_size_, - ctx_); + auto data = MakeSweepData(nullptr); CBC_Sweep_FixedN(data, 
angle_set); } diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc index b40c3d7ed0..8a3f4ce304 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.cc @@ -56,6 +56,7 @@ CBCSweepChunk::CBCSweepChunk(DiscreteOrdinatesProblem& problem, LBSGroupset& gro } group_block_size_ = ComputeGroupBlockSize(groupset_.GetNumGroups()); + generic_scratch_.EnsureCapacity(max_num_cell_dofs_, groupset_.GetNumGroups(), 0); } void @@ -63,14 +64,50 @@ CBCSweepChunk::SetAngleSet(AngleSet& angle_set) { CALI_CXX_MARK_SCOPE("CBCSweepChunk::SetAngleSet"); - CBCBindAngleSetContext(ctx_, groupset_, IsSurfaceSourceActive(), angle_set); + ctx_.BindAngleSet(groupset_, IsSurfaceSourceActive(), angle_set); } void CBCSweepChunk::SetCell(const Cell* cell_ptr, AngleSet& angle_set) { static_cast(angle_set); - CBCBindCellContext(ctx_, discretization_, unit_cell_matrices_, cell_transport_views_, cell_ptr); + ctx_.BindCell(discretization_, unit_cell_matrices_, cell_transport_views_, cell_ptr); +} + +CBCSweepData +CBCSweepChunk::MakeSweepData(const std::vector* psi_old) +{ + return CBCSweepData{discretization_, + source_moments_, + groupset_, + num_moments_, + max_num_cell_dofs_, + SaveAngularFluxEnabled(), + groupset_angle_group_stride_, + groupset_group_stride_, + destination_phi_, + destination_psi_, + ctx_.surface_source_active, + include_rhs_time_term_, + problem_, + psi_old, + group_block_size_, + *ctx_.fluds, + *ctx_.cell, + ctx_.cell_local_id, + *ctx_.cell_mapping, + *ctx_.cell_transport_view, + ctx_.cell_num_faces, + ctx_.cell_num_nodes, + ctx_.gs_size, + ctx_.gs_gi, + ctx_.num_angles_in_as, + ctx_.group_stride, + ctx_.group_angle_stride, + *ctx_.G, + *ctx_.M, + *ctx_.M_surf, + *ctx_.IntS_shapeI}; } void @@ 
-84,24 +121,9 @@ CBCSweepChunk::Sweep_Generic(AngleSet& angle_set) { CALI_CXX_MARK_SCOPE("CBCSweepChunk::Sweep_Generic"); - auto data = MakeCBCSweepData(discretization_, - source_moments_, - groupset_, - xs_, - num_moments_, - max_num_cell_dofs_, - SaveAngularFluxEnabled(), - groupset_angle_group_stride_, - groupset_group_stride_, - destination_phi_, - destination_psi_, - include_rhs_time_term_, - problem_, - nullptr, - group_block_size_, - ctx_); + auto data = MakeSweepData(nullptr); - CBC_Sweep_Generic(data, angle_set); + CBC_Sweep_Generic(data, generic_scratch_, angle_set); } } // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h index 5d8acaa305..2f7a15a541 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h @@ -14,54 +14,68 @@ class CellMapping; class DiscreteOrdinatesProblem; /** - * Implements the core sweep operation for a single cell within the - * cell-by-cell (CBC) sweep algorithm. + * Host CBC sweep chunk. * - * This class is responsible for performing the discrete ordinates transport - * calculation on a given cell for all angles and groups managed by its - * current AngleSet - * It interacts with a CBC_FLUDS object to obtain upwind angular flux data - * (from local neighbors, MPI remote buffers, or boundaries) and to store - * outgoing angular flux data (to local neighbors or MPI send buffers) + * Dispatches between the generic and fixed-node CBC sweep kernels for the + * currently bound angle set and cell. */ class CBCSweepChunk : public SweepChunk { public: + /** + * Construct one CBC sweep chunk for a groupset. + * + * \param problem Owning discrete-ordinates problem. + * \param groupset Groupset swept by this chunk. 
+ */ CBCSweepChunk(DiscreteOrdinatesProblem& problem, LBSGroupset& groupset); - /// Set the current AngleSet + /** + * Bind the current angle set. + * + * \param angle_set Angle set to bind. + */ void SetAngleSet(AngleSet& angle_set) override; - /// Set the current cell to be swept + /** + * Bind the current cell to be swept. + * + * \param cell_ptr Cell to bind. + * \param angle_set Owning angle set. + */ void SetCell(Cell const* cell_ptr, AngleSet& angle_set) override; /** - * Performs the discrete ordinates sweep calculation for the currently - * set cell, for all angles and groups within the provided AngleSet. + * Sweep the currently bound cell for the provided angle set. + * + * Selects the fixed-node kernel when all local cells have the same node count + * in the supported range, otherwise falls back to the generic CBC kernel. * - * It: - * - Assembles the local transport equation system for each angle and group - * - Retrieves upwind angular fluxes from local neighbors, remote locations - * (via MPI data managed by CBC_FLUDS), or boundaries - * - Solves the local system for the outgoing angular fluxes at the cell nodes - * - Updates the global scalar flux moments - * - If save_angular_flux is true, stores the computed angular fluxes into - * the global angular flux vector - * - Propagates outgoing angular fluxes to local downwind neighbors or stages - * them for MPI transmission to remote downwind neighbors + * \param angle_set Angle set currently being advanced. */ void Sweep(AngleSet& angle_set) override; protected: + /// Owning discrete-ordinates problem. DiscreteOrdinatesProblem& problem_; + /// Cached per-cell and per-angleset context. CBCSweepChunkContext ctx_; + /// Group block size for SIMD batch solves. unsigned int group_block_size_ = 0; + /// Reusable scratch buffers for generic sweep chunk kernel. + CBCGenericSweepScratch generic_scratch_; private: + /// Pointer-to-member for the selected sweep implementation (generic or fixed-node). 
using SweepFunc = void (CBCSweepChunk::*)(AngleSet&); + /// Selected sweep function pointer (generic or fixed-node). SweepFunc sweep_impl_ = nullptr; + /// Construct the aggregated sweep data for the current cell. + CBCSweepData MakeSweepData(const std::vector* psi_old); + /// Sweep using the generic kernel. void Sweep_Generic(AngleSet& angle_set); + /// Sweep using the fixed-node kernel. template void Sweep_FixedN(AngleSet& angle_set); }; diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h index 13a8ae1f1b..037f4f8379 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_shared.h @@ -11,116 +11,83 @@ namespace opensn { +/** + * Cached per-cell and per-angleset context for CBC sweep chunks. + * + * Populated in two phases: + * 1. BindAngleSet caches angleset-level data (FLUDS, group range, strides) + * 2. BindCell caches cell-level data (geometry, transport views, unit cell matrices) + */ struct CBCSweepChunkContext { + /// FLUDS for current angleset. CBC_FLUDS* fluds = nullptr; - + /// Number of groups in the current groupset. size_t gs_size = 0; + /// First group index in the current groupset. unsigned int gs_gi = 0; + /// Number of angles in the current angleset. size_t num_angles_in_as = 0; + /// Per-angle group stride ( = num_groups). unsigned int group_stride = 0; + /// Per-node angular stride ( = num_angles * num_groups). size_t group_angle_stride = 0; + /// Whether the surface source BCs are active. bool surface_source_active = false; + /// Current cell pointer const Cell* cell = nullptr; + /// Local ID of the current cell. std::uint32_t cell_local_id = 0; + /// Cell mapping for the current cell. 
const CellMapping* cell_mapping = nullptr; + /// Transport view for the current cell. CellLBSView* cell_transport_view = nullptr; + /// Number of faces on the current cell. size_t cell_num_faces = 0; + /// Number of nodes in the current cell. size_t cell_num_nodes = 0; + /// Volume integral matrix. const DenseMatrix* G = nullptr; + /// Mass matrix. const DenseMatrix* M = nullptr; + /// Per-face surface mass matrices. const std::vector>* M_surf = nullptr; + /// Per-face surface integrals of shape functions. const std::vector>* IntS_shapeI = nullptr; -}; -inline void -CBCBindAngleSetContext(CBCSweepChunkContext& ctx, - const LBSGroupset& groupset, - bool surface_source_active, - AngleSet& angle_set) -{ - ctx.fluds = &dynamic_cast(angle_set.GetFLUDS()); - ctx.gs_size = groupset.GetNumGroups(); - ctx.gs_gi = groupset.first_group; - ctx.surface_source_active = surface_source_active; - ctx.num_angles_in_as = angle_set.GetNumAngles(); - ctx.group_stride = angle_set.GetNumGroups(); - ctx.group_angle_stride = ctx.group_stride * ctx.num_angles_in_as; -} + /// Cache angleset-level data (FLUDS, group range, strides). 
+ void BindAngleSet(const LBSGroupset& groupset, const bool has_surface_source, AngleSet& angle_set) + { + fluds = &dynamic_cast(angle_set.GetFLUDS()); + gs_size = groupset.GetNumGroups(); + gs_gi = groupset.first_group; + surface_source_active = has_surface_source; + num_angles_in_as = angle_set.GetNumAngles(); + group_stride = angle_set.GetNumGroups(); + group_angle_stride = group_stride * num_angles_in_as; + } -inline void -CBCBindCellContext(CBCSweepChunkContext& ctx, - const SpatialDiscretization& discretization, - const std::vector& unit_cell_matrices, - std::vector& cell_transport_views, - const Cell* cell_ptr) -{ - ctx.cell = cell_ptr; - ctx.cell_local_id = cell_ptr->local_id; - ctx.cell_mapping = &discretization.GetCellMapping(*ctx.cell); - ctx.cell_transport_view = &cell_transport_views[ctx.cell->local_id]; - ctx.cell_num_faces = ctx.cell->faces.size(); - ctx.cell_num_nodes = ctx.cell_mapping->GetNumNodes(); + /// Cache cell-level data (geometry, transport views, unit cell matrices). 
+ void BindCell(const SpatialDiscretization& discretization, + const std::vector& unit_cell_matrices, + std::vector& cell_transport_views, + const Cell* cell_ptr) + { + cell = cell_ptr; + cell_local_id = cell_ptr->local_id; + cell_mapping = &discretization.GetCellMapping(*cell); + cell_transport_view = &cell_transport_views[cell->local_id]; + cell_num_faces = cell->faces.size(); + cell_num_nodes = cell_mapping->GetNumNodes(); - const auto& unit_mats = unit_cell_matrices[ctx.cell_local_id]; - ctx.G = &unit_mats.intV_shapeI_gradshapeJ; - ctx.M = &unit_mats.intV_shapeI_shapeJ; - ctx.M_surf = &unit_mats.intS_shapeI_shapeJ; - ctx.IntS_shapeI = &unit_mats.intS_shapeI; -} - -inline CBCSweepData -MakeCBCSweepData(const SpatialDiscretization& discretization, - const std::vector& source_moments, - const LBSGroupset& groupset, - const BlockID2XSMap& xs, - unsigned int num_moments, - unsigned int max_num_cell_dofs, - bool save_angular_flux, - size_t groupset_angle_group_stride, - size_t groupset_group_stride, - std::vector& destination_phi, - std::vector& destination_psi, - bool include_rhs_time_term, - DiscreteOrdinatesProblem& problem, - const std::vector* psi_old, - unsigned int group_block_size, - const CBCSweepChunkContext& ctx) -{ - return CBCSweepData{discretization, - source_moments, - groupset, - xs, - num_moments, - max_num_cell_dofs, - save_angular_flux, - groupset_angle_group_stride, - groupset_group_stride, - destination_phi, - destination_psi, - ctx.surface_source_active, - include_rhs_time_term, - problem, - psi_old, - group_block_size, - *ctx.fluds, - *ctx.cell, - ctx.cell_local_id, - *ctx.cell_mapping, - *ctx.cell_transport_view, - ctx.cell_num_faces, - ctx.cell_num_nodes, - ctx.gs_size, - ctx.gs_gi, - ctx.num_angles_in_as, - ctx.group_stride, - ctx.group_angle_stride, - *ctx.G, - *ctx.M, - *ctx.M_surf, - *ctx.IntS_shapeI}; -} + const auto& unit_mats = unit_cell_matrices[cell_local_id]; + G = &unit_mats.intV_shapeI_gradshapeJ; + M = 
&unit_mats.intV_shapeI_shapeJ; + M_surf = &unit_mats.intS_shapeI_shapeJ; + IntS_shapeI = &unit_mats.intS_shapeI; + } +}; } // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.cc index 0b261ceb48..78a8086017 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.cc @@ -63,6 +63,7 @@ CBCSweepChunkTD::CBCSweepChunkTD(DiscreteOrdinatesProblem& problem, LBSGroupset& } group_block_size_ = ComputeGroupBlockSize(groupset_.GetNumGroups()); + generic_scratch_.EnsureCapacity(max_num_cell_dofs_, groupset_.GetNumGroups(), 0); } void @@ -70,14 +71,50 @@ CBCSweepChunkTD::SetAngleSet(AngleSet& angle_set) { CALI_CXX_MARK_SCOPE("CBCSweepChunkTD::SetAngleSet"); - CBCBindAngleSetContext(ctx_, groupset_, IsSurfaceSourceActive(), angle_set); + ctx_.BindAngleSet(groupset_, IsSurfaceSourceActive(), angle_set); } void CBCSweepChunkTD::SetCell(const Cell* cell_ptr, AngleSet& angle_set) { static_cast(angle_set); - CBCBindCellContext(ctx_, discretization_, unit_cell_matrices_, cell_transport_views_, cell_ptr); + ctx_.BindCell(discretization_, unit_cell_matrices_, cell_transport_views_, cell_ptr); +} + +CBCSweepData +CBCSweepChunkTD::MakeSweepData(const std::vector* psi_old) +{ + return CBCSweepData{discretization_, + source_moments_, + groupset_, + num_moments_, + max_num_cell_dofs_, + SaveAngularFluxEnabled(), + groupset_angle_group_stride_, + groupset_group_stride_, + destination_phi_, + destination_psi_, + ctx_.surface_source_active, + include_rhs_time_term_, + problem_, + psi_old, + group_block_size_, + *ctx_.fluds, + *ctx_.cell, + ctx_.cell_local_id, + *ctx_.cell_mapping, + *ctx_.cell_transport_view, + ctx_.cell_num_faces, + ctx_.cell_num_nodes, + ctx_.gs_size, + 
ctx_.gs_gi, + ctx_.num_angles_in_as, + ctx_.group_stride, + ctx_.group_angle_stride, + *ctx_.G, + *ctx_.M, + *ctx_.M_surf, + *ctx_.IntS_shapeI}; } void @@ -91,24 +128,9 @@ CBCSweepChunkTD::Sweep_Generic(AngleSet& angle_set) { CALI_CXX_MARK_SCOPE("CBCSweepChunkTD::Sweep_Generic"); - auto data = MakeCBCSweepData(discretization_, - source_moments_, - groupset_, - xs_, - num_moments_, - max_num_cell_dofs_, - SaveAngularFluxEnabled(), - groupset_angle_group_stride_, - groupset_group_stride_, - destination_phi_, - destination_psi_, - include_rhs_time_term_, - problem_, - &psi_old_, - group_block_size_, - ctx_); - - CBC_Sweep_Generic(data, angle_set); + auto data = MakeSweepData(&psi_old_); + + CBC_Sweep_Generic(data, generic_scratch_, angle_set); } template @@ -117,22 +139,7 @@ CBCSweepChunkTD::Sweep_FixedN(AngleSet& angle_set) { CALI_CXX_MARK_SCOPE("CBCSweepChunkTD::Sweep_FixedN"); - auto data = MakeCBCSweepData(discretization_, - source_moments_, - groupset_, - xs_, - num_moments_, - max_num_cell_dofs_, - SaveAngularFluxEnabled(), - groupset_angle_group_stride_, - groupset_group_stride_, - destination_phi_, - destination_psi_, - include_rhs_time_term_, - problem_, - &psi_old_, - group_block_size_, - ctx_); + auto data = MakeSweepData(&psi_old_); CBC_Sweep_FixedN(data, angle_set); } @@ -145,4 +152,4 @@ template void CBCSweepChunkTD::Sweep_FixedN<6>(AngleSet&); template void CBCSweepChunkTD::Sweep_FixedN<7>(AngleSet&); template void CBCSweepChunkTD::Sweep_FixedN<8>(AngleSet&); -} // namespace opensn +} // namespace opensn \ No newline at end of file diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h index 5e99bb83ef..e7d870ad11 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h +++ 
b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h @@ -10,30 +10,53 @@ namespace opensn { +/** + * Time-dependent host-side CBC sweep chunk. + * + * Identical to CBCSweepChunk but instantiates the Generic and FixedN kernels + * with \c time_dependent=true, adding the \f$v_g^{-1}/(\theta\Delta t)\f$ + * time-absorption term and the previous-time-step angular flux source. + */ class CBCSweepChunkTD : public SweepChunk { public: CBCSweepChunkTD(DiscreteOrdinatesProblem& problem, LBSGroupset& groupset); ~CBCSweepChunkTD() override = default; + /// Cache angle-set-level data and select the FixedN or Generic kernel. void SetAngleSet(AngleSet& angle_set) override; + /// Cache cell-level data for the next Sweep call. void SetCell(const Cell* cell_ptr, AngleSet& angle_set) override; + /// Sweep the current cell for all angles and groups (time-dependent). void Sweep(AngleSet& angle_set) override; + /// Indicate this chunk uses the time-dependent kernel variant. bool IsTimeDependent() const override { return true; } protected: + /// Pointer-to-member for the selected sweep implementation. using SweepFunc = void (CBCSweepChunkTD::*)(AngleSet&); + /// Construct the aggregated sweep data struct for the current cell. + CBCSweepData MakeSweepData(const std::vector* psi_old); + /// Sweep using the generic (dynamic-size) kernel. void Sweep_Generic(AngleSet& angle_set); + /// Sweep using the FixedN (compile-time node count) kernel. template void Sweep_FixedN(AngleSet& angle_set); + /// Owning discrete ordinates problem. DiscreteOrdinatesProblem& problem_; + /// Previous-time-step angular flux vector. const std::vector& psi_old_; + /// Energy group block size for SIMD batch solve. unsigned int group_block_size_ = 0; + /// Cached per-cell and per-angle-set context. CBCSweepChunkContext ctx_; + /// Reusable scratch buffers for the Generic kernel. 
+ CBCGenericSweepScratch generic_scratch_; private: + /// Selected sweep function pointer (Generic or FixedN). SweepFunc sweep_impl_td_ = nullptr; }; -} // namespace opensn +} // namespace opensn \ No newline at end of file diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h index 7413c44ab7..4504375d9f 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h @@ -15,71 +15,198 @@ namespace opensn { +/** + * Aggregated sweep parameters for one cell in the CBC Generic/FixedN kernels. + * + * Bundles all data needed by the per-cell sweep kernel into a single struct + * to avoid long parameter lists. Includes references to the spatial + * discretization, groupset, FLUDS, cell geometry, and unit cell matrices. + * Constructed once per cell by CBCSweepChunk::MakeSweepData. + */ struct CBCSweepData { + /// Spatial discretization providing DOF mapping. const SpatialDiscretization& discretization; + /// Source moment vector (indexed by DOF mapping). const std::vector& source_moments; + /// Groupset containing quadrature and group range. const LBSGroupset& groupset; - const BlockID2XSMap& xs; + /// Number of angular moments. unsigned int num_moments; + /// Maximum number of DOFs (nodes) per cell in the mesh. unsigned int max_num_cell_dofs; + /// Whether to store solved angular fluxes into destination_psi. bool save_angular_flux; + /// Stride in the psi array: num_angles_in_quadrature * num_groups. size_t groupset_angle_group_stride; + /// Stride in the psi array: num_groups. size_t groupset_group_stride; + /// Output scalar flux moments vector (accumulated across angles). std::vector& destination_phi; + /// Output angular flux vector (written if save_angular_flux is true). 
std::vector& destination_psi; + /// Whether surface source boundary conditions are active. bool surface_source_active; + /// Whether the RHS time-derivative term is included. bool include_rhs_time_term; + /// Owning discrete ordinates problem (for time-step and theta access). DiscreteOrdinatesProblem& problem; + /// Previous-time-step angular flux (nullptr for steady-state sweeps). const std::vector* psi_old; + /// Energy group block size for FixedN SIMD batch solve. unsigned int group_block_size; + /// FLUDS providing local/nonlocal angular flux access. CBC_FLUDS& fluds; + /// Current cell being swept. const Cell& cell; + /// Local ID of the current cell. std::uint32_t cell_local_id; + /// Cell mapping providing face-node maps and DOF counts. const CellMapping& cell_mapping; + /// Transport view providing cross-section data and DOF mapping. CellLBSView& cell_transport_view; + /// Number of faces on the current cell. size_t cell_num_faces; + /// Number of nodes on the current cell. size_t cell_num_nodes; + /// Number of energy groups in the groupset. size_t gs_size; + /// First group index in the groupset. unsigned int gs_gi; + /// Number of angles in the current angle set. size_t num_angles_in_as; + /// Per-angle group stride (= num_groups). unsigned int group_stride; + /// Per-node angular stride (= num_angles * num_groups). size_t group_angle_stride; + /// Volume integral: \f$\int_V \nabla\phi_i \cdot \phi_j \, dV\f$. const DenseMatrix& G; + /// Mass matrix: \f$\int_V \phi_i \phi_j \, dV\f$. const DenseMatrix& M; + /// Per-face surface mass matrices: \f$\int_S \phi_i \phi_j \, dS\f$. const std::vector>& M_surf; + /// Per-face surface integrals: \f$\int_S \phi_i \, dS\f$. const std::vector>& IntS_shapeI; }; +/// Pre-resolved metadata for one incoming face of the current cell. +struct CBCIncomingFaceData +{ + /// Nodal mapping for local face access (nullptr for nonlocal/boundary). 
+ const FaceNodalMapping* face_nodal_mapping = nullptr; + /// Base pointer for local/nonlocal incoming face psi; null only for boundary faces. + double* psi_base = nullptr; +}; + +/// Pre-resolved metadata for one outgoing face of the current cell. +struct CBCOutgoingFaceData +{ + /// Whether the face is a reflecting boundary. + bool is_reflecting_boundary_face = false; + /// Base pointer for local outgoing face psi, when applicable. + double* psi_base = nullptr; + /// Nonlocal face info for MPI send staging; null for local/boundary faces. + const CBC_FLUDSCommonData::OutgoingNonlocalFaceInfo* outgoing_nonlocal_face_info = nullptr; +}; + +/** + * Reusable scratch buffers for the CBC Generic sweep kernel. + * + * Allocated once per sweep chunk and resized lazily via EnsureCapacity. + * Avoids per-cell heap allocation in the hot path. + */ +struct CBCGenericSweepScratch +{ + /// Transport matrix: \f$A_{ij} = \hat\Omega \cdot G_{ij} + \text{face terms}\f$. + DenseMatrix Amat; + /// Temporary copy of A with \f$\sigma_t M\f$ added, consumed by Gauss elimination. + DenseMatrix Atemp; + /// Per-group RHS vectors. + std::vector> b; + /// Per-node source assembly scratch. + std::vector source; + /// Per-face dot product \f$\hat\Omega \cdot \hat n_f\f$. + std::vector face_mu_values; + /// Per-group time-absorption coefficient \f$v_g^{-1} / (\theta \Delta t)\f$. + std::vector tau_gsg; + /// Pre-resolved incoming face metadata (one per cell face). + std::vector incoming_face_data; + /// Pre-resolved outgoing face metadata (one per cell face). + std::vector outgoing_face_data; + /// Pre-computed DOF indices: \c moment_dof_map[m * cell_num_nodes + i]. 
+ std::vector moment_dof_map; + + void + EnsureCapacity(const size_t max_num_cell_dofs, const size_t gs_size, const size_t cell_num_faces) + { + if (Amat.Rows() != max_num_cell_dofs or Amat.Columns() != max_num_cell_dofs) + { + Amat = DenseMatrix(max_num_cell_dofs, max_num_cell_dofs); + Atemp = DenseMatrix(max_num_cell_dofs, max_num_cell_dofs); + } + + if (b.size() != gs_size) + b.assign(gs_size, Vector(max_num_cell_dofs)); + else + for (auto& vec : b) + if (vec.Rows() != max_num_cell_dofs) + vec = Vector(max_num_cell_dofs); + + if (source.size() != max_num_cell_dofs) + source.assign(max_num_cell_dofs, 0.0); + + if (face_mu_values.size() != cell_num_faces) + face_mu_values.assign(cell_num_faces, 0.0); + + if (incoming_face_data.size() != cell_num_faces) + incoming_face_data.assign(cell_num_faces, CBCIncomingFaceData{}); + + if (outgoing_face_data.size() != cell_num_faces) + outgoing_face_data.assign(cell_num_faces, CBCOutgoingFaceData{}); + } +}; + +/** + * Generic CBC sweep kernel for one cell, parameterized by time dependence. + * + * Assembles and solves the local transport system for all angles and groups + * in the angle set, using dynamic-size matrices and Gauss elimination. + * Used when the cell node count does not match a compile-time FixedN + * specialization. 
+ * + * \tparam time_dependent if true, include the time-derivative source term + */ template inline void -CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set) +CBC_Sweep_Generic(CBCSweepData& data, CBCGenericSweepScratch& scratch, AngleSet& angle_set) { const auto& groupset = data.groupset; const auto& m2d_op = groupset.quadrature->GetMomentToDiscreteOperator(); const auto& d2m_op = groupset.quadrature->GetDiscreteToMomentOperator(); - - DenseMatrix Amat(data.max_num_cell_dofs, data.max_num_cell_dofs); - DenseMatrix Atemp(data.max_num_cell_dofs, data.max_num_cell_dofs); - std::vector> b(data.gs_size, Vector(data.max_num_cell_dofs)); - std::vector source(data.max_num_cell_dofs); - std::vector face_mu_values(data.cell_num_faces); + scratch.EnsureCapacity(data.max_num_cell_dofs, data.gs_size, data.cell_num_faces); + auto& Amat = scratch.Amat; + auto& Atemp = scratch.Atemp; + auto& b = scratch.b; + auto& source = scratch.source; + auto& face_mu_values = scratch.face_mu_values; const auto& face_orientations = angle_set.GetSPDS().GetCellFaceOrientations()[data.cell_local_id]; - const auto& sigma_t = data.xs.at(data.cell.block_id)->GetSigmaTotal(); + const auto& cell_xs = data.cell_transport_view.GetXS(); + const auto& sigma_t = cell_xs.GetSigmaTotal(); - std::vector tau_gsg; + scratch.tau_gsg.clear(); if constexpr (time_dependent) { - const auto& inv_velg = data.xs.at(data.cell.block_id)->GetInverseVelocity(); + const auto& inv_velg = cell_xs.GetInverseVelocity(); const double theta = data.problem.GetTheta(); const double inv_theta = 1.0 / theta; const double dt = data.problem.GetTimeStep(); const double inv_dt = 1.0 / dt; + auto& tau_gsg = scratch.tau_gsg; tau_gsg.assign(data.gs_size, 0.0); for (size_t gsg = 0; gsg < data.gs_size; ++gsg) tau_gsg[gsg] = inv_velg[data.gs_gi + gsg] * inv_theta * inv_dt; @@ -91,6 +218,66 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set) : nullptr; const auto& as_angle_indices = angle_set.GetAngleIndices(); + const auto& 
cbc_common = dynamic_cast(data.fluds.GetCommonData()); + auto* const async_comm = dynamic_cast(angle_set.GetCommunicator()); + auto& incoming_face_data = scratch.incoming_face_data; + auto& outgoing_face_data = scratch.outgoing_face_data; + for (size_t f = 0; f < data.cell_num_faces; ++f) + { + incoming_face_data[f] = CBCIncomingFaceData{}; + outgoing_face_data[f] = CBCOutgoingFaceData{}; + const auto& face = data.cell.faces[f]; + const bool is_local_face = data.cell_transport_view.IsFaceLocal(f); + const bool is_boundary_face = not face.has_neighbor; + const auto* face_nodal_mapping = + &data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f); + + if (face_orientations[f] == FaceOrientation::INCOMING) + { + auto& face_data = incoming_face_data[f]; + face_data.face_nodal_mapping = face_nodal_mapping; + if (is_local_face) + face_data.psi_base = + data.fluds.GetLocalFacePsiPointer(data.cell_local_id, static_cast(f)); + else if (not is_boundary_face) + face_data.psi_base = data.fluds.GetIncomingNonlocalFacePsiPointer( + data.cell_local_id, static_cast(f)); + } + + if (face_orientations[f] == FaceOrientation::OUTGOING) + { + auto& face_data = outgoing_face_data[f]; + face_data.is_reflecting_boundary_face = + is_boundary_face and angle_set.GetBoundaries()[face.neighbor_id]->IsReflecting(); + if (is_local_face) + face_data.psi_base = + data.fluds.GetLocalFacePsiPointer(data.cell_local_id, static_cast(f)); + if (not is_local_face and not is_boundary_face) + face_data.outgoing_nonlocal_face_info = + &cbc_common.GetOutgoingNonlocalFaceInfo(data.cell_local_id, static_cast(f)); + } + } + + auto& moment_dof_map = scratch.moment_dof_map; + moment_dof_map.resize(static_cast(data.num_moments) * data.cell_num_nodes); + for (unsigned int m = 0; m < data.num_moments; ++m) + for (size_t i = 0; i < data.cell_num_nodes; ++i) + moment_dof_map[static_cast(m) * data.cell_num_nodes + i] = + data.cell_transport_view.MapDOF(i, m, data.gs_gi); + + double* psi_new_base = nullptr; 
+ double theta = 1.0; + double inv_theta = 1.0; + if (data.save_angular_flux) + { + psi_new_base = &data.destination_psi[data.discretization.MapDOFLocal( + data.cell, 0, groupset.psi_uk_man_, 0, 0)]; + if constexpr (time_dependent) + { + theta = data.problem.GetTheta(); + inv_theta = 1.0 / theta; + } + } for (size_t as_ss_idx = 0; as_ss_idx < data.num_angles_in_as; ++as_ss_idx) { @@ -115,10 +302,8 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set) continue; const auto& face = data.cell.faces[f]; - const bool is_local_face = data.cell_transport_view.IsFaceLocal(f); - const bool is_boundary_face = not face.has_neighbor; - const auto* face_nodal_mapping = - &data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f); + const auto& face_data = incoming_face_data[f]; + const auto* face_nodal_mapping = face_data.face_nodal_mapping; const size_t num_face_nodes = data.cell_mapping.GetNumFaceNodes(f); for (size_t fi = 0; fi < num_face_nodes; ++fi) @@ -133,13 +318,11 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set) const double* psi = nullptr; - if (is_local_face) - psi = data.fluds.UpwindPsi(*data.cell_transport_view.FaceNeighbor(f), - face_nodal_mapping->cell_node_mapping_[fj], - as_ss_idx); - else if (not is_boundary_face) - psi = data.fluds.NLUpwindPsi( - data.cell.global_id, f, face_nodal_mapping->face_node_mapping_[fj], as_ss_idx); + if (face_data.psi_base != nullptr) + psi = face_data.psi_base + + static_cast(face_nodal_mapping->face_node_mapping_[fj]) * + data.group_angle_stride + + as_ss_idx * data.group_stride; else psi = angle_set.PsiBoundary(face.neighbor_id, direction_num, @@ -165,19 +348,23 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set) { double sigma_tg = sigma_t[data.gs_gi + gsg]; if constexpr (time_dependent) + { + const auto& tau_gsg = scratch.tau_gsg; sigma_tg += tau_gsg[gsg]; + } for (size_t i = 0; i < data.cell_num_nodes; ++i) { double temp_src = 0.0; for (unsigned int m = 0; m < data.num_moments; ++m) { 
- const auto ir = data.cell_transport_view.MapDOF(i, m, data.gs_gi + gsg); + const auto ir = moment_dof_map[static_cast(m) * data.cell_num_nodes + i] + gsg; temp_src += m2d_row[m] * data.source_moments[ir]; } if constexpr (time_dependent) { + const auto& tau_gsg = scratch.tau_gsg; const size_t imap = i * data.groupset_angle_group_stride + direction_num * data.groupset_group_stride; if (data.include_rhs_time_term and psi_old) @@ -207,7 +394,7 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set) const auto wn_d2m = d2m_row[m]; for (size_t i = 0; i < data.cell_num_nodes; ++i) { - const auto ir = data.cell_transport_view.MapDOF(i, m, data.gs_gi); + const auto ir = moment_dof_map[static_cast(m) * data.cell_num_nodes + i]; for (size_t gsg = 0; gsg < data.gs_size; ++gsg) data.destination_phi[ir + gsg] += wn_d2m * b[gsg](i); } @@ -215,17 +402,6 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set) if (data.save_angular_flux) { - double* psi_new = &data.destination_psi[data.discretization.MapDOFLocal( - data.cell, 0, groupset.psi_uk_man_, 0, 0)]; - - double theta = 1.0; - double inv_theta = 1.0; - if constexpr (time_dependent) - { - theta = data.problem.GetTheta(); - inv_theta = 1.0 / theta; - } - for (size_t i = 0; i < data.cell_num_nodes; ++i) { const size_t imap = @@ -237,10 +413,10 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set) if constexpr (time_dependent) { const double psi_old_val = psi_old ? 
psi_old[imap + gsg] : 0.0; - psi_new[imap + gsg] = inv_theta * (psi_sol + (theta - 1.0) * psi_old_val); + psi_new_base[imap + gsg] = inv_theta * (psi_sol + (theta - 1.0) * psi_old_val); } else - psi_new[imap + gsg] = psi_sol; + psi_new_base[imap + gsg] = psi_sol; } } } @@ -251,35 +427,33 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set) continue; const auto& face = data.cell.faces[f]; - const bool is_local_face = data.cell_transport_view.IsFaceLocal(f); - const bool is_boundary_face = not face.has_neighbor; - const bool is_reflecting_boundary_face = - (is_boundary_face and angle_set.GetBoundaries()[face.neighbor_id]->IsReflecting()); + const auto& face_data = outgoing_face_data[f]; + const bool is_reflecting_boundary_face = face_data.is_reflecting_boundary_face; const auto& IntF_shapeI = data.IntS_shapeI[f]; - const int locality = data.cell_transport_view.FaceLocality(f); const size_t num_face_nodes = data.cell_mapping.GetNumFaceNodes(f); - const auto& face_nodal_mapping = - data.fluds.GetCommonData().GetFaceNodalMapping(data.cell_local_id, f); - std::vector* psi_nonlocal_outgoing = nullptr; + double* psi_nonlocal_outgoing = nullptr; - if (not is_boundary_face and not is_local_face) + if (face_data.outgoing_nonlocal_face_info != nullptr) { - auto* async_comm = dynamic_cast(angle_set.GetCommunicator()); - const size_t data_size_for_msg = num_face_nodes * data.group_angle_stride; + const auto& outgoing_nonlocal_face_info = *face_data.outgoing_nonlocal_face_info; + const size_t data_size_for_msg = + static_cast(outgoing_nonlocal_face_info.num_face_nodes) * data.group_angle_stride; psi_nonlocal_outgoing = - &async_comm->InitGetDownwindMessageData(locality, - face.neighbor_id, - face_nodal_mapping.associated_face_, - angle_set.GetID(), - data_size_for_msg); + async_comm + ->InitGetDownwindMessageData(outgoing_nonlocal_face_info.locality, + outgoing_nonlocal_face_info.cell_global_id, + outgoing_nonlocal_face_info.associated_face, + angle_set.GetID(), + 
data_size_for_msg) + .data(); } for (size_t fi = 0; fi < num_face_nodes; ++fi) { const int i = data.cell_mapping.MapFaceNode(f, fi); - if (is_boundary_face) + if (face_data.outgoing_nonlocal_face_info == nullptr and face_data.psi_base == nullptr) { for (size_t gsg = 0; gsg < data.gs_size; ++gsg) data.cell_transport_view.AddOutflow( @@ -287,9 +461,9 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set) } double* psi = nullptr; - if (is_local_face) - psi = data.fluds.OutgoingPsi(data.cell, i, as_ss_idx); - else if (not is_boundary_face) + if (face_data.psi_base != nullptr) + psi = face_data.psi_base + fi * data.group_angle_stride + as_ss_idx * data.group_stride; + else if (face_data.outgoing_nonlocal_face_info != nullptr) psi = data.fluds.NLOutgoingPsi(psi_nonlocal_outgoing, fi, as_ss_idx); else if (is_reflecting_boundary_face) psi = angle_set.PsiReflected(face.neighbor_id, direction_num, data.cell_local_id, f, fi); @@ -302,6 +476,16 @@ CBC_Sweep_Generic(CBCSweepData& data, AngleSet& angle_set) } } +/** + * Fixed-node-count CBC sweep kernel with AVX/AVX512 SIMD batch solve. + * + * Specialized in cbc_avx_sweep_chunk.cc for compile-time-known node counts + * (4, 8, etc.), enabling stack-allocated matrices, loop unrolling, and SIMD + * batch Gauss elimination across multiple energy groups simultaneously. 
+ * + * \tparam NumNodes compile-time number of cell nodes + * \tparam time_dependent if true, include the time-derivative source term + */ template void CBC_Sweep_FixedN(CBCSweepData& data, AngleSet& angle_set); From d38db2ee1162c681e06a047e6777d3ea618cf3fe Mon Sep 17 00:00:00 2001 From: Eappen Nelluvelil Date: Mon, 13 Apr 2026 14:26:12 -0500 Subject: [PATCH 6/6] CBCD V2 with aggregated communicator, multithreaded sweeps, and minimally sized FLUDs --- .../discrete_ordinates_problem.cc | 162 +++++-- .../sweep/angle_set/cbcd_angle_set.cu | 390 ++++++++++++++- .../sweep/angle_set/cbcd_angle_set.h | 214 ++++++++- .../sweep/communicators/cbcd_async_comm.cu | 443 ++++++++++++++++++ .../sweep/communicators/cbcd_async_comm.h | 246 ++++++++++ .../sweep/communicators/lock_free_queues.h | 170 +++++++ .../sweep/fluds/cbcd_fluds.cu | 320 +++++++------ .../sweep/fluds/cbcd_fluds.h | 179 ++++--- .../sweep/fluds/cbcd_fluds_common_data.cc | 20 +- .../sweep/fluds/cbcd_fluds_common_data.cu | 251 ++++++++-- .../sweep/fluds/cbcd_fluds_common_data.h | 123 ++++- .../sweep/fluds/cbcd_structs.h | 96 +++- .../sweep/scheduler/sweep_scheduler.cc | 11 +- .../sweep/scheduler/sweep_scheduler.cu | 204 ++------ .../sweep_chunks/cbc_sweep_kernels.h | 4 +- .../sweep_chunks/cbcd_sweep_chunk.cu | 180 +++++-- .../sweep_chunks/cbcd_sweep_chunk.h | 69 ++- .../sweep_chunks/gpu_kernel/solver.h | 48 +- 18 files changed, 2523 insertions(+), 607 deletions(-) create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.cu create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h create mode 100644 modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/lock_free_queues.h diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc index 
bf9076043a..ea0240b32d 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.cc @@ -8,6 +8,7 @@ #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/boundary/isotropic_boundary.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/boundary/arbitrary_boundary.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/aah.h" @@ -48,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -1432,13 +1434,10 @@ DiscreteOrdinatesProblem::InitializeSweepDataStructures() std::chrono::duration elapsed_seconds = end_time - start_time; const auto local_face_slots = cbc_spds_list.front()->GetMaxNumLocalPsiSlots(); - log.Log() << program_timer.GetTimeString() << "CBC SPDS local cell-face psi slot summary\n" - << " SPDS count : 1\n" - << " Elapsed : " << elapsed_seconds.count() << " s\n" - << " Max : " << local_face_slots << "\n" - << " Min : " << local_face_slots << "\n" - << " Median : " << static_cast(local_face_slots) << "\n" - << " Average : " << static_cast(local_face_slots) << "\n"; + log.Log() << "CBC SPDS cell-face slot plan calculated in " << elapsed_seconds.count() + << " s with 1 thread.\n" + << " (max, min, avg) = (" << local_face_slots << ", " << local_face_slots << ", " + << static_cast(local_face_slots) << ").\n"; } else if (not cbc_spds_list.empty()) { @@ -1448,15 +1447,14 @@ DiscreteOrdinatesProblem::InitializeSweepDataStructures() 
SPMD_ThreadPool pool(num_workers); std::atomic next_index{0}; - log.Log() << program_timer.GetTimeString() - << " Compute max num local cell-face psi slots for " << cbc_spds_list.size() - << " CBC SPDS using " << num_workers << " worker threads.\n"; + log.Log() << "Computing cell-face slot plans for " << cbc_spds_list.size() + << " CBC SPDS with " << num_workers << " threads.\n"; auto start_time = std::chrono::steady_clock::now(); pool.ExecuteBatch( [&](std::size_t /* thread ID */) { - std::size_t index; + std::size_t index = 0; // Atomically fetch the next index to work on // std::memory_order_relaxed is sufficient here because we need atomicity only for the // fetch_add operation, and there are no other synchronization requirements between @@ -1469,42 +1467,26 @@ DiscreteOrdinatesProblem::InitializeSweepDataStructures() }); auto end_time = std::chrono::steady_clock::now(); std::chrono::duration elapsed_seconds = end_time - start_time; - double elapsed_time = elapsed_seconds.count(); size_t max_local_psi_slots = 0; size_t min_local_psi_slots = std::numeric_limits::max(); - std::vector local_psi_slot_counts; - local_psi_slot_counts.reserve(cbc_spds_list.size()); + std::uint64_t total_local_psi_slots = 0; for (const auto& spds : cbc_spds_list) { const auto local_psi_slots = spds->GetMaxNumLocalPsiSlots(); max_local_psi_slots = std::max(max_local_psi_slots, local_psi_slots); min_local_psi_slots = std::min(min_local_psi_slots, local_psi_slots); - local_psi_slot_counts.push_back(local_psi_slots); + total_local_psi_slots += local_psi_slots; } - std::sort(local_psi_slot_counts.begin(), local_psi_slot_counts.end()); - const auto num_counts = local_psi_slot_counts.size(); const double avg_local_psi_slots = - static_cast(std::accumulate( - local_psi_slot_counts.begin(), local_psi_slot_counts.end(), std::size_t{0})) / - num_counts; - const double median_local_psi_slots = - (num_counts % 2 == 1) - ? 
static_cast(local_psi_slot_counts[num_counts / 2]) - : 0.5 * static_cast(local_psi_slot_counts[num_counts / 2 - 1] + - local_psi_slot_counts[num_counts / 2]); - - log.Log() << program_timer.GetTimeString() - << " CBC SPDS local cell-face psi slot statistics\n" - << " SPDS count : " << cbc_spds_list.size() << "\n" - << " Workers : " << num_workers << "\n" - << " Elapsed : " << elapsed_time << " s\n" - << " Max : " << max_local_psi_slots << "\n" - << " Min : " << min_local_psi_slots << "\n" - << " Median : " << median_local_psi_slots << "\n" - << " Average : " << avg_local_psi_slots << "\n"; + static_cast(total_local_psi_slots) / static_cast(cbc_spds_list.size()); + + log.Log() << "CBC SPDS cell-face slot plans calculated in " << elapsed_seconds.count() + << " s.\n" + << " (avg, max, min) = (" << avg_local_psi_slots << " slots, " + << max_local_psi_slots << " slots, " << min_local_psi_slots << " slots)."; } } else @@ -1842,6 +1824,12 @@ DiscreteOrdinatesProblem::InitFluxDataStructures(LBSGroupset& groupset) groupset.angle_agg = std::make_shared(sweep_boundaries_, groupset.quadrature, grid_); + std::vector cbc_fluds_local_psi_bytes; + std::vector cbc_fluds_boundary_nonlocal_bytes; + const auto num_local_spatial_dofs = discretization_->GetNumLocalDOFs(groupset.psi_uk_man_) / + groupset.psi_uk_man_.GetNumberOfUnknowns() / gs_num_grps; + std::uint64_t full_local_psi_storage_bytes = 0; + size_t angle_set_id = 0; for (const auto& so_grouping : unique_so_groupings) { @@ -1910,8 +1898,11 @@ DiscreteOrdinatesProblem::InitFluxDataStructures(LBSGroupset& groupset) else if (sweep_type_ == "CBC") { std::shared_ptr fluds; + std::size_t boundary_nonlocal_bytes = 0; if (use_gpus_) { + const auto& cbcd_common_data = + dynamic_cast(fluds_common_data); fluds = CreateCBCD_FLUDS(gs_num_grps, angle_indices.size(), grid_->local_cells.size(), @@ -1919,6 +1910,13 @@ DiscreteOrdinatesProblem::InitFluxDataStructures(LBSGroupset& groupset) groupset.psi_uk_man_, *discretization_, (not 
GetPsiNewLocal()[groupset.id].empty())); + + const auto num_groups_and_angles = gs_num_grps * angle_indices.size(); + boundary_nonlocal_bytes = (cbcd_common_data.GetNumIncomingBoundaryNodes() + + cbcd_common_data.GetNumOutgoingBoundaryNodes() + + cbcd_common_data.GetNumIncomingNonlocalNodes() + + cbcd_common_data.GetNumOutgoingNonlocalNodes()) * + num_groups_and_angles * sizeof(double); } else { @@ -1926,7 +1924,46 @@ DiscreteOrdinatesProblem::InitFluxDataStructures(LBSGroupset& groupset) gs_num_grps, angle_indices.size(), dynamic_cast(fluds_common_data)); + + const auto& cbc_common_data = dynamic_cast(fluds_common_data); + const auto num_groups_and_angles = gs_num_grps * angle_indices.size(); + constexpr std::size_t local_psi_alignment = 64; + constexpr std::size_t doubles_per_cache_line = local_psi_alignment / sizeof(double); + const auto round_up_to_cache_line_multiple = [](std::size_t value) + { + return ((value + doubles_per_cache_line - 1) / doubles_per_cache_line) * + doubles_per_cache_line; + }; + + for (std::size_t face_storage_index = 0; + face_storage_index < cbc_common_data.GetNumCellFaces(); + ++face_storage_index) + { + const auto& face_info = + cbc_common_data.GetIncomingNonlocalFaceInfoByStorageIndex(face_storage_index); + if (face_info.num_face_nodes == 0) + continue; + boundary_nonlocal_bytes += + round_up_to_cache_line_multiple(static_cast(face_info.num_face_nodes) * + num_groups_and_angles) * + sizeof(double); + } + } + + if (use_gpus_) + { + const auto& cbc_spds = dynamic_cast(fluds_common_data.GetSPDS()); + cbc_fluds_local_psi_bytes.push_back(cbc_spds.GetMaxNumLocalPsiSlots() * + cbc_spds.GetMaxLocalFaceNodeCount() * gs_num_grps * + angle_indices.size() * sizeof(double)); } + else + cbc_fluds_local_psi_bytes.push_back( + dynamic_cast(*fluds).GetLocalPsiBufferSize()); + cbc_fluds_boundary_nonlocal_bytes.push_back(boundary_nonlocal_bytes); + + full_local_psi_storage_bytes += + num_local_spatial_dofs * gs_num_grps * angle_indices.size() * 
sizeof(double); std::shared_ptr angle_set; if (use_gpus_) @@ -1957,6 +1994,61 @@ DiscreteOrdinatesProblem::InitFluxDataStructures(LBSGroupset& groupset) } // for an_ss } // for so_grouping + if (sweep_type_ == "CBC" and not cbc_fluds_local_psi_bytes.empty()) + { + const auto [min_it, max_it] = + std::minmax_element(cbc_fluds_local_psi_bytes.begin(), cbc_fluds_local_psi_bytes.end()); + const auto [boundary_nonlocal_min_it, boundary_nonlocal_max_it] = std::minmax_element( + cbc_fluds_boundary_nonlocal_bytes.begin(), cbc_fluds_boundary_nonlocal_bytes.end()); + const auto total_local_psi_storage = std::accumulate( + cbc_fluds_local_psi_bytes.begin(), cbc_fluds_local_psi_bytes.end(), std::uint64_t{0}); + const auto total_boundary_nonlocal_storage = + std::accumulate(cbc_fluds_boundary_nonlocal_bytes.begin(), + cbc_fluds_boundary_nonlocal_bytes.end(), + std::uint64_t{0}); + const auto total_managed_psi_storage = + total_local_psi_storage + total_boundary_nonlocal_storage; + std::ostringstream savings_out; + if (full_local_psi_storage_bytes > 0) + savings_out << 100.0 * (1.0 - (static_cast(total_local_psi_storage) / + static_cast(full_local_psi_storage_bytes))) + << "%."; + else + savings_out << "N/A."; + const auto format_bytes = [](const std::uint64_t bytes) + { + constexpr std::pair units[] = { + {1024.0 * 1024.0 * 1024.0, "GiB"}, {1024.0 * 1024.0, "MiB"}, {1024.0, "KiB"}, {1.0, "B"}}; + const auto bytes_as_double = static_cast(bytes); + + for (const auto& [scale, suffix] : units) + { + if (bytes_as_double >= scale || scale == 1.0) + { + std::ostringstream out; + const double value = bytes_as_double / scale; + const int precision = (scale == 1.0 || value >= 100.0) ? 0 : (value >= 10.0 ? 1 : 2); + out << std::fixed << std::setprecision(precision) << value << ' ' << suffix; + return out.str(); + } + } + + return std::string("0 B"); + }; + + log.Log() << (use_gpus_ ? 
"CBCD FLUDS" : "CBC FLUDS") << " psi storage usage across " + << cbc_fluds_local_psi_bytes.size() << " FLUDS instances.\n" + << " Total local psi storage and savings: (" << format_bytes(total_local_psi_storage) + << ", " << savings_out.str() << ")\n" + << " Total boundary/non-local storage: " + << format_bytes(total_boundary_nonlocal_storage) << ".\n" + << " Total managed local/boundary/non-local psi storage: " + << format_bytes(total_managed_psi_storage) << ".\n"; + } + + if (options_.verbose_inner_iterations) + log.Log() << program_timer.GetTimeString() << " Initialized angle aggregation."; + opensn::mpi_comm.barrier(); } diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.cu index 6ea54c785e..0b0a6f1bb3 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.cu +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.cu @@ -2,15 +2,17 @@ // SPDX-License-Identifier: MIT #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.h" -#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h" -#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/sweep_chunk.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/boundary/sweep_boundary.h" #include "framework/mesh/mesh_continuum/mesh_continuum.h" 
-#include "framework/data_types/range.h" -#include "framework/logging/log.h" -#include "framework/runtime.h" #include "caliper/cali.h" +#include +#include +#include +#include namespace opensn { @@ -24,16 +26,22 @@ CBCD_AngleSet::CBCD_AngleSet(size_t id, const MPICommunicatorSet& comm_set) : AngleSet(id, num_groups, spds, fluds, angle_indices, boundaries), cbc_spds_(dynamic_cast(spds)), - async_comm_(id, *fluds, comm_set), + comm_set_(comm_set), + cbcd_fluds_(static_cast(*fluds_)), stream_(), device_angle_indices_(angles_.size()) { + boundary_ptrs_.reserve(boundaries_.size()); + for (auto& [bid, bndry] : boundaries_) + boundary_ptrs_.emplace(bid, bndry.get()); + crb::MemoryPinningManager angle_indices_pinner_(angles_); crb::copy(device_angle_indices_, angle_indices_pinner_, angles_.size(), 0, 0, stream_); - // Set CBCD_FLUDS stream and asynchronously allocate storage for local psi - auto* cbcd_fluds = std::static_pointer_cast(fluds_).get(); - cbcd_fluds->GetStream() = stream_; - cbcd_fluds->AllocateLocalAndSavedPsi(); + cbcd_fluds_.GetStream() = stream_; + cbcd_fluds_.AllocateLocalAndSavedPsi(); + cbcd_fluds_.InitializeReflectingBoundaryNodes(boundaries_); + InitializeTaskGraphData(); + InitializeReflectingTaskMask(); } CBCD_AngleSet::~CBCD_AngleSet() @@ -43,22 +51,353 @@ CBCD_AngleSet::~CBCD_AngleSet() AsynchronousCommunicator* CBCD_AngleSet::GetCommunicator() { - return static_cast(&async_comm_); + return nullptr; +} + +void +CBCD_AngleSet::UpdateSweepDependencies(std::set& following_angle_sets) +{ + for (auto* as : following_angle_sets) + { + auto* cbcd_as = static_cast(as); + following_angle_sets_.push_back(cbcd_as); + ++(cbcd_as->num_dependencies_); + } +} + +void +CBCD_AngleSet::ResetDependencyCounter() +{ + dependency_counter_.store(num_dependencies_, std::memory_order_relaxed); +} + +bool +CBCD_AngleSet::IsOutgoingReflectingFace(const CellFace& face, + const std::uint64_t cell_local_id, + const std::size_t face_id) const +{ + if ((face.has_neighbor) or + 
(cbc_spds_.GetCellFaceOrientations()[cell_local_id][face_id] != FaceOrientation::OUTGOING)) + return false; + const auto boundary_it = boundary_ptrs_.find(face.neighbor_id); + return ((boundary_it != boundary_ptrs_.end()) and (boundary_it->second->IsReflecting())); +} + +void +CBCD_AngleSet::InitializeReflectingTaskMask() +{ + const auto& task_list = cbc_spds_.GetTaskList(); + cell_has_outgoing_reflecting_boundary_.assign(task_list.size(), 0); + reflecting_boundaries_.clear(); + reflecting_boundaries_.reserve(boundaries_.size()); + for (auto& [_, bndry] : boundaries_) + if (bndry->IsReflecting()) + reflecting_boundaries_.push_back(bndry.get()); + + for (std::size_t task_idx = 0; task_idx < task_list.size(); ++task_idx) + { + const auto& cell = *task_list[task_idx].cell_ptr; + bool has_outgoing_reflecting_face = false; + for (std::size_t f = 0; f < cell.faces.size(); ++f) + { + if (IsOutgoingReflectingFace(cell.faces[f], cell.local_id, f)) + { + has_outgoing_reflecting_face = true; + break; + } + } + + if (has_outgoing_reflecting_face) + { + cell_has_outgoing_reflecting_boundary_[task_idx] = 1; + ++initial_reflecting_task_count_; + } + } +} + +void +CBCD_AngleSet::InitializeTaskGraphData() +{ + if (not initial_deps_.empty()) + return; + + const auto& task_list = cbc_spds_.GetTaskList(); + num_tasks_ = task_list.size(); + + initial_deps_.resize(num_tasks_); + remaining_deps_.resize(num_tasks_); + successor_offsets_.assign(num_tasks_ + 1, 0); + initial_ready_cell_ids_.clear(); + initial_ready_cell_ids_.reserve(num_tasks_); + + for (std::size_t task_idx = 0; task_idx < task_list.size(); ++task_idx) + { + const auto& task = task_list[task_idx]; + initial_deps_[task_idx] = static_cast(task.num_dependencies); + successor_offsets_[task_idx + 1] = static_cast(task.successors.size()); + if (task.num_dependencies == 0) + initial_ready_cell_ids_.push_back(static_cast(task_idx)); + } + + for (std::size_t task_idx = 0; task_idx < num_tasks_; ++task_idx) + 
successor_offsets_[task_idx + 1] += successor_offsets_[task_idx]; + + successor_data_.resize(successor_offsets_.back()); + for (std::size_t task_idx = 0; task_idx < task_list.size(); ++task_idx) + { + const auto& task = task_list[task_idx]; + std::copy(task.successors.begin(), + task.successors.end(), + successor_data_.begin() + successor_offsets_[task_idx]); + } +} + +void +CBCD_AngleSet::InitializeTaskState() +{ + std::copy(initial_deps_.begin(), initial_deps_.end(), remaining_deps_.begin()); + batch_state_.Reset(); + auto& ready_cell_ids = cbcd_fluds_.GetLocalCellIDs(batch_state_.ready_buffer_index); + ready_cell_ids.clear(); + ready_cell_ids.insert( + ready_cell_ids.end(), initial_ready_cell_ids_.begin(), initial_ready_cell_ids_.end()); + num_completed_tasks_ = 0; + pending_reflecting_tasks_ = following_angle_sets_.empty() ? 0 : initial_reflecting_task_count_; +} + +bool +CBCD_AngleSet::TryRetireCompletedBatch() +{ + if ((not batch_state_.kernel_in_flight) or (not stream_.is_completed())) + return false; + + auto& completed_cell_ids = cbcd_fluds_.GetLocalCellIDs(batch_state_.launch_buffer_index); + for (std::uint32_t i = 0; i < batch_state_.launch_count; ++i) + { + const auto cell_local_id = completed_cell_ids[i]; + const auto succ_begin = successor_offsets_[cell_local_id]; + const auto succ_end = successor_offsets_[cell_local_id + 1]; + for (auto succ_i = succ_begin; succ_i < succ_end; ++succ_i) + { + if (--remaining_deps_[successor_data_[succ_i]] == 0) + cbcd_fluds_.GetLocalCellIDs(batch_state_.ready_buffer_index) + .push_back(successor_data_[succ_i]); + } + + if ((not following_angle_sets_.empty()) and (not following_angle_sets_notified_) and + (cell_has_outgoing_reflecting_boundary_[cell_local_id] != 0)) + { + assert(pending_reflecting_tasks_ > 0); + --pending_reflecting_tasks_; + } + } + + num_completed_tasks_ += batch_state_.launch_count; + batch_state_.completed_buffer_index = batch_state_.launch_buffer_index; + batch_state_.completed_count = 
batch_state_.launch_count; + batch_state_.completed_batch_pending = true; + batch_state_.launch_count = 0; + batch_state_.kernel_in_flight = false; + return true; +} + +bool +CBCD_AngleSet::TryLaunchReadyBatch(CBCDSweepChunk& sweep_chunk) +{ + auto& ready_cell_ids = cbcd_fluds_.GetLocalCellIDs(batch_state_.ready_buffer_index); + if (batch_state_.kernel_in_flight or ready_cell_ids.empty()) + return false; + + const auto launch_count = static_cast(ready_cell_ids.size()); + batch_state_.launch_buffer_index = batch_state_.ready_buffer_index; + batch_state_.launch_count = launch_count; + batch_state_.ready_buffer_index = batch_state_.AcquireFreeBuffer(); + cbcd_fluds_.GetLocalCellIDs(batch_state_.ready_buffer_index).clear(); + sweep_chunk.Sweep(launch_count, GetID(), ready_cell_ids.data()); + batch_state_.kernel_in_flight = true; + return true; +} + +void +CBCD_AngleSet::FlushCompletedBatch(CBCDSweepChunk& sweep_chunk) +{ + if (not batch_state_.completed_batch_pending) + return; + + auto& completed_cell_ids = cbcd_fluds_.GetLocalCellIDs(batch_state_.completed_buffer_index); + cbcd_fluds_.CopyOutgoingPsiBackToHost( + sweep_chunk, + *async_comm_, + GetID(), + GetAngleIndices(), + {completed_cell_ids.data(), static_cast(batch_state_.completed_count)}); + completed_cell_ids.clear(); + batch_state_.ReleaseBuffer(batch_state_.completed_buffer_index); + batch_state_.completed_buffer_index = 0; + batch_state_.completed_count = 0; + batch_state_.completed_batch_pending = false; + TryNotifyFollowingAngleSets(); +} + +void +CBCD_AngleSet::TryNotifyFollowingAngleSets() +{ + if (following_angle_sets_notified_) + return; + + if (following_angle_sets_.empty()) + { + following_angle_sets_notified_ = true; + return; + } + + if (pending_reflecting_tasks_ != 0) + return; + + for (auto* boundary : reflecting_boundaries_) + boundary->UpdateAnglesReadyStatus(angles_); + for (auto* following_angle_set : following_angle_sets_) + { + const auto old_value = + 
following_angle_set->dependency_counter_.fetch_sub(1, std::memory_order_release); + assert(old_value > 0); + } + following_angle_sets_notified_ = true; +} + +bool +CBCD_AngleSet::TryInitialize(CBCDSweepChunk& sweep_chunk) +{ + if (boundary_data_initialized_) + return false; + if (dependency_counter_.load(std::memory_order_acquire) != 0) + return false; + + CALI_CXX_MARK_SCOPE("CBCD_AngleSet::TryInitialize"); + + cbcd_fluds_.CopyIncomingBoundaryPsiToDevice(sweep_chunk, this); + InitializeTaskState(); + boundary_data_initialized_ = true; + return true; +} + +bool +CBCD_AngleSet::TryAdvanceOneStep(CBCDSweepChunk& cbcd_sweep_chunk) +{ + CALI_CXX_MARK_SCOPE("CBCD_AngleSet::TryAdvanceOneStep"); + + if (executed_ or (not boundary_data_initialized_)) + return false; + + auto& ready_cell_ids = cbcd_fluds_.GetLocalCellIDs(batch_state_.ready_buffer_index); + const bool kernel_completed = batch_state_.kernel_in_flight and stream_.is_completed(); + const bool has_incoming = async_comm_->HasIncoming(GetID()); + const bool can_finalize = (num_completed_tasks_ == num_tasks_) and + (not batch_state_.kernel_in_flight) and + (not batch_state_.completed_batch_pending); + + if ((not kernel_completed) and (not batch_state_.completed_batch_pending) and + ready_cell_ids.empty() and (not has_incoming) and (not can_finalize)) + return false; + + bool work_done = false; + + // Retire a completed kernel batch before processing new arrivals. + if (kernel_completed) + { + CALI_CXX_MARK_SCOPE("CBCD_AngleSet::RetireBatch"); + work_done |= TryRetireCompletedBatch(); + } + + // Consume any newly received non-local face data and release newly ready cells. 
+ if (has_incoming) + { + CALI_CXX_MARK_SCOPE("CBCD_AngleSet::ProcessIncoming"); + work_done |= async_comm_->ProcessIncoming( + GetID(), + [this](const IncomingFaceBatch& batch) + { + const auto* psi_base = batch.psi_data.data(); + for (const auto& entry : batch.entries) + { + const auto cell_local_id = cbcd_fluds_.ScatterReceivedFaceData( + batch.source_slot, entry.source_face_index, psi_base + entry.payload_offset); + if (--remaining_deps_[cell_local_id] == 0) + cbcd_fluds_.GetLocalCellIDs(batch_state_.ready_buffer_index) + .push_back(static_cast(cell_local_id)); + } + }); + } + + // Launch the next batch once the stream is idle. + if ((not batch_state_.kernel_in_flight) and (not ready_cell_ids.empty())) + { + CALI_CXX_MARK_SCOPE("CBCD_AngleSet::LaunchBatch"); + work_done |= TryLaunchReadyBatch(cbcd_sweep_chunk); + } + + // Flush the completed batch after launching the next one so host packing + // overlaps with device execution when another batch is ready. + if (batch_state_.completed_batch_pending) + { + CALI_CXX_MARK_SCOPE("CBCD_AngleSet::FlushBatch"); + FlushCompletedBatch(cbcd_sweep_chunk); + work_done = true; + } + + // Finalize once all tasks are done and no kernel is in flight. + if (num_completed_tasks_ == num_tasks_ and (not batch_state_.kernel_in_flight) and + (not batch_state_.completed_batch_pending)) + { + CALI_CXX_MARK_SCOPE("CBCD_AngleSet::FinalizeCompletion"); + async_comm_->SignalAngleSetComplete(GetID()); + TryNotifyFollowingAngleSets(); + executed_ = true; + cbcd_fluds_.CopySavedPsiFromDevice(); + cbcd_fluds_.CopySavedPsiToDestinationPsi(cbcd_sweep_chunk, this); + return true; + } + + return work_done; } AngleSetStatus CBCD_AngleSet::AngleSetAdvance(SweepChunk& sweep_chunk, AngleSetStatus permission) { - OpenSnLogicalError("CBCD_AngleSet::AngleSetAdvance should not be called. 
Routine is handled by " - "SweepScheduler::ScheduleAlgoAsyncFIFO."); + CALI_CXX_MARK_SCOPE("CBCD_AngleSet::AngleSetAdvance"); + + if (executed_) + return AngleSetStatus::FINISHED; + + auto& cbcd_sweep_chunk = static_cast(sweep_chunk); + if (not boundary_data_initialized_) + { + if (not TryInitialize(cbcd_sweep_chunk)) + return AngleSetStatus::NOT_FINISHED; + } + + while (not executed_) + { + if (TryAdvanceOneStep(cbcd_sweep_chunk)) + continue; + std::this_thread::yield(); + } + + return AngleSetStatus::FINISHED; } void CBCD_AngleSet::ResetSweepBuffers() { - current_task_list_.clear(); - async_comm_.Reset(); - fluds_->ClearLocalAndReceivePsi(); + batch_state_.Reset(); + for (std::size_t i = 0; i < 3; ++i) + cbcd_fluds_.GetLocalCellIDs(i).clear(); + cbcd_fluds_.ClearLocalAndReceivePsi(); + num_completed_tasks_ = 0; + pending_reflecting_tasks_ = 0; + boundary_data_initialized_ = false; + following_angle_sets_notified_ = false; + ResetDependencyCounter(); executed_ = false; } @@ -71,13 +410,12 @@ CBCD_AngleSet::PsiBoundary(uint64_t boundary_id, unsigned int g, bool surface_source_active) { - if (boundaries_[boundary_id]->IsReflecting()) - return boundaries_[boundary_id]->PsiIncoming(cell_local_id, face_num, fi, angle_num, g); - - if (not surface_source_active) - return boundaries_[boundary_id]->ZeroFlux(g); - - return boundaries_[boundary_id]->PsiIncoming(cell_local_id, face_num, fi, angle_num, g); + const auto boundary_it = boundary_ptrs_.find(boundary_id); + assert(boundary_it != boundary_ptrs_.end()); + auto* boundary = boundary_it->second; + if (not boundary->IsReflecting() and (not surface_source_active)) + return boundary->ZeroFlux(g); + return boundary->PsiIncoming(cell_local_id, face_num, fi, angle_num, g); } double* @@ -87,7 +425,9 @@ CBCD_AngleSet::PsiReflected(uint64_t boundary_id, unsigned int face_num, unsigned int fi) { - return boundaries_[boundary_id]->PsiOutgoing(cell_local_id, face_num, fi, angle_num); + const auto boundary_it = 
boundary_ptrs_.find(boundary_id); + assert(boundary_it != boundary_ptrs_.end()); + return boundary_it->second->PsiOutgoing(cell_local_id, face_num, fi, angle_num); } -} // namespace opensn \ No newline at end of file +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.h index 9842d4e21a..4a04046eda 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.h @@ -4,21 +4,100 @@ #pragma once #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/angle_set.h" -#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h" #include "caribou/main.hpp" -#include +#include +#include +#include +#include namespace crb = caribou; namespace opensn { +class CBCD_FLUDS; class CBC_SPDS; +class CBCDSweepChunk; +class CellFace; -/// CBC angle set for device. +/** + * CBCD angle set with task-graph-driven batched execution. + * + * Manages the host-side state machine for one device-resident CBCD angle set. + * The angle set waits for upstream dependencies, launches ready-cell batches on + * its stream, drains received non-local face data, and flushes completed outgoing + * data through the aggregated communicator. + */ class CBCD_AngleSet : public AngleSet { public: + /// Per-sweep launch/completion state for the current kernel batch. + struct BatchState + { + /// Buffer receiving newly ready local cell IDs. + std::uint8_t ready_buffer_index = 0; + /// Buffer backing the currently running kernel launch. 
+ std::uint8_t launch_buffer_index = 0; + /// Buffer holding the most recently completed kernel batch until it is flushed. + std::uint8_t completed_buffer_index = 0; + /// Indices of currently free mapped-host cell-ID buffers. + std::array free_buffer_indices = {1, 2, 0}; + /// Number of free mapped-host cell-ID buffers. + std::uint8_t num_free_buffers = 2; + /// Number of local cells in the currently running kernel launch. + std::uint32_t launch_count = 0; + /// Number of local cells in the completed batch waiting to be flushed. + std::uint32_t completed_count = 0; + /// Flag indicating whether a kernel launch is currently outstanding. + bool kernel_in_flight = false; + /// Flag indicating whether a completed batch is waiting to be flushed. + bool completed_batch_pending = false; + + /// Reset the batch state between sweeps. + void Reset() + { + ready_buffer_index = 0; + launch_buffer_index = 0; + completed_buffer_index = 0; + free_buffer_indices = {1, 2, 0}; + num_free_buffers = 2; + launch_count = 0; + completed_count = 0; + kernel_in_flight = false; + completed_batch_pending = false; + } + + /// Acquire one free mapped-host cell-ID buffer. + std::uint8_t AcquireFreeBuffer() + { + assert(num_free_buffers > 0); + return free_buffer_indices[--num_free_buffers]; + } + + /** + * Return one mapped-host cell-ID buffer to the free list. + * + * \param buffer_index Buffer index to release. + */ + void ReleaseBuffer(const std::uint8_t buffer_index) + { + assert(num_free_buffers < free_buffer_indices.size()); + free_buffer_indices[num_free_buffers++] = buffer_index; + } + }; + + /** + * Construct one CBCD angle set. + * + * \param id Angle-set ID. + * \param num_groups Number of groups in the angle set. + * \param spds Sweep plane data structure for this angle set. + * \param fluds Device FLUDS for this angle set. + * \param angle_indices Global angle indices represented by this angle set. + * \param boundaries Sweep-boundary table indexed by boundary ID. 
+ * \param comm_set MPI communicator set used to build the aggregated communicator. + */ CBCD_AngleSet(size_t id, size_t num_groups, const SPDS& spds, @@ -29,24 +108,60 @@ class CBCD_AngleSet : public AngleSet ~CBCD_AngleSet(); + /// Register following angle sets and initialize their startup dependency counts. + void UpdateSweepDependencies(std::set& following_angle_sets) override; + + /// Reset the unresolved angle-set dependency counter before a sweep. + void ResetDependencyCounter(); + + /// Return the delayed-data communicator for this angle set. AsynchronousCommunicator* GetCommunicator() override; + /// Bind the angle set to the sweep-chunk-owned aggregated communicator. + void SetCommunicator(CBCD_AsynchronousCommunicator& async_comm) { async_comm_ = &async_comm; } + + /// Return the communicator set used to construct the aggregated communicator. + const MPICommunicatorSet& GetCommunicatorSet() const { return comm_set_; } + void InitializeDelayedUpstreamData() override {} + /// Return the buffered-message limit used by the scheduler. int GetMaxBufferMessages() const override { return 0; } - void SetMaxBufferMessages(int new_max) override {} + /// Set the buffered-message limit used by the scheduler. + void SetMaxBufferMessages(int) override {} + + /** + * Initialize the angle set once all upstream angle-set dependencies are resolved. + * + * Copies incoming boundary data to the device, resets per-sweep task state, and + * marks the angle set ready for batched execution. + * + * \param sweep_chunk Owning CBCD sweep chunk. + * \return True when initialization was performed on this call. + */ + bool TryInitialize(CBCDSweepChunk& sweep_chunk); + + /** + * Advance the angle set by at most one scheduler step. + * + * One step may retire a completed batch, drain newly received faces, launch the + * next ready batch, flush completed outgoing data, or finalize the angle set. + * + * \param sweep_chunk Owning CBCD sweep chunk. 
+ * \return True when any forward progress was made. + */ + bool TryAdvanceOneStep(CBCDSweepChunk& sweep_chunk); AngleSetStatus AngleSetAdvance(SweepChunk& sweep_chunk, AngleSetStatus permission) override; - AngleSetStatus FlushSendBuffers() override - { - const bool all_messages_sent = async_comm_.SendData(); - return all_messages_sent ? AngleSetStatus::MESSAGES_SENT : AngleSetStatus::MESSAGES_PENDING; - } + /// Flush buffered sends for this angle set. + AngleSetStatus FlushSendBuffers() override { return AngleSetStatus::MESSAGES_SENT; } + /// Reset per-sweep state and buffers. void ResetSweepBuffers() override; + /// Report whether delayed upstream data has been received. bool ReceiveDelayedData() override { return true; } const double* PsiBoundary(uint64_t boundary_id, @@ -63,20 +178,91 @@ class CBCD_AngleSet : public AngleSet unsigned int face_num, unsigned int fi) override; + /// Return the stream associated with this angle set. crb::Stream& GetStream() { return stream_; } + /// Return the device pointer to the angle-index table. std::uint32_t* GetDeviceAngleIndices() { return device_angle_indices_.get(); } - std::vector& GetCurrentTaskList() { return current_task_list_; } + /// Check whether the angle set has completed its sweep. + bool IsExecuted() const { return executed_; } + /// Check whether the angle set has been initialized for the current sweep. + bool IsInitialized() const { return boundary_data_initialized_; } -protected: +private: const CBC_SPDS& cbc_spds_; - std::vector current_task_list_; - CBC_AsynchronousCommunicator async_comm_; + /// Communicator-set metadata for aggregated communicator construction. + const MPICommunicatorSet& comm_set_; + /// Per-angle-set FLUDS. + CBCD_FLUDS& cbcd_fluds_; + /// Sweep chunk-owned aggregated communicator. + CBCD_AsynchronousCommunicator* async_comm_ = nullptr; /// Associated crb::Stream. crb::Stream stream_; /// Angle indices on GPU. 
crb::DeviceMemory device_angle_indices_; + /// Successor offsets indexed by local cell ID. + std::vector successor_offsets_; + /// Successor local cell IDs stored in CSR order. + std::vector successor_data_; + /// Initial dependency counts per local cell. + std::vector initial_deps_; + /// Per-sweep dependency counts per local cell. + std::vector remaining_deps_; + /// Local cell IDs with zero initial dependencies. + std::vector initial_ready_cell_ids_; + /// Cached total number of local cells/tasks in task graph. + std::size_t num_tasks_ = 0; + /// Number of unresolved angleset dependencies at startup. + std::size_t num_dependencies_ = 0; + /// Atomic counter for unresolved angleset dependencies. + std::atomic dependency_counter_; + /// Following anglesets that depend on this angleset. + std::vector following_angle_sets_; + /// Cached boundary lookup table. + std::unordered_map boundary_ptrs_; + /// Reflecting boundaries touched by this angleset. + std::vector reflecting_boundaries_; + /// Explicit launch/completion state for the current sweep batch. + BatchState batch_state_; + /// Cached reflecting-boundary producer mask by local cell ID. + std::vector cell_has_outgoing_reflecting_boundary_; + /// Number of completed local tasks. + std::size_t num_completed_tasks_ = 0; + /// Initial number of local cells that produce reflecting boundary data. + std::size_t initial_reflecting_task_count_ = 0; + /// Remaining number of local cells that still need to produce reflecting boundary data. + std::size_t pending_reflecting_tasks_ = 0; + /// Flag indicating if incoming boundary data has been copied to the device. + bool boundary_data_initialized_ = false; + /// Flag indicating if following anglesets have been notified of completion. + bool following_angle_sets_notified_ = false; + + /// Build the reflecting-boundary producer mask from the CBC task graph. + void InitializeReflectingTaskMask(); + + /// Flatten the CBC task graph into lookup tables. 
+ void InitializeTaskGraphData(); + + /// Check whether a cell face is an outgoing reflecting boundary face. + bool IsOutgoingReflectingFace(const CellFace& face, + std::uint64_t cell_local_id, + std::size_t face_id) const; + + /// Reset mutable task state for a new sweep. + void InitializeTaskState(); + + /// Retire the completed kernel batch and update successor dependency state. + bool TryRetireCompletedBatch(); + + /// Launch the next ready-cell batch when the current stream is idle. + bool TryLaunchReadyBatch(CBCDSweepChunk& sweep_chunk); + + /// Pack and send deferred outgoing data for the completed batch. + void FlushCompletedBatch(CBCDSweepChunk& sweep_chunk); + + /// Notify following angle sets once all reflecting-boundary producers have completed. + void TryNotifyFollowingAngleSets(); }; -} // namespace opensn \ No newline at end of file +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.cu new file mode 100644 index 0000000000..6fe07012cc --- /dev/null +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.cu @@ -0,0 +1,443 @@ +// SPDX-FileCopyrightText: 2026 The OpenSn Authors +// SPDX-License-Identifier: MIT + +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/angle_set.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/spds.h" +#include "framework/mpi/mpi_comm_set.h" +#include "framework/runtime.h" +#include "caliper/cali.h" +#include +#include +#include +#include +#include + +namespace opensn +{ + +namespace detail +{ + +// Bounded byte reader for communicator payload deserialization. 
+struct BufferReader
+{
+  /// Current read position within the serialized payload.
+  const std::byte* ptr = nullptr;
+  /// Number of unread bytes starting at `ptr`.
+  std::size_t remaining_bytes = 0;
+
+  /// Copy one `Value` out of the byte stream and advance the cursor past it.
+  /// `std::memcpy` is used so the read does not depend on the alignment of `ptr`.
+  template <typename Value>
+  Value Load()
+  {
+    constexpr std::size_t width = sizeof(Value);
+    assert(remaining_bytes >= width);
+    Value result{};
+    std::memcpy(&result, ptr, width);
+    ptr += width;
+    remaining_bytes -= width;
+    return result;
+  }
+
+  /// Read one `std::size_t` field and advance the cursor.
+  std::size_t LoadSize() { return Load<std::size_t>(); }
+
+  /// Read one 32-bit face index and advance the cursor.
+  std::uint32_t LoadFaceIndex() { return Load<std::uint32_t>(); }
+
+  /// Advance the cursor by `num_bytes` without reading them.
+  void SkipBytes(const std::size_t num_bytes)
+  {
+    assert(remaining_bytes >= num_bytes);
+    remaining_bytes -= num_bytes;
+    ptr += num_bytes;
+  }
+
+  /// Return the current read position.
+  const std::byte* Data() const noexcept { return ptr; }
+};
+
+} // namespace detail
+
+CBCD_AsynchronousCommunicator::CBCD_AsynchronousCommunicator(
+  const std::vector& angle_sets,
+  const MPICommunicatorSet& comm_set,
+  const std::vector>& incoming_source_partitions,
+  const std::size_t max_message_bytes,
+  const std::vector& capacities)
+  : comm_set_(comm_set),
+    num_angle_sets_(angle_sets.size()),
+    mpi_tag_(static_cast(angle_sets.size())),
+    max_message_bytes_(max_message_bytes),
+    angle_set_done_(angle_sets.size())
+{
+  assert(incoming_source_partitions.size() == angle_sets.size());
+  assert(capacities.size() == angle_sets.size());
+
+  std::set sources;
+  std::set destinations;
+  std::size_t total_outgoing_faces = 0;
+  std::size_t max_outgoing_face_values = 0;
+
+  for (std::size_t i = 0; i < angle_sets.size(); ++i)
+  {
+    const auto* angle_set = angle_sets[i];
+    const auto& spds = angle_set->GetSPDS();
+    for (const int dep : spds.GetLocationDependencies())
+      sources.insert(dep);
+    for (const int succ : spds.GetLocationSuccessors())
+      destinations.insert(succ);
+
+    total_outgoing_faces += capacities[i].outgoing_faces;
+    max_outgoing_face_values =
+      std::max(max_outgoing_face_values, capacities[i].max_outgoing_face_values);
+ if (capacities[i].incoming_faces > 0) + { + // Each mailbox slot stores one incoming batch for a single angle set. Entry and value + // buffers are reserved once from the angle-set-local capacity summary and then reused. + auto mailbox = std::make_unique>(); + mailbox->Preallocate(capacities[i].incoming_faces + 1); + mailbox->InitializeSlots( + [&](IncomingFaceBatch& batch) + { + batch.entries.reserve(capacities[i].max_incoming_batch_entries); + batch.psi_data.reserve(capacities[i].max_incoming_batch_values); + batch.entries.clear(); + batch.psi_data.clear(); + batch.source_slot = 0; + }); + incoming_mailboxes_.push_back(std::move(mailbox)); + } + else + { + incoming_mailboxes_.push_back(std::make_unique>()); + } + } + + my_rank_ = opensn::mpi_comm.rank(); + source_partitions_.assign(sources.begin(), sources.end()); + source_ranks_.reserve(source_partitions_.size()); + for (const int source_partition : source_partitions_) + source_ranks_.push_back(comm_set_.MapIonJ(source_partition, my_rank_)); + + source_partition_to_slot_by_angle_set_.resize(angle_sets.size()); + for (std::size_t angle_set_id = 0; angle_set_id < angle_sets.size(); ++angle_set_id) + { + auto& source_to_slot = source_partition_to_slot_by_angle_set_[angle_set_id]; + const auto& source_partitions = incoming_source_partitions[angle_set_id]; + source_to_slot.reserve(source_partitions.size()); + for (std::size_t source_slot = 0; source_slot < source_partitions.size(); ++source_slot) + source_to_slot.emplace(source_partitions[source_slot], + static_cast(source_slot)); + } + + outgoing_queues_.reserve(destinations.size()); + dest_to_queue_index_.reserve(destinations.size()); + int queue_index = 0; + for (const int dest_rank : destinations) + { + // Each destination rank receives one bounded MPSC queue. The slots are preallocated once + // and their payload vectors retain capacity across all subsequent publications. 
+ auto queue = std::make_unique(); + queue->dest_rank = dest_rank; + queue->queue = std::make_unique>(); + if (total_outgoing_faces > 0) + queue->queue->Preallocate(total_outgoing_faces + 1); + queue->queue->InitializeSlots([max_outgoing_face_values](OutgoingFaceData& payload) + { payload.psi_data.reserve(max_outgoing_face_values); }); + outgoing_queues_.push_back(std::move(queue)); + dest_to_queue_index_[dest_rank] = queue_index++; + } + + send_batch_by_angle_set_.resize(num_angle_sets_); + for (auto& done : angle_set_done_) + done.store(false, std::memory_order_relaxed); + + if (max_message_bytes_ > 0) + recv_buffer_.Data().reserve(max_message_bytes_); +} + +CBCD_AsynchronousCommunicator::~CBCD_AsynchronousCommunicator() +{ + if (comm_thread_.joinable()) + Stop(); +} + +void +CBCD_AsynchronousCommunicator::SignalAngleSetComplete(const std::size_t angle_set_id) +{ + assert(angle_set_id < num_angle_sets_); + angle_set_done_[angle_set_id].store(true, std::memory_order_release); +} + +void +CBCD_AsynchronousCommunicator::Start() +{ + stop_requested_.store(false, std::memory_order_relaxed); + for (auto& done : angle_set_done_) + done.store(false, std::memory_order_relaxed); + in_flight_sends_.clear(); + comm_thread_ = std::thread(&CBCD_AsynchronousCommunicator::CommThreadLoop, this); +} + +void +CBCD_AsynchronousCommunicator::Stop() +{ + stop_requested_.store(true, std::memory_order_release); + if (comm_thread_.joinable()) + comm_thread_.join(); +} + +void +CBCD_AsynchronousCommunicator::CommThreadLoop() +{ + CALI_CXX_MARK_SCOPE("CBCD_AsynchronousCommunicator::CommThreadLoop"); + + // The communication thread handles all MPI communication for CBCD. + // Each iteration advances all three communication phases: outgoing batching, + // incoming pushes to angleset mailboxes, and retirement of completed nonblocking sends. 
+ while (true) + { + bool work_done = SerializeAndSend(); + work_done |= ProbeAndReceive(); + work_done |= PollInFlightSends(); + + if (stop_requested_.load(std::memory_order_acquire) and AllAngleSetsComplete()) + { + SerializeAndSend(); + while (not in_flight_sends_.empty()) + { + PollInFlightSends(); + if (not in_flight_sends_.empty()) + std::this_thread::yield(); + } + break; + } + + if (not work_done) + std::this_thread::yield(); + } +} + +bool +CBCD_AsynchronousCommunicator::SerializeAndSend() +{ + CALI_CXX_MARK_SCOPE("CBCD_AsynchronousCommunicator::SerializeAndSend"); + + bool sent_any = false; + + for (auto& destination_queue : outgoing_queues_) + { + // Gather the currently published outgoing face payloads for this destination. The queue + // is drained in FIFO order, but the serialized message is batched by angle set so the + // receiver can publish one mailbox payload per angle set. + destination_queue->queue->GetReadySlots(slot_cache_); + if (slot_cache_.empty()) + continue; + + std::size_t current_payload_bytes = sizeof(std::size_t); + std::size_t active_angle_sets = 0; + std::size_t slots_processed = 0; + + const auto send_batch = [&]() + { + // Wire format: + // [num_active_angle_sets] + // repeated: + // [angle_set_id][num_entries] + // repeated: + // [remote_face_index][payload_size][payload doubles...] 
+ InFlightSend in_flight; + in_flight.data.Data().resize(current_payload_bytes); + std::size_t offset = 0; + + const auto write_bytes = [&](const void* ptr, const std::size_t size) + { + std::memcpy(in_flight.data.Data().data() + offset, ptr, size); + offset += size; + }; + + write_bytes(&active_angle_sets, sizeof(std::size_t)); + for (std::size_t angle_set_id = 0; angle_set_id < num_angle_sets_; ++angle_set_id) + { + auto& entries = send_batch_by_angle_set_[angle_set_id]; + if (entries.empty()) + continue; + + write_bytes(&angle_set_id, sizeof(std::size_t)); + const auto num_entries = entries.size(); + write_bytes(&num_entries, sizeof(std::size_t)); + for (const auto* entry : entries) + { + write_bytes(&entry->remote_face_index, sizeof(std::uint32_t)); + const auto data_size = entry->psi_data.size(); + write_bytes(&data_size, sizeof(std::size_t)); + write_bytes(entry->psi_data.data(), data_size * sizeof(double)); + } + entries.clear(); + } + + const auto& comm = comm_set_.LocICommunicator(destination_queue->dest_rank); + const auto mapped_rank = + comm_set_.MapIonJ(destination_queue->dest_rank, destination_queue->dest_rank); + in_flight.request = comm.isend(mapped_rank, mpi_tag_, in_flight.data.Data()); + in_flight_sends_.push_back(std::move(in_flight)); + }; + + for (std::size_t slot_index = 0; slot_index < slot_cache_.size(); ++slot_index) + { + const auto* slot = slot_cache_[slot_index]; + const auto& entry = slot->payload; + const auto entry_bytes = + sizeof(std::uint32_t) + sizeof(std::size_t) + entry.psi_data.size() * sizeof(double); + + // Attempt to adhere to the message-size limit. + // Once the next entry would exceed the limit, flush the current + // batch and continue packing the remaining queue entries. 
+ if (max_message_bytes_ > 0 and current_payload_bytes + entry_bytes > max_message_bytes_ and + active_angle_sets > 0) + { + send_batch(); + destination_queue->queue->FreeSlots(slots_processed); + current_payload_bytes = sizeof(std::size_t); + active_angle_sets = 0; + slots_processed = 0; + } + + auto& entries = send_batch_by_angle_set_[entry.angle_set_id]; + if (entries.empty()) + { + ++active_angle_sets; + current_payload_bytes += 2 * sizeof(std::size_t); + } + entries.push_back(&entry); + current_payload_bytes += entry_bytes; + ++slots_processed; + } + + if (active_angle_sets > 0) + { + send_batch(); + destination_queue->queue->FreeSlots(slots_processed); + } + + sent_any = true; + } + + return sent_any; +} + +bool +CBCD_AsynchronousCommunicator::ProbeAndReceive() +{ + CALI_CXX_MARK_SCOPE("CBCD_AsynchronousCommunicator::ProbeAndReceive"); + + bool received_any = false; + const auto& recv_comm = comm_set_.LocICommunicator(my_rank_); + + for (std::size_t source_index = 0; source_index < source_ranks_.size(); ++source_index) + { + const int source_partition = source_partitions_[source_index]; + const int source_rank = source_ranks_[source_index]; + mpi::Status status; + + while (recv_comm.iprobe(source_rank, mpi_tag_, status)) + { + received_any = true; + const auto num_bytes = status.count(); + recv_buffer_.Data().resize(static_cast(num_bytes)); + recv_comm.recv(source_rank, status.tag(), recv_buffer_.Data().data(), num_bytes); + + detail::BufferReader reader{reinterpret_cast(recv_buffer_.Data().data()), + recv_buffer_.Data().size()}; + + // Walk each angleset section to determine its source slot, entry count, + // and total number of doubles, which allows for exactly one mailbox + // payload allocation. 
+ const auto num_active_angle_sets = reader.LoadSize(); + for (std::size_t as_batch = 0; as_batch < num_active_angle_sets; ++as_batch) + { + const auto angle_set_id = reader.LoadSize(); + const auto num_entries = reader.LoadSize(); + assert(angle_set_id < num_angle_sets_); + + const auto slot_it = + source_partition_to_slot_by_angle_set_[angle_set_id].find(source_partition); + assert(slot_it != source_partition_to_slot_by_angle_set_[angle_set_id].end()); + const auto source_slot = slot_it->second; + + const auto* const section_ptr = reader.Data(); + std::size_t total_values = 0; + for (std::size_t entry_index = 0; entry_index < num_entries; ++entry_index) + { + reader.LoadFaceIndex(); + const auto data_size = reader.LoadSize(); + reader.SkipBytes(data_size * sizeof(double)); + total_values += data_size; + } + const auto section_num_bytes = static_cast(reader.Data() - section_ptr); + + auto& slot = incoming_mailboxes_[angle_set_id]->ReserveSlot(); + auto& batch = slot.payload; + batch.source_slot = source_slot; + batch.entries.resize(num_entries); + batch.psi_data.resize(total_values); + detail::BufferReader section_reader{section_ptr, section_num_bytes}; + std::size_t value_offset = 0; + // Walk the compact mailbox payload with per-face offsets into one + // contiguous `psi_data` block. 
+ for (std::size_t entry_index = 0; entry_index < num_entries; ++entry_index) + { + auto& entry = batch.entries[entry_index]; + entry.source_face_index = section_reader.LoadFaceIndex(); + entry.payload_offset = value_offset; + entry.payload_size = section_reader.LoadSize(); + std::memcpy(batch.psi_data.data() + value_offset, + section_reader.Data(), + entry.payload_size * sizeof(double)); + section_reader.SkipBytes(entry.payload_size * sizeof(double)); + value_offset += entry.payload_size; + } + + incoming_mailboxes_[angle_set_id]->PublishSlot(slot); + } + } + } + + return received_any; +} + +bool +CBCD_AsynchronousCommunicator::PollInFlightSends() +{ + CALI_CXX_MARK_SCOPE("CBCD_AsynchronousCommunicator::PollInFlightSends"); + + // Compact the in-flight vector in place by swapping completed requests with the back. + bool completed_any = false; + for (std::size_t i = 0; i < in_flight_sends_.size();) + { + if (mpi::test(in_flight_sends_[i].request)) + { + completed_any = true; + in_flight_sends_[i] = std::move(in_flight_sends_.back()); + in_flight_sends_.pop_back(); + } + else + ++i; + } + return completed_any; +} + +bool +CBCD_AsynchronousCommunicator::AllAngleSetsComplete() const +{ + for (const auto& done : angle_set_done_) + if (not done.load(std::memory_order_acquire)) + return false; + + for (const auto& destination_queue : outgoing_queues_) + if (not destination_queue->queue->Empty()) + return false; + + return true; +} + +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h new file mode 100644 index 0000000000..3ce3a7bf4b --- /dev/null +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h @@ -0,0 +1,246 @@ +// SPDX-FileCopyrightText: 2026 The OpenSn Authors +// SPDX-License-Identifier: MIT + +#pragma once + +#include 
"framework/data_types/byte_array.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/lock_free_queues.h" +#include "mpicpp-lite/mpicpp-lite.h" +#include +#include +#include +#include +#include +#include +#include + +namespace mpi = mpicpp_lite; + +namespace opensn +{ + +class AngleSet; +class MPICommunicatorSet; + +/// Metadata for one received non-local face payload inside an incoming batch. +struct IncomingFaceBatchEntry +{ + /// Source-slot-local face index carried on the wire. + std::uint32_t source_face_index = 0; + /// Offset of this payload within `IncomingFaceBatch::psi_data`. + std::size_t payload_offset = 0; + /// Number of doubles in this payload. + std::size_t payload_size = 0; +}; + +/// One received mailbox payload grouped by sending source slot and angle set. +struct IncomingFaceBatch +{ + /// Source-locality slot for the sending partition. + std::uint32_t source_slot = 0; + /// Per-face metadata for the packed payload block. + std::vector entries; + /// Packed received doubles for all faces in the batch. + std::vector psi_data; +}; + +/// One outgoing non-local face payload published by a sweep worker. +struct OutgoingFaceData +{ + /// Producing angle-set ID. + std::size_t angle_set_id = 0; + /// Receiver-local face index understood by the destination rank. + std::uint32_t remote_face_index = 0; + /// Packed outgoing doubles for one non-local face. + std::vector psi_data; +}; + +/// Queue-capacity summary for one angle set. +struct AngleSetCapacity +{ + /// Number of outgoing non-local faces produced by this angle set. + std::size_t outgoing_faces = 0; + /// Number of incoming non-local faces consumed by this angle set. + std::size_t incoming_faces = 0; + /// Maximum number of doubles in one outgoing face payload. + std::size_t max_outgoing_face_values = 0; + /// Maximum number of face entries in one received batch. 
+ std::size_t max_incoming_batch_entries = 0; + /// Maximum number of doubles in one received batch. + std::size_t max_incoming_batch_values = 0; +}; + +/** + * Aggregated CBCD communicator with one dedicated progress thread. + * + * Sweep worker threads publish outgoing non-local face payloads into per-destination MPSC queues. + * The communication thread drains those queues, batches payloads by angle set subject to + * the configured message-size limit, serializes them into MPI messages, and posts nonblocking + * sends. The communication thread also probes for incoming messages, deserializes them into compact + * `IncomingFaceBatch` payloads, and publishes those batches into per-angle-set incoming + * mailboxes. + * + * The aggregated communicator assumes the following communication patterns and sweep worker + * thread interactions: + * - sweep worker threads only write outgoing queue slots, + * - the communication thread handles only the draining of outgoing queues and routing of + * incoming batches to angle-set mailboxes, + * - each angle-set owner thread only drains its own incoming mailbox. + * + * Aggregated communicator flow: + * 1. A sweep worker publishes one completed non-local face payload into the ring buffer + * associated with the destination rank. + * 2. The communication thread gathers ready slots, groups them by angle set, and serializes + * one or more MPI messages subject to the configured byte limit. + * 3. The destination rank probes for those messages, maps the sending partition to its local + * source slot, and reconstructs one compact `IncomingFaceBatch` per angle-set section. + * 4. The communication thread publishes each reconstructed batch into the mailbox owned by + * that angle set. + * 5. The angle-set owner thread drains its mailbox and copies the received face data into + * the corresponding non-local FLUDS storage. + */ +class CBCD_AsynchronousCommunicator +{ +public: + /** + * Construct the CBCD asynchronous communicator. 
+ * + * \param angle_sets Angle sets served by the communicator. + * \param comm_set MPI communicator set used for point-to-point exchanges. + * \param incoming_source_partitions Incoming source partitions grouped by angle set. + * \param max_message_bytes Maximum serialized MPI payload size. A value of zero disables + * message-size splitting. + * \param capacities Queue-capacity summary for each angle set. + */ + CBCD_AsynchronousCommunicator(const std::vector& angle_sets, + const MPICommunicatorSet& comm_set, + const std::vector>& incoming_source_partitions, + std::size_t max_message_bytes, + const std::vector& capacities); + + ~CBCD_AsynchronousCommunicator(); + + /** + * Publish one outgoing non-local face payload. + * + * \param dest_rank Destination rank. + * \param angle_set_id Producing angle-set ID. + * \param remote_face_index Receiver-local face index. + * \param data_size Number of doubles in the payload. + * \param fill Callback that fills the reserved payload buffer. + */ + template + void EnqueueOutgoing(int dest_rank, + std::size_t angle_set_id, + std::uint32_t remote_face_index, + std::size_t data_size, + FillCallback&& fill) + { + const auto it = dest_to_queue_index_.find(dest_rank); + assert(it != dest_to_queue_index_.end()); + auto& queue = *outgoing_queues_[it->second]->queue; + auto& slot = queue.ReserveSlot(); + slot.payload.angle_set_id = angle_set_id; + slot.payload.remote_face_index = remote_face_index; + slot.payload.psi_data.resize(data_size); + fill(slot.payload.psi_data.data()); + queue.PublishSlot(slot); + } + + /** + * Drain all currently ready incoming batches for one angle set. + * + * \param angle_set_id Angle-set ID. + * \param callback Callback invoked for each incoming batch payload. + * \return `true` if at least one batch was consumed. 
+ */ + template + bool ProcessIncoming(std::size_t angle_set_id, Callback&& callback) + { + assert(angle_set_id < num_angle_sets_); + return incoming_mailboxes_[angle_set_id]->ProcessReady(std::forward(callback)) > 0; + } + + /// Report whether the specified angle set currently has a published incoming batch. + bool HasIncoming(std::size_t angle_set_id) const + { + assert(angle_set_id < num_angle_sets_); + return not incoming_mailboxes_[angle_set_id]->Empty(); + } + + /// Mark one angle set as locally complete. + void SignalAngleSetComplete(std::size_t angle_set_id); + /// Start the communication thread. + void Start(); + /// Request termination and join the communication thread. + void Stop(); + +private: + /// Outgoing queue for one destination rank. + struct DestinationQueue + { + /// Destination rank. + int dest_rank = 0; + /// Outgoing MPSC queue drained by the communication thread. + std::unique_ptr> queue; + }; + + /// One in-flight nonblocking MPI send and its owned serialized bytes. + struct InFlightSend + { + /// Nonblocking MPI request. + mpi::Request request; + /// Owned serialized payload storage. + ByteArray data; + }; + + /// Run the communication-thread progress loop. + void CommThreadLoop(); + /// Drain outgoing queues, serialize batches, and post MPI sends. + bool SerializeAndSend(); + /// Probe for incoming MPI messages, deserialize them, and publish mailbox batches. + bool ProbeAndReceive(); + /// Retire completed nonblocking sends. + bool PollInFlightSends(); + /// Report whether all angle sets are complete and no local outgoing work remains. + bool AllAngleSetsComplete() const; + + /// Communicator set used for all CBCD point-to-point exchanges. + const MPICommunicatorSet& comm_set_; + /// Number of managed angle sets. + std::size_t num_angle_sets_; + /// MPI tag shared by all communicator messages in this instance. + int mpi_tag_; + /// Maximum serialized MPI payload size. + std::size_t max_message_bytes_; + /// Local MPI rank. 
+ int my_rank_ = 0; + /// Source partitions that can send to this rank. + std::vector source_partitions_; + /// Source ranks mapped into the local communicator for receives. + std::vector source_ranks_; + /// Source-partition to source-slot map grouped by angle set. + std::vector> source_partition_to_slot_by_angle_set_; + /// Outgoing destination queues. + std::vector> outgoing_queues_; + /// Destination-rank to outgoing-queue index map. + std::unordered_map dest_to_queue_index_; + /// Per-angle-set incoming mailboxes. + std::vector>> incoming_mailboxes_; + /// Per-angle-set transient send batches assembled by the communication thread. + std::vector> send_batch_by_angle_set_; + /// Reusable receive buffer for one incoming MPI payload. + ByteArray recv_buffer_; + /// Outstanding nonblocking sends owned by the communication thread. + std::vector in_flight_sends_; + /// Termination flag for the communication thread. + std::atomic stop_requested_{false}; + /// Per-angle-set local completion flags. + std::vector> angle_set_done_; + /// Dedicated communication thread. + std::thread comm_thread_; + /// Scratch vector used while gathering ready outgoing queue slots. + std::vector::Slot*> slot_cache_; +}; + +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/lock_free_queues.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/lock_free_queues.h new file mode 100644 index 0000000000..337ca574a7 --- /dev/null +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/lock_free_queues.h @@ -0,0 +1,170 @@ +// SPDX-FileCopyrightText: 2026 The OpenSn Authors +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include + +namespace opensn +{ + +/** + * Bounded lock-free multi-producer, single-consumer ring buffer. 
+ * + * Producers reserve slots through an atomic head counter and publish them with a per-slot + * ready flag. The single consumer drains in FIFO order through the tail index. The queue + * is bounded and reuses preallocated slots; it performs no dynamic allocation once the + * storage has been initialized. + * + * In the CBCD aggregated communicator, LockFreeRingBuffer serves two roles: + * 1. an outgoing per-destination queue written by sweep worker threads and drained by the + * communication thread, + * 2. an incoming per-angle-set queue written by the communication thread and drained by the + * owning angleset worker thread. + * + * LockFreeRingBuffer works under the following assumptions: + * - producers reserve one slot, write the payload in place, and publish the slot exactly + * once + * - the consumer drains published slots in FIFO order and returns them to the ring for + * reuse. + * + * This yields a fixed-capacity queue with explicit slot reuse. + */ +template +class LockFreeRingBuffer +{ +public: + /// Slot payload with a publication flag. + struct Slot + { + /// Stored payload. + T payload; + /// Publication flag visible to the single consumer. + std::atomic ready{false}; + }; + + /** + * Allocate storage for the requested number of slots. + * + * \param capacity Number of ring-buffer slots. + */ + void Preallocate(const std::size_t capacity) { buffer_ = std::vector(capacity); } + + /** + * Initialize every slot payload in place. + * + * \tparam Callback Callable invoked once per slot payload. + * \param cb Initialization callback. + */ + template + void InitializeSlots(Callback&& cb) + { + for (auto& slot : buffer_) + cb(slot.payload); + } + + /** + * Reserve one slot for a producer. + * + * \return Writable slot reference. 
+   */
+  Slot& ReserveSlot()
+  {
+    // Precondition: Preallocate() was called with a non-zero capacity; the modulo below
+    // divides by `buffer_.size()`.
+    const auto idx = head_.fetch_add(1, std::memory_order_relaxed) % buffer_.size();
+    // Wait until the consumer has released this slot from its previous use.
+    while (buffer_[idx].ready.load(std::memory_order_acquire))
+      std::this_thread::yield();
+    return buffer_[idx];
+  }
+
+  /**
+   * Publish one reserved slot to the consumer.
+   *
+   * \param slot Slot to publish.
+   */
+  void PublishSlot(Slot& slot) { slot.ready.store(true, std::memory_order_release); }
+
+  /**
+   * Gather currently ready slots without consuming them.
+   *
+   * Slots are collected in FIFO order starting at the consumer tail and stopping at the
+   * first unpublished slot. At most one full pass over the ring is made, so the call
+   * terminates even when every slot is published.
+   *
+   * \param out Output vector of ready slot pointers. Cleared on entry.
+   */
+  void GetReadySlots(std::vector<Slot*>& out)
+  {
+    out.clear();
+
+    const auto capacity = buffer_.size();
+    // Bounding the scan by `capacity` also covers the unallocated (empty) ring.
+    for (std::size_t scanned = 0; scanned < capacity; ++scanned)
+    {
+      auto& slot = buffer_[(tail_ + scanned) % capacity];
+      if (not slot.ready.load(std::memory_order_acquire))
+        break;
+      out.push_back(&slot);
+    }
+  }
+
+  /**
+   * Release the next `count` ready slots after they have been consumed.
+   *
+   * \param count Number of slots to free.
+   */
+  void FreeSlots(const std::size_t count)
+  {
+    const auto capacity = buffer_.size();
+    for (std::size_t i = 0; i < count; ++i)
+    {
+      // Clearing the flag hands the slot back to producers waiting in ReserveSlot().
+      buffer_[tail_ % capacity].ready.store(false, std::memory_order_release);
+      ++tail_;
+    }
+  }
+
+  /**
+   * Consume all ready slots in FIFO order.
+   *
+   * \tparam Callback Callable invoked with each slot payload.
+   * \param cb Consumer callback.
+   * \return Number of consumed slots.
+   */
+  template <typename Callback>
+  std::size_t ProcessReady(Callback&& cb)
+  {
+    if (buffer_.empty())
+      return 0;
+
+    const auto capacity = buffer_.size();
+    std::size_t count = 0;
+    while (true)
+    {
+      auto& slot = buffer_[tail_ % capacity];
+      if (not slot.ready.load(std::memory_order_acquire))
+        break;
+      cb(slot.payload);
+      // Release the slot only after the callback has finished with the payload.
+      slot.ready.store(false, std::memory_order_release);
+      ++tail_;
+      ++count;
+    }
+    return count;
+  }
+
+  /// Check whether the queue currently has no published slots.
+ bool Empty() const + { + if (buffer_.empty()) + return true; + return not buffer_[tail_ % buffer_.size()].ready.load(std::memory_order_acquire); + } + +private: + /// Ring-buffer storage. + std::vector buffer_; + /// Producer reservation index. + alignas(std::hardware_destructive_interference_size) std::atomic head_{0}; + /// Consumer drain index. + alignas(std::hardware_destructive_interference_size) std::size_t tail_{0}; +}; + +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.cu index dde99da1f6..0bf592d5e3 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.cu +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.cu @@ -3,55 +3,67 @@ #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h" -#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h" #include "modules/linear_boltzmann_solvers/lbs_problem/device/carrier/mesh_carrier.h" +#include "framework/mesh/mesh_continuum/mesh_continuum.h" #include "framework/math/unknown_manager/unknown_manager.h" #include "framework/math/spatial_discretization/spatial_discretization.h" #include "framework/logging/log.h" #include "framework/runtime.h" +#include +#include +#include +#include +#include 
#include +#include "caliper/cali.h" namespace opensn { -CBCD_FLUDS::CBCD_FLUDS(size_t num_groups, - size_t num_angles, - size_t num_local_cells, +CBCD_FLUDS::CBCD_FLUDS(std::size_t num_groups, + std::size_t num_angles, + std::size_t num_local_cells, const CBCD_FLUDSCommonData& common_data, const UnknownManager& psi_uk_man, const SpatialDiscretization& sdm, bool save_angular_flux) : FLUDS(num_groups, num_angles, common_data.GetSPDS()), common_data_(common_data), + cbc_spds_(static_cast(common_data.GetSPDS())), psi_uk_man_(psi_uk_man), sdm_(sdm), - num_angles_in_gs_quadrature_(psi_uk_man_.GetNumberOfUnknowns()), - num_quadrature_local_dofs_(sdm_.GetNumLocalDOFs(psi_uk_man_)), - num_local_spatial_dofs_(num_quadrature_local_dofs_ / num_angles_in_gs_quadrature_ / + num_local_spatial_dofs_(sdm_.GetNumLocalDOFs(psi_uk_man_) / psi_uk_man_.GetNumberOfUnknowns() / num_groups_), - local_psi_data_size_(num_local_spatial_dofs_ * num_groups_and_angles_), - incoming_boundary_node_map_(common_data_.GetIncomingBoundaryNodeMap()), - cell_to_outgoing_boundary_nodes_(common_data_.GetOutgoingBoundaryNodeMap()), - cell_to_incoming_nonlocal_nodes_(common_data_.GetIncomingNonlocalNodeMap()), - cell_to_outgoing_nonlocal_nodes_(common_data_.GetOutgoingNonlocalNodeMap()), - local_psi_(local_psi_data_size_), + local_psi_data_size_(cbc_spds_.GetTotalLocalFaceSlotNodes() * num_groups_and_angles_), + saved_psi_data_size_(num_local_spatial_dofs_ * num_groups_and_angles_), incoming_boundary_psi_(common_data_.GetNumIncomingBoundaryNodes() * num_groups_and_angles_), outgoing_boundary_psi_(common_data_.GetNumOutgoingBoundaryNodes() * num_groups_and_angles_), incoming_nonlocal_psi_(common_data_.GetNumIncomingNonlocalNodes() * num_groups_and_angles_), outgoing_nonlocal_psi_(common_data_.GetNumOutgoingNonlocalNodes() * num_groups_and_angles_), - local_cell_ids_(num_local_cells), save_angular_flux_(save_angular_flux) { - if (save_angular_flux_ and host_saved_psi_.empty()) + grid_ptr_ = 
GetSPDS().GetGrid().get(); + for (auto& local_cell_ids : local_cell_ids_) + local_cell_ids.reserve(num_local_cells); + + outgoing_node_memcpy_plan_.reserve(common_data_.GetNumOutgoingNonlocalNodes()); + for (std::size_t cell_local_id = 0; cell_local_id < common_data_.GetNumLocalCells(); + ++cell_local_id) { - host_saved_psi_ = crb::HostVector(local_psi_data_size_); - device_saved_psi_ = crb::DeviceMemory(local_psi_data_size_); + for (const auto& face_info : common_data_.GetOutgoingNonlocalFaces(cell_local_id)) + { + for (const auto& node : common_data_.GetOutgoingNodeCopies(face_info)) + { + outgoing_node_memcpy_plan_.push_back( + {static_cast(node.storage_index) * num_groups_and_angles_, + static_cast(node.face_node) * num_groups_and_angles_}); + } + } } - CreatePointerSet(); - deplocs_outgoing_messages_.reserve(common_data.GetNumIncomingNonlocalFaces()); } CBCD_FLUDS::~CBCD_FLUDS() @@ -60,7 +72,8 @@ CBCD_FLUDS::~CBCD_FLUDS() { host_saved_psi_.clear(); } - local_cell_ids_.clear(); + for (auto& local_cell_ids : local_cell_ids_) + local_cell_ids.clear(); incoming_boundary_psi_.clear(); outgoing_boundary_psi_.clear(); incoming_nonlocal_psi_.clear(); @@ -70,6 +83,65 @@ CBCD_FLUDS::~CBCD_FLUDS() void CBCD_FLUDS::AllocateLocalAndSavedPsi() { + local_psi_ = crb::DeviceMemory(local_psi_data_size_); + if (save_angular_flux_ and host_saved_psi_.empty()) + { + host_saved_psi_ = crb::HostVector(saved_psi_data_size_); + device_saved_psi_ = crb::DeviceMemory(saved_psi_data_size_); + } + CreatePointerSet(); +} + +void +CBCD_FLUDS::InitializeReflectingBoundaryNodes( + const std::map>& boundaries) +{ + const auto num_local_cells = common_data_.GetNumLocalCells(); + reflecting_outgoing_boundary_face_offsets_.assign(num_local_cells + 1, 0); + reflecting_boundary_face_plans_.clear(); + reflecting_boundary_face_plans_.reserve(common_data_.GetNumOutgoingBoundaryNodes()); + + for (std::size_t cell_local_id = 0; cell_local_id < num_local_cells; ++cell_local_id) + { + 
reflecting_outgoing_boundary_face_offsets_[cell_local_id] = + static_cast(reflecting_boundary_face_plans_.size()); + + const auto boundary_nodes = common_data_.GetOutgoingBoundaryNodes(cell_local_id); + for (std::size_t i = 0; i < boundary_nodes.size();) + { + const auto& first_node = boundary_nodes[i]; + const auto boundary_it = boundaries.find(first_node.boundary_id); + if (boundary_it == boundaries.end() or not boundary_it->second->IsReflecting()) + { + ++i; + continue; + } + + std::size_t num_nodes = 1; + while (i + num_nodes < boundary_nodes.size()) + { + const auto& node = boundary_nodes[i + num_nodes]; + if (node.boundary_id != first_node.boundary_id or + node.cell_local_id != first_node.cell_local_id or node.face_id != first_node.face_id or + node.storage_index != first_node.storage_index + num_nodes or + node.face_node != first_node.face_node + num_nodes) + break; + ++num_nodes; + } + + reflecting_boundary_face_plans_.push_back( + {boundary_it->second.get(), + static_cast(first_node.cell_local_id), + first_node.face_id, + static_cast(first_node.face_node), + static_cast(first_node.storage_index) * num_groups_and_angles_, + static_cast(num_nodes)}); + i += num_nodes; + } + + reflecting_outgoing_boundary_face_offsets_[cell_local_id + 1] = + static_cast(reflecting_boundary_face_plans_.size()); + } } void @@ -101,116 +173,101 @@ CBCD_FLUDS::CreatePointerSet() void CBCD_FLUDS::CopyIncomingBoundaryPsiToDevice(CBCDSweepChunk& sweep_chunk, CBCD_AngleSet* angle_set) { - const auto& angle_indices = angle_set->GetAngleIndices(); - const auto& num_angles = angle_indices.size(); + CALI_CXX_MARK_SCOPE("CBCD_FLUDS::CopyIncomingBoundaryPsiToDevice"); - for (const auto& node : incoming_boundary_node_map_) - { - for (size_t as_ss_idx = 0; as_ss_idx < num_angles; ++as_ss_idx) - { - auto direction_num = angle_indices[as_ss_idx]; - double* dst_psi = incoming_boundary_psi_.data() + - node.storage_index * num_groups_and_angles_ + as_ss_idx * num_groups_; - const double* src_psi 
= angle_set->PsiBoundary(node.boundary_id, - direction_num, - node.cell_local_id, - node.face_id, - node.face_node, - sweep_chunk.GetGroupsetGroupIndex(), - sweep_chunk.IsSurfaceSourceActive()); - std::copy(src_psi, src_psi + num_groups_, dst_psi); - } - } -} - -void -CBCD_FLUDS::CopyIncomingNonlocalPsiToDevice(CBCD_AngleSet* angle_set, - const std::vector& cell_local_ids) -{ - if (cell_to_incoming_nonlocal_nodes_.empty()) - return; const auto& angle_indices = angle_set->GetAngleIndices(); - const auto& num_angles = angle_indices.size(); - for (const auto& cell_local_id : cell_local_ids) + const auto num_angles = angle_indices.size(); + const std::size_t groups_bytes = num_groups_ * sizeof(double); + const auto gs_gi = sweep_chunk.GetGroupsetGroupIndex(); + const bool surface_source_active = sweep_chunk.IsSurfaceSourceActive(); + + for (const auto& face_plan : common_data_.GetIncomingBoundaryFaces()) { - auto incoming_boundary_it = cell_to_incoming_nonlocal_nodes_.find(cell_local_id); - if (incoming_boundary_it == cell_to_incoming_nonlocal_nodes_.end()) - continue; - for (const auto& node : incoming_boundary_it->second) + for (std::size_t as_ss_idx = 0; as_ss_idx < num_angles; ++as_ss_idx) { - for (size_t as_ss_idx = 0; as_ss_idx < num_angles; ++as_ss_idx) + const auto direction_num = angle_indices[as_ss_idx]; + double* dst_face = + incoming_boundary_psi_.data() + + static_cast(face_plan.base_storage_index) * num_groups_and_angles_ + + as_ss_idx * num_groups_; + for (std::size_t node = 0; node < face_plan.num_nodes; ++node) { - double* dst_psi = incoming_nonlocal_psi_.data() + - node.storage_index * num_groups_and_angles_ + as_ss_idx * num_groups_; + double* dst_psi = dst_face + node * num_groups_and_angles_; const double* src_psi = - NLUpwindPsi(node.cell_global_id, node.face_id, node.face_node_mapped, as_ss_idx); - std::copy(src_psi, src_psi + num_groups_, dst_psi); + angle_set->PsiBoundary(face_plan.boundary_id, + direction_num, + face_plan.cell_local_id, + 
face_plan.face_id, + static_cast(face_plan.first_face_node + node), + gs_gi, + surface_source_active); + std::memcpy(dst_psi, src_psi, groups_bytes); } } } } void -CBCD_FLUDS::CopyOutgoingPsiBackToHost(CBCDSweepChunk& sweep_chunk, - CBCD_AngleSet* angle_set, - const std::vector& cell_local_ids) +CBCD_FLUDS::CopyOutgoingPsiBackToHost(CBCDSweepChunk&, + CBCD_AsynchronousCommunicator& async_comm, + const std::size_t angle_set_id, + const std::vector& angle_indices, + std::span cell_local_ids) { - if (cell_to_outgoing_boundary_nodes_.empty() and cell_to_outgoing_nonlocal_nodes_.empty()) + if (common_data_.GetNumOutgoingBoundaryNodes() == 0 and + common_data_.GetNumOutgoingNonlocalFaces() == 0) return; - const auto& angle_indices = angle_set->GetAngleIndices(); - const auto& num_angles = angle_indices.size(); + + CALI_CXX_MARK_SCOPE("CBCD_FLUDS::CopyOutgoingPsiBackToHost"); + + const auto num_angles = angle_indices.size(); const auto& grid = *(GetSPDS().GetGrid()); + const std::size_t groups_bytes = num_groups_ * sizeof(double); + const std::size_t stride_bytes = num_groups_and_angles_ * sizeof(double); for (const auto& cell_local_id : cell_local_ids) { - const auto& cell = grid.local_cells[cell_local_id]; - auto outgoing_boundary_it = cell_to_outgoing_boundary_nodes_.find(cell_local_id); - if (outgoing_boundary_it != cell_to_outgoing_boundary_nodes_.end()) - for (const auto& node : outgoing_boundary_it->second) + const auto reflecting_faces = GetReflectingOutgoingBoundaryFaces(cell_local_id); + for (const auto& face_plan : reflecting_faces) + { + for (std::size_t as_ss_idx = 0; as_ss_idx < num_angles; ++as_ss_idx) { - const auto& face = cell.faces[node.face_id]; - if (angle_set->GetBoundaries().at(face.neighbor_id)->IsReflecting()) + const auto direction_num = static_cast(angle_indices[as_ss_idx]); + const double* src_face = + outgoing_boundary_psi_.data() + face_plan.src_base_offset + as_ss_idx * num_groups_; + for (std::size_t n = 0; n < face_plan.num_nodes; ++n) { - 
for (size_t as_ss_idx = 0; as_ss_idx < num_angles; ++as_ss_idx) - { - auto direction_num = angle_indices[as_ss_idx]; - double* dst_psi = angle_set->PsiReflected( - face.neighbor_id, direction_num, node.cell_local_id, node.face_id, node.face_node); - const double* src_psi = outgoing_boundary_psi_.data() + - node.storage_index * num_groups_and_angles_ + - as_ss_idx * num_groups_; - std::copy(src_psi, src_psi + num_groups_, dst_psi); - } + double* dst = face_plan.boundary->PsiOutgoing( + face_plan.cell_local_id, + face_plan.face_id, + static_cast(face_plan.first_face_node + n), + direction_num); + std::memcpy(dst, src_face + n * num_groups_and_angles_, groups_bytes); } } - auto outgoing_nonlocal_it = cell_to_outgoing_nonlocal_nodes_.find(cell_local_id); - if (outgoing_nonlocal_it != cell_to_outgoing_nonlocal_nodes_.end()) - for (const auto& node : outgoing_nonlocal_it->second) - { - const auto& face = cell.faces[node.face_id]; - const auto& cell_mapping = sdm_.GetCellMapping(cell); - const auto& face_nodal_mapping = - common_data_.GetFaceNodalMapping(node.cell_local_id, node.face_id); - const auto& num_face_nodes = cell_mapping.GetNumFaceNodes(node.face_id); - const auto& face_data_size = num_face_nodes * num_groups_and_angles_; - const int locality = - sweep_chunk.GetCellTransportView(node.cell_local_id).FaceLocality(node.face_id); - auto& async_comm = - static_cast(*angle_set->GetCommunicator()); - std::vector* psi_nonlocal_outgoing = - &async_comm.InitGetDownwindMessageData(locality, - face.neighbor_id, - face_nodal_mapping.associated_face_, - angle_set->GetID(), - face_data_size); - for (size_t as_ss_idx = 0; as_ss_idx < num_angles; ++as_ss_idx) + } + + for (const auto& face_info : common_data_.GetOutgoingNonlocalFaces(cell_local_id)) + { + const std::size_t face_data_size = + static_cast(face_info.num_face_nodes) * num_groups_and_angles_; + const int dest_rank = common_data_.GetOutgoingLocalities()[face_info.dest_slot]; + async_comm.EnqueueOutgoing( + dest_rank, 
+ angle_set_id, + face_info.remote_face_index, + face_data_size, + [this, &face_info, stride_bytes](double* dst_base) { - auto* dst_psi = NLOutgoingPsi(psi_nonlocal_outgoing, node.face_node, as_ss_idx); - const double* src_psi = outgoing_nonlocal_psi_.data() + - node.storage_index * num_groups_and_angles_ + - as_ss_idx * num_groups_; - std::copy(src_psi, src_psi + num_groups_, dst_psi); - } - } + const auto* node_plan = outgoing_node_memcpy_plan_.data() + face_info.node_copy_offset; + const auto* node_plan_end = node_plan + face_info.num_node_copies; + for (; node_plan != node_plan_end; ++node_plan) + { + auto* dst = dst_base + node_plan->dest_offset; + const double* src = outgoing_nonlocal_psi_.data() + node_plan->src_offset; + std::memcpy(dst, src, stride_bytes); + } + }); + } } } @@ -219,6 +276,7 @@ CBCD_FLUDS::CopySavedPsiFromDevice() { if (not save_angular_flux_) return; + CALI_CXX_MARK_SCOPE("CBCD_FLUDS::CopySavedPsiFromDevice"); crb::copy(host_saved_psi_, device_saved_psi_, host_saved_psi_.size(), 0, 0, stream_); } @@ -227,9 +285,13 @@ CBCD_FLUDS::CopySavedPsiToDestinationPsi(CBCDSweepChunk& sweep_chunk, CBCD_Angle { if (not save_angular_flux_) return; + + CALI_CXX_MARK_SCOPE("CBCD_FLUDS::CopySavedPsiToDestinationPsi"); + + stream_.synchronize(); + DiscreteOrdinatesProblem& problem = sweep_chunk.GetProblem(); auto* mesh = problem.GetMeshCarrier(); - auto grid = problem.GetGrid(); auto& groupset = sweep_chunk.GetGroupset(); auto& destination_psi = problem.GetPsiNewLocal()[groupset.id]; const auto& discretization = problem.GetSpatialDiscretization(); @@ -237,7 +299,8 @@ CBCD_FLUDS::CopySavedPsiToDestinationPsi(CBCDSweepChunk& sweep_chunk, CBCD_Angle groupset.psi_uk_man_.GetNumberOfUnknowns() * groupset.GetNumGroups(); const auto& angle_indices = angle_set->GetAngleIndices(); const auto& num_angles = angle_set->GetNumAngles(); - for (const auto& cell : grid->local_cells) + const std::size_t groups_bytes = num_groups_ * sizeof(double); + for (const auto& cell : 
grid_ptr_->local_cells) { double* dst_psi = &destination_psi[discretization.MapDOFLocal(cell, 0, psi_uk_man_, 0, 0)]; double* src_psi = @@ -250,7 +313,7 @@ CBCD_FLUDS::CopySavedPsiToDestinationPsi(CBCDSweepChunk& sweep_chunk, CBCD_Angle auto direction_num = angle_indices[as_ss_idx]; double* dst = dst_psi + direction_num * num_groups_; double* src = src_psi + as_ss_idx * num_groups_; - std::copy(src, src + num_groups_, dst); + std::memcpy(dst, src, groups_bytes); } dst_psi += groupset_angle_group_stride; src_psi += num_groups_and_angles_; @@ -258,32 +321,23 @@ CBCD_FLUDS::CopySavedPsiToDestinationPsi(CBCDSweepChunk& sweep_chunk, CBCD_Angle } } -double* -CBCD_FLUDS::NLUpwindPsi(uint64_t cell_global_id, - unsigned int face_id, - unsigned int face_node_mapped, - size_t as_ss_idx) +std::uint32_t +CBCD_FLUDS::ScatterReceivedFaceData(const std::uint32_t source_slot, + const std::uint32_t source_face_index, + const double* psi_data) { - auto it = deplocs_outgoing_messages_.find({cell_global_id, face_id}); - if (it == deplocs_outgoing_messages_.end()) - return nullptr; - auto& psi = it->second; - const size_t dof_map = - face_node_mapped * num_groups_and_angles_ + // Offset to start of data for face_node_mapped - as_ss_idx * num_groups_; // Offset to start of data for angle_set_index - - assert(dof_map < psi.size()); - return &psi[dof_map]; + const auto& face_info = common_data_.GetIncomingNonlocalFace(source_slot, source_face_index); + double* dst = incoming_nonlocal_psi_.data() + + static_cast(face_info.base_storage_index) * num_groups_and_angles_; + const std::size_t face_values = + static_cast(face_info.num_nodes) * num_groups_and_angles_; + std::memcpy(dst, psi_data, face_values * sizeof(double)); + return face_info.cell_local_id; } -double* -CBCD_FLUDS::NLOutgoingPsi(std::vector* psi_nonlocal_outgoing, - size_t face_node, - size_t as_ss_idx) +void +CBCD_FLUDS::ClearLocalAndReceivePsi() { - assert(psi_nonlocal_outgoing != nullptr); - const size_t addr_offset = 
face_node * num_groups_and_angles_ + as_ss_idx * num_groups_; - return &(*psi_nonlocal_outgoing)[addr_offset]; } } // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h index f466af2052..9577ae6d29 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h @@ -8,28 +8,48 @@ #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds.h" #include "modules/linear_boltzmann_solvers/lbs_problem/device/storage.h" #include "caribou/main.hpp" +#include #include -#include -#include +#include namespace crb = caribou; namespace opensn { +class CBC_SPDS; class CBCD_AngleSet; +class CBCD_AsynchronousCommunicator; +class CBCDSweepChunk; class UnknownManager; class SpatialDiscretization; -class Cell; -class CBCDSweepChunk; - -/// CBC FLUDS for device. +class SweepBoundary; +class MeshContinuum; + +/** + * CBCD FLUDS for managing boundary, local, and non-local psi buffers during sweeps. + * + * Owns the device and mapped-host angular-flux buffers used by one CBCD angle set. + * Local face data is stored in a compact slot bank sized from the static CBC slot + * assignment. + */ class CBCD_FLUDS : public FLUDS { public: - CBCD_FLUDS(size_t num_groups, - size_t num_angles, - size_t num_local_cells, + /** + * Construct the CBCD FLUDS for one angle set. + * + * \param num_groups Number of groups in angleset's groupset. + * \param num_angles Number of angles in the angleset. + * \param num_local_cells Number of local cells assigned to the angle set. + * \param common_data Shared CBCD FLUDS metadata. + * \param psi_uk_man Unknown manager for angular flux storage. + * \param sdm Spatial discretization. + * \param save_angular_flux Save angular fluxes when true. 
+ */ + CBCD_FLUDS(std::size_t num_groups, + std::size_t num_angles, + std::size_t num_local_cells, const CBCD_FLUDSCommonData& common_data, const UnknownManager& psi_uk_man, const SpatialDiscretization& sdm, @@ -37,54 +57,91 @@ class CBCD_FLUDS : public FLUDS ~CBCD_FLUDS(); - /// Get reference to the common data. + /// Return the shared CBCD FLUDS metadata. const CBCD_FLUDSCommonData& GetCommonData() const { return common_data_; } - /// Get reference to stream. + /// Return the stream associated with this angle set. crb::Stream& GetStream() { return stream_; } + /// Bytes in the local psi backing buffer for this FLUDS instance. + std::size_t GetLocalPsiBytes() const noexcept { return local_psi_data_size_ * sizeof(double); } + /// Allocate buffers asynchronously on the associated stream. void AllocateLocalAndSavedPsi(); + /** + * Build reflecting-boundary copy plans for this angle set. + * + * \param boundaries Sweep-boundary table indexed by boundary ID. + */ + void InitializeReflectingBoundaryNodes( + const std::map>& boundaries); + /// Get the stride size for each face node's angular flux data. inline std::size_t GetStrideSize() const { return num_groups_and_angles_; } - /// Get vector of local cells to be swept. - crb::MappedHostVector& GetLocalCellIDs() { return local_cell_ids_; } + /// Return one mapped host vector of local cells used by the CBCD launch pipeline. + crb::MappedHostVector& GetLocalCellIDs(const std::size_t buffer_index) + { + return local_cell_ids_[buffer_index]; + } - /// Get saved angular flux device pointer. + /// Return the device pointer to the saved angular flux buffer. double* GetSavedAngularFluxDevicePointer() { return device_saved_psi_.get(); } - /// Copy saved psi from device to host. + /// Copy saved angular fluxes from the device staging buffer to the host staging buffer. void CopySavedPsiFromDevice(); - /// Copy saved psi from host to destination psi host buffer. + /** + * Copy saved angular fluxes into the destination psi vector. 
+ * + * \param sweep_chunk Owning CBCD sweep chunk. + * \param angle_set Angle set owning these saved angular fluxes. + */ void CopySavedPsiToDestinationPsi(CBCDSweepChunk& sweep_chunk, CBCD_AngleSet* angle_set); - /// Gets pointer set to device angular flux data. + /// Return the device pointer set used by the CBCD sweep kernel. CBCD_FLUDSPointerSet& GetDevicePointerSet() { return pointer_set_; } - /// Copies incoming boundary psi from host to device. + /** + * Copy incoming boundary angular flux data from the host buffers to the device buffers. + * + * \param sweep_chunk Owning CBCD sweep chunk. + * \param angle_set Angle set supplying boundary angular flux values. + */ void CopyIncomingBoundaryPsiToDevice(CBCDSweepChunk& sweep_chunk, CBCD_AngleSet* angle_set); - /// Copies incoming non-local psi from host to device. - void CopyIncomingNonlocalPsiToDevice(CBCD_AngleSet* angle_set, - const std::vector& cell_local_ids); - - /// Copy outgoing psi on host after D2H copy is done. + /** + * Copy completed outgoing angular flux data into host-visible destinations. + * + * Reflecting boundary data is written back to the owning boundary objects. Outgoing + * non-local face data is enqueued directly into the aggregated communicator. + * + * \param sweep_chunk Owning CBCD sweep chunk. + * \param async_comm Aggregated communicator used to enqueue non-local face payloads. + * \param angle_set_id Producing angle-set ID. + * \param angle_indices Global angle indices carried by this angle set. + * \param cell_local_ids Local cells in the just-completed batch. 
+ */ void CopyOutgoingPsiBackToHost(CBCDSweepChunk& sweep_chunk, - CBCD_AngleSet* angle_set, - const std::vector& cell_local_ids); - - double* NLUpwindPsi(uint64_t cell_global_id, - unsigned int face_id, - unsigned int face_node_mapped, - size_t as_ss_idx); - - double* - NLOutgoingPsi(std::vector* psi_nonlocal_outgoing, size_t face_node, size_t as_ss_idx); - - void ClearLocalAndReceivePsi() override { deplocs_outgoing_messages_.clear(); } + CBCD_AsynchronousCommunicator& async_comm, + std::size_t angle_set_id, + const std::vector& angle_indices, + std::span cell_local_ids); + + /** + * Scatter one received non-local face payload into the mapped incoming buffer. + * + * \param source_slot Source-locality slot for the sending partition. + * \param source_face_index Source-slot-local face index carried on the wire. + * \param psi_data Packed payload doubles. + * \return Local cell ID whose dependency count should be updated. + */ + std::uint32_t ScatterReceivedFaceData(std::uint32_t source_slot, + std::uint32_t source_face_index, + const double* psi_data); + + void ClearLocalAndReceivePsi() override; void ClearSendPsi() override {} void AllocateInternalLocalPsi() override {} void AllocateOutgoingPsi() override {} @@ -93,23 +150,31 @@ class CBCD_FLUDS : public FLUDS void AllocatePrelocIOutgoingPsi() override {} void AllocateDelayedPrelocIOutgoingPsi() override {} + std::span + GetReflectingOutgoingBoundaryFaces(const std::uint64_t cell_local_id) const + { + const auto begin = reflecting_outgoing_boundary_face_offsets_[cell_local_id]; + const auto end = reflecting_outgoing_boundary_face_offsets_[cell_local_id + 1]; + return {reflecting_boundary_face_plans_.data() + begin, end - begin}; + } + private: /// Reference to the common data. const CBCD_FLUDSCommonData& common_data_; + /// CBC sweep plane data structure for this angle set. + const CBC_SPDS& cbc_spds_; + /// Unknown manager for angular flux storage. 
const UnknownManager& psi_uk_man_; + /// Spatial discretization used for saved-psi layout. const SpatialDiscretization& sdm_; - size_t num_angles_in_gs_quadrature_; - size_t num_quadrature_local_dofs_; - size_t num_local_spatial_dofs_; - size_t local_psi_data_size_; - /// Map from incoming face boundary node to indexing metadata - std::vector incoming_boundary_node_map_; - /// Map from cell to outgoing boundary node indexing metadata. - std::map> cell_to_outgoing_boundary_nodes_; - /// Map from cell to incoming nonlocal nodes indexing metadata. - std::map> cell_to_incoming_nonlocal_nodes_; - /// Map from cell to outgoing nonlocal node indexing metadata. - std::map> cell_to_outgoing_nonlocal_nodes_; + /// Number of local spatial degrees of freedom. + std::size_t num_local_spatial_dofs_; + /// Number of doubles in the local psi backing buffer. + std::size_t local_psi_data_size_; + /// Number of doubles in the saved angular-flux buffer. + std::size_t saved_psi_data_size_; + /// Owning grid pointer for cell-view access. + const MeshContinuum* grid_ptr_ = nullptr; /// Mapped host vectors for boundary and non-local angular fluxes. crb::MappedHostVector incoming_boundary_psi_; crb::MappedHostVector outgoing_boundary_psi_; @@ -117,20 +182,26 @@ class CBCD_FLUDS : public FLUDS crb::MappedHostVector outgoing_nonlocal_psi_; /// Associated angleset's stream. crb::Stream stream_; - crb::MappedHostVector local_cell_ids_; + /// Mapped host launch buffers that hold ready local cell IDs. + std::array, 3> local_cell_ids_; + /// Flag indicating whether angular fluxes are saved after the sweep. bool save_angular_flux_; /// Device storage for local angular fluxes. crb::DeviceMemory local_psi_; /// Host and device buffers for saved angular fluxes. crb::DeviceMemory device_saved_psi_; crb::HostVector host_saved_psi_; - /// Pointer set to device angular flux data + /// Pointer set used by the CBCD sweep kernel. 
CBCD_FLUDSPointerSet pointer_set_; - - /// Creates device pointer set to the local, boundary, and non-local angular flux buffers. + /// Cell-to-reflecting-face offset table. + std::vector reflecting_outgoing_boundary_face_offsets_; + /// Flat reflecting-boundary face plans. + std::vector reflecting_boundary_face_plans_; + /// Flat byte-level memcpy descriptors referenced by outgoing faces. + std::vector outgoing_node_memcpy_plan_; + + /// Build the device pointer set exposed to the CBCD sweep kernel. void CreatePointerSet(); - - std::vector> boundaryI_incoming_psi_; }; -} // namespace opensn \ No newline at end of file +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cc index 411bcebd7f..c160dc9f20 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cc @@ -2,6 +2,9 @@ // SPDX-License-Identifier: MIT #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h" +#include "framework/utils/error.h" +#include +#include namespace opensn { @@ -19,11 +22,7 @@ CBCD_FLUDSCommonData::CBCD_FLUDSCommonData( num_incoming_nonlocal_nodes_(0), num_outgoing_nonlocal_faces_(0), num_outgoing_nonlocal_nodes_(0), - device_cell_face_node_map_(nullptr), - incoming_boundary_node_map_(), - cell_to_outgoing_boundary_nodes_(), - cell_to_incoming_nonlocal_nodes_(), - cell_to_outgoing_nonlocal_nodes_() + device_cell_face_node_map_(nullptr) { CopyFlattenedNodeIndexToDevice(sdm); } @@ -45,4 +44,13 @@ CBCD_FLUDSCommonData::DeallocateDeviceMemory() } #endif -} // namespace opensn \ No newline at end of file +const GroupedIncomingNonlocalFace& +CBCD_FLUDSCommonData::GetIncomingNonlocalFace(const std::uint32_t 
source_slot, + const std::uint32_t source_face_index) const +{ + const auto begin = source_to_incoming_face_offsets_[source_slot]; + assert(begin + source_face_index < source_to_incoming_face_offsets_[source_slot + 1]); + return incoming_nonlocal_faces_[incoming_face_indices_by_source_[begin + source_face_index]]; +} + +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cu index 98d2294a72..4b8869a19b 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cu +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.cu @@ -2,11 +2,15 @@ // SPDX-License-Identifier: MIT #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/spds.h" #include "framework/math/spatial_discretization/spatial_discretization.h" #include "framework/mesh/mesh_continuum/mesh_continuum.h" #include "caribou/main.hpp" -#include +#include +#include +#include +#include namespace crb = caribou; @@ -17,18 +21,16 @@ void CBCD_FLUDSCommonData::CopyFlattenedNodeIndexToDevice(const SpatialDiscretization& sdm) { const MeshContinuum& grid = *(spds_.GetGrid()); + const auto& cbc_spds = static_cast(spds_); const size_t num_local_cells = grid.local_cells.size(); + const auto& face_orientations = spds_.GetCellFaceOrientations(); + const auto local_face_slot_ids = cbc_spds.GetLocalFaceSlotIDs(); + const auto local_face_slot_node_offsets = cbc_spds.GetLocalFaceSlotNodeOffsets(); std::uint64_t total_face_nodes = 0; for (const auto& cell : grid.local_cells) for (std::uint32_t f = 0; f < cell.faces.size(); ++f) total_face_nodes 
+= sdm.GetCellMapping(cell).GetNumFaceNodes(f); - std::vector cell_spatial_dof_offsets(num_local_cells); - size_t current_dof_offset = 0; - for (const auto& cell : grid.local_cells) - { - cell_spatial_dof_offsets[cell.local_id] = current_dof_offset; - current_dof_offset += sdm.GetCellMapping(cell).GetNumNodes(); - } + const size_t offsets_size = 2 * num_local_cells; const size_t total_size = offsets_size + total_face_nodes; std::vector local_map(total_size); @@ -36,15 +38,58 @@ CBCD_FLUDSCommonData::CopyFlattenedNodeIndexToDevice(const SpatialDiscretization std::uint64_t* indices_ptr = local_map.data() + offsets_size; std::uint64_t current_index_offset = offsets_size; std::uint64_t local_indices_filled = 0; - // Iterate over cells to fill the map and populate metadata structures + + cell_to_outgoing_boundary_node_offsets_.assign(num_local_cells + 1, 0); + cell_to_incoming_nonlocal_face_offsets_.assign(num_local_cells + 1, 0); + cell_to_outgoing_nonlocal_face_offsets_.assign(num_local_cells + 1, 0); + + std::unordered_map locality_to_dest_slot; + std::unordered_map source_partition_to_slot; + outgoing_localities_.reserve(num_local_cells); + incoming_source_partitions_.reserve(num_local_cells); + outgoing_boundary_nodes_.reserve(total_face_nodes); + outgoing_nonlocal_face_node_copies_.reserve(total_face_nodes); + struct OrderedIncomingFaceBuild + { + std::uint32_t source_slot = 0; + std::uint64_t cell_global_id = 0; + unsigned int face_id = 0; + std::uint32_t face_index = 0; + }; + struct OrderedOutgoingFaceBuild + { + std::uint32_t dest_slot = 0; + std::uint64_t cell_global_id = 0; + unsigned int face_id = 0; + std::uint32_t face_index = 0; + }; + std::vector incoming_face_order; + std::vector outgoing_face_order; + incoming_face_order.reserve(total_face_nodes); + outgoing_face_order.reserve(total_face_nodes); + + const auto update_cell_offsets = [this](const std::uint64_t cell_local_id) + { + cell_to_outgoing_boundary_node_offsets_[cell_local_id] = + 
static_cast(outgoing_boundary_nodes_.size()); + cell_to_incoming_nonlocal_face_offsets_[cell_local_id] = + static_cast(incoming_nonlocal_faces_.size()); + cell_to_outgoing_nonlocal_face_offsets_[cell_local_id] = + static_cast(outgoing_nonlocal_faces_.size()); + }; + for (const auto& cell : grid.local_cells) { + update_cell_offsets(cell.local_id); + cell_offsets_ptr[2 * cell.local_id] = current_index_offset; std::uint64_t num_cell_nodes = 0; + std::vector incoming_face_to_grouped_index(cell.faces.size(), -1); + std::vector outgoing_face_to_grouped_index(cell.faces.size(), -1); for (size_t f = 0; f < cell.faces.size(); ++f) { const CellFace& face = cell.faces[f]; - const FaceOrientation& orientation = spds_.GetCellFaceOrientations()[cell.local_id][f]; + const FaceOrientation& orientation = face_orientations[cell.local_id][f]; const FaceNodalMapping& face_nodal_mapping = grid_nodal_mappings_[cell.local_id][f]; const size_t num_face_nodes = sdm.GetCellMapping(cell).GetNumFaceNodes(f); const bool is_outgoing_face = (orientation == FaceOrientation::OUTGOING); @@ -52,14 +97,6 @@ CBCD_FLUDSCommonData::CopyFlattenedNodeIndexToDevice(const SpatialDiscretization const bool is_local_face = face.IsNeighborLocal(&grid); const bool is_boundary_face = not face.has_neighbor; - if ((not is_local_face) and (not is_boundary_face)) - { - if (is_incoming_face) - ++num_incoming_nonlocal_faces_; - else if (is_outgoing_face) - ++num_outgoing_nonlocal_faces_; - } - for (size_t fn = 0; fn < num_face_nodes; ++fn) { CBCD_NodeIndex node_index; @@ -68,33 +105,63 @@ CBCD_FLUDSCommonData::CopyFlattenedNodeIndexToDevice(const SpatialDiscretization { if (is_local_face) { - std::uint32_t nbr_local_idx = face.GetNeighborLocalID(&grid); - std::uint32_t adj_cell_node = face_nodal_mapping.cell_node_mapping_[fn]; - const std::uint64_t index = cell_spatial_dof_offsets[nbr_local_idx] + adj_cell_node; - node_index = CBCD_NodeIndex(index, is_outgoing_face, is_local_face); + const auto task_id = 
cbc_spds.GetIncomingLocalFaceTaskID( + static_cast(cell.local_id), static_cast(f)); + const auto slot_id = local_face_slot_ids[task_id]; + const auto local_face_node = + static_cast(face_nodal_mapping.face_node_mapping_[fn]); + node_index = CBCD_NodeIndex( + static_cast(local_face_slot_node_offsets[slot_id]) + local_face_node, + is_outgoing_face, + true); } else if (not is_boundary_face) { node_index = CBCD_NodeIndex(num_incoming_nonlocal_nodes_, is_outgoing_face, is_local_face); - cell_to_incoming_nonlocal_nodes_[cell.local_id].emplace_back( - NonlocalNodeInfo{cell.local_id, - cell.global_id, - static_cast(f), - fn, - face_nodal_mapping.face_node_mapping_[fn], - static_cast(num_incoming_nonlocal_nodes_)}); + int& grouped_face_index = incoming_face_to_grouped_index[f]; + if (grouped_face_index < 0) + { + grouped_face_index = + static_cast(incoming_nonlocal_faces_.size() - + cell_to_incoming_nonlocal_face_offsets_[cell.local_id]); + auto& grouped_face = incoming_nonlocal_faces_.emplace_back(); + const int source_partition = grid.cells[face.neighbor_id].partition_id; + auto [source_it, inserted] = source_partition_to_slot.try_emplace( + source_partition, static_cast(incoming_source_partitions_.size())); + if (inserted) + incoming_source_partitions_.push_back(source_partition); + grouped_face.cell_local_id = static_cast(cell.local_id); + grouped_face.base_storage_index = + static_cast(num_incoming_nonlocal_nodes_); + grouped_face.source_slot = source_it->second; + incoming_face_order.push_back( + {grouped_face.source_slot, + cell.global_id, + static_cast(f), + static_cast(incoming_nonlocal_faces_.size() - 1)}); + ++num_incoming_nonlocal_faces_; + } + + auto& grouped_face = + incoming_nonlocal_faces_[cell_to_incoming_nonlocal_face_offsets_[cell.local_id] + + grouped_face_index]; + ++grouped_face.num_nodes; ++num_incoming_nonlocal_nodes_; } else { node_index = CBCD_NodeIndex(num_incoming_boundary_nodes_, is_outgoing_face); - incoming_boundary_node_map_.emplace_back( - 
BoundaryNodeInfo{cell.local_id, - static_cast(f), - fn, - static_cast(num_incoming_boundary_nodes_), - face.neighbor_id}); + if (fn == 0) + { + incoming_boundary_face_plans_.push_back( + {face.neighbor_id, + static_cast(cell.local_id), + static_cast(f), + 0, + static_cast(num_incoming_boundary_nodes_), + static_cast(num_face_nodes)}); + } ++num_incoming_boundary_nodes_; } } @@ -102,32 +169,71 @@ CBCD_FLUDSCommonData::CopyFlattenedNodeIndexToDevice(const SpatialDiscretization { if (is_local_face) { - const int cell_node = sdm.GetCellMapping(cell).MapFaceNode(f, fn); - const std::uint64_t index = cell_spatial_dof_offsets[cell.local_id] + cell_node; - node_index = CBCD_NodeIndex(index, is_outgoing_face, is_local_face); + const auto task_id = cbc_spds.GetOutgoingLocalFaceTaskID( + static_cast(cell.local_id), static_cast(f)); + const auto slot_id = local_face_slot_ids[task_id]; + node_index = + CBCD_NodeIndex(static_cast(local_face_slot_node_offsets[slot_id]) + + static_cast(fn), + is_outgoing_face, + true); } else if (not is_boundary_face) { node_index = CBCD_NodeIndex(num_outgoing_nonlocal_nodes_, is_outgoing_face, is_local_face); - cell_to_outgoing_nonlocal_nodes_[cell.local_id].emplace_back( - NonlocalNodeInfo{cell.local_id, - cell.global_id, - static_cast(f), - fn, - face_nodal_mapping.face_node_mapping_[fn], - static_cast(num_outgoing_nonlocal_nodes_)}); + int& grouped_face_index = outgoing_face_to_grouped_index[f]; + if (grouped_face_index < 0) + { + const int locality = grid.cells[face.neighbor_id].partition_id; + auto dest_slot_it = locality_to_dest_slot.find(locality); + std::uint32_t dest_slot = 0; + if (dest_slot_it == locality_to_dest_slot.end()) + { + dest_slot = static_cast(outgoing_localities_.size()); + locality_to_dest_slot.emplace(locality, dest_slot); + outgoing_localities_.push_back(locality); + } + else + dest_slot = dest_slot_it->second; + + const auto dest_cell_global_id = face.neighbor_id; + const auto dest_face_id = + 
static_cast(face_nodal_mapping.associated_face_); + grouped_face_index = + static_cast(outgoing_nonlocal_faces_.size() - + cell_to_outgoing_nonlocal_face_offsets_[cell.local_id]); + auto& grouped_face = outgoing_nonlocal_faces_.emplace_back(); + grouped_face.dest_slot = dest_slot; + grouped_face.num_face_nodes = static_cast(num_face_nodes); + grouped_face.node_copy_offset = + static_cast(outgoing_nonlocal_face_node_copies_.size()); + outgoing_face_order.push_back( + {dest_slot, + dest_cell_global_id, + dest_face_id, + static_cast(outgoing_nonlocal_faces_.size() - 1)}); + ++num_outgoing_nonlocal_faces_; + } + + auto& grouped_face = + outgoing_nonlocal_faces_[cell_to_outgoing_nonlocal_face_offsets_[cell.local_id] + + grouped_face_index]; + outgoing_nonlocal_face_node_copies_.push_back( + {static_cast(num_outgoing_nonlocal_nodes_), + static_cast(face_nodal_mapping.face_node_mapping_[fn])}); + ++grouped_face.num_node_copies; ++num_outgoing_nonlocal_nodes_; } else { node_index = CBCD_NodeIndex(num_outgoing_boundary_nodes_, is_outgoing_face); - cell_to_outgoing_boundary_nodes_[cell.local_id].emplace_back( - BoundaryNodeInfo{cell.local_id, + outgoing_boundary_nodes_.emplace_back( + BoundaryNodeInfo{face.neighbor_id, + static_cast(cell.local_id), static_cast(f), - fn, - static_cast(num_outgoing_boundary_nodes_), - face.neighbor_id}); + static_cast(num_outgoing_boundary_nodes_), + static_cast(fn)}); ++num_outgoing_boundary_nodes_; } } @@ -139,9 +245,52 @@ CBCD_FLUDSCommonData::CopyFlattenedNodeIndexToDevice(const SpatialDiscretization } num_cell_nodes += num_face_nodes; } + update_cell_offsets(cell.local_id + 1); cell_offsets_ptr[2 * cell.local_id + 1] = num_cell_nodes; current_index_offset += num_cell_nodes; } + + std::sort(incoming_face_order.begin(), + incoming_face_order.end(), + [](const OrderedIncomingFaceBuild& lhs, const OrderedIncomingFaceBuild& rhs) + { + return std::tuple(lhs.source_slot, lhs.cell_global_id, lhs.face_id) < + std::tuple(rhs.source_slot, 
rhs.cell_global_id, rhs.face_id); + }); + + source_to_incoming_face_offsets_.assign(incoming_source_partitions_.size() + 1, 0); + for (const auto& build : incoming_face_order) + ++source_to_incoming_face_offsets_[build.source_slot + 1]; + for (std::size_t i = 0; i < incoming_source_partitions_.size(); ++i) + source_to_incoming_face_offsets_[i + 1] += source_to_incoming_face_offsets_[i]; + + incoming_face_indices_by_source_.resize(incoming_face_order.size()); + auto source_write_offsets = source_to_incoming_face_offsets_; + for (const auto& build : incoming_face_order) + incoming_face_indices_by_source_[source_write_offsets[build.source_slot]++] = build.face_index; + + std::sort(outgoing_face_order.begin(), + outgoing_face_order.end(), + [](const OrderedOutgoingFaceBuild& lhs, const OrderedOutgoingFaceBuild& rhs) + { + return std::tuple(lhs.dest_slot, lhs.cell_global_id, lhs.face_id) < + std::tuple(rhs.dest_slot, rhs.cell_global_id, rhs.face_id); + }); + + std::uint32_t current_dest_slot = 0; + std::uint32_t remote_face_index = 0; + bool first_outgoing_face = true; + for (const auto& build : outgoing_face_order) + { + if (first_outgoing_face or (build.dest_slot != current_dest_slot)) + { + current_dest_slot = build.dest_slot; + remote_face_index = 0; + first_outgoing_face = false; + } + outgoing_nonlocal_faces_[build.face_index].remote_face_index = remote_face_index++; + } + if (local_map.empty()) return; crb::HostVector host_mem(local_map.begin(), local_map.end()); @@ -160,4 +309,4 @@ CBCD_FLUDSCommonData::DeallocateDeviceMemory() device_cell_face_node_map_ = nullptr; } } -} // namespace opensn \ No newline at end of file +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h index 1d61b5201e..c77f56d975 100644 --- 
a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h @@ -6,17 +6,31 @@ #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_structs.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds_common_data.h" #include -#include +#include +#include namespace opensn { class SpatialDiscretization; -/// Common data for CBCD_FLUDS +/** + * Shared CBCD FLUDS metadata. + * + * Builds and owns the flattened indexing tables used by every CBCD FLUDS instance + * associated with one SPDS. The tables translate cell-face-node accesses into + * compact local, boundary, and non-local storage indices on both the host and device. + */ class CBCD_FLUDSCommonData : public FLUDSCommonData { public: + /** + * Construct the shared CBCD FLUDS metadata for one SPDS. + * + * \param spds Sweep plane data structure providing the CBC cell and face ordering. + * \param grid_nodal_mappings Per-cell face-node mappings from the spatial discretization. + * \param sdm Spatial discretization used to enumerate face nodes. + */ CBCD_FLUDSCommonData(const SPDS& spds, const std::vector& grid_nodal_mappings, const SpatialDiscretization& sdm); @@ -41,28 +55,70 @@ class CBCD_FLUDSCommonData : public FLUDSCommonData /// Get number of outgoing non-local faces. std::size_t GetNumOutgoingNonlocalFaces() const { return num_outgoing_nonlocal_faces_; } - /// Get incoming boundary node map. - const std::vector& GetIncomingBoundaryNodeMap() const + /// Return grouped incoming-boundary faces. + const std::vector& GetIncomingBoundaryFaces() const + { + return incoming_boundary_face_plans_; + } + + /// Return the number of grouped incoming non-local faces from one source locality slot. 
+ std::size_t GetNumIncomingFacesFromSource(const std::size_t source_slot) const + { + return source_to_incoming_face_offsets_[source_slot + 1] - + source_to_incoming_face_offsets_[source_slot]; + } + + /// Return outgoing-boundary nodes for one cell. + std::span GetOutgoingBoundaryNodes(std::uint64_t cell_local_id) const { - return incoming_boundary_node_map_; + const auto begin = cell_to_outgoing_boundary_node_offsets_[cell_local_id]; + const auto end = cell_to_outgoing_boundary_node_offsets_[cell_local_id + 1]; + return {outgoing_boundary_nodes_.data() + begin, end - begin}; } - /// Get outgoing boundary node map. - const std::map>& GetOutgoingBoundaryNodeMap() const + /// Return grouped outgoing non-local faces for one cell. + std::span + GetOutgoingNonlocalFaces(std::uint64_t cell_local_id) const { - return cell_to_outgoing_boundary_nodes_; + const auto begin = cell_to_outgoing_nonlocal_face_offsets_[cell_local_id]; + const auto end = cell_to_outgoing_nonlocal_face_offsets_[cell_local_id + 1]; + return {outgoing_nonlocal_faces_.data() + begin, end - begin}; } - /// Get incoming nonlocal node map. - const std::map>& GetIncomingNonlocalNodeMap() const + /// Return grouped incoming non-local faces for one cell. + std::span + GetIncomingNonlocalFaces(std::uint64_t cell_local_id) const { - return cell_to_incoming_nonlocal_nodes_; + const auto begin = cell_to_incoming_nonlocal_face_offsets_[cell_local_id]; + const auto end = cell_to_incoming_nonlocal_face_offsets_[cell_local_id + 1]; + return {incoming_nonlocal_faces_.data() + begin, end - begin}; } - /// Get outgoing nonlocal node map. - const std::map>& GetOutgoingNonlocalNodeMap() const + /// Return the number of local cells represented in the grouped-face tables. + std::size_t GetNumLocalCells() const + { + return cell_to_outgoing_nonlocal_face_offsets_.size() - 1; + } + + /// Return the ordered outgoing-locality table used to build communicator queue indices. 
+ const std::vector& GetOutgoingLocalities() const { return outgoing_localities_; } + + /// Return the ordered incoming source-locality table. + const std::vector& GetIncomingSourcePartitions() const + { + return incoming_source_partitions_; + } + + /// Resolve one grouped incoming non-local face by source-slot-local face index. + const GroupedIncomingNonlocalFace& GetIncomingNonlocalFace(std::uint32_t source_slot, + std::uint32_t source_face_index) const; + + /// Return the outgoing-node-copy descriptors for one grouped outgoing face. + std::span + GetOutgoingNodeCopies(const GroupedOutgoingNonlocalFace& face) const { - return cell_to_outgoing_nonlocal_nodes_; + return {outgoing_nonlocal_face_node_copies_.data() + face.node_copy_offset, + face.num_node_copies}; } /// Get pointer to cell-face-node map on device. @@ -83,22 +139,39 @@ class CBCD_FLUDSCommonData : public FLUDSCommonData size_t num_outgoing_nonlocal_nodes_; /// Device pointer to cell-face-node map for angular flux buffer access. std::uint64_t* device_cell_face_node_map_; - /// Map from incoming face boundary node to indexing metadata. - std::vector incoming_boundary_node_map_; - /// Map from cell to outgoing boundary nodes. - std::map> cell_to_outgoing_boundary_nodes_; - /// Map from cell to incoming nonlocal nodes. - std::map> cell_to_incoming_nonlocal_nodes_; - /// Map from cell to outgoing nonlocal nodes. - std::map> cell_to_outgoing_nonlocal_nodes_; + /// Flat grouped incoming-boundary face copy plans. + std::vector incoming_boundary_face_plans_; + /// Cell-to-outgoing-boundary-node offset table. + std::vector cell_to_outgoing_boundary_node_offsets_; + /// Flat outgoing-boundary node list. + std::vector outgoing_boundary_nodes_; + /// Cell-to-incoming-face offset table. + std::vector cell_to_incoming_nonlocal_face_offsets_; + /// Cell-to-outgoing-face offset table. + std::vector cell_to_outgoing_nonlocal_face_offsets_; + /// Flat grouped incoming nonlocal faces. 
+ std::vector incoming_nonlocal_faces_; + /// Flat grouped outgoing nonlocal faces. + std::vector outgoing_nonlocal_faces_; + /// Flat outgoing-node-copy metadata referenced by grouped outgoing faces. + std::vector outgoing_nonlocal_face_node_copies_; + /// Ordered table of distinct outgoing localities. + std::vector outgoing_localities_; + /// Ordered table of incoming source localities. + std::vector incoming_source_partitions_; + /// Source-major incoming grouped-face spans. + std::vector source_to_incoming_face_offsets_; + /// Source-major ordered incoming grouped-face indices. + std::vector incoming_face_indices_by_source_; /** - * Compute cell-face-node map for device angular flux buffer access, and - * create auxiliary indexing maps for boundary and non-local nodes for host access. + * Build and upload the flattened cell-face-node index map. + * + * \param sdm Spatial discretization used to enumerate face nodes. */ void CopyFlattenedNodeIndexToDevice(const SpatialDiscretization& sdm); /// Deallocate device memory for cell-face-node map. void DeallocateDeviceMemory(); }; -} // namespace opensn \ No newline at end of file +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_structs.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_structs.h index 1e959958ba..e406dccdd5 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_structs.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_structs.h @@ -4,16 +4,30 @@ #pragma once #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/fluds_structs.h" +#include +#include +#include namespace opensn { +class SweepBoundary; + /** - * Node index specific to CBCD FLUDS. + * Packed 64-bit angular flux buffer index for CBCD FLUDS. 
+ * + * Encodes the buffer type (local/boundary/non-local, incoming/outgoing) and + * address into a single 64-bit value. * Does not support delayed nodes. Reclaims the delayed bit for indices. - * - Bit 63: Incoming/outgoing bit. - * - Bit 62: Boundary bit. - * - Bit 61: Local bit. + * + * Bit layout: + * - Bit 63: incoming (0) / outgoing (1). + * - Bit 62: boundary (1) / non-boundary (0). + * - Bit 61: local (1) / non-local (0). + * - For local non-boundary nodes: + * - Bits 0-60: flat local-face-slot node bank index. + * - For boundary or non-local nodes: + * - Bits 0-60: flat bank index. * - Bits 0-60: Index bits (capacity ~2.3e18). */ class CBCD_NodeIndex : public NodeIndex @@ -162,24 +176,66 @@ struct CBCD_FLUDSPointerSet : public FLUDSPointerSet */ struct BoundaryNodeInfo { - std::uint64_t cell_local_id; - unsigned int face_id; - size_t face_node; - std::uint64_t storage_index; - std::uint64_t boundary_id; + std::uint64_t boundary_id = 0; + std::uint32_t cell_local_id = 0; + unsigned int face_id = 0; + std::uint32_t storage_index = 0; + std::uint16_t face_node = 0; }; -/** - * Metadata for non-local face nodes. - */ -struct NonlocalNodeInfo +/// Grouped incoming-boundary face copy plan. +struct IncomingBoundaryFacePlan +{ + std::uint64_t boundary_id = 0; + std::uint32_t cell_local_id = 0; + unsigned int face_id = 0; + std::uint16_t first_face_node = 0; + std::uint32_t base_storage_index = 0; + std::uint16_t num_nodes = 0; +}; + +/// Grouped incoming non-local face. +struct GroupedIncomingNonlocalFace +{ + std::uint32_t cell_local_id = 0; + std::uint32_t base_storage_index = 0; + std::uint32_t source_slot = 0; + std::uint16_t num_nodes = 0; +}; + +/// Outgoing node-copy descriptor. +struct OutgoingNodeCopy +{ + std::uint32_t storage_index = 0; + std::uint16_t face_node = 0; +}; + +/// Grouped outgoing non-local face. 
+struct GroupedOutgoingNonlocalFace +{ + std::uint32_t dest_slot = 0; + std::uint32_t remote_face_index = 0; + std::uint32_t node_copy_offset = 0; + std::uint16_t num_face_nodes = 0; + std::uint16_t num_node_copies = 0; +}; + +/// Reflecting-boundary face copy plan. +struct ReflectingBoundaryFacePlan +{ + SweepBoundary* boundary = nullptr; + std::uint32_t cell_local_id = 0; + unsigned int face_id = 0; + std::uint16_t first_face_node = 0; + std::size_t src_base_offset = 0; + std::uint16_t num_nodes = 0; +}; + +/// Outgoing node-copy plan entry. +struct OutgoingNodeMemcpy { - std::uint64_t cell_local_id; - std::uint64_t cell_global_id; - unsigned int face_id; - size_t face_node; - short face_node_mapped; - std::uint64_t storage_index; + std::size_t src_offset = 0; + std::size_t dest_offset = 0; }; -} // namespace opensn \ No newline at end of file +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cc b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cc index af76cb45ec..a082ea62dd 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cc +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cc @@ -29,7 +29,8 @@ SweepScheduler::SweepScheduler(SchedulingAlgorithm scheduler_type, InitializeAlgoDOG(); if (scheduler_type_ == SchedulingAlgorithm::ALL_AT_ONCE || - scheduler_type_ == SchedulingAlgorithm::DEPTH_OF_GRAPH) + scheduler_type_ == SchedulingAlgorithm::DEPTH_OF_GRAPH || + scheduler_type_ == SchedulingAlgorithm::ASYNC_FIFO) { angle_agg_.SetupAngleSetDependencies(); } @@ -39,6 +40,14 @@ SweepScheduler::SweepScheduler(SchedulingAlgorithm scheduler_type, pool_.Resize(angle_agg_.GetNumAngleSets()); execution_order_.reserve(angle_agg_.GetNumAngleSets()); } + else if (scheduler_type_ == SchedulingAlgorithm::ASYNC_FIFO) + { + const std::size_t 
hardware_concurrency = std::thread::hardware_concurrency(); + const std::size_t num_workers = std::max( + 1, + std::min(angle_agg_.GetNumAngleSets(), hardware_concurrency == 0 ? 1 : hardware_concurrency)); + pool_.Resize(num_workers); + } // Initialize delayed upstream data for (auto& angset : angle_agg_) diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cu index 861aeb305f..36ccade153 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cu +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.cu @@ -4,14 +4,10 @@ #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/sweep_scheduler.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/aahd_angle_set.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aahd_sweep_chunk.h" -#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h" -#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbc_async_comm.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.h" #include "caribou/main.hpp" #include "caliper/cali.h" -#include -#include namespace opensn { @@ -85,194 +81,52 @@ SweepScheduler::ScheduleAlgoAsyncFIFO(SweepChunk& sweep_chunk) CALI_CXX_MARK_SCOPE("SweepScheduler::ScheduleAlgoAsyncFIFO"); auto& cbcd_sweep_chunk = static_cast(sweep_chunk); - // Copy phi and source moments to device cbcd_sweep_chunk.GetProblem().CopyPhiAndSrcToDevice(); + cbcd_sweep_chunk.RefreshCachedKernelArgs(); auto& angle_sets = cbcd_sweep_chunk.GetAngleSets(); - auto& fluds_list = 
cbcd_sweep_chunk.GetFLUDS(); - auto& streams_list = cbcd_sweep_chunk.GetStreams(); - - const size_t num_angle_sets = angle_sets.size(); - std::vector executed(num_angle_sets, 0); - std::vector boundary_data_set(num_angle_sets, 0); - std::vector kernel_in_flight(num_angle_sets, 0); - std::vector> ready_queues(num_angle_sets); - std::vector num_completed_tasks(num_angle_sets, 0); - std::vector> ready_tasks(num_angle_sets); - std::vector> ready_cell_ids(num_angle_sets); - std::vector> in_flight_tasks(num_angle_sets); - std::vector> in_flight_cell_ids(num_angle_sets); - + const auto num_angle_sets = angle_sets.size(); for (auto* angle_set : angle_sets) - { - auto& current_task_list = angle_set->GetCurrentTaskList(); - if (current_task_list.empty()) - current_task_list = static_cast(angle_set->GetSPDS()).GetTaskList(); - } + angle_set->ResetDependencyCounter(); - size_t executed_anglesets = 0; - while (executed_anglesets < num_angle_sets) - { - bool any_work_done = false; + cbcd_sweep_chunk.StartCommunicator(); - // Poll completed kernels - for (size_t i = 0; i < num_angle_sets; ++i) + const auto num_workers = pool_.GetSize(); + pool_.ExecuteBatch( + [num_workers, num_angle_sets, &angle_sets, &cbcd_sweep_chunk](std::size_t worker_id) { - if (not kernel_in_flight[i]) - continue; - // Check if the kernel is done - if (streams_list[i].is_completed()) + const auto chunk_size = (num_angle_sets + num_workers - 1) / num_workers; + const auto begin = worker_id * chunk_size; + const auto end = std::min(begin + chunk_size, num_angle_sets); + + bool all_done = false; + while (not all_done) { - // Copy back outgoing (reflecting) boundary and non-local psi - fluds_list[i]->CopyOutgoingPsiBackToHost( - cbcd_sweep_chunk, angle_sets[i], in_flight_cell_ids[i]); - // Update task dependencies - auto& current_task_list = angle_sets[i]->GetCurrentTaskList(); - for (auto* task : in_flight_tasks[i]) + all_done = true; + bool any_work_done = false; + for (std::size_t i = begin; i < end; ++i) { 
- for (uint64_t succ : task->successors) + auto* angle_set = angle_sets[i]; + if (angle_set->IsExecuted()) + continue; + all_done = false; + if (not angle_set->IsInitialized()) { - --current_task_list[succ].num_dependencies; - if (current_task_list[succ].num_dependencies == 0 and boundary_data_set[i]) - ready_queues[i].push_back(¤t_task_list[succ]); + any_work_done |= angle_set->TryInitialize(cbcd_sweep_chunk); + continue; } - task->completed = true; - } - num_completed_tasks[i] += in_flight_tasks[i].size(); - // Send MPI data - auto* comm = static_cast(angle_sets[i]->GetCommunicator()); - comm->SendData(); - in_flight_tasks[i].clear(); - in_flight_cell_ids[i].clear(); - kernel_in_flight[i] = false; - any_work_done = true; - } - } - - // Receive and send MPI data - for (size_t i = 0; i < num_angle_sets; ++i) - { - if (executed[i]) - continue; - auto* comm = static_cast(angle_sets[i]->GetCommunicator()); - auto& current_task_list = angle_sets[i]->GetCurrentTaskList(); - auto received = comm->ReceiveData(); - if (not received.empty()) - { - for (uint64_t t : received) - { - --current_task_list[t].num_dependencies; - if (current_task_list[t].num_dependencies == 0 and boundary_data_set[i]) - ready_queues[i].push_back(¤t_task_list[t]); - } - any_work_done = true; - } - comm->SendData(); - } - - // Set boundary data - for (size_t i = 0; i < num_angle_sets; ++i) - { - if (executed[i] or boundary_data_set[i] or kernel_in_flight[i]) - continue; - auto* as = angle_sets[i]; - bool boundaries_ready = true; - for (auto& [bid, boundary] : as->GetBoundaries()) - { - if (not boundary->CheckAnglesReadyStatus(as->GetAngleIndices())) - { - boundaries_ready = false; - break; - } - } - if (boundaries_ready) - { - fluds_list[i]->CopyIncomingBoundaryPsiToDevice(cbcd_sweep_chunk, angle_sets[i]); - boundary_data_set[i] = true; - any_work_done = true; - - auto& current_task_list = angle_sets[i]->GetCurrentTaskList(); - for (auto& task : current_task_list) - { - if (task.num_dependencies == 
0 and not task.completed) - ready_queues[i].push_back(&task); + any_work_done |= angle_set->TryAdvanceOneStep(cbcd_sweep_chunk); } + if ((not all_done) and (not any_work_done)) + std::this_thread::yield(); } - } - - // Collect ready tasks and launch kernels (only if task dependencies changed) - if (any_work_done) - { - for (size_t i = 0; i < num_angle_sets; ++i) - { - if (executed[i] or (not boundary_data_set[i]) or kernel_in_flight[i]) - continue; - - if (ready_queues[i].empty()) - continue; - - ready_tasks[i] = std::move(ready_queues[i]); - ready_queues[i].clear(); - - ready_cell_ids[i].clear(); - for (auto* task : ready_tasks[i]) - ready_cell_ids[i].push_back(task->reference_id); - - fluds_list[i]->CopyIncomingNonlocalPsiToDevice(angle_sets[i], ready_cell_ids[i]); - cbcd_sweep_chunk.Sweep(ready_cell_ids[i], i); - in_flight_tasks[i] = std::move(ready_tasks[i]); - in_flight_cell_ids[i] = std::move(ready_cell_ids[i]); - kernel_in_flight[i] = true; - } - } + }); - // Check angleset completion - for (size_t i = 0; i < num_angle_sets; ++i) - { - if (executed[i] or (not boundary_data_set[i]) or kernel_in_flight[i]) - continue; - auto& current_task_list = angle_sets[i]->GetCurrentTaskList(); - auto* comm = static_cast(angle_sets[i]->GetCommunicator()); - bool all_done = (num_completed_tasks[i] == current_task_list.size()); - if (all_done and comm->SendData()) - { - for (auto& [bid, boundary] : angle_sets[i]->GetBoundaries()) - boundary->UpdateAnglesReadyStatus(angle_sets[i]->GetAngleIndices()); - executed[i] = true; - ++executed_anglesets; - fluds_list[i]->CopySavedPsiFromDevice(); - auto* fluds = fluds_list[i]; - auto* as = angle_sets[i]; - // Cast away constness to add a callback - streams_list[i].add_callback( - [fluds, &cbcd_sweep_chunk, as]() - { fluds->CopySavedPsiToDestinationPsi(cbcd_sweep_chunk, as); }); - } - } - } + cbcd_sweep_chunk.StopCommunicator(); - /// Copy phi and outflow data back to host cbcd_sweep_chunk.GetProblem().CopyPhiAndOutflowBackToHost(); - 
// Receive delayed data - opensn::mpi_comm.barrier(); - bool received_delayed_data = false; - while (not received_delayed_data) - { - received_delayed_data = true; - - for (auto& angle_set : angle_sets) - { - if (angle_set->FlushSendBuffers() == AngleSetStatus::MESSAGES_PENDING) - received_delayed_data = false; - - if (not angle_set->ReceiveDelayedData()) - received_delayed_data = false; - } - } - - // Reset all - for (auto& angle_set : angle_sets) + for (auto* angle_set : angle_sets) angle_set->ResetSweepBuffers(); for (const auto& [bid, bndry] : angle_agg_.GetSimBoundaries()) diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h index 4504375d9f..71abd6eb1c 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_kernels.h @@ -71,7 +71,7 @@ struct CBCSweepData /// Number of nodes on the current cell. size_t cell_num_nodes; - /// Number of energy groups in the groupset. + /// Number of groups in the groupset. size_t gs_size; /// First group index in the groupset. unsigned int gs_gi; @@ -481,7 +481,7 @@ CBC_Sweep_Generic(CBCSweepData& data, CBCGenericSweepScratch& scratch, AngleSet& * * Specialized in cbc_avx_sweep_chunk.cc for compile-time-known node counts * (4, 8, etc.), enabling stack-allocated matrices, loop unrolling, and SIMD - * batch Gauss elimination across multiple energy groups simultaneously. + * batch Gauss elimination across multiple groups simultaneously. 
* * \tparam NumNodes compile-time number of cell nodes * \tparam time_dependent if true, include the time-derivative source term diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.cu b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.cu index 4bd0a8bb16..316a0ea323 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.cu +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.cu @@ -8,6 +8,8 @@ #include "modules/linear_boltzmann_solvers/lbs_problem/device/carrier/mesh_carrier.h" #include "caliper/cali.h" #include +#include +#include namespace opensn { @@ -27,54 +29,168 @@ CBCDSweepChunk::CBCDSweepChunk(DiscreteOrdinatesProblem& problem, LBSGroupset& g problem.GetMinCellDOFCount()), problem_(problem) { + std::vector fluds_list; for (auto& as : *(groupset.angle_agg)) { auto* angle_set = static_cast(as.get()); auto* fluds = static_cast(&(angle_set->GetFLUDS())); angle_sets_.push_back(angle_set); - fluds_list_.push_back(fluds); - streams_list_.push_back(angle_set->GetStream()); + fluds_list.push_back(fluds); + gpu_kernel::Arguments args(problem_, groupset_, *angle_set, *fluds); - kernel_args_list_.push_back(args); - unsigned int stride_size = + const auto stride_size = gpu_kernel::RoundUp(static_cast(args.flud_data.stride_size)); - unsigned int block_size_x = std::min(stride_size, gpu_kernel::threshold); - unsigned int block_size_y = gpu_kernel::threshold / block_size_x; - unsigned int grid_size_x = (stride_size + gpu_kernel::threshold - 1) / gpu_kernel::threshold; - block_sizes_.push_back(crb::Dim3(block_size_x, block_size_y)); - grid_size_x_list_.push_back(grid_size_x); + const auto block_size_x = std::min(stride_size, gpu_kernel::threshold); + const auto block_size_y = gpu_kernel::threshold / block_size_x; + const auto grid_size_x = (stride_size + gpu_kernel::threshold - 1) / 
gpu_kernel::threshold; + cached_params_.push_back({args, + crb::Dim3(block_size_x, block_size_y), + grid_size_x, + fluds, + fluds->GetSavedAngularFluxDevicePointer()}); + } + + if (not angle_sets_.empty()) + { + std::vector> incoming_source_partitions_by_angle_set; + incoming_source_partitions_by_angle_set.reserve(angle_sets_.size()); + std::unordered_map> source_as_section_bytes; + std::vector capacities(angle_sets_.size()); + for (std::size_t as_ss_idx = 0; as_ss_idx < angle_sets_.size(); ++as_ss_idx) + { + const auto stride = fluds_list[as_ss_idx]->GetStrideSize(); + const auto& common_data = fluds_list[as_ss_idx]->GetCommonData(); + incoming_source_partitions_by_angle_set.push_back(common_data.GetIncomingSourcePartitions()); + capacities[as_ss_idx].outgoing_faces = common_data.GetNumOutgoingNonlocalFaces(); + capacities[as_ss_idx].incoming_faces = common_data.GetNumIncomingNonlocalFaces(); + for (std::size_t cell_local_id = 0; cell_local_id < common_data.GetNumLocalCells(); + ++cell_local_id) + { + for (const auto& face_info : common_data.GetOutgoingNonlocalFaces(cell_local_id)) + { + capacities[as_ss_idx].max_outgoing_face_values = + std::max(capacities[as_ss_idx].max_outgoing_face_values, + static_cast(face_info.num_face_nodes) * stride); + } + } + + std::unordered_map incoming_entries_by_source_slot; + std::unordered_map incoming_values_by_source_slot; + for (std::size_t cell_local_id = 0; cell_local_id < common_data.GetNumLocalCells(); + ++cell_local_id) + { + for (const auto& face_info : common_data.GetIncomingNonlocalFaces(cell_local_id)) + { + if (face_info.num_nodes == 0) + continue; + ++incoming_entries_by_source_slot[face_info.source_slot]; + incoming_values_by_source_slot[face_info.source_slot] += + static_cast(face_info.num_nodes) * stride; + const auto source_partition = + common_data.GetIncomingSourcePartitions()[face_info.source_slot]; + auto& per_as_bytes = source_as_section_bytes[source_partition]; + if (per_as_bytes.empty()) + 
per_as_bytes.assign(angle_sets_.size(), 0); + per_as_bytes[as_ss_idx] += + sizeof(std::uint32_t) + sizeof(std::size_t) + + static_cast(face_info.num_nodes) * stride * sizeof(double); + } + } + for (const auto& [_, count] : incoming_entries_by_source_slot) + capacities[as_ss_idx].max_incoming_batch_entries = + std::max(capacities[as_ss_idx].max_incoming_batch_entries, count); + for (const auto& [_, values] : incoming_values_by_source_slot) + capacities[as_ss_idx].max_incoming_batch_values = + std::max(capacities[as_ss_idx].max_incoming_batch_values, values); + } + + std::size_t max_message_bytes = 0; + for (const auto& [_, per_as_bytes] : source_as_section_bytes) + { + std::size_t msg_size_in_bytes = sizeof(std::size_t); + for (const auto& section_bytes : per_as_bytes) + { + if (section_bytes == 0) + continue; + msg_size_in_bytes += 2 * sizeof(std::size_t) + section_bytes; + } + max_message_bytes = std::max(max_message_bytes, msg_size_in_bytes); + } + + std::vector base_angle_sets(angle_sets_.begin(), angle_sets_.end()); + async_comm_ = + std::make_unique(base_angle_sets, + angle_sets_.front()->GetCommunicatorSet(), + incoming_source_partitions_by_angle_set, + max_message_bytes, + capacities); + for (auto* angle_set : angle_sets_) + angle_set->SetCommunicator(*async_comm_); } } +CBCDSweepChunk::~CBCDSweepChunk() +{ + StopCommunicator(); +} + +void +CBCDSweepChunk::StartCommunicator() +{ + if (async_comm_) + async_comm_->Start(); +} + void -CBCDSweepChunk::Sweep(const std::vector& cell_local_ids, size_t angle_set_id) +CBCDSweepChunk::StopCommunicator() +{ + if (async_comm_) + async_comm_->Stop(); +} + +void +CBCDSweepChunk::RefreshCachedKernelArgs() +{ + CALI_CXX_MARK_SCOPE("CBCDSweepChunk::RefreshCachedKernelArgs"); + + for (std::size_t angle_set_id = 0; angle_set_id < angle_sets_.size(); ++angle_set_id) + { + auto& ck = cached_params_[angle_set_id]; + { + CALI_CXX_MARK_SCOPE("CBCDSweepChunk::Sweep::ArgsRefresh"); + ck.args = gpu_kernel::Arguments( + problem_, 
groupset_, *angle_sets_[angle_set_id], *ck.fluds); + ck.device_saved_psi = ck.fluds->GetSavedAngularFluxDevicePointer(); + } + } +} + +void +CBCDSweepChunk::Sweep(std::uint32_t num_ready_cells, + std::size_t angle_set_id, + const std::uint32_t* local_cell_ids) { CALI_CXX_MARK_SCOPE("CBCDSweepChunk::Sweep"); - auto* fluds = fluds_list_[angle_set_id]; - auto* device_saved_psi = fluds->GetSavedAngularFluxDevicePointer(); - auto& stream = streams_list_[angle_set_id]; - auto& host_cell_local_ids = fluds->GetLocalCellIDs(); - std::copy(cell_local_ids.begin(), cell_local_ids.end(), host_cell_local_ids.begin()); - const auto& args = kernel_args_list_[angle_set_id]; - crb::Dim3 block_size = block_sizes_[angle_set_id]; - unsigned int num_ready_cells = static_cast(cell_local_ids.size()); - unsigned int grid_size_x = grid_size_x_list_[angle_set_id]; - unsigned int grid_size_y = (num_ready_cells + block_size.y - 1) / block_size.y; - crb::Dim3 grid_size(grid_size_x, grid_size_y); - auto* host_cell_local_ids_data = host_cell_local_ids.data(); + auto& ck = cached_params_[angle_set_id]; + auto& stream = angle_sets_[angle_set_id]->GetStream(); + const auto grid_size_y = (num_ready_cells + ck.block_size.y - 1) / ck.block_size.y; + crb::Dim3 grid_size(ck.grid_size_x, grid_size_y); + { + CALI_CXX_MARK_SCOPE("CBCDSweepChunk::Sweep::KernelLaunch"); #if defined(__NVCC__) || defined(__HIPCC__) - gpu_kernel::SweepKernel<<>>( - args, host_cell_local_ids_data, num_ready_cells, device_saved_psi); + gpu_kernel::SweepKernel<<>>( + ck.args, local_cell_ids, num_ready_cells, ck.device_saved_psi); #elif defined(SYCL_LANGUAGE_VERSION) && defined(__INTEL_LLVM_COMPILER) - stream.synchronize(); - stream.parallel_for(sycl::nd_range<3>(grid_size * block_size, block_size), - [=](sycl::nd_item<3> work_index) - { - gpu_kernel::SweepKernel( - args, host_cell_local_ids_data, num_ready_cells, device_saved_psi); - }); + stream.synchronize(); + stream.parallel_for(sycl::nd_range<3>(grid_size * ck.block_size, 
ck.block_size), + [=](sycl::nd_item<3> work_index) + { + gpu_kernel::SweepKernel( + ck.args, local_cell_ids, num_ready_cells, ck.device_saved_psi); + }); #endif + } } -} // namespace opensn \ No newline at end of file +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h index e1194b26a5..6950096f1f 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbcd_sweep_chunk.h @@ -4,6 +4,7 @@ #pragma once #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbcd_angle_set.h" +#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/communicators/cbcd_async_comm.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/sweep_chunk.h" #include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/discrete_ordinates_problem.h" @@ -15,40 +16,88 @@ namespace crb = caribou; namespace opensn { -/// CBC sweep chunk for device. +/** + * CBCD sweep chunk. + * + * Owns the shared CBCD communicator for one groupset, caches per-angle-set kernel + * launch parameters, and coordinates the transfer boundaries between the device sweep + * kernels and the host-side CBCD scheduler. + */ class CBCDSweepChunk : public SweepChunk { public: + /** + * Construct the CBCD sweep chunk for one groupset. + * + * \param problem Discrete ordinates problem owning the sweep state. + * \param groupset Groupset served by this sweep chunk. + */ CBCDSweepChunk(DiscreteOrdinatesProblem& problem, LBSGroupset& groupset); + ~CBCDSweepChunk() override; + + /// Return the discrete ordinates problem owning this sweep chunk. 
DiscreteOrdinatesProblem& GetProblem() const { return problem_; } + /// Return the groupset served by this sweep chunk. const LBSGroupset& GetGroupset() const { return groupset_; } + /// Return the first group index of the groupset. unsigned int GetGroupsetGroupIndex() const { return groupset_.first_group; } + /// Return the cell transport view for one local cell. const CellLBSView& GetCellTransportView(std::uint64_t cell_local_id) const { return cell_transport_views_[cell_local_id]; } + /// Return the CBCD angle sets coordinated by this sweep chunk. const std::vector& GetAngleSets() const { return angle_sets_; } - const std::vector& GetFLUDS() const { return fluds_list_; } + /// Start the aggregated communicator thread. + void StartCommunicator(); + + /// Stop the aggregated communicator thread. + void StopCommunicator(); - std::vector& GetStreams() { return streams_list_; } + /// Refresh cached kernel arguments once at the start of a sweep. + void RefreshCachedKernelArgs(); using SweepChunk::Sweep; - void Sweep(const std::vector& cell_local_ids, size_t angle_set_id); + /** + * Launch the CBC sweep kernel for one angle set. + * + * \param num_ready_cells Number of local cells in the batch. + * \param angle_set_id Index of the angle set whose kernel is launched. + * \param local_cell_ids Pointer to the mapped host cell-ID buffer for the batch. + */ + void Sweep(std::uint32_t num_ready_cells, + std::size_t angle_set_id, + const std::uint32_t* local_cell_ids); private: + /// Cached launch data for one angle set. + struct CachedKernelParams + { + /// Packed kernel arguments. + gpu_kernel::Arguments args; + /// Device block size for the launch. + crb::Dim3 block_size; + /// Device grid size in x. + unsigned int grid_size_x; + /// FLUDS instance bound to the angle set. + CBCD_FLUDS* fluds; + /// Device pointer to saved angular fluxes. + double* device_saved_psi; + }; + /// Non-owning reference to the discrete ordinates problem.
DiscreteOrdinatesProblem& problem_; + /// Aggregated communicator owned by this sweep chunk. + std::unique_ptr async_comm_; + /// Anglesets managed by this sweep chunk. std::vector angle_sets_; - std::vector fluds_list_; - std::vector streams_list_; - std::vector> kernel_args_list_; - std::vector block_sizes_; - std::vector grid_size_x_list_; + /// Per-angleset cached kernel launch params. + std::vector cached_params_; }; -} // namespace opensn \ No newline at end of file +} // namespace opensn diff --git a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/gpu_kernel/solver.h b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/gpu_kernel/solver.h index e5894e2e00..2b912d8009 100644 --- a/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/gpu_kernel/solver.h +++ b/modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/gpu_kernel/solver.h @@ -26,9 +26,9 @@ ComputeGMS(double* sweep_matrix, const std::uint32_t& num_moments, const Arguments& args) { - // get sigmaT + // Get sigmaT double sigma_t = cell.total_xs[args.groupset_start + group_idx]; - // compute source term + // Compute source term const double* src_moment = args.src_moment + cell.phi_address + args.groupset_start + group_idx; _Pragma("unroll") for (std::uint32_t i = 0; i < ndofs; ++i) { @@ -40,7 +40,7 @@ ComputeGMS(double* sweep_matrix, } s[i] = src_per_moment; } - // add source, transfer and mass contribution + // Add source, transfer and mass contribution double* A = sweep_matrix; const std::array* GM_data = reinterpret_cast*>(cell.GM_data); @@ -49,10 +49,10 @@ ComputeGMS(double* sweep_matrix, _Pragma("unroll") for (std::uint32_t j = 0; j < ndofs; ++j) { std::array GM = *(GM_data++); - // compute A += G * Omega + M * sigma_t + // Compute A += G * Omega + M * sigma_t A[j] += direction.omega[0] * GM[0] + direction.omega[1] * GM[1] + direction.omega[2] * GM[2] + sigma_t * GM[3]; - // compute psi += M @ s + // Compute psi += 
M @ s psi[i] += GM[3] * s[j]; } A += ndofs; @@ -70,14 +70,14 @@ ComputeSurfaceIntegral(double* sweep_matrix, const unsigned int& angle_group_idx, const Arguments& args) { - // loop over each face + // Loop over each face std::uint32_t face_node_counter = 0; for (std::uint32_t f = 0; f < cell.num_faces; ++f) { - // get face view + // Get face view FaceView face; cell.GetFaceView(face, f); - // determine if this face is incoming + // Determine if this face is incoming NodeIndexType idx(cell_edge_data[face_node_counter]); if (idx.IsUndefined() || idx.IsOutgoing()) { @@ -86,7 +86,7 @@ ComputeSurfaceIntegral(double* sweep_matrix, } double mu = direction.omega[0] * face.normal[0] + direction.omega[1] * face.normal[1] + direction.omega[2] * face.normal[2]; - // compute surface integral + // Compute surface integral for (std::uint32_t fi = 0; fi < face.num_face_nodes; ++fi) { std::uint32_t i = face.cell_mapping_data[fi]; @@ -101,7 +101,7 @@ ComputeSurfaceIntegral(double* sweep_matrix, psi[i] += upwind_psi[angle_group_idx] * mu_Nij; } } - // update face node counter + // Update face node counter face_node_counter += face.num_face_nodes; } } @@ -111,18 +111,18 @@ template __CRB_DEVICE_FUNC__ void GaussianElimination(double* sweep_matrix, double* psi) { - // forward elimination + // Forward elimination double* A_i = sweep_matrix; _Pragma("unroll") for (std::uint32_t i = 0; i < ndofs; ++i) { double inv_diag = 1.0 / A_i[i]; - // normalize the pivot row + // Normalize the pivot row _Pragma("unroll") for (std::uint32_t j = i; j < ndofs; ++j) { A_i[j] *= inv_diag; } psi[i] *= inv_diag; - // eliminate rows below + // Eliminate rows below double* A_k = A_i + ndofs; _Pragma("unroll") for (std::uint32_t k = i + 1; k < ndofs; ++k) { @@ -136,7 +136,7 @@ GaussianElimination(double* sweep_matrix, double* psi) } A_i += ndofs; } - // back substitution — row-wise access + // Back substitution — row-wise access if constexpr (ndofs >= 2) { _Pragma("unroll") for (std::int32_t j = ndofs - 2; j 
>= 0; --j) @@ -161,14 +161,14 @@ WritePsiToFludsAndOutflow(double* psi, const unsigned int& group_idx, const Arguments& args) { - // loop over each face + // Loop over each face std::uint32_t face_node_counter = 0; for (std::uint32_t f = 0; f < cell.num_faces; ++f) { - // get face view + // Get face view FaceView face; cell.GetFaceView(face, f); - // determine if this face is outgoing + // Determine if this face is outgoing NodeIndexType idx(cell_edge_data[face_node_counter]); if (idx.IsUndefined() || !idx.IsOutgoing()) { @@ -177,15 +177,15 @@ WritePsiToFludsAndOutflow(double* psi, } double mu = direction.omega[0] * face.normal[0] + direction.omega[1] * face.normal[1] + direction.omega[2] * face.normal[2]; - // loop over each face node + // Loop over each face node for (std::uint32_t fi = 0; fi < face.num_face_nodes; ++fi) { std::uint32_t i = face.cell_mapping_data[fi]; - // put copy psi to FLUDS + // Copy psi to FLUDS double* downwind_psi = args.flud_data.GetOutgoingFluxPointer(cell_edge_data[face_node_counter + fi]); downwind_psi[angle_group_idx] = psi[i]; - // compute ouflow for boundary face + // Compute outflow for boundary face if (face.outflow != nullptr) { double outflow = direction.weight * mu * face.IntS_shapeI_data[fi] * psi[i]; @@ -246,16 +246,16 @@ Sweep(const Arguments& args, const std::uint32_t& num_moments, double* saved_psi) { - // initialize buffer + // Initialize buffer Buffer buffer; - // prepare linear system to solve + // Prepare linear system to solve ComputeGMS( buffer.A(), buffer.b(), buffer.s(), cell, direction, group_idx, num_moments, args); ComputeSurfaceIntegral( buffer.A(), buffer.b(), cell, direction, cell_edge_data, angle_group_idx, args); - // solve for the angular flux + // Solve for the angular flux GaussianElimination(buffer.A(), buffer.b()); - // save the result + // Save the result WritePsiToFludsAndOutflow( buffer.b(), cell, direction, cell_edge_data, angle_group_idx, group_idx, args); ComputePhi(buffer.b(), cell,
direction, group_idx, num_moments, args);