Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/boundary/isotropic_boundary.h"
#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/boundary/arbitrary_boundary.h"
#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbc_fluds.h"
#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/fluds/cbcd_fluds_common_data.h"
#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/angle_set/cbc_angle_set.h"
#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/cbc.h"
#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/spds/aah.h"
Expand All @@ -16,6 +17,8 @@
#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_chunk.h"
#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/aah_sweep_chunk_td.h"
#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk.h"
#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep_chunks/cbc_sweep_chunk_td.h"
#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/sweep/scheduler/spmd_threadpool.h"
#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/iterative_methods/sweep_wgs_context.h"
#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/io/discrete_ordinates_problem_io.h"
#include "modules/linear_boltzmann_solvers/discrete_ordinates_problem/iterative_methods/ags_linear_solver.h"
Expand All @@ -42,11 +45,14 @@
#include "framework/runtime.h"
#include "caliper/cali.h"
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cmath>
#include <iomanip>
#include <numeric>
#include <sstream>
#include <stdexcept>
#include <thread>

namespace opensn
{
Expand Down Expand Up @@ -1401,6 +1407,7 @@ DiscreteOrdinatesProblem::InitializeSweepDataStructures()
}
else if (sweep_type_ == "CBC")
{
std::vector<std::shared_ptr<CBC_SPDS>> cbc_spds_list;
// Build SPDS
for (const auto& [quadrature, info] : quadrature_unq_so_grouping_map_)
{
Expand All @@ -1415,8 +1422,72 @@ DiscreteOrdinatesProblem::InitializeSweepDataStructures()
const auto new_swp_order =
std::make_shared<CBC_SPDS>(omega, this->grid_, quadrature_allow_cycles_map_[quadrature]);
quadrature_spds_map_[quadrature].push_back(new_swp_order);
cbc_spds_list.push_back(new_swp_order);
}
}

if (cbc_spds_list.size() == 1)
{
auto start_time = std::chrono::steady_clock::now();
cbc_spds_list.front()->ComputeMaxNumLocalPsiSlots();
auto end_time = std::chrono::steady_clock::now();
std::chrono::duration<double> elapsed_seconds = end_time - start_time;

const auto local_face_slots = cbc_spds_list.front()->GetMaxNumLocalPsiSlots();
log.Log() << "CBC SPDS cell-face slot plan calculated in " << elapsed_seconds.count()
<< " s with 1 thread.\n"
<< " (max, min, avg) = (" << local_face_slots << ", " << local_face_slots << ", "
<< static_cast<double>(local_face_slots) << ").\n";
}
else if (not cbc_spds_list.empty())
{
const auto hardware_threads = std::max<std::size_t>(1, std::thread::hardware_concurrency());
const auto num_workers = std::min(cbc_spds_list.size(), hardware_threads);

SPMD_ThreadPool pool(num_workers);
std::atomic<std::size_t> next_index{0};

log.Log() << "Computing cell-face slot plans for " << cbc_spds_list.size()
<< " CBC SPDS with " << num_workers << " threads.\n";

auto start_time = std::chrono::steady_clock::now();
pool.ExecuteBatch(
[&](std::size_t /* thread ID */)
{
std::size_t index = 0;
// Atomically fetch the next index to work on
// std::memory_order_relaxed is sufficient here because we need atomicity only for the
// fetch_add operation, and there are no other synchronization requirements between
// threads for calculating max num local psi slots.
while ((index = next_index.fetch_add(1, std::memory_order_relaxed)) <
cbc_spds_list.size())
{
cbc_spds_list[index]->ComputeMaxNumLocalPsiSlots();
}
});
auto end_time = std::chrono::steady_clock::now();
std::chrono::duration<double> elapsed_seconds = end_time - start_time;

size_t max_local_psi_slots = 0;
size_t min_local_psi_slots = std::numeric_limits<size_t>::max();
std::uint64_t total_local_psi_slots = 0;

for (const auto& spds : cbc_spds_list)
{
const auto local_psi_slots = spds->GetMaxNumLocalPsiSlots();
max_local_psi_slots = std::max(max_local_psi_slots, local_psi_slots);
min_local_psi_slots = std::min(min_local_psi_slots, local_psi_slots);
total_local_psi_slots += local_psi_slots;
}

const double avg_local_psi_slots =
static_cast<double>(total_local_psi_slots) / static_cast<double>(cbc_spds_list.size());

log.Log() << "CBC SPDS cell-face slot plans calculated in " << elapsed_seconds.count()
<< " s.\n"
<< " (avg, max, min) = (" << avg_local_psi_slots << " slots, "
<< max_local_psi_slots << " slots, " << min_local_psi_slots << " slots).";
}
}
else
OpenSnInvalidArgument("Unsupported sweep type \"" + sweep_type_ + "\"");
Expand Down Expand Up @@ -1753,6 +1824,12 @@ DiscreteOrdinatesProblem::InitFluxDataStructures(LBSGroupset& groupset)
groupset.angle_agg =
std::make_shared<AngleAggregation>(sweep_boundaries_, groupset.quadrature, grid_);

std::vector<std::size_t> cbc_fluds_local_psi_bytes;
std::vector<std::size_t> cbc_fluds_boundary_nonlocal_bytes;
const auto num_local_spatial_dofs = discretization_->GetNumLocalDOFs(groupset.psi_uk_man_) /
groupset.psi_uk_man_.GetNumberOfUnknowns() / gs_num_grps;
std::uint64_t full_local_psi_storage_bytes = 0;

size_t angle_set_id = 0;
for (const auto& so_grouping : unique_so_groupings)
{
Expand Down Expand Up @@ -1821,25 +1898,72 @@ DiscreteOrdinatesProblem::InitFluxDataStructures(LBSGroupset& groupset)
else if (sweep_type_ == "CBC")
{
std::shared_ptr<FLUDS> fluds;
std::size_t boundary_nonlocal_bytes = 0;
if (use_gpus_)
{
const auto& cbcd_common_data =
dynamic_cast<const CBCD_FLUDSCommonData&>(fluds_common_data);
fluds = CreateCBCD_FLUDS(gs_num_grps,
angle_indices.size(),
grid_->local_cells.size(),
fluds_common_data,
groupset.psi_uk_man_,
*discretization_,
(not GetPsiNewLocal()[groupset.id].empty()));

const auto num_groups_and_angles = gs_num_grps * angle_indices.size();
boundary_nonlocal_bytes = (cbcd_common_data.GetNumIncomingBoundaryNodes() +
cbcd_common_data.GetNumOutgoingBoundaryNodes() +
cbcd_common_data.GetNumIncomingNonlocalNodes() +
cbcd_common_data.GetNumOutgoingNonlocalNodes()) *
num_groups_and_angles * sizeof(double);
}
else
{
fluds =
std::make_shared<CBC_FLUDS>(gs_num_grps,
angle_indices.size(),
dynamic_cast<const CBC_FLUDSCommonData&>(fluds_common_data),
groupset.psi_uk_man_,
*discretization_);
fluds = std::make_shared<CBC_FLUDS>(
gs_num_grps,
angle_indices.size(),
dynamic_cast<const CBC_FLUDSCommonData&>(fluds_common_data));

const auto& cbc_common_data = dynamic_cast<const CBC_FLUDSCommonData&>(fluds_common_data);
const auto num_groups_and_angles = gs_num_grps * angle_indices.size();
constexpr std::size_t local_psi_alignment = 64;
constexpr std::size_t doubles_per_cache_line = local_psi_alignment / sizeof(double);
const auto round_up_to_cache_line_multiple = [](std::size_t value)
{
return ((value + doubles_per_cache_line - 1) / doubles_per_cache_line) *
doubles_per_cache_line;
};

for (std::size_t face_storage_index = 0;
face_storage_index < cbc_common_data.GetNumCellFaces();
++face_storage_index)
{
const auto& face_info =
cbc_common_data.GetIncomingNonlocalFaceInfoByStorageIndex(face_storage_index);
if (face_info.num_face_nodes == 0)
continue;
boundary_nonlocal_bytes +=
round_up_to_cache_line_multiple(static_cast<std::size_t>(face_info.num_face_nodes) *
num_groups_and_angles) *
sizeof(double);
}
}

if (use_gpus_)
{
const auto& cbc_spds = dynamic_cast<const CBC_SPDS&>(fluds_common_data.GetSPDS());
cbc_fluds_local_psi_bytes.push_back(cbc_spds.GetMaxNumLocalPsiSlots() *
cbc_spds.GetMaxLocalFaceNodeCount() * gs_num_grps *
angle_indices.size() * sizeof(double));
}
else
cbc_fluds_local_psi_bytes.push_back(
dynamic_cast<const CBC_FLUDS&>(*fluds).GetLocalPsiBufferSize());
cbc_fluds_boundary_nonlocal_bytes.push_back(boundary_nonlocal_bytes);

full_local_psi_storage_bytes +=
num_local_spatial_dofs * gs_num_grps * angle_indices.size() * sizeof(double);

std::shared_ptr<AngleSet> angle_set;
if (use_gpus_)
Expand Down Expand Up @@ -1870,6 +1994,61 @@ DiscreteOrdinatesProblem::InitFluxDataStructures(LBSGroupset& groupset)
} // for an_ss
} // for so_grouping

if (sweep_type_ == "CBC" and not cbc_fluds_local_psi_bytes.empty())
{
const auto [min_it, max_it] =
std::minmax_element(cbc_fluds_local_psi_bytes.begin(), cbc_fluds_local_psi_bytes.end());
const auto [boundary_nonlocal_min_it, boundary_nonlocal_max_it] = std::minmax_element(
cbc_fluds_boundary_nonlocal_bytes.begin(), cbc_fluds_boundary_nonlocal_bytes.end());
const auto total_local_psi_storage = std::accumulate(
cbc_fluds_local_psi_bytes.begin(), cbc_fluds_local_psi_bytes.end(), std::uint64_t{0});
const auto total_boundary_nonlocal_storage =
std::accumulate(cbc_fluds_boundary_nonlocal_bytes.begin(),
cbc_fluds_boundary_nonlocal_bytes.end(),
std::uint64_t{0});
const auto total_managed_psi_storage =
total_local_psi_storage + total_boundary_nonlocal_storage;
std::ostringstream savings_out;
if (full_local_psi_storage_bytes > 0)
savings_out << 100.0 * (1.0 - (static_cast<double>(total_local_psi_storage) /
static_cast<double>(full_local_psi_storage_bytes)))
<< "%.";
else
savings_out << "N/A.";
const auto format_bytes = [](const std::uint64_t bytes)
{
constexpr std::pair<double, const char*> units[] = {
{1024.0 * 1024.0 * 1024.0, "GiB"}, {1024.0 * 1024.0, "MiB"}, {1024.0, "KiB"}, {1.0, "B"}};
const auto bytes_as_double = static_cast<double>(bytes);

for (const auto& [scale, suffix] : units)
{
if (bytes_as_double >= scale || scale == 1.0)
{
std::ostringstream out;
const double value = bytes_as_double / scale;
const int precision = (scale == 1.0 || value >= 100.0) ? 0 : (value >= 10.0 ? 1 : 2);
out << std::fixed << std::setprecision(precision) << value << ' ' << suffix;
return out.str();
}
}

return std::string("0 B");
};

log.Log() << (use_gpus_ ? "CBCD FLUDS" : "CBC FLUDS") << " psi storage usage across "
<< cbc_fluds_local_psi_bytes.size() << " FLUDS instances.\n"
<< " Total local psi storage and savings: (" << format_bytes(total_local_psi_storage)
<< ", " << savings_out.str() << ")\n"
<< " Total boundary/non-local storage: "
<< format_bytes(total_boundary_nonlocal_storage) << ".\n"
<< " Total managed local/boundary/non-local psi storage: "
<< format_bytes(total_managed_psi_storage) << ".\n";
}

if (options_.verbose_inner_iterations)
log.Log() << program_timer.GetTimeString() << " Initialized angle aggregation.";

opensn::mpi_comm.barrier();
}

Expand All @@ -1882,10 +2061,6 @@ DiscreteOrdinatesProblem::SetSweepChunk(LBSGroupset& groupset)

const bool use_time_dependent_chunk = (mode == SweepChunkMode::TIME_DEPENDENT);

if (use_time_dependent_chunk && sweep_type_ != "AAH")
throw std::invalid_argument(GetName() +
": Time dependent is only supported with sweep_type='AAH'.");

if (sweep_type_ == "AAH")
{
if (use_time_dependent_chunk)
Expand All @@ -1896,6 +2071,8 @@ DiscreteOrdinatesProblem::SetSweepChunk(LBSGroupset& groupset)
}
else if (sweep_type_ == "CBC")
{
if (use_time_dependent_chunk)
return std::make_shared<CBCSweepChunkTD>(*this, groupset);
if (use_gpus_)
return CreateCBCDSweepChunk(groupset);
return std::make_shared<CBCSweepChunk>(*this, groupset);
Expand Down
Loading
Loading