From 8ecc887751abb798c33383c8c3fb9d5ba69d1e69 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Wed, 29 Apr 2026 13:57:20 -0500 Subject: [PATCH 01/10] ENH: Adding reduction operators, WIP --- examples/CMakeLists.txt | 3 + examples/matar_mpi.cpp | 134 +++++++++++++++++++++++++++++++ src/include/communication_plan.h | 28 ++++++- src/include/mpi_types.h | 83 +++++++++++++++++-- 4 files changed, 240 insertions(+), 8 deletions(-) create mode 100644 examples/matar_mpi.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 32238913..6026c9db 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -163,6 +163,9 @@ if (KOKKOS) include_directories(mesh_decomp) add_subdirectory(mesh_decomp) + + add_executable(matar_mpi matar_mpi.cpp) + target_link_libraries(matar_mpi ${LINKING_LIBRARIES}) endif() endif() diff --git a/examples/matar_mpi.cpp b/examples/matar_mpi.cpp new file mode 100644 index 00000000..5875b15c --- /dev/null +++ b/examples/matar_mpi.cpp @@ -0,0 +1,134 @@ +// Example: basic MPI features in MATAR (requires HAVE_MPI and HAVE_KOKKOS). +// - MPICArrayKokkos: distributed array with optional halo exchange via communicate() +// - CommunicationPlan: MPI distributed graph + per-neighbor index lists +// - Reductions use mpi_type_map (in mpi_types.h) and ::operation / mpi_op_for +// (in communication_plan.h) internally for MPI_Allreduce. + +#if !defined(HAVE_MPI) || !defined(HAVE_KOKKOS) + +#include + +int main() { + std::cerr + << "This example requires MATAR built with HAVE_MPI and HAVE_KOKKOS.\n"; + return 0; +} + +#else + +#include +#include + +#include + +#include + +using namespace mtr; + +int main(int argc, char* argv[]) { + MPI_Init(&argc, &argv); + MATAR_INITIALIZE(argc, argv); + + int rank = 0; + int size = 1; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // ------------------------------------------------------------------------- + // MPICArrayKokkos::all_reduce — uses mpi_type_map for MPI_Datatype and + // mpi_op_for(operation) for MPI_Op (no CommunicationPlan required). + // ------------------------------------------------------------------------- + const int num_values = 100; + MPICArrayKokkos locals(num_values, "values"); + locals.set_values(1.0); + + + const double global_sum = locals.all_reduce(operation::sum); + const double expected_sum = static_cast(num_values * size); + + if (rank == 0) { + std::cout << "all_reduce(sum): " << global_sum << " (expect " + << expected_sum << ")\n"; + } + + // ------------------------------------------------------------------------- + // CommunicationPlan + communicate(): periodic 1D halo (needs 2+ ranks). + // Layout per rank: index 0 = left ghost, 1..L = owned, L+1 = right ghost. + // ------------------------------------------------------------------------- + if (size < 2) { + if (rank == 0) { + std::cout + << "Re-run with 2+ MPI processes to run the halo exchange demo.\n"; + } + } else { + const int L = 4; + const int left = (rank + size - 1) % size; + const int right = (rank + 1) % size; + + CommunicationPlan comm_plan; + comm_plan.initialize(MPI_COMM_WORLD); + + int send_ranks[2] = { left, right }; + int recv_ranks[2] = { left, right }; + comm_plan.initialize_graph_communicator(2, send_ranks, 2, recv_ranks); + + DCArrayKokkos send_strides(2, "send_strides"); + send_strides.host(0) = 1; + send_strides.host(1) = 1; + send_strides.update_device(); + DRaggedRightArrayKokkos rank_send_ids(send_strides, + "rank_send_ids"); + // Send first owned cell to the left neighbor; last owned to the right. + rank_send_ids.host(0, 0) = 1; + rank_send_ids.host(1, 0) = L; + rank_send_ids.update_device(); + + DCArrayKokkos recv_strides(2, "recv_strides"); + recv_strides.host(0) = 1; + recv_strides.host(1) = 1; + recv_strides.update_device(); + DRaggedRightArrayKokkos rank_recv_ids(recv_strides, + "rank_recv_ids"); + rank_recv_ids.host(0, 0) = 0; + rank_recv_ids.host(1, 0) = L + 1; + rank_recv_ids.update_device(); + + MATAR_FENCE(); + comm_plan.setup_send_recv(rank_send_ids, rank_recv_ids); + + MPICArrayKokkos field(static_cast(L + 2), "field"); + field.initialize_comm_plan(comm_plan); + + FOR_ALL(i, 0, L + 2, { + field(i) = -1.0; + }); + FOR_ALL(i, 1, L + 1, { field(i) = static_cast(rank); }); + MATAR_FENCE(); + + field.communicate(); + + field.update_host(); + MATAR_FENCE(); + + const double gl = field.host(0); + const double gr = field.host(L + 1); + const bool halo_ok = + (std::fabs(gl - static_cast(left)) < 1.0e-14) && + (std::fabs(gr - static_cast(right)) < 1.0e-14); + + if (rank == 0) { + std::cout << "After halo exchange (periodic 1D): "; + if (halo_ok) { + std::cout << "ghost values match neighbor ranks.\n"; + } else { + std::cout << "verification failed.\n"; + } + } + } + + MATAR_FINALIZE(); + MPI_Finalize(); + return 0; +} + +#endif diff --git a/src/include/communication_plan.h b/src/include/communication_plan.h index 0f642b2e..c0286d5e 100644 --- a/src/include/communication_plan.h +++ b/src/include/communication_plan.h @@ -15,6 +15,30 @@ enum class communication_plan_type { all_to_all_graph }; +/// Reduction operation for @c MPICArrayKokkos::all_reduce (maps to @c MPI_Op for the MPI stage). +enum class operation { + sum, + product, + max, + min +}; + +/// Map @ref operation to the corresponding @c MPI_Op (host-only; call after @c MPI_Init). +inline MPI_Op mpi_op_for(operation op) { + switch (op) { + case operation::sum: + return MPI_SUM; + case operation::product: + return MPI_PROD; + case operation::max: + return MPI_MAX; + case operation::min: + return MPI_MIN; + default: + return MPI_SUM; + } +} + struct CommunicationPlan { @@ -65,7 +89,6 @@ struct CommunicationPlan { DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank DCArrayKokkos recv_counts_; // [size: num_recv_ranks] Number of items to receive from each rank - DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank @@ -91,8 +114,7 @@ struct CommunicationPlan { MPI_Comm_free(&mpi_comm_graph); } } - - + void initialize(MPI_Comm comm_world){ int mpi_init = 0; MPI_Initialized(&mpi_init); diff --git a/src/include/mpi_types.h b/src/include/mpi_types.h index bc00d852..f5782d3d 100644 --- a/src/include/mpi_types.h +++ b/src/include/mpi_types.h @@ -87,7 +87,7 @@ class MPICArrayKokkos { size_t length_ = 0; size_t order_ = 0; // tensor order (rank) - MPI_Comm mpi_comm_; + MPI_Comm mpi_comm_ = MPI_COMM_NULL; MPI_Status mpi_status_; MPI_Datatype mpi_datatype_; MPI_Request mpi_request_; @@ -109,8 +109,9 @@ class MPICArrayKokkos { DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_by_rank] Indices of items to receive from each rank - size_t num_owned_; // Number of owned items (nodes/elements) - size_t num_ghost_; // Number of ghost items (nodes/elements) + size_t num_owned_ = 0; // Number of owned items (nodes/elements); optional override + size_t num_ghost_ = 0; // Number of ghost items (nodes/elements); informational when user-set + public: // Data member to access host view (initialized as pointer to this_array_.host_pointer()) @@ -358,6 +359,9 @@ class MPICArrayKokkos { MATAR_FENCE(); }; + /// 1D only: local reduce over an owned prefix then @c MPI_Allreduce to one scalar per rank. + T all_reduce(::operation op); + void set_values(const T& value){ this_array_.set_values(value); }; @@ -635,12 +639,81 @@ void MPICArrayKokkos::update_device() { this_array_.update_device(); } + + template -KOKKOS_INLINE_FUNCTION -MPICArrayKokkos::~MPICArrayKokkos() { +T MPICArrayKokkos::all_reduce(::operation op) { + + // assert(order_ == 1 && stride_ == 1 && "MPICArrayKokkos::all_reduce requires a 1D array (stride 1)."); + assert(!(op == ::operation::product && sizeof(T) == sizeof(bool)) && + "MPICArrayKokkos::all_reduce: product reduction is not supported for bool."); + + + assert(num_owned_ <= dims_[0] && "MPICArrayKokkos::all_reduce: num_owned exceeds dim0."); + const size_t owned_len = (num_owned_ > 0) ? num_owned_ : dims_[0]; + assert(owned_len > 0 && "MPICArrayKokkos::all_reduce: empty reduction range."); + + // this_array_.update_host(); + // MATAR_FENCE(); + + // const T* p = this_array_.host_pointer(); + // T local{}; + T local; + + switch (op) { + case ::operation::sum: + local = 0; + T loc_sum = 0; + FOR_REDUCE_SUM(i, 0, owned_len, + loc_sum, { + loc_sum += this_array_(i); + }, local); + break; + case ::operation::product: + local = T(1); + T loc_prod = 1; + FOR_REDUCE_PRODUCT(i, 0, owned_len, + loc_prod, { + loc_prod *= this_array_(i); + }, local); + break; + case ::operation::max: + local = this_array_(0); + T loc_max = this_array_(0); + FOR_REDUCE_MAX(i, 0, owned_len, + loc_max, { + loc_max = (this_array_(i) > loc_max) ? this_array_(i) : loc_max; + }, local); + break; + case ::operation::min: + local = this_array_(0); + T loc_min = this_array_(0); + FOR_REDUCE_MIN(i, 0, owned_len, + loc_min, { + loc_min = (this_array_(i) < loc_min) ? this_array_(i) : loc_min; + }, local); + break; + default: + printf("MPICArrayKokkos::all_reduce: unsupported operation %d\n", op); + printf("Supported operations are: sum, product, max, min\n"); + Kokkos::abort("MPICArrayKokkos::all_reduce: unsupported operation"); + break; + } + T global = local; + MPI_Comm comm = MPI_COMM_WORLD; + if (comm_plan_ != nullptr && comm_plan_->has_comm_world) { + comm = comm_plan_->mpi_comm_world; + } + MPI_Allreduce(&local, &global, 1, mpi_type_map::value(), ::mpi_op_for(op), comm); + return global; } + +template +KOKKOS_INLINE_FUNCTION +MPICArrayKokkos::~MPICArrayKokkos() {} + } // end namespace mtr From bf871fc5ad4d85478013dfd3e1e3e036c9de6a07 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Wed, 29 Apr 2026 16:34:15 -0500 Subject: [PATCH 02/10] ENH: Adding reduction operators to MPI types --- examples/matar_mpi.cpp | 310 +++++++++++++++++++++++------- src/include/communication_plan.h | 6 +- src/include/mpi_types.h | 312 ++++++++++++++++++++++++++----- 3 files changed, 506 insertions(+), 122 deletions(-) diff --git a/examples/matar_mpi.cpp b/examples/matar_mpi.cpp index 5875b15c..3f1847e3 100644 --- a/examples/matar_mpi.cpp +++ b/examples/matar_mpi.cpp @@ -1,8 +1,8 @@ // Example: basic MPI features in MATAR (requires HAVE_MPI and HAVE_KOKKOS). // - MPICArrayKokkos: distributed array with optional halo exchange via communicate() // - CommunicationPlan: MPI distributed graph + per-neighbor index lists -// - Reductions use mpi_type_map (in mpi_types.h) and ::operation / mpi_op_for -// (in communication_plan.h) internally for MPI_Allreduce. +// - MPICArrayKokkos::all_reduce: 1D, and fixed trailing indices for rank 2/3/4 +// (mpi_type_map + mpi_op_for in mpi_types.h / communication_plan.h) #if !defined(HAVE_MPI) || !defined(HAVE_KOKKOS) @@ -20,115 +20,281 @@ int main() { #include #include - #include using namespace mtr; int main(int argc, char* argv[]) { - MPI_Init(&argc, &argv); - MATAR_INITIALIZE(argc, argv); + +MPI_Init(&argc, &argv); +MATAR_INITIALIZE(argc, argv); +{ int rank = 0; int size = 1; MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); + // Create a basic communication plan, this handles things like storing MPI world comms and + // allows for basic reduction operations. For more complex communication patterns, you can + // use the CommunicationPlan to build a more complex communication plan. + CommunicationPlan comm_plan; + comm_plan.initialize(MPI_COMM_WORLD); + // ------------------------------------------------------------------------- // MPICArrayKokkos::all_reduce — uses mpi_type_map for MPI_Datatype and // mpi_op_for(operation) for MPI_Op (no CommunicationPlan required). // ------------------------------------------------------------------------- + + // Same size arrays on each rank, just do a simple reduction. const int num_values = 100; MPICArrayKokkos locals(num_values, "values"); + locals.initialize_comm_plan(comm_plan); locals.set_values(1.0); - + locals.update_device(); - const double global_sum = locals.all_reduce(operation::sum); - const double expected_sum = static_cast(num_values * size); + double global_sum = locals.all_reduce(operation::sum); + double expected_sum = static_cast(num_values * size); if (rank == 0) { std::cout << "all_reduce(sum): " << global_sum << " (expect " << expected_sum << ")\n"; } - // ------------------------------------------------------------------------- - // CommunicationPlan + communicate(): periodic 1D halo (needs 2+ ranks). - // Layout per rank: index 0 = left ghost, 1..L = owned, L+1 = right ghost. - // ------------------------------------------------------------------------- - if (size < 2) { - if (rank == 0) { - std::cout - << "Re-run with 2+ MPI processes to run the halo exchange demo.\n"; - } - } else { - const int L = 4; - const int left = (rank + size - 1) % size; - const int right = (rank + 1) % size; - - CommunicationPlan comm_plan; - comm_plan.initialize(MPI_COMM_WORLD); - - int send_ranks[2] = { left, right }; - int recv_ranks[2] = { left, right }; - comm_plan.initialize_graph_communicator(2, send_ranks, 2, recv_ranks); - - DCArrayKokkos send_strides(2, "send_strides"); - send_strides.host(0) = 1; - send_strides.host(1) = 1; - send_strides.update_device(); - DRaggedRightArrayKokkos rank_send_ids(send_strides, - "rank_send_ids"); - // Send first owned cell to the left neighbor; last owned to the right. - rank_send_ids.host(0, 0) = 1; - rank_send_ids.host(1, 0) = L; - rank_send_ids.update_device(); - - DCArrayKokkos recv_strides(2, "recv_strides"); - recv_strides.host(0) = 1; - recv_strides.host(1) = 1; - recv_strides.update_device(); - DRaggedRightArrayKokkos rank_recv_ids(recv_strides, - "rank_recv_ids"); - rank_recv_ids.host(0, 0) = 0; - rank_recv_ids.host(1, 0) = L + 1; - rank_recv_ids.update_device(); + // Different size arrays on each rank, do a simple reduction. + const int num_values_per_rank = 10*(1+rank); + MPICArrayKokkos rank_locals(num_values_per_rank, "values"); + rank_locals.initialize_comm_plan(comm_plan); + rank_locals.set_values(1.0); + rank_locals.update_device(); - MATAR_FENCE(); - comm_plan.setup_send_recv(rank_send_ids, rank_recv_ids); + global_sum = rank_locals.all_reduce(operation::sum); + // Rank r has 10*(r+1) entries of 1 → total = 10 * (1 + … + size). + expected_sum = 10.0 * static_cast(size * (size + 1) / 2); + + + if (rank == 0) { + std::cout << "all_reduce(sum) with different size arrays on each rank (10*rank_id): " << global_sum << " (expect " + << expected_sum << ")\n"; + } + + + // Different size arrays on each rank, find the minimum value + MPICArrayKokkos min_locals(num_values_per_rank, "values"); + min_locals.initialize_comm_plan(comm_plan); + + FOR_ALL(i, 0, num_values_per_rank, { + min_locals(i) = static_cast(10*rank + i); + }); + + min_locals.update_device(); + float global_min = min_locals.all_reduce(operation::min); + float expected_min = 0.0F; + if (rank == 0) { + std::cout << "all_reduce(min) with different size arrays on each rank (10*rank_id): " << global_min << " (expect " + << expected_min << ")\n"; + } + + float global_max = min_locals.all_reduce(operation::max); + // Largest entry is on rank (size-1) at i = 10*size - 1. + const float expected_max = static_cast(10 * (size - 1) + (10 * size - 1)); + if (rank == 0) { + std::cout << "all_reduce(max) with different size arrays on each rank (10*rank_id): " << global_max << " (expect " + << expected_max << ")\n"; + } + + + + // Example: all_reduce with product + // Initialize a MPICArrayKokkos with all values 2.0 + MPICArrayKokkos prod_locals(4, "prod_values"); + prod_locals.initialize_comm_plan(comm_plan); + prod_locals.set_values(2.0); + prod_locals.update_device(); + + // Compute the product across all ranks and all values + double global_product = prod_locals.all_reduce(operation::product); - MPICArrayKokkos field(static_cast(L + 2), "field"); - field.initialize_comm_plan(comm_plan); + // The expected product is pow(2, 4*size), i.e., each rank contributes 4 twos + double expected_product = std::pow(2.0, 4 * size); + if (rank == 0) { + std::cout << "all_reduce(product): " << global_product << " (expect " + << expected_product << ")\n"; + } - FOR_ALL(i, 0, L + 2, { - field(i) = -1.0; + // ------------------------------------------------------------------------- + // all_reduce with fixed trailing indices (multi-dimensional arrays). + // ------------------------------------------------------------------------- + const size_t n_elem = size * 10; + + // Rank-2: element centroid coordinates — elem_centroids(elem_id, elem_position) + // with elem_position ∈ {0,1,2} as x, y, z. Reduce over elem_id for each axis. + { + size_t n_elem = 3; + size_t num_coords = 3; + MPICArrayKokkos elem_centroids(n_elem, num_coords,"elem_centroids"); + elem_centroids.initialize_comm_plan(comm_plan); + FOR_ALL(elem_id, 0, n_elem, + elem_position, 0, num_coords, { + const double base = 1000.0 * rank + 100.0 * elem_id; + elem_centroids(elem_id, elem_position) = + base + 10.0 * static_cast(elem_position); }); - FOR_ALL(i, 1, L + 1, { field(i) = static_cast(rank); }); MATAR_FENCE(); + elem_centroids.update_device(); - field.communicate(); + const double max_x = elem_centroids.all_reduce(operation::max, 0); + const double max_y = elem_centroids.all_reduce(operation::max, 1); + const double max_z = elem_centroids.all_reduce(operation::max, 2); - field.update_host(); - MATAR_FENCE(); + const double base_rank = 1000.0 * static_cast(size - 1); + const double base_elem = 100.0 * static_cast(n_elem - 1); + const double expect_max_x = base_rank + base_elem + 0.0; + const double expect_max_y = base_rank + base_elem + 10.0; + const double expect_max_z = base_rank + base_elem + 20.0; - const double gl = field.host(0); - const double gr = field.host(L + 1); - const bool halo_ok = - (std::fabs(gl - static_cast(left)) < 1.0e-14) && - (std::fabs(gr - static_cast(right)) < 1.0e-14); + if (rank == 0) { + std::cout << "all_reduce(max, coord) rank-2 centroids — max x: " << max_x + << " (expect " << expect_max_x << ")\n"; + std::cout << "all_reduce(max, coord) rank-2 centroids — max y: " << max_y + << " (expect " << expect_max_y << ")\n"; + std::cout << "all_reduce(max, coord) rank-2 centroids — max z: " << max_z + << " (expect " << expect_max_z << ")\n"; + } + } + // Rank-3: reduce over e at fixed tensor component — e.g. stress(e, 0, 1). + { + MPICArrayKokkos stress(n_elem, 3, 3, "stress"); + stress.initialize_comm_plan(comm_plan); + FOR_ALL(e, 0, n_elem, r, 0, 3, c, 0, 3, { + stress(e, r, c) = + 10000.0 * rank + 1000.0 * e + 100.0 * r + c; + }); + MATAR_FENCE(); + stress.update_device(); + const double max_comp = stress.all_reduce(operation::max, 0, 1); + + const double expect_3d = + 10000.0 * static_cast(size - 1) + + 1000.0 * static_cast(n_elem - 1) + 1.0; if (rank == 0) { - std::cout << "After halo exchange (periodic 1D): "; - if (halo_ok) { - std::cout << "ghost values match neighbor ranks.\n"; - } else { - std::cout << "verification failed.\n"; + std::cout << "all_reduce(max, i, j) rank-3 s(e,i,j): " << max_comp + << " (expect " << expect_3d << ")\n"; + } + } + + // Rank-4: reduce over element at fixed Gauss point and tensor component. + { + const size_t n_gauss = 2; + MPICArrayKokkos s4(n_elem, n_gauss, 3, 3, "s4"); + s4.initialize_comm_plan(comm_plan); + for (size_t e = 0; e < n_elem; ++e) { + for (size_t g = 0; g < n_gauss; ++g) { + for (size_t r = 0; r < 3; ++r) { + for (size_t c = 0; c < 3; ++c) { + s4.host(e, g, r, c) = + 100000.0 * rank + 1000.0 * static_cast(e) + + 100.0 * static_cast(g) + + 10.0 * static_cast(r) + + static_cast(c); + } + } } } + s4.update_device(); + const size_t g_fix = 1; + const size_t ti_fix = 0; + const size_t tj_fix = 1; + const double max_qp = + s4.all_reduce(operation::max, g_fix, ti_fix, tj_fix); + const double expect_4d = + 100000.0 * static_cast(size - 1) + + 1000.0 * static_cast(n_elem - 1) + 101.0; + if (rank == 0) { + std::cout << "all_reduce(max, g, i, j) rank-4 stress(e,g,i,j): " + << max_qp << " (expect " << expect_4d << ")\n"; + } } - MATAR_FINALIZE(); - MPI_Finalize(); - return 0; + // ------------------------------------------------------------------------- + // CommunicationPlan + communicate(): periodic 1D halo (needs 2+ ranks). + // Layout per rank: index 0 = left ghost, 1..L = owned, L+1 = right ghost. + // ------------------------------------------------------------------------- + // if (size < 2) { + // if (rank == 0) { + // std::cout + // << "Re-run with 2+ MPI processes to run the halo exchange demo.\n"; + // } + // } else { + // const int L = 4; + // const int left = (rank + size - 1) % size; + // const int right = (rank + 1) % size; + + // CommunicationPlan comm_plan; + // comm_plan.initialize(MPI_COMM_WORLD); + + // int send_ranks[2] = { left, right }; + // int recv_ranks[2] = { left, right }; + // comm_plan.initialize_graph_communicator(2, send_ranks, 2, recv_ranks); + + // DCArrayKokkos send_strides(2, "send_strides"); + // send_strides.host(0) = 1; + // send_strides.host(1) = 1; + // send_strides.update_device(); + // DRaggedRightArrayKokkos rank_send_ids(send_strides, + // "rank_send_ids"); + // // Send first owned cell to the left neighbor; last owned to the right. + // rank_send_ids.host(0, 0) = 1; + // rank_send_ids.host(1, 0) = L; + // rank_send_ids.update_device(); + + // DCArrayKokkos recv_strides(2, "recv_strides"); + // recv_strides.host(0) = 1; + // recv_strides.host(1) = 1; + // recv_strides.update_device(); + // DRaggedRightArrayKokkos rank_recv_ids(recv_strides, + // "rank_recv_ids"); + // rank_recv_ids.host(0, 0) = 0; + // rank_recv_ids.host(1, 0) = L + 1; + // rank_recv_ids.update_device(); + + // MATAR_FENCE(); + // comm_plan.setup_send_recv(rank_send_ids, rank_recv_ids); + + // MPICArrayKokkos field(static_cast(L + 2), "field"); + // field.initialize_comm_plan(comm_plan); + + // FOR_ALL(i, 0, L + 2, { + // field(i) = -1.0; + // }); + // FOR_ALL(i, 1, L + 1, { field(i) = static_cast(rank); }); + // MATAR_FENCE(); + + // field.communicate(); + + // field.update_host(); + // MATAR_FENCE(); + + // const double gl = field.host(0); + // const double gr = field.host(L + 1); + // const bool halo_ok = + // (std::fabs(gl - static_cast(left)) < 1.0e-14) && + // (std::fabs(gr - static_cast(right)) < 1.0e-14); + + // if (rank == 0) { + // std::cout << "After halo exchange (periodic 1D): "; + // if (halo_ok) { + // std::cout << "ghost values match neighbor ranks.\n"; + // } else { + // std::cout << "verification failed.\n"; + // } + // } + // } +} +MATAR_FINALIZE(); +MPI_Finalize(); +return 0; } #endif diff --git a/src/include/communication_plan.h b/src/include/communication_plan.h index c0286d5e..619ad158 100644 --- a/src/include/communication_plan.h +++ b/src/include/communication_plan.h @@ -7,8 +7,8 @@ #include -using namespace mtr; - +namespace mtr +{ enum class communication_plan_type { no_communication, @@ -534,6 +534,8 @@ struct CommunicationPlan { } }; // End of CommunicationPlan +} // end namespace mtr + #endif // end if HAVE_MPI #endif // end if COMMUNICATION_PLAN_H diff --git a/src/include/mpi_types.h b/src/include/mpi_types.h index f5782d3d..84e8266c 100644 --- a/src/include/mpi_types.h +++ b/src/include/mpi_types.h @@ -359,8 +359,19 @@ class MPICArrayKokkos { MATAR_FENCE(); }; - /// 1D only: local reduce over an owned prefix then @c MPI_Allreduce to one scalar per rank. - T all_reduce(::operation op); + /// Reduce over index 0 (e.g. elements); array must be rank 1. + T all_reduce(operation op); + + /// Reduce over leading dimension only; fix one trailing index (rank-2 arrays). + T all_reduce(operation op, size_t j); + + /// Reduce over leading dimension only; fix two trailing indices (e.g. + /// stress(elem,3,3) → all_reduce(op,0,1) is max of stress(e,0,1) over owned elements e). + T all_reduce(operation op, size_t j, size_t k); + + /// Reduce over elements only; fix Gauss point and tensor indices (rank-4), e.g. + /// stress(elem, gauss, i, j) → all_reduce(op, g, i, j) over owned elem. + T all_reduce(operation op, size_t g, size_t ti, size_t tj); void set_values(const T& value){ this_array_.set_values(value); @@ -642,10 +653,10 @@ void MPICArrayKokkos::update_device() { template -T MPICArrayKokkos::all_reduce(::operation op) { +T MPICArrayKokkos::all_reduce(operation op) { // assert(order_ == 1 && stride_ == 1 && "MPICArrayKokkos::all_reduce requires a 1D array (stride 1)."); - assert(!(op == ::operation::product && sizeof(T) == sizeof(bool)) && + assert(!(op == operation::product && sizeof(T) == sizeof(bool)) && "MPICArrayKokkos::all_reduce: product reduction is not supported for bool."); @@ -653,51 +664,56 @@ T MPICArrayKokkos::all_reduce(::operation op const size_t owned_len = (num_owned_ > 0) ? num_owned_ : dims_[0]; assert(owned_len > 0 && "MPICArrayKokkos::all_reduce: empty reduction range."); - // this_array_.update_host(); - // MATAR_FENCE(); - - // const T* p = this_array_.host_pointer(); - // T local{}; + // Local varaible for on node reduction T local; - switch (op) { - case ::operation::sum: - local = 0; - T loc_sum = 0; - FOR_REDUCE_SUM(i, 0, owned_len, - loc_sum, { - loc_sum += this_array_(i); - }, local); - break; - case ::operation::product: - local = T(1); - T loc_prod = 1; - FOR_REDUCE_PRODUCT(i, 0, owned_len, - loc_prod, { - loc_prod *= this_array_(i); - }, local); - break; - case ::operation::max: - local = this_array_(0); - T loc_max = this_array_(0); - FOR_REDUCE_MAX(i, 0, owned_len, - loc_max, { - loc_max = (this_array_(i) > loc_max) ? this_array_(i) : loc_max; - }, local); - break; - case ::operation::min: - local = this_array_(0); - T loc_min = this_array_(0); - FOR_REDUCE_MIN(i, 0, owned_len, - loc_min, { - loc_min = (this_array_(i) < loc_min) ? this_array_(i) : loc_min; - }, local); - break; - default: - printf("MPICArrayKokkos::all_reduce: unsupported operation %d\n", op); - printf("Supported operations are: sum, product, max, min\n"); - Kokkos::abort("MPICArrayKokkos::all_reduce: unsupported operation"); - break; + if (comm_plan_ == nullptr || comm_plan_->comm_type == communication_plan_type::no_communication) { + + switch (op) { + case operation::sum: { + local = 0; + T loc_sum = 0; + FOR_REDUCE_SUM(i, 0, owned_len, + loc_sum, { + loc_sum += this_array_(i); + }, local); + break; + } + case operation::product: { + local = T(1); + T loc_prod = 1; + FOR_REDUCE_PRODUCT(i, 0, owned_len, + loc_prod, { + loc_prod *= this_array_(i); + }, local); + break; + } + case operation::max: { + local = this_array_(0); + T loc_max = this_array_(0); + FOR_REDUCE_MAX(i, 0, owned_len, + loc_max, { + loc_max = (this_array_(i) > loc_max) ? this_array_(i) : loc_max; + }, local); + break; + } + case operation::min: { + local = this_array_(0); + T loc_min = this_array_(0); + FOR_REDUCE_MIN(i, 0, owned_len, + loc_min, { + loc_min = (this_array_(i) < loc_min) ? this_array_(i) : loc_min; + }, local); + break; + } + default: + printf("MPICArrayKokkos::all_reduce: unsupported operation %d\n", op); + printf("Supported operations are: sum, product, max, min\n"); + Kokkos::abort("MPICArrayKokkos::all_reduce: unsupported operation"); + break; + } + } else { + printf("MPICArrayKokkos::all_reduce: communication plan requires info on ghost vs owned\n"); } T global = local; @@ -705,7 +721,207 @@ T MPICArrayKokkos::all_reduce(::operation op if (comm_plan_ != nullptr && comm_plan_->has_comm_world) { comm = comm_plan_->mpi_comm_world; } - MPI_Allreduce(&local, &global, 1, mpi_type_map::value(), ::mpi_op_for(op), comm); + MPI_Allreduce(&local, &global, 1, mpi_type_map::value(), mpi_op_for(op), comm); + return global; +} + +template +T MPICArrayKokkos::all_reduce(operation op, size_t j) { + assert(order_ == 2 && "MPICArrayKokkos::all_reduce(op,j) requires a rank-2 array."); + assert(j < dims_[1] && "Fixed index j is out of bounds."); + assert(!(op == operation::product && sizeof(T) == sizeof(bool)) && + "MPICArrayKokkos::all_reduce: product reduction is not supported for bool."); + assert(num_owned_ <= dims_[0] && "MPICArrayKokkos::all_reduce: num_owned exceeds dim0."); + const size_t owned_len = (num_owned_ > 0) ? num_owned_ : dims_[0]; + assert(owned_len > 0 && "MPICArrayKokkos::all_reduce: empty reduction range."); + + T local; + if (comm_plan_ == nullptr || comm_plan_->comm_type == communication_plan_type::no_communication) { + switch (op) { + case operation::sum: { + local = 0; + T loc_sum = 0; + FOR_REDUCE_SUM(e, 0, owned_len, loc_sum, { + loc_sum += this_array_(e, j); + }, local); + break; + } + case operation::product: { + local = T(1); + T loc_prod = 1; + FOR_REDUCE_PRODUCT(e, 0, owned_len, loc_prod, { + loc_prod *= this_array_(e, j); + }, local); + break; + } + case operation::max: { + local = this_array_(0, j); + T loc_max = local; + FOR_REDUCE_MAX(e, 0, owned_len, loc_max, { + const T v = this_array_(e, j); + loc_max = (v > loc_max) ? v : loc_max; + }, local); + break; + } + case operation::min: { + local = this_array_(0, j); + T loc_min = local; + FOR_REDUCE_MIN(e, 0, owned_len, loc_min, { + const T v = this_array_(e, j); + loc_min = (v < loc_min) ? v : loc_min; + }, local); + break; + } + default: + printf("MPICArrayKokkos::all_reduce: unsupported operation %d\n", op); + printf("Supported operations are: sum, product, max, min\n"); + Kokkos::abort("MPICArrayKokkos::all_reduce: unsupported operation"); + break; + } + } else { + printf("MPICArrayKokkos::all_reduce: communication plan requires info on ghost vs owned\n"); + } + + T global = local; + MPI_Comm mpi_comm = MPI_COMM_WORLD; + if (comm_plan_ != nullptr && comm_plan_->has_comm_world) { + mpi_comm = comm_plan_->mpi_comm_world; + } + MPI_Allreduce(&local, &global, 1, mpi_type_map::value(), mpi_op_for(op), mpi_comm); + return global; +} + +template +T MPICArrayKokkos::all_reduce(operation op, size_t j, size_t k) { + assert(order_ == 3 && "MPICArrayKokkos::all_reduce(op,j,k) requires a rank-3 array."); + assert(j < dims_[1] && k < dims_[2] && "Fixed tensor indices (j,k) are out of bounds."); + assert(!(op == operation::product && sizeof(T) == sizeof(bool)) && + "MPICArrayKokkos::all_reduce: product reduction is not supported for bool."); + assert(num_owned_ <= dims_[0] && "MPICArrayKokkos::all_reduce: num_owned exceeds dim0."); + const size_t owned_len = (num_owned_ > 0) ? num_owned_ : dims_[0]; + assert(owned_len > 0 && "MPICArrayKokkos::all_reduce: empty reduction range."); + + T local; + if (comm_plan_ == nullptr || comm_plan_->comm_type == communication_plan_type::no_communication) { + switch (op) { + case operation::sum: { + local = 0; + T loc_sum = 0; + FOR_REDUCE_SUM(e, 0, owned_len, loc_sum, { + loc_sum += this_array_(e, j, k); + }, local); + break; + } + case operation::product: { + local = T(1); + T loc_prod = 1; + FOR_REDUCE_PRODUCT(e, 0, owned_len, loc_prod, { + loc_prod *= this_array_(e, j, k); + }, local); + break; + } + case operation::max: { + local = this_array_(0, j, k); + T loc_max = local; + FOR_REDUCE_MAX(e, 0, owned_len, loc_max, { + const T v = this_array_(e, j, k); + loc_max = (v > loc_max) ? v : loc_max; + }, local); + break; + } + case operation::min: { + local = this_array_(0, j, k); + T loc_min = local; + FOR_REDUCE_MIN(e, 0, owned_len, loc_min, { + const T v = this_array_(e, j, k); + loc_min = (v < loc_min) ? v : loc_min; + }, local); + break; + } + default: + printf("MPICArrayKokkos::all_reduce: unsupported operation %d\n", op); + printf("Supported operations are: sum, product, max, min\n"); + Kokkos::abort("MPICArrayKokkos::all_reduce: unsupported operation"); + break; + } + } else { + printf("MPICArrayKokkos::all_reduce: communication plan requires info on ghost vs owned\n"); + } + + T global = local; + MPI_Comm mpi_comm = MPI_COMM_WORLD; + if (comm_plan_ != nullptr && comm_plan_->has_comm_world) { + mpi_comm = comm_plan_->mpi_comm_world; + } + MPI_Allreduce(&local, &global, 1, mpi_type_map::value(), mpi_op_for(op), mpi_comm); + return global; +} + +template +T MPICArrayKokkos::all_reduce(operation op, size_t g, size_t ti, + size_t tj) { + assert(order_ == 4 && "MPICArrayKokkos::all_reduce(op,g,ti,tj) requires a rank-4 array."); + assert(g < dims_[1] && ti < dims_[2] && tj < dims_[3] && + "Fixed indices (Gauss, tensor i, j) are out of bounds."); + assert(!(op == operation::product && sizeof(T) == sizeof(bool)) && + "MPICArrayKokkos::all_reduce: product reduction is not supported for bool."); + assert(num_owned_ <= dims_[0] && "MPICArrayKokkos::all_reduce: num_owned exceeds dim0."); + const size_t owned_len = (num_owned_ > 0) ? num_owned_ : dims_[0]; + assert(owned_len > 0 && "MPICArrayKokkos::all_reduce: empty reduction range."); + + T local; + if (comm_plan_ == nullptr || comm_plan_->comm_type == communication_plan_type::no_communication) { + switch (op) { + case operation::sum: { + local = 0; + T loc_sum = 0; + FOR_REDUCE_SUM(e, 0, owned_len, loc_sum, { + loc_sum += this_array_(e, g, ti, tj); + }, local); + break; + } + case operation::product: { + local = T(1); + T loc_prod = 1; + FOR_REDUCE_PRODUCT(e, 0, owned_len, loc_prod, { + loc_prod *= this_array_(e, g, ti, tj); + }, local); + break; + } + case operation::max: { + local = this_array_(0, g, ti, tj); + T loc_max = local; + FOR_REDUCE_MAX(e, 0, owned_len, loc_max, { + const T v = this_array_(e, g, ti, tj); + loc_max = (v > loc_max) ? v : loc_max; + }, local); + break; + } + case operation::min: { + local = this_array_(0, g, ti, tj); + T loc_min = local; + FOR_REDUCE_MIN(e, 0, owned_len, loc_min, { + const T v = this_array_(e, g, ti, tj); + loc_min = (v < loc_min) ? v : loc_min; + }, local); + break; + } + default: + printf("MPICArrayKokkos::all_reduce: unsupported operation %d\n", op); + printf("Supported operations are: sum, product, max, min\n"); + Kokkos::abort("MPICArrayKokkos::all_reduce: unsupported operation"); + break; + } + } else { + printf("MPICArrayKokkos::all_reduce: communication plan requires info on ghost vs owned\n"); + } + + T global = local; + MPI_Comm mpi_comm = MPI_COMM_WORLD; + if (comm_plan_ != nullptr && comm_plan_->has_comm_world) { + mpi_comm = comm_plan_->mpi_comm_world; + } + MPI_Allreduce(&local, &global, 1, mpi_type_map::value(), mpi_op_for(op), mpi_comm); return global; } From 08eac8750564e6e022dd1278fe8954ecc2ccd9f5 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Thu, 30 Apr 2026 10:01:42 -0500 Subject: [PATCH 03/10] ENH: Tweaking example, and swapping computers --- examples/matar_mpi.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/matar_mpi.cpp b/examples/matar_mpi.cpp index 3f1847e3..8f616fbf 100644 --- a/examples/matar_mpi.cpp +++ b/examples/matar_mpi.cpp @@ -54,8 +54,8 @@ MATAR_INITIALIZE(argc, argv); locals.update_device(); double global_sum = locals.all_reduce(operation::sum); + double expected_sum = static_cast(num_values * size); - if (rank == 0) { std::cout << "all_reduce(sum): " << global_sum << " (expect " << expected_sum << ")\n"; @@ -165,6 +165,7 @@ MATAR_INITIALIZE(argc, argv); // Rank-3: reduce over e at fixed tensor component — e.g. stress(e, 0, 1). { + // Rank-3: reduce over e at fixed tensor component — e.g. stress(e, 0, 1). MPICArrayKokkos stress(n_elem, 3, 3, "stress"); stress.initialize_comm_plan(comm_plan); FOR_ALL(e, 0, n_elem, r, 0, 3, c, 0, 3, { From 9a62f41131d2ff080b189991e6be3bf31f4c5dd1 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 4 May 2026 16:14:19 -0500 Subject: [PATCH 04/10] ENH: Adding test for MPI types --- .github/workflows/test.yml | 12 +- test/test_cases/CMakeLists.txt | 20 ++- test/test_cases/mpi_test_main.cpp | 15 +++ test/test_cases/test_mpi_types.cpp | 208 +++++++++++++++++++++++++++++ 4 files changed, 252 insertions(+), 3 deletions(-) create mode 100644 test/test_cases/mpi_test_main.cpp create mode 100644 test/test_cases/test_mpi_types.cpp diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index abdc732f..c049e7e6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -25,6 +25,8 @@ jobs: - {name: "TEST_UBUNTU_OPENMP_DEBUG", os: ubuntu-latest, debug: "enabled", cc: "gcc", cxx: "g++", kokkos_backend: "openmp"} - {name: "TEST_UBUNTU_SERIAL_RELEASE", os: ubuntu-latest, debug: "disabled", cc: "gcc", cxx: "g++", kokkos_backend: "serial"} - {name: "TEST_UBUNTU_OPENMP_RELEASE", os: ubuntu-latest, debug: "disabled", cc: "gcc", cxx: "g++", kokkos_backend: "openmp"} + # Kokkos + MPI: builds MATAR with MPI, registers MPICArrayKokkos_mpi_suite (mpirun from CTest). + - {name: "TEST_UBUNTU_SERIAL_MPI_DEBUG", os: ubuntu-latest, debug: "enabled", cc: "gcc", cxx: "g++", kokkos_backend: "serial_mpi"} - {name: "TEST_MAC_SERIAL_DEBUG", os: macos-14, debug: "enabled", cc: "clang", cxx: "clang++", kokkos_backend: "serial"} - {name: "TEST_MAC_SERIAL_RELEASE", os: macos-14, debug: "disabled", cc: "clang", cxx: "clang++", kokkos_backend: "serial"} @@ -35,6 +37,12 @@ jobs: - name: Checkout submodules run: git submodule update --init --recursive + - name: Install Open MPI (MPICArrayKokkos / matar_mpi_tests) + if: contains(matrix.config.kokkos_backend, 'mpi') && matrix.config.os == 'ubuntu-latest' + run: | + sudo apt-get update + sudo apt-get install -y libopenmpi-dev openmpi-bin + # Build MATAR tests using the build-matar.sh script - name: Build MATAR Tests shell: bash @@ -52,6 +60,6 @@ jobs: - name: Run Tests shell: bash run: | - cd ${{ github.workspace }}/build-matar-${{ matrix.config.kokkos_backend }}/test_cases + cd ${{ github.workspace }}/build-matar-${{ matrix.config.kokkos_backend }} ctest --output-on-failure - + diff --git a/test/test_cases/CMakeLists.txt b/test/test_cases/CMakeLists.txt index a0e07edd..d54d9c3e 100644 --- a/test/test_cases/CMakeLists.txt +++ b/test/test_cases/CMakeLists.txt @@ -1,7 +1,8 @@ cmake_minimum_required(VERSION 3.5) -# Find all test files in the current directory except test_main.cpp +# Find all test files in the current directory except test_main.cpp and MPI-only sources. file(GLOB TEST_SOURCES "test_*.cpp") +list(REMOVE_ITEM TEST_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/test_mpi_types.cpp") # Create a single test executable that includes all test files add_executable(matar_tests test_main.cpp ${TEST_SOURCES}) @@ -17,3 +18,20 @@ include(GoogleTest) enable_testing() gtest_discover_tests(matar_tests) + +# --- MPI + Kokkos: MPICArrayKokkos / CommunicationPlan (run suite under mpirun) --- +if(KOKKOS) + find_package(MPI COMPONENTS CXX QUIET) +endif() + +if(KOKKOS AND MPI_CXX_FOUND) + add_executable(matar_mpi_tests mpi_test_main.cpp test_mpi_types.cpp) + target_link_libraries(matar_mpi_tests PRIVATE matar gtest Kokkos::kokkos MPI::MPI_CXX) + target_compile_definitions(matar_mpi_tests PRIVATE HAVE_MPI=1 HAVE_KOKKOS=1) + + # Run all MPI tests with multiple ranks (single CTest entry; avoids running shards without MPI). + add_test( + NAME MPICArrayKokkos_mpi_suite + COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 4 $ + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +endif() diff --git a/test/test_cases/mpi_test_main.cpp b/test/test_cases/mpi_test_main.cpp new file mode 100644 index 00000000..570e6ed6 --- /dev/null +++ b/test/test_cases/mpi_test_main.cpp @@ -0,0 +1,15 @@ +#include +#include + +#include + +// MPI must initialize before Kokkos (MATAR_INITIALIZE). +int main(int argc, char** argv) { + MPI_Init(&argc, &argv); + MATAR_INITIALIZE(argc, argv); + ::testing::InitGoogleTest(&argc, argv); + const int result = RUN_ALL_TESTS(); + MATAR_FINALIZE(); + MPI_Finalize(); + return result; +} diff --git a/test/test_cases/test_mpi_types.cpp b/test/test_cases/test_mpi_types.cpp new file mode 100644 index 00000000..3c262411 --- /dev/null +++ b/test/test_cases/test_mpi_types.cpp @@ -0,0 +1,208 @@ +// Google tests for MPICArrayKokkos / CommunicationPlan (HAVE_MPI + HAVE_KOKKOS). +// Run the mpi_test_main executable under mpirun (see test_cases/CMakeLists.txt). + +#if !defined(HAVE_MPI) || !defined(HAVE_KOKKOS) + +#include + +TEST(MPI_Types, SkippedWithoutMpiKokkos) { + GTEST_SKIP() << "Build MATAR tests with MPI and Kokkos enabled."; +} + +#else + +#include +#include + +#include +#include + +using namespace mtr; + +namespace { + +void mpi_rank_size(int* rank, int* size) { + MPI_Comm_rank(MPI_COMM_WORLD, rank); + MPI_Comm_size(MPI_COMM_WORLD, size); +} + +} // namespace + +TEST(MPICArrayKokkos, AllReduce_Sum_1D) { + int rank = 0; + int size = 1; + mpi_rank_size(&rank, &size); + + CommunicationPlan comm_plan; + comm_plan.initialize(MPI_COMM_WORLD); + + const int num_values = 100; + MPICArrayKokkos locals(num_values, "ut_values"); + locals.initialize_comm_plan(comm_plan); + locals.set_values(1.0); + locals.update_device(); + + const double global_sum = locals.all_reduce(operation::sum); + const double expected = static_cast(num_values * size); + EXPECT_DOUBLE_EQ(global_sum, expected); +} + +TEST(MPICArrayKokkos, AllReduce_Sum_VariableLengthPerRank) { + int rank = 0; + int size = 1; + mpi_rank_size(&rank, &size); + + CommunicationPlan comm_plan; + comm_plan.initialize(MPI_COMM_WORLD); + + const int num_values_per_rank = 10 * (1 + rank); + MPICArrayKokkos rank_locals(num_values_per_rank, "ut_varlen"); + rank_locals.initialize_comm_plan(comm_plan); + rank_locals.set_values(1.0); + rank_locals.update_device(); + + const double global_sum = rank_locals.all_reduce(operation::sum); + const double expected = + 10.0 * static_cast(size * (size + 1) / 2); + EXPECT_DOUBLE_EQ(global_sum, expected); +} + +TEST(MPICArrayKokkos, AllReduce_MinMax_1D) { + int rank = 0; + int size = 1; + mpi_rank_size(&rank, &size); + + CommunicationPlan comm_plan; + comm_plan.initialize(MPI_COMM_WORLD); + + const int num_values_per_rank = 10 * (1 + rank); + MPICArrayKokkos vals(num_values_per_rank, "ut_minmax"); + vals.initialize_comm_plan(comm_plan); + + FOR_ALL(i, 0, num_values_per_rank, { + vals(i) = static_cast(10 * rank + i); + }); + MATAR_FENCE(); + vals.update_device(); + + const float global_min = vals.all_reduce(operation::min); + const float global_max = vals.all_reduce(operation::max); + EXPECT_FLOAT_EQ(global_min, 0.0F); + const float expected_max = + static_cast(10 * (size - 1) + (10 * size - 1)); + EXPECT_FLOAT_EQ(global_max, expected_max); +} + +TEST(MPICArrayKokkos, AllReduce_Product) { + int rank = 0; + int size = 1; + mpi_rank_size(&rank, &size); + + CommunicationPlan comm_plan; + comm_plan.initialize(MPI_COMM_WORLD); + + MPICArrayKokkos prod_locals(4, "ut_prod"); + prod_locals.initialize_comm_plan(comm_plan); + prod_locals.set_values(2.0); + prod_locals.update_device(); + + const double global_product = prod_locals.all_reduce(operation::product); + const double expected = std::pow(2.0, 4 * size); + EXPECT_DOUBLE_EQ(global_product, expected); +} + +TEST(MPICArrayKokkos, AllReduce_Rank2_CentroidXYZ) { + int rank = 0; + int size = 1; + mpi_rank_size(&rank, &size); + + CommunicationPlan comm_plan; + comm_plan.initialize(MPI_COMM_WORLD); + + const size_t n_elem = 3; + constexpr int num_coords = 3; + + MPICArrayKokkos elem_centroids(n_elem, static_cast(num_coords), + "ut_centroids"); + elem_centroids.initialize_comm_plan(comm_plan); + FOR_ALL(elem_id, 0, n_elem, elem_position, 0, num_coords, { + const double base = 1000.0 * rank + 100.0 * elem_id; + elem_centroids(elem_id, elem_position) = + base + 10.0 * static_cast(elem_position); + }); + MATAR_FENCE(); + elem_centroids.update_device(); + + const double max_x = elem_centroids.all_reduce(operation::max, 0U); + const double max_y = elem_centroids.all_reduce(operation::max, 1U); + const double max_z = elem_centroids.all_reduce(operation::max, 2U); + + const double base_rank = 1000.0 * static_cast(size - 1); + const double base_elem = 100.0 * static_cast(n_elem - 1); + EXPECT_DOUBLE_EQ(max_x, base_rank + base_elem + 0.0); + EXPECT_DOUBLE_EQ(max_y, base_rank + base_elem + 10.0); + EXPECT_DOUBLE_EQ(max_z, base_rank + base_elem + 20.0); +} + +TEST(MPICArrayKokkos, AllReduce_Rank3_StressComponent) { + int rank = 0; + int size = 1; + mpi_rank_size(&rank, &size); + + CommunicationPlan comm_plan; + comm_plan.initialize(MPI_COMM_WORLD); + + const size_t n_elem = 3; + MPICArrayKokkos stress(n_elem, 3, 3, "ut_stress"); + stress.initialize_comm_plan(comm_plan); + FOR_ALL(e, 0, n_elem, r, 0, 3, c, 0, 3, { + stress(e, r, c) = 10000.0 * rank + 1000.0 * e + 100.0 * r + c; + }); + MATAR_FENCE(); + stress.update_device(); + + const double max_comp = + stress.all_reduce(operation::max, static_cast(0), + static_cast(1)); + const double expected = 10000.0 * static_cast(size - 1) + + 1000.0 * static_cast(n_elem - 1) + 1.0; + EXPECT_DOUBLE_EQ(max_comp, expected); +} + +TEST(MPICArrayKokkos, AllReduce_Rank4_GaussStressComponent) { + int rank = 0; + int size = 1; + mpi_rank_size(&rank, &size); + + CommunicationPlan comm_plan; + comm_plan.initialize(MPI_COMM_WORLD); + + const size_t n_elem = 3; + const size_t n_gauss = 2; + + MPICArrayKokkos s4(n_elem, n_gauss, 3, 3, "ut_s4"); + s4.initialize_comm_plan(comm_plan); + for (size_t e = 0; e < n_elem; ++e) { + for (size_t g = 0; g < n_gauss; ++g) { + for (size_t r = 0; r < 3; ++r) { + for (size_t c = 0; c < 3; ++c) { + s4.host(e, g, r, c) = + 100000.0 * rank + 1000.0 * static_cast(e) + + 100.0 * static_cast(g) + + 10.0 * static_cast(r) + + static_cast(c); + } + } + } + } + s4.update_device(); + + const double max_qp = + s4.all_reduce(operation::max, static_cast(1), + static_cast(0), static_cast(1)); + const double expected = 100000.0 * static_cast(size - 1) + + 1000.0 * static_cast(n_elem - 1) + 101.0; + EXPECT_DOUBLE_EQ(max_qp, expected); +} + +#endif // HAVE_MPI && HAVE_KOKKOS From 8ca63a5e0ad50a52b69d37e578eaf5a192691805 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 4 May 2026 16:15:28 -0500 Subject: [PATCH 05/10] ENH: Making comm_plan accessable --- src/include/mpi_types.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/include/mpi_types.h b/src/include/mpi_types.h index 84e8266c..78c02f4c 100644 --- a/src/include/mpi_types.h +++ b/src/include/mpi_types.h @@ -92,11 +92,6 @@ class MPICArrayKokkos { MPI_Datatype mpi_datatype_; MPI_Request mpi_request_; - - // --- Ghost Communication Support --- - CommunicationPlan* comm_plan_ = NULL; // Pointer to shared communication plan - - DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank DCArrayKokkos recv_counts_; // [size: num_recv_ranks] Number of items to receive from each rank DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank @@ -114,6 +109,10 @@ class MPICArrayKokkos { public: + + // --- Ghost Communication Support --- + CommunicationPlan* comm_plan_ = NULL; // Pointer to shared communication plan + // Data member to access host view (initialized as pointer to this_array_.host_pointer()) ViewCArray host; From 4411cbf2894daa563e934bb59deb918aac80c4cb Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 4 May 2026 16:27:33 -0500 Subject: [PATCH 06/10] ENH: Fixing tests --- .github/workflows/test.yml | 12 +++++++++--- test/test_cases/CMakeLists.txt | 11 +++++------ 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c049e7e6..542c2a70 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -37,11 +37,17 @@ jobs: - name: Checkout submodules run: git submodule update --init --recursive - - name: Install Open MPI (MPICArrayKokkos / matar_mpi_tests) + - name: Install Open MPI if: contains(matrix.config.kokkos_backend, 'mpi') && matrix.config.os == 'ubuntu-latest' + env: + DEBIAN_FRONTEND: noninteractive run: | - sudo apt-get update - sudo apt-get install -y libopenmpi-dev openmpi-bin + sudo apt-get update -qq + sudo apt-get install -y --no-install-recommends \ + libopenmpi-dev \ + openmpi-bin \ + mpi-default-bin \ + mpi-default-dev # Build MATAR tests using the build-matar.sh script - name: Build MATAR Tests diff --git a/test/test_cases/CMakeLists.txt b/test/test_cases/CMakeLists.txt index d54d9c3e..6f9ddd86 100644 --- a/test/test_cases/CMakeLists.txt +++ b/test/test_cases/CMakeLists.txt @@ -1,5 +1,9 @@ cmake_minimum_required(VERSION 3.5) +# Imported `matar` may use MPI::MPI_CXX (when MATAR was built with MPI). Load MPI before any +# target links `matar`, otherwise CMake cannot resolve the imported INTERFACE. +find_package(MPI COMPONENTS CXX QUIET) + # Find all test files in the current directory except test_main.cpp and MPI-only sources. file(GLOB TEST_SOURCES "test_*.cpp") list(REMOVE_ITEM TEST_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/test_mpi_types.cpp") @@ -20,16 +24,11 @@ enable_testing() gtest_discover_tests(matar_tests) # --- MPI + Kokkos: MPICArrayKokkos / CommunicationPlan (run suite under mpirun) --- -if(KOKKOS) - find_package(MPI COMPONENTS CXX QUIET) -endif() - -if(KOKKOS AND MPI_CXX_FOUND) +if(KOKKOS AND TARGET MPI::MPI_CXX) add_executable(matar_mpi_tests mpi_test_main.cpp test_mpi_types.cpp) target_link_libraries(matar_mpi_tests PRIVATE matar gtest Kokkos::kokkos MPI::MPI_CXX) target_compile_definitions(matar_mpi_tests PRIVATE HAVE_MPI=1 HAVE_KOKKOS=1) - # Run all MPI tests with multiple ranks (single CTest entry; avoids running shards without MPI). add_test( NAME MPICArrayKokkos_mpi_suite COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 4 $ From 49da4642d047125c0f64e03430fed77e601dfa13 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 4 May 2026 16:30:22 -0500 Subject: [PATCH 07/10] ENH: Fixing workflow bug --- test/CMakeLists.txt | 3 +++ test/test_cases/CMakeLists.txt | 4 +--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e6c2bfaf..afce3177 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,6 +2,9 @@ cmake_minimum_required(VERSION 3.5) project (matartest) +# Must run before find_package(Matar): imported target matar may list MPI::MPI_CXX in INTERFACE, +# and MatarConfig.cmake includes MatarTargets.cmake during find_package. +find_package(MPI COMPONENTS CXX QUIET) find_package(Matar REQUIRED) #------------------------------------------- diff --git a/test/test_cases/CMakeLists.txt b/test/test_cases/CMakeLists.txt index 6f9ddd86..5e497f61 100644 --- a/test/test_cases/CMakeLists.txt +++ b/test/test_cases/CMakeLists.txt @@ -1,8 +1,6 @@ cmake_minimum_required(VERSION 3.5) -# Imported `matar` may use MPI::MPI_CXX (when MATAR was built with MPI). Load MPI before any -# target links `matar`, otherwise CMake cannot resolve the imported INTERFACE. -find_package(MPI COMPONENTS CXX QUIET) +# MPI is found in parent ../CMakeLists.txt before find_package(Matar). # Find all test files in the current directory except test_main.cpp and MPI-only sources. file(GLOB TEST_SOURCES "test_*.cpp") From 5b7045856c9e22fe5402881dd8e0f3034c83e8bd Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 4 May 2026 16:36:01 -0500 Subject: [PATCH 08/10] BUG: Remove MPI for non MPI builds for tests --- scripts/cmake_build_test.sh | 4 ++++ test/CMakeLists.txt | 9 ++++++--- test/test_cases/CMakeLists.txt | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/scripts/cmake_build_test.sh b/scripts/cmake_build_test.sh index 21e9a62b..07d2e987 100644 --- a/scripts/cmake_build_test.sh +++ b/scripts/cmake_build_test.sh @@ -32,6 +32,10 @@ else ) fi +if [[ "${kokkos_build_type}" == *"mpi"* ]]; then + cmake_options+=(-D MATAR_TEST_USE_MPI=1) +fi + # Print CMake options for reference echo "CMake Options: ${cmake_options[@]}" diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index afce3177..1c640725 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,9 +2,12 @@ cmake_minimum_required(VERSION 3.5) project (matartest) -# Must run before find_package(Matar): imported target matar may list MPI::MPI_CXX in INTERFACE, -# and MatarConfig.cmake includes MatarTargets.cmake during find_package. -find_package(MPI COMPONENTS CXX QUIET) +# MATAR built with MPI lists MPI::MPI_CXX on imported target `matar`; MatarConfig loads targets during +# find_package(Matar). Only pre-find MPI for those test builds (see cmake_build_test.sh). +option(MATAR_TEST_USE_MPI "Configure tests against MPI-enabled MATAR (requires MPI before find_package)" OFF) +if(MATAR_TEST_USE_MPI) + find_package(MPI REQUIRED COMPONENTS CXX) +endif() find_package(Matar REQUIRED) #------------------------------------------- diff --git a/test/test_cases/CMakeLists.txt b/test/test_cases/CMakeLists.txt index 5e497f61..fa67b02c 100644 --- a/test/test_cases/CMakeLists.txt +++ b/test/test_cases/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.5) -# MPI is found in parent ../CMakeLists.txt before find_package(Matar). +# Optional MPI for matar_mpi_tests: loaded in ../CMakeLists.txt when MATAR_TEST_USE_MPI is set. # Find all test files in the current directory except test_main.cpp and MPI-only sources. file(GLOB TEST_SOURCES "test_*.cpp") From d6a1ba2e62ae44ddb76c0aeb4fda100faa55c8c8 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Tue, 5 May 2026 08:35:02 -0500 Subject: [PATCH 09/10] BUG: Debugging tests --- test/CMakeLists.txt | 4 ++++ test/test_cases/CMakeLists.txt | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 1c640725..55a0a7d4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,6 +2,10 @@ cmake_minimum_required(VERSION 3.5) project (matartest) +# Required at top level so CMake generates ${CMAKE_BINARY_DIR}/CTestTestfile.cmake. +# Without this, ctest from the build root finds no tests (enable_testing in a subdirectory is not enough). +enable_testing() + # MATAR built with MPI lists MPI::MPI_CXX on imported target `matar`; MatarConfig loads targets during # find_package(Matar). Only pre-find MPI for those test builds (see cmake_build_test.sh). option(MATAR_TEST_USE_MPI "Configure tests against MPI-enabled MATAR (requires MPI before find_package)" OFF) diff --git a/test/test_cases/CMakeLists.txt b/test/test_cases/CMakeLists.txt index fa67b02c..81f63ce1 100644 --- a/test/test_cases/CMakeLists.txt +++ b/test/test_cases/CMakeLists.txt @@ -17,7 +17,6 @@ endif() # Add test discovery include(GoogleTest) -enable_testing() gtest_discover_tests(matar_tests) From 82106a3ba407c20c63cbf5e5a1050839bfe30952 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Tue, 5 May 2026 08:51:14 -0500 Subject: [PATCH 10/10] ENH: stabalizing for workflow runs --- test/test_cases/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_cases/CMakeLists.txt b/test/test_cases/CMakeLists.txt index 81f63ce1..b9a7ee07 100644 --- a/test/test_cases/CMakeLists.txt +++ b/test/test_cases/CMakeLists.txt @@ -26,8 +26,9 @@ if(KOKKOS AND TARGET MPI::MPI_CXX) target_link_libraries(matar_mpi_tests PRIVATE matar gtest Kokkos::kokkos MPI::MPI_CXX) target_compile_definitions(matar_mpi_tests PRIVATE HAVE_MPI=1 HAVE_KOKKOS=1) + # --oversubscribe: GitHub Actions / small VMs often expose fewer slots than ranks (Open MPI). add_test( NAME MPICArrayKokkos_mpi_suite - COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 4 $ + COMMAND ${MPIEXEC_EXECUTABLE} --oversubscribe ${MPIEXEC_NUMPROC_FLAG} 4 $ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif()