diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index abdc732f..542c2a70 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -25,6 +25,8 @@ jobs: - {name: "TEST_UBUNTU_OPENMP_DEBUG", os: ubuntu-latest, debug: "enabled", cc: "gcc", cxx: "g++", kokkos_backend: "openmp"} - {name: "TEST_UBUNTU_SERIAL_RELEASE", os: ubuntu-latest, debug: "disabled", cc: "gcc", cxx: "g++", kokkos_backend: "serial"} - {name: "TEST_UBUNTU_OPENMP_RELEASE", os: ubuntu-latest, debug: "disabled", cc: "gcc", cxx: "g++", kokkos_backend: "openmp"} + # Kokkos + MPI: builds MATAR with MPI, registers MPICArrayKokkos_mpi_suite (mpirun from CTest). + - {name: "TEST_UBUNTU_SERIAL_MPI_DEBUG", os: ubuntu-latest, debug: "enabled", cc: "gcc", cxx: "g++", kokkos_backend: "serial_mpi"} - {name: "TEST_MAC_SERIAL_DEBUG", os: macos-14, debug: "enabled", cc: "clang", cxx: "clang++", kokkos_backend: "serial"} - {name: "TEST_MAC_SERIAL_RELEASE", os: macos-14, debug: "disabled", cc: "clang", cxx: "clang++", kokkos_backend: "serial"} @@ -35,6 +37,18 @@ jobs: - name: Checkout submodules run: git submodule update --init --recursive + - name: Install Open MPI + if: contains(matrix.config.kokkos_backend, 'mpi') && matrix.config.os == 'ubuntu-latest' + env: + DEBIAN_FRONTEND: noninteractive + run: | + sudo apt-get update -qq + sudo apt-get install -y --no-install-recommends \ + libopenmpi-dev \ + openmpi-bin \ + mpi-default-bin \ + mpi-default-dev + # Build MATAR tests using the build-matar.sh script - name: Build MATAR Tests shell: bash @@ -52,6 +66,6 @@ jobs: - name: Run Tests shell: bash run: | - cd ${{ github.workspace }}/build-matar-${{ matrix.config.kokkos_backend }}/test_cases + cd ${{ github.workspace }}/build-matar-${{ matrix.config.kokkos_backend }} ctest --output-on-failure - + diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 32238913..6026c9db 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -163,6 +163,9 @@ if (KOKKOS) include_directories(mesh_decomp) add_subdirectory(mesh_decomp) + + add_executable(matar_mpi matar_mpi.cpp) + target_link_libraries(matar_mpi ${LINKING_LIBRARIES}) endif() endif() diff --git a/examples/matar_mpi.cpp b/examples/matar_mpi.cpp new file mode 100644 index 00000000..8f616fbf --- /dev/null +++ b/examples/matar_mpi.cpp @@ -0,0 +1,301 @@ +// Example: basic MPI features in MATAR (requires HAVE_MPI and HAVE_KOKKOS). +// - MPICArrayKokkos: distributed array with optional halo exchange via communicate() +// - CommunicationPlan: MPI distributed graph + per-neighbor index lists +// - MPICArrayKokkos::all_reduce: 1D, and fixed trailing indices for rank 2/3/4 +// (mpi_type_map + mpi_op_for in mpi_types.h / communication_plan.h) + +#if !defined(HAVE_MPI) || !defined(HAVE_KOKKOS) + +#include + +int main() { + std::cerr + << "This example requires MATAR built with HAVE_MPI and HAVE_KOKKOS.\n"; + return 0; +} + +#else + +#include +#include + +#include +#include + +using namespace mtr; + +int main(int argc, char* argv[]) { + +MPI_Init(&argc, &argv); +MATAR_INITIALIZE(argc, argv); +{ + + int rank = 0; + int size = 1; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // Create a basic communication plan, this handles things like storing MPI world comms and + // allows for basic reduction operations. For more complex communication patterns, you can + // use the CommunicationPlan to build a more complex communication plan. + CommunicationPlan comm_plan; + comm_plan.initialize(MPI_COMM_WORLD); + + // ------------------------------------------------------------------------- + // MPICArrayKokkos::all_reduce — uses mpi_type_map for MPI_Datatype and + // mpi_op_for(operation) for MPI_Op (no CommunicationPlan required). + // ------------------------------------------------------------------------- + + // Same size arrays on each rank, just do a simple reduction. + const int num_values = 100; + MPICArrayKokkos locals(num_values, "values"); + locals.initialize_comm_plan(comm_plan); + locals.set_values(1.0); + locals.update_device(); + + double global_sum = locals.all_reduce(operation::sum); + + double expected_sum = static_cast(num_values * size); + if (rank == 0) { + std::cout << "all_reduce(sum): " << global_sum << " (expect " + << expected_sum << ")\n"; + } + + // Different size arrays on each rank, do a simple reduction. + const int num_values_per_rank = 10*(1+rank); + MPICArrayKokkos rank_locals(num_values_per_rank, "values"); + rank_locals.initialize_comm_plan(comm_plan); + rank_locals.set_values(1.0); + rank_locals.update_device(); + + global_sum = rank_locals.all_reduce(operation::sum); + // Rank r has 10*(r+1) entries of 1 → total = 10 * (1 + … + size). + expected_sum = 10.0 * static_cast(size * (size + 1) / 2); + + + if (rank == 0) { + std::cout << "all_reduce(sum) with different size arrays on each rank (10*rank_id): " << global_sum << " (expect " + << expected_sum << ")\n"; + } + + + // Different size arrays on each rank, find the minimum value + MPICArrayKokkos min_locals(num_values_per_rank, "values"); + min_locals.initialize_comm_plan(comm_plan); + + FOR_ALL(i, 0, num_values_per_rank, { + min_locals(i) = static_cast(10*rank + i); + }); + + min_locals.update_device(); + float global_min = min_locals.all_reduce(operation::min); + float expected_min = 0.0F; + if (rank == 0) { + std::cout << "all_reduce(min) with different size arrays on each rank (10*rank_id): " << global_min << " (expect " + << expected_min << ")\n"; + } + + float global_max = min_locals.all_reduce(operation::max); + // Largest entry is on rank (size-1) at i = 10*size - 1. + const float expected_max = static_cast(10 * (size - 1) + (10 * size - 1)); + if (rank == 0) { + std::cout << "all_reduce(max) with different size arrays on each rank (10*rank_id): " << global_max << " (expect " + << expected_max << ")\n"; + } + + + + // Example: all_reduce with product + // Initialize a MPICArrayKokkos with all values 2.0 + MPICArrayKokkos prod_locals(4, "prod_values"); + prod_locals.initialize_comm_plan(comm_plan); + prod_locals.set_values(2.0); + prod_locals.update_device(); + + // Compute the product across all ranks and all values + double global_product = prod_locals.all_reduce(operation::product); + + // The expected product is pow(2, 4*size), i.e., each rank contributes 4 twos + double expected_product = std::pow(2.0, 4 * size); + if (rank == 0) { + std::cout << "all_reduce(product): " << global_product << " (expect " + << expected_product << ")\n"; + } + + // ------------------------------------------------------------------------- + // all_reduce with fixed trailing indices (multi-dimensional arrays). + // ------------------------------------------------------------------------- + const size_t n_elem = size * 10; + + // Rank-2: element centroid coordinates — elem_centroids(elem_id, elem_position) + // with elem_position ∈ {0,1,2} as x, y, z. Reduce over elem_id for each axis. + { + size_t n_elem = 3; + size_t num_coords = 3; + MPICArrayKokkos elem_centroids(n_elem, num_coords,"elem_centroids"); + elem_centroids.initialize_comm_plan(comm_plan); + FOR_ALL(elem_id, 0, n_elem, + elem_position, 0, num_coords, { + const double base = 1000.0 * rank + 100.0 * elem_id; + elem_centroids(elem_id, elem_position) = + base + 10.0 * static_cast(elem_position); + }); + MATAR_FENCE(); + elem_centroids.update_device(); + + const double max_x = elem_centroids.all_reduce(operation::max, 0); + const double max_y = elem_centroids.all_reduce(operation::max, 1); + const double max_z = elem_centroids.all_reduce(operation::max, 2); + + const double base_rank = 1000.0 * static_cast(size - 1); + const double base_elem = 100.0 * static_cast(n_elem - 1); + const double expect_max_x = base_rank + base_elem + 0.0; + const double expect_max_y = base_rank + base_elem + 10.0; + const double expect_max_z = base_rank + base_elem + 20.0; + + if (rank == 0) { + std::cout << "all_reduce(max, coord) rank-2 centroids — max x: " << max_x + << " (expect " << expect_max_x << ")\n"; + std::cout << "all_reduce(max, coord) rank-2 centroids — max y: " << max_y + << " (expect " << expect_max_y << ")\n"; + std::cout << "all_reduce(max, coord) rank-2 centroids — max z: " << max_z + << " (expect " << expect_max_z << ")\n"; + } + } + + // Rank-3: reduce over e at fixed tensor component — e.g. stress(e, 0, 1). + { + // Rank-3: reduce over e at fixed tensor component — e.g. stress(e, 0, 1). + MPICArrayKokkos stress(n_elem, 3, 3, "stress"); + stress.initialize_comm_plan(comm_plan); + FOR_ALL(e, 0, n_elem, r, 0, 3, c, 0, 3, { + stress(e, r, c) = + 10000.0 * rank + 1000.0 * e + 100.0 * r + c; + }); + MATAR_FENCE(); + stress.update_device(); + const double max_comp = stress.all_reduce(operation::max, 0, 1); + + const double expect_3d = + 10000.0 * static_cast(size - 1) + + 1000.0 * static_cast(n_elem - 1) + 1.0; + if (rank == 0) { + std::cout << "all_reduce(max, i, j) rank-3 s(e,i,j): " << max_comp + << " (expect " << expect_3d << ")\n"; + } + } + + // Rank-4: reduce over element at fixed Gauss point and tensor component. + { + const size_t n_gauss = 2; + MPICArrayKokkos s4(n_elem, n_gauss, 3, 3, "s4"); + s4.initialize_comm_plan(comm_plan); + for (size_t e = 0; e < n_elem; ++e) { + for (size_t g = 0; g < n_gauss; ++g) { + for (size_t r = 0; r < 3; ++r) { + for (size_t c = 0; c < 3; ++c) { + s4.host(e, g, r, c) = + 100000.0 * rank + 1000.0 * static_cast(e) + + 100.0 * static_cast(g) + + 10.0 * static_cast(r) + + static_cast(c); + } + } + } + } + s4.update_device(); + const size_t g_fix = 1; + const size_t ti_fix = 0; + const size_t tj_fix = 1; + const double max_qp = + s4.all_reduce(operation::max, g_fix, ti_fix, tj_fix); + const double expect_4d = + 100000.0 * static_cast(size - 1) + + 1000.0 * static_cast(n_elem - 1) + 101.0; + if (rank == 0) { + std::cout << "all_reduce(max, g, i, j) rank-4 stress(e,g,i,j): " + << max_qp << " (expect " << expect_4d << ")\n"; + } + } + + // ------------------------------------------------------------------------- + // CommunicationPlan + communicate(): periodic 1D halo (needs 2+ ranks). + // Layout per rank: index 0 = left ghost, 1..L = owned, L+1 = right ghost. + // ------------------------------------------------------------------------- + // if (size < 2) { + // if (rank == 0) { + // std::cout + // << "Re-run with 2+ MPI processes to run the halo exchange demo.\n"; + // } + // } else { + // const int L = 4; + // const int left = (rank + size - 1) % size; + // const int right = (rank + 1) % size; + + // CommunicationPlan comm_plan; + // comm_plan.initialize(MPI_COMM_WORLD); + + // int send_ranks[2] = { left, right }; + // int recv_ranks[2] = { left, right }; + // comm_plan.initialize_graph_communicator(2, send_ranks, 2, recv_ranks); + + // DCArrayKokkos send_strides(2, "send_strides"); + // send_strides.host(0) = 1; + // send_strides.host(1) = 1; + // send_strides.update_device(); + // DRaggedRightArrayKokkos rank_send_ids(send_strides, + // "rank_send_ids"); + // // Send first owned cell to the left neighbor; last owned to the right. + // rank_send_ids.host(0, 0) = 1; + // rank_send_ids.host(1, 0) = L; + // rank_send_ids.update_device(); + + // DCArrayKokkos recv_strides(2, "recv_strides"); + // recv_strides.host(0) = 1; + // recv_strides.host(1) = 1; + // recv_strides.update_device(); + // DRaggedRightArrayKokkos rank_recv_ids(recv_strides, + // "rank_recv_ids"); + // rank_recv_ids.host(0, 0) = 0; + // rank_recv_ids.host(1, 0) = L + 1; + // rank_recv_ids.update_device(); + + // MATAR_FENCE(); + // comm_plan.setup_send_recv(rank_send_ids, rank_recv_ids); + + // MPICArrayKokkos field(static_cast(L + 2), "field"); + // field.initialize_comm_plan(comm_plan); + + // FOR_ALL(i, 0, L + 2, { + // field(i) = -1.0; + // }); + // FOR_ALL(i, 1, L + 1, { field(i) = static_cast(rank); }); + // MATAR_FENCE(); + + // field.communicate(); + + // field.update_host(); + // MATAR_FENCE(); + + // const double gl = field.host(0); + // const double gr = field.host(L + 1); + // const bool halo_ok = + // (std::fabs(gl - static_cast(left)) < 1.0e-14) && + // (std::fabs(gr - static_cast(right)) < 1.0e-14); + + // if (rank == 0) { + // std::cout << "After halo exchange (periodic 1D): "; + // if (halo_ok) { + // std::cout << "ghost values match neighbor ranks.\n"; + // } else { + // std::cout << "verification failed.\n"; + // } + // } + // } +} +MATAR_FINALIZE(); +MPI_Finalize(); +return 0; +} + +#endif diff --git a/scripts/cmake_build_test.sh b/scripts/cmake_build_test.sh index 21e9a62b..07d2e987 100644 --- a/scripts/cmake_build_test.sh +++ b/scripts/cmake_build_test.sh @@ -32,6 +32,10 @@ else ) fi +if [[ "${kokkos_build_type}" == *"mpi"* ]]; then + cmake_options+=(-D MATAR_TEST_USE_MPI=1) +fi + # Print CMake options for reference echo "CMake Options: ${cmake_options[@]}" diff --git a/src/include/communication_plan.h b/src/include/communication_plan.h index 0f642b2e..619ad158 100644 --- a/src/include/communication_plan.h +++ b/src/include/communication_plan.h @@ -7,14 +7,38 @@ #include -using namespace mtr; - +namespace mtr +{ enum class communication_plan_type { no_communication, all_to_all_graph }; +/// Reduction operation for @c MPICArrayKokkos::all_reduce (maps to @c MPI_Op for the MPI stage). +enum class operation { + sum, + product, + max, + min +}; + +/// Map @ref operation to the corresponding @c MPI_Op (host-only; call after @c MPI_Init). +inline MPI_Op mpi_op_for(operation op) { + switch (op) { + case operation::sum: + return MPI_SUM; + case operation::product: + return MPI_PROD; + case operation::max: + return MPI_MAX; + case operation::min: + return MPI_MIN; + default: + return MPI_SUM; + } +} + struct CommunicationPlan { @@ -65,7 +89,6 @@ struct CommunicationPlan { DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank DCArrayKokkos recv_counts_; // [size: num_recv_ranks] Number of items to receive from each rank - DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank @@ -91,8 +114,7 @@ struct CommunicationPlan { MPI_Comm_free(&mpi_comm_graph); } } - - + void initialize(MPI_Comm comm_world){ int mpi_init = 0; MPI_Initialized(&mpi_init); @@ -512,6 +534,8 @@ struct CommunicationPlan { } }; // End of CommunicationPlan +} // end namespace mtr + #endif // end if HAVE_MPI #endif // end if COMMUNICATION_PLAN_H diff --git a/src/include/mpi_types.h b/src/include/mpi_types.h index bc00d852..78c02f4c 100644 --- a/src/include/mpi_types.h +++ b/src/include/mpi_types.h @@ -87,16 +87,11 @@ class MPICArrayKokkos { size_t length_ = 0; size_t order_ = 0; // tensor order (rank) - MPI_Comm mpi_comm_; + MPI_Comm mpi_comm_ = MPI_COMM_NULL; MPI_Status mpi_status_; MPI_Datatype mpi_datatype_; MPI_Request mpi_request_; - - // --- Ghost Communication Support --- - CommunicationPlan* comm_plan_ = NULL; // Pointer to shared communication plan - - DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank DCArrayKokkos recv_counts_; // [size: num_recv_ranks] Number of items to receive from each rank DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank @@ -109,10 +104,15 @@ class MPICArrayKokkos { DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_by_rank] Indices of items to receive from each rank - size_t num_owned_; // Number of owned items (nodes/elements) - size_t num_ghost_; // Number of ghost items (nodes/elements) + size_t num_owned_ = 0; // Number of owned items (nodes/elements); optional override + size_t num_ghost_ = 0; // Number of ghost items (nodes/elements); informational when user-set + public: + + // --- Ghost Communication Support --- + CommunicationPlan* comm_plan_ = NULL; // Pointer to shared communication plan + // Data member to access host view (initialized as pointer to this_array_.host_pointer()) ViewCArray host; @@ -358,6 +358,20 @@ class MPICArrayKokkos { MATAR_FENCE(); }; + /// Reduce over index 0 (e.g. elements); array must be rank 1. + T all_reduce(operation op); + + /// Reduce over leading dimension only; fix one trailing index (rank-2 arrays). + T all_reduce(operation op, size_t j); + + /// Reduce over leading dimension only; fix two trailing indices (e.g. + /// stress(elem,3,3) → all_reduce(op,0,1) is max of stress(e,0,1) over owned elements e). + T all_reduce(operation op, size_t j, size_t k); + + /// Reduce over elements only; fix Gauss point and tensor indices (rank-4), e.g. + /// stress(elem, gauss, i, j) → all_reduce(op, g, i, j) over owned elem. + T all_reduce(operation op, size_t g, size_t ti, size_t tj); + void set_values(const T& value){ this_array_.set_values(value); }; @@ -635,12 +649,286 @@ void MPICArrayKokkos::update_device() { this_array_.update_device(); } + + template -KOKKOS_INLINE_FUNCTION -MPICArrayKokkos::~MPICArrayKokkos() { +T MPICArrayKokkos::all_reduce(operation op) { + + // assert(order_ == 1 && stride_ == 1 && "MPICArrayKokkos::all_reduce requires a 1D array (stride 1)."); + assert(!(op == operation::product && sizeof(T) == sizeof(bool)) && + "MPICArrayKokkos::all_reduce: product reduction is not supported for bool."); + + + assert(num_owned_ <= dims_[0] && "MPICArrayKokkos::all_reduce: num_owned exceeds dim0."); + const size_t owned_len = (num_owned_ > 0) ? num_owned_ : dims_[0]; + assert(owned_len > 0 && "MPICArrayKokkos::all_reduce: empty reduction range."); + + // Local varaible for on node reduction + T local; + + if (comm_plan_ == nullptr || comm_plan_->comm_type == communication_plan_type::no_communication) { + + switch (op) { + case operation::sum: { + local = 0; + T loc_sum = 0; + FOR_REDUCE_SUM(i, 0, owned_len, + loc_sum, { + loc_sum += this_array_(i); + }, local); + break; + } + case operation::product: { + local = T(1); + T loc_prod = 1; + FOR_REDUCE_PRODUCT(i, 0, owned_len, + loc_prod, { + loc_prod *= this_array_(i); + }, local); + break; + } + case operation::max: { + local = this_array_(0); + T loc_max = this_array_(0); + FOR_REDUCE_MAX(i, 0, owned_len, + loc_max, { + loc_max = (this_array_(i) > loc_max) ? this_array_(i) : loc_max; + }, local); + break; + } + case operation::min: { + local = this_array_(0); + T loc_min = this_array_(0); + FOR_REDUCE_MIN(i, 0, owned_len, + loc_min, { + loc_min = (this_array_(i) < loc_min) ? this_array_(i) : loc_min; + }, local); + break; + } + default: + printf("MPICArrayKokkos::all_reduce: unsupported operation %d\n", op); + printf("Supported operations are: sum, product, max, min\n"); + Kokkos::abort("MPICArrayKokkos::all_reduce: unsupported operation"); + break; + } + } else { + printf("MPICArrayKokkos::all_reduce: communication plan requires info on ghost vs owned\n"); + } + T global = local; + MPI_Comm comm = MPI_COMM_WORLD; + if (comm_plan_ != nullptr && comm_plan_->has_comm_world) { + comm = comm_plan_->mpi_comm_world; + } + MPI_Allreduce(&local, &global, 1, mpi_type_map::value(), mpi_op_for(op), comm); + return global; } +template +T MPICArrayKokkos::all_reduce(operation op, size_t j) { + assert(order_ == 2 && "MPICArrayKokkos::all_reduce(op,j) requires a rank-2 array."); + assert(j < dims_[1] && "Fixed index j is out of bounds."); + assert(!(op == operation::product && sizeof(T) == sizeof(bool)) && + "MPICArrayKokkos::all_reduce: product reduction is not supported for bool."); + assert(num_owned_ <= dims_[0] && "MPICArrayKokkos::all_reduce: num_owned exceeds dim0."); + const size_t owned_len = (num_owned_ > 0) ? num_owned_ : dims_[0]; + assert(owned_len > 0 && "MPICArrayKokkos::all_reduce: empty reduction range."); + + T local; + if (comm_plan_ == nullptr || comm_plan_->comm_type == communication_plan_type::no_communication) { + switch (op) { + case operation::sum: { + local = 0; + T loc_sum = 0; + FOR_REDUCE_SUM(e, 0, owned_len, loc_sum, { + loc_sum += this_array_(e, j); + }, local); + break; + } + case operation::product: { + local = T(1); + T loc_prod = 1; + FOR_REDUCE_PRODUCT(e, 0, owned_len, loc_prod, { + loc_prod *= this_array_(e, j); + }, local); + break; + } + case operation::max: { + local = this_array_(0, j); + T loc_max = local; + FOR_REDUCE_MAX(e, 0, owned_len, loc_max, { + const T v = this_array_(e, j); + loc_max = (v > loc_max) ? v : loc_max; + }, local); + break; + } + case operation::min: { + local = this_array_(0, j); + T loc_min = local; + FOR_REDUCE_MIN(e, 0, owned_len, loc_min, { + const T v = this_array_(e, j); + loc_min = (v < loc_min) ? v : loc_min; + }, local); + break; + } + default: + printf("MPICArrayKokkos::all_reduce: unsupported operation %d\n", op); + printf("Supported operations are: sum, product, max, min\n"); + Kokkos::abort("MPICArrayKokkos::all_reduce: unsupported operation"); + break; + } + } else { + printf("MPICArrayKokkos::all_reduce: communication plan requires info on ghost vs owned\n"); + } + + T global = local; + MPI_Comm mpi_comm = MPI_COMM_WORLD; + if (comm_plan_ != nullptr && comm_plan_->has_comm_world) { + mpi_comm = comm_plan_->mpi_comm_world; + } + MPI_Allreduce(&local, &global, 1, mpi_type_map::value(), mpi_op_for(op), mpi_comm); + return global; +} + +template +T MPICArrayKokkos::all_reduce(operation op, size_t j, size_t k) { + assert(order_ == 3 && "MPICArrayKokkos::all_reduce(op,j,k) requires a rank-3 array."); + assert(j < dims_[1] && k < dims_[2] && "Fixed tensor indices (j,k) are out of bounds."); + assert(!(op == operation::product && sizeof(T) == sizeof(bool)) && + "MPICArrayKokkos::all_reduce: product reduction is not supported for bool."); + assert(num_owned_ <= dims_[0] && "MPICArrayKokkos::all_reduce: num_owned exceeds dim0."); + const size_t owned_len = (num_owned_ > 0) ? num_owned_ : dims_[0]; + assert(owned_len > 0 && "MPICArrayKokkos::all_reduce: empty reduction range."); + + T local; + if (comm_plan_ == nullptr || comm_plan_->comm_type == communication_plan_type::no_communication) { + switch (op) { + case operation::sum: { + local = 0; + T loc_sum = 0; + FOR_REDUCE_SUM(e, 0, owned_len, loc_sum, { + loc_sum += this_array_(e, j, k); + }, local); + break; + } + case operation::product: { + local = T(1); + T loc_prod = 1; + FOR_REDUCE_PRODUCT(e, 0, owned_len, loc_prod, { + loc_prod *= this_array_(e, j, k); + }, local); + break; + } + case operation::max: { + local = this_array_(0, j, k); + T loc_max = local; + FOR_REDUCE_MAX(e, 0, owned_len, loc_max, { + const T v = this_array_(e, j, k); + loc_max = (v > loc_max) ? v : loc_max; + }, local); + break; + } + case operation::min: { + local = this_array_(0, j, k); + T loc_min = local; + FOR_REDUCE_MIN(e, 0, owned_len, loc_min, { + const T v = this_array_(e, j, k); + loc_min = (v < loc_min) ? v : loc_min; + }, local); + break; + } + default: + printf("MPICArrayKokkos::all_reduce: unsupported operation %d\n", op); + printf("Supported operations are: sum, product, max, min\n"); + Kokkos::abort("MPICArrayKokkos::all_reduce: unsupported operation"); + break; + } + } else { + printf("MPICArrayKokkos::all_reduce: communication plan requires info on ghost vs owned\n"); + } + + T global = local; + MPI_Comm mpi_comm = MPI_COMM_WORLD; + if (comm_plan_ != nullptr && comm_plan_->has_comm_world) { + mpi_comm = comm_plan_->mpi_comm_world; + } + MPI_Allreduce(&local, &global, 1, mpi_type_map::value(), mpi_op_for(op), mpi_comm); + return global; +} + +template +T MPICArrayKokkos::all_reduce(operation op, size_t g, size_t ti, + size_t tj) { + assert(order_ == 4 && "MPICArrayKokkos::all_reduce(op,g,ti,tj) requires a rank-4 array."); + assert(g < dims_[1] && ti < dims_[2] && tj < dims_[3] && + "Fixed indices (Gauss, tensor i, j) are out of bounds."); + assert(!(op == operation::product && sizeof(T) == sizeof(bool)) && + "MPICArrayKokkos::all_reduce: product reduction is not supported for bool."); + assert(num_owned_ <= dims_[0] && "MPICArrayKokkos::all_reduce: num_owned exceeds dim0."); + const size_t owned_len = (num_owned_ > 0) ? num_owned_ : dims_[0]; + assert(owned_len > 0 && "MPICArrayKokkos::all_reduce: empty reduction range."); + + T local; + if (comm_plan_ == nullptr || comm_plan_->comm_type == communication_plan_type::no_communication) { + switch (op) { + case operation::sum: { + local = 0; + T loc_sum = 0; + FOR_REDUCE_SUM(e, 0, owned_len, loc_sum, { + loc_sum += this_array_(e, g, ti, tj); + }, local); + break; + } + case operation::product: { + local = T(1); + T loc_prod = 1; + FOR_REDUCE_PRODUCT(e, 0, owned_len, loc_prod, { + loc_prod *= this_array_(e, g, ti, tj); + }, local); + break; + } + case operation::max: { + local = this_array_(0, g, ti, tj); + T loc_max = local; + FOR_REDUCE_MAX(e, 0, owned_len, loc_max, { + const T v = this_array_(e, g, ti, tj); + loc_max = (v > loc_max) ? v : loc_max; + }, local); + break; + } + case operation::min: { + local = this_array_(0, g, ti, tj); + T loc_min = local; + FOR_REDUCE_MIN(e, 0, owned_len, loc_min, { + const T v = this_array_(e, g, ti, tj); + loc_min = (v < loc_min) ? v : loc_min; + }, local); + break; + } + default: + printf("MPICArrayKokkos::all_reduce: unsupported operation %d\n", op); + printf("Supported operations are: sum, product, max, min\n"); + Kokkos::abort("MPICArrayKokkos::all_reduce: unsupported operation"); + break; + } + } else { + printf("MPICArrayKokkos::all_reduce: communication plan requires info on ghost vs owned\n"); + } + + T global = local; + MPI_Comm mpi_comm = MPI_COMM_WORLD; + if (comm_plan_ != nullptr && comm_plan_->has_comm_world) { + mpi_comm = comm_plan_->mpi_comm_world; + } + MPI_Allreduce(&local, &global, 1, mpi_type_map::value(), mpi_op_for(op), mpi_comm); + return global; +} + + +template +KOKKOS_INLINE_FUNCTION +MPICArrayKokkos::~MPICArrayKokkos() {} + } // end namespace mtr diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e6c2bfaf..55a0a7d4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,6 +2,16 @@ cmake_minimum_required(VERSION 3.5) project (matartest) +# Required at top level so CMake generates ${CMAKE_BINARY_DIR}/CTestTestfile.cmake. +# Without this, ctest from the build root finds no tests (enable_testing in a subdirectory is not enough). +enable_testing() + +# MATAR built with MPI lists MPI::MPI_CXX on imported target `matar`; MatarConfig loads targets during +# find_package(Matar). Only pre-find MPI for those test builds (see cmake_build_test.sh). +option(MATAR_TEST_USE_MPI "Configure tests against MPI-enabled MATAR (requires MPI before find_package)" OFF) +if(MATAR_TEST_USE_MPI) + find_package(MPI REQUIRED COMPONENTS CXX) +endif() find_package(Matar REQUIRED) #------------------------------------------- diff --git a/test/test_cases/CMakeLists.txt b/test/test_cases/CMakeLists.txt index a0e07edd..b9a7ee07 100644 --- a/test/test_cases/CMakeLists.txt +++ b/test/test_cases/CMakeLists.txt @@ -1,7 +1,10 @@ cmake_minimum_required(VERSION 3.5) -# Find all test files in the current directory except test_main.cpp +# Optional MPI for matar_mpi_tests: loaded in ../CMakeLists.txt when MATAR_TEST_USE_MPI is set. + +# Find all test files in the current directory except test_main.cpp and MPI-only sources. file(GLOB TEST_SOURCES "test_*.cpp") +list(REMOVE_ITEM TEST_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/test_mpi_types.cpp") # Create a single test executable that includes all test files add_executable(matar_tests test_main.cpp ${TEST_SOURCES}) @@ -14,6 +17,18 @@ endif() # Add test discovery include(GoogleTest) -enable_testing() gtest_discover_tests(matar_tests) + +# --- MPI + Kokkos: MPICArrayKokkos / CommunicationPlan (run suite under mpirun) --- +if(KOKKOS AND TARGET MPI::MPI_CXX) + add_executable(matar_mpi_tests mpi_test_main.cpp test_mpi_types.cpp) + target_link_libraries(matar_mpi_tests PRIVATE matar gtest Kokkos::kokkos MPI::MPI_CXX) + target_compile_definitions(matar_mpi_tests PRIVATE HAVE_MPI=1 HAVE_KOKKOS=1) + + # --oversubscribe: GitHub Actions / small VMs often expose fewer slots than ranks (Open MPI). + add_test( + NAME MPICArrayKokkos_mpi_suite + COMMAND ${MPIEXEC_EXECUTABLE} --oversubscribe ${MPIEXEC_NUMPROC_FLAG} 4 $ + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +endif() diff --git a/test/test_cases/mpi_test_main.cpp b/test/test_cases/mpi_test_main.cpp new file mode 100644 index 00000000..570e6ed6 --- /dev/null +++ b/test/test_cases/mpi_test_main.cpp @@ -0,0 +1,15 @@ +#include +#include + +#include + +// MPI must initialize before Kokkos (MATAR_INITIALIZE). +int main(int argc, char** argv) { + MPI_Init(&argc, &argv); + MATAR_INITIALIZE(argc, argv); + ::testing::InitGoogleTest(&argc, argv); + const int result = RUN_ALL_TESTS(); + MATAR_FINALIZE(); + MPI_Finalize(); + return result; +} diff --git a/test/test_cases/test_mpi_types.cpp b/test/test_cases/test_mpi_types.cpp new file mode 100644 index 00000000..3c262411 --- /dev/null +++ b/test/test_cases/test_mpi_types.cpp @@ -0,0 +1,208 @@ +// Google tests for MPICArrayKokkos / CommunicationPlan (HAVE_MPI + HAVE_KOKKOS). +// Run the mpi_test_main executable under mpirun (see test_cases/CMakeLists.txt). + +#if !defined(HAVE_MPI) || !defined(HAVE_KOKKOS) + +#include + +TEST(MPI_Types, SkippedWithoutMpiKokkos) { + GTEST_SKIP() << "Build MATAR tests with MPI and Kokkos enabled."; +} + +#else + +#include +#include + +#include +#include + +using namespace mtr; + +namespace { + +void mpi_rank_size(int* rank, int* size) { + MPI_Comm_rank(MPI_COMM_WORLD, rank); + MPI_Comm_size(MPI_COMM_WORLD, size); +} + +} // namespace + +TEST(MPICArrayKokkos, AllReduce_Sum_1D) { + int rank = 0; + int size = 1; + mpi_rank_size(&rank, &size); + + CommunicationPlan comm_plan; + comm_plan.initialize(MPI_COMM_WORLD); + + const int num_values = 100; + MPICArrayKokkos locals(num_values, "ut_values"); + locals.initialize_comm_plan(comm_plan); + locals.set_values(1.0); + locals.update_device(); + + const double global_sum = locals.all_reduce(operation::sum); + const double expected = static_cast(num_values * size); + EXPECT_DOUBLE_EQ(global_sum, expected); +} + +TEST(MPICArrayKokkos, AllReduce_Sum_VariableLengthPerRank) { + int rank = 0; + int size = 1; + mpi_rank_size(&rank, &size); + + CommunicationPlan comm_plan; + comm_plan.initialize(MPI_COMM_WORLD); + + const int num_values_per_rank = 10 * (1 + rank); + MPICArrayKokkos rank_locals(num_values_per_rank, "ut_varlen"); + rank_locals.initialize_comm_plan(comm_plan); + rank_locals.set_values(1.0); + rank_locals.update_device(); + + const double global_sum = rank_locals.all_reduce(operation::sum); + const double expected = + 10.0 * static_cast(size * (size + 1) / 2); + EXPECT_DOUBLE_EQ(global_sum, expected); +} + +TEST(MPICArrayKokkos, AllReduce_MinMax_1D) { + int rank = 0; + int size = 1; + mpi_rank_size(&rank, &size); + + CommunicationPlan comm_plan; + comm_plan.initialize(MPI_COMM_WORLD); + + const int num_values_per_rank = 10 * (1 + rank); + MPICArrayKokkos vals(num_values_per_rank, "ut_minmax"); + vals.initialize_comm_plan(comm_plan); + + FOR_ALL(i, 0, num_values_per_rank, { + vals(i) = static_cast(10 * rank + i); + }); + MATAR_FENCE(); + vals.update_device(); + + const float global_min = vals.all_reduce(operation::min); + const float global_max = vals.all_reduce(operation::max); + EXPECT_FLOAT_EQ(global_min, 0.0F); + const float expected_max = + static_cast(10 * (size - 1) + (10 * size - 1)); + EXPECT_FLOAT_EQ(global_max, expected_max); +} + +TEST(MPICArrayKokkos, AllReduce_Product) { + int rank = 0; + int size = 1; + mpi_rank_size(&rank, &size); + + CommunicationPlan comm_plan; + comm_plan.initialize(MPI_COMM_WORLD); + + MPICArrayKokkos prod_locals(4, "ut_prod"); + prod_locals.initialize_comm_plan(comm_plan); + prod_locals.set_values(2.0); + prod_locals.update_device(); + + const double global_product = prod_locals.all_reduce(operation::product); + const double expected = std::pow(2.0, 4 * size); + EXPECT_DOUBLE_EQ(global_product, expected); +} + +TEST(MPICArrayKokkos, AllReduce_Rank2_CentroidXYZ) { + int rank = 0; + int size = 1; + mpi_rank_size(&rank, &size); + + CommunicationPlan comm_plan; + comm_plan.initialize(MPI_COMM_WORLD); + + const size_t n_elem = 3; + constexpr int num_coords = 3; + + MPICArrayKokkos elem_centroids(n_elem, static_cast(num_coords), + "ut_centroids"); + elem_centroids.initialize_comm_plan(comm_plan); + FOR_ALL(elem_id, 0, n_elem, elem_position, 0, num_coords, { + const double base = 1000.0 * rank + 100.0 * elem_id; + elem_centroids(elem_id, elem_position) = + base + 10.0 * static_cast(elem_position); + }); + MATAR_FENCE(); + elem_centroids.update_device(); + + const double max_x = elem_centroids.all_reduce(operation::max, 0U); + const double max_y = elem_centroids.all_reduce(operation::max, 1U); + const double max_z = elem_centroids.all_reduce(operation::max, 2U); + + const double base_rank = 1000.0 * static_cast(size - 1); + const double base_elem = 100.0 * static_cast(n_elem - 1); + EXPECT_DOUBLE_EQ(max_x, base_rank + base_elem + 0.0); + EXPECT_DOUBLE_EQ(max_y, base_rank + base_elem + 10.0); + EXPECT_DOUBLE_EQ(max_z, base_rank + base_elem + 20.0); +} + +TEST(MPICArrayKokkos, AllReduce_Rank3_StressComponent) { + int rank = 0; + int size = 1; + mpi_rank_size(&rank, &size); + + CommunicationPlan comm_plan; + comm_plan.initialize(MPI_COMM_WORLD); + + const size_t n_elem = 3; + MPICArrayKokkos stress(n_elem, 3, 3, "ut_stress"); + stress.initialize_comm_plan(comm_plan); + FOR_ALL(e, 0, n_elem, r, 0, 3, c, 0, 3, { + stress(e, r, c) = 10000.0 * rank + 1000.0 * e + 100.0 * r + c; + }); + MATAR_FENCE(); + stress.update_device(); + + const double max_comp = + stress.all_reduce(operation::max, static_cast(0), + static_cast(1)); + const double expected = 10000.0 * static_cast(size - 1) + + 1000.0 * static_cast(n_elem - 1) + 1.0; + EXPECT_DOUBLE_EQ(max_comp, expected); +} + +TEST(MPICArrayKokkos, AllReduce_Rank4_GaussStressComponent) { + int rank = 0; + int size = 1; + mpi_rank_size(&rank, &size); + + CommunicationPlan comm_plan; + comm_plan.initialize(MPI_COMM_WORLD); + + const size_t n_elem = 3; + const size_t n_gauss = 2; + + MPICArrayKokkos s4(n_elem, n_gauss, 3, 3, "ut_s4"); + s4.initialize_comm_plan(comm_plan); + for (size_t e = 0; e < n_elem; ++e) { + for (size_t g = 0; g < n_gauss; ++g) { + for (size_t r = 0; r < 3; ++r) { + for (size_t c = 0; c < 3; ++c) { + s4.host(e, g, r, c) = + 100000.0 * rank + 1000.0 * static_cast(e) + + 100.0 * static_cast(g) + + 10.0 * static_cast(r) + + static_cast(c); + } + } + } + } + s4.update_device(); + + const double max_qp = + s4.all_reduce(operation::max, static_cast(1), + static_cast(0), static_cast(1)); + const double expected = 100000.0 * static_cast(size - 1) + + 1000.0 * static_cast(n_elem - 1) + 101.0; + EXPECT_DOUBLE_EQ(max_qp, expected); +} + +#endif // HAVE_MPI && HAVE_KOKKOS