From d744256fe2b4f03aab181f9ef01c4acf41186666 Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Sun, 3 May 2026 23:34:50 +0000 Subject: [PATCH] replace explicit host copy with `Kokkos::fence()` to fix gpu aware mpi issues --- CMakeLists.txt | 13 ----- cmake/defaults.cmake | 12 ----- cmake/report.cmake | 15 ------ src/framework/containers/particles_comm.cpp | 56 ++++----------------- src/framework/domain/comm_mpi.hpp | 52 ++++--------------- src/global/utils/reporter.cpp | 13 ----- src/global/utils/reporter.h | 1 - 7 files changed, 21 insertions(+), 141 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1c4204843..befd9e90c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,10 +54,6 @@ set(mpi ${default_mpi} CACHE BOOL "Use MPI") -set(gpu_aware_mpi - ${default_gpu_aware_mpi} - CACHE BOOL "Enable GPU-aware MPI") - # -------------------------- Compilation settings -------------------------- # set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -142,15 +138,6 @@ if(${mpi}) include_directories(${MPI_CXX_INCLUDE_PATH}) add_compile_options("-D MPI_ENABLED") set(DEPENDENCIES ${DEPENDENCIES} MPI::MPI_CXX) - if(${DEVICE_ENABLED}) - if(${gpu_aware_mpi}) - add_compile_options("-D GPU_AWARE_MPI") - endif() - else() - set(gpu_aware_mpi - OFF - CACHE BOOL "Use explicit copy when using MPI + GPU") - endif() endif() # Output diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake index fb8790019..db26cdc82 100644 --- a/cmake/defaults.cmake +++ b/cmake/defaults.cmake @@ -80,15 +80,3 @@ else() endif() set_property(CACHE default_mpi PROPERTY TYPE BOOL) - -if(DEFINED ENV{Entity_ENABLE_GPU_AWARE_MPI}) - set(default_gpu_aware_mpi - $ENV{Entity_ENABLE_GPU_AWARE_MPI} - CACHE INTERNAL "Default flag for GPU-aware MPI") -else() - set(default_gpu_aware_mpi - ON - CACHE INTERNAL "Default flag for GPU-aware MPI") -endif() - -set_property(CACHE default_gpu_aware_mpi PROPERTY TYPE BOOL) diff --git a/cmake/report.cmake b/cmake/report.cmake index 7e4779468..dc3b0f296 100644 --- a/cmake/report.cmake +++ b/cmake/report.cmake @@ -111,17 +111,6 @@ printchoices( "${Green}" MPI_REPORT 46) -if(${mpi} AND ${DEVICE_ENABLED}) - printchoices( - "GPU-aware MPI" - "gpu_aware_mpi" - "${ON_OFF_VALUES}" - ${gpu_aware_mpi} - OFF - "${Green}" - GPU_AWARE_MPI_REPORT - 46) -endif() printchoices( "Debug mode" "DEBUG" @@ -193,10 +182,6 @@ string( ${MPI_REPORT} "\n") -if(${mpi} AND ${DEVICE_ENABLED}) - string(APPEND REPORT_TEXT " " ${GPU_AWARE_MPI_REPORT} "\n") -endif() - string( APPEND REPORT_TEXT diff --git a/src/framework/containers/particles_comm.cpp b/src/framework/containers/particles_comm.cpp index 8fe9d1ec5..08aecfdb4 100644 --- a/src/framework/containers/particles_comm.cpp +++ b/src/framework/containers/particles_comm.cpp @@ -28,7 +28,9 @@ namespace ntt { npart_t nsend, npart_t nrecv, npart_t offset) { -#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) +#if defined(DEVICE_ENABLED) + Kokkos::fence(); +#endif MPI_Sendrecv(send_arr.data(), nsend, mpi::get_type(), @@ -41,27 +43,6 @@ namespace ntt { 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); -#else - const auto slice = std::make_pair(offset, offset + nrecv); - - auto send_arr_h = Kokkos::create_mirror_view(send_arr); - auto recv_arr_h = Kokkos::create_mirror_view( - Kokkos::subview(recv_arr, slice)); - Kokkos::deep_copy(send_arr_h, send_arr); - MPI_Sendrecv(send_arr_h.data(), - nsend, - mpi::get_type(), - send_rank, - 0, - recv_arr_h.data(), - nrecv, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(Kokkos::subview(recv_arr, slice), recv_arr_h); -#endif } void send_recv_count(int send_rank, @@ -98,18 +79,17 @@ namespace ntt { template void send(array_t& send_arr, int send_rank, npart_t nsend) { -#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) - MPI_Send(send_arr.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); -#else - auto send_arr_h = Kokkos::create_mirror_view(send_arr); - Kokkos::deep_copy(send_arr_h, send_arr); - MPI_Send(send_arr_h.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); +#if defined(DEVICE_ENABLED) + Kokkos::fence(); #endif + MPI_Send(send_arr.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); } template void recv(array_t& recv_arr, int recv_rank, npart_t nrecv, npart_t offset) { -#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) +#if defined(DEVICE_ENABLED) + Kokkos::fence(); +#endif MPI_Recv(recv_arr.data() + offset, nrecv, mpi::get_type(), @@ -117,20 +97,6 @@ namespace ntt { 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); -#else - const auto slice = std::make_pair(offset, offset + nrecv); - - auto recv_arr_h = Kokkos::create_mirror_view( - Kokkos::subview(recv_arr, slice)); - MPI_Recv(recv_arr_h.data(), - nrecv, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(Kokkos::subview(recv_arr, slice), recv_arr_h); -#endif } template @@ -232,8 +198,8 @@ namespace ntt { // buffers to store recv data const auto npart_recv = std::accumulate(npptag_recv_vec.begin(), - npptag_recv_vec.end(), - static_cast(0)); + npptag_recv_vec.end(), + static_cast(0)); array_t recv_buff_int { "recv_buff_int", npart_recv * NINTS }; array_t recv_buff_real { "recv_buff_real", npart_recv * NREALS }; array_t recv_buff_prtldx { "recv_buff_prtldx", npart_recv * NPRTLDX }; diff --git a/src/framework/domain/comm_mpi.hpp b/src/framework/domain/comm_mpi.hpp index 266c844c5..6e4b1a318 100644 --- a/src/framework/domain/comm_mpi.hpp +++ b/src/framework/domain/comm_mpi.hpp @@ -33,7 +33,9 @@ namespace comm { int recv_rank, ncells_t nsend, ncells_t nrecv) { -#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) +#if defined(DEVICE_ENABLED) + Kokkos::fence(); +#endif MPI_Sendrecv(send_arr.data(), nsend, mpi::get_type(), @@ -46,45 +48,22 @@ namespace comm { 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); -#else - auto send_arr_h = Kokkos::create_mirror_view(send_arr); - auto recv_arr_h = Kokkos::create_mirror_view(recv_arr); - Kokkos::deep_copy(send_arr_h, send_arr); - MPI_Sendrecv(send_arr_h.data(), - nsend, - mpi::get_type(), - send_rank, - 0, - recv_arr_h.data(), - nrecv, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_arr, recv_arr_h); -#endif } template void send(ndarray_t& send_arr, int send_rank, ncells_t nsend) { -#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) - MPI_Send(send_arr.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); -#else - auto send_arr_h = Kokkos::create_mirror_view(send_arr); - Kokkos::deep_copy(send_arr_h, send_arr); - MPI_Send(send_arr_h.data(), - nsend, - mpi::get_type(), - send_rank, - 0, - MPI_COMM_WORLD); +#if defined(DEVICE_ENABLED) + Kokkos::fence(); #endif + MPI_Send(send_arr.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); + } template void recv(ndarray_t& recv_arr, int recv_rank, ncells_t nrecv) { -#if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) +#if defined(DEVICE_ENABLED) + Kokkos::fence(); +#endif MPI_Recv(recv_arr.data(), nrecv, mpi::get_type(), @@ -92,17 +71,6 @@ namespace comm { 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); -#else - auto recv_arr_h = Kokkos::create_mirror_view(recv_arr); - MPI_Recv(recv_arr_h.data(), - nrecv, - mpi::get_type(), - recv_rank, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - Kokkos::deep_copy(recv_arr, recv_arr_h); -#endif } template diff --git a/src/global/utils/reporter.cpp b/src/global/utils/reporter.cpp index 77117c4b9..54424ab43 100644 --- a/src/global/utils/reporter.cpp +++ b/src/global/utils/reporter.cpp @@ -185,14 +185,6 @@ namespace reporter { AddParam(report, 4, "HIP", "%s", hip_version.c_str()); #endif AddParam(report, 4, "MPI", "%s", mpi_version.c_str()); -#if defined(MPI_ENABLED) && defined(DEVICE_ENABLED) - #if defined(GPU_AWARE_MPI) - const std::string gpu_aware_mpi = "ON"; - #else - const std::string gpu_aware_mpi = "OFF"; - #endif - AddParam(report, 4, "GPU-aware MPI", "%s", gpu_aware_mpi.c_str()); -#endif AddParam(report, 4, "Kokkos", "%s", kokkos_version.c_str()); AddParam(report, 4, "ADIOS2", "%s", adios2_version.c_str()); AddParam(report, 4, "Precision", "%s", precision); @@ -245,11 +237,6 @@ namespace reporter { AddParam(report, 4, "MPI_ENABLED", "%s", "OFF"); #endif -#if defined(GPU_AWARE_MPI) - AddParam(report, 4, "GPU_AWARE_MPI", "%s", "ON"); -#else - AddParam(report, 4, "GPU_AWARE_MPI", "%s", "OFF"); -#endif report += "\n"; return report; } diff --git a/src/global/utils/reporter.h b/src/global/utils/reporter.h index a7bd9b364..6bbc4b738 100644 --- a/src/global/utils/reporter.h +++ b/src/global/utils/reporter.h @@ -20,7 +20,6 @@ * - DEVICE_ENABLED * - DEBUG * - SINGLE_PRECISION - * - GPU_AWARE_MPI */ #ifndef GLOBAL_UTILS_REPORTER_H