From d15acc80812afa8857db20187c2a20326936dc31 Mon Sep 17 00:00:00 2001 From: Philip Oesterle-Pekrun Date: Sun, 3 May 2026 20:09:06 +0200 Subject: [PATCH 1/4] Enable the usage of Kokkos with CUDA backend Signed-off-by: Philip Oesterle-Pekrun --- apps/global_full/4C_global_full_main.cpp | 55 ++++++++ apps/global_full/CMakeLists.txt | 29 ++++- cmake/configure/configure_MIRCO.cmake | 15 ++- cmake/configure/configure_Trilinos.cmake | 15 +++ .../functions/four_c_auto_define_module.cmake | 11 ++ src/cut/4C_cut_pointgraph.cpp | 3 + utilities/clangcuda++ | 123 ++++++++++++++++++ 7 files changed, 244 insertions(+), 7 deletions(-) create mode 100755 utilities/clangcuda++ diff --git a/apps/global_full/4C_global_full_main.cpp b/apps/global_full/4C_global_full_main.cpp index 4d50d6faed4..ebef9d802d6 100644 --- a/apps/global_full/4C_global_full_main.cpp +++ b/apps/global_full/4C_global_full_main.cpp @@ -29,6 +29,14 @@ #ifdef FOUR_C_ENABLE_FE_TRAPPING #include #endif +//#/#{ +#include +#include + +#include +#include +#include +//#/#} using namespace FourC; @@ -67,6 +75,53 @@ int main(int argc, char* argv[]) .np_type = arguments.nptype, .diffgroup = arguments.diffgroup, }; + + // # TEST KOKKOS STUFF HERE:{ + // Kokkos + { + using ExecSpace_DefaultHost_t = Kokkos::DefaultHostExecutionSpace; + using ExecSpace_Default_t = Kokkos::DefaultExecutionSpace; + using MemorySpace_Host_t = Kokkos::HostSpace; + using MemorySpace_ofDefaultExec_t = ExecSpace_Default_t::memory_space; + using Device_Host_t = Kokkos::Device; + using Device_Default_t = Kokkos::Device; + + using ViewVector_d = Kokkos::View; + using ViewMatrix_d = Kokkos::View; + + std::cout << "-- Kokkos information --\n"; + std::cout << "Threads in use: " << ExecSpace_Default_t().concurrency() << "\n"; + std::cout << "Default execution space: " << typeid(ExecSpace_Default_t).name() << "\n"; + std::cout << "Default host execution space: " << typeid(ExecSpace_DefaultHost_t).name() << "\n"; + std::cout << "Default memory space: " << 
typeid(MemorySpace_ofDefaultExec_t).name() << "\n"; + std::cout << "Default host memory space: " << typeid(MemorySpace_Host_t).name() << "\n"; + std::cout << "Num devices = " << Kokkos::num_devices() << "\n"; + std::cout << "\n"; + + } + + + // TPETRA + { + using LO = int; + using GO = int; + using map_type = Tpetra::Map; + using vec_type = Tpetra::Vector; + + + + using node_type = typename vec_type::node_type; + using device_type = typename vec_type::device_type; + using execution_space = typename vec_type::execution_space; + using memory_space = typename device_type::memory_space; + + std::cout << "-- Tpetra type information --\n"; + std::cout << "vec_type::node_type = " << typeid(node_type).name() << '\n'; + std::cout << "vec_type::device_type = " << typeid(device_type).name() << '\n'; + std::cout << "vec_type::execution_space = " << typeid(execution_space).name() << '\n'; + std::cout << "vec_type::memory_space = " << typeid(memory_space).name() << '\n'; + std::cout << '\n'; + } // Initialize communicators and use RAII to ensure that they are finalized properly in the end. 
// Note: Communicators must be finalized after singleton cleanup and before MPI finalization diff --git a/apps/global_full/CMakeLists.txt b/apps/global_full/CMakeLists.txt index c417bfee89b..a440a05c2db 100644 --- a/apps/global_full/CMakeLists.txt +++ b/apps/global_full/CMakeLists.txt @@ -16,8 +16,35 @@ add_executable(${FOUR_C_EXECUTABLE_NAME} ${OBJS_FOUR_C_MAIN}) set_target_properties( ${FOUR_C_EXECUTABLE_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR} ) +if(FOUR_C_CLANGCUDA) + set_target_properties( + ${FOUR_C_EXECUTABLE_NAME} + PROPERTIES + CXX_COMPILER_LAUNCHER "" + C_COMPILER_LAUNCHER "" + CUDA_COMPILER_LAUNCHER "" + RULE_LAUNCH_COMPILE "" + RULE_LAUNCH_LINK "" + ) + target_compile_definitions(${FOUR_C_EXECUTABLE_NAME} PRIVATE + FOUR_C_CLANGCUDA_HOST_ONLY + ) +endif() four_c_set_up_executable(${FOUR_C_EXECUTABLE_NAME}) - +if(FOUR_C_CLANGCUDA) + set_target_properties( + ${FOUR_C_EXECUTABLE_NAME} + PROPERTIES + CXX_COMPILER_LAUNCHER "" + C_COMPILER_LAUNCHER "" + CUDA_COMPILER_LAUNCHER "" + RULE_LAUNCH_COMPILE "" + RULE_LAUNCH_LINK "" + ) + target_compile_definitions(${FOUR_C_EXECUTABLE_NAME} PRIVATE + FOUR_C_CLANGCUDA_HOST_ONLY + ) +endif() if(FOUR_C_ENABLE_METADATA_GENERATION) if(FOUR_C_WITH_PYTHON) add_custom_command( diff --git a/cmake/configure/configure_MIRCO.cmake b/cmake/configure/configure_MIRCO.cmake index 684ee039fd7..2e25ac53bbf 100644 --- a/cmake/configure/configure_MIRCO.cmake +++ b/cmake/configure/configure_MIRCO.cmake @@ -13,7 +13,7 @@ four_c_process_global_option( OFF ) if(FOUR_C_MIRCO_FIND_INSTALLED) - + # Note that MIRCO and 4C must point to the same Kokkos and Kokkos-Kernels installation. Otherwise, there will be errors. message(STATUS "FOUR_C_MIRCO_FIND_INSTALLED is enabled") # MIRCO provides a package configuration file if installed. 
@@ -27,12 +27,15 @@ if(FOUR_C_MIRCO_FIND_INSTALLED) endif() else() # Fetch MIRCO from GIT repository - # Turn off googletest and Trilinos in MIRCO so that they don't interfere with 4C + # Turn off googletest in MIRCO so that it does not interfere with 4C. set(GTEST_IN_MIRCO "OFF") - set(TRILINOS_IN_MIRCO "OFF") + # Explicitly turn off "*_IN_MIRCO", so that MIRCO uses upstream targets + set(RYML_IN_MIRCO "OFF") + set(KOKKOS_IN_MIRCO "OFF") + set(KOKKOS_KERNELS_IN_MIRCO "OFF") set(MIRCO_GIT_REPO "https://github.com/imcs-compsim/MIRCO.git") - set(MIRCO_GIT_TAG "b9d0c4ba27ff8463a3d2b17163fead8800b2650c") # latest hash 03.04.2026 + set(MIRCO_GIT_TAG "de05a25cf595510b8e315d91aa56b6f7a69ad727") # latest hash 03.05.2026 fetchcontent_declare( mirco @@ -42,8 +45,8 @@ else() # Fetch MIRCO from GIT repository fetchcontent_makeavailable(mirco) # MIRCO requires a specific path, possibly due to inconsistent naming "mirco" vs "mirco_lib". set(FOUR_C_MIRCO_ROOT "${CMAKE_INSTALL_PREFIX}/lib/cmake/mirco") - - four_c_add_external_dependency(four_c_all_enabled_external_dependencies mirco::mirco_lib) endif() +four_c_add_external_dependency(four_c_all_enabled_external_dependencies mirco::mirco_lib) + four_c_remember_variable_for_install(FOUR_C_MIRCO_ROOT) diff --git a/cmake/configure/configure_Trilinos.cmake b/cmake/configure/configure_Trilinos.cmake index d6081f877f3..eb537af01cd 100644 --- a/cmake/configure/configure_Trilinos.cmake +++ b/cmake/configure/configure_Trilinos.cmake @@ -71,6 +71,21 @@ find_package(Trilinos REQUIRED) message(STATUS "Trilinos version: ${Trilinos_VERSION}") message(STATUS "Trilinos packages: ${Trilinos_PACKAGE_LIST}") +if(FOUR_C_CLANGCUDA) + set(CMAKE_CXX_COMPILER_LAUNCHER "" CACHE STRING "" FORCE) + set(CMAKE_C_COMPILER_LAUNCHER "" CACHE STRING "" FORCE) + set(CMAKE_CUDA_COMPILER_LAUNCHER "" CACHE STRING "" FORCE) + + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "") + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "") + + set_property(DIRECTORY PROPERTY 
RULE_LAUNCH_COMPILE "") + set_property(DIRECTORY PROPERTY RULE_LAUNCH_LINK "") + + get_property(_global_rule GLOBAL PROPERTY RULE_LAUNCH_COMPILE) + get_property(_dir_rule DIRECTORY PROPERTY RULE_LAUNCH_COMPILE) +endif() + # Figure out the version. if(EXISTS "${Trilinos_DIR}/../../../TrilinosRepoVersion.txt") file(STRINGS "${Trilinos_DIR}/../../../TrilinosRepoVersion.txt" TrilinosRepoVersionFile) diff --git a/cmake/functions/four_c_auto_define_module.cmake b/cmake/functions/four_c_auto_define_module.cmake index 17cc65325e1..d63d5bdc7a9 100644 --- a/cmake/functions/four_c_auto_define_module.cmake +++ b/cmake/functions/four_c_auto_define_module.cmake @@ -37,6 +37,17 @@ function(four_c_auto_define_module) # Add all global compile settings as PRIVATE. We only want to use them to compile our own files and not force # them on other users of the library. target_link_libraries(${_target}_objs PRIVATE four_c_private_compile_interface) + + if(FOUR_C_CLANGCUDA) + set_target_properties(${_target}_objs PROPERTIES + CXX_COMPILER_LAUNCHER "" + C_COMPILER_LAUNCHER "" + CUDA_COMPILER_LAUNCHER "" + RULE_LAUNCH_COMPILE "" + RULE_LAUNCH_LINK "" + ) + target_compile_definitions(${_target}_objs PRIVATE FOUR_C_CLANGCUDA_HOST_ONLY) + endif() if(FOUR_C_ENABLE_IWYU) set_target_properties( diff --git a/src/cut/4C_cut_pointgraph.cpp b/src/cut/4C_cut_pointgraph.cpp index ae87b9bff6a..c0451dafe4f 100644 --- a/src/cut/4C_cut_pointgraph.cpp +++ b/src/cut/4C_cut_pointgraph.cpp @@ -12,6 +12,9 @@ #include "4C_cut_pointgraph_simple.hpp" #include "4C_cut_side.hpp" +#ifdef FOUR_C_CLANGCUDA_HOST_ONLY + #undef __noinline__ +#endif #include #include diff --git a/utilities/clangcuda++ b/utilities/clangcuda++ new file mode 100755 index 00000000000..59ae664f8e2 --- /dev/null +++ b/utilities/clangcuda++ @@ -0,0 +1,123 @@ +#!/usr/bin/env bash + +# This is a compiler wrapper around clang++ for building 4C when using a Kokkos installation with the Cuda backend enabled, due to issues with nvcc and the nvcc_wrapper 
provided by Kokkos. +# +# When relevant, this wrapper should be used as the CXX_COMPILER or OMPI_CXX backend in combination with CMake flags to signal which targets (or objects) should be compiled for CUDA device, CUDA host-only, or normal (non-CUDA) C++. +# +# The following environment variables can be used to override the defaults: +# - CLANGCUDA_CLANG: path to clang++ +# - CLANGCUDA_CUDA_PATH: path to the CUDA toolkit +# - CLANGCUDA_ARCH: GPU architecture to target (default is sm_90) +# - CLANGCUDA_LOG: optional path to a log file for recording final commands + +clang="${CLANGCUDA_CLANG:-/bin/clang++}" +cuda_path="${CLANGCUDA_CUDA_PATH:-${CUDA_HOME:-/usr/local/cuda}}" + +compile=0 +cuda_host_only=0 +cuda_device=0 +arch="${CLANGCUDA_ARCH:-sm_90}" + +args=() +skip_next_x=0 + +for arg in "$@"; do + if [[ "$skip_next_x" == "1" ]]; then + skip_next_x=0 + continue + fi + + case "$arg" in + -c) + compile=1 + args+=("$arg") + ;; + + -DFOUR_C_CLANGCUDA_HOST_ONLY) + cuda_host_only=1 + args+=("$arg") + ;; + + -DFOUR_C_CLANGCUDA_DEVICE_COMPILE) + cuda_device=1 + args+=("$arg") + ;; + + *.cu) + cuda_device=1 + args+=("$arg") + ;; + + -extended-lambda|--extended-lambda|--expt-extended-lambda|-expt-extended-lambda) + ;; + + -expt-relaxed-constexpr|--expt-relaxed-constexpr) + ;; + + -arch=sm_*|--cuda-gpu-arch=sm_*) + # Ignore incoming arch flags. Wrapper decides from CLANGCUDA_ARCH or default. + ;; + + -ccbin=*|--compiler-bindir=*) + ;; + + -x) + # Drop incoming language override and the following language token. 
+ skip_next_x=1 + ;; + + *) + args+=("$arg") + ;; + esac +done + +# Sanity check +if [[ "$cuda_host_only" == "1" && "$cuda_device" == "1" ]]; then + has_explicit_device=0 + for arg in "$@"; do + if [[ "$arg" == "-DFOUR_C_CLANGCUDA_DEVICE_COMPILE" ]]; then + has_explicit_device=1 + break + fi + done + + if [[ "$has_explicit_device" == "1" ]]; then + echo "clangcuda++ wrapper error: both FOUR_C_CLANGCUDA_HOST_ONLY and FOUR_C_CLANGCUDA_DEVICE_COMPILE were set" >&2 + exit 1 + fi +fi + +if [[ "$compile" == "1" && "$cuda_host_only" == "1" ]]; then + final=( + "$clang" + -x cuda + --cuda-host-only + --cuda-path="$cuda_path" + --cuda-gpu-arch="$arch" + -Wno-unknown-cuda-version + "${args[@]}" + ) +elif [[ "$compile" == "1" && "$cuda_device" == "1" ]]; then + final=( + "$clang" + -x cuda + --cuda-path="$cuda_path" + --cuda-gpu-arch="$arch" + -Wno-unknown-cuda-version + "${args[@]}" + ) +else + final=( + "$clang" + -Wno-unknown-cuda-version + "${args[@]}" + ) +fi + +if [[ -n "$CLANGCUDA_LOG" ]]; then + printf '%q ' "${final[@]}" >> "$CLANGCUDA_LOG" + printf '\n' >> "$CLANGCUDA_LOG" +fi + +exec "${final[@]}" From 701569c480d12693b8fae56467f598388e155af4 Mon Sep 17 00:00:00 2001 From: Philip Oesterle-Pekrun Date: Sun, 3 May 2026 22:15:34 +0200 Subject: [PATCH 2/4] Add global option and ArborX conflict warning Signed-off-by: Philip Oesterle-Pekrun --- apps/global_full/4C_global_full_main.cpp | 55 ------------------- apps/global_full/CMakeLists.txt | 31 +++-------- cmake/configure/configure_MIRCO.cmake | 8 +++ cmake/configure/configure_Trilinos.cmake | 21 ++++--- .../functions/four_c_auto_define_module.cmake | 19 ++++--- cmake/setup_global_options.cmake | 14 +++++ src/cut/4C_cut_pointgraph.cpp | 14 ++++- utilities/clangcuda++ | 44 ++++++++------- 8 files changed, 86 insertions(+), 120 deletions(-) diff --git a/apps/global_full/4C_global_full_main.cpp b/apps/global_full/4C_global_full_main.cpp index ebef9d802d6..4d50d6faed4 100644 --- a/apps/global_full/4C_global_full_main.cpp 
+++ b/apps/global_full/4C_global_full_main.cpp @@ -29,14 +29,6 @@ #ifdef FOUR_C_ENABLE_FE_TRAPPING #include #endif -//#/#{ -#include -#include - -#include -#include -#include -//#/#} using namespace FourC; @@ -75,53 +67,6 @@ int main(int argc, char* argv[]) .np_type = arguments.nptype, .diffgroup = arguments.diffgroup, }; - - // # TEST KOKKOS STUFF HERE:{ - // Kokkos - { - using ExecSpace_DefaultHost_t = Kokkos::DefaultHostExecutionSpace; - using ExecSpace_Default_t = Kokkos::DefaultExecutionSpace; - using MemorySpace_Host_t = Kokkos::HostSpace; - using MemorySpace_ofDefaultExec_t = ExecSpace_Default_t::memory_space; - using Device_Host_t = Kokkos::Device; - using Device_Default_t = Kokkos::Device; - - using ViewVector_d = Kokkos::View; - using ViewMatrix_d = Kokkos::View; - - std::cout << "-- Kokkos information --\n"; - std::cout << "Threads in use: " << ExecSpace_Default_t().concurrency() << "\n"; - std::cout << "Default execution space: " << typeid(ExecSpace_Default_t).name() << "\n"; - std::cout << "Default host execution space: " << typeid(ExecSpace_DefaultHost_t).name() << "\n"; - std::cout << "Default memory space: " << typeid(MemorySpace_ofDefaultExec_t).name() << "\n"; - std::cout << "Default host memory space: " << typeid(MemorySpace_Host_t).name() << "\n"; - std::cout << "Num devices = " << Kokkos::num_devices() << "\n"; - std::cout << "\n"; - - } - - - // TPETRA - { - using LO = int; - using GO = int; - using map_type = Tpetra::Map; - using vec_type = Tpetra::Vector; - - - - using node_type = typename vec_type::node_type; - using device_type = typename vec_type::device_type; - using execution_space = typename vec_type::execution_space; - using memory_space = typename device_type::memory_space; - - std::cout << "-- Tpetra type information --\n"; - std::cout << "vec_type::node_type = " << typeid(node_type).name() << '\n'; - std::cout << "vec_type::device_type = " << typeid(device_type).name() << '\n'; - std::cout << "vec_type::execution_space = " << 
typeid(execution_space).name() << '\n'; - std::cout << "vec_type::memory_space = " << typeid(memory_space).name() << '\n'; - std::cout << '\n'; - } // Initialize communicators and use RAII to ensure that they are finalized properly in the end. // Note: Communicators must be finalized after singleton cleanup and before MPI finalization diff --git a/apps/global_full/CMakeLists.txt b/apps/global_full/CMakeLists.txt index a440a05c2db..e3f843ed71b 100644 --- a/apps/global_full/CMakeLists.txt +++ b/apps/global_full/CMakeLists.txt @@ -16,34 +16,17 @@ add_executable(${FOUR_C_EXECUTABLE_NAME} ${OBJS_FOUR_C_MAIN}) set_target_properties( ${FOUR_C_EXECUTABLE_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR} ) -if(FOUR_C_CLANGCUDA) - set_target_properties( - ${FOUR_C_EXECUTABLE_NAME} - PROPERTIES - CXX_COMPILER_LAUNCHER "" - C_COMPILER_LAUNCHER "" - CUDA_COMPILER_LAUNCHER "" - RULE_LAUNCH_COMPILE "" - RULE_LAUNCH_LINK "" - ) - target_compile_definitions(${FOUR_C_EXECUTABLE_NAME} PRIVATE - FOUR_C_CLANGCUDA_HOST_ONLY - ) -endif() four_c_set_up_executable(${FOUR_C_EXECUTABLE_NAME}) if(FOUR_C_CLANGCUDA) set_target_properties( ${FOUR_C_EXECUTABLE_NAME} - PROPERTIES - CXX_COMPILER_LAUNCHER "" - C_COMPILER_LAUNCHER "" - CUDA_COMPILER_LAUNCHER "" - RULE_LAUNCH_COMPILE "" - RULE_LAUNCH_LINK "" - ) - target_compile_definitions(${FOUR_C_EXECUTABLE_NAME} PRIVATE - FOUR_C_CLANGCUDA_HOST_ONLY - ) + PROPERTIES CXX_COMPILER_LAUNCHER "" + C_COMPILER_LAUNCHER "" + CUDA_COMPILER_LAUNCHER "" + RULE_LAUNCH_COMPILE "" + RULE_LAUNCH_LINK "" + ) + target_compile_definitions(${FOUR_C_EXECUTABLE_NAME} PRIVATE CLANGCUDA_MODE_HOST) endif() if(FOUR_C_ENABLE_METADATA_GENERATION) if(FOUR_C_WITH_PYTHON) diff --git a/cmake/configure/configure_MIRCO.cmake b/cmake/configure/configure_MIRCO.cmake index 2e25ac53bbf..da810b00fd5 100644 --- a/cmake/configure/configure_MIRCO.cmake +++ b/cmake/configure/configure_MIRCO.cmake @@ -34,9 +34,17 @@ else() # Fetch MIRCO from GIT repository set(KOKKOS_IN_MIRCO 
"OFF") set(KOKKOS_KERNELS_IN_MIRCO "OFF") + # Propagate + if(FOUR_C_CLANGCUDA) + set(MIRCO_CLANGCUDA "ON") + else() + set(MIRCO_CLANGCUDA "OFF") + endif() + set(MIRCO_GIT_REPO "https://github.com/imcs-compsim/MIRCO.git") set(MIRCO_GIT_TAG "de05a25cf595510b8e315d91aa56b6f7a69ad727") # latest hash 03.05.2026 + set(FETCHCONTENT_TRY_FIND_PACKAGE_MODE NEVER) fetchcontent_declare( mirco GIT_REPOSITORY ${MIRCO_GIT_REPO} diff --git a/cmake/configure/configure_Trilinos.cmake b/cmake/configure/configure_Trilinos.cmake index eb537af01cd..128dd6659ff 100644 --- a/cmake/configure/configure_Trilinos.cmake +++ b/cmake/configure/configure_Trilinos.cmake @@ -72,18 +72,21 @@ message(STATUS "Trilinos version: ${Trilinos_VERSION}") message(STATUS "Trilinos packages: ${Trilinos_PACKAGE_LIST}") if(FOUR_C_CLANGCUDA) - set(CMAKE_CXX_COMPILER_LAUNCHER "" CACHE STRING "" FORCE) - set(CMAKE_C_COMPILER_LAUNCHER "" CACHE STRING "" FORCE) - set(CMAKE_CUDA_COMPILER_LAUNCHER "" CACHE STRING "" FORCE) + set(CMAKE_CXX_COMPILER_LAUNCHER + "" + CACHE STRING "" FORCE + ) + set(CMAKE_C_COMPILER_LAUNCHER + "" + CACHE STRING "" FORCE + ) + set(CMAKE_CUDA_COMPILER_LAUNCHER + "" + CACHE STRING "" FORCE + ) set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "") set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "") - - set_property(DIRECTORY PROPERTY RULE_LAUNCH_COMPILE "") - set_property(DIRECTORY PROPERTY RULE_LAUNCH_LINK "") - - get_property(_global_rule GLOBAL PROPERTY RULE_LAUNCH_COMPILE) - get_property(_dir_rule DIRECTORY PROPERTY RULE_LAUNCH_COMPILE) endif() # Figure out the version. diff --git a/cmake/functions/four_c_auto_define_module.cmake b/cmake/functions/four_c_auto_define_module.cmake index d63d5bdc7a9..ab9c49c547d 100644 --- a/cmake/functions/four_c_auto_define_module.cmake +++ b/cmake/functions/four_c_auto_define_module.cmake @@ -37,16 +37,17 @@ function(four_c_auto_define_module) # Add all global compile settings as PRIVATE. 
We only want to use them to compile our own files and not force # them on other users of the library. target_link_libraries(${_target}_objs PRIVATE four_c_private_compile_interface) - + if(FOUR_C_CLANGCUDA) - set_target_properties(${_target}_objs PROPERTIES - CXX_COMPILER_LAUNCHER "" - C_COMPILER_LAUNCHER "" - CUDA_COMPILER_LAUNCHER "" - RULE_LAUNCH_COMPILE "" - RULE_LAUNCH_LINK "" - ) - target_compile_definitions(${_target}_objs PRIVATE FOUR_C_CLANGCUDA_HOST_ONLY) + set_target_properties( + ${_target}_objs + PROPERTIES CXX_COMPILER_LAUNCHER "" + C_COMPILER_LAUNCHER "" + CUDA_COMPILER_LAUNCHER "" + RULE_LAUNCH_COMPILE "" + RULE_LAUNCH_LINK "" + ) + target_compile_definitions(${_target}_objs PRIVATE CLANGCUDA_MODE_HOST) endif() if(FOUR_C_ENABLE_IWYU) diff --git a/cmake/setup_global_options.cmake b/cmake/setup_global_options.cmake index f8f2e9dac30..8bba1e48842 100644 --- a/cmake/setup_global_options.cmake +++ b/cmake/setup_global_options.cmake @@ -235,6 +235,20 @@ four_c_process_global_option( OFF ) +four_c_process_global_option( + FOUR_C_CLANGCUDA + DESCRIPTION + "Enable the relevant CMake compile definitions needed to use utilities/clangcuda++ as the compiler. This is currently necessary to use the CUDA backend of Kokkos in 4C, e.g. along with MIRCO." + DEFAULT + OFF + ) +if(FOUR_C_CLANGCUDA AND FOUR_C_WITH_ARBORX) + message( + WARNING + "Enabling both FOUR_C_CLANGCUDA and FOUR_C_WITH_ARBORX is not advised. This requires using an external CUDA-enabled ArborX installation and has not been tested." + ) +endif() + ## # Optimization flags # These flags are reasonable defaults. Users may amend them by setting FOUR_C_CXX_FLAGS and/or FOUR_C_CXX_FLAGS_. 
diff --git a/src/cut/4C_cut_pointgraph.cpp b/src/cut/4C_cut_pointgraph.cpp index c0451dafe4f..35e38f78386 100644 --- a/src/cut/4C_cut_pointgraph.cpp +++ b/src/cut/4C_cut_pointgraph.cpp @@ -12,11 +12,21 @@ #include "4C_cut_pointgraph_simple.hpp" #include "4C_cut_side.hpp" -#ifdef FOUR_C_CLANGCUDA_HOST_ONLY - #undef __noinline__ +#if defined(CLANGCUDA_MODE_HOST) || defined(CLANGCUDA_MODE_DEVICE) +#ifdef __noinline__ +#pragma push_macro("__noinline__") +#undef __noinline__ +#define FOUR_C_RESTORE_NOINLINE_MACRO #endif +#endif + #include +#ifdef FOUR_C_RESTORE_NOINLINE_MACRO +#pragma pop_macro("__noinline__") +#undef FOUR_C_RESTORE_NOINLINE_MACRO +#endif + #include #include #include diff --git a/utilities/clangcuda++ b/utilities/clangcuda++ index 59ae664f8e2..d3ef5e4d93b 100755 --- a/utilities/clangcuda++ +++ b/utilities/clangcuda++ @@ -1,21 +1,28 @@ -#!/usr/bin/env bash +#!/bin/bash -# This is a compiler wrapper around clang++ for building 4C when using a Kokkos installation with the Cuda backend enabled, due to issues with nvcc and the nvcc_wrapper provided by Kokkos. +# This file is part of 4C multiphysics licensed under the +# GNU Lesser General Public License v3.0 or later. # -# When relevant, this wrapper should be used as the CXX_COMPILER or OMPI_CXX backend in combination with CMake flags to signal which targets (or objects) should be compiled for CUDA device, CUDA host-only, or normal (non-CUDA) C++. +# See the LICENSE.md file in the top-level for license information. +# +# SPDX-License-Identifier: LGPL-3.0-or-later + +# This is a compiler wrapper around clang++ for building a project which uses Kokkos with the Cuda backend enabled, due to issues with nvcc and the nvcc_wrapper provided by Kokkos. 
+# +# When relevant, this wrapper should be used as the CXX_COMPILER or OMPI_CXX backend in combination with CMake flags to signal which targets (or objects) should be compiled for CUDA device (CLANGCUDA_MODE_DEVICE), CUDA host-only (CLANGCUDA_MODE_HOST), or normal (non-CUDA) C++. # # The following environment variables can be used to override the defaults: -# - CLANGCUDA_CLANG: path to clang++ +# - CLANGCUDA_CLANG_PATH: path to clang++ # - CLANGCUDA_CUDA_PATH: path to the CUDA toolkit # - CLANGCUDA_ARCH: GPU architecture to target (default is sm_90) # - CLANGCUDA_LOG: optional path to a log file for recording final commands -clang="${CLANGCUDA_CLANG:-/bin/clang++}" +clang="${CLANGCUDA_CLANG_PATH:-/bin/clang++}" cuda_path="${CLANGCUDA_CUDA_PATH:-${CUDA_HOME:-/usr/local/cuda}}" compile=0 -cuda_host_only=0 -cuda_device=0 +mode_cuda_host=0 +mode_cuda_device=0 arch="${CLANGCUDA_ARCH:-sm_90}" args=() @@ -33,18 +40,13 @@ for arg in "$@"; do args+=("$arg") ;; - -DFOUR_C_CLANGCUDA_HOST_ONLY) - cuda_host_only=1 - args+=("$arg") - ;; - - -DFOUR_C_CLANGCUDA_DEVICE_COMPILE) - cuda_device=1 + -DCLANGCUDA_MODE_HOST) + mode_cuda_host=1 args+=("$arg") ;; - *.cu) - cuda_device=1 + -DCLANGCUDA_MODE_DEVICE) + mode_cuda_device=1 args+=("$arg") ;; @@ -73,22 +75,22 @@ for arg in "$@"; do done # Sanity check -if [[ "$cuda_host_only" == "1" && "$cuda_device" == "1" ]]; then +if [[ "$mode_cuda_host" == "1" && "$mode_cuda_device" == "1" ]]; then has_explicit_device=0 for arg in "$@"; do - if [[ "$arg" == "-DFOUR_C_CLANGCUDA_DEVICE_COMPILE" ]]; then + if [[ "$arg" == "-DCLANGCUDA_MODE_DEVICE" ]]; then has_explicit_device=1 break fi done if [[ "$has_explicit_device" == "1" ]]; then - echo "clangcuda++ wrapper error: both FOUR_C_CLANGCUDA_HOST_ONLY and FOUR_C_CLANGCUDA_DEVICE_COMPILE were set" >&2 + echo "clangcuda++ wrapper error: both CLANGCUDA_MODE_HOST and CLANGCUDA_MODE_DEVICE were set" >&2 exit 1 fi fi -if [[ "$compile" == "1" && "$cuda_host_only" == "1" ]]; then +if [[ "$compile" == "1" 
&& "$mode_cuda_host" == "1" ]]; then final=( "$clang" -x cuda @@ -98,7 +100,7 @@ if [[ "$compile" == "1" && "$cuda_host_only" == "1" ]]; then -Wno-unknown-cuda-version "${args[@]}" ) -elif [[ "$compile" == "1" && "$cuda_device" == "1" ]]; then +elif [[ "$compile" == "1" && "$mode_cuda_device" == "1" ]]; then final=( "$clang" -x cuda From b261b1c2ac02ed5086ca0e903eef35e1e2ee4f28 Mon Sep 17 00:00:00 2001 From: Philip Oesterle-Pekrun Date: Mon, 25 May 2026 10:04:57 +0200 Subject: [PATCH 3/4] Add documentation for using Kokkos with CUDA in 4C Signed-off-by: Philip Oesterle-Pekrun --- .../src/installation/installation.rst | 14 +++++++++++++- utilities/clangcuda++ | 8 ++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/doc/documentation/src/installation/installation.rst b/doc/documentation/src/installation/installation.rst index 52c9b7a17f5..a18c852b166 100644 --- a/doc/documentation/src/installation/installation.rst +++ b/doc/documentation/src/installation/installation.rst @@ -110,7 +110,7 @@ Currently supported versions are listed in ``<4C_sourceDir>/dependencies/support MIRCO can be used as optional dependency inside |FOURC| to be used for linear elastic frictionless normal contact between a rigid rough indentor and an elastic half-space. See the `MIRCO repository `_ for details and downloads. -Building |FOURC| with MIRCO enabled automatically fetches the repository during the configure stage and later builds the library as dependency. +Building |FOURC| with MIRCO enabled automatically fetches the repository during the configure stage and later builds the library as dependency. Alternatively, one can specify an external MIRCO installation. In either case, MIRCO can make use of shared memory parallelism through Kokkos :ref:`when enabled <build4Cwithkokkoscuda>` in |FOURC|. Note that 4C and MIRCO must depend on the same Kokkos installation. When using Kokkos with CUDA enabled, MIRCO must be built with `CMAKE_POSITION_INDEPENDENT_CODE=ON`. .. 
_qhull: @@ -605,3 +605,15 @@ This will install |FOURC| in the specified location. You can then use the instal # This pulls in all the necessary dependencies and headers. target_link_libraries( PRIVATE 4C::lib4C) +.. _build4Cwithkokkoscuda: + +Building |FOURC| with OpenMP and CUDA support through Kokkos +------------------------------------------------------------ + +|FOURC| is primarily developed around MPI parallelism, but also offers the ability to use shared memory parallelism through `Kokkos <https://github.com/kokkos/kokkos>`_, enabling hybrid parallelism on the CPU through OpenMP and GPU acceleration through CUDA. + +Kokkos (and Kokkos-Kernels) can be built within Trilinos or specified as an external TPL in Trilinos, and its configuration follows the usual procedure for the desired backend (see the `Kokkos configuration guide <https://kokkos.org/kokkos-core-wiki/get-started/configuration-guide.html>`_). Trilinos then requires `Trilinos_ENABLE_<BACKEND>=ON` and, specifically for CUDA, `Trilinos_ENABLE_TPL_CUDA=ON`. To prevent oversubscription and unwanted shared memory parallelism in 4C, one should disable these backends for Tpetra with `TPETRA_INST_<BACKEND>=OFF` and explicitly set `TPETRA_INST_SERIAL=ON`. + +To build 4C with this configuration, the compiler wrapper `utilities/clangcuda++` must be used as the `CMAKE_CXX_COMPILER`, while clang should be used as the `CMAKE_C_COMPILER`. When using MPI, these should instead be set as the `OMPI_CXX` and `OMPI_CC` environment variables respectively. To change the GPU architecture or default clang++ and CUDA paths, one should set the corresponding environment variables listed at the start of the `utilities/clangcuda++` compiler wrapper. Additionally, the `FOUR_C_CLANGCUDA` compile option must be enabled in 4C. Due to incompatibility with the serial version of ArborX, it is recommended to disable `FOUR_C_WITH_ARBORX`. + +For developers, it is important to know that any target in 4C which contains Kokkos device code (e.g. 
`Kokkos::parallel_for()` or `KOKKOS_LAMBDA`) must be marked with the `CLANGCUDA_MODE_DEVICE` compile definition for CUDA compilation to be possible. diff --git a/utilities/clangcuda++ b/utilities/clangcuda++ index d3ef5e4d93b..70c0deafdc9 100755 --- a/utilities/clangcuda++ +++ b/utilities/clangcuda++ @@ -17,7 +17,7 @@ # - CLANGCUDA_ARCH: GPU architecture to target (default is sm_90) # - CLANGCUDA_LOG: optional path to a log file for recording final commands -clang="${CLANGCUDA_CLANG_PATH:-/bin/clang++}" +clang_path="${CLANGCUDA_CLANG_PATH:-/bin/clang++}" cuda_path="${CLANGCUDA_CUDA_PATH:-${CUDA_HOME:-/usr/local/cuda}}" compile=0 @@ -92,7 +92,7 @@ fi if [[ "$compile" == "1" && "$mode_cuda_host" == "1" ]]; then final=( - "$clang" + "$clang_path" -x cuda --cuda-host-only --cuda-path="$cuda_path" @@ -102,7 +102,7 @@ if [[ "$compile" == "1" && "$mode_cuda_host" == "1" ]]; then ) elif [[ "$compile" == "1" && "$mode_cuda_device" == "1" ]]; then final=( - "$clang" + "$clang_path" -x cuda --cuda-path="$cuda_path" --cuda-gpu-arch="$arch" @@ -111,7 +111,7 @@ elif [[ "$compile" == "1" && "$mode_cuda_device" == "1" ]]; then ) else final=( - "$clang" + "$clang_path" -Wno-unknown-cuda-version "${args[@]}" ) From 30116100e569811aeb6b37a5811df5a1426172b3 Mon Sep 17 00:00:00 2001 From: Philip Oesterle-Pekrun Date: Tue, 9 Jun 2026 15:36:12 +0200 Subject: [PATCH 4/4] Add docker and workflow for OpenMP/CUDA backends Signed-off-by: Philip Oesterle-Pekrun --- .github/workflows/trilinos-kokkosparallel.yml | 141 ++++++++++++++++++ cmake/configure/configure_MIRCO.cmake | 4 +- .../trilinos/install_cuda.sh | 129 ++++++++++++++++ .../trilinos/install_openmp.sh | 125 ++++++++++++++++ .../src/installation/installation.rst | 2 +- docker/trilinos_kokkosparallel/Dockerfile | 130 ++++++++++++++++ presets/docker/CMakePresets.json | 27 ++++ 7 files changed, 555 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/trilinos-kokkosparallel.yml create mode 100755 
dependencies/trilinos_kokkosparallel/trilinos/install_cuda.sh create mode 100755 dependencies/trilinos_kokkosparallel/trilinos/install_openmp.sh create mode 100644 docker/trilinos_kokkosparallel/Dockerfile diff --git a/.github/workflows/trilinos-kokkosparallel.yml b/.github/workflows/trilinos-kokkosparallel.yml new file mode 100644 index 00000000000..b088258c4dd --- /dev/null +++ b/.github/workflows/trilinos-kokkosparallel.yml @@ -0,0 +1,141 @@ +name: Trilinos with shared memory parallelism enabled through OpenMP and CUDA backends (through Kokkos) + +on: + workflow_dispatch: + schedule: + - cron: '0 8 * * 6' + +env: + IMAGE_NAME: ghcr.io/4c-multiphysics/4c-dependencies-trilinos + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-and-push-trilinos-image: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + attestations: write + id-token: write + steps: + - name: Checkout repository + uses: actions/checkout@v6 + - name: Log in to the Container registry + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@030e881283bb7a6894de51c315a6bfe6a94e05cf # v6.0.0 + with: + images: ${{ env.IMAGE_NAME }} + labels: | + org.opencontainers.image.description=Image containing all the dependencies required for building and testing 4C based on the specified Trilinos commit ref + - name: Build and push Docker image + id: push + uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0 + with: + context: . 
+ file: docker/trilinos_kokkosparallel/Dockerfile + push: true + tags: ${{ env.IMAGE_NAME }}:kokkosparallel + labels: ${{ steps.meta.outputs.labels }} + + build_kokkoscuda: + needs: build-and-push-trilinos-image + runs-on: ubuntu-latest + container: + image: ghcr.io/4c-multiphysics/4c-dependencies-trilinos:kokkosparallel + options: --user root --env OMPI_ALLOW_RUN_AS_ROOT=1 --env OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 + defaults: + run: + shell: bash + env: + OMPI_CXX: ${{ github.workspace }}/utilities/clangcuda++ + OMPI_CC: /usr/bin/clang + CLANGCUDA_CLANG_PATH: /usr/bin/clang++ + CLANGCUDA_CUDA_PATH: /usr/local/cuda + CLANGCUDA_ARCH: sm_90 + steps: + - uses: actions/checkout@v6 + - uses: ./.github/actions/build_4C + with: + cmake-preset: docker_kokkoscuda_clangcuda + build-targets: full + build-directory: ${{ github.workspace }}/build + use-ccache: "false" + + build_kokkosopenmp: + needs: build-and-push-trilinos-image + runs-on: ubuntu-latest + container: + image: ghcr.io/4c-multiphysics/4c-dependencies-trilinos:kokkosparallel + options: --user root --env OMPI_ALLOW_RUN_AS_ROOT=1 --env OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 + defaults: + run: + shell: bash + outputs: + test-chunks: ${{ steps.set-matrix.outputs.chunk-array }} + steps: + - uses: actions/checkout@v6 + - uses: ./.github/actions/build_4C + with: + cmake-preset: docker_kokkosopenmp + build-targets: full + build-directory: ${{ github.workspace }}/build + use-ccache: "false" + - uses: ./.github/actions/upload_directory + with: + directory: ${{ github.workspace }}/build + retention-days: 1 + name: trilinos_kokkosopenmp_build + - uses: ./.github/actions/chunk_test_suite + id: set-matrix + with: + build-directory: ${{ github.workspace }}/build + source-directory: ${{ github.workspace }} + number-of-chunks: 15 + junit-report-artifact-name: trilinos_test_report.xml + + test_openmp: + needs: build_kokkosopenmp + runs-on: ubuntu-latest + container: + image: ghcr.io/4c-multiphysics/4c-dependencies-trilinos:kokkosparallel + 
options: --user root --env OMPI_ALLOW_RUN_AS_ROOT=1 --env OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 + strategy: + fail-fast: false + matrix: + test-chunk: ${{fromJson(needs.build_kokkosopenmp.outputs.test-chunks)}} + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@v6 + - name: Setup developer environment for testing + run: | + cd $GITHUB_WORKSPACE + git config --global --add safe.directory $GITHUB_WORKSPACE + - uses: ./.github/actions/download_directory + with: + name: trilinos_kokkosopenmp_build + destination: ${{ github.workspace }}/build + - name: Test + run: | + cd $GITHUB_WORKSPACE/build + ctest -I $TEST_CHUNK -j `nproc` --output-on-failure --output-junit $GITHUB_WORKSPACE/trilinos_test_report-$TEST_CHUNK.xml + env: + TEST_CHUNK: ${{ matrix.test-chunk }} + - name: Upload test report + if: success() || failure() + uses: actions/upload-artifact@v7 + with: + name: trilinos_test_report-${{ matrix.test-chunk }}.xml + path: | + ${{ github.workspace }}/trilinos_test_report-${{ matrix.test-chunk }}.xml + retention-days: 1 diff --git a/cmake/configure/configure_MIRCO.cmake b/cmake/configure/configure_MIRCO.cmake index da810b00fd5..b722deaf2c0 100644 --- a/cmake/configure/configure_MIRCO.cmake +++ b/cmake/configure/configure_MIRCO.cmake @@ -29,7 +29,7 @@ if(FOUR_C_MIRCO_FIND_INSTALLED) else() # Fetch MIRCO from GIT repository # Turn off googletest in MIRCO so that it does not interfere with 4C. 
set(GTEST_IN_MIRCO "OFF") - # Explicitly turn off "*_IN_MIRCO", so that MIRCO uses upstream targets + # Explicitly turn off `*_IN_MIRCO`, so that MIRCO uses upstream targets set(RYML_IN_MIRCO "OFF") set(KOKKOS_IN_MIRCO "OFF") set(KOKKOS_KERNELS_IN_MIRCO "OFF") @@ -42,7 +42,7 @@ else() # Fetch MIRCO from GIT repository endif() set(MIRCO_GIT_REPO "https://github.com/imcs-compsim/MIRCO.git") - set(MIRCO_GIT_TAG "de05a25cf595510b8e315d91aa56b6f7a69ad727") # latest hash 03.05.2026 + set(MIRCO_GIT_TAG "8b049a6462eba5809d7cffe039a77f3bc5593767") # latest hash 02.06.2026 set(FETCHCONTENT_TRY_FIND_PACKAGE_MODE NEVER) fetchcontent_declare( diff --git a/dependencies/trilinos_kokkosparallel/trilinos/install_cuda.sh b/dependencies/trilinos_kokkosparallel/trilinos/install_cuda.sh new file mode 100755 index 00000000000..94554a1a98b --- /dev/null +++ b/dependencies/trilinos_kokkosparallel/trilinos/install_cuda.sh @@ -0,0 +1,129 @@ +#!/bin/bash +# This file is part of 4C multiphysics licensed under the +# GNU Lesser General Public License v3.0 or later. +# +# See the LICENSE.md file in the top-level for license information. +# +# SPDX-License-Identifier: LGPL-3.0-or-later + +# Install trilinos with the CUDA backend enabled +# Call with +# ./install_cuda.sh /path/to/install/dir + +# Exit the script at the first failure +set -e + +INSTALL_DIR="${1:?Usage: ./install_cuda.sh /path/to/install/dir}" +# Number of procs for building (default 4) +NPROCS=${NPROCS:=4} +# git sha from Trilinos repository: +VERSION="f4d642715185dca1b94c91f434a2cf6db9f82014" +#CHECKSUM="" + + +# Location of script to apply patches later +SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +CMAKE_COMMAND=cmake + +git clone https://github.com/trilinos/Trilinos.git +cd Trilinos +git checkout $VERSION +cd ..
&& mkdir trilinos_build && cd trilinos_build + +MPI_DIR=/usr +MPI_BIN_DIR=$MPI_DIR/bin + +$CMAKE_COMMAND \ + -D CMAKE_BUILD_TYPE:STRING="RELEASE" \ + -D CMAKE_CXX_STANDARD:STRING="17" \ + -D CMAKE_CXX_COMPILER:FILEPATH="$MPI_BIN_DIR/mpic++" \ + -D CMAKE_C_COMPILER:FILEPATH="$MPI_BIN_DIR/mpicc" \ + -D CMAKE_Fortran_COMPILER:FILEPATH="$MPI_BIN_DIR/mpif90" \ + -D CMAKE_INSTALL_PREFIX:STRING="$INSTALL_DIR" \ + -D BUILD_SHARED_LIBS:BOOL=ON \ + \ + -D Trilinos_ENABLE_ALL_OPTIONAL_PACKAGES:BOOL=OFF \ + -D Trilinos_ENABLE_EXPLICIT_INSTANTIATION:BOOL=ON \ + -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF \ + -D Trilinos_ENABLE_TESTS:BOOL=OFF \ + -D Trilinos_ENABLE_EXAMPLES:BOOL=OFF \ + \ + -D Trilinos_ASSERT_MISSING_PACKAGES=OFF \ + -D Trilinos_ENABLE_Gtest:BOOL=OFF \ + -D Trilinos_ENABLE_Amesos:BOOL=ON \ + -D Amesos_SHOW_DEPRECATED_WARNINGS:BOOL=OFF \ + -D Trilinos_ENABLE_Amesos2:BOOL=ON \ + -D Trilinos_ENABLE_AztecOO:BOOL=ON \ + -D AztecOO_SHOW_DEPRECATED_WARNINGS:BOOL=OFF \ + -D Trilinos_ENABLE_Belos:BOOL=ON \ + -D Trilinos_ENABLE_Epetra:BOOL=ON \ + -D Epetra_SHOW_DEPRECATED_WARNINGS:BOOL=OFF \ + -D Trilinos_ENABLE_EpetraExt:BOOL=ON \ + -D EpetraExt_SHOW_DEPRECATED_WARNINGS:BOOL=OFF \ + -D Trilinos_ENABLE_Intrepid2:BOOL=ON \ + -D Trilinos_ENABLE_Ifpack:BOOL=ON \ + -D Ifpack_SHOW_DEPRECATED_WARNINGS:BOOL=OFF \ + -D Trilinos_ENABLE_Ifpack2:BOOL=ON \ + -D Trilinos_ENABLE_Kokkos:BOOL=ON \ + -D Trilinos_ENABLE_ML:BOOL=ON \ + -D ML_SHOW_DEPRECATED_WARNINGS:BOOL=OFF \ + -D Trilinos_ENABLE_MueLu:BOOL=ON \ + -D Trilinos_ENABLE_NOX:BOOL=ON \ + -D NOX_ENABLE_ABSTRACT_IMPLEMENTATION_EPETRA:BOOL=OFF \ + -D NOX_ENABLE_STRATIMIKOS_EPETRA_STACK:BOOL=OFF \ + -D Trilinos_ENABLE_Sacado:BOOL=ON \ + -D Trilinos_ENABLE_SEACASExodus:BOOL=ON \ + -D Trilinos_ENABLE_SEACASNemesis:BOOL=OFF \ + -D Trilinos_ENABLE_Shards:BOOL=ON \ + -D Trilinos_ENABLE_Stratimikos:BOOL=ON \ + -D Trilinos_ENABLE_Teko:BOOL=ON \ + -D Trilinos_ENABLE_Teuchos:BOOL=ON \ + -D Trilinos_ENABLE_Thyra:BOOL=ON \ + -D
Thyra_SHOW_DEPRECATED_WARNINGS:BOOL=OFF \ + -D Trilinos_ENABLE_ThyraEpetraAdapters:BOOL=ON \ + -D Trilinos_ENABLE_ThyraEpetraExtAdapters:BOOL=ON \ + -D Trilinos_ENABLE_Tpetra:BOOL=ON \ + -D Tpetra_INST_INT_INT:BOOL=ON \ + -D Trilinos_ENABLE_Xpetra:BOOL=ON \ + -D Xpetra_ENABLE_Epetra:BOOL=ON \ + -D Xpetra_ENABLE_EpetraExt:BOOL=ON \ + -D Xpetra_SHOW_DEPRECATED_WARNINGS:BOOL=OFF \ + -D Trilinos_ENABLE_Zoltan:BOOL=ON \ + -D Trilinos_ENABLE_Zoltan2:BOOL=ON \ + \ + -D Trilinos_MUST_FIND_ALL_TPL_LIBS=TRUE \ + -D TPL_ENABLE_DLlib:BOOL=OFF \ + -D TPL_ENABLE_Netcdf:BOOL=ON \ + -D TPL_ENABLE_MPI:BOOL=ON \ + -D TPL_ENABLE_MUMPS:BOOL=ON \ + -D TPL_ENABLE_ScaLAPACK:BOOL=ON \ + -D TPL_ENABLE_LAPACK:BOOL=ON \ + -D TPL_ENABLE_BLAS:BOOL=ON \ + -D TPL_ENABLE_ParMETIS:BOOL=ON \ + -D ParMETIS_INCLUDE_DIRS:PATH="/usr/include" \ + -D TPL_ENABLE_UMFPACK:BOOL=ON \ + -D UMFPACK_INCLUDE_DIRS:FILEPATH="/usr/include/suitesparse" \ + -D TPL_ENABLE_SuperLUDist:BOOL=ON \ + -D SuperLUDist_INCLUDE_DIRS:PATH="$INSTALL_DIR/../include" \ + -D SuperLUDist_LIBRARY_DIRS:PATH="$INSTALL_DIR/../lib" \ + \ + -D Trilinos_ENABLE_KokkosKernels=TRUE \ + -D KokkosKernels_ENABLE_TPL_BLAS=TRUE \ + -D KokkosKernels_ENABLE_TPL_LAPACK=TRUE \ + \ + -D Kokkos_ENABLE_SERIAL=TRUE \ + -D Tpetra_INST_SERIAL=TRUE \ + \ + -D Trilinos_ENABLE_CUDA=TRUE \ + -D TPL_ENABLE_CUDA=TRUE \ + -D Kokkos_ENABLE_CUDA=TRUE \ + -D Kokkos_ARCH_HOPPER90=TRUE \ + -D Kokkos_ENABLE_CUDA_CONSTEXPR=TRUE \ + -D Tpetra_INST_CUDA=FALSE \ + -D KokkosKernels_ENABLE_TPL_CUSOLVER=TRUE \ + \ + ../Trilinos + +make -j${NPROCS} install +cd .. 
+rm -rf Trilinos trilinos_build diff --git a/dependencies/trilinos_kokkosparallel/trilinos/install_openmp.sh b/dependencies/trilinos_kokkosparallel/trilinos/install_openmp.sh new file mode 100755 index 00000000000..fdf38127d0c --- /dev/null +++ b/dependencies/trilinos_kokkosparallel/trilinos/install_openmp.sh @@ -0,0 +1,125 @@ +#!/bin/bash +# This file is part of 4C multiphysics licensed under the +# GNU Lesser General Public License v3.0 or later. +# +# See the LICENSE.md file in the top-level for license information. +# +# SPDX-License-Identifier: LGPL-3.0-or-later + +# Install trilinos with the OpenMP backend enabled +# Call with +# ./install_openmp.sh /path/to/install/dir + +# Exit the script at the first failure +set -e + +INSTALL_DIR="${1:?Usage: ./install_openmp.sh /path/to/install/dir}" +# Number of procs for building (default 4) +NPROCS=${NPROCS:=4} +# git sha from Trilinos repository: +VERSION="f4d642715185dca1b94c91f434a2cf6db9f82014" +#CHECKSUM="" + + +# Location of script to apply patches later +SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +CMAKE_COMMAND=cmake + +git clone https://github.com/trilinos/Trilinos.git +cd Trilinos +git checkout $VERSION +cd ..
&& mkdir trilinos_build && cd trilinos_build + +MPI_DIR=/usr +MPI_BIN_DIR=$MPI_DIR/bin + +$CMAKE_COMMAND \ + -D CMAKE_BUILD_TYPE:STRING="RELEASE" \ + -D CMAKE_CXX_STANDARD:STRING="17" \ + -D CMAKE_CXX_COMPILER:FILEPATH="$MPI_BIN_DIR/mpic++" \ + -D CMAKE_C_COMPILER:FILEPATH="$MPI_BIN_DIR/mpicc" \ + -D CMAKE_Fortran_COMPILER:FILEPATH="$MPI_BIN_DIR/mpif90" \ + -D CMAKE_INSTALL_PREFIX:STRING="$INSTALL_DIR" \ + -D BUILD_SHARED_LIBS:BOOL=ON \ + \ + -D Trilinos_ENABLE_ALL_OPTIONAL_PACKAGES:BOOL=OFF \ + -D Trilinos_ENABLE_EXPLICIT_INSTANTIATION:BOOL=ON \ + -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF \ + -D Trilinos_ENABLE_TESTS:BOOL=OFF \ + -D Trilinos_ENABLE_EXAMPLES:BOOL=OFF \ + \ + -D Trilinos_ASSERT_MISSING_PACKAGES=OFF \ + -D Trilinos_ENABLE_Gtest:BOOL=OFF \ + -D Trilinos_ENABLE_Amesos:BOOL=ON \ + -D Amesos_SHOW_DEPRECATED_WARNINGS:BOOL=OFF \ + -D Trilinos_ENABLE_Amesos2:BOOL=ON \ + -D Trilinos_ENABLE_AztecOO:BOOL=ON \ + -D AztecOO_SHOW_DEPRECATED_WARNINGS:BOOL=OFF \ + -D Trilinos_ENABLE_Belos:BOOL=ON \ + -D Trilinos_ENABLE_Epetra:BOOL=ON \ + -D Epetra_SHOW_DEPRECATED_WARNINGS:BOOL=OFF \ + -D Trilinos_ENABLE_EpetraExt:BOOL=ON \ + -D EpetraExt_SHOW_DEPRECATED_WARNINGS:BOOL=OFF \ + -D Trilinos_ENABLE_Intrepid2:BOOL=ON \ + -D Trilinos_ENABLE_Ifpack:BOOL=ON \ + -D Ifpack_SHOW_DEPRECATED_WARNINGS:BOOL=OFF \ + -D Trilinos_ENABLE_Ifpack2:BOOL=ON \ + -D Trilinos_ENABLE_Kokkos:BOOL=ON \ + -D Trilinos_ENABLE_ML:BOOL=ON \ + -D ML_SHOW_DEPRECATED_WARNINGS:BOOL=OFF \ + -D Trilinos_ENABLE_MueLu:BOOL=ON \ + -D Trilinos_ENABLE_NOX:BOOL=ON \ + -D NOX_ENABLE_ABSTRACT_IMPLEMENTATION_EPETRA:BOOL=OFF \ + -D NOX_ENABLE_STRATIMIKOS_EPETRA_STACK:BOOL=OFF \ + -D Trilinos_ENABLE_Sacado:BOOL=ON \ + -D Trilinos_ENABLE_SEACASExodus:BOOL=ON \ + -D Trilinos_ENABLE_SEACASNemesis:BOOL=OFF \ + -D Trilinos_ENABLE_Shards:BOOL=ON \ + -D Trilinos_ENABLE_Stratimikos:BOOL=ON \ + -D Trilinos_ENABLE_Teko:BOOL=ON \ + -D Trilinos_ENABLE_Teuchos:BOOL=ON \ + -D Trilinos_ENABLE_Thyra:BOOL=ON \ + -D
Thyra_SHOW_DEPRECATED_WARNINGS:BOOL=OFF \ + -D Trilinos_ENABLE_ThyraEpetraAdapters:BOOL=ON \ + -D Trilinos_ENABLE_ThyraEpetraExtAdapters:BOOL=ON \ + -D Trilinos_ENABLE_Tpetra:BOOL=ON \ + -D Tpetra_INST_INT_INT:BOOL=ON \ + -D Trilinos_ENABLE_Xpetra:BOOL=ON \ + -D Xpetra_ENABLE_Epetra:BOOL=ON \ + -D Xpetra_ENABLE_EpetraExt:BOOL=ON \ + -D Xpetra_SHOW_DEPRECATED_WARNINGS:BOOL=OFF \ + -D Trilinos_ENABLE_Zoltan:BOOL=ON \ + -D Trilinos_ENABLE_Zoltan2:BOOL=ON \ + \ + -D Trilinos_MUST_FIND_ALL_TPL_LIBS=TRUE \ + -D TPL_ENABLE_DLlib:BOOL=OFF \ + -D TPL_ENABLE_Netcdf:BOOL=ON \ + -D TPL_ENABLE_MPI:BOOL=ON \ + -D TPL_ENABLE_MUMPS:BOOL=ON \ + -D TPL_ENABLE_ScaLAPACK:BOOL=ON \ + -D TPL_ENABLE_LAPACK:BOOL=ON \ + -D TPL_ENABLE_BLAS:BOOL=ON \ + -D TPL_ENABLE_ParMETIS:BOOL=ON \ + -D ParMETIS_INCLUDE_DIRS:PATH="/usr/include" \ + -D TPL_ENABLE_UMFPACK:BOOL=ON \ + -D UMFPACK_INCLUDE_DIRS:FILEPATH="/usr/include/suitesparse" \ + -D TPL_ENABLE_SuperLUDist:BOOL=ON \ + -D SuperLUDist_INCLUDE_DIRS:PATH="$INSTALL_DIR/../include" \ + -D SuperLUDist_LIBRARY_DIRS:PATH="$INSTALL_DIR/../lib" \ + \ + -D Trilinos_ENABLE_KokkosKernels=TRUE \ + -D KokkosKernels_ENABLE_TPL_BLAS=TRUE \ + -D KokkosKernels_ENABLE_TPL_LAPACK=TRUE \ + \ + -D Kokkos_ENABLE_SERIAL=TRUE \ + -D Tpetra_INST_SERIAL=TRUE \ + \ + -D Trilinos_ENABLE_OpenMP=TRUE \ + -D Kokkos_ENABLE_OPENMP=TRUE \ + -D Tpetra_INST_OPENMP=TRUE \ + \ + ../Trilinos + +make -j${NPROCS} install +cd .. +rm -rf Trilinos trilinos_build diff --git a/doc/documentation/src/installation/installation.rst b/doc/documentation/src/installation/installation.rst index a18c852b166..84fc7917823 100644 --- a/doc/documentation/src/installation/installation.rst +++ b/doc/documentation/src/installation/installation.rst @@ -608,7 +608,7 @@ This will install |FOURC| in the specified location. You can then use the instal .. 
_build4Cwithkokkoscuda: Building |FOURC| with OpenMP and CUDA support through Kokkos ------------------------------------------------- +------------------------------------------------------------ |FOURC| is primarily developed around MPI parallelism, but also offers the ability to use shared memory parallelism through `Kokkos _`, enabling hybrid parallelism on the CPU through OpenMP and GPU acceleration through CUDA. diff --git a/docker/trilinos_kokkosparallel/Dockerfile b/docker/trilinos_kokkosparallel/Dockerfile new file mode 100644 index 00000000000..5b5641247b6 --- /dev/null +++ b/docker/trilinos_kokkosparallel/Dockerfile @@ -0,0 +1,130 @@ +# This file is part of 4C multiphysics licensed under the +# GNU Lesser General Public License v3.0 or later. +# +# See the LICENSE.md file in the top-level for license information. +# +# SPDX-License-Identifier: LGPL-3.0-or-later + +ARG BASE_IMAGE=nvidia/cuda:12.8.1-devel-ubuntu24.04 +FROM ${BASE_IMAGE} +LABEL org.opencontainers.image.description="Image containing all the dependencies required for building and testing 4C" +LABEL org.4c-multiphysics.project=4C + +# Prevents tzdata asking for user feedback +ENV DEBIAN_FRONTEND=noninteractive + +USER root + +# Set locale information: region and timezone +RUN apt-get update && apt-get install -y --no-install-recommends \ + locales \ + && localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 \ + && locale-gen en_US.UTF-8 \ + && rm -rf /var/lib/apt/lists/* + +ENV LANG=en_US.UTF-8 +ENV LANGUAGE=en_US:en +ENV LC_ALL=en_US.UTF-8 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + ffmpeg \ + git \ + libglu1-mesa \ + python3 \ + sudo \ + unzip \ + vim \ + wget \ + && \ + apt-get update && apt-get install -y \ + doxygen \ + graphviz \ + texinfo \ + lcov \ + libblas-dev \ + libboost-all-dev \ + libcln-dev \ + libhdf5-dev \ + libhdf5-openmpi-dev \ + libnetcdf-dev \ + libfftw3-dev \ + lld \ + python3-venv \ + 
python-is-python3 \ + liblapack-dev \ + libopenmpi-dev \ + libparmetis-dev \ + libmetis-dev \ + libsuitesparse-dev \ + libmumps-dev \ + libscalapack-mpi-dev \ + libqhull-dev \ + mpi-default-dev \ + ninja-build \ + libyaml-dev \ + clang \ + clang-tidy \ + clang-tools \ + libomp-dev \ + libvtk9-dev \ + && rm -rf /var/lib/apt/lists/* + +# Create directory for dependencies +ARG NPROCS=12 +ENV NPROCS=$NPROCS \ + INSTALL_DIR="/opt/4C-dependencies" +RUN mkdir -p ${INSTALL_DIR} + +COPY dependencies /dependencies + +# Make `nvcc` available by adding the CUDA bin directory to PATH +ENV PATH=/usr/local/cuda/bin:${PATH} + +# Install cmake +RUN /dependencies/current/cmake/install.sh /usr/local + +# Install superLU_dist 7.2.0 +RUN /dependencies/current/superlu_dist/install.sh ${INSTALL_DIR} + +# Install Trilinos 2025.6 with Kokkos' CUDA backend enabled +RUN /dependencies/trilinos_kokkosparallel/trilinos/install_cuda.sh ${INSTALL_DIR}/tk_cuda + +# Install Trilinos 2025.6 with Kokkos' OpenMP backend enabled +RUN /dependencies/trilinos_kokkosparallel/trilinos/install_openmp.sh ${INSTALL_DIR}/tk_openmp +# Install deal.II (needs to happen after Trilinos and within the same installation directory) +RUN /dependencies/current/dealii/install.sh ${INSTALL_DIR}/tk_openmp + +# Install (optional) backtrace library +RUN /dependencies/current/backtrace/install.sh ${INSTALL_DIR} + +# Install (optional) gmsh library +RUN /dependencies/current/gmsh/install.sh ${INSTALL_DIR} + +# Packages for testing +# Installation directory for dependencies concerning testing +ENV FOUR_C_TESTING_DEPENDENCIES_DIR="/opt/4C-dependencies-testing/" +RUN mkdir ${FOUR_C_TESTING_DEPENDENCIES_DIR} + +# Install Mathjax +RUN /dependencies/testing/mathjax/install.sh ${FOUR_C_TESTING_DEPENDENCIES_DIR} + +# Add dependencies hash +# The label is added at the end because the label causes a cache miss +ARG DEPENDENCIES_HASH +LABEL org.4c-multiphysics.dependencies_hash="${DEPENDENCIES_HASH}" +ENV DEPENDENCIES_HASH=${DEPENDENCIES_HASH} + +# add
and enable the default user +ENV USER=user +RUN adduser --disabled-password --shell '/usr/bin/bash' --gecos '' $USER +RUN adduser $USER sudo; echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +# Make sure everything is in place +RUN chown -R $USER:$USER /home/$USER +USER $USER +ENV HOME=/home/$USER +ENV USER=$USER +# see https://github.com/open-mpi/ompi/issues/4948 +ENV OMPI_MCA_btl_vader_single_copy_mechanism=none +WORKDIR $HOME diff --git a/presets/docker/CMakePresets.json b/presets/docker/CMakePresets.json index e2380727571..5ee3398d84e 100644 --- a/presets/docker/CMakePresets.json +++ b/presets/docker/CMakePresets.json @@ -152,6 +152,33 @@ "FOUR_C_WITH_QHULL": "OFF", "FOUR_C_ENABLE_PYTHON_BINDINGS": "OFF" } + }, + { + "name": "docker_kokkoscuda_clangcuda", + "displayName": "Release build for CUDA-enabled Kokkos", + "description": "Release build using clangcuda++ compiler wrapper for CUDA-enabled Kokkos", + "inherits": [ + ".docker_base" + ], + "cacheVariables": { + "FOUR_C_CLANGCUDA": "ON", + "FOUR_C_TRILINOS_ROOT": "/opt/4C-dependencies/tk_cuda", + "FOUR_C_WITH_ARBORX": "OFF", + "FOUR_C_WITH_DEAL_II": "OFF", + "FOUR_C_ENABLE_METADATA_GENERATION": "OFF" + } + }, + { + "name": "docker_kokkosopenmp", + "displayName": "Release build for OpenMP-enabled Kokkos", + "description": "Release build for OpenMP-enabled Kokkos", + "inherits": [ + ".docker_base" + ], + "cacheVariables": { + "FOUR_C_TRILINOS_ROOT": "/opt/4C-dependencies/tk_openmp", + "FOUR_C_DEAL_II_ROOT": "/opt/4C-dependencies/tk_openmp" + } } ] }