From fe3982b51457282ddab02fded4b57a6ff73948f2 Mon Sep 17 00:00:00 2001 From: Jonas Schwab Date: Wed, 22 Apr 2026 18:12:59 +0200 Subject: [PATCH 1/2] Add configure target and sample jobscript for Helma CPU cluster. Also adds libfakeintel.so to spoof Intel CPUs for MKL. See e.g. https://danieldk.eu/Software/Misc/Intel-MKL-on-AMD-Zen --- .gitignore | 1 + Libraries/Modules/Makefile | 15 ++++- Libraries/Modules/fakeintel.c | 9 +++ Scripts_and_Parameters_files/JobfileFritz.sh | 2 +- Scripts_and_Parameters_files/JobfileHelma.sh | 63 +++++++++++++++++++ .../JobfileSuperMUC-NG.sh | 2 +- .../JobfileSuperMUC.sh | 6 +- configure.sh | 21 +++++++ 8 files changed, 112 insertions(+), 7 deletions(-) create mode 100644 Libraries/Modules/fakeintel.c create mode 100644 Scripts_and_Parameters_files/JobfileHelma.sh diff --git a/.gitignore b/.gitignore index 6c3064173..05a9eca0d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.o +*.so *.mod *.smod *.out diff --git a/Libraries/Modules/Makefile b/Libraries/Modules/Makefile index 2efaeb69f..6f094b9cf 100644 --- a/Libraries/Modules/Makefile +++ b/Libraries/Modules/Makefile @@ -29,7 +29,7 @@ SRCS = Mat_subroutines_mod.F90 \ DEPSFILE := .deps.mk DEPSGEN := ../../Prog/gen_deps.py -lib: $(LIB) +lib: $(LIB) libfakeintel.so # OBJS and MODS are auto-generated into $(DEPSFILE) by gen_deps.py -include $(DEPSFILE) @@ -56,11 +56,22 @@ lattices_armv8.4-a.so: runtime_error_mod.F90 lattices_interface_mod.F90 matrix_m %.o: %.F90 $(ALF_FC) -c $(ALF_FLAGS_MODULES) $< +libfakeintel.so: fakeintel.c + @for cc in gcc clang icc icx cc; do \ + if command -v $$cc > /dev/null 2>&1; then \ + $$cc -shared -fPIC -o libfakeintel.so fakeintel.c; \ + break; \ + fi; \ + done; \ + if [ ! -f libfakeintel.so ]; then \ + echo "No C compiler found, libfakeintel.so not created." 
1>&2; \ + fi + $(DEPSFILE): $(SRCS) $(DEPSGEN) python3 $(DEPSGEN) $(SRCS) > $(DEPSFILE) clean: $(DEPSFILE) - rm -f $(OBJS) $(MODS) $(LIB) $(DEPSFILE) + rm -f $(OBJS) $(MODS) $(LIB) $(DEPSFILE) libfakeintel.so #unused modules: histograms histograms_v2 log_mesh natural_constants #unused, except for in Prog/FFA_Orginals/: precdef diff --git a/Libraries/Modules/fakeintel.c b/Libraries/Modules/fakeintel.c new file mode 100644 index 000000000..cac01f633 --- /dev/null +++ b/Libraries/Modules/fakeintel.c @@ -0,0 +1,9 @@ +int mkl_serv_intel_cpu_true(void) { + return 1; +} + +typedef int (*fakeintel_fptr)(void); + +fakeintel_fptr mkl_serv_get_cpu_true(void) { + return &mkl_serv_intel_cpu_true; +} diff --git a/Scripts_and_Parameters_files/JobfileFritz.sh b/Scripts_and_Parameters_files/JobfileFritz.sh index 568cbde74..4898bb9a2 100644 --- a/Scripts_and_Parameters_files/JobfileFritz.sh +++ b/Scripts_and_Parameters_files/JobfileFritz.sh @@ -28,7 +28,7 @@ module load intel module load intelmpi module load mkl -# the follwing environment variables generate an optimal pinning (to the best of our knowledge) +# the following environment variables generate an optimal pinning (to the best of our knowledge) # This DOES NOT have to be adapted to the choice of Ntasks # FIRST EXCEPTION: If you chose to use hyper-threading (not recommended) you should set I_MPI_PIN_CELL=cpu # SECOND EXCEPTION: The following environment variables are Intel specific. diff --git a/Scripts_and_Parameters_files/JobfileHelma.sh b/Scripts_and_Parameters_files/JobfileHelma.sh new file mode 100644 index 000000000..e3a859703 --- /dev/null +++ b/Scripts_and_Parameters_files/JobfileHelma.sh @@ -0,0 +1,63 @@ +#!/bin/bash -l +# +# Sample jobscript for Helma CPU of NHR@FAU +# +# The following jobscript contains a few 'variables' marked by the ##...## pattern. +# The user has to provide the appropriate values, e.g. replace ##Nnodes## by 1 if the job is supposed to run on a single node. 
+# Most variables are self-explanatory; one exception might be Nthreads, which refers to the number of OpenMP threads per MPI task. +# In general, we found that ALF does not profit from hyper-threading, so we suggest using only physical cores. +# +# On Helma, a single node has 384 cores. +# There are two partitions, "cpu" and "preempt_cpu". +# -"cpu": Single-node and multi-node jobs are possible. +# Multi-node jobs always get all 384 cores of each node, +# while single-node jobs can request cores in multiples of 48 (one NUMA domain). +# -"preempt_cpu": +# Only single node, but an arbitrary number of cores is allowed. +# The walltime is up to 48 hours, but jobs might get cancelled after 2 hours +# to make room for jobs in the "cpu" partition (with a grace time as in the GPU preempt partition). +# +# For single node jobs, the number of cores is ##NtaskPnode## * ##Nthreads##. +# +#SBATCH --job-name ##NAME## +#SBATCH --output=out.%j.log +#SBATCH --error=err.%j.log +#Notification and type +#SBATCH --mail-type=ALL +#SBATCH --mail-user=##EMAIL## +# Wall clock limit (HH:MM:SS): +#SBATCH --time=##TIME## +#SBATCH --no-requeue +#Setup of execution environment +#SBATCH --export=NONE + +#available partitions: cpu, preempt_cpu +#SBATCH --partition=##PARTITION## +#SBATCH --nodes=##Nnodes## +#SBATCH --ntasks-per-node=##NtaskPnode## +#SBATCH --cpus-per-task=##Nthreads## + +unset SLURM_EXPORT_ENV +module --force switch gpu-env/2025 cpu-env/2026 +module load intel/2025.3.1 +module load intelmpi/2021.17.0 +module load mkl/2024.2.2 + +# the following environment variables generate an optimal pinning (to the best of our knowledge) +# This DOES NOT have to be adapted to the choice of Ntasks +# FIRST EXCEPTION: If you chose to use hyper-threading (not recommended) you should set I_MPI_PIN_CELL=cpu +# SECOND EXCEPTION: The following environment variables are Intel specific.
+#export KMP_AFFINITY=verbose,granularity=fine,compact +export KMP_AFFINITY=granularity=fine,compact +export I_MPI_PIN_CELL=core +export I_MPI_PIN_DOMAIN=auto:cache +export I_MPI_PIN_ORDER=scatter + +export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK + + # Uncomment the following line to speed up MKL by making it believe it runs on an Intel CPU. + # After `make`, `libfakeintel.so` is expected under `Libraries/Modules/`; use its absolute path here, e.g.: + # export LD_PRELOAD=/absolute/path/to/ALF/Libraries/Modules/libfakeintel.so + +bash ./out_to_in.sh +mpiexec -n $SLURM_NTASKS ##EXECUTABLE## diff --git a/Scripts_and_Parameters_files/JobfileSuperMUC-NG.sh b/Scripts_and_Parameters_files/JobfileSuperMUC-NG.sh index 8a028bae6..51e85838f 100644 --- a/Scripts_and_Parameters_files/JobfileSuperMUC-NG.sh +++ b/Scripts_and_Parameters_files/JobfileSuperMUC-NG.sh @@ -35,7 +35,7 @@ module load slurm_setup module load hdf5/1.10.7-intel21 -# the follwing environment variables generate an optimal pinning (to the best of our knowledge) +# the following environment variables generate an optimal pinning (to the best of our knowledge) # This DOES NOT have to be adapted to the choice of Ntasks # FIRST EXCEPTION: If you chose to use hyper-threading (not recommended) you should set I_MPI_PIN_CELL=cpu # SECOND EXCEPTION: The following environment variables are Intel specific. 
diff --git a/Scripts_and_Parameters_files/JobfileSuperMUC.sh b/Scripts_and_Parameters_files/JobfileSuperMUC.sh index 277ae0ad9..722f65007 100644 --- a/Scripts_and_Parameters_files/JobfileSuperMUC.sh +++ b/Scripts_and_Parameters_files/JobfileSuperMUC.sh @@ -34,10 +34,10 @@ module switch mkl mkl/2018 export OMP_NUM_THREADS=##Nthreads## -# the follwing eviroment variables generate an optimal pinning (to the best of our knowledge) -# This DOES NOT have to be addepted to the choice of Ntasks +# the following environment variables generate an optimal pinning (to the best of our knowledge) +# This DOES NOT have to be adapted to the choice of Ntasks # FIRST EXCEPTION: If you chose to use hyperthreading (not recommended) you should set I_MPI_PIN_CELL=cpu -# SECOND EXCEPTION: The following enviroment variables are Intel specific. +# SECOND EXCEPTION: The following environment variables are Intel specific. export KMP_AFFINITY=verbose,granularity=fine,compact export I_MPI_PIN_CELL=core export I_MPI_PIN_DOMAIN=auto:cache3 diff --git a/configure.sh b/configure.sh index b48615754..d501a1f76 100755 --- a/configure.sh +++ b/configure.sh @@ -12,6 +12,7 @@ Please choose one of the following MACHINEs: * SuperMUC-NG * JUWELS * FRITZ + * HELMA Possible MODEs are: * MPI (default) * noMPI @@ -458,6 +459,26 @@ case $MACHINE in set_hdf5_flags "$INTELCC" ifort "$INTELCXX" || return 1 fi ;; + + + #NHR@FAU Helma CPU cluster + HELMA) + module --force switch gpu-env/2025 cpu-env/2026 + module load intel/2025.3.1 + module load intelmpi/2021.17.0 + module load mkl/2024.2.2 + + F90OPTFLAGS="$INTELLLVMOPTFLAGS" + F90USEFULFLAGS="$INTELLLVMUSEFULFLAGS" + ALF_FC="$INTELLLVMCOMPILER" + find_mkl_flag || return 1 + LIB_BLAS_LAPACK="${INTELMKL}" + if [ "${HDF5_ENABLED}" = "1" ]; then + set_intelcc + set_intelcxx + set_hdf5_flags "$INTELCC" ifx "$INTELCXX" || return 1 + fi + ;; #Default (unknown machine) *) if [ "$NO_FALLBACK" = "1" ]; then From 86b8c14f163887c82211b43a02abb416961e3a59 Mon Sep 17 00:00:00 
2001 From: Jonas Schwab Date: Fri, 24 Apr 2026 15:41:24 +0200 Subject: [PATCH 2/2] Add comment to fakeintel.c --- Libraries/Modules/fakeintel.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Libraries/Modules/fakeintel.c b/Libraries/Modules/fakeintel.c index cac01f633..329bc7a4b 100644 --- a/Libraries/Modules/fakeintel.c +++ b/Libraries/Modules/fakeintel.c @@ -1,3 +1,10 @@ +// This file provides fake implementations of the MKL functions mkl_serv_intel_cpu_true and mkl_serv_get_cpu_true, +// which are used by MKL to determine if the CPU is an Intel CPU. +// Preloading the resulting library tricks MKL into thinking that it is running on an Intel CPU, +// which allows it to use the optimized code paths for Intel CPUs, even when running on an AMD CPU. +// This has been shown to improve performance on AMD CPUs significantly, especially on AMD Zen CPUs. +// See: https://danieldk.eu/Software/Misc/Intel-MKL-on-AMD-Zen + int mkl_serv_intel_cpu_true(void) { return 1; }