From fe3982b51457282ddab02fded4b57a6ff73948f2 Mon Sep 17 00:00:00 2001
From: Jonas Schwab <jonas.schwab@uni-wuerzburg.de>
Date: Wed, 22 Apr 2026 18:12:59 +0200
Subject: [PATCH 1/2] Add configure target and sample jobscript for Helma CPU
 cluster.

Also adds libfakeintel.so to spoof Intel CPUs for MKL. See e.g. https://danieldk.eu/Software/Misc/Intel-MKL-on-AMD-Zen
---
 .gitignore                                    |  1 +
 Libraries/Modules/Makefile                    | 15 ++++-
 Libraries/Modules/fakeintel.c                 |  9 +++
 Scripts_and_Parameters_files/JobfileFritz.sh  |  2 +-
 Scripts_and_Parameters_files/JobfileHelma.sh  | 63 +++++++++++++++++++
 .../JobfileSuperMUC-NG.sh                     |  2 +-
 .../JobfileSuperMUC.sh                        |  6 +-
 configure.sh                                  | 21 +++++++
 8 files changed, 112 insertions(+), 7 deletions(-)
 create mode 100644 Libraries/Modules/fakeintel.c
 create mode 100644 Scripts_and_Parameters_files/JobfileHelma.sh

diff --git a/.gitignore b/.gitignore
index 6c3064173..05a9eca0d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 *.o
+*.so
 *.mod
 *.smod
 *.out
diff --git a/Libraries/Modules/Makefile b/Libraries/Modules/Makefile
index 2efaeb69f..6f094b9cf 100644
--- a/Libraries/Modules/Makefile
+++ b/Libraries/Modules/Makefile
@@ -29,7 +29,7 @@ SRCS = Mat_subroutines_mod.F90 \
 DEPSFILE := .deps.mk
 DEPSGEN := ../../Prog/gen_deps.py
 
-lib: $(LIB)
+lib: $(LIB) libfakeintel.so
 
 # OBJS and MODS are auto-generated into $(DEPSFILE) by gen_deps.py
 -include $(DEPSFILE)
@@ -56,11 +56,22 @@ lattices_armv8.4-a.so: runtime_error_mod.F90 lattices_interface_mod.F90 matrix_m
 %.o: %.F90
 	$(ALF_FC) -c $(ALF_FLAGS_MODULES) $<
 
+libfakeintel.so: fakeintel.c  # optional MKL shim; built best-effort with the first C compiler found
+	@found=""; for cc in gcc clang icc icx cc; do \
+		if command -v $$cc > /dev/null 2>&1; then \
+			found=$$cc; $$cc -shared -fPIC -o $@ $< || echo "$$cc failed to build $@." 1>&2; \
+			break; \
+		fi; \
+	done; \
+	if [ -z "$$found" ]; then \
+		echo "No C compiler found, libfakeintel.so not created." 1>&2; \
+	fi
+
 $(DEPSFILE): $(SRCS) $(DEPSGEN)
 	python3 $(DEPSGEN) $(SRCS) > $(DEPSFILE)
 
 clean: $(DEPSFILE)
-	rm -f $(OBJS) $(MODS) $(LIB) $(DEPSFILE)
+	rm -f $(OBJS) $(MODS) $(LIB) $(DEPSFILE) libfakeintel.so
 
 #unused modules: histograms histograms_v2 log_mesh natural_constants
 #unused, except for in Prog/FFA_Orginals/: precdef
diff --git a/Libraries/Modules/fakeintel.c b/Libraries/Modules/fakeintel.c
new file mode 100644
index 000000000..cac01f633
--- /dev/null
+++ b/Libraries/Modules/fakeintel.c
@@ -0,0 +1,9 @@
+int mkl_serv_intel_cpu_true(void) {
+  return 1;
+}
+ 
+typedef int (*fakeintel_fptr)(void);
+ 
+fakeintel_fptr mkl_serv_get_cpu_true(void) {
+  return &mkl_serv_intel_cpu_true;
+}
diff --git a/Scripts_and_Parameters_files/JobfileFritz.sh b/Scripts_and_Parameters_files/JobfileFritz.sh
index 568cbde74..4898bb9a2 100644
--- a/Scripts_and_Parameters_files/JobfileFritz.sh
+++ b/Scripts_and_Parameters_files/JobfileFritz.sh
@@ -28,7 +28,7 @@ module load intel
 module load intelmpi
 module load mkl
 
-# the follwing environment variables generate an optimal pinning (to the best of our knowledge)
+# the following environment variables generate an optimal pinning (to the best of our knowledge)
 # This DOES NOT have to be adapted to the choice of Ntasks
 # FIRST EXCEPTION: If you chose to use hyper-threading (not recommended) you should set I_MPI_PIN_CELL=cpu
 # SECOND EXCEPTION: The following environment variables are Intel specific.
diff --git a/Scripts_and_Parameters_files/JobfileHelma.sh b/Scripts_and_Parameters_files/JobfileHelma.sh
new file mode 100644
index 000000000..e3a859703
--- /dev/null
+++ b/Scripts_and_Parameters_files/JobfileHelma.sh
@@ -0,0 +1,63 @@
+#!/bin/bash -l
+#
+# Sample jobscript for Helma CPU of NHR@FAU
+#
+# The following jobscript contains a few 'variables' marked by the ##...## pattern.
+# The user has to provide the appropriate values, e.g. replace ##Nnodes## by 1 if the job is supposed to run on a single node.
+# Most variables are self-explanatory; one exception might be Nthreads, which refers to the number of OpenMP threads per MPI task.
+# In general we found that ALF does not profit from hyper-threading, so we suggest using only physical cores.
+#
+# On Helma, a single node has 384 cores.
+# There are two partitions, "cpu" and "preempt_cpu".
+# -"cpu": Single-node and multi-node jobs are possible.
+#         Multi-node jobs always get all 384 cores of each node,
+#         while single-node jobs can request cores in multiples of 48 (one NUMA domain).
+# -"preempt_cpu":
+#     Only single node, but an arbitrary number of cores is allowed.
+#     The walltime is up to 48 hours, but jobs might get cancelled after 2 hours
+#     to make room for jobs in the "cpu" partition (with a grace time as in the GPU preempt partition).
+#
+# For single node jobs, the number of cores is ##NtaskPnode## * ##Nthreads##.
+#
+#SBATCH --job-name ##NAME##
+#SBATCH --output=out.%j.log
+#SBATCH --error=err.%j.log
+#Notification and type
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=##EMAIL##
+# Wall clock limit (HH:MM:SS):
+#SBATCH --time=##TIME##
+#SBATCH --no-requeue
+#Setup of execution environment
+#SBATCH --export=NONE
+
+#available partitions: cpu, preempt_cpu
+#SBATCH --partition=##PARTITION##
+#SBATCH --nodes=##Nnodes##
+#SBATCH --ntasks-per-node=##NtaskPnode##
+#SBATCH --cpus-per-task=##Nthreads##
+
+unset SLURM_EXPORT_ENV
+module --force switch gpu-env/2025 cpu-env/2026
+module load intel/2025.3.1
+module load intelmpi/2021.17.0
+module load mkl/2024.2.2
+
+# the following environment variables generate an optimal pinning (to the best of our knowledge)
+# This DOES NOT have to be adapted to the choice of Ntasks
+# FIRST EXCEPTION: If you choose to use hyper-threading (not recommended) you should set I_MPI_PIN_CELL=cpu
+# SECOND EXCEPTION: The following environment variables are Intel specific.
+#export KMP_AFFINITY=verbose,granularity=fine,compact
+export KMP_AFFINITY=granularity=fine,compact
+export I_MPI_PIN_CELL=core
+export I_MPI_PIN_DOMAIN=auto:cache
+export I_MPI_PIN_ORDER=scatter
+
+export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
+
+# Uncomment the following line to speed up MKL by making it believe it runs on an Intel CPU.
+# After `make`, `libfakeintel.so` is expected under `Libraries/Modules/`; use its absolute path here, e.g.:
+# export LD_PRELOAD=/absolute/path/to/ALF/Libraries/Modules/libfakeintel.so
+
+bash ./out_to_in.sh
+mpiexec -n $SLURM_NTASKS ##EXECUTABLE##
diff --git a/Scripts_and_Parameters_files/JobfileSuperMUC-NG.sh b/Scripts_and_Parameters_files/JobfileSuperMUC-NG.sh
index 8a028bae6..51e85838f 100644
--- a/Scripts_and_Parameters_files/JobfileSuperMUC-NG.sh
+++ b/Scripts_and_Parameters_files/JobfileSuperMUC-NG.sh
@@ -35,7 +35,7 @@ module load slurm_setup
 
 module load hdf5/1.10.7-intel21
 
-# the follwing environment variables generate an optimal pinning (to the best of our knowledge)
+# the following environment variables generate an optimal pinning (to the best of our knowledge)
 # This DOES NOT have to be adapted to the choice of Ntasks
 # FIRST EXCEPTION: If you chose to use hyper-threading (not recommended) you should set I_MPI_PIN_CELL=cpu
 # SECOND EXCEPTION: The following environment variables are Intel specific.
diff --git a/Scripts_and_Parameters_files/JobfileSuperMUC.sh b/Scripts_and_Parameters_files/JobfileSuperMUC.sh
index 277ae0ad9..722f65007 100644
--- a/Scripts_and_Parameters_files/JobfileSuperMUC.sh
+++ b/Scripts_and_Parameters_files/JobfileSuperMUC.sh
@@ -34,10 +34,10 @@ module switch mkl mkl/2018
 
 export OMP_NUM_THREADS=##Nthreads##
 
-# the follwing eviroment variables generate an optimal pinning (to the best of our knowledge)
-# This DOES NOT have to be addepted to the choice of Ntasks
+# the following environment variables generate an optimal pinning (to the best of our knowledge)
+# This DOES NOT have to be adapted to the choice of Ntasks
 # FIRST EXCEPTION: If you chose to use hyperthreading (not recommended) you should set I_MPI_PIN_CELL=cpu
-# SECOND EXCEPTION: The following enviroment variables are Intel specific.
+# SECOND EXCEPTION: The following environment variables are Intel specific.
 export KMP_AFFINITY=verbose,granularity=fine,compact
 export I_MPI_PIN_CELL=core
 export I_MPI_PIN_DOMAIN=auto:cache3
diff --git a/configure.sh b/configure.sh
index b48615754..d501a1f76 100755
--- a/configure.sh
+++ b/configure.sh
@@ -12,6 +12,7 @@ Please choose one of the following MACHINEs:
  * SuperMUC-NG
  * JUWELS
  * FRITZ
+ * HELMA
 Possible MODEs are:
  * MPI (default)
  * noMPI
@@ -458,6 +459,26 @@ case $MACHINE in
       set_hdf5_flags "$INTELCC" ifort "$INTELCXX" || return 1
     fi
   ;;
+
+
+  #NHR@FAU Helma CPU cluster
+  HELMA)
+    module --force switch gpu-env/2025 cpu-env/2026
+    module load intel/2025.3.1
+    module load intelmpi/2021.17.0
+    module load mkl/2024.2.2
+
+    F90OPTFLAGS="$INTELLLVMOPTFLAGS"
+    F90USEFULFLAGS="$INTELLLVMUSEFULFLAGS"
+    ALF_FC="$INTELLLVMCOMPILER"
+    find_mkl_flag || return 1
+    LIB_BLAS_LAPACK="${INTELMKL}"
+    if [ "${HDF5_ENABLED}" = "1" ]; then
+      set_intelcc
+      set_intelcxx
+      set_hdf5_flags "$INTELCC" ifx "$INTELCXX" || return 1
+    fi
+  ;;
   #Default (unknown machine)
   *)
     if [ "$NO_FALLBACK" = "1" ]; then

From 86b8c14f163887c82211b43a02abb416961e3a59 Mon Sep 17 00:00:00 2001
From: Jonas Schwab <jonas.schwab@uni-wuerzburg.de>
Date: Fri, 24 Apr 2026 15:41:24 +0200
Subject: [PATCH 2/2] Add comment to fakeintel.c

---
 Libraries/Modules/fakeintel.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Libraries/Modules/fakeintel.c b/Libraries/Modules/fakeintel.c
index cac01f633..329bc7a4b 100644
--- a/Libraries/Modules/fakeintel.c
+++ b/Libraries/Modules/fakeintel.c
@@ -1,3 +1,10 @@
+// This file is a fake implementation of the MKL function mkl_serv_get_cpu_true,
+// which is used by MKL to determine if the CPU is an Intel CPU.
+// This is used to trick MKL into thinking that it is running on an Intel CPU,
+// which allows it to use the optimized code paths for Intel CPUs, even when running on an AMD CPU.
+// This has been shown to improve performance on AMD CPUs significantly, especially on AMD Zen CPUs.
+// See: https://danieldk.eu/Software/Misc/Intel-MKL-on-AMD-Zen
+
 int mkl_serv_intel_cpu_true(void) {
   return 1;
 }