From 48d32edfcd308ecad3ad4a2a4ea65d5c2b43c418 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Tue, 8 Apr 2025 19:45:16 -0700 Subject: [PATCH 1/9] riscv: module: Optimize PLT/GOT entry counting perf reports that 99.63% of the cycles from `modprobe amdgpu` are spent inside module_frob_arch_sections(). This is because amdgpu.ko contains about 300000 relocations in its .rela.text section, and the algorithm in count_max_entries() takes quadratic time. Apply two optimizations from the arm64 code, which together reduce the total execution time by 99.57%. First, sort the relocations so duplicate entries are adjacent. Second, reduce the number of relocations that must be sorted by filtering to only relocations that need PLT/GOT entries, as done in commit d4e0340919fb ("arm64/module: Optimize module load time by optimizing PLT counting"). Unlike the arm64 code, here the filtering and sorting is done in a scratch buffer, because the HI20 relocation search optimization in apply_relocate_add() depends on the original order of the relocations. Signed-off-by: Samuel Holland --- arch/riscv/kernel/module-sections.c | 66 +++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 9 deletions(-) diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c index e264e59e596e8..91d4f0fbd0af0 100644 --- a/arch/riscv/kernel/module-sections.c +++ b/arch/riscv/kernel/module-sections.c @@ -9,6 +9,7 @@ #include #include #include +#include unsigned long module_emit_got_entry(struct module *mod, unsigned long val) { @@ -55,19 +56,27 @@ unsigned long module_emit_plt_entry(struct module *mod, unsigned long val) return (unsigned long)&plt[i]; } -static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y) +#define cmp_3way(a, b) ((a) < (b) ? 
-1 : (a) > (b)) + +static int cmp_rela(const void *a, const void *b) { - return x->r_info == y->r_info && x->r_addend == y->r_addend; + const Elf_Rela *x = a, *y = b; + int i; + + /* sort by type, symbol index and addend */ + i = cmp_3way(x->r_info, y->r_info); + if (i == 0) + i = cmp_3way(x->r_addend, y->r_addend); + return i; } static bool duplicate_rela(const Elf_Rela *rela, int idx) { - int i; - for (i = 0; i < idx; i++) { - if (is_rela_equal(&rela[i], &rela[idx])) - return true; - } - return false; + /* + * Entries are sorted by type, symbol index and addend. That means + * that, if a duplicate entry exists, it must be in the preceding slot. + */ + return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0; } static void count_max_entries(Elf_Rela *relas, int num, @@ -87,11 +96,33 @@ static void count_max_entries(Elf_Rela *relas, int num, } } +static bool rela_needs_plt_got(const Elf_Rela *rela) +{ + unsigned int type = ELF_R_TYPE(rela->r_info); + + return type == R_RISCV_CALL_PLT || type == R_RISCV_GOT_HI20; +} + +/* Copy PLT and GOT relas to the scratch array. */ +static unsigned int partition_plt_got_relas(const Elf_Rela *relas, Elf_Rela *scratch, + unsigned int num_rela) +{ + int j = 0; + + for (int i = 0; i < num_rela; i++) + if (rela_needs_plt_got(&relas[i])) + scratch[j++] = relas[i]; + + return j; +} + int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod) { unsigned int num_plts = 0; unsigned int num_gots = 0; + Elf_Rela *scratch = NULL; + size_t scratch_size = 0; int i; /* @@ -132,9 +163,26 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, if (!(dst_sec->sh_flags & SHF_EXECINSTR)) continue; - count_max_entries(relas, num_rela, &num_plts, &num_gots); + /* + * apply_relocate_add() relies on HI20 and LO12 relocation pairs being + * close together, so sort a copy of the section to avoid interfering. 
+ */ + if (sechdrs[i].sh_size > scratch_size) { + scratch_size = sechdrs[i].sh_size; + scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL); + if (!scratch) + return -ENOMEM; + } + + /* sort relocations requiring a PLT or GOT entry so duplicates are adjacent */ + num_rela = partition_plt_got_relas(relas, scratch, num_rela); + sort(scratch, num_rela, sizeof(Elf_Rela), cmp_rela, NULL); + count_max_entries(scratch, num_rela, &num_plts, &num_gots); } + if (scratch) + kvfree(scratch); + mod->arch.plt.shdr->sh_type = SHT_NOBITS; mod->arch.plt.shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC; mod->arch.plt.shdr->sh_addralign = L1_CACHE_BYTES; From 064b89d0d78f620a3b94334b41ffa6a35c752189 Mon Sep 17 00:00:00 2001 From: Pritesh Patel Date: Wed, 9 Apr 2025 07:15:16 +0000 Subject: [PATCH 2/9] riscv: module: fix compilation error of kvrealloc Backporting the patch "riscv: module: Optimize PLT/GOT entry counting" to kernel 6.6.77 leads to compilation error. Error below: arch/riscv/kernel/module-sections.c: In function 'module_frob_arch_sections': arch/riscv/kernel/module-sections.c:172:35: error: too few arguments to function 'kvrealloc' 172 | scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL); | ^~~~~~~~~ Hence, adding the changes to resolve this compilation error. Link: https://lore.kernel.org/linux-riscv/20250409024519.454828-1-samuel.holland@sifive.com Signed-off-by: Pritesh Patel --- arch/riscv/kernel/module-sections.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c index 91d4f0fbd0af0..a191ba7f8892a 100644 --- a/arch/riscv/kernel/module-sections.c +++ b/arch/riscv/kernel/module-sections.c @@ -168,10 +168,10 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, * close together, so sort a copy of the section to avoid interfering. 
*/ if (sechdrs[i].sh_size > scratch_size) { - scratch_size = sechdrs[i].sh_size; - scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL); + scratch = kvrealloc(scratch, scratch_size, sechdrs[i].sh_size, GFP_KERNEL); if (!scratch) return -ENOMEM; + scratch_size = sechdrs[i].sh_size; } /* sort relocations requiring a PLT or GOT entry so duplicates are adjacent */ From 6d5d67b6c284c8a7ee6cd89873c042c432d5de35 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Fri, 29 Mar 2024 00:18:16 -0700 Subject: [PATCH 3/9] arch: add ARCH_HAS_KERNEL_FPU_SUPPORT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several architectures provide an API to enable the FPU and run floating-point SIMD code in kernel space. However, the function names, header locations, and semantics are inconsistent across architectures, and FPU support may be gated behind other Kconfig options. Provide a standard way for architectures to declare that kernel space FPU support is available. Architectures selecting this option must implement what is currently the most common API (kernel_fpu_begin() and kernel_fpu_end(), plus a new function kernel_fpu_available()) and provide the appropriate CFLAGS for compiling floating-point C code. 
Link: https://lkml.kernel.org/r/20240329072441.591471-2-samuel.holland@sifive.com Signed-off-by: Samuel Holland Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Acked-by: Christian König Cc: Alex Deucher Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: Huacai Chen Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Russell King Cc: Thomas Gleixner Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Han Gao --- Documentation/core-api/floating-point.rst | 78 +++++++++++++++++++++++ Documentation/core-api/index.rst | 1 + Makefile | 5 ++ arch/Kconfig | 6 ++ include/linux/fpu.h | 12 ++++ 5 files changed, 102 insertions(+) create mode 100644 Documentation/core-api/floating-point.rst create mode 100644 include/linux/fpu.h diff --git a/Documentation/core-api/floating-point.rst b/Documentation/core-api/floating-point.rst new file mode 100644 index 0000000000000..a8d0d4b050529 --- /dev/null +++ b/Documentation/core-api/floating-point.rst @@ -0,0 +1,78 @@ +.. SPDX-License-Identifier: GPL-2.0+ + +Floating-point API +================== + +Kernel code is normally prohibited from using floating-point (FP) registers or +instructions, including the C float and double data types. This rule reduces +system call overhead, because the kernel does not need to save and restore the +userspace floating-point register state. + +However, occasionally drivers or library functions may need to include FP code. +This is supported by isolating the functions containing FP code to a separate +translation unit (a separate source file), and saving/restoring the FP register +state around calls to those functions. This creates "critical sections" of +floating-point usage. + +The reason for this isolation is to prevent the compiler from generating code +touching the FP registers outside these critical sections. 
Compilers sometimes +use FP registers to optimize inlined ``memcpy`` or variable assignment, as +floating-point registers may be wider than general-purpose registers. + +Usability of floating-point code within the kernel is architecture-specific. +Additionally, because a single kernel may be configured to support platforms +both with and without a floating-point unit, FPU availability must be checked +both at build time and at run time. + +Several architectures implement the generic kernel floating-point API from +``linux/fpu.h``, as described below. Some other architectures implement their +own unique APIs, which are documented separately. + +Build-time API +-------------- + +Floating-point code may be built if the option ``ARCH_HAS_KERNEL_FPU_SUPPORT`` +is enabled. For C code, such code must be placed in a separate file, and that +file must have its compilation flags adjusted using the following pattern:: + + CFLAGS_foo.o += $(CC_FLAGS_FPU) + CFLAGS_REMOVE_foo.o += $(CC_FLAGS_NO_FPU) + +Architectures are expected to define one or both of these variables in their +top-level Makefile as needed. For example:: + + CC_FLAGS_FPU := -mhard-float + +or:: + + CC_FLAGS_NO_FPU := -msoft-float + +Normal kernel code is assumed to use the equivalent of ``CC_FLAGS_NO_FPU``. + +Runtime API +----------- + +The runtime API is provided in ``linux/fpu.h``. This header cannot be included +from files implementing FP code (those with their compilation flags adjusted as +above). Instead, it must be included when defining the FP critical sections. + +.. c:function:: bool kernel_fpu_available( void ) + + This function reports if floating-point code can be used on this CPU or + platform. The value returned by this function is not expected to change + at runtime, so it only needs to be called once, not before every + critical section. + +.. c:function:: void kernel_fpu_begin( void ) + void kernel_fpu_end( void ) + + These functions create a floating-point critical section. 
It is only + valid to call ``kernel_fpu_begin()`` after a previous call to + ``kernel_fpu_available()`` returned ``true``. These functions are only + guaranteed to be callable from (preemptible or non-preemptible) process + context. + + Preemption may be disabled inside critical sections, so their size + should be minimized. They are *not* required to be reentrant. If the + caller expects to nest critical sections, it must implement its own + reference counting. diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index 7a3a08d81f111..974beccd671f6 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -48,6 +48,7 @@ Library functionality that is used throughout the kernel. errseq wrappers/atomic_t wrappers/atomic_bitops + floating-point Low level entry and exit ======================== diff --git a/Makefile b/Makefile index b7198af9e59b4..c8993b8dd6056 100644 --- a/Makefile +++ b/Makefile @@ -981,6 +981,11 @@ KBUILD_CFLAGS += $(CC_FLAGS_CFI) export CC_FLAGS_CFI endif +# Architectures can define flags to add/remove for floating-point support +CC_FLAGS_FPU += -D_LINUX_FPU_COMPILATION_UNIT +export CC_FLAGS_FPU +export CC_FLAGS_NO_FPU + ifneq ($(CONFIG_FUNCTION_ALIGNMENT),0) KBUILD_CFLAGS += -falign-functions=$(CONFIG_FUNCTION_ALIGNMENT) endif diff --git a/arch/Kconfig b/arch/Kconfig index 20c2c93d2c889..603eb47c6268e 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1480,6 +1480,12 @@ config ARCH_HAS_NONLEAF_PMD_YOUNG address translations. Page table walkers that clear the accessed bit may use this capability to reduce their search space. +config ARCH_HAS_KERNEL_FPU_SUPPORT + bool + help + Architectures that select this option can run floating-point code in + the kernel, as described in Documentation/core-api/floating-point.rst. 
+ source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/include/linux/fpu.h b/include/linux/fpu.h new file mode 100644 index 0000000000000..2fb63e22913bc --- /dev/null +++ b/include/linux/fpu.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_FPU_H +#define _LINUX_FPU_H + +#ifdef _LINUX_FPU_COMPILATION_UNIT +#error FP code must be compiled separately. See Documentation/core-api/floating-point.rst. +#endif + +#include + +#endif From 865b22737b4211074567dd6a716aac9c22c1eabd Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Fri, 29 Mar 2024 00:18:26 -0700 Subject: [PATCH 4/9] riscv: add support for kernel-mode FPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is motivated by the amdgpu DRM driver, which needs floating-point code to support recent hardware. That code is not performance-critical, so only provide a minimal non-preemptible implementation for now. Support is limited to riscv64 because riscv32 requires runtime (libgcc) assistance to convert between doubles and 64-bit integers. 
Link: https://lkml.kernel.org/r/20240329072441.591471-12-samuel.holland@sifive.com Signed-off-by: Samuel Holland Acked-by: Palmer Dabbelt Reviewed-by: Palmer Dabbelt Reviewed-by: Christoph Hellwig Acked-by: Christian König Cc: Alex Deucher Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: Huacai Chen Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Russell King Cc: Thomas Gleixner Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Han Gao --- arch/riscv/Kconfig | 1 + arch/riscv/Makefile | 3 +++ arch/riscv/include/asm/fpu.h | 16 ++++++++++++++++ arch/riscv/kernel/Makefile | 1 + arch/riscv/kernel/kernel_mode_fpu.c | 28 ++++++++++++++++++++++++++++ 5 files changed, 49 insertions(+) create mode 100644 arch/riscv/include/asm/fpu.h create mode 100644 arch/riscv/kernel/kernel_mode_fpu.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 443f7566c9d10..eacbc4efe60cd 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -29,6 +29,7 @@ config RISCV select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_KERNEL_FPU_SUPPORT if 64BIT && FPU select ARCH_HAS_MMIOWB select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PMEM_API diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index b43a6bb7e4dcb..aaf7b9bd2275f 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -77,6 +77,9 @@ KBUILD_CFLAGS += -march=$(shell echo $(riscv-march-y) | sed -E 's/(rv32ima|rv64i KBUILD_AFLAGS += -march=$(riscv-march-y) +# For C code built with floating-point support, exclude V but keep F and D. 
+CC_FLAGS_FPU := -march=$(shell echo $(riscv-march-y) | sed -E 's/(rv32ima|rv64ima)([^v_]*)v?/\1\2/') + KBUILD_CFLAGS += -mno-save-restore KBUILD_CFLAGS += -DCONFIG_PAGE_OFFSET=$(CONFIG_PAGE_OFFSET) diff --git a/arch/riscv/include/asm/fpu.h b/arch/riscv/include/asm/fpu.h new file mode 100644 index 0000000000000..91c04c244e12d --- /dev/null +++ b/arch/riscv/include/asm/fpu.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 SiFive + */ + +#ifndef _ASM_RISCV_FPU_H +#define _ASM_RISCV_FPU_H + +#include + +#define kernel_fpu_available() has_fpu() + +void kernel_fpu_begin(void); +void kernel_fpu_end(void); + +#endif /* ! _ASM_RISCV_FPU_H */ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 95cf25d484052..a90f90eb05787 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -61,6 +61,7 @@ obj-$(CONFIG_MMU) += vdso.o vdso/ obj-$(CONFIG_RISCV_M_MODE) += traps_misaligned.o obj-$(CONFIG_FPU) += fpu.o +obj-$(CONFIG_FPU) += kernel_mode_fpu.o obj-$(CONFIG_RISCV_ISA_V) += vector.o obj-$(CONFIG_SMP) += smpboot.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/riscv/kernel/kernel_mode_fpu.c b/arch/riscv/kernel/kernel_mode_fpu.c new file mode 100644 index 0000000000000..0ac8348876c49 --- /dev/null +++ b/arch/riscv/kernel/kernel_mode_fpu.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 SiFive + */ + +#include +#include + +#include +#include +#include +#include + +void kernel_fpu_begin(void) +{ + preempt_disable(); + fstate_save(current, task_pt_regs(current)); + csr_set(CSR_SSTATUS, SR_FS); +} +EXPORT_SYMBOL_GPL(kernel_fpu_begin); + +void kernel_fpu_end(void) +{ + csr_clear(CSR_SSTATUS, SR_FS); + fstate_restore(current, task_pt_regs(current)); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(kernel_fpu_end); From 495327798b580f19636371a9ef3eb0533c5944c8 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 21 Sep 2023 16:15:12 +0200 Subject: [PATCH 5/9] 
drm/amd/display: Remove migrate_en/dis from dc_fpu_begin(). This is a revert of the commit mentioned below. While it is not wrong, as in the kernel will explode, having migrate_disable() here is a complete waste of resources. Additionally, the commit message is plain wrong and the review tag does not make it any better. The migrate_disable() interface has a fat comment describing it and it includes the word "undesired" in the headline which should tickle people to read it before using it. Initially I assumed it is worded too harshly but now I beg to differ. The reviewer of the original commit, even without understanding what migrate_disable() does, should ask the following: - migrate_disable() is added only to the CONFIG_X86 block and it claims to protect fpu_recursion_depth. Why are the other architectures excluded? - migrate_disable() is added after fpu_recursion_depth was modified. Shouldn't it be added before the modification or referencing takes place? Moving on. Disabling preemption DOES prevent CPU migration. A task that cannot be pushed away from the CPU by the scheduler (due to disabled preemption) cannot be pushed or migrated to another CPU. Disabling migration DOES NOT ensure consistency of per-CPU variables. It only ensures that the task acts always on the same per-CPU variable. The task remains preemptible meaning multiple tasks can access the same per-CPU variable. This in turn leads to inconsistency for the statement *pcpu += 1; with two tasks on one CPU and a preemption point during the RMW operation: Task A Task B read pcpu to reg # 0 inc reg # 0 -> 1 read pcpu to reg # 0 inc reg # 0 -> 1 write reg to pcpu # 1 write reg to pcpu # 1 At the end pcpu reads 1 but should read 2 instead. Boom. get_cpu_ptr() already contains a preempt_disable() statement. That means that the per-CPU variable can only be referenced by a single task which is currently running. The only inconsistency that can occur is if the variable is additionally accessed from an interrupt. 
Remove migrate_disable/enable() from dc_fpu_begin/end(). Cc: Tianci Yin Cc: Aurabindo Pillai Fixes: 0c316556d124 ("drm/amd/display: Disable migration to ensure consistency of per-CPU variable") Acked-by: Harry Wentland Reviewed-by: Rodrigo Siqueira Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Hamza Mahfooz Signed-off-by: Alex Deucher Signed-off-by: Han Gao --- drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c index 172aa10a8800f..86f4c0e046548 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c @@ -91,7 +91,6 @@ void dc_fpu_begin(const char *function_name, const int line) if (*pcpu == 1) { #if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) - migrate_disable(); kernel_fpu_begin(); #elif defined(CONFIG_PPC64) if (cpu_has_feature(CPU_FTR_VSX_COMP)) { @@ -132,7 +131,6 @@ void dc_fpu_end(const char *function_name, const int line) if (*pcpu <= 0) { #if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) kernel_fpu_end(); - migrate_enable(); #elif defined(CONFIG_PPC64) if (cpu_has_feature(CPU_FTR_VSX_COMP)) { disable_kernel_vsx(); From f0a25e44bd8eaa575c34e113ceff748ad1c45456 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 21 Sep 2023 16:15:13 +0200 Subject: [PATCH 6/9] drm/amd/display: Simplify the per-CPU usage. The fpu_recursion_depth counter is used to ensure that dc_fpu_begin() can be invoked multiple times while the FPU-disable function itself is only invoked once. Also the counterpart (dc_fpu_end()) is balanced properly. Instead of using the get_cpu_ptr() dance around the inc it is simpler to increment the per-CPU variable directly. Also the per-CPU variable has to be incremented and decremented on the same CPU. This is ensured by the inner-part which disables preemption. 
This is kind of not obvious, works and the preempt-counter is touched a few times for no reason. Disable preemption before incrementing fpu_recursion_depth for the first time. Keep preemption disabled until dc_fpu_end() where the counter is decremented making it obvious that the preemption has to stay disabled while the counter is non-zero. Use simple inc/dec functions. Remove the nested preempt_disable/enable functions which are now not needed. Acked-by: Harry Wentland Reviewed-by: Rodrigo Siqueira Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Hamza Mahfooz Signed-off-by: Alex Deucher Signed-off-by: Han Gao --- .../gpu/drm/amd/display/amdgpu_dm/dc_fpu.c | 50 ++++++++----------- 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c index 86f4c0e046548..8bd5926b47e06 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c @@ -60,11 +60,9 @@ static DEFINE_PER_CPU(int, fpu_recursion_depth); */ inline void dc_assert_fp_enabled(void) { - int *pcpu, depth = 0; + int depth; - pcpu = get_cpu_ptr(&fpu_recursion_depth); - depth = *pcpu; - put_cpu_ptr(&fpu_recursion_depth); + depth = __this_cpu_read(fpu_recursion_depth); ASSERT(depth >= 1); } @@ -84,32 +82,27 @@ inline void dc_assert_fp_enabled(void) */ void dc_fpu_begin(const char *function_name, const int line) { - int *pcpu; + int depth; - pcpu = get_cpu_ptr(&fpu_recursion_depth); - *pcpu += 1; + preempt_disable(); + depth = __this_cpu_inc_return(fpu_recursion_depth); - if (*pcpu == 1) { + if (depth == 1) { #if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) kernel_fpu_begin(); #elif defined(CONFIG_PPC64) - if (cpu_has_feature(CPU_FTR_VSX_COMP)) { - preempt_disable(); + if (cpu_has_feature(CPU_FTR_VSX_COMP)) enable_kernel_vsx(); - } else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP)) { - preempt_disable(); + else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP)) 
enable_kernel_altivec(); - } else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE)) { - preempt_disable(); + else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE)) enable_kernel_fp(); - } #elif defined(CONFIG_ARM64) kernel_neon_begin(); #endif } - TRACE_DCN_FPU(true, function_name, line, *pcpu); - put_cpu_ptr(&fpu_recursion_depth); + TRACE_DCN_FPU(true, function_name, line, depth); } /** @@ -124,29 +117,26 @@ void dc_fpu_begin(const char *function_name, const int line) */ void dc_fpu_end(const char *function_name, const int line) { - int *pcpu; + int depth; - pcpu = get_cpu_ptr(&fpu_recursion_depth); - *pcpu -= 1; - if (*pcpu <= 0) { + depth = __this_cpu_dec_return(fpu_recursion_depth); + if (depth == 0) { #if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) kernel_fpu_end(); #elif defined(CONFIG_PPC64) - if (cpu_has_feature(CPU_FTR_VSX_COMP)) { + if (cpu_has_feature(CPU_FTR_VSX_COMP)) disable_kernel_vsx(); - preempt_enable(); - } else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP)) { + else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP)) disable_kernel_altivec(); - preempt_enable(); - } else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE)) { + else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE)) disable_kernel_fp(); - preempt_enable(); - } #elif defined(CONFIG_ARM64) kernel_neon_end(); #endif + } else { + WARN_ON_ONCE(depth < 0); } - TRACE_DCN_FPU(false, function_name, line, *pcpu); - put_cpu_ptr(&fpu_recursion_depth); + TRACE_DCN_FPU(false, function_name, line, depth); + preempt_enable(); } From a0801aa15d2ebd162593d0ad7444708ad5fdf12d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 21 Sep 2023 16:15:14 +0200 Subject: [PATCH 7/9] drm/amd/display: Add a warning if the FPU is used outside from task context. Add a warning if the FPU is used from any context other than task context. This is only precaution since the code is not able to be used from softirq while the API allows it on x86 for instance. 
Acked-by: Harry Wentland Reviewed-by: Rodrigo Siqueira Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Hamza Mahfooz Signed-off-by: Alex Deucher Signed-off-by: Han Gao --- drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c index 8bd5926b47e06..4ae4720535a56 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c @@ -84,6 +84,7 @@ void dc_fpu_begin(const char *function_name, const int line) { int depth; + WARN_ON_ONCE(!in_task()); preempt_disable(); depth = __this_cpu_inc_return(fpu_recursion_depth); From a5d6b5b23bf29644b2b2f66da87041a649b5d44f Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Mar 2024 00:18:27 -0700 Subject: [PATCH 8/9] drm/amd/display: only use hard-float, not altivec on powerpc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The compiler flags enable altivec, but that is not required; hard-float is sufficient for the code to build and function. Drop altivec from the compiler flags and adjust the enable/disable code to only enable FPU use. 
Link: https://lkml.kernel.org/r/20240329072441.591471-13-samuel.holland@sifive.com Signed-off-by: Michael Ellerman Signed-off-by: Samuel Holland Acked-by: Alex Deucher Acked-by: Christian König Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Dave Hansen Cc: Huacai Chen Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Russell King Cc: Thomas Gleixner Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Han Gao --- drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c | 12 ++---------- drivers/gpu/drm/amd/display/dc/dml/Makefile | 2 +- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c index 4ae4720535a56..0de16796466b8 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c @@ -92,11 +92,7 @@ void dc_fpu_begin(const char *function_name, const int line) #if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) kernel_fpu_begin(); #elif defined(CONFIG_PPC64) - if (cpu_has_feature(CPU_FTR_VSX_COMP)) - enable_kernel_vsx(); - else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP)) - enable_kernel_altivec(); - else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE)) + if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE)) enable_kernel_fp(); #elif defined(CONFIG_ARM64) kernel_neon_begin(); @@ -125,11 +121,7 @@ void dc_fpu_end(const char *function_name, const int line) #if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) kernel_fpu_end(); #elif defined(CONFIG_PPC64) - if (cpu_has_feature(CPU_FTR_VSX_COMP)) - disable_kernel_vsx(); - else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP)) - disable_kernel_altivec(); - else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE)) + if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE)) disable_kernel_fp(); #elif defined(CONFIG_ARM64) kernel_neon_end(); diff --git 
a/drivers/gpu/drm/amd/display/dc/dml/Makefile b/drivers/gpu/drm/amd/display/dc/dml/Makefile index 0ba9a7997d561..47b9f0154886e 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile @@ -31,7 +31,7 @@ dml_ccflags := $(dml_ccflags-y) -msse endif ifdef CONFIG_PPC64 -dml_ccflags := -mhard-float -maltivec +dml_ccflags := -mhard-float endif ifdef CONFIG_ARM64 From 0f420e57e9d5c032e03606a905bd2f3761e7f38b Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Fri, 29 Mar 2024 00:18:28 -0700 Subject: [PATCH 9/9] drm/amd/display: use ARCH_HAS_KERNEL_FPU_SUPPORT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that all previously-supported architectures select ARCH_HAS_KERNEL_FPU_SUPPORT, this code can depend on that symbol instead of the existing list of architectures. It can also take advantage of the common kernel-mode FPU API and method of adjusting CFLAGS. Link: https://lkml.kernel.org/r/20240329072441.591471-14-samuel.holland@sifive.com Signed-off-by: Samuel Holland Acked-by: Alex Deucher Reviewed-by: Christoph Hellwig Acked-by: Christian König Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: Huacai Chen Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Russell King Cc: Thomas Gleixner Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Han Gao --- drivers/gpu/drm/amd/display/Kconfig | 2 +- .../gpu/drm/amd/display/amdgpu_dm/dc_fpu.c | 27 ++------------ drivers/gpu/drm/amd/display/dc/dml/Makefile | 36 ++----------------- 3 files changed, 5 insertions(+), 60 deletions(-) diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig index 901d1961b7392..5fcd4f778dc3d 100644 --- a/drivers/gpu/drm/amd/display/Kconfig +++ b/drivers/gpu/drm/amd/display/Kconfig @@ -8,7 +8,7 @@ config DRM_AMD_DC depends on BROKEN || !CC_IS_CLANG || 
ARM64 || RISCV || SPARC64 || X86_64 select SND_HDA_COMPONENT if SND_HDA_CORE # !CC_IS_CLANG: https://github.com/ClangBuiltLinux/linux/issues/1752 - select DRM_AMD_DC_FP if (X86 || LOONGARCH || (PPC64 && ALTIVEC) || (ARM64 && KERNEL_MODE_NEON && !CC_IS_CLANG)) + select DRM_AMD_DC_FP if ARCH_HAS_KERNEL_FPU_SUPPORT && (!ARM64 || !CC_IS_CLANG) help Choose this option if you want to use the new display engine support for AMDGPU. This adds required support for Vega and diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c index 0de16796466b8..e46f8ce41d871 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c @@ -26,16 +26,7 @@ #include "dc_trace.h" -#if defined(CONFIG_X86) -#include -#elif defined(CONFIG_PPC64) -#include -#include -#elif defined(CONFIG_ARM64) -#include -#elif defined(CONFIG_LOONGARCH) -#include -#endif +#include /** * DOC: DC FPU manipulation overview @@ -87,16 +78,9 @@ void dc_fpu_begin(const char *function_name, const int line) WARN_ON_ONCE(!in_task()); preempt_disable(); depth = __this_cpu_inc_return(fpu_recursion_depth); - if (depth == 1) { -#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) + BUG_ON(!kernel_fpu_available()); kernel_fpu_begin(); -#elif defined(CONFIG_PPC64) - if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE)) - enable_kernel_fp(); -#elif defined(CONFIG_ARM64) - kernel_neon_begin(); -#endif } TRACE_DCN_FPU(true, function_name, line, depth); @@ -118,14 +102,7 @@ void dc_fpu_end(const char *function_name, const int line) depth = __this_cpu_dec_return(fpu_recursion_depth); if (depth == 0) { -#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) kernel_fpu_end(); -#elif defined(CONFIG_PPC64) - if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE)) - disable_kernel_fp(); -#elif defined(CONFIG_ARM64) - kernel_neon_end(); -#endif } else { WARN_ON_ONCE(depth < 0); } diff --git a/drivers/gpu/drm/amd/display/dc/dml/Makefile 
b/drivers/gpu/drm/amd/display/dc/dml/Makefile index 47b9f0154886e..2d04de698dd6c 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile @@ -25,40 +25,8 @@ # It provides the general basic services required by other DAL # subcomponents. -ifdef CONFIG_X86 -dml_ccflags-$(CONFIG_CC_IS_GCC) := -mhard-float -dml_ccflags := $(dml_ccflags-y) -msse -endif - -ifdef CONFIG_PPC64 -dml_ccflags := -mhard-float -endif - -ifdef CONFIG_ARM64 -dml_rcflags := -mgeneral-regs-only -endif - -ifdef CONFIG_LOONGARCH -dml_ccflags := -mfpu=64 -dml_rcflags := -msoft-float -endif - -ifdef CONFIG_CC_IS_GCC -ifneq ($(call gcc-min-version, 70100),y) -IS_OLD_GCC = 1 -endif -endif - -ifdef CONFIG_X86 -ifdef IS_OLD_GCC -# Stack alignment mismatch, proceed with caution. -# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3 -# (8B stack alignment). -dml_ccflags += -mpreferred-stack-boundary=4 -else -dml_ccflags += -msse2 -endif -endif +dml_ccflags := $(CC_FLAGS_FPU) +dml_rcflags := $(CC_FLAGS_NO_FPU) ifneq ($(CONFIG_FRAME_WARN),0) ifeq ($(filter y,$(CONFIG_KASAN)$(CONFIG_KCSAN)),y)