linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] LoongArch: module: Optimize module load time by optimizing PLT/GOT counting
@ 2025-08-14  2:28 Kanglong Wang
  2025-08-17  3:42 ` Huacai Chen
  0 siblings, 1 reply; 2+ messages in thread
From: Kanglong Wang @ 2025-08-14  2:28 UTC (permalink / raw)
  To: Huacai Chen; +Cc: loongarch, linux-kernel

When CONFIG_KASAN, CONFIG_PREEMPT_VOLUNTARY_BUILD and
CONFIG_PREEMPT_VOLUNTARY are all enabled, a soft lockup (RCU stall)
occurs. The relevant logs are as follows:

rcu: INFO: rcu_sched self-detected stall on CPU
...
Call Trace:
[<900000000024f9e4>] show_stack+0x5c/0x180
[<90000000002482f4>] dump_stack_lvl+0x94/0xbc
[<9000000000224544>] rcu_dump_cpu_stacks+0x1fc/0x280
[<900000000037ac80>] rcu_sched_clock_irq+0x720/0xf88
[<9000000000396c34>] update_process_times+0xb4/0x150
[<90000000003b2474>] tick_nohz_handler+0xf4/0x250
[<9000000000397e28>] __hrtimer_run_queues+0x1d0/0x428
[<9000000000399b2c>] hrtimer_interrupt+0x214/0x538
[<9000000000253634>] constant_timer_interrupt+0x64/0x80
[<9000000000349938>] __handle_irq_event_percpu+0x78/0x1a0
[<9000000000349a78>] handle_irq_event_percpu+0x18/0x88
[<9000000000354c00>] handle_percpu_irq+0x90/0xf0
[<9000000000348c74>] handle_irq_desc+0x94/0xb8
[<9000000001012b28>] handle_cpu_irq+0x68/0xa0
[<9000000001def8c0>] handle_loongarch_irq+0x30/0x48
[<9000000001def958>] do_vint+0x80/0xd0
[<9000000000268a0c>] kasan_mem_to_shadow.part.0+0x2c/0x2a0
[<90000000006344f4>] __asan_load8+0x4c/0x120
[<900000000025c0d0>] module_frob_arch_sections+0x5c8/0x6b8
[<90000000003895f0>] load_module+0x9e0/0x2958
[<900000000038b770>] __do_sys_init_module+0x208/0x2d0
[<9000000001df0c34>] do_syscall+0x94/0x190
[<900000000024d6fc>] handle_syscall+0xbc/0x158

Analysis shows that loading the amdgpu module is slow enough to occupy
the CPU for a long time, which triggers the soft lockup.

When loading a module, module_frob_arch_sections() tries to figure out
the number of PLTs/GOTs that'll be needed to handle all the RELAs. It
calls count_max_entries(), which looks for duplicates in unsorted data,
so the counting algorithm has O(n^2) complexity.

To make this faster, sort the relocation list by info and addend. That
way, checking for a duplicate relocation only requires comparing with
the previous entry. This reduces the complexity of the algorithm to
O(n log n), as done in commit d4e0340919fb ("arm64/module: Optimize
module load time by optimizing PLT counting"). This gives a significant
reduction in module load time for modules with a large number of
relocations.

After applying this patch, the soft deadlock problem has been solved,
and the kernel starts normally without "Call Trace".

Using the default configuration to test some modules, the results are as
follows:

Module              Size
ip_tables           36K
fat                 143K
radeon              2.5MB
amdgpu              16MB

Without this patch:
Module              Module load time (ms)	Count(PLTs/GOTs)
ip_tables           18				59/6
fat                 0				162/14
radeon              54				1221/84
amdgpu              1411			4525/1098

With this patch:
Module              Module load time (ms)	Count(PLTs/GOTs)
ip_tables           18				59/6
fat                 0				162/14
radeon              22				1221/84
amdgpu              45				4525/1098

Fixes: fcdfe9d22bed ("LoongArch: Add ELF and module support")
Signed-off-by: Kanglong Wang <wangkanglong@loongson.cn>
---
 arch/loongarch/kernel/module-sections.c | 34 ++++++++++++-------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/arch/loongarch/kernel/module-sections.c b/arch/loongarch/kernel/module-sections.c
index e2f30ff9afde..3b22b3c1af28 100644
--- a/arch/loongarch/kernel/module-sections.c
+++ b/arch/loongarch/kernel/module-sections.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/moduleloader.h>
 #include <linux/ftrace.h>
+#include <linux/sort.h>
 
 Elf_Addr module_emit_got_entry(struct module *mod, Elf_Shdr *sechdrs, Elf_Addr val)
 {
@@ -61,39 +62,36 @@ Elf_Addr module_emit_plt_entry(struct module *mod, Elf_Shdr *sechdrs, Elf_Addr v
 	return (Elf_Addr)&plt[nr];
 }
 
-static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
-{
-	return x->r_info == y->r_info && x->r_addend == y->r_addend;
-}
+#define cmp_3way(a, b)  ((a) < (b) ? -1 : (a) > (b))
 
-static bool duplicate_rela(const Elf_Rela *rela, int idx)
+static int compare_rela(const void *x, const void *y)
 {
-	int i;
+	int ret;
+	const Elf_Rela *rela_x = x, *rela_y = y;
 
-	for (i = 0; i < idx; i++) {
-		if (is_rela_equal(&rela[i], &rela[idx]))
-			return true;
-	}
+	ret = cmp_3way(rela_x->r_info, rela_y->r_info);
+	if (ret == 0)
+		ret = cmp_3way(rela_x->r_addend, rela_y->r_addend);
 
-	return false;
+	return ret;
 }
 
 static void count_max_entries(Elf_Rela *relas, int num,
 			      unsigned int *plts, unsigned int *gots)
 {
-	unsigned int i, type;
+	unsigned int i;
 
+	sort(relas, num, sizeof(Elf_Rela), compare_rela, NULL);
 	for (i = 0; i < num; i++) {
-		type = ELF_R_TYPE(relas[i].r_info);
-		switch (type) {
+		if (i > 0 && compare_rela(&relas[i-1], &relas[i]) == 0)
+			continue;
+		switch (ELF_R_TYPE(relas[i].r_info)) {
 		case R_LARCH_SOP_PUSH_PLT_PCREL:
 		case R_LARCH_B26:
-			if (!duplicate_rela(relas, i))
-				(*plts)++;
+			(*plts)++;
 			break;
 		case R_LARCH_GOT_PC_HI20:
-			if (!duplicate_rela(relas, i))
-				(*gots)++;
+			(*gots)++;
 			break;
 		default:
 			break; /* Do nothing. */
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH] LoongArch: module: Optimize module load time by optimizing PLT/GOT counting
  2025-08-14  2:28 [PATCH] LoongArch: module: Optimize module load time by optimizing PLT/GOT counting Kanglong Wang
@ 2025-08-17  3:42 ` Huacai Chen
  0 siblings, 0 replies; 2+ messages in thread
From: Huacai Chen @ 2025-08-17  3:42 UTC (permalink / raw)
  To: Kanglong Wang; +Cc: loongarch, linux-kernel

Applied, thanks.

Huacai

On Thu, Aug 14, 2025 at 10:28 AM Kanglong Wang <wangkanglong@loongson.cn> wrote:
>
> When CONFIG_KASAN, CONFIG_PREEMPT_VOLUNTARY_BUILD and
> CONFIG_PREEMPT_VOLUNTARY are all enabled, a soft lockup (RCU stall)
> occurs. The relevant logs are as follows:
>
> rcu: INFO: rcu_sched self-detected stall on CPU
> ...
> Call Trace:
> [<900000000024f9e4>] show_stack+0x5c/0x180
> [<90000000002482f4>] dump_stack_lvl+0x94/0xbc
> [<9000000000224544>] rcu_dump_cpu_stacks+0x1fc/0x280
> [<900000000037ac80>] rcu_sched_clock_irq+0x720/0xf88
> [<9000000000396c34>] update_process_times+0xb4/0x150
> [<90000000003b2474>] tick_nohz_handler+0xf4/0x250
> [<9000000000397e28>] __hrtimer_run_queues+0x1d0/0x428
> [<9000000000399b2c>] hrtimer_interrupt+0x214/0x538
> [<9000000000253634>] constant_timer_interrupt+0x64/0x80
> [<9000000000349938>] __handle_irq_event_percpu+0x78/0x1a0
> [<9000000000349a78>] handle_irq_event_percpu+0x18/0x88
> [<9000000000354c00>] handle_percpu_irq+0x90/0xf0
> [<9000000000348c74>] handle_irq_desc+0x94/0xb8
> [<9000000001012b28>] handle_cpu_irq+0x68/0xa0
> [<9000000001def8c0>] handle_loongarch_irq+0x30/0x48
> [<9000000001def958>] do_vint+0x80/0xd0
> [<9000000000268a0c>] kasan_mem_to_shadow.part.0+0x2c/0x2a0
> [<90000000006344f4>] __asan_load8+0x4c/0x120
> [<900000000025c0d0>] module_frob_arch_sections+0x5c8/0x6b8
> [<90000000003895f0>] load_module+0x9e0/0x2958
> [<900000000038b770>] __do_sys_init_module+0x208/0x2d0
> [<9000000001df0c34>] do_syscall+0x94/0x190
> [<900000000024d6fc>] handle_syscall+0xbc/0x158
>
> Analysis shows that loading the amdgpu module is slow enough to occupy
> the CPU for a long time, which triggers the soft lockup.
>
> When loading a module, module_frob_arch_sections() tries to figure out
> the number of PLTs/GOTs that'll be needed to handle all the RELAs. It
> calls count_max_entries(), which looks for duplicates in unsorted data,
> so the counting algorithm has O(n^2) complexity.
>
> To make this faster, sort the relocation list by info and addend. That
> way, checking for a duplicate relocation only requires comparing with
> the previous entry. This reduces the complexity of the algorithm to
> O(n log n), as done in commit d4e0340919fb ("arm64/module: Optimize
> module load time by optimizing PLT counting"). This gives a significant
> reduction in module load time for modules with a large number of
> relocations.
>
> After applying this patch, the soft deadlock problem has been solved,
> and the kernel starts normally without "Call Trace".
>
> Using the default configuration to test some modules, the results are as
> follows:
>
> Module              Size
> ip_tables           36K
> fat                 143K
> radeon              2.5MB
> amdgpu              16MB
>
> Without this patch:
> Module              Module load time (ms)       Count(PLTs/GOTs)
> ip_tables           18                          59/6
> fat                 0                           162/14
> radeon              54                          1221/84
> amdgpu              1411                        4525/1098
>
> With this patch:
> Module              Module load time (ms)       Count(PLTs/GOTs)
> ip_tables           18                          59/6
> fat                 0                           162/14
> radeon              22                          1221/84
> amdgpu              45                          4525/1098
>
> Fixes: fcdfe9d22bed ("LoongArch: Add ELF and module support")
> Signed-off-by: Kanglong Wang <wangkanglong@loongson.cn>
> ---
>  arch/loongarch/kernel/module-sections.c | 34 ++++++++++++-------------
>  1 file changed, 16 insertions(+), 18 deletions(-)
>
> diff --git a/arch/loongarch/kernel/module-sections.c b/arch/loongarch/kernel/module-sections.c
> index e2f30ff9afde..3b22b3c1af28 100644
> --- a/arch/loongarch/kernel/module-sections.c
> +++ b/arch/loongarch/kernel/module-sections.c
> @@ -8,6 +8,7 @@
>  #include <linux/module.h>
>  #include <linux/moduleloader.h>
>  #include <linux/ftrace.h>
> +#include <linux/sort.h>
>
>  Elf_Addr module_emit_got_entry(struct module *mod, Elf_Shdr *sechdrs, Elf_Addr val)
>  {
> @@ -61,39 +62,36 @@ Elf_Addr module_emit_plt_entry(struct module *mod, Elf_Shdr *sechdrs, Elf_Addr v
>         return (Elf_Addr)&plt[nr];
>  }
>
> -static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
> -{
> -       return x->r_info == y->r_info && x->r_addend == y->r_addend;
> -}
> +#define cmp_3way(a, b)  ((a) < (b) ? -1 : (a) > (b))
>
> -static bool duplicate_rela(const Elf_Rela *rela, int idx)
> +static int compare_rela(const void *x, const void *y)
>  {
> -       int i;
> +       int ret;
> +       const Elf_Rela *rela_x = x, *rela_y = y;
>
> -       for (i = 0; i < idx; i++) {
> -               if (is_rela_equal(&rela[i], &rela[idx]))
> -                       return true;
> -       }
> +       ret = cmp_3way(rela_x->r_info, rela_y->r_info);
> +       if (ret == 0)
> +               ret = cmp_3way(rela_x->r_addend, rela_y->r_addend);
>
> -       return false;
> +       return ret;
>  }
>
>  static void count_max_entries(Elf_Rela *relas, int num,
>                               unsigned int *plts, unsigned int *gots)
>  {
> -       unsigned int i, type;
> +       unsigned int i;
>
> +       sort(relas, num, sizeof(Elf_Rela), compare_rela, NULL);
>         for (i = 0; i < num; i++) {
> -               type = ELF_R_TYPE(relas[i].r_info);
> -               switch (type) {
> +               if (i > 0 && compare_rela(&relas[i-1], &relas[i]) == 0)
> +                       continue;
> +               switch (ELF_R_TYPE(relas[i].r_info)) {
>                 case R_LARCH_SOP_PUSH_PLT_PCREL:
>                 case R_LARCH_B26:
> -                       if (!duplicate_rela(relas, i))
> -                               (*plts)++;
> +                       (*plts)++;
>                         break;
>                 case R_LARCH_GOT_PC_HI20:
> -                       if (!duplicate_rela(relas, i))
> -                               (*gots)++;
> +                       (*gots)++;
>                         break;
>                 default:
>                         break; /* Do nothing. */
> --
> 2.20.1
>

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2025-08-17  3:43 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-08-14  2:28 [PATCH] LoongArch: module: Optimize module load time by optimizing PLT/GOT counting Kanglong Wang
2025-08-17  3:42 ` Huacai Chen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).