[PATCH v3] x86/alternatives: remove false sharing in poke_int3

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v3] x86/alternatives: remove false sharing in poke_int3_handler()
@ 2025-03-25  4:33 Eric Dumazet
  2025-03-25  7:14 ` Ingo Molnar
  2025-03-25  7:25 ` [tip: x86/alternatives] x86/alternatives: Improve code-patching scalability by removing " tip-bot2 for Eric Dumazet
  0 siblings, 2 replies; 4+ messages in thread
From: Eric Dumazet @ 2025-03-25  4:33 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H . Peter Anvin, Peter Zijlstra, Steven Rostedt
  Cc: linux-kernel, Alexei Starovoitov, Daniel Borkmann,
	Masami Hiramatsu, x86, bpf, Eric Dumazet, Greg Thelen,
	Stephane Eranian, Eric Dumazet

eBPF programs can be run 50,000,000 times per second on busy servers.

Whenever /proc/sys/kernel/bpf_stats_enabled is turned off,
hundreds of calls sites are patched from text_poke_bp_batch()
and we see a huge loss of performance due to false sharing
on bp_desc.refs lasting up to three seconds.

   51.30%  server_bin       [kernel.kallsyms]           [k] poke_int3_handler
            |
            |--46.45%--poke_int3_handler
            |          exc_int3
            |          asm_exc_int3
            |          |
            |          |--24.26%--cls_bpf_classify
            |          |          tcf_classify
            |          |          __dev_queue_xmit
            |          |          ip6_finish_output2
            |          |          ip6_output
            |          |          ip6_xmit
            |          |          inet6_csk_xmit
            |          |          __tcp_transmit_skb

Fix this by replacing bp_desc.refs with a per-cpu bp_refs.

Before the patch, on a host with 240 cores (480 threads):

sysctl -wq kernel.bpf_stats_enabled=0

text_poke_bp_batch(nr_entries=164) : Took 2655300 usec

bpftool prog | grep run_time_ns
...
105: sched_cls  name hn_egress  tag 699fc5eea64144e3  gpl run_time_ns
3009063719 run_cnt 82757845 : average cost is 36 nsec per call

After this patch:

sysctl -wq kernel.bpf_stats_enabled=0

text_poke_bp_batch(nr_entries=164) : Took 702 usec

$ bpftool prog | grep run_time_ns
...
105: sched_cls  name hn_egress  tag 699fc5eea64144e3  gpl run_time_ns
1928223019 run_cnt 67682728 : average cost is 28 nsec per call

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 arch/x86/kernel/alternative.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c71b575bf229..5d364e990055 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2137,28 +2137,29 @@ struct text_poke_loc {
 struct bp_patching_desc {
 	struct text_poke_loc *vec;
 	int nr_entries;
-	atomic_t refs;
 };
 
+static DEFINE_PER_CPU(atomic_t, bp_refs);
+
 static struct bp_patching_desc bp_desc;
 
 static __always_inline
 struct bp_patching_desc *try_get_desc(void)
 {
-	struct bp_patching_desc *desc = &bp_desc;
+	atomic_t *refs = this_cpu_ptr(&bp_refs);
 
-	if (!raw_atomic_inc_not_zero(&desc->refs))
+	if (!raw_atomic_inc_not_zero(refs))
 		return NULL;
 
-	return desc;
+	return &bp_desc;
 }
 
 static __always_inline void put_desc(void)
 {
-	struct bp_patching_desc *desc = &bp_desc;
+	atomic_t *refs = this_cpu_ptr(&bp_refs);
 
 	smp_mb__before_atomic();
-	raw_atomic_dec(&desc->refs);
+	raw_atomic_dec(refs);
 }
 
 static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
@@ -2191,9 +2192,9 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
 	 * Having observed our INT3 instruction, we now must observe
 	 * bp_desc with non-zero refcount:
 	 *
-	 *	bp_desc.refs = 1		INT3
-	 *	WMB				RMB
-	 *	write INT3			if (bp_desc.refs != 0)
+	 *	bp_refs = 1		INT3
+	 *	WMB			RMB
+	 *	write INT3		if (bp_refs != 0)
 	 */
 	smp_rmb();
 
@@ -2299,7 +2300,8 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 	 * Corresponds to the implicit memory barrier in try_get_desc() to
 	 * ensure reading a non-zero refcount provides up to date bp_desc data.
 	 */
-	atomic_set_release(&bp_desc.refs, 1);
+	for_each_possible_cpu(i)
+		atomic_set_release(per_cpu_ptr(&bp_refs, i), 1);
 
 	/*
 	 * Function tracing can enable thousands of places that need to be
@@ -2413,8 +2415,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 	/*
 	 * Remove and wait for refs to be zero.
 	 */
-	if (!atomic_dec_and_test(&bp_desc.refs))
-		atomic_cond_read_acquire(&bp_desc.refs, !VAL);
+	for_each_possible_cpu(i) {
+		atomic_t *refs = per_cpu_ptr(&bp_refs, i);
+
+		if (unlikely(!atomic_dec_and_test(refs)))
+			atomic_cond_read_acquire(refs, !VAL);
+	}
 }
 
 static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
-- 
2.49.0.395.g12beb8f557-goog


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH v3] x86/alternatives: remove false sharing in poke_int3_handler()
  2025-03-25  4:33 [PATCH v3] x86/alternatives: remove false sharing in poke_int3_handler() Eric Dumazet
@ 2025-03-25  7:14 ` Ingo Molnar
  2025-03-25  7:45   ` Eric Dumazet
  2025-03-25  7:25 ` [tip: x86/alternatives] x86/alternatives: Improve code-patching scalability by removing " tip-bot2 for Eric Dumazet
  1 sibling, 1 reply; 4+ messages in thread
From: Ingo Molnar @ 2025-03-25  7:14 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H . Peter Anvin, Peter Zijlstra, Steven Rostedt, linux-kernel,
	Alexei Starovoitov, Daniel Borkmann, Masami Hiramatsu, x86, bpf,
	Eric Dumazet, Greg Thelen, Stephane Eranian


* Eric Dumazet <edumazet@google.com> wrote:

> eBPF programs can be run 50,000,000 times per second on busy servers.
> 
> Whenever /proc/sys/kernel/bpf_stats_enabled is turned off,
> hundreds of calls sites are patched from text_poke_bp_batch()
> and we see a huge loss of performance due to false sharing
> on bp_desc.refs lasting up to three seconds.
> 
>    51.30%  server_bin       [kernel.kallsyms]           [k] poke_int3_handler
>             |
>             |--46.45%--poke_int3_handler
>             |          exc_int3
>             |          asm_exc_int3
>             |          |
>             |          |--24.26%--cls_bpf_classify
>             |          |          tcf_classify
>             |          |          __dev_queue_xmit
>             |          |          ip6_finish_output2
>             |          |          ip6_output
>             |          |          ip6_xmit
>             |          |          inet6_csk_xmit
>             |          |          __tcp_transmit_skb
> 
> Fix this by replacing bp_desc.refs with a per-cpu bp_refs.
> 
> Before the patch, on a host with 240 cores (480 threads):
> 
> sysctl -wq kernel.bpf_stats_enabled=0
> 
> text_poke_bp_batch(nr_entries=164) : Took 2655300 usec
> 
> bpftool prog | grep run_time_ns
> ...
> 105: sched_cls  name hn_egress  tag 699fc5eea64144e3  gpl run_time_ns
> 3009063719 run_cnt 82757845 : average cost is 36 nsec per call
> 
> After this patch:
> 
> sysctl -wq kernel.bpf_stats_enabled=0
> 
> text_poke_bp_batch(nr_entries=164) : Took 702 usec
> 
> $ bpftool prog | grep run_time_ns
> ...
> 105: sched_cls  name hn_egress  tag 699fc5eea64144e3  gpl run_time_ns
> 1928223019 run_cnt 67682728 : average cost is 28 nsec per call
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  arch/x86/kernel/alternative.c | 30 ++++++++++++++++++------------
>  1 file changed, 18 insertions(+), 12 deletions(-)

Thanks for the updates. I've further improved the changelog (see 
attached below), and have tentatively applied it to 
tip:x86/alternatives.

Thanks,

	Ingo

==============================>
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 25 Mar 2025 04:33:16 +0000
Subject: [PATCH] x86/alternatives: Improve code-patching scalability by removing false sharing in poke_int3_handler()

eBPF programs can be run 50,000,000 times per second on busy servers.

Whenever /proc/sys/kernel/bpf_stats_enabled is turned off,
hundreds of calls sites are patched from text_poke_bp_batch()
and we see a huge loss of performance due to false sharing
on bp_desc.refs lasting up to three seconds.

   51.30%  server_bin       [kernel.kallsyms]           [k] poke_int3_handler
            |
            |--46.45%--poke_int3_handler
            |          exc_int3
            |          asm_exc_int3
            |          |
            |          |--24.26%--cls_bpf_classify
            |          |          tcf_classify
            |          |          __dev_queue_xmit
            |          |          ip6_finish_output2
            |          |          ip6_output
            |          |          ip6_xmit
            |          |          inet6_csk_xmit
            |          |          __tcp_transmit_skb

Fix this by replacing bp_desc.refs with a per-cpu bp_refs.

Before the patch, on a host with 240 cores (480 threads):

  $ sysctl -wq kernel.bpf_stats_enabled=0

  text_poke_bp_batch(nr_entries=164) : Took 2655300 usec

  $ bpftool prog | grep run_time_ns
  ...
  105: sched_cls  name hn_egress  tag 699fc5eea64144e3  gpl run_time_ns
  3009063719 run_cnt 82757845 : average cost is 36 nsec per call

After this patch:

  $ sysctl -wq kernel.bpf_stats_enabled=0

  text_poke_bp_batch(nr_entries=164) : Took 702 usec

  $ bpftool prog | grep run_time_ns
  ...
  105: sched_cls  name hn_egress  tag 699fc5eea64144e3  gpl run_time_ns
  1928223019 run_cnt 67682728 : average cost is 28 nsec per call

Ie. text-patching performance improved 3700x: from 2.65 seconds
to 0.0007 seconds.

Since the atomic_cond_read_acquire(refs, !VAL) spin-loop was not triggered
even once in my tests, add an unlikely() annotation, because this appears
to be the common case.

[ mingo: Improved the changelog some more. ]

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20250325043316.874518-1-edumazet@google.com


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v3] x86/alternatives: remove false sharing in poke_int3_handler()
  2025-03-25  7:14 ` Ingo Molnar
@ 2025-03-25  7:45   ` Eric Dumazet
  0 siblings, 0 replies; 4+ messages in thread
From: Eric Dumazet @ 2025-03-25  7:45 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H . Peter Anvin, Peter Zijlstra, Steven Rostedt, linux-kernel,
	Alexei Starovoitov, Daniel Borkmann, Masami Hiramatsu, x86, bpf,
	Eric Dumazet, Greg Thelen, Stephane Eranian

On Tue, Mar 25, 2025 at 8:14 AM Ingo Molnar <mingo@kernel.org> wrote:

> Thanks for the updates. I've further improved the changelog (see
> attached below), and have tentatively applied it to
> tip:x86/alternatives.
>

Thanks Ingo !

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [tip: x86/alternatives] x86/alternatives: Improve code-patching scalability by removing false sharing in poke_int3_handler()
  2025-03-25  4:33 [PATCH v3] x86/alternatives: remove false sharing in poke_int3_handler() Eric Dumazet
  2025-03-25  7:14 ` Ingo Molnar
@ 2025-03-25  7:25 ` tip-bot2 for Eric Dumazet
  1 sibling, 0 replies; 4+ messages in thread
From: tip-bot2 for Eric Dumazet @ 2025-03-25  7:25 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Eric Dumazet, Ingo Molnar, Brian Gerst, Juergen Gross,
	H. Peter Anvin, Linus Torvalds, Kees Cook, Peter Zijlstra,
	Josh Poimboeuf, x86, linux-kernel

The following commit has been merged into the x86/alternatives branch of tip:

Commit-ID:     41e4ceece5913b867604a28298612f397072e1b4
Gitweb:        https://git.kernel.org/tip/41e4ceece5913b867604a28298612f397072e1b4
Author:        Eric Dumazet <edumazet@google.com>
AuthorDate:    Tue, 25 Mar 2025 04:33:16 
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 25 Mar 2025 08:10:30 +01:00

x86/alternatives: Improve code-patching scalability by removing false sharing in poke_int3_handler()

eBPF programs can be run 50,000,000 times per second on busy servers.

Whenever /proc/sys/kernel/bpf_stats_enabled is turned off,
hundreds of calls sites are patched from text_poke_bp_batch()
and we see a huge loss of performance due to false sharing
on bp_desc.refs lasting up to three seconds.

   51.30%  server_bin       [kernel.kallsyms]           [k] poke_int3_handler
            |
            |--46.45%--poke_int3_handler
            |          exc_int3
            |          asm_exc_int3
            |          |
            |          |--24.26%--cls_bpf_classify
            |          |          tcf_classify
            |          |          __dev_queue_xmit
            |          |          ip6_finish_output2
            |          |          ip6_output
            |          |          ip6_xmit
            |          |          inet6_csk_xmit
            |          |          __tcp_transmit_skb

Fix this by replacing bp_desc.refs with a per-cpu bp_refs.

Before the patch, on a host with 240 cores (480 threads):

  $ sysctl -wq kernel.bpf_stats_enabled=0

  text_poke_bp_batch(nr_entries=164) : Took 2655300 usec

  $ bpftool prog | grep run_time_ns
  ...
  105: sched_cls  name hn_egress  tag 699fc5eea64144e3  gpl run_time_ns
  3009063719 run_cnt 82757845 : average cost is 36 nsec per call

After this patch:

  $ sysctl -wq kernel.bpf_stats_enabled=0

  text_poke_bp_batch(nr_entries=164) : Took 702 usec

  $ bpftool prog | grep run_time_ns
  ...
  105: sched_cls  name hn_egress  tag 699fc5eea64144e3  gpl run_time_ns
  1928223019 run_cnt 67682728 : average cost is 28 nsec per call

Ie. text-patching performance improved 3700x: from 2.65 seconds
to 0.0007 seconds.

Since the atomic_cond_read_acquire(refs, !VAL) spin-loop was not triggered
even once in my tests, add an unlikely() annotation, because this appears
to be the common case.

[ mingo: Improved the changelog some more. ]

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20250325043316.874518-1-edumazet@google.com
---
 arch/x86/kernel/alternative.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index bf82c6f..85089c7 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2474,28 +2474,29 @@ struct text_poke_loc {
 struct bp_patching_desc {
 	struct text_poke_loc *vec;
 	int nr_entries;
-	atomic_t refs;
 };
 
+static DEFINE_PER_CPU(atomic_t, bp_refs);
+
 static struct bp_patching_desc bp_desc;
 
 static __always_inline
 struct bp_patching_desc *try_get_desc(void)
 {
-	struct bp_patching_desc *desc = &bp_desc;
+	atomic_t *refs = this_cpu_ptr(&bp_refs);
 
-	if (!raw_atomic_inc_not_zero(&desc->refs))
+	if (!raw_atomic_inc_not_zero(refs))
 		return NULL;
 
-	return desc;
+	return &bp_desc;
 }
 
 static __always_inline void put_desc(void)
 {
-	struct bp_patching_desc *desc = &bp_desc;
+	atomic_t *refs = this_cpu_ptr(&bp_refs);
 
 	smp_mb__before_atomic();
-	raw_atomic_dec(&desc->refs);
+	raw_atomic_dec(refs);
 }
 
 static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
@@ -2528,9 +2529,9 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
 	 * Having observed our INT3 instruction, we now must observe
 	 * bp_desc with non-zero refcount:
 	 *
-	 *	bp_desc.refs = 1		INT3
-	 *	WMB				RMB
-	 *	write INT3			if (bp_desc.refs != 0)
+	 *	bp_refs = 1		INT3
+	 *	WMB			RMB
+	 *	write INT3		if (bp_refs != 0)
 	 */
 	smp_rmb();
 
@@ -2636,7 +2637,8 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 	 * Corresponds to the implicit memory barrier in try_get_desc() to
 	 * ensure reading a non-zero refcount provides up to date bp_desc data.
 	 */
-	atomic_set_release(&bp_desc.refs, 1);
+	for_each_possible_cpu(i)
+		atomic_set_release(per_cpu_ptr(&bp_refs, i), 1);
 
 	/*
 	 * Function tracing can enable thousands of places that need to be
@@ -2750,8 +2752,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 	/*
 	 * Remove and wait for refs to be zero.
 	 */
-	if (!atomic_dec_and_test(&bp_desc.refs))
-		atomic_cond_read_acquire(&bp_desc.refs, !VAL);
+	for_each_possible_cpu(i) {
+		atomic_t *refs = per_cpu_ptr(&bp_refs, i);
+
+		if (unlikely(!atomic_dec_and_test(refs)))
+			atomic_cond_read_acquire(refs, !VAL);
+	}
 }
 
 static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,

^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2025-03-25  7:45 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-03-25  4:33 [PATCH v3] x86/alternatives: remove false sharing in poke_int3_handler() Eric Dumazet
2025-03-25  7:14 ` Ingo Molnar
2025-03-25  7:45   ` Eric Dumazet
2025-03-25  7:25 ` [tip: x86/alternatives] x86/alternatives: Improve code-patching scalability by removing " tip-bot2 for Eric Dumazet

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.