* [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
@ 2025-03-23 7:25 Eric Dumazet
2025-03-23 21:38 ` Ingo Molnar
0 siblings, 1 reply; 16+ messages in thread
From: Eric Dumazet @ 2025-03-23 7:25 UTC (permalink / raw)
To: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H . Peter Anvin, Peter Zijlstra, Steven Rostedt
Cc: linux-kernel, Alexei Starovoitov, Daniel Borkmann,
Masami Hiramatsu, x86, bpf, Eric Dumazet, Greg Thelen,
Stephane Eranian, Eric Dumazet
eBPF programs can be run 20,000,000+ times per second on busy servers.
Whenever /proc/sys/kernel/bpf_stats_enabled is turned off,
hundreds of call sites are patched from text_poke_bp_batch()
and we see a critical loss of performance due to false sharing
on bp_desc.refs lasting up to three seconds.
51.30% server_bin [kernel.kallsyms] [k] poke_int3_handler
|
|--46.45%--poke_int3_handler
| exc_int3
| asm_exc_int3
| |
| |--24.26%--cls_bpf_classify
| | tcf_classify
| | __dev_queue_xmit
| | ip6_finish_output2
| | ip6_output
| | ip6_xmit
| | inet6_csk_xmit
| | __tcp_transmit_skb
| | |
| | |--9.00%--tcp_v6_do_rcv
| | | tcp_v6_rcv
| | | ip6_protocol_deliver_rcu
| | | ip6_rcv_finish
| | | ipv6_rcv
| | | __netif_receive_skb
| | | process_backlog
| | | __napi_poll
| | | net_rx_action
| | | __softirqentry_text_start
| | | asm_call_sysvec_on_stack
| | | do_softirq_own_stack
Fix this by replacing bp_desc.refs with a per-cpu bp_refs.
Before the patch, on a host with 240 cpus (480 threads):
echo 0 >/proc/sys/kernel/bpf_stats_enabled
text_poke_bp_batch(nr_entries=2)
text_poke_bp_batch+1
text_poke_finish+27
arch_jump_label_transform_apply+22
jump_label_update+98
__static_key_slow_dec_cpuslocked+64
static_key_slow_dec+31
bpf_stats_handler+236
proc_sys_call_handler+396
vfs_write+761
ksys_write+102
do_syscall_64+107
entry_SYSCALL_64_after_hwframe+103
Took 324 usec
text_poke_bp_batch(nr_entries=164)
text_poke_bp_batch+1
text_poke_finish+27
arch_jump_label_transform_apply+22
jump_label_update+98
__static_key_slow_dec_cpuslocked+64
static_key_slow_dec+31
bpf_stats_handler+236
proc_sys_call_handler+396
vfs_write+761
ksys_write+102
do_syscall_64+107
entry_SYSCALL_64_after_hwframe+103
Took 2655300 usec
After this patch:
echo 0 >/proc/sys/kernel/bpf_stats_enabled
text_poke_bp_batch(nr_entries=2)
text_poke_bp_batch+1
text_poke_finish+27
arch_jump_label_transform_apply+22
jump_label_update+98
__static_key_slow_dec_cpuslocked+64
static_key_slow_dec+31
bpf_stats_handler+236
proc_sys_call_handler+396
vfs_write+761
ksys_write+102
do_syscall_64+107
entry_SYSCALL_64_after_hwframe+103
Took 519 usec
text_poke_bp_batch(nr_entries=164)
text_poke_bp_batch+1
text_poke_finish+27
arch_jump_label_transform_apply+22
jump_label_update+98
__static_key_slow_dec_cpuslocked+64
static_key_slow_dec+31
bpf_stats_handler+236
proc_sys_call_handler+396
vfs_write+761
ksys_write+102
do_syscall_64+107
entry_SYSCALL_64_after_hwframe+103
Took 702 usec
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
arch/x86/kernel/alternative.c | 30 ++++++++++++++++++------------
1 file changed, 18 insertions(+), 12 deletions(-)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c71b575bf229..d7afbf822c45 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2137,28 +2137,29 @@ struct text_poke_loc {
struct bp_patching_desc {
struct text_poke_loc *vec;
int nr_entries;
- atomic_t refs;
};
+static DEFINE_PER_CPU(atomic_t, bp_refs);
+
static struct bp_patching_desc bp_desc;
static __always_inline
struct bp_patching_desc *try_get_desc(void)
{
- struct bp_patching_desc *desc = &bp_desc;
+ atomic_t *refs = this_cpu_ptr(&bp_refs);
- if (!raw_atomic_inc_not_zero(&desc->refs))
+ if (!raw_atomic_inc_not_zero(refs))
return NULL;
- return desc;
+ return &bp_desc;
}
static __always_inline void put_desc(void)
{
- struct bp_patching_desc *desc = &bp_desc;
+ atomic_t *refs = this_cpu_ptr(&bp_refs);
smp_mb__before_atomic();
- raw_atomic_dec(&desc->refs);
+ raw_atomic_dec(refs);
}
static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
@@ -2191,9 +2192,9 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
* Having observed our INT3 instruction, we now must observe
* bp_desc with non-zero refcount:
*
- * bp_desc.refs = 1 INT3
- * WMB RMB
- * write INT3 if (bp_desc.refs != 0)
+ * bp_refs = 1 INT3
+ * WMB RMB
+ * write INT3 if (bp_refs != 0)
*/
smp_rmb();
@@ -2299,7 +2300,8 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* Corresponds to the implicit memory barrier in try_get_desc() to
* ensure reading a non-zero refcount provides up to date bp_desc data.
*/
- atomic_set_release(&bp_desc.refs, 1);
+ for_each_possible_cpu(i)
+ atomic_set_release(per_cpu_ptr(&bp_refs, i), 1);
/*
* Function tracing can enable thousands of places that need to be
@@ -2413,8 +2415,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
/*
* Remove and wait for refs to be zero.
*/
- if (!atomic_dec_and_test(&bp_desc.refs))
- atomic_cond_read_acquire(&bp_desc.refs, !VAL);
+ for_each_possible_cpu(i) {
+ atomic_t *refs = per_cpu_ptr(&bp_refs, i);
+
+ if (!atomic_dec_and_test(refs))
+ atomic_cond_read_acquire(refs, !VAL);
+ }
}
static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
--
2.49.0.395.g12beb8f557-goog
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
2025-03-23 7:25 [PATCH] x86/alternatives: remove false sharing in poke_int3_handler() Eric Dumazet
@ 2025-03-23 21:38 ` Ingo Molnar
2025-03-24 3:59 ` Eric Dumazet
0 siblings, 1 reply; 16+ messages in thread
From: Ingo Molnar @ 2025-03-23 21:38 UTC (permalink / raw)
To: Eric Dumazet
Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H . Peter Anvin, Peter Zijlstra, Steven Rostedt, linux-kernel,
Alexei Starovoitov, Daniel Borkmann, Masami Hiramatsu, x86, bpf,
Eric Dumazet, Greg Thelen, Stephane Eranian
* Eric Dumazet <edumazet@google.com> wrote:
> eBPF programs can be run 20,000,000+ times per second on busy servers.
>
> Whenever /proc/sys/kernel/bpf_stats_enabled is turned off,
> hundreds of calls sites are patched from text_poke_bp_batch()
> and we see a critical loss of performance due to false sharing
> on bp_desc.refs lasting up to three seconds.
> @@ -2413,8 +2415,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
> /*
> * Remove and wait for refs to be zero.
> */
> - if (!atomic_dec_and_test(&bp_desc.refs))
> - atomic_cond_read_acquire(&bp_desc.refs, !VAL);
> + for_each_possible_cpu(i) {
> + atomic_t *refs = per_cpu_ptr(&bp_refs, i);
> +
> + if (!atomic_dec_and_test(refs))
> + atomic_cond_read_acquire(refs, !VAL);
> + }
So your patch changes text_poke_bp_batch() to busy-spin-wait for
bp_refs to go to zero on all 480 CPUs.
Your measurement is using /proc/sys/kernel/bpf_stats_enabled on a
single CPU, right?
What's the adversarial workload here? Spamming bpf_stats_enabled on all
CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
user if bpf_stats_enabled serializes access?
Does anything undesirable happen in that case?
Thanks,
Ingo
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
2025-03-23 21:38 ` Ingo Molnar
@ 2025-03-24 3:59 ` Eric Dumazet
2025-03-24 7:16 ` Ingo Molnar
0 siblings, 1 reply; 16+ messages in thread
From: Eric Dumazet @ 2025-03-24 3:59 UTC (permalink / raw)
To: Ingo Molnar
Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H . Peter Anvin, Peter Zijlstra, Steven Rostedt, linux-kernel,
Alexei Starovoitov, Daniel Borkmann, Masami Hiramatsu, x86, bpf,
Eric Dumazet, Greg Thelen, Stephane Eranian
On Sun, Mar 23, 2025 at 10:38 PM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Eric Dumazet <edumazet@google.com> wrote:
>
> > eBPF programs can be run 20,000,000+ times per second on busy servers.
> >
> > Whenever /proc/sys/kernel/bpf_stats_enabled is turned off,
> > hundreds of calls sites are patched from text_poke_bp_batch()
> > and we see a critical loss of performance due to false sharing
> > on bp_desc.refs lasting up to three seconds.
>
> > @@ -2413,8 +2415,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
> > /*
> > * Remove and wait for refs to be zero.
> > */
> > - if (!atomic_dec_and_test(&bp_desc.refs))
> > - atomic_cond_read_acquire(&bp_desc.refs, !VAL);
> > + for_each_possible_cpu(i) {
> > + atomic_t *refs = per_cpu_ptr(&bp_refs, i);
> > +
> > + if (!atomic_dec_and_test(refs))
> > + atomic_cond_read_acquire(refs, !VAL);
> > + }
>
> So your patch changes text_poke_bp_batch() to busy-spin-wait for
> bp_refs to go to zero on all 480 CPUs.
>
> Your measurement is using /proc/sys/kernel/bpf_stats_enabled on a
> single CPU, right?
Yes, some daemon enables bpf_stats for a small amount of time (one
second) to gather stats
on eBPF run time costs. (bpftool prog | grep run_time)
One eBPF selftest can also do this.
tools/testing/selftests/bpf/prog_tests/enable_stats.c
>
> What's the adversarial workload here? Spamming bpf_stats_enabled on all
> CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> user if bpf_stats_enabled serializes access?
The workload is having ~480 cpus running various eBPF programs all
over the place.
In the perf bit I added in the changelog, we see an eBPF program
hooked at the xmit of each packet.
But the fd = bpf_enable_stats(BPF_STATS_RUN_TIME) / .... / close(fd)
only happens from time to time,
because of the supposed extra cost of fetching two extra time stamps.
BTW, before the patch stats on my test host look like
105: sched_cls name hn_egress tag 699fc5eea64144e3 gpl run_time_ns
3009063719 run_cnt 82757845
-> average cost is 36 nsec per call
And after the patch :
105: sched_cls name hn_egress tag 699fc5eea64144e3 gpl run_time_ns
1928223019 run_cnt 67682728
-> average cost is 28 nsec per call
>
> Does anything undesirable happen in that case?
The case of multiple threads trying to flip bpf_stats_enabled is
handled by bpf_stats_enabled_mutex.
Thanks !
>
> Thanks,
>
> Ingo
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
2025-03-24 3:59 ` Eric Dumazet
@ 2025-03-24 7:16 ` Ingo Molnar
2025-03-24 7:47 ` Eric Dumazet
0 siblings, 1 reply; 16+ messages in thread
From: Ingo Molnar @ 2025-03-24 7:16 UTC (permalink / raw)
To: Eric Dumazet
Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H . Peter Anvin, Peter Zijlstra, Steven Rostedt, linux-kernel,
Alexei Starovoitov, Daniel Borkmann, Masami Hiramatsu, x86, bpf,
Eric Dumazet, Greg Thelen, Stephane Eranian
* Eric Dumazet <edumazet@google.com> wrote:
> > What's the adversarial workload here? Spamming bpf_stats_enabled on all
> > CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> > user if bpf_stats_enabled serializes access?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > Does anything undesirable happen in that case?
>
> The case of multiple threads trying to flip bpf_stats_enabled is
> handled by bpf_stats_enabled_mutex.
So my suggested workload wasn't adversarial enough due to
bpf_stats_enabled_mutex: how about some other workload that doesn't
serialize access to text_poke_bp_batch()?
Thanks,
Ingo
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
2025-03-24 7:16 ` Ingo Molnar
@ 2025-03-24 7:47 ` Eric Dumazet
2025-03-24 7:53 ` Eric Dumazet
2025-03-24 8:02 ` [PATCH] x86/alternatives: remove false sharing in poke_int3_handler() Ingo Molnar
0 siblings, 2 replies; 16+ messages in thread
From: Eric Dumazet @ 2025-03-24 7:47 UTC (permalink / raw)
To: Ingo Molnar
Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H . Peter Anvin, Peter Zijlstra, Steven Rostedt, linux-kernel,
Alexei Starovoitov, Daniel Borkmann, Masami Hiramatsu, x86, bpf,
Eric Dumazet, Greg Thelen, Stephane Eranian
On Mon, Mar 24, 2025 at 8:16 AM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Eric Dumazet <edumazet@google.com> wrote:
>
> > > What's the adversarial workload here? Spamming bpf_stats_enabled on all
> > > CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> > > user if bpf_stats_enabled serializes access?
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> > > Does anything undesirable happen in that case?
> >
> > The case of multiple threads trying to flip bpf_stats_enabled is
> > handled by bpf_stats_enabled_mutex.
>
> So my suggested workload wasn't adversarial enough due to
> bpf_stats_enabled_mutex: how about some other workload that doesn't
> serialize access to text_poke_bp_batch()?
Do you have a specific case in mind that I can test on these big platforms ?
text_poke_bp_batch() calls themselves are serialized by text_mutex, it
is not clear what you are looking for.
Thanks.
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
2025-03-24 7:47 ` Eric Dumazet
@ 2025-03-24 7:53 ` Eric Dumazet
2025-03-24 8:04 ` Ingo Molnar
2025-03-24 11:33 ` Peter Zijlstra
2025-03-24 8:02 ` [PATCH] x86/alternatives: remove false sharing in poke_int3_handler() Ingo Molnar
1 sibling, 2 replies; 16+ messages in thread
From: Eric Dumazet @ 2025-03-24 7:53 UTC (permalink / raw)
To: Ingo Molnar
Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H . Peter Anvin, Peter Zijlstra, Steven Rostedt, linux-kernel,
Alexei Starovoitov, Daniel Borkmann, Masami Hiramatsu, x86, bpf,
Eric Dumazet, Greg Thelen, Stephane Eranian
On Mon, Mar 24, 2025 at 8:47 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, Mar 24, 2025 at 8:16 AM Ingo Molnar <mingo@kernel.org> wrote:
> >
> >
> > * Eric Dumazet <edumazet@google.com> wrote:
> >
> > > > What's the adversarial workload here? Spamming bpf_stats_enabled on all
> > > > CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> > > > user if bpf_stats_enabled serializes access?
> > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> >
> > > > Does anything undesirable happen in that case?
> > >
> > > The case of multiple threads trying to flip bpf_stats_enabled is
> > > handled by bpf_stats_enabled_mutex.
> >
> > So my suggested workload wasn't adversarial enough due to
> > bpf_stats_enabled_mutex: how about some other workload that doesn't
> > serialize access to text_poke_bp_batch()?
>
> Do you have a specific case in mind that I can test on these big platforms ?
>
> text_poke_bp_batch() calls themselves are serialized by text_mutex, it
> is not clear what you are looking for.
BTW the atomic_cond_read_acquire() part is never called even during my
stress test.
We could add this eventually:
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index d7afbf822c45..5d364e990055 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2418,7 +2418,7 @@ static void text_poke_bp_batch(struct
text_poke_loc *tp, unsigned int nr_entries
for_each_possible_cpu(i) {
atomic_t *refs = per_cpu_ptr(&bp_refs, i);
- if (!atomic_dec_and_test(refs))
+ if (unlikely(!atomic_dec_and_test(refs)))
atomic_cond_read_acquire(refs, !VAL);
}
}
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
2025-03-24 7:47 ` Eric Dumazet
2025-03-24 7:53 ` Eric Dumazet
@ 2025-03-24 8:02 ` Ingo Molnar
2025-03-24 8:20 ` Eric Dumazet
1 sibling, 1 reply; 16+ messages in thread
From: Ingo Molnar @ 2025-03-24 8:02 UTC (permalink / raw)
To: Eric Dumazet
Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H . Peter Anvin, Peter Zijlstra, Steven Rostedt, linux-kernel,
Alexei Starovoitov, Daniel Borkmann, Masami Hiramatsu, x86, bpf,
Eric Dumazet, Greg Thelen, Stephane Eranian
* Eric Dumazet <edumazet@google.com> wrote:
> Do you have a specific case in mind that I can test on these big
> platforms ?
No. I was thinking of large-scale kprobes or ftrace patching - but you
are right that the text_mutex should naturally serialize all the
write-side code here.
Mind adding your second round of test results to the changelog as well,
which improved per call overhead from 36 to 28 nsecs?
Thanks,
Ingo
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
2025-03-24 7:53 ` Eric Dumazet
@ 2025-03-24 8:04 ` Ingo Molnar
2025-03-24 11:33 ` Peter Zijlstra
1 sibling, 0 replies; 16+ messages in thread
From: Ingo Molnar @ 2025-03-24 8:04 UTC (permalink / raw)
To: Eric Dumazet
Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H . Peter Anvin, Peter Zijlstra, Steven Rostedt, linux-kernel,
Alexei Starovoitov, Daniel Borkmann, Masami Hiramatsu, x86, bpf,
Eric Dumazet, Greg Thelen, Stephane Eranian
* Eric Dumazet <edumazet@google.com> wrote:
> On Mon, Mar 24, 2025 at 8:47 AM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Mon, Mar 24, 2025 at 8:16 AM Ingo Molnar <mingo@kernel.org> wrote:
> > >
> > >
> > > * Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > > > What's the adversarial workload here? Spamming bpf_stats_enabled on all
> > > > > CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> > > > > user if bpf_stats_enabled serializes access?
> > > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > >
> > > > > Does anything undesirable happen in that case?
> > > >
> > > > The case of multiple threads trying to flip bpf_stats_enabled is
> > > > handled by bpf_stats_enabled_mutex.
> > >
> > > So my suggested workload wasn't adversarial enough due to
> > > bpf_stats_enabled_mutex: how about some other workload that doesn't
> > > serialize access to text_poke_bp_batch()?
> >
> > Do you have a specific case in mind that I can test on these big platforms ?
> >
> > text_poke_bp_batch() calls themselves are serialized by text_mutex, it
> > is not clear what you are looking for.
>
>
> BTW the atomic_cond_read_acquire() part is never called even during my
> stress test.
Yeah, that code threw me off - can it really happen with text_mutex
serializing all of it?
> @@ -2418,7 +2418,7 @@ static void text_poke_bp_batch(struct
> text_poke_loc *tp, unsigned int nr_entries
> for_each_possible_cpu(i) {
> atomic_t *refs = per_cpu_ptr(&bp_refs, i);
>
> - if (!atomic_dec_and_test(refs))
> + if (unlikely(!atomic_dec_and_test(refs)))
> atomic_cond_read_acquire(refs, !VAL);
If it could never happen, then should that condition be a
WARN_ON_ONCE() perhaps?
Thanks,
Ingo
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
2025-03-24 8:02 ` [PATCH] x86/alternatives: remove false sharing in poke_int3_handler() Ingo Molnar
@ 2025-03-24 8:20 ` Eric Dumazet
0 siblings, 0 replies; 16+ messages in thread
From: Eric Dumazet @ 2025-03-24 8:20 UTC (permalink / raw)
To: Ingo Molnar
Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H . Peter Anvin, Peter Zijlstra, Steven Rostedt, linux-kernel,
Alexei Starovoitov, Daniel Borkmann, Masami Hiramatsu, x86, bpf,
Eric Dumazet, Greg Thelen, Stephane Eranian
On Mon, Mar 24, 2025 at 9:02 AM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Eric Dumazet <edumazet@google.com> wrote:
>
> > Do you have a specific case in mind that I can test on these big
> > platforms ?
>
> No. I was thinking of large-scale kprobes or ftrace patching - but you
> are right that the text_mutex should naturally serialize all the
> write-side code here.
>
> Mind adding your second round of test results to the changelog as well,
> which improved per call overhead from 36 to 28 nsecs?
Sure thing, thanks !
Note the 36 to 28 nsec was on a test host, not really under production stress.
As all of our production runs with the old code, I can not really tell
what would be the effective change once new kernels are rolled out.
When updating an unique and shared atomic_t from 480 cpus (worst case
scenario), we need more than 40000 cycles per operation.
perf stat atomic_bench -T480
The atomic counter is 21904528, total_cycles=2095231571464, 95652 avg
cycles per update
[05] 7866 in
[32,64[ cycles (53 avg)
[06] 2196 in
[64,128[ cycles (81 avg)
[07] 2942 in
[128,256[ cycles (202 avg)
[08] 1865 in
[256,512[ cycles (383 avg)
[09] 4251 in
[512,1024[ cycles (780 avg)
[10] 72248 in
[1024,2048[ cycles (1722 avg)
[11] *** 438110 in
[2048,4096[ cycles (3217 avg)
[12] *********** 1703927 in
[4096,8192[ cycles (6199 avg)
[13] ************************** 3869889 in
[8192,16384[ cycles (12320 avg)
[14] *************************** 4040952 in
[16384,32768[ cycles (25185 avg)
[15] ************************************************** 7261596 in
[32768,65536[ cycles (46884 avg)
[16] ****************** 2688791 in
[65536,131072[ cycles (83552 avg)
[17] * 253104 in
[131072,262144[ cycles (189642 avg)
[18] ** 326075 in
[262144,524288[ cycles (349319 avg)
[19] ****** 901293 in
[524288,1048576[ cycles (890724 avg)
[20] ** 321711 in
[1048576,2097152[ cycles (1205250 avg)
[21] 6616 in
[2097152,4194304[ cycles (2436096 avg)
Performance counter stats for './atomic_bench -T480':
964,194.88 msec task-clock # 467.120 CPUs
utilized
13,795 context-switches # 14.307 M/sec
480 cpu-migrations # 0.498 M/sec
1,605 page-faults # 1.665 M/sec
3,182,241,468,867 cycles # 3300416.170 GHz
11,077,646,267 instructions # 0.00 insn per
cycle
1,711,894,269 branches # 1775466.627 M/sec
3,747,877 branch-misses # 0.22% of all
branches
2.064128692 seconds time elapsed
I said the atomic_cond_read_acquire(refs, !VAL) was not called in my tests,
but it is a valid situation, we should not add a WARN_ON_ONCE().
I will simply add the unlikely()
Thanks.
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
2025-03-24 7:53 ` Eric Dumazet
2025-03-24 8:04 ` Ingo Molnar
@ 2025-03-24 11:33 ` Peter Zijlstra
2025-03-25 8:41 ` Ingo Molnar
1 sibling, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2025-03-24 11:33 UTC (permalink / raw)
To: Eric Dumazet
Cc: Ingo Molnar, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, H . Peter Anvin, Steven Rostedt, linux-kernel,
Alexei Starovoitov, Daniel Borkmann, Masami Hiramatsu, x86, bpf,
Eric Dumazet, Greg Thelen, Stephane Eranian
On Mon, Mar 24, 2025 at 08:53:31AM +0100, Eric Dumazet wrote:
> BTW the atomic_cond_read_acquire() part is never called even during my
> stress test.
Yes, IIRC this is due to text_poke_sync() serializing the state, as that
does a synchronous IPI broadcast, which by necessity requires all
previous INT3 handlers to complete.
You can only hit that case if the INT3 remains after step-3 (IOW you're
actively writing INT3 into the text). This is exceedingly rare.
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
2025-03-24 11:33 ` Peter Zijlstra
@ 2025-03-25 8:41 ` Ingo Molnar
2025-03-25 10:30 ` Peter Zijlstra
0 siblings, 1 reply; 16+ messages in thread
From: Ingo Molnar @ 2025-03-25 8:41 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Eric Dumazet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, H . Peter Anvin, Steven Rostedt, linux-kernel,
Alexei Starovoitov, Daniel Borkmann, Masami Hiramatsu, x86, bpf,
Eric Dumazet, Greg Thelen, Stephane Eranian
* Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Mar 24, 2025 at 08:53:31AM +0100, Eric Dumazet wrote:
>
> > BTW the atomic_cond_read_acquire() part is never called even during my
> > stress test.
>
> Yes, IIRC this is due to text_poke_sync() serializing the state, as that
> does a synchronous IPI broadcast, which by necessity requires all
> previous INT3 handlers to complete.
>
> You can only hit that case if the INT3 remains after step-3 (IOW you're
> actively writing INT3 into the text). This is exceedingly rare.
Might make sense to add a comment for that.
Also, any strong objections against doing this in the namespace:
s/bp_/int3_
?
Half of the code already calls it a variant of 'int3', half of it 'bp',
which I had to think for a couple of seconds goes for breakpoint, not
base pointer ... ;-)
Might as well standardize on int3_ and call it a day?
Thanks,
Ingo
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
2025-03-25 8:41 ` Ingo Molnar
@ 2025-03-25 10:30 ` Peter Zijlstra
2025-03-25 11:26 ` Ingo Molnar
2025-03-25 11:36 ` [tip: x86/alternatives] x86/alternatives: Document the text_poke_bp_batch() synchronization rules a bit more tip-bot2 for Peter Zijlstra
0 siblings, 2 replies; 16+ messages in thread
From: Peter Zijlstra @ 2025-03-25 10:30 UTC (permalink / raw)
To: Ingo Molnar
Cc: Eric Dumazet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, H . Peter Anvin, Steven Rostedt, linux-kernel,
Alexei Starovoitov, Daniel Borkmann, Masami Hiramatsu, x86, bpf,
Eric Dumazet, Greg Thelen, Stephane Eranian
On Tue, Mar 25, 2025 at 09:41:10AM +0100, Ingo Molnar wrote:
>
> * Peter Zijlstra <peterz@infradead.org> wrote:
>
> > On Mon, Mar 24, 2025 at 08:53:31AM +0100, Eric Dumazet wrote:
> >
> > > BTW the atomic_cond_read_acquire() part is never called even during my
> > > stress test.
> >
> > Yes, IIRC this is due to text_poke_sync() serializing the state, as that
> > does a synchronous IPI broadcast, which by necessity requires all
> > previous INT3 handlers to complete.
> >
> > You can only hit that case if the INT3 remains after step-3 (IOW you're
> > actively writing INT3 into the text). This is exceedingly rare.
>
> Might make sense to add a comment for that.
Sure, find below.
> Also, any strong objections against doing this in the namespace:
>
> s/bp_/int3_
>
> ?
>
> Half of the code already calls it a variant of 'int3', half of it 'bp',
> which I had to think for a couple of seconds goes for breakpoint, not
> base pointer ... ;-)
It actually is breakpoint, as in INT3 raises #BP. For complete confusion
the things that are commonly known as debug breakpoints, those things in
DR7, they raise #DB or debug exceptions.
> Might as well standardize on int3_ and call it a day?
Yeah, perhaps. At some point you've got to know that INT3->#BP and
DR7->#DB and it all sorta makes sense, but *shrug* :-)
---
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index bf82c6f7d690..01e94603e767 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2749,6 +2749,13 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
/*
* Remove and wait for refs to be zero.
+ *
+ * Notably, if after step-3 above the INT3 got removed, then the
+ * text_poke_sync() will have serialized against any running INT3
+ * handlers and the below spin-wait will not happen.
+ *
+ * IOW. unless the replacement instruction is INT3, this case goes
+ * unused.
*/
if (!atomic_dec_and_test(&bp_desc.refs))
atomic_cond_read_acquire(&bp_desc.refs, !VAL);
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
2025-03-25 10:30 ` Peter Zijlstra
@ 2025-03-25 11:26 ` Ingo Molnar
2025-03-25 12:31 ` Peter Zijlstra
2025-03-25 11:36 ` [tip: x86/alternatives] x86/alternatives: Document the text_poke_bp_batch() synchronization rules a bit more tip-bot2 for Peter Zijlstra
1 sibling, 1 reply; 16+ messages in thread
From: Ingo Molnar @ 2025-03-25 11:26 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Eric Dumazet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, H . Peter Anvin, Steven Rostedt, linux-kernel,
Alexei Starovoitov, Daniel Borkmann, Masami Hiramatsu, x86, bpf,
Eric Dumazet, Greg Thelen, Stephane Eranian
* Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Mar 25, 2025 at 09:41:10AM +0100, Ingo Molnar wrote:
> >
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > > On Mon, Mar 24, 2025 at 08:53:31AM +0100, Eric Dumazet wrote:
> > >
> > > > BTW the atomic_cond_read_acquire() part is never called even during my
> > > > stress test.
> > >
> > > Yes, IIRC this is due to text_poke_sync() serializing the state, as that
> > > does a synchronous IPI broadcast, which by necessity requires all
> > > previous INT3 handlers to complete.
> > >
> > > You can only hit that case if the INT3 remains after step-3 (IOW you're
> > > actively writing INT3 into the text). This is exceedingly rare.
> >
> > Might make sense to add a comment for that.
>
> Sure, find below.
>
> > Also, any strong objections against doing this in the namespace:
> >
> > s/bp_/int3_
> >
> > ?
> >
> > Half of the code already calls it a variant of 'int3', half of it 'bp',
> > which I had to think for a couple of seconds goes for breakpoint, not
> > base pointer ... ;-)
>
> It actually is breakpoint, as in INT3 raises #BP. For complete confusion
> the things that are commonly known as debug breakpoints, those things in
> DR7, they raise #DB or debug exceptions.
Yeah, it's a software breakpoint, swbp, that raises the #BP trap.
'bp' is confusingly aliased (in my brain at least) with 'base pointer'
register naming and assembler syntax: as in bp, ebp, rbp.
So I'd prefer if it was named consistently:
text_poke_int3_batch()
text_poke_int3_handler()
...
Not the current mishmash of:
text_poke_bp_batch()
poke_int3_handler()
...
Does this make more sense?
> > Might as well standardize on int3_ and call it a day?
>
> Yeah, perhaps. At some point you've got to know that INT3->#BP and
> DR7->#DB and it all sorta makes sense, but *shrug* :-)
Yeah, so I do know what #BP is, but what the heck disambiguates the two
meanings of _bp and why do we have the above jungle of an inconsistent
namespace? :-)
Picking _int3 would neatly solve all of that.
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index bf82c6f7d690..01e94603e767 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -2749,6 +2749,13 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
>
> /*
> * Remove and wait for refs to be zero.
> + *
> + * Notably, if after step-3 above the INT3 got removed, then the
> + * text_poke_sync() will have serialized against any running INT3
> + * handlers and the below spin-wait will not happen.
> + *
> + * IOW. unless the replacement instruction is INT3, this case goes
> + * unused.
> */
> if (!atomic_dec_and_test(&bp_desc.refs))
> atomic_cond_read_acquire(&bp_desc.refs, !VAL);
Thanks! I stuck this into tip:x86/alternatives, with your SOB.
Ingo
^ permalink raw reply [flat|nested] 16+ messages in thread
* [tip: x86/alternatives] x86/alternatives: Document the text_poke_bp_batch() synchronization rules a bit more
2025-03-25 10:30 ` Peter Zijlstra
2025-03-25 11:26 ` Ingo Molnar
@ 2025-03-25 11:36 ` tip-bot2 for Peter Zijlstra
1 sibling, 0 replies; 16+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2025-03-25 11:36 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Ingo Molnar, Eric Dumazet, Brian Gerst,
Juergen Gross, H. Peter Anvin, Linus Torvalds, Josh Poimboeuf,
x86, linux-kernel
The following commit has been merged into the x86/alternatives branch of tip:
Commit-ID: 451283cd40bceccc02397d0e5b2b6eda6047b1ed
Gitweb: https://git.kernel.org/tip/451283cd40bceccc02397d0e5b2b6eda6047b1ed
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Tue, 25 Mar 2025 11:30:47 +01:00
Committer: Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 25 Mar 2025 12:28:35 +01:00
x86/alternatives: Document the text_poke_bp_batch() synchronization rules a bit more
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20250325103047.GH36322@noisy.programming.kicks-ass.net
---
arch/x86/kernel/alternative.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 85089c7..5f44814 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2751,6 +2751,13 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
/*
* Remove and wait for refs to be zero.
+ *
+ * Notably, if after step-3 above the INT3 got removed, then the
+ * text_poke_sync() will have serialized against any running INT3
+ * handlers and the below spin-wait will not happen.
+ *
+ * IOW. unless the replacement instruction is INT3, this case goes
+ * unused.
*/
for_each_possible_cpu(i) {
atomic_t *refs = per_cpu_ptr(&bp_refs, i);
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
2025-03-25 11:26 ` Ingo Molnar
@ 2025-03-25 12:31 ` Peter Zijlstra
2025-03-27 20:56 ` Ingo Molnar
0 siblings, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2025-03-25 12:31 UTC (permalink / raw)
To: Ingo Molnar
Cc: Eric Dumazet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, H . Peter Anvin, Steven Rostedt, linux-kernel,
Alexei Starovoitov, Daniel Borkmann, Masami Hiramatsu, x86, bpf,
Eric Dumazet, Greg Thelen, Stephane Eranian
On Tue, Mar 25, 2025 at 12:26:36PM +0100, Ingo Molnar wrote:
> Yeah, so I do know what #BP is, but what the heck disambiguates the two
> meanings of _bp and why do we have the above jungle of an inconsistent
> namespace? :-)
>
> Picking _int3 would neatly solve all of that.
Sure; the most obvious case where BP would make sense, the trap entry
point, we already use int3 so yeah, make it int3 throughout.
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
2025-03-25 12:31 ` Peter Zijlstra
@ 2025-03-27 20:56 ` Ingo Molnar
0 siblings, 0 replies; 16+ messages in thread
From: Ingo Molnar @ 2025-03-27 20:56 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Eric Dumazet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, H . Peter Anvin, Steven Rostedt, linux-kernel,
Alexei Starovoitov, Daniel Borkmann, Masami Hiramatsu, x86, bpf,
Eric Dumazet, Greg Thelen, Stephane Eranian
* Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Mar 25, 2025 at 12:26:36PM +0100, Ingo Molnar wrote:
>
> > Yeah, so I do know what #BP is, but what the heck disambiguates the two
> > meanings of _bp and why do we have the above jungle of an inconsistent
> > namespace? :-)
> >
> > Picking _int3 would neatly solve all of that.
>
> Sure; the most obvious case where BP would make sense, the trap entry
> point, we already use int3 so yeah, make it int3 throughout.
Okay - I just sent out a series to do this. Found a few other things to
fix/improve along the way:
https://lore.kernel.org/r/20250327205355.378659-1-mingo@kernel.org
Thanks,
Ingo
^ permalink raw reply [flat|nested] 16+ messages in thread
end of thread, other threads:[~2025-03-27 20:57 UTC | newest]
Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-03-23 7:25 [PATCH] x86/alternatives: remove false sharing in poke_int3_handler() Eric Dumazet
2025-03-23 21:38 ` Ingo Molnar
2025-03-24 3:59 ` Eric Dumazet
2025-03-24 7:16 ` Ingo Molnar
2025-03-24 7:47 ` Eric Dumazet
2025-03-24 7:53 ` Eric Dumazet
2025-03-24 8:04 ` Ingo Molnar
2025-03-24 11:33 ` Peter Zijlstra
2025-03-25 8:41 ` Ingo Molnar
2025-03-25 10:30 ` Peter Zijlstra
2025-03-25 11:26 ` Ingo Molnar
2025-03-25 12:31 ` Peter Zijlstra
2025-03-27 20:56 ` Ingo Molnar
2025-03-25 11:36 ` [tip: x86/alternatives] x86/alternatives: Document the text_poke_bp_batch() synchronization rules a bit more tip-bot2 for Peter Zijlstra
2025-03-24 8:02 ` [PATCH] x86/alternatives: remove false sharing in poke_int3_handler() Ingo Molnar
2025-03-24 8:20 ` Eric Dumazet
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox