All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ingo Molnar <mingo@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: Juergen Gross <jgross@suse.com>,
	"H . Peter Anvin" <hpa@zytor.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Peter Zijlstra <peterz@infradead.org>,
	Borislav Petkov <bp@alien8.de>,
	Thomas Gleixner <tglx@linutronix.de>,
	Eric Dumazet <edumazet@google.com>,
	Ingo Molnar <mingo@kernel.org>, Brian Gerst <brgerst@gmail.com>,
	Kees Cook <keescook@chromium.org>,
	Josh Poimboeuf <jpoimboe@redhat.com>
Subject: [PATCH 01/53] x86/alternatives: Improve code-patching scalability by removing false sharing in poke_int3_handler()
Date: Fri, 11 Apr 2025 07:40:13 +0200	[thread overview]
Message-ID: <20250411054105.2341982-2-mingo@kernel.org> (raw)
In-Reply-To: <20250411054105.2341982-1-mingo@kernel.org>

From: Eric Dumazet <edumazet@google.com>

eBPF programs can be run 50,000,000 times per second on busy servers.

Whenever /proc/sys/kernel/bpf_stats_enabled is turned off,
hundreds of call sites are patched from text_poke_bp_batch()
and we see a huge loss of performance due to false sharing
on bp_desc.refs lasting up to three seconds.

   51.30%  server_bin       [kernel.kallsyms]           [k] poke_int3_handler
            |
            |--46.45%--poke_int3_handler
            |          exc_int3
            |          asm_exc_int3
            |          |
            |          |--24.26%--cls_bpf_classify
            |          |          tcf_classify
            |          |          __dev_queue_xmit
            |          |          ip6_finish_output2
            |          |          ip6_output
            |          |          ip6_xmit
            |          |          inet6_csk_xmit
            |          |          __tcp_transmit_skb

Fix this by replacing bp_desc.refs with a per-cpu bp_refs.

Before the patch, on a host with 240 cores (480 threads):

  $ sysctl -wq kernel.bpf_stats_enabled=0

  text_poke_bp_batch(nr_entries=164) : Took 2655300 usec

  $ bpftool prog | grep run_time_ns
  ...
  105: sched_cls  name hn_egress  tag 699fc5eea64144e3  gpl run_time_ns
  3009063719 run_cnt 82757845 : average cost is 36 nsec per call

After this patch:

  $ sysctl -wq kernel.bpf_stats_enabled=0

  text_poke_bp_batch(nr_entries=164) : Took 702 usec

  $ bpftool prog | grep run_time_ns
  ...
  105: sched_cls  name hn_egress  tag 699fc5eea64144e3  gpl run_time_ns
  1928223019 run_cnt 67682728 : average cost is 28 nsec per call

I.e. text-patching performance improved 3700x: from 2.65 seconds
to 0.0007 seconds.

Since the atomic_cond_read_acquire(refs, !VAL) spin-loop was not triggered
even once in my tests, add an unlikely() annotation: not having to wait
for the refcount to drop to zero appears to be the common case.

[ mingo: Improved the changelog some more. ]

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20250325043316.874518-1-edumazet@google.com
---
 arch/x86/kernel/alternative.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index bf82c6f7d690..85089c79a828 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2474,28 +2474,29 @@ struct text_poke_loc {
 struct bp_patching_desc {
 	struct text_poke_loc *vec;
 	int nr_entries;
-	atomic_t refs;
 };
 
+static DEFINE_PER_CPU(atomic_t, bp_refs);
+
 static struct bp_patching_desc bp_desc;
 
 static __always_inline
 struct bp_patching_desc *try_get_desc(void)
 {
-	struct bp_patching_desc *desc = &bp_desc;
+	atomic_t *refs = this_cpu_ptr(&bp_refs);
 
-	if (!raw_atomic_inc_not_zero(&desc->refs))
+	if (!raw_atomic_inc_not_zero(refs))
 		return NULL;
 
-	return desc;
+	return &bp_desc;
 }
 
 static __always_inline void put_desc(void)
 {
-	struct bp_patching_desc *desc = &bp_desc;
+	atomic_t *refs = this_cpu_ptr(&bp_refs);
 
 	smp_mb__before_atomic();
-	raw_atomic_dec(&desc->refs);
+	raw_atomic_dec(refs);
 }
 
 static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
@@ -2528,9 +2529,9 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
 	 * Having observed our INT3 instruction, we now must observe
 	 * bp_desc with non-zero refcount:
 	 *
-	 *	bp_desc.refs = 1		INT3
-	 *	WMB				RMB
-	 *	write INT3			if (bp_desc.refs != 0)
+	 *	bp_refs = 1		INT3
+	 *	WMB			RMB
+	 *	write INT3		if (bp_refs != 0)
 	 */
 	smp_rmb();
 
@@ -2636,7 +2637,8 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 	 * Corresponds to the implicit memory barrier in try_get_desc() to
 	 * ensure reading a non-zero refcount provides up to date bp_desc data.
 	 */
-	atomic_set_release(&bp_desc.refs, 1);
+	for_each_possible_cpu(i)
+		atomic_set_release(per_cpu_ptr(&bp_refs, i), 1);
 
 	/*
 	 * Function tracing can enable thousands of places that need to be
@@ -2750,8 +2752,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 	/*
 	 * Remove and wait for refs to be zero.
 	 */
-	if (!atomic_dec_and_test(&bp_desc.refs))
-		atomic_cond_read_acquire(&bp_desc.refs, !VAL);
+	for_each_possible_cpu(i) {
+		atomic_t *refs = per_cpu_ptr(&bp_refs, i);
+
+		if (unlikely(!atomic_dec_and_test(refs)))
+			atomic_cond_read_acquire(refs, !VAL);
+	}
 }
 
 static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
-- 
2.45.2


  reply	other threads:[~2025-04-11  5:41 UTC|newest]

Thread overview: 105+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-04-11  5:40 [PATCH -v3 00/53] Simplify, reorganize and clean up the x86 text-patching code (alternative.c) Ingo Molnar
2025-04-11  5:40 ` Ingo Molnar [this message]
2025-04-11 10:02   ` [tip: x86/alternatives] x86/alternatives: Improve code-patching scalability by removing false sharing in poke_int3_handler() tip-bot2 for Eric Dumazet
2025-04-11  5:40 ` [PATCH 02/53] x86/alternatives: Document the text_poke_bp_batch() synchronization rules a bit more Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Peter Zijlstra
2025-04-11  5:40 ` [PATCH 03/53] x86/alternatives: Rename 'struct bp_patching_desc' to 'struct int3_patching_desc' Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] x86/alternatives: Rename 'struct bp_patching_desc' to 'struct text_poke_int3_vec' tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 04/53] x86/alternatives: Rename 'bp_refs' to 'int3_refs' Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] x86/alternatives: Rename 'bp_refs' to 'text_poke_array_refs' tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 05/53] x86/alternatives: Rename 'text_poke_bp_batch()' to 'smp_text_poke_batch_process()' Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 06/53] x86/alternatives: Rename 'text_poke_bp()' to 'smp_text_poke_single()' Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 07/53] x86/alternatives: Rename 'poke_int3_handler()' to 'smp_text_poke_int3_handler()' Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 08/53] x86/alternatives: Rename 'poking_mm' to 'text_poke_mm' Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 09/53] x86/alternatives: Rename 'poking_addr' to 'text_poke_mm_addr' Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 10/53] x86/alternatives: Rename 'bp_desc' to 'int3_desc' Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 11/53] x86/alternatives: Remove duplicate 'text_poke_early()' prototype Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 12/53] x86/alternatives: Update comments in int3_emulate_push() Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 13/53] x86/alternatives: Remove the confusing, inaccurate & unnecessary 'temp_mm_state_t' abstraction Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 14/53] x86/alternatives: Rename 'text_poke_flush()' to 'smp_text_poke_batch_flush()' Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 15/53] x86/alternatives: Rename 'text_poke_finish()' to 'smp_text_poke_batch_finish()' Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 16/53] x86/alternatives: Rename 'text_poke_queue()' to 'smp_text_poke_batch_add()' Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 17/53] x86/alternatives: Rename 'text_poke_loc_init()' to 'text_poke_int3_loc_init()' Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 18/53] x86/alternatives: Rename 'struct text_poke_loc' to 'struct smp_text_poke_loc' Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 19/53] x86/alternatives: Rename 'struct int3_patching_desc' to 'struct text_poke_int3_vec' Ingo Molnar
2025-04-11  5:40 ` [PATCH 20/53] x86/alternatives: Rename 'int3_desc' to 'int3_vec' Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 21/53] x86/alternatives: Add text_mutex) assert to smp_text_poke_batch_flush() Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 22/53] x86/alternatives: Use non-inverted logic instead of 'tp_order_fail()' Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 23/53] x86/alternatives: Remove the 'addr == NULL means forced-flush' hack from smp_text_poke_batch_finish()/smp_text_poke_batch_flush()/text_poke_addr_ordered() Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 24/53] x86/alternatives: Simplify smp_text_poke_single() by using tp_vec and existing APIs Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 25/53] x86/alternatives: Assert that smp_text_poke_int3_handler() can only ever handle 'tp_vec[]' based requests Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 26/53] x86/alternatives: Assert input parameters in smp_text_poke_batch_process() Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 27/53] x86/alternatives: Introduce 'struct smp_text_poke_array' and move tp_vec and tp_vec_nr to it Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 28/53] x86/alternatives: Remove the tp_vec indirection Ingo Molnar
2025-04-11 10:02   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 29/53] x86/alternatives: Rename 'try_get_desc()' to 'try_get_text_poke_array()' Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 30/53] x86/alternatives: Rename 'put_desc()' to 'put_text_poke_array()' Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 31/53] x86/alternatives: Simplify try_get_text_poke_array() Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 32/53] x86/alternatives: Simplify smp_text_poke_int3_handler() Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 33/53] x86/alternatives: Simplify smp_text_poke_batch_process() Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 34/53] x86/alternatives: Rename 'int3_refs' to 'text_poke_array_refs' Ingo Molnar
2025-04-11  5:40 ` [PATCH 35/53] x86/alternatives: Move the text_poke_array manipulation into text_poke_int3_loc_init() and rename it to __smp_text_poke_batch_add() Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 36/53] x86/alternatives: Remove the mixed-patching restriction on smp_text_poke_single() Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 37/53] x86/alternatives: Document 'smp_text_poke_single()' Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 38/53] x86/alternatives: Add documentation for smp_text_poke_batch_add() Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 39/53] x86/alternatives: Move text_poke_array completion from smp_text_poke_batch_finish() and smp_text_poke_batch_flush() to smp_text_poke_batch_process() Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 40/53] x86/alternatives: Rename 'text_poke_sync()' to 'smp_text_poke_sync_each_cpu()' Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 41/53] x86/alternatives: Simplify text_poke_addr_ordered() Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 42/53] x86/alternatives: Constify text_poke_addr() Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 43/53] x86/alternatives: Simplify and clean up patch_cmp() Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 44/53] x86/alternatives: Standardize on 'tpl' local variable names for 'struct smp_text_poke_loc *' Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 45/53] x86/alternatives: Rename 'TP_ARRAY_NR_ENTRIES_MAX' to 'TEXT_POKE_ARRAY_MAX' Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 46/53] x86/alternatives: Rename 'POKE_MAX_OPCODE_SIZE' to 'TEXT_POKE_MAX_OPCODE_SIZE' Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:40 ` [PATCH 47/53] x86/alternatives: Simplify the #include section Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:41 ` [PATCH 48/53] x86/alternatives: Move declarations of vmlinux.lds.S defined section symbols to <asm/alternative.h> Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:41 ` [PATCH 49/53] x86/alternatives: Remove 'smp_text_poke_batch_flush()' Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:41 ` [PATCH 50/53] x86/alternatives: Update the comments in smp_text_poke_batch_process() Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:41 ` [PATCH 51/53] x86/alternatives: Rename 'apply_relocation()' to 'text_poke_apply_relocation()' Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:41 ` [PATCH 52/53] x86/alternatives: Add comment about noinstr expectations Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Ingo Molnar
2025-04-11  5:41 ` [PATCH 53/53] x86/alternatives: Make smp_text_poke_batch_process() subsume smp_text_poke_batch_finish() Ingo Molnar
2025-04-11 10:01   ` [tip: x86/alternatives] " tip-bot2 for Nikolay Borisov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250411054105.2341982-2-mingo@kernel.org \
    --to=mingo@kernel.org \
    --cc=bp@alien8.de \
    --cc=brgerst@gmail.com \
    --cc=edumazet@google.com \
    --cc=hpa@zytor.com \
    --cc=jgross@suse.com \
    --cc=jpoimboe@redhat.com \
    --cc=keescook@chromium.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=peterz@infradead.org \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.