From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org, x86@kernel.org, dave.hansen@linux.intel.com,
peterz@infradead.org, kernel-team@meta.com, bp@alien8.de,
Rik van Riel <riel@meta.com>, Rik van Riel <riel@surriel.com>
Subject: [RFC v5 8/8] x86/mm: make RAR invalidation scalable by skipping duplicate APIC pokes
Date: Fri, 21 Nov 2025 13:54:29 -0500 [thread overview]
Message-ID: <20251121185530.21876-9-riel@surriel.com> (raw)
In-Reply-To: <20251121185530.21876-1-riel@surriel.com>
From: Rik van Riel <riel@meta.com>
The naive RAR implementation suffers from heavy contention in
apic_mem_wait_irc_idle(), when multiple CPUs send out RAR
interrupts simultaneously.
When a CPU receives a RAR, it will scan its action vector, and
process all the rar_payload entries where the corresponding action
vector is set to RAR_PENDING. After processing each payload, it
will set the corresponding action vector to RAR_SUCCESS.
That means sending one single RAR to a CPU is enough for that CPU
to process all the pending RAR payloads, and other CPUs do not
usually need to send additional RARs to that CPU.
Optimistically avoid sending RAR interrupts to CPUs that are
already processing a RAR, looping back only if our request
went unprocessed, but the remote CPU is no longer processing
any RARs.
This changes will-it-scale tlb_flush2_threads numbers like this:
loops/sec IPI flush naive RAR optimized RAR
threads
1 175k 174k 170k
5 337k 345k 321k
10 530k 469k 497k
20 752k 363k 616k
30 922k 259k 754k
40 1005k 205k 779k
50 1073k 164k 883k
60 1040k 141k 813k
The numbers above are on a 30 core / 60 thread, single socket
Sapphire Rapids system. Average of 4 runs.
This exact same code reached up to 1200k loops/second on a
-tip kernel from a few weeks ago, and did so reliably across
several reboots. I have no good explanation for the difference.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/mm/rar.c | 60 ++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 54 insertions(+), 6 deletions(-)
diff --git a/arch/x86/mm/rar.c b/arch/x86/mm/rar.c
index 76959782fb03..fd89eaaf4fc1 100644
--- a/arch/x86/mm/rar.c
+++ b/arch/x86/mm/rar.c
@@ -11,6 +11,7 @@
#include <asm/tlbflush.h>
static DEFINE_PER_CPU(struct cpumask, rar_cpu_mask);
+static DEFINE_PER_CPU(struct cpumask, apic_cpu_mask);
#define RAR_SUCCESS 0x00
#define RAR_PENDING 0x01
@@ -47,6 +48,32 @@ static struct rar_lock rar_locks[RAR_MAX_PAYLOADS] __cacheline_aligned;
*/
static DEFINE_PER_CPU_ALIGNED(u8[RAR_MAX_PAYLOADS], rar_action);
+/*
+ * Tracks whether a RAR is in flight to this CPU. This is used
+ * to avoid sending another RAR (waiting on the APIC) when the
+ * target CPU is already handling RARs.
+ */
+static DEFINE_PER_CPU(int, rar_pending) = -1;
+
+static bool get_rar_pending(int target_cpu, int this_cpu)
+{
+ int *this_rar_pending = &per_cpu(rar_pending, target_cpu);
+
+ /* Another CPU is flushing this CPU already. */
+ if (*this_rar_pending != -1)
+ return false;
+
+ /* Is this_cpu the one that needs to send a RAR to target_cpu? */
+ return cmpxchg(this_rar_pending, -1, this_cpu) == -1;
+}
+
+static void release_rar_pending(int target_cpu, int this_cpu)
+{
+ /* If this_cpu sent the RAR to target_cpu, clear rar_pending */
+ if (READ_ONCE(per_cpu(rar_pending, target_cpu)) == this_cpu)
+ WRITE_ONCE(per_cpu(rar_pending, target_cpu), -1);
+}
+
/*
* TODO: group CPUs together based on locality in the system instead
* of CPU number, to further reduce the cost of contention.
@@ -113,7 +140,7 @@ static void set_action_entry(unsigned long payload_nr, int target_cpu)
WRITE_ONCE(bitmap[payload_nr], RAR_PENDING);
}
-static void wait_for_action_done(unsigned long payload_nr, int target_cpu)
+static u8 wait_for_action_done(unsigned long payload_nr, int target_cpu)
{
u8 status;
u8 *rar_actions = per_cpu(rar_action, target_cpu);
@@ -123,9 +150,14 @@ static void wait_for_action_done(unsigned long payload_nr, int target_cpu)
while (status == RAR_PENDING) {
cpu_relax();
status = READ_ONCE(rar_actions[payload_nr]);
+ /* Target CPU is not processing RARs right now. */
+ if (READ_ONCE(per_cpu(rar_pending, target_cpu)) == -1)
+ return status;
}
WARN_ON_ONCE(rar_actions[payload_nr] != RAR_SUCCESS);
+
+ return status;
}
void rar_cpu_init(void)
@@ -183,7 +215,7 @@ void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
{
unsigned long pages = (end - start + PAGE_SIZE) / PAGE_SIZE;
int cpu, this_cpu = smp_processor_id();
- cpumask_t *dest_mask;
+ cpumask_t *dest_mask, *apic_mask;
unsigned long payload_nr;
/* Catch the "end - start + PAGE_SIZE" overflow above. */
@@ -213,7 +245,9 @@ void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
* flushes at context switch time.
*/
dest_mask = this_cpu_ptr(&rar_cpu_mask);
+ apic_mask = this_cpu_ptr(&apic_cpu_mask);
cpumask_and(dest_mask, mask, cpu_online_mask);
+ cpumask_clear(apic_mask);
/* Some callers race with other CPUs changing the passed mask */
if (unlikely(!cpumask_weight(dest_mask)))
@@ -225,11 +259,25 @@ void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
for_each_cpu(cpu, dest_mask)
set_action_entry(payload_nr, cpu);
- /* Send a message to all CPUs in the map */
- native_send_rar_ipi(dest_mask);
+ do {
+ for_each_cpu(cpu, dest_mask) {
+ /* Track the CPUs that have no RAR pending (yet). */
+ if (get_rar_pending(cpu, this_cpu))
+ __cpumask_set_cpu(cpu, apic_mask);
+ }
- for_each_cpu(cpu, dest_mask)
- wait_for_action_done(payload_nr, cpu);
+ /* Send a message to the CPUs not processing RARs yet */
+ native_send_rar_ipi(apic_mask);
+
+ for_each_cpu(cpu, dest_mask) {
+ u8 status = wait_for_action_done(payload_nr, cpu);
+ if (status == RAR_SUCCESS) {
+ release_rar_pending(cpu, this_cpu);
+ __cpumask_clear_cpu(cpu, dest_mask);
+ __cpumask_clear_cpu(cpu, apic_mask);
+ }
+ }
+ } while (unlikely(cpumask_weight(dest_mask)));
free_payload_slot(payload_nr);
}
--
2.51.1
prev parent reply other threads:[~2025-11-21 18:55 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-11-21 18:54 [RFC v5 0/8] x86/mm: Intel RAR TLB invalidation support Rik van Riel
2025-11-21 18:54 ` [RFC v5 1/8] x86/mm: Introduce Remote Action Request MSRs Rik van Riel
2025-11-21 18:54 ` [RFC v5 2/8] x86/mm: enable BROADCAST_TLB_FLUSH on Intel, too Rik van Riel
2025-11-21 18:54 ` [RFC v5 3/8] x86/mm: Introduce X86_FEATURE_RAR Rik van Riel
2025-11-21 18:54 ` [RFC v5 4/8] x86/apic: Introduce Remote Action Request Operations Rik van Riel
2025-11-21 18:54 ` [RFC v5 5/8] x86/mm: Introduce Remote Action Request Rik van Riel
2025-11-21 18:54 ` [RFC v5 6/8] x86/mm: use RAR for kernel TLB flushes Rik van Riel
2025-11-21 18:54 ` [RFC v5 7/8] x86/mm: userspace & pageout flushing using Intel RAR Rik van Riel
2025-11-21 18:54 ` Rik van Riel [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251121185530.21876-9-riel@surriel.com \
--to=riel@surriel.com \
--cc=bp@alien8.de \
--cc=dave.hansen@linux.intel.com \
--cc=kernel-team@meta.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=peterz@infradead.org \
--cc=riel@meta.com \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox