From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org, x86@kernel.org, dave.hansen@linux.intel.com,
peterz@infradead.org, kernel-team@meta.com, bp@alien8.de,
Rik van Riel <riel@meta.com>, Rik van Riel <riel@surriel.com>
Subject: [RFC v5 8/8] x86/mm: make RAR invalidation scalable by skipping duplicate APIC pokes
Date: Fri, 21 Nov 2025 13:54:29 -0500 [thread overview]
Message-ID: <20251121185530.21876-9-riel@surriel.com> (raw)
In-Reply-To: <20251121185530.21876-1-riel@surriel.com>
From: Rik van Riel <riel@meta.com>
The naive RAR implementation suffers from heavy contention in
apic_mem_wait_irc_idle(), when multiple CPUs send out RAR
interrupts simultaneously.
When a CPU receives a RAR, it will scan its action vector, and
process all the rar_payload entries where the corresponding action
vector is set to RAR_PENDING. After processing each payload, it
will set the corresponding action vector to RAR_SUCCESS.
That means sending one single RAR to a CPU is enough for that CPU
to process all the pending RAR payloads, and other CPUs do not
usually need to send additional RARs to that CPU.
Optimistically avoid sending RAR interrupts to CPUs that are
already processing a RAR, looping back only if our request
went unprocessed, but the remote CPU is no longer processing
any RARs.
This changes will-it-scale tlb_flush2_threads numbers like this:
loops/sec IPI flush naive RAR optimized RAR
threads
1 175k 174k 170k
5 337k 345k 321k
10 530k 469k 497k
20 752k 363k 616k
30 922k 259k 754k
40 1005k 205k 779k
50 1073k 164k 883k
60 1040k 141k 813k
The numbers above are on a 30 core / 60 thread, single socket
Sapphire Rapids system. Average of 4 runs.
This exact same code reached up to 1200k loops/second on a
-tip kernel from a few weeks ago, and did so reliably across
several reboots. I have no good explanation for the difference.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/mm/rar.c | 60 ++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 54 insertions(+), 6 deletions(-)
diff --git a/arch/x86/mm/rar.c b/arch/x86/mm/rar.c
index 76959782fb03..fd89eaaf4fc1 100644
--- a/arch/x86/mm/rar.c
+++ b/arch/x86/mm/rar.c
@@ -11,6 +11,7 @@
#include <asm/tlbflush.h>
static DEFINE_PER_CPU(struct cpumask, rar_cpu_mask);
+static DEFINE_PER_CPU(struct cpumask, apic_cpu_mask);
#define RAR_SUCCESS 0x00
#define RAR_PENDING 0x01
@@ -47,6 +48,32 @@ static struct rar_lock rar_locks[RAR_MAX_PAYLOADS] __cacheline_aligned;
*/
static DEFINE_PER_CPU_ALIGNED(u8[RAR_MAX_PAYLOADS], rar_action);
+/*
+ * Tracks whether a RAR is in flight to this CPU. This is used
+ * to avoid sending another RAR (waiting on the APIC) when the
+ * target CPU is already handling RARs.
+ */
+static DEFINE_PER_CPU(int, rar_pending) = -1;
+
+static bool get_rar_pending(int target_cpu, int this_cpu)
+{
+ int *this_rar_pending = &per_cpu(rar_pending, target_cpu);
+
+ /* Another CPU is flushing this CPU already. */
+ if (*this_rar_pending != -1)
+ return false;
+
+ /* Is this_cpu the one that needs to send a RAR to target_cpu? */
+ return cmpxchg(this_rar_pending, -1, this_cpu) == -1;
+}
+
+static void release_rar_pending(int target_cpu, int this_cpu)
+{
+ /* If this_cpu sent the RAR to target_cpu, clear rar_pending */
+ if (READ_ONCE(per_cpu(rar_pending, target_cpu)) == this_cpu)
+ WRITE_ONCE(per_cpu(rar_pending, target_cpu), -1);
+}
+
/*
* TODO: group CPUs together based on locality in the system instead
* of CPU number, to further reduce the cost of contention.
@@ -113,7 +140,7 @@ static void set_action_entry(unsigned long payload_nr, int target_cpu)
WRITE_ONCE(bitmap[payload_nr], RAR_PENDING);
}
-static void wait_for_action_done(unsigned long payload_nr, int target_cpu)
+static u8 wait_for_action_done(unsigned long payload_nr, int target_cpu)
{
u8 status;
u8 *rar_actions = per_cpu(rar_action, target_cpu);
@@ -123,9 +150,14 @@ static void wait_for_action_done(unsigned long payload_nr, int target_cpu)
while (status == RAR_PENDING) {
cpu_relax();
status = READ_ONCE(rar_actions[payload_nr]);
+ /* Target CPU is not processing RARs right now. */
+ if (READ_ONCE(per_cpu(rar_pending, target_cpu)) == -1)
+ return status;
}
WARN_ON_ONCE(rar_actions[payload_nr] != RAR_SUCCESS);
+
+ return status;
}
void rar_cpu_init(void)
@@ -183,7 +215,7 @@ void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
{
unsigned long pages = (end - start + PAGE_SIZE) / PAGE_SIZE;
int cpu, this_cpu = smp_processor_id();
- cpumask_t *dest_mask;
+ cpumask_t *dest_mask, *apic_mask;
unsigned long payload_nr;
/* Catch the "end - start + PAGE_SIZE" overflow above. */
@@ -213,7 +245,9 @@ void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
* flushes at context switch time.
*/
dest_mask = this_cpu_ptr(&rar_cpu_mask);
+ apic_mask = this_cpu_ptr(&apic_cpu_mask);
cpumask_and(dest_mask, mask, cpu_online_mask);
+ cpumask_clear(apic_mask);
/* Some callers race with other CPUs changing the passed mask */
if (unlikely(!cpumask_weight(dest_mask)))
@@ -225,11 +259,25 @@ void smp_call_rar_many(const struct cpumask *mask, u16 pcid,
for_each_cpu(cpu, dest_mask)
set_action_entry(payload_nr, cpu);
- /* Send a message to all CPUs in the map */
- native_send_rar_ipi(dest_mask);
+ do {
+ for_each_cpu(cpu, dest_mask) {
+ /* Track the CPUs that have no RAR pending (yet). */
+ if (get_rar_pending(cpu, this_cpu))
+ __cpumask_set_cpu(cpu, apic_mask);
+ }
- for_each_cpu(cpu, dest_mask)
- wait_for_action_done(payload_nr, cpu);
+ /* Send a message to the CPUs not processing RARs yet */
+ native_send_rar_ipi(apic_mask);
+
+ for_each_cpu(cpu, dest_mask) {
+ u8 status = wait_for_action_done(payload_nr, cpu);
+ if (status == RAR_SUCCESS) {
+ release_rar_pending(cpu, this_cpu);
+ __cpumask_clear_cpu(cpu, dest_mask);
+ __cpumask_clear_cpu(cpu, apic_mask);
+ }
+ }
+ } while (unlikely(cpumask_weight(dest_mask)));
free_payload_slot(payload_nr);
}
--
2.51.1
prev parent reply other threads:[~2025-11-21 18:55 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-11-21 18:54 [RFC v5 0/8] x86/mm: Intel RAR TLB invalidation support Rik van Riel
2025-11-21 18:54 ` [RFC v5 1/8] x86/mm: Introduce Remote Action Request MSRs Rik van Riel
2025-11-21 18:54 ` [RFC v5 2/8] x86/mm: enable BROADCAST_TLB_FLUSH on Intel, too Rik van Riel
2025-11-21 18:54 ` [RFC v5 3/8] x86/mm: Introduce X86_FEATURE_RAR Rik van Riel
2025-11-21 18:54 ` [RFC v5 4/8] x86/apic: Introduce Remote Action Request Operations Rik van Riel
2025-11-21 18:54 ` [RFC v5 5/8] x86/mm: Introduce Remote Action Request Rik van Riel
2025-11-21 18:54 ` [RFC v5 6/8] x86/mm: use RAR for kernel TLB flushes Rik van Riel
2025-11-21 18:54 ` [RFC v5 7/8] x86/mm: userspace & pageout flushing using Intel RAR Rik van Riel
2025-11-21 18:54 ` Rik van Riel [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251121185530.21876-9-riel@surriel.com \
--to=riel@surriel.com \
--cc=bp@alien8.de \
--cc=dave.hansen@linux.intel.com \
--cc=kernel-team@meta.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=peterz@infradead.org \
--cc=riel@meta.com \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox