From: Lance Yang <lance.yang@linux.dev>
To: akpm@linux-foundation.org
Cc: david@kernel.org, dave.hansen@intel.com,
dave.hansen@linux.intel.com, ypodemsk@redhat.com,
hughd@google.com, will@kernel.org, aneesh.kumar@kernel.org,
npiggin@gmail.com, peterz@infradead.org, tglx@linutronix.de,
mingo@redhat.com, bp@alien8.de, x86@kernel.org, hpa@zytor.com,
arnd@arndb.de, lorenzo.stoakes@oracle.com, ziy@nvidia.com,
baolin.wang@linux.alibaba.com, Liam.Howlett@oracle.com,
npache@redhat.com, ryan.roberts@arm.com, dev.jain@arm.com,
baohua@kernel.org, shy828301@gmail.com, riel@surriel.com,
jannh@google.com, jgross@suse.com, seanjc@google.com,
pbonzini@redhat.com, boris.ostrovsky@oracle.com,
virtualization@lists.linux.dev, kvm@vger.kernel.org,
linux-arch@vger.kernel.org, linux-mm@kvack.org,
linux-kernel@vger.kernel.org, ioworker0@gmail.com,
Lance Yang <lance.yang@linux.dev>
Subject: [PATCH v4 1/3] mm: use targeted IPIs for TLB sync with lockless page table walkers
Date: Mon, 2 Feb 2026 15:45:55 +0800 [thread overview]
Message-ID: <20260202074557.16544-2-lance.yang@linux.dev> (raw)
In-Reply-To: <20260202074557.16544-1-lance.yang@linux.dev>
From: Lance Yang <lance.yang@linux.dev>
Currently, tlb_remove_table_sync_one() broadcasts IPIs to all CPUs to wait
for any concurrent lockless page table walkers (e.g., GUP-fast). This is
inefficient on systems with many CPUs, especially for RT workloads[1].
This patch introduces a per-CPU tracking mechanism to record which CPUs are
actively performing lockless page table walks for a specific mm_struct.
When freeing/unsharing page tables, we can now send IPIs only to the CPUs
that are actually walking that mm, instead of broadcasting to all CPUs.
In preparation for targeted IPIs; a follow-up will switch callers to
tlb_remove_table_sync_mm().
Note that the tracking adds ~3% latency to GUP-fast, as measured on a
64-core system.
[1] https://lore.kernel.org/linux-mm/1b27a3fa-359a-43d0-bdeb-c31341749367@kernel.org/
Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Lance Yang <lance.yang@linux.dev>
---
include/asm-generic/tlb.h | 2 ++
include/linux/mm.h | 34 ++++++++++++++++++++++++++
kernel/events/core.c | 2 ++
mm/gup.c | 2 ++
mm/mmu_gather.c | 50 +++++++++++++++++++++++++++++++++++++++
5 files changed, 90 insertions(+)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 4aeac0c3d3f0..b6b06e6b879f 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -250,6 +250,7 @@ static inline void tlb_remove_table(struct mmu_gather *tlb, void *table)
#endif
void tlb_remove_table_sync_one(void);
+void tlb_remove_table_sync_mm(struct mm_struct *mm);
#else
@@ -258,6 +259,7 @@ void tlb_remove_table_sync_one(void);
#endif
static inline void tlb_remove_table_sync_one(void) { }
+static inline void tlb_remove_table_sync_mm(struct mm_struct *mm) { }
#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f8a8fd47399c..d92df995fcd1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2995,6 +2995,40 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
pgoff_t *offset);
int folio_add_pins(struct folio *folio, unsigned int pins);
+/*
+ * Track CPUs doing lockless page table walks to avoid broadcast IPIs
+ * during TLB flushes.
+ */
+DECLARE_PER_CPU(struct mm_struct *, active_lockless_pt_walk_mm);
+
+static inline void pt_walk_lockless_start(struct mm_struct *mm)
+{
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * Tell other CPUs we're doing lockless page table walk.
+ *
+ * Full barrier needed to prevent page table reads from being
+ * reordered before this write.
+ *
+ * Pairs with smp_rmb() in tlb_remove_table_sync_mm().
+ */
+ this_cpu_write(active_lockless_pt_walk_mm, mm);
+ smp_mb();
+}
+
+static inline void pt_walk_lockless_end(void)
+{
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * Clear the pointer so other CPUs no longer see this CPU as walking
+ * the mm. Use smp_store_release to ensure page table reads complete
+ * before the clear is visible to other CPUs.
+ */
+ smp_store_release(this_cpu_ptr(&active_lockless_pt_walk_mm), NULL);
+}
+
int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages);
int pin_user_pages_fast(unsigned long start, int nr_pages,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5b5cb620499e..6539112c28ff 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8190,7 +8190,9 @@ static u64 perf_get_page_size(unsigned long addr)
mm = &init_mm;
}
+ pt_walk_lockless_start(mm);
size = perf_get_pgtable_size(mm, addr);
+ pt_walk_lockless_end();
local_irq_restore(flags);
diff --git a/mm/gup.c b/mm/gup.c
index 8e7dc2c6ee73..6748e28b27f2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -3154,7 +3154,9 @@ static unsigned long gup_fast(unsigned long start, unsigned long end,
* that come from callers of tlb_remove_table_sync_one().
*/
local_irq_save(flags);
+ pt_walk_lockless_start(current->mm);
gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);
+ pt_walk_lockless_end();
local_irq_restore(flags);
/*
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 2faa23d7f8d4..35c89e4b6230 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -285,6 +285,56 @@ void tlb_remove_table_sync_one(void)
smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}
+DEFINE_PER_CPU(struct mm_struct *, active_lockless_pt_walk_mm);
+EXPORT_PER_CPU_SYMBOL_GPL(active_lockless_pt_walk_mm);
+
+/**
+ * tlb_remove_table_sync_mm - send IPIs to CPUs doing lockless page table
+ * walk for @mm
+ *
+ * @mm: target mm; only CPUs walking this mm get an IPI.
+ *
+ * Like tlb_remove_table_sync_one() but only targets CPUs in
+ * active_lockless_pt_walk_mm.
+ */
+void tlb_remove_table_sync_mm(struct mm_struct *mm)
+{
+ cpumask_var_t target_cpus;
+ bool found_any = false;
+ int cpu;
+
+ if (WARN_ONCE(!mm, "NULL mm in %s\n", __func__)) {
+ tlb_remove_table_sync_one();
+ return;
+ }
+
+ /* If we can't, fall back to broadcast. */
+ if (!alloc_cpumask_var(&target_cpus, GFP_ATOMIC)) {
+ tlb_remove_table_sync_one();
+ return;
+ }
+
+ cpumask_clear(target_cpus);
+
+ /* Pairs with smp_mb() in pt_walk_lockless_start(). */
+ smp_rmb();
+
+ /* Find CPUs doing lockless page table walks for this mm */
+ for_each_online_cpu(cpu) {
+ if (per_cpu(active_lockless_pt_walk_mm, cpu) == mm) {
+ cpumask_set_cpu(cpu, target_cpus);
+ found_any = true;
+ }
+ }
+
+ /* Only send IPIs to CPUs actually doing lockless walks */
+ if (found_any)
+ smp_call_function_many(target_cpus, tlb_remove_table_smp_sync,
+ NULL, 1);
+
+ free_cpumask_var(target_cpus);
+}
+
static void tlb_remove_table_rcu(struct rcu_head *head)
{
__tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
--
2.49.0
next prev parent reply other threads:[~2026-02-02 7:46 UTC|newest]
Thread overview: 41+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-02 7:45 [PATCH v4 0/3] targeted TLB sync IPIs for lockless page table walkers Lance Yang
2026-02-02 7:45 ` Lance Yang [this message]
2026-02-02 9:42 ` [PATCH v4 1/3] mm: use targeted IPIs for TLB sync with " Peter Zijlstra
2026-02-02 12:14 ` Lance Yang
2026-02-02 12:51 ` Peter Zijlstra
2026-02-02 13:23 ` Lance Yang
2026-02-02 13:42 ` Peter Zijlstra
2026-02-02 14:28 ` Lance Yang
2026-02-02 16:20 ` Dave Hansen
2026-02-02 11:37 ` kernel test robot
2026-02-03 23:49 ` kernel test robot
2026-02-02 7:45 ` [PATCH v4 2/3] mm: switch callers to tlb_remove_table_sync_mm() Lance Yang
2026-02-02 7:45 ` [PATCH v4 3/3] x86/tlb: add architecture-specific TLB IPI optimization support Lance Yang
2026-02-25 20:11 ` Sean Christopherson
2026-02-26 11:37 ` Lance Yang
2026-02-26 18:24 ` Sean Christopherson
2026-03-01 6:56 ` Lance Yang
2026-02-02 9:54 ` [PATCH v4 0/3] targeted TLB sync IPIs for lockless page table walkers Peter Zijlstra
2026-02-02 11:00 ` [PATCH v4 0/3] targeted TLB sync IPIs for lockless page table Lance Yang
2026-02-02 12:50 ` Peter Zijlstra
2026-02-02 12:58 ` Lance Yang
2026-02-02 13:07 ` Lance Yang
2026-02-02 13:37 ` Peter Zijlstra
2026-02-02 14:37 ` Lance Yang
2026-02-02 15:09 ` Peter Zijlstra
2026-02-02 15:52 ` Lance Yang
2026-02-05 13:25 ` David Hildenbrand (Arm)
2026-02-05 15:01 ` Lance Yang
2026-02-05 15:05 ` David Hildenbrand (Arm)
2026-02-05 15:28 ` Lance Yang
2026-02-05 15:09 ` Dave Hansen
2026-02-05 15:31 ` Lance Yang
2026-02-05 15:41 ` Dave Hansen
2026-02-05 16:30 ` Lance Yang
2026-02-05 16:46 ` David Hildenbrand (Arm)
2026-02-05 16:48 ` Matthew Wilcox
2026-02-05 17:06 ` David Hildenbrand (Arm)
2026-02-05 18:36 ` Dave Hansen
2026-02-05 22:49 ` David Hildenbrand (Arm)
2026-02-05 21:30 ` David Hildenbrand (Arm)
2026-02-05 17:00 ` Dave Hansen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260202074557.16544-2-lance.yang@linux.dev \
--to=lance.yang@linux.dev \
--cc=Liam.Howlett@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=aneesh.kumar@kernel.org \
--cc=arnd@arndb.de \
--cc=baohua@kernel.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=boris.ostrovsky@oracle.com \
--cc=bp@alien8.de \
--cc=dave.hansen@intel.com \
--cc=dave.hansen@linux.intel.com \
--cc=david@kernel.org \
--cc=dev.jain@arm.com \
--cc=hpa@zytor.com \
--cc=hughd@google.com \
--cc=ioworker0@gmail.com \
--cc=jannh@google.com \
--cc=jgross@suse.com \
--cc=kvm@vger.kernel.org \
--cc=linux-arch@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=mingo@redhat.com \
--cc=npache@redhat.com \
--cc=npiggin@gmail.com \
--cc=pbonzini@redhat.com \
--cc=peterz@infradead.org \
--cc=riel@surriel.com \
--cc=ryan.roberts@arm.com \
--cc=seanjc@google.com \
--cc=shy828301@gmail.com \
--cc=tglx@linutronix.de \
--cc=virtualization@lists.linux.dev \
--cc=will@kernel.org \
--cc=x86@kernel.org \
--cc=ypodemsk@redhat.com \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.