From: Lance Yang <lance.yang@linux.dev>
To: Dave Hansen <dave.hansen@intel.com>,
"David Hildenbrand (Red Hat)" <david@kernel.org>
Cc: will@kernel.org, aneesh.kumar@kernel.org, npiggin@gmail.com,
peterz@infradead.org, tglx@linutronix.de, mingo@redhat.com,
bp@alien8.de, dave.hansen@linux.intel.com, x86@kernel.org,
hpa@zytor.com, arnd@arndb.de, lorenzo.stoakes@oracle.com,
ziy@nvidia.com, baolin.wang@linux.alibaba.com,
Liam.Howlett@oracle.com, npache@redhat.com, ryan.roberts@arm.com,
dev.jain@arm.com, baohua@kernel.org, ioworker0@gmail.com,
shy828301@gmail.com, riel@surriel.com, jannh@google.com,
linux-arch@vger.kernel.org, linux-mm@kvack.org,
linux-kernel@vger.kernel.org, akpm@linux-foundation.org
Subject: Re: [PATCH v2 0/3] skip redundant TLB sync IPIs
Date: Sat, 3 Jan 2026 16:39:06 +0800 [thread overview]
Message-ID: <fc3c20a9-69a2-41eb-9f22-8df262717348@linux.dev> (raw)
In-Reply-To: <cea71c01-68e7-4f7f-9931-017109d95ef0@intel.com>
On 2026/1/3 00:41, Dave Hansen wrote:
> On 12/31/25 04:33, David Hildenbrand (Red Hat) wrote:
>> On 12/31/25 05:26, Dave Hansen wrote:
>>> On 12/29/25 06:52, Lance Yang wrote:
>>> ...
>>>> This series introduces a way for architectures to indicate their TLB
>>>> flush
>>>> already provides full synchronization, allowing the redundant IPI to be
>>>> skipped. For now, the optimization is implemented for x86 first and
>>>> applied
>>>> to all page table operations that free or unshare tables.
>>>
>>> I really don't like all the complexity here. Even on x86, there are
>>> three or more ways of deriving this. Having the pv_ops check the value
>>> of another pv op is also a bit unsettling.
>>
>> Right. What I actually meant is that we simply have a property "bool
>> flush_tlb_multi_implies_ipi_broadcast" that we set only to true from the
>> initialization code.
>>
>> Without comparing the pv_ops.
>>
>> That should reduce the complexity quite a bit IMHO.
>
> Yeah, that sounds promising.
Thanks a lot for taking the time to review!
Yeah, I simplified things to just a bool property set during init
(no pv_ops comparison at runtime) as follows:
---8<---
diff --git a/arch/x86/include/asm/paravirt.h
b/arch/x86/include/asm/paravirt.h
index 13f9cd31c8f8..a926d459e6f5 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -698,6 +698,7 @@ static __always_inline unsigned long
arch_local_irq_save(void)
extern void default_banner(void);
void native_pv_lock_init(void) __init;
+void setup_pv_tlb_flush_ipi_broadcast(void) __init;
#else /* __ASSEMBLER__ */
@@ -727,6 +728,10 @@ void native_pv_lock_init(void) __init;
static inline void native_pv_lock_init(void)
{
}
+
+static inline void setup_pv_tlb_flush_ipi_broadcast(void)
+{
+}
#endif
#endif /* !CONFIG_PARAVIRT */
diff --git a/arch/x86/include/asm/paravirt_types.h
b/arch/x86/include/asm/paravirt_types.h
index 3502939415ad..7c010d8bee60 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -133,6 +133,12 @@ struct pv_mmu_ops {
void (*flush_tlb_multi)(const struct cpumask *cpus,
const struct flush_tlb_info *info);
+ /*
+ * Indicates whether flush_tlb_multi IPIs provide sufficient
+ * synchronization for GUP-fast when freeing or unsharing page tables.
+ */
+ bool flush_tlb_multi_implies_ipi_broadcast;
+
/* Hook for intercepting the destruction of an mm_struct. */
void (*exit_mmap)(struct mm_struct *mm);
void (*notify_page_enc_status_changed)(unsigned long pfn, int npages,
bool enc);
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 866ea78ba156..f570c7b2d03e 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -5,10 +5,23 @@
#define tlb_flush tlb_flush
static inline void tlb_flush(struct mmu_gather *tlb);
+#define tlb_table_flush_implies_ipi_broadcast tlb_table_flush_implies_ipi_broadcast
+static inline bool tlb_table_flush_implies_ipi_broadcast(void);
+
#include <asm-generic/tlb.h>
#include <linux/kernel.h>
#include <vdso/bits.h>
#include <vdso/page.h>
+#include <asm/paravirt.h>
+
+static inline bool tlb_table_flush_implies_ipi_broadcast(void)
+{
+#ifdef CONFIG_PARAVIRT
+ return pv_ops.mmu.flush_tlb_multi_implies_ipi_broadcast;
+#else
+ return !cpu_feature_enabled(X86_FEATURE_INVLPGB);
+#endif
+}
static inline void tlb_flush(struct mmu_gather *tlb)
{
@@ -20,7 +33,8 @@ static inline void tlb_flush(struct mmu_gather *tlb)
end = tlb->end;
}
- flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
+ flush_tlb_mm_range(tlb->mm, start, end, stride_shift,
+ tlb->freed_tables || tlb->unshared_tables);
}
static inline void invlpg(unsigned long addr)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ab3e172dcc69..0a49c2d79693 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -60,6 +60,23 @@ void __init native_pv_lock_init(void)
static_branch_enable(&virt_spin_lock_key);
}
+void __init setup_pv_tlb_flush_ipi_broadcast(void)
+{
+ /*
+ * For native TLB flush, if we don't have INVLPGB, we use IPI-based
+ * flushing which sends real IPIs to all CPUs. This provides sufficient
+ * synchronization for GUP-fast.
+ *
+ * For paravirt (e.g., KVM, Xen, HyperV), hypercalls may not send real
+ * IPIs, so we keep the default value of false. Only set to true when
+ * using native flush_tlb_multi without INVLPGB.
+ */
+ if (pv_ops.mmu.flush_tlb_multi == native_flush_tlb_multi &&
+ !cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ pv_ops.mmu.flush_tlb_multi_implies_ipi_broadcast = true;
+}
+
+
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
@@ -173,6 +190,7 @@ struct paravirt_patch_template pv_ops = {
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_multi = native_flush_tlb_multi,
+ .mmu.flush_tlb_multi_implies_ipi_broadcast = false,
.mmu.exit_mmap = paravirt_nop,
.mmu.notify_page_enc_status_changed = paravirt_nop,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 74aa904be6dc..3f673e686b12 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -51,6 +51,7 @@
#include <asm/olpc_ofw.h>
#include <asm/pci-direct.h>
#include <asm/prom.h>
+#include <asm/paravirt.h>
#include <asm/proto.h>
#include <asm/realmode.h>
#include <asm/thermal.h>
@@ -1257,6 +1258,7 @@ void __init setup_arch(char **cmdline_p)
io_apic_init_mappings();
x86_init.hyper.guest_late_init();
+ setup_pv_tlb_flush_ipi_broadcast();
e820__reserve_resources();
e820__register_nosave_regions(max_pfn);
---
>
>> But maybe you have an even better way on how to indicate support, in a
>> very simple way.
>
> Rather than having some kind of explicit support enumeration, the other
> idea I had would be to actually track the state about what needs to get
> flushed somewhere. For instance, even CPUs with enabled INVLPGB support
> still use IPIs sometimes. That makes the
> tlb_table_flush_implies_ipi_broadcast() check a bit imperfect as is
> because it will force the extra sync IPI even when INVLPGB isn't being
> used for an mm.
>
> First, we already save some semblance of support for doing different
> flushes when freeing page tables mmu_gather->freed_tables. But, the call
> sites in question here are for a single flush and don't use mmu_gathers.
>
> The other pretty straightforward thing to do would be to add something
> to mm->context that indicates that page tables need to be freed but
> there might still be wild gup walkers out there that need an IPI. It
> would get set when the page tables are modified and cleared at all the
> sites where an IPIs are sent.
Thanks for the suggestion! The mm->context tracking idea makes a lot
of sense - it would handle those mixed INVLPGB/IPI cases much better :)
Maybe we could do that as a follow-up. I'd like to keep things simple
for now, so we just add a bool property to skip redundant TLB sync IPIs
on systems without INVLPGB support.
Then we could add the mm->context (or something similar) tracking later
to handle things more precisely.
Anyway, I'm open to going straight to the mm->context approach as well
and happy to do that instead :D
Thanks,
Lance
>
>
>>> That said, complexity can be worth it with sufficient demonstrated
>>> gains. But:
>>>
>>>> When unsharing hugetlb PMD page tables or collapsing pages in
>>>> khugepaged,
>>>> we send two IPIs: one for TLB invalidation, and another to synchronize
>>>> with concurrent GUP-fast walkers.
>>>
>>> Those aren't exactly hot paths. khugepaged is fundamentally rate
>>> limited. I don't think unsharing hugetlb PMD page tables just is all
>>> that common either.
>>
>> Given that the added IPIs during unsharing broke Oracle DBs rather badly
>> [1], I think this is actually a case worth optimizing.
> ...
>> [1] https://lkml.kernel.org/r/20251223214037.580860-1-david@kernel.org
>
> Gah, that's good context, thanks.
>
> Are there any tests out there that might catch cases like this better?
> It might be something good to have 0day watch for.
next prev parent reply other threads:[~2026-01-03 8:39 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-12-29 14:52 [PATCH v2 0/3] skip redundant TLB sync IPIs Lance Yang
2025-12-29 14:52 ` [PATCH v2 1/3] mm/tlb: allow architectures to " Lance Yang
2025-12-29 15:00 ` Lance Yang
2025-12-29 15:01 ` [PATCH v2 0/3] " Lance Yang
2025-12-30 20:31 ` [PATCH v2 1/3] mm/tlb: allow architectures to " David Hildenbrand (Red Hat)
2025-12-31 2:29 ` Lance Yang
2025-12-29 14:52 ` [PATCH v2 2/3] x86/mm: implement redundant IPI elimination for page table operations Lance Yang
2025-12-29 14:52 ` [PATCH v2 3/3] mm: embed TLB flush IPI check in tlb_remove_table_sync_one() Lance Yang
2025-12-30 20:33 ` David Hildenbrand (Red Hat)
2025-12-31 3:03 ` Lance Yang
2025-12-31 4:26 ` [PATCH v2 0/3] skip redundant TLB sync IPIs Dave Hansen
2025-12-31 12:33 ` David Hildenbrand (Red Hat)
2026-01-02 16:41 ` Dave Hansen
2026-01-03 8:39 ` Lance Yang [this message]
2026-01-03 17:06 ` Dave Hansen
2026-01-04 7:42 ` Lance Yang
2026-01-04 13:23 ` Lance Yang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=fc3c20a9-69a2-41eb-9f22-8df262717348@linux.dev \
--to=lance.yang@linux.dev \
--cc=Liam.Howlett@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=aneesh.kumar@kernel.org \
--cc=arnd@arndb.de \
--cc=baohua@kernel.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=bp@alien8.de \
--cc=dave.hansen@intel.com \
--cc=dave.hansen@linux.intel.com \
--cc=david@kernel.org \
--cc=dev.jain@arm.com \
--cc=hpa@zytor.com \
--cc=ioworker0@gmail.com \
--cc=jannh@google.com \
--cc=linux-arch@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=mingo@redhat.com \
--cc=npache@redhat.com \
--cc=npiggin@gmail.com \
--cc=peterz@infradead.org \
--cc=riel@surriel.com \
--cc=ryan.roberts@arm.com \
--cc=shy828301@gmail.com \
--cc=tglx@linutronix.de \
--cc=will@kernel.org \
--cc=x86@kernel.org \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).