linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Lance Yang <lance.yang@linux.dev>
To: Dave Hansen <dave.hansen@intel.com>,
	"David Hildenbrand (Red Hat)" <david@kernel.org>
Cc: will@kernel.org, aneesh.kumar@kernel.org, npiggin@gmail.com,
	peterz@infradead.org, tglx@linutronix.de, mingo@redhat.com,
	bp@alien8.de, dave.hansen@linux.intel.com, x86@kernel.org,
	hpa@zytor.com, arnd@arndb.de, lorenzo.stoakes@oracle.com,
	ziy@nvidia.com, baolin.wang@linux.alibaba.com,
	Liam.Howlett@oracle.com, npache@redhat.com, ryan.roberts@arm.com,
	dev.jain@arm.com, baohua@kernel.org, ioworker0@gmail.com,
	shy828301@gmail.com, riel@surriel.com, jannh@google.com,
	linux-arch@vger.kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org, akpm@linux-foundation.org
Subject: Re: [PATCH v2 0/3] skip redundant TLB sync IPIs
Date: Sat, 3 Jan 2026 16:39:06 +0800	[thread overview]
Message-ID: <fc3c20a9-69a2-41eb-9f22-8df262717348@linux.dev> (raw)
In-Reply-To: <cea71c01-68e7-4f7f-9931-017109d95ef0@intel.com>



On 2026/1/3 00:41, Dave Hansen wrote:
> On 12/31/25 04:33, David Hildenbrand (Red Hat) wrote:
>> On 12/31/25 05:26, Dave Hansen wrote:
>>> On 12/29/25 06:52, Lance Yang wrote:
>>> ...
>>>> This series introduces a way for architectures to indicate their TLB
>>>> flush
>>>> already provides full synchronization, allowing the redundant IPI to be
>>>> skipped. For now, the optimization is implemented for x86 first and
>>>> applied
>>>> to all page table operations that free or unshare tables.
>>>
>>> I really don't like all the complexity here. Even on x86, there are
>>> three or more ways of deriving this. Having the pv_ops check the value
>>> of another pv op is also a bit unsettling.
>>
>> Right. What I actually meant is that we simply have a property "bool
>> flush_tlb_multi_implies_ipi_broadcast" that we set only to true from the
>> initialization code.
>>
>> Without comparing the pv_ops.
>>
>> That should reduce the complexity quite a bit IMHO.
> 
> Yeah, that sounds promising.

Thanks a lot for taking the time to review!

Yeah, I simplified things to just a bool property set during init
(no pv_ops comparison at runtime) as follows:

---8<---
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 13f9cd31c8f8..a926d459e6f5 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -698,6 +698,7 @@ static __always_inline unsigned long arch_local_irq_save(void)

  extern void default_banner(void);
  void native_pv_lock_init(void) __init;
+void setup_pv_tlb_flush_ipi_broadcast(void) __init;

  #else  /* __ASSEMBLER__ */

@@ -727,6 +728,10 @@ void native_pv_lock_init(void) __init;
  static inline void native_pv_lock_init(void)
  {
  }
+
+static inline void setup_pv_tlb_flush_ipi_broadcast(void)
+{
+}
  #endif
  #endif /* !CONFIG_PARAVIRT */

diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 3502939415ad..7c010d8bee60 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -133,6 +133,12 @@ struct pv_mmu_ops {
  	void (*flush_tlb_multi)(const struct cpumask *cpus,
  				const struct flush_tlb_info *info);

+	/*
+	 * Indicates whether flush_tlb_multi IPIs provide sufficient
+	 * synchronization for GUP-fast when freeing or unsharing page tables.
+	 */
+	bool flush_tlb_multi_implies_ipi_broadcast;
+
  	/* Hook for intercepting the destruction of an mm_struct. */
  	void (*exit_mmap)(struct mm_struct *mm);
  	void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc);
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 866ea78ba156..f570c7b2d03e 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -5,10 +5,23 @@
  #define tlb_flush tlb_flush
  static inline void tlb_flush(struct mmu_gather *tlb);

+#define tlb_table_flush_implies_ipi_broadcast tlb_table_flush_implies_ipi_broadcast
+static inline bool tlb_table_flush_implies_ipi_broadcast(void);
+
  #include <asm-generic/tlb.h>
  #include <linux/kernel.h>
  #include <vdso/bits.h>
  #include <vdso/page.h>
+#include <asm/paravirt.h>
+
+static inline bool tlb_table_flush_implies_ipi_broadcast(void)
+{
+#ifdef CONFIG_PARAVIRT
+	return pv_ops.mmu.flush_tlb_multi_implies_ipi_broadcast;
+#else
+	return !cpu_feature_enabled(X86_FEATURE_INVLPGB);
+#endif
+}

  static inline void tlb_flush(struct mmu_gather *tlb)
  {
@@ -20,7 +33,8 @@ static inline void tlb_flush(struct mmu_gather *tlb)
  		end = tlb->end;
  	}

-	flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
+	flush_tlb_mm_range(tlb->mm, start, end, stride_shift,
+			   tlb->freed_tables || tlb->unshared_tables);
  }

  static inline void invlpg(unsigned long addr)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ab3e172dcc69..0a49c2d79693 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -60,6 +60,23 @@ void __init native_pv_lock_init(void)
  		static_branch_enable(&virt_spin_lock_key);
  }

+void __init setup_pv_tlb_flush_ipi_broadcast(void)
+{
+	/*
+	 * For native TLB flush, if we don't have INVLPGB, we use IPI-based
+	 * flushing which sends real IPIs to all CPUs. This provides sufficient
+	 * synchronization for GUP-fast.
+	 *
+	 * For paravirt (e.g., KVM, Xen, HyperV), hypercalls may not send real
+	 * IPIs, so we keep the default value of false. Only set to true when
+	 * using native flush_tlb_multi without INVLPGB.
+	 */
+	if (pv_ops.mmu.flush_tlb_multi == native_flush_tlb_multi &&
+	    !cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		pv_ops.mmu.flush_tlb_multi_implies_ipi_broadcast = true;
+}
+
+
  struct static_key paravirt_steal_enabled;
  struct static_key paravirt_steal_rq_enabled;

@@ -173,6 +190,7 @@ struct paravirt_patch_template pv_ops = {
  	.mmu.flush_tlb_kernel	= native_flush_tlb_global,
  	.mmu.flush_tlb_one_user	= native_flush_tlb_one_user,
  	.mmu.flush_tlb_multi	= native_flush_tlb_multi,
+	.mmu.flush_tlb_multi_implies_ipi_broadcast = false,

  	.mmu.exit_mmap		= paravirt_nop,
  	.mmu.notify_page_enc_status_changed	= paravirt_nop,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 74aa904be6dc..3f673e686b12 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -51,6 +51,7 @@
  #include <asm/olpc_ofw.h>
  #include <asm/pci-direct.h>
  #include <asm/prom.h>
+#include <asm/paravirt.h>
  #include <asm/proto.h>
  #include <asm/realmode.h>
  #include <asm/thermal.h>
@@ -1257,6 +1258,7 @@ void __init setup_arch(char **cmdline_p)
  	io_apic_init_mappings();

  	x86_init.hyper.guest_late_init();
+	setup_pv_tlb_flush_ipi_broadcast();

  	e820__reserve_resources();
  	e820__register_nosave_regions(max_pfn);
---

> 
>> But maybe you have an even better way on how to indicate support, in a
>> very simple way.
> 
> Rather than having some kind of explicit support enumeration, the other
> idea I had would be to actually track the state about what needs to get
> flushed somewhere. For instance, even CPUs with enabled INVLPGB support
> still use IPIs sometimes. That makes the
> tlb_table_flush_implies_ipi_broadcast() check a bit imperfect as is
> because it will force the extra sync IPI even when INVLPGB isn't being
> used for an mm.
> 
> First, we already save some semblance of support for doing different
> flushes when freeing page tables in mmu_gather->freed_tables. But, the call
> sites in question here are for a single flush and don't use mmu_gathers.
> 
> The other pretty straightforward thing to do would be to add something
> to mm->context that indicates that page tables need to be freed but
> there might still be wild gup walkers out there that need an IPI. It
> would get set when the page tables are modified and cleared at all the
> sites where IPIs are sent.

Thanks for the suggestion! The mm->context tracking idea makes a lot
of sense - it would handle those mixed INVLPGB/IPI cases much better :)

Maybe we could do that as a follow-up. I'd like to keep things simple
for now, so we just add a bool property to skip redundant TLB sync IPIs
on systems without INVLPGB support.

Then we could add the mm->context (or something similar) tracking later
to handle things more precisely.

Anyway, I'm open to going straight to the mm->context approach as well
and happy to do that instead :D

Thanks,
Lance

> 
> 
>>> That said, complexity can be worth it with sufficient demonstrated
>>> gains. But:
>>>
>>>> When unsharing hugetlb PMD page tables or collapsing pages in
>>>> khugepaged,
>>>> we send two IPIs: one for TLB invalidation, and another to synchronize
>>>> with concurrent GUP-fast walkers.
>>>
>>> Those aren't exactly hot paths. khugepaged is fundamentally rate
>>> limited. I don't think unsharing hugetlb PMD page tables is all
>>> that common either.
>>
>> Given that the added IPIs during unsharing broke Oracle DBs rather badly
>> [1], I think this is actually a case worth optimizing.
> ...
>> [1] https://lkml.kernel.org/r/20251223214037.580860-1-david@kernel.org
> 
> Gah, that's good context, thanks.
> 
> Are there any tests out there that might catch this case better?
> It might be something good to have 0day watch for.



  reply	other threads:[~2026-01-03  8:39 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-12-29 14:52 [PATCH v2 0/3] skip redundant TLB sync IPIs Lance Yang
2025-12-29 14:52 ` [PATCH v2 1/3] mm/tlb: allow architectures to " Lance Yang
2025-12-29 15:00   ` Lance Yang
2025-12-29 15:01     ` [PATCH v2 0/3] " Lance Yang
2025-12-30 20:31   ` [PATCH v2 1/3] mm/tlb: allow architectures to " David Hildenbrand (Red Hat)
2025-12-31  2:29     ` Lance Yang
2025-12-29 14:52 ` [PATCH v2 2/3] x86/mm: implement redundant IPI elimination for page table operations Lance Yang
2025-12-29 14:52 ` [PATCH v2 3/3] mm: embed TLB flush IPI check in tlb_remove_table_sync_one() Lance Yang
2025-12-30 20:33   ` David Hildenbrand (Red Hat)
2025-12-31  3:03     ` Lance Yang
2025-12-31  4:26 ` [PATCH v2 0/3] skip redundant TLB sync IPIs Dave Hansen
2025-12-31 12:33   ` David Hildenbrand (Red Hat)
2026-01-02 16:41     ` Dave Hansen
2026-01-03  8:39       ` Lance Yang [this message]
2026-01-03 17:06         ` Dave Hansen
2026-01-04  7:42           ` Lance Yang
2026-01-04 13:23             ` Lance Yang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=fc3c20a9-69a2-41eb-9f22-8df262717348@linux.dev \
    --to=lance.yang@linux.dev \
    --cc=Liam.Howlett@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=aneesh.kumar@kernel.org \
    --cc=arnd@arndb.de \
    --cc=baohua@kernel.org \
    --cc=baolin.wang@linux.alibaba.com \
    --cc=bp@alien8.de \
    --cc=dave.hansen@intel.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@kernel.org \
    --cc=dev.jain@arm.com \
    --cc=hpa@zytor.com \
    --cc=ioworker0@gmail.com \
    --cc=jannh@google.com \
    --cc=linux-arch@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lorenzo.stoakes@oracle.com \
    --cc=mingo@redhat.com \
    --cc=npache@redhat.com \
    --cc=npiggin@gmail.com \
    --cc=peterz@infradead.org \
    --cc=riel@surriel.com \
    --cc=ryan.roberts@arm.com \
    --cc=shy828301@gmail.com \
    --cc=tglx@linutronix.de \
    --cc=will@kernel.org \
    --cc=x86@kernel.org \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).