From: Valentin Schneider <vschneid@redhat.com>
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org
Cc: Thomas Gleixner <tglx@linutronix.de>,
Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
Dave Hansen <dave.hansen@linux.intel.com>,
"H. Peter Anvin" <hpa@zytor.com>,
Andy Lutomirski <luto@kernel.org>,
Peter Zijlstra <peterz@infradead.org>,
Arnaldo Carvalho de Melo <acme@kernel.org>,
Josh Poimboeuf <jpoimboe@kernel.org>,
Paolo Bonzini <pbonzini@redhat.com>,
Arnd Bergmann <arnd@arndb.de>,
Frederic Weisbecker <frederic@kernel.org>,
"Paul E. McKenney" <paulmck@kernel.org>,
Jason Baron <jbaron@akamai.com>,
Steven Rostedt <rostedt@goodmis.org>,
Ard Biesheuvel <ardb@kernel.org>,
Sami Tolvanen <samitolvanen@google.com>,
"David S. Miller" <davem@davemloft.net>,
Neeraj Upadhyay <neeraj.upadhyay@kernel.org>,
Joel Fernandes <joelagnelf@nvidia.com>,
Josh Triplett <josh@joshtriplett.org>,
Boqun Feng <boqun.feng@gmail.com>,
Uladzislau Rezki <urezki@gmail.com>,
Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
Mel Gorman <mgorman@suse.de>,
Andrew Morton <akpm@linux-foundation.org>,
Masahiro Yamada <masahiroy@kernel.org>,
Han Shen <shenhan@google.com>, Rik van Riel <riel@surriel.com>,
Jann Horn <jannh@google.com>,
Dan Carpenter <dan.carpenter@linaro.org>,
Oleg Nesterov <oleg@redhat.com>,
Juri Lelli <juri.lelli@redhat.com>,
Clark Williams <williams@redhat.com>,
Tomas Glozar <tglozar@redhat.com>,
Yair Podemsky <ypodemsk@redhat.com>,
Marcelo Tosatti <mtosatti@redhat.com>,
Daniel Wagner <dwagner@suse.de>, Petr Tesarik <ptesarik@suse.com>,
Shrikanth Hegde <sshegde@linux.ibm.com>
Subject: [PATCH v9 10/10] x86/mm, mm/vmalloc: Defer kernel TLB flush IPIs when tracking CR3 switches
Date: Tue, 5 May 2026 10:23:55 +0200 [thread overview]
Message-ID: <20260505082355.1982003-11-vschneid@redhat.com> (raw)
In-Reply-To: <20260505082355.1982003-1-vschneid@redhat.com>
Previous commits have added a software signal that tracks which CR3 (kernel
or user) is in use for any given CPU.
Combined with:
o the CR3 switch itself being a flush for non-global mappings
o global mappings under kPTI being limited to the CPU entry area (CEA) and entry text
we now have a way to safely defer (kernel) TLB flush IPIs targeting
NOHZ_FULL CPUs executing in userspace (i.e. with the user CR3 loaded).
When sending a kernel TLB flush IPI to a NOHZ_FULL CPU, check whether it is
using the user CR3, and if it is, do not interrupt it and instead rely on
the CR3 write that happens when switching to the kernel CR3.
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
arch/x86/include/asm/tlbflush.h | 1 +
arch/x86/mm/tlb.c | 49 ++++++++++++++++++++++++++++-----
mm/vmalloc.c | 30 ++++++++++++++++----
3 files changed, 68 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 0ec669eb0b4e7..824304c08cd95 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -22,6 +22,7 @@ DECLARE_PER_CPU_PAGE_ALIGNED(bool, kernel_cr3_loaded);
#endif
void __flush_tlb_all(void);
+void flush_tlb_kernel_range_deferrable(unsigned long start, unsigned long end);
#define TLB_FLUSH_ALL -1UL
#define TLB_GENERATION_INVALID 0
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index af43d177087e7..68bcccace0659 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -13,6 +13,7 @@
#include <linux/mmu_notifier.h>
#include <linux/mmu_context.h>
#include <linux/kvm_types.h>
+#include <linux/sched/isolation.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -1509,23 +1510,24 @@ static void do_kernel_range_flush(void *info)
flush_tlb_one_kernel(addr);
}
-static void kernel_tlb_flush_all(struct flush_tlb_info *info)
+static void kernel_tlb_flush_all(smp_cond_func_t cond, struct flush_tlb_info *info)
{
if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
invlpgb_flush_all();
else
- on_each_cpu(do_flush_tlb_all, NULL, 1);
+ on_each_cpu_cond(cond, do_flush_tlb_all, NULL, 1);
}
-static void kernel_tlb_flush_range(struct flush_tlb_info *info)
+static void kernel_tlb_flush_range(smp_cond_func_t cond, struct flush_tlb_info *info)
{
if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
invlpgb_kernel_range_flush(info);
else
- on_each_cpu(do_kernel_range_flush, info, 1);
+ on_each_cpu_cond(cond, do_kernel_range_flush, info, 1);
}
-void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+static inline void
+__flush_tlb_kernel_range(smp_cond_func_t cond, unsigned long start, unsigned long end)
{
struct flush_tlb_info *info;
@@ -1535,13 +1537,46 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
TLB_GENERATION_INVALID);
if (info->end == TLB_FLUSH_ALL)
- kernel_tlb_flush_all(info);
+ kernel_tlb_flush_all(cond, info);
else
- kernel_tlb_flush_range(info);
+ kernel_tlb_flush_range(cond, info);
put_flush_tlb_info();
}
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+ __flush_tlb_kernel_range(NULL, start, end);
+}
+
+#ifdef CONFIG_TRACK_CR3
+static bool flush_tlb_kernel_cond(int cpu, void *info)
+{
+ /*
+ * Send the IPI if the target CPU is a housekeeping one, or if it is
+ * already executing in kernelspace.
+ */
+ bool ret = housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE);
+
+ /*
+ * Pairs with the LOCK in NOTE_KERNEL_CR3
+ *
+ * Ensures any previous operations are visible to a remote CPU
+ * entering the kernel and setting @kernel_cr3_loaded, if this CPU
+ * decides to defer the IPI.
+ */
+ smp_mb();
+ ret |= per_cpu(kernel_cr3_loaded, cpu);
+
+ return ret;
+}
+
+void flush_tlb_kernel_range_deferrable(unsigned long start, unsigned long end)
+{
+ __flush_tlb_kernel_range(flush_tlb_kernel_cond, start, end);
+}
+#endif
+
/*
* This can be used from process context to figure out what the value of
* CR3 is without needing to do a (slow) __read_cr3().
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index aa08651ec0df6..6276c8cb2be0d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -506,6 +506,26 @@ void vunmap_range_noflush(unsigned long start, unsigned long end)
__vunmap_range_noflush(start, end);
}
+/*
+ * !!! BIG FAT WARNING !!!
+ *
+ * The CPU is free to cache any part of the paging hierarchy it wants at any
+ * time. It's also free to set accessed and dirty bits at any time, even for
+ * instructions that may never execute architecturally.
+ *
+ * This means that deferring a TLB flush affecting freed page-table-pages (IOW,
+ * keeping them in a CPU's paging hierarchy cache) is a recipe for disaster.
+ *
+ * This isn't a problem for deferral of TLB flushes in vmalloc, because
+ * page-table-pages used for vmap() mappings are never freed - see how
+ * __vunmap_range_noflush() walks the whole mapping but only clears the leaf PTEs.
+ * If this ever changes, TLB flush deferral will cause misery.
+ */
+void __weak flush_tlb_kernel_range_deferrable(unsigned long start, unsigned long end)
+{
+ flush_tlb_kernel_range(start, end);
+}
+
/**
* vunmap_range - unmap kernel virtual addresses
* @addr: start of the VM area to unmap
@@ -519,7 +539,7 @@ void vunmap_range(unsigned long addr, unsigned long end)
{
flush_cache_vunmap(addr, end);
vunmap_range_noflush(addr, end);
- flush_tlb_kernel_range(addr, end);
+ flush_tlb_kernel_range_deferrable(addr, end);
}
static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
@@ -2373,7 +2393,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
nr_purge_nodes = cpumask_weight(&purge_nodes);
if (nr_purge_nodes > 0) {
- flush_tlb_kernel_range(start, end);
+ flush_tlb_kernel_range_deferrable(start, end);
/* One extra worker is per a lazy_max_pages() full set minus one. */
nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages();
@@ -2476,7 +2496,7 @@ static void free_unmap_vmap_area(struct vmap_area *va)
flush_cache_vunmap(va->va_start, va->va_end);
vunmap_range_noflush(va->va_start, va->va_end);
if (debug_pagealloc_enabled_static())
- flush_tlb_kernel_range(va->va_start, va->va_end);
+ flush_tlb_kernel_range_deferrable(va->va_start, va->va_end);
free_vmap_area_noflush(va);
}
@@ -2923,7 +2943,7 @@ static void vb_free(unsigned long addr, unsigned long size)
vunmap_range_noflush(addr, addr + size);
if (debug_pagealloc_enabled_static())
- flush_tlb_kernel_range(addr, addr + size);
+ flush_tlb_kernel_range_deferrable(addr, addr + size);
spin_lock(&vb->lock);
@@ -2988,7 +3008,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
free_purged_blocks(&purge_list);
if (!__purge_vmap_area_lazy(start, end, false) && flush)
- flush_tlb_kernel_range(start, end);
+ flush_tlb_kernel_range_deferrable(start, end);
mutex_unlock(&vmap_purge_lock);
}
--
2.52.0
prev parent reply other threads:[~2026-05-05 8:27 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-05 8:23 [PATCH v9 00/10] x86: Defer some IPIs until a user->kernel transition Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 01/10] objtool: Make validate_call() recognize indirect calls to pv_ops[] Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 02/10] objtool: Flesh out warning related to pv_ops[] calls Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 03/10] objtool: Always pass a section to validate_unwind_hints() Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 04/10] x86/retpoline: Make warn_thunk_thunk .noinstr Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 05/10] jump_label: Add annotations for validating .entry.text key usage Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 06/10] objtool: Add .entry.text validation for static branches Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 07/10] x86/jump_label: Add ASM support for static_branch_likely() Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 08/10] x86/mm/pti: Introduce a kernel/user CR3 software signal Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 09/10] context_tracking,x86: Defer kernel text patching IPIs when tracking CR3 switches Valentin Schneider
2026-05-05 8:23 ` Valentin Schneider [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260505082355.1982003-11-vschneid@redhat.com \
--to=vschneid@redhat.com \
--cc=acme@kernel.org \
--cc=akpm@linux-foundation.org \
--cc=ardb@kernel.org \
--cc=arnd@arndb.de \
--cc=boqun.feng@gmail.com \
--cc=bp@alien8.de \
--cc=dan.carpenter@linaro.org \
--cc=dave.hansen@linux.intel.com \
--cc=davem@davemloft.net \
--cc=dwagner@suse.de \
--cc=frederic@kernel.org \
--cc=hpa@zytor.com \
--cc=jannh@google.com \
--cc=jbaron@akamai.com \
--cc=joelagnelf@nvidia.com \
--cc=josh@joshtriplett.org \
--cc=jpoimboe@kernel.org \
--cc=juri.lelli@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=luto@kernel.org \
--cc=masahiroy@kernel.org \
--cc=mathieu.desnoyers@efficios.com \
--cc=mgorman@suse.de \
--cc=mingo@redhat.com \
--cc=mtosatti@redhat.com \
--cc=neeraj.upadhyay@kernel.org \
--cc=oleg@redhat.com \
--cc=paulmck@kernel.org \
--cc=pbonzini@redhat.com \
--cc=peterz@infradead.org \
--cc=ptesarik@suse.com \
--cc=riel@surriel.com \
--cc=rostedt@goodmis.org \
--cc=samitolvanen@google.com \
--cc=shenhan@google.com \
--cc=sshegde@linux.ibm.com \
--cc=tglozar@redhat.com \
--cc=tglx@linutronix.de \
--cc=urezki@gmail.com \
--cc=williams@redhat.com \
--cc=x86@kernel.org \
--cc=ypodemsk@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox