* [PATCH v9 01/10] objtool: Make validate_call() recognize indirect calls to pv_ops[]
2026-05-05 8:23 [PATCH v9 00/10] x86: Defer some IPIs until a user->kernel transition Valentin Schneider
@ 2026-05-05 8:23 ` Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 02/10] objtool: Flesh out warning related to pv_ops[] calls Valentin Schneider
` (8 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Valentin Schneider @ 2026-05-05 8:23 UTC (permalink / raw)
To: linux-kernel, linux-mm, x86
Cc: Josh Poimboeuf, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, H. Peter Anvin, Andy Lutomirski, Peter Zijlstra,
Arnaldo Carvalho de Melo, Paolo Bonzini, Arnd Bergmann,
Frederic Weisbecker, Paul E. McKenney, Jason Baron,
Steven Rostedt, Ard Biesheuvel, Sami Tolvanen, David S. Miller,
Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
Uladzislau Rezki, Mathieu Desnoyers, Mel Gorman, Andrew Morton,
Masahiro Yamada, Han Shen, Rik van Riel, Jann Horn, Dan Carpenter,
Oleg Nesterov, Juri Lelli, Clark Williams, Tomas Glozar,
Yair Podemsky, Marcelo Tosatti, Daniel Wagner, Petr Tesarik,
Shrikanth Hegde
call_dest_name() does not get passed the file pointer of validate_call(),
which means its invocation of insn_reloc() will always return NULL. Make it
take a file pointer.
While at it, make sure call_dest_name() uses arch_insn_adjusted_addend(),
otherwise it gets the pv_ops[] offset wrong.
Fabricating an intentional warning shows the change; previously:
vmlinux.o: warning: objtool: __flush_tlb_all_noinstr+0x4: call to {dynamic}() leaves .noinstr.text section
now:
vmlinux.o: warning: objtool: __flush_tlb_all_noinstr+0x4: call to pv_ops[1]() leaves .noinstr.text section
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
tools/objtool/check.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 9b11cf3193b9c..dcc386e17c70a 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -3442,7 +3442,7 @@ static inline bool func_uaccess_safe(struct symbol *func)
return false;
}
-static inline const char *call_dest_name(struct instruction *insn)
+static inline const char *call_dest_name(struct objtool_file *file, struct instruction *insn)
{
static char pvname[19];
struct reloc *reloc;
@@ -3451,9 +3451,9 @@ static inline const char *call_dest_name(struct instruction *insn)
if (insn_call_dest(insn))
return insn_call_dest(insn)->name;
- reloc = insn_reloc(NULL, insn);
+ reloc = insn_reloc(file, insn);
if (reloc && !strcmp(reloc->sym->name, "pv_ops")) {
- idx = (reloc_addend(reloc) / sizeof(void *));
+ idx = arch_insn_adjusted_addend(insn, reloc) / sizeof(void *);
snprintf(pvname, sizeof(pvname), "pv_ops[%d]", idx);
return pvname;
}
@@ -3532,17 +3532,19 @@ static int validate_call(struct objtool_file *file,
{
if (state->noinstr && state->instr <= 0 &&
!noinstr_call_dest(file, insn, insn_call_dest(insn))) {
- WARN_INSN(insn, "call to %s() leaves .noinstr.text section", call_dest_name(insn));
+ WARN_INSN(insn, "call to %s() leaves .noinstr.text section", call_dest_name(file, insn));
return 1;
}
if (state->uaccess && !func_uaccess_safe(insn_call_dest(insn))) {
- WARN_INSN(insn, "call to %s() with UACCESS enabled", call_dest_name(insn));
+ WARN_INSN(insn, "call to %s() with UACCESS enabled",
+ call_dest_name(file, insn));
return 1;
}
if (state->df) {
- WARN_INSN(insn, "call to %s() with DF set", call_dest_name(insn));
+ WARN_INSN(insn, "call to %s() with DF set",
+ call_dest_name(file, insn));
return 1;
}
--
2.52.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH v9 02/10] objtool: Flesh out warning related to pv_ops[] calls
2026-05-05 8:23 [PATCH v9 00/10] x86: Defer some IPIs until a user->kernel transition Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 01/10] objtool: Make validate_call() recognize indirect calls to pv_ops[] Valentin Schneider
@ 2026-05-05 8:23 ` Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 03/10] objtool: Always pass a section to validate_unwind_hints() Valentin Schneider
` (7 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Valentin Schneider @ 2026-05-05 8:23 UTC (permalink / raw)
To: linux-kernel, linux-mm, x86
Cc: Josh Poimboeuf, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, H. Peter Anvin, Andy Lutomirski, Peter Zijlstra,
Arnaldo Carvalho de Melo, Paolo Bonzini, Arnd Bergmann,
Frederic Weisbecker, Paul E. McKenney, Jason Baron,
Steven Rostedt, Ard Biesheuvel, Sami Tolvanen, David S. Miller,
Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
Uladzislau Rezki, Mathieu Desnoyers, Mel Gorman, Andrew Morton,
Masahiro Yamada, Han Shen, Rik van Riel, Jann Horn, Dan Carpenter,
Oleg Nesterov, Juri Lelli, Clark Williams, Tomas Glozar,
Yair Podemsky, Marcelo Tosatti, Daniel Wagner, Petr Tesarik,
Shrikanth Hegde
I had to look into objtool itself to understand what this warning was
about; make it more explicit.
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
tools/objtool/check.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index dcc386e17c70a..ba943bbbc51db 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -3480,7 +3480,7 @@ static bool pv_call_dest(struct objtool_file *file, struct instruction *insn)
list_for_each_entry(target, &file->pv_ops[idx].targets, pv_target) {
if (!target->sec->noinstr) {
- WARN("pv_ops[%d]: %s", idx, target->name);
+ WARN("pv_ops[%d]: indirect call to %s() leaves .noinstr.text section", idx, target->name);
file->pv_ops[idx].clean = false;
}
}
--
2.52.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH v9 03/10] objtool: Always pass a section to validate_unwind_hints()
2026-05-05 8:23 [PATCH v9 00/10] x86: Defer some IPIs until a user->kernel transition Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 01/10] objtool: Make validate_call() recognize indirect calls to pv_ops[] Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 02/10] objtool: Flesh out warning related to pv_ops[] calls Valentin Schneider
@ 2026-05-05 8:23 ` Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 04/10] x86/retpoline: Make warn_thunk_thunk .noinstr Valentin Schneider
` (6 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Valentin Schneider @ 2026-05-05 8:23 UTC (permalink / raw)
To: linux-kernel, linux-mm, x86
Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Andy Lutomirski, Peter Zijlstra,
Arnaldo Carvalho de Melo, Josh Poimboeuf, Paolo Bonzini,
Arnd Bergmann, Frederic Weisbecker, Paul E. McKenney, Jason Baron,
Steven Rostedt, Ard Biesheuvel, Sami Tolvanen, David S. Miller,
Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
Uladzislau Rezki, Mathieu Desnoyers, Mel Gorman, Andrew Morton,
Masahiro Yamada, Han Shen, Rik van Riel, Jann Horn, Dan Carpenter,
Oleg Nesterov, Juri Lelli, Clark Williams, Tomas Glozar,
Yair Podemsky, Marcelo Tosatti, Daniel Wagner, Petr Tesarik,
Shrikanth Hegde
When passing a NULL @sec to validate_unwind_hints(), it is unable to
properly initialize the insn_state->noinstr passed down during
validation. This means we lose noinstr validation of the hints.
That validation currently happens when 'opts.noinstr' is true but
'validate_branch_enabled()' isn't.
In other words, this will run noinstr validation of hints:
$ objtool --noinstr --link [...]
but this won't:
$ objtool --noinstr --link --uaccess [...]
Always pass a valid section to validate_unwind_hints(), so that noinstr
validation of hints happens regardless of the value of
validate_branch_enabled().
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
tools/objtool/check.c | 27 +++++++++++++++++++--------
1 file changed, 19 insertions(+), 8 deletions(-)
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index ba943bbbc51db..2bb927aa34047 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -4118,13 +4118,8 @@ static int validate_unwind_hints(struct objtool_file *file, struct section *sec)
init_insn_state(file, &state, sec);
- if (sec) {
- sec_for_each_insn(file, sec, insn)
- warnings += validate_unwind_hint(file, insn, &state);
- } else {
- for_each_insn(file, insn)
- warnings += validate_unwind_hint(file, insn, &state);
- }
+ sec_for_each_insn(file, sec, insn)
+ warnings += validate_unwind_hint(file, insn, &state);
return warnings;
}
@@ -4621,6 +4616,21 @@ static int validate_functions(struct objtool_file *file)
return warnings;
}
+static int validate_file_unwind_hints(struct objtool_file *file)
+{
+ struct section *sec;
+ int warnings = 0;
+
+ for_each_sec(file->elf, sec) {
+ if (!is_text_sec(sec))
+ continue;
+
+ warnings += validate_unwind_hints(file, sec);
+ }
+
+ return warnings;
+}
+
static void mark_endbr_used(struct instruction *insn)
{
if (!list_empty(&insn->call_node))
@@ -5030,7 +5040,8 @@ int check(struct objtool_file *file)
int w = 0;
w += validate_functions(file);
- w += validate_unwind_hints(file, NULL);
+ w += validate_file_unwind_hints(file);
+
if (!w)
w += validate_reachable_instructions(file);
--
2.52.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH v9 04/10] x86/retpoline: Make warn_thunk_thunk .noinstr
2026-05-05 8:23 [PATCH v9 00/10] x86: Defer some IPIs until a user->kernel transition Valentin Schneider
` (2 preceding siblings ...)
2026-05-05 8:23 ` [PATCH v9 03/10] objtool: Always pass a section to validate_unwind_hints() Valentin Schneider
@ 2026-05-05 8:23 ` Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 05/10] jump_label: Add annotations for validating .entry.text key usage Valentin Schneider
` (5 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Valentin Schneider @ 2026-05-05 8:23 UTC (permalink / raw)
To: linux-kernel, linux-mm, x86
Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Andy Lutomirski, Peter Zijlstra,
Arnaldo Carvalho de Melo, Josh Poimboeuf, Paolo Bonzini,
Arnd Bergmann, Frederic Weisbecker, Paul E. McKenney, Jason Baron,
Steven Rostedt, Ard Biesheuvel, Sami Tolvanen, David S. Miller,
Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
Uladzislau Rezki, Mathieu Desnoyers, Mel Gorman, Andrew Morton,
Masahiro Yamada, Han Shen, Rik van Riel, Jann Horn, Dan Carpenter,
Oleg Nesterov, Juri Lelli, Clark Williams, Tomas Glozar,
Yair Podemsky, Marcelo Tosatti, Daniel Wagner, Petr Tesarik,
Shrikanth Hegde
Objtool now warns about it:
vmlinux.o: warning: objtool: .altinstr_replacement+0x28e1: call to warn_thunk_thunk() leaves .noinstr.text section
Mark it noinstr.
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
arch/x86/entry/entry.S | 3 ++-
arch/x86/kernel/cpu/bugs.c | 2 +-
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
index 6ba2b3adcef0f..e76560f86b332 100644
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -40,6 +40,8 @@ SYM_FUNC_START(__WARN_trap)
SYM_FUNC_END(__WARN_trap)
EXPORT_SYMBOL(__WARN_trap)
+THUNK warn_thunk_thunk, __warn_thunk
+
.popsection
/*
@@ -60,7 +62,6 @@ EXPORT_SYMBOL_FOR_KVM(x86_verw_sel);
.popsection
-THUNK warn_thunk_thunk, __warn_thunk
/*
* Clang's implementation of TLS stack cookies requires the variable in
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 83f51cab0b1e3..87dffa7a95a4e 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -3731,7 +3731,7 @@ ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char
}
#endif
-void __warn_thunk(void)
+void noinstr __warn_thunk(void)
{
WARN_ONCE(1, "Unpatched return thunk in use. This should not happen!\n");
}
--
2.52.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH v9 05/10] jump_label: Add annotations for validating .entry.text key usage
2026-05-05 8:23 [PATCH v9 00/10] x86: Defer some IPIs until a user->kernel transition Valentin Schneider
` (3 preceding siblings ...)
2026-05-05 8:23 ` [PATCH v9 04/10] x86/retpoline: Make warn_thunk_thunk .noinstr Valentin Schneider
@ 2026-05-05 8:23 ` Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 06/10] objtool: Add .entry.text validation for static branches Valentin Schneider
` (4 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Valentin Schneider @ 2026-05-05 8:23 UTC (permalink / raw)
To: linux-kernel, linux-mm, x86
Cc: Josh Poimboeuf, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, H. Peter Anvin, Andy Lutomirski, Peter Zijlstra,
Arnaldo Carvalho de Melo, Paolo Bonzini, Arnd Bergmann,
Frederic Weisbecker, Paul E. McKenney, Jason Baron,
Steven Rostedt, Ard Biesheuvel, Sami Tolvanen, David S. Miller,
Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
Uladzislau Rezki, Mathieu Desnoyers, Mel Gorman, Andrew Morton,
Masahiro Yamada, Han Shen, Rik van Riel, Jann Horn, Dan Carpenter,
Oleg Nesterov, Juri Lelli, Clark Williams, Tomas Glozar,
Yair Podemsky, Marcelo Tosatti, Daniel Wagner, Petr Tesarik,
Shrikanth Hegde
From: Josh Poimboeuf <jpoimboe@kernel.org>
Adding static keys to entry text needs to be done with care, as they may be
executed before a context serialization operation has been run. Add
DEFINE_STATIC_KEY_*_ENTRY() variants for those. They don't do anything
special yet; that will come later.
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
[Reduced from .noinstr to .entry.text]
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
include/linux/jump_label.h | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index b9c7b0ebf7b9d..e05b4a52ba1b4 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -374,6 +374,20 @@ struct static_key_false {
#define DEFINE_STATIC_KEY_FALSE_RO(name) \
struct static_key_false name __ro_after_init = STATIC_KEY_FALSE_INIT
+/*
+ * Objtool will warn about static keys used in early entry code, as they may
+ * be unsafe if executed before any serializing operation.
+ *
+ * The _ENTRY variants are used to tell objtool the static key is safe to be
+ * used. If using one of these _ENTRY variants, please add a comment above the
+ * definition with the rationale.
+ */
+#define DEFINE_STATIC_KEY_TRUE_ENTRY(name) \
+ DEFINE_STATIC_KEY_TRUE(name)
+
+#define DEFINE_STATIC_KEY_FALSE_ENTRY(name) \
+ DEFINE_STATIC_KEY_FALSE(name)
+
#define DECLARE_STATIC_KEY_FALSE(name) \
extern struct static_key_false name
--
2.52.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH v9 06/10] objtool: Add .entry.text validation for static branches
2026-05-05 8:23 [PATCH v9 00/10] x86: Defer some IPIs until a user->kernel transition Valentin Schneider
` (4 preceding siblings ...)
2026-05-05 8:23 ` [PATCH v9 05/10] jump_label: Add annotations for validating .entry.text key usage Valentin Schneider
@ 2026-05-05 8:23 ` Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 07/10] x86/jump_label: Add ASM support for static_branch_likely() Valentin Schneider
` (3 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Valentin Schneider @ 2026-05-05 8:23 UTC (permalink / raw)
To: linux-kernel, linux-mm, x86
Cc: Josh Poimboeuf, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, H. Peter Anvin, Andy Lutomirski, Peter Zijlstra,
Arnaldo Carvalho de Melo, Paolo Bonzini, Arnd Bergmann,
Frederic Weisbecker, Paul E. McKenney, Jason Baron,
Steven Rostedt, Ard Biesheuvel, Sami Tolvanen, David S. Miller,
Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
Uladzislau Rezki, Mathieu Desnoyers, Mel Gorman, Andrew Morton,
Masahiro Yamada, Han Shen, Rik van Riel, Jann Horn, Dan Carpenter,
Oleg Nesterov, Juri Lelli, Clark Williams, Tomas Glozar,
Yair Podemsky, Marcelo Tosatti, Daniel Wagner, Petr Tesarik,
Shrikanth Hegde
From: Josh Poimboeuf <jpoimboe@kernel.org>
Warn about static branches in entry text, unless the corresponding key is
RO-after-init.
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
[Reduced to only .entry.text rather than .noinstr]
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
include/linux/jump_label.h | 17 ++++---
include/linux/objtool.h | 16 ++++++
tools/objtool/Documentation/objtool.txt | 15 ++++++
tools/objtool/check.c | 65 ++++++++++++++++++++++++-
tools/objtool/include/objtool/check.h | 2 +
tools/objtool/include/objtool/elf.h | 3 +-
tools/objtool/include/objtool/special.h | 1 +
tools/objtool/special.c | 15 +++++-
8 files changed, 125 insertions(+), 9 deletions(-)
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index e05b4a52ba1b4..e39685061ebbe 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -76,6 +76,7 @@
#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/cleanup.h>
+#include <linux/objtool.h>
extern bool static_key_initialized;
@@ -362,8 +363,9 @@ struct static_key_false {
#define DEFINE_STATIC_KEY_TRUE(name) \
struct static_key_true name = STATIC_KEY_TRUE_INIT
-#define DEFINE_STATIC_KEY_TRUE_RO(name) \
- struct static_key_true name __ro_after_init = STATIC_KEY_TRUE_INIT
+#define DEFINE_STATIC_KEY_TRUE_RO(name) \
+ struct static_key_true name __ro_after_init = STATIC_KEY_TRUE_INIT; \
+ ANNOTATE_ENTRY_ALLOWED(name)
#define DECLARE_STATIC_KEY_TRUE(name) \
extern struct static_key_true name
@@ -371,8 +373,9 @@ struct static_key_false {
#define DEFINE_STATIC_KEY_FALSE(name) \
struct static_key_false name = STATIC_KEY_FALSE_INIT
-#define DEFINE_STATIC_KEY_FALSE_RO(name) \
- struct static_key_false name __ro_after_init = STATIC_KEY_FALSE_INIT
+#define DEFINE_STATIC_KEY_FALSE_RO(name) \
+ struct static_key_false name __ro_after_init = STATIC_KEY_FALSE_INIT; \
+ ANNOTATE_ENTRY_ALLOWED(name)
/*
* Objtool will warn about static keys used in early entry code, as they may
@@ -383,10 +386,12 @@ struct static_key_false {
* definition with the rationale.
*/
#define DEFINE_STATIC_KEY_TRUE_ENTRY(name) \
- DEFINE_STATIC_KEY_TRUE(name)
+ DEFINE_STATIC_KEY_TRUE(name); \
+ ANNOTATE_ENTRY_ALLOWED(name)
#define DEFINE_STATIC_KEY_FALSE_ENTRY(name) \
- DEFINE_STATIC_KEY_FALSE(name)
+ DEFINE_STATIC_KEY_FALSE(name); \
+ ANNOTATE_ENTRY_ALLOWED(name)
#define DECLARE_STATIC_KEY_FALSE(name) \
extern struct static_key_false name
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 9a00e701454c5..d738450897b3b 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -34,6 +34,19 @@
static void __used __section(".discard.func_stack_frame_non_standard") \
*__func_stack_frame_non_standard_##func = func
+#define __ANNOTATE_ENTRY_ALLOWED(key) \
+ static void __used __section(".discard.entry_allowed") \
+ *__annotate_entry_allowed_##key = &key
+
+/*
+ * This is used to tell objtool that a given static key is safe to be used
+ * within early entry (.entry.text) code, and it doesn't need to generate a
+ * warning about it.
+ *
+ * For more information, see tools/objtool/Documentation/objtool.txt,
+ * "non-RO static key usage in entry code"
+ */
+
/*
* STACK_FRAME_NON_STANDARD_FP() is a frame-pointer-specific function ignore
* for the case where a function is intentionally missing frame pointer setup,
@@ -111,6 +124,9 @@
#define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t"
#define STACK_FRAME_NON_STANDARD(func)
#define STACK_FRAME_NON_STANDARD_FP(func)
+#define __ASM_ANNOTATE(label, type) ""
+#define ASM_ANNOTATE(type)
+#define ANNOTATE_ENTRY_ALLOWED(key)
#else
.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0
.endm
diff --git a/tools/objtool/Documentation/objtool.txt b/tools/objtool/Documentation/objtool.txt
index 9e97fc25b2d8a..7b92e85ab5e49 100644
--- a/tools/objtool/Documentation/objtool.txt
+++ b/tools/objtool/Documentation/objtool.txt
@@ -456,6 +456,21 @@ the objtool maintainers.
these special names and does not use module_init() / module_exit()
macros to create them.
+vmlinux.o: warning: objtool: entry_SYSCALL_64+0x108: housekeeping_overridden: non-RO static key usage in entry code
+
+13. file.o: warning: func()+0x2a: key: non-RO static key usage in entry code
+
+ This means that .entry.text function func() uses a static key named 'key'
+ which can be modified at runtime. This is discouraged because the jump
+ location may be accessed before a serializing operation has been
+ executed.
+
+ Check whether the static key in question is only modified during init. If so,
+ define it as read-only-after-init with DEFINE_STATIC_KEY_*_RO().
+
+ Alternatively, if observing a stale/wrong value of the key isn't critical
+ (i.e. the system is in a transient state as lots of things are being updated),
+ you can mark it as safe with DEFINE_STATIC_KEY_*_ENTRY().
If the error doesn't seem to make sense, it could be a bug in objtool.
Feel free to ask objtool maintainers for help.
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 2bb927aa34047..1d976287af3ad 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -328,8 +328,10 @@ static void init_insn_state(struct objtool_file *file, struct insn_state *state,
memset(state, 0, sizeof(*state));
init_cfi_state(&state->cfi);
- if (opts.noinstr && sec)
+ if (opts.noinstr && sec) {
state->noinstr = sec->noinstr;
+ state->entry = sec->entry;
+ }
}
static struct cfi_state *cfi_alloc(void)
@@ -434,6 +436,9 @@ static int decode_instructions(struct objtool_file *file)
!strncmp(sec->name, ".text..__x86.", 13))
sec->noinstr = true;
+ if (!strcmp(sec->name, ".entry.text"))
+ sec->entry= true;
+
/*
* .init.text code is ran before userspace and thus doesn't
* strictly need retpolines, except for modules which are
@@ -1076,6 +1081,45 @@ static int create_sym_checksum_section(struct objtool_file *file)
static int create_sym_checksum_section(struct objtool_file *file) { return -EINVAL; }
#endif
+static int read_entry_allowed(struct objtool_file *file)
+{
+ struct section *rsec;
+ struct symbol *sym;
+ struct reloc *reloc;
+
+ rsec = find_section_by_name(file->elf, ".rela.discard.entry_allowed");
+ if (!rsec)
+ return 0;
+
+ for_each_reloc(rsec, reloc) {
+ switch (reloc->sym->type) {
+ case STT_OBJECT:
+ case STT_FUNC:
+ sym = reloc->sym;
+ break;
+
+ case STT_SECTION:
+ sym = find_symbol_by_offset(reloc->sym->sec,
+ reloc_addend(reloc));
+ if (!sym) {
+ WARN_FUNC(reloc->sym->sec, reloc_addend(reloc),
+ "can't find static key/call symbol");
+ return -1;
+ }
+ break;
+
+ default:
+ WARN("unexpected relocation symbol type in %s: %d",
+ rsec->name, reloc->sym->type);
+ return -1;
+ }
+
+ sym->entry_allowed = 1;
+ }
+
+ return 0;
+}
+
/*
* Warnings shouldn't be reported for ignored functions.
*/
@@ -1919,6 +1963,8 @@ static int handle_jump_alt(struct objtool_file *file,
return -1;
}
+ orig_insn->key = special_alt->key;
+
if (opts.hack_jump_label && special_alt->key_addend & 2) {
struct reloc *reloc = insn_reloc(file, orig_insn);
@@ -2700,6 +2746,9 @@ static int decode_sections(struct objtool_file *file)
if (read_annotate(file, __annotate_late))
return -1;
+ if (read_entry_allowed(file))
+ return -1;
+
return 0;
}
@@ -3598,6 +3647,17 @@ static int validate_return(struct symbol *func, struct instruction *insn, struct
return 0;
}
+static int validate_static_key(struct instruction *insn, struct insn_state *state)
+{
+ if (state->entry && !insn->key->entry_allowed) {
+ WARN_INSN(insn, "%s: non-RO static key usage in entry code",
+ insn->key->name);
+ return 1;
+ }
+
+ return 0;
+}
+
static struct instruction *next_insn_to_validate(struct objtool_file *file,
struct instruction *insn)
{
@@ -3861,6 +3921,9 @@ static int validate_insn(struct objtool_file *file, struct symbol *func,
if (handle_insn_ops(insn, next_insn, statep))
return 1;
+ if (insn->key)
+ validate_static_key(insn, statep);
+
switch (insn->type) {
case INSN_RETURN:
diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h
index 5f2f77bd9b416..23b9c717bd20a 100644
--- a/tools/objtool/include/objtool/check.h
+++ b/tools/objtool/include/objtool/check.h
@@ -16,6 +16,7 @@ struct insn_state {
bool uaccess;
bool df;
bool noinstr;
+ bool entry;
s8 instr;
};
@@ -97,6 +98,7 @@ struct instruction {
struct symbol *sym;
struct stack_op *stack_ops;
struct cfi_state *cfi;
+ struct symbol *key;
};
static inline struct symbol *insn_func(struct instruction *insn)
diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h
index 25573e5af76ef..c05c441cf90df 100644
--- a/tools/objtool/include/objtool/elf.h
+++ b/tools/objtool/include/objtool/elf.h
@@ -51,7 +51,7 @@ struct section {
Elf_Data *data;
const char *name;
int idx;
- bool _changed, text, rodata, noinstr, init, truncate;
+ bool _changed, text, rodata, noinstr, init, truncate, entry;
struct reloc *relocs;
unsigned long nr_alloc_relocs;
struct section *twin;
@@ -89,6 +89,7 @@ struct symbol {
u8 changed : 1;
u8 included : 1;
u8 klp : 1;
+ u8 entry_allowed : 1;
struct list_head pv_target;
struct reloc *relocs;
struct section *group_sec;
diff --git a/tools/objtool/include/objtool/special.h b/tools/objtool/include/objtool/special.h
index 121c3761899c1..2298586a75479 100644
--- a/tools/objtool/include/objtool/special.h
+++ b/tools/objtool/include/objtool/special.h
@@ -18,6 +18,7 @@ struct special_alt {
bool group;
bool jump_or_nop;
u8 key_addend;
+ struct symbol *key;
struct section *orig_sec;
unsigned long orig_off;
diff --git a/tools/objtool/special.c b/tools/objtool/special.c
index 2a533afbc69aa..adec1d0d8a5fe 100644
--- a/tools/objtool/special.c
+++ b/tools/objtool/special.c
@@ -111,13 +111,26 @@ static int get_alt_entry(struct elf *elf, const struct special_entry *entry,
if (entry->key) {
struct reloc *key_reloc;
+ struct symbol *key;
+ s64 key_addend;
key_reloc = find_reloc_by_dest(elf, sec, offset + entry->key);
if (!key_reloc) {
ERROR_FUNC(sec, offset + entry->key, "can't find key reloc");
return -1;
}
- alt->key_addend = reloc_addend(key_reloc);
+
+ key = key_reloc->sym;
+ key_addend = reloc_addend(key_reloc);
+
+ if (key->type == STT_SECTION)
+ key = find_symbol_by_offset(key->sec, key_addend & ~3);
+
+ /* embedded keys not supported */
+ if (key) {
+ alt->key = key;
+ alt->key_addend = key_addend;
+ }
}
return 0;
--
2.52.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH v9 07/10] x86/jump_label: Add ASM support for static_branch_likely()
2026-05-05 8:23 [PATCH v9 00/10] x86: Defer some IPIs until a user->kernel transition Valentin Schneider
` (5 preceding siblings ...)
2026-05-05 8:23 ` [PATCH v9 06/10] objtool: Add .entry.text validation for static branches Valentin Schneider
@ 2026-05-05 8:23 ` Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 08/10] x86/mm/pti: Introduce a kernel/user CR3 software signal Valentin Schneider
` (2 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Valentin Schneider @ 2026-05-05 8:23 UTC (permalink / raw)
To: linux-kernel, linux-mm, x86
Cc: Frederic Weisbecker, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, H. Peter Anvin, Andy Lutomirski,
Peter Zijlstra, Arnaldo Carvalho de Melo, Josh Poimboeuf,
Paolo Bonzini, Arnd Bergmann, Paul E. McKenney, Jason Baron,
Steven Rostedt, Ard Biesheuvel, Sami Tolvanen, David S. Miller,
Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
Uladzislau Rezki, Mathieu Desnoyers, Mel Gorman, Andrew Morton,
Masahiro Yamada, Han Shen, Rik van Riel, Jann Horn, Dan Carpenter,
Oleg Nesterov, Juri Lelli, Clark Williams, Tomas Glozar,
Yair Podemsky, Marcelo Tosatti, Daniel Wagner, Petr Tesarik,
Shrikanth Hegde
A later commit will add some early entry code that only needs to be
executed if nohz_full is present on the cmdline, not just if
CONFIG_NO_HZ_FULL is compiled in. Add an ASM-callable static branch macro.
Note that I haven't found a way to express unlikely (i.e. out-of-line)
static branches in ASM macros without using extra jumps, which kind of
defeats the purpose. Consider:
.macro FOOBAR
// Key enabled: JMP .Ldostuff_\@
// Key disabled: NOP
STATIC_BRANCH_UNLIKELY key, .Ldostuff_\@ // Patched to JMP if enabled
jmp .Lend_\@
.Ldostuff_\@:
<dostuff>
.Lend_\@:
.endm
Instead, this should be expressed as a likely (i.e. in-line) static key:
.macro FOOBAR
// Key enabled: NOP
// Key disabled: JMP .Lend_\@
STATIC_BRANCH_LIKELY key, .Lend_\@ // Patched to NOP if enabled
<dostuff>
.Lend_\@:
.endm
Suggested-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
arch/x86/include/asm/jump_label.h | 33 ++++++++++++++++++++++++++++++-
1 file changed, 32 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 05b16299588d5..ea587598abe7c 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -7,7 +7,38 @@
#include <asm/asm.h>
#include <asm/nops.h>
-#ifndef __ASSEMBLER__
+#ifdef __ASSEMBLER__
+
+/*
+ * There isn't a neat way to craft unlikely static branches in ASM, so they
+ * all have to be expressed as likely (inline) static branches. This macro
+ * thus assumes a "likely" usage.
+ */
+.macro ARCH_STATIC_BRANCH_LIKELY_ASM key, label, jump, hack
+1:
+.if \jump || \hack
+ jmp \label
+.else
+ .byte BYTES_NOP5
+.endif
+ .pushsection __jump_table, "aw"
+ _ASM_ALIGN
+ .long 1b - .
+ .long \label - .
+ /* LIKELY so bit0=1, bit1=hack */
+ _ASM_PTR \key + 1 + (\hack << 1) - .
+ .popsection
+.endm
+
+.macro STATIC_BRANCH_TRUE_LIKELY key, label
+ ARCH_STATIC_BRANCH_LIKELY_ASM \key, \label, 0, IS_ENABLED(CONFIG_HAVE_JUMP_LABEL_HACK)
+.endm
+
+.macro STATIC_BRANCH_FALSE_LIKELY key, label
+ ARCH_STATIC_BRANCH_LIKELY_ASM \key, \label, 1, 0
+.endm
+
+#else /* !__ASSEMBLER__ */
#include <linux/stringify.h>
#include <linux/types.h>
--
2.52.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH v9 08/10] x86/mm/pti: Introduce a kernel/user CR3 software signal
2026-05-05 8:23 [PATCH v9 00/10] x86: Defer some IPIs until a user->kernel transition Valentin Schneider
` (6 preceding siblings ...)
2026-05-05 8:23 ` [PATCH v9 07/10] x86/jump_label: Add ASM support for static_branch_likely() Valentin Schneider
@ 2026-05-05 8:23 ` Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 09/10] context_tracking,x86: Defer kernel text patching IPIs when tracking CR3 switches Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 10/10] x86/mm, mm/vmalloc: Defer kernel TLB flush " Valentin Schneider
9 siblings, 0 replies; 11+ messages in thread
From: Valentin Schneider @ 2026-05-05 8:23 UTC (permalink / raw)
To: linux-kernel, linux-mm, x86
Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Andy Lutomirski, Peter Zijlstra,
Arnaldo Carvalho de Melo, Josh Poimboeuf, Paolo Bonzini,
Arnd Bergmann, Frederic Weisbecker, Paul E. McKenney, Jason Baron,
Steven Rostedt, Ard Biesheuvel, Sami Tolvanen, David S. Miller,
Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
Uladzislau Rezki, Mathieu Desnoyers, Mel Gorman, Andrew Morton,
Masahiro Yamada, Han Shen, Rik van Riel, Jann Horn, Dan Carpenter,
Oleg Nesterov, Juri Lelli, Clark Williams, Tomas Glozar,
Yair Podemsky, Marcelo Tosatti, Daniel Wagner, Petr Tesarik,
Shrikanth Hegde
Later commits will rely on being able to check whether a remote CPU is
using the kernel or the user CR3.
This software signal needs to be updated before the actual CR3 write, IOW
it always immediately precedes it:
KERNEL_CR3_LOADED := 1
SWITCH_TO_KERNEL_CR3
[...]
KERNEL_CR3_LOADED := 0
SWITCH_TO_USER_CR3
The variable also gets mapped into the user space visible pages.
I tried really hard not to do that, and at some point had something mostly
working with having an alias to it through the cpu_entry_area accessed like
so before the switch to the kernel CR3:
subq $10, %rsp
sgdt (%rsp)
movq 2(%rsp), \scratch_reg /* GDT address */
addq $10, %rsp
movl $1, CPU_ENTRY_AREA_kernel_cr3(\scratch_reg)
however this explodes when running 64-bit user code that invokes SYSCALL,
since the scratch reg is %rsp itself, and I figured this was enough of a headache.
This will only be really useful for NOHZ_FULL CPUs, but it should be
cheaper to unconditionally update a never-used per-CPU variable living in
its own cacheline than to check a shared cpumask such as
housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)
at every entry.
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
arch/x86/Kconfig | 14 +++++++++++++
arch/x86/entry/calling.h | 18 +++++++++++++++++
arch/x86/entry/syscall_64.c | 4 ++++
arch/x86/include/asm/tlbflush.h | 3 +++
arch/x86/mm/pti.c | 36 ++++++++++++++++++++++-----------
kernel/sched/isolation.c | 9 ++++++++-
6 files changed, 71 insertions(+), 13 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 99bb5217649a8..36b4527ef69bd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2198,6 +2198,20 @@ config ADDRESS_MASKING
The capability can be used for efficient address sanitizers (ASAN)
implementation and for optimizations in JITs.
+config TRACK_CR3
+ def_bool n
+ prompt "Track which CR3 is in use"
+ depends on X86_64 && MITIGATION_PAGE_TABLE_ISOLATION && NO_HZ_FULL
+ help
+ This option adds a software signal that allows checking remotely
+ whether a CPU is using the user or the kernel page table.
+
+ This allows further optimizations for NOHZ_FULL CPUs.
+
+ This obviously makes the user<->kernel transition overhead even worse.
+
+ If unsure, say N.
+
config HOTPLUG_CPU
def_bool y
depends on SMP
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 77e2d920a6407..deb8224b5ee48 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -9,6 +9,7 @@
#include <asm/ptrace-abi.h>
#include <asm/msr.h>
#include <asm/nospec-branch.h>
+#include <asm/jump_label.h>
/*
@@ -170,8 +171,22 @@ For 32-bit we have the following conventions - kernel is built with
andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg
.endm
+.macro NOTE_CR3_SWITCH scratch_reg:req in_kernel:req
+#ifdef CONFIG_TRACK_CR3
+ STATIC_BRANCH_FALSE_LIKELY housekeeping_overridden, .Lend_\@
+ LOCK_PREFIX
+.if \in_kernel == 1
+ orq $1, PER_CPU_VAR(kernel_cr3_loaded)
+.else
+ andq $0, PER_CPU_VAR(kernel_cr3_loaded)
+.endif
+.Lend_\@:
+#endif // CONFIG_TRACK_CR3
+.endm
+
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ NOTE_CR3_SWITCH \scratch_reg $1
mov %cr3, \scratch_reg
ADJUST_KERNEL_CR3 \scratch_reg
mov \scratch_reg, %cr3
@@ -182,6 +197,7 @@ For 32-bit we have the following conventions - kernel is built with
PER_CPU_VAR(cpu_tlbstate + TLB_STATE_user_pcid_flush_mask)
.macro SWITCH_TO_USER_CR3 scratch_reg:req scratch_reg2:req
+ NOTE_CR3_SWITCH \scratch_reg $0
mov %cr3, \scratch_reg
ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
@@ -229,6 +245,7 @@ For 32-bit we have the following conventions - kernel is built with
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
+ NOTE_CR3_SWITCH \scratch_reg $1
movq %cr3, \scratch_reg
movq \scratch_reg, \save_reg
/*
@@ -257,6 +274,7 @@ For 32-bit we have the following conventions - kernel is built with
bt $PTI_USER_PGTABLE_BIT, \save_reg
jnc .Lend_\@
+ NOTE_CR3_SWITCH \scratch_reg $0
ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
/*
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 71f032504e731..5d7596a86f240 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -83,6 +83,10 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
return false;
}
+#ifdef CONFIG_TRACK_CR3
+DEFINE_PER_CPU_PAGE_ALIGNED(bool, kernel_cr3_loaded) = true;
+#endif
+
/* Returns true to return using SYSRET, or false to use IRET */
__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
{
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 0545fe75c3fa1..0ec669eb0b4e7 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -17,6 +17,9 @@
#include <asm/pgtable.h>
DECLARE_PER_CPU(u64, tlbstate_untag_mask);
+#ifdef CONFIG_TRACK_CR3
+DECLARE_PER_CPU_PAGE_ALIGNED(bool, kernel_cr3_loaded);
+#endif
void __flush_tlb_all(void);
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index f7546e9e8e896..e75450cabd3a6 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -440,6 +440,18 @@ static void __init pti_clone_p4d(unsigned long addr)
*user_p4d = *kernel_p4d;
}
+static void __init pti_clone_percpu(unsigned long va)
+{
+ phys_addr_t pa = per_cpu_ptr_to_phys((void *)va);
+ pte_t *target_pte;
+
+ target_pte = pti_user_pagetable_walk_pte(va, false);
+ if (WARN_ON(!target_pte))
+ return;
+
+ *target_pte = pfn_pte(pa >> PAGE_SHIFT, PAGE_KERNEL);
+}
+
/*
* Clone the CPU_ENTRY_AREA and associated data into the user space visible
* page table.
@@ -450,25 +462,25 @@ static void __init pti_clone_user_shared(void)
pti_clone_p4d(CPU_ENTRY_AREA_BASE);
+ /*
+ * This is done for all possible CPUs during boot to ensure that it's
+ * propagated to all mms.
+ */
for_each_possible_cpu(cpu) {
/*
* The SYSCALL64 entry code needs one word of scratch space
* in which to spill a register. It lives in the sp2 slot
* of the CPU's TSS.
- *
- * This is done for all possible CPUs during boot to ensure
- * that it's propagated to all mms.
*/
+ pti_clone_percpu((unsigned long)&per_cpu(cpu_tss_rw, cpu));
- unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu);
- phys_addr_t pa = per_cpu_ptr_to_phys((void *)va);
- pte_t *target_pte;
-
- target_pte = pti_user_pagetable_walk_pte(va, false);
- if (WARN_ON(!target_pte))
- return;
-
- *target_pte = pfn_pte(pa >> PAGE_SHIFT, PAGE_KERNEL);
+#ifdef CONFIG_TRACK_CR3
+ /*
+ * The entry code needs access to the @kernel_cr3_loaded percpu
+ * variable before the kernel CR3 is loaded.
+ */
+ pti_clone_percpu((unsigned long)&per_cpu(kernel_cr3_loaded, cpu));
+#endif
}
}
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index ef152d401fe20..827e17760f1e2 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -18,7 +18,14 @@ enum hk_flags {
HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE),
};
-DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
+/*
+ * This key is accessed in early entry code to drive the CPU isolation IPI
+ * deferral mechanism. This happens before a serializing instruction has been
+ * executed in the entry path. This is acceptable as a stale value will only be
+ * observed during transient CPU isolation states, and the updated value will be
+ * acted upon at the next kernel exit.
+ */
+DEFINE_STATIC_KEY_FALSE_ENTRY(housekeeping_overridden);
EXPORT_SYMBOL_GPL(housekeeping_overridden);
struct housekeeping {
--
2.52.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH v9 09/10] context_tracking,x86: Defer kernel text patching IPIs when tracking CR3 switches
2026-05-05 8:23 [PATCH v9 00/10] x86: Defer some IPIs until a user->kernel transition Valentin Schneider
` (7 preceding siblings ...)
2026-05-05 8:23 ` [PATCH v9 08/10] x86/mm/pti: Introduce a kernel/user CR3 software signal Valentin Schneider
@ 2026-05-05 8:23 ` Valentin Schneider
2026-05-05 8:23 ` [PATCH v9 10/10] x86/mm, mm/vmalloc: Defer kernel TLB flush " Valentin Schneider
9 siblings, 0 replies; 11+ messages in thread
From: Valentin Schneider @ 2026-05-05 8:23 UTC (permalink / raw)
To: linux-kernel, linux-mm, x86
Cc: Peter Zijlstra (Intel), Nicolas Saenz Julienne, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Dave Hansen, H. Peter Anvin,
Andy Lutomirski, Arnaldo Carvalho de Melo, Josh Poimboeuf,
Paolo Bonzini, Arnd Bergmann, Frederic Weisbecker,
Paul E. McKenney, Jason Baron, Steven Rostedt, Ard Biesheuvel,
Sami Tolvanen, David S. Miller, Neeraj Upadhyay, Joel Fernandes,
Josh Triplett, Boqun Feng, Uladzislau Rezki, Mathieu Desnoyers,
Mel Gorman, Andrew Morton, Masahiro Yamada, Han Shen,
Rik van Riel, Jann Horn, Dan Carpenter, Oleg Nesterov, Juri Lelli,
Clark Williams, Tomas Glozar, Yair Podemsky, Marcelo Tosatti,
Daniel Wagner, Petr Tesarik, Shrikanth Hegde
text_poke_bp_batch() sends IPIs to all online CPUs to synchronize
them vs the newly patched instruction. CPUs that are executing in userspace
do not need this synchronization to happen immediately, and this is
actually harmful interference for NOHZ_FULL CPUs.
As the synchronization IPIs are sent using a blocking call, returning from
text_poke_bp_batch() implies all CPUs will observe the patched
instruction(s), and this should be preserved even if the IPI is deferred.
In other words, to safely defer this synchronization, any kernel
instruction leading to the execution of the deferred instruction
sync must *not* be mutable (patchable) at runtime.
This means we must pay attention to mutable instructions in the early entry
code:
- alternatives
- static keys
- static calls
- all sorts of probes (kprobes/ftrace/bpf/???)
The early entry code is noinstr, which gets rid of the probes.
Alternatives are safe, because it's boot-time patching (before SMP is
even brought up) which is before any IPI deferral can happen.
This leaves us with static keys and static calls. Any static key used in
early entry code should be only forever-enabled at boot time, IOW
__ro_after_init (pretty much like alternatives). Exceptions to that will
now be caught by objtool.
The deferred instruction sync is the CR3 RMW done as part of
kPTI when switching to the kernel page table:
SDM vol2 chapter 4.3 - Move to/from control registers:
```
MOV CR* instructions, except for MOV CR8, are serializing instructions.
```
Leverage the new kernel_cr3_loaded signal and the kPTI CR3 RMW to defer
sync_core() IPIs targeting NOHZ_FULL CPUs.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
arch/x86/include/asm/text-patching.h | 5 +++
arch/x86/kernel/alternative.c | 57 +++++++++++++++++++++++++---
arch/x86/kernel/kprobes/core.c | 4 +-
arch/x86/kernel/kprobes/opt.c | 4 +-
arch/x86/kernel/module.c | 2 +-
include/asm-generic/sections.h | 14 +++++++
6 files changed, 75 insertions(+), 11 deletions(-)
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index f2d142a0a862e..628e80f8318cd 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -33,6 +33,11 @@ extern void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t i
*/
extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void smp_text_poke_sync_each_cpu(void);
+#ifdef CONFIG_TRACK_CR3
+extern void smp_text_poke_sync_each_cpu_deferrable(void);
+#else
+#define smp_text_poke_sync_each_cpu_deferrable smp_text_poke_sync_each_cpu
+#endif
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
extern void *text_poke_copy(void *addr, const void *opcode, size_t len);
#define text_poke_copy text_poke_copy
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 62936a3bde19b..e2d185e6cb7ca 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -6,6 +6,7 @@
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/execmem.h>
+#include <linux/sched/isolation.h>
#include <asm/text-patching.h>
#include <asm/insn.h>
@@ -13,6 +14,7 @@
#include <asm/ibt.h>
#include <asm/set_memory.h>
#include <asm/nmi.h>
+#include <asm/tlbflush.h>
int __read_mostly alternatives_patched;
@@ -2768,10 +2770,43 @@ static void do_sync_core(void *info)
sync_core();
}
+static void __smp_text_poke_sync_each_cpu(smp_cond_func_t cond_func)
+{
+ on_each_cpu_cond(cond_func, do_sync_core, NULL, 1);
+}
+
void smp_text_poke_sync_each_cpu(void)
{
- on_each_cpu(do_sync_core, NULL, 1);
+ __smp_text_poke_sync_each_cpu(NULL);
+}
+
+#ifdef CONFIG_TRACK_CR3
+static bool do_sync_core_defer_cond(int cpu, void *info)
+{
+ /*
+ * Send the IPI if the target CPU is a housekeeping one, or if it is
+ * already executing in kernelspace.
+ */
+ bool ret = housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE);
+
+ /*
+ * Pairs with the LOCK prefix in NOTE_CR3_SWITCH
+ *
+ * Ensures any previous operations are visible on a remote CPU
+ * entering the kernel and setting @kernel_cr3_loaded, if this one
+ * decides to defer the IPI.
+ */
+ smp_mb();
+ ret |= per_cpu(kernel_cr3_loaded, cpu);
+
+ return ret;
+}
+
+void smp_text_poke_sync_each_cpu_deferrable(void)
+{
+ __smp_text_poke_sync_each_cpu(do_sync_core_defer_cond);
}
+#endif
/*
* NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of
@@ -2940,6 +2975,7 @@ noinstr int smp_text_poke_int3_handler(struct pt_regs *regs)
*/
void smp_text_poke_batch_finish(void)
{
+ void (*sync_fn)(void) = smp_text_poke_sync_each_cpu_deferrable;
unsigned char int3 = INT3_INSN_OPCODE;
unsigned int i;
int do_sync;
@@ -2976,11 +3012,20 @@ void smp_text_poke_batch_finish(void)
* First step: add a INT3 trap to the address that will be patched.
*/
for (i = 0; i < text_poke_array.nr_entries; i++) {
- text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]);
- text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE);
+ void *addr = text_poke_addr(&text_poke_array.vec[i]);
+
+ /*
+ * There's no safe way to defer IPIs for patching text in
+ * entry, record whether there is at least one such poke.
+ */
+ if (is_kernel_entrytext((unsigned long)addr))
+ sync_fn = smp_text_poke_sync_each_cpu;
+
+ text_poke_array.vec[i].old = *((u8 *)addr);
+ text_poke(addr, &int3, INT3_INSN_SIZE);
}
- smp_text_poke_sync_each_cpu();
+ sync_fn();
/*
* Second step: update all but the first byte of the patched range.
@@ -3042,7 +3087,7 @@ void smp_text_poke_batch_finish(void)
* not necessary and we'd be safe even without it. But
* better safe than sorry (plus there's not only Intel).
*/
- smp_text_poke_sync_each_cpu();
+ sync_fn();
}
/*
@@ -3063,7 +3108,7 @@ void smp_text_poke_batch_finish(void)
}
if (do_sync)
- smp_text_poke_sync_each_cpu();
+ sync_fn();
/*
* Remove and wait for refs to be zero.
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index c1fac3a9fecc2..61a93ba30f255 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -789,7 +789,7 @@ void arch_arm_kprobe(struct kprobe *p)
u8 int3 = INT3_INSN_OPCODE;
text_poke(p->addr, &int3, 1);
- smp_text_poke_sync_each_cpu();
+ smp_text_poke_sync_each_cpu_deferrable();
perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1);
}
@@ -799,7 +799,7 @@ void arch_disarm_kprobe(struct kprobe *p)
perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1);
text_poke(p->addr, &p->opcode, 1);
- smp_text_poke_sync_each_cpu();
+ smp_text_poke_sync_each_cpu_deferrable();
}
void arch_remove_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 6f826a00eca29..3b3be66da320c 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -509,11 +509,11 @@ void arch_unoptimize_kprobe(struct optimized_kprobe *op)
JMP32_INSN_SIZE - INT3_INSN_SIZE);
text_poke(addr, new, INT3_INSN_SIZE);
- smp_text_poke_sync_each_cpu();
+ smp_text_poke_sync_each_cpu_deferrable();
text_poke(addr + INT3_INSN_SIZE,
new + INT3_INSN_SIZE,
JMP32_INSN_SIZE - INT3_INSN_SIZE);
- smp_text_poke_sync_each_cpu();
+ smp_text_poke_sync_each_cpu_deferrable();
perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE);
}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 11c45ce42694c..0894b1f38de77 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -209,7 +209,7 @@ static int write_relocate_add(Elf64_Shdr *sechdrs,
write, apply);
if (!early) {
- smp_text_poke_sync_each_cpu();
+ smp_text_poke_sync_each_cpu_deferrable();
mutex_unlock(&text_mutex);
}
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 0755bc39b0d80..7496d26a85a4c 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -199,6 +199,20 @@ static inline bool is_kernel_inittext(unsigned long addr)
addr < (unsigned long)_einittext;
}
+/**
+ * is_kernel_entrytext - checks if the pointer address is located in the
+ * .entry.text section
+ *
+ * @addr: address to check
+ *
+ * Returns: true if the address is located in .entry.text, false otherwise.
+ */
+static inline bool is_kernel_entrytext(unsigned long addr)
+{
+ return addr >= (unsigned long)__entry_text_start &&
+ addr < (unsigned long)__entry_text_end;
+}
+
/**
* __is_kernel_text - checks if the pointer address is located in the
* .text section
--
2.52.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH v9 10/10] x86/mm, mm/vmalloc: Defer kernel TLB flush IPIs when tracking CR3 switches
2026-05-05 8:23 [PATCH v9 00/10] x86: Defer some IPIs until a user->kernel transition Valentin Schneider
` (8 preceding siblings ...)
2026-05-05 8:23 ` [PATCH v9 09/10] context_tracking,x86: Defer kernel text patching IPIs when tracking CR3 switches Valentin Schneider
@ 2026-05-05 8:23 ` Valentin Schneider
9 siblings, 0 replies; 11+ messages in thread
From: Valentin Schneider @ 2026-05-05 8:23 UTC (permalink / raw)
To: linux-kernel, linux-mm, x86
Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Andy Lutomirski, Peter Zijlstra,
Arnaldo Carvalho de Melo, Josh Poimboeuf, Paolo Bonzini,
Arnd Bergmann, Frederic Weisbecker, Paul E. McKenney, Jason Baron,
Steven Rostedt, Ard Biesheuvel, Sami Tolvanen, David S. Miller,
Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
Uladzislau Rezki, Mathieu Desnoyers, Mel Gorman, Andrew Morton,
Masahiro Yamada, Han Shen, Rik van Riel, Jann Horn, Dan Carpenter,
Oleg Nesterov, Juri Lelli, Clark Williams, Tomas Glozar,
Yair Podemsky, Marcelo Tosatti, Daniel Wagner, Petr Tesarik,
Shrikanth Hegde
Previous commits have added a software signal that tracks which CR3 (kernel
or user) is in use for any given CPU.
Combined with:
o the CR3 switch itself being a flush for non-global mappings
o global mappings under kPTI being limited to the CEA and entry text
we now have a way to safely defer (kernel) TLB flush IPIs targeting
NOHZ_FULL CPUs executing in userspace (i.e. with the user CR3 loaded).
When sending a kernel TLB flush IPI to a NOHZ_FULL CPU, check whether it is
using the user CR3, and if it is, do not interrupt it and instead rely on
the CR3 write that happens when switching to the kernel CR3.
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
arch/x86/include/asm/tlbflush.h | 1 +
arch/x86/mm/tlb.c | 49 ++++++++++++++++++++++++++++-----
mm/vmalloc.c | 30 ++++++++++++++++----
3 files changed, 68 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 0ec669eb0b4e7..824304c08cd95 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -22,6 +22,7 @@ DECLARE_PER_CPU_PAGE_ALIGNED(bool, kernel_cr3_loaded);
#endif
void __flush_tlb_all(void);
+void flush_tlb_kernel_range_deferrable(unsigned long start, unsigned long end);
#define TLB_FLUSH_ALL -1UL
#define TLB_GENERATION_INVALID 0
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index af43d177087e7..68bcccace0659 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -13,6 +13,7 @@
#include <linux/mmu_notifier.h>
#include <linux/mmu_context.h>
#include <linux/kvm_types.h>
+#include <linux/sched/isolation.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -1509,23 +1510,24 @@ static void do_kernel_range_flush(void *info)
flush_tlb_one_kernel(addr);
}
-static void kernel_tlb_flush_all(struct flush_tlb_info *info)
+static void kernel_tlb_flush_all(smp_cond_func_t cond, struct flush_tlb_info *info)
{
if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
invlpgb_flush_all();
else
- on_each_cpu(do_flush_tlb_all, NULL, 1);
+ on_each_cpu_cond(cond, do_flush_tlb_all, NULL, 1);
}
-static void kernel_tlb_flush_range(struct flush_tlb_info *info)
+static void kernel_tlb_flush_range(smp_cond_func_t cond, struct flush_tlb_info *info)
{
if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
invlpgb_kernel_range_flush(info);
else
- on_each_cpu(do_kernel_range_flush, info, 1);
+ on_each_cpu_cond(cond, do_kernel_range_flush, info, 1);
}
-void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+static inline void
+__flush_tlb_kernel_range(smp_cond_func_t cond, unsigned long start, unsigned long end)
{
struct flush_tlb_info *info;
@@ -1535,13 +1537,46 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
TLB_GENERATION_INVALID);
if (info->end == TLB_FLUSH_ALL)
- kernel_tlb_flush_all(info);
+ kernel_tlb_flush_all(cond, info);
else
- kernel_tlb_flush_range(info);
+ kernel_tlb_flush_range(cond, info);
put_flush_tlb_info();
}
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+ __flush_tlb_kernel_range(NULL, start, end);
+}
+
+#ifdef CONFIG_TRACK_CR3
+static bool flush_tlb_kernel_cond(int cpu, void *info)
+{
+ /*
+ * Send the IPI if the target CPU is a housekeeping one, or if it is
+ * already executing in kernelspace.
+ */
+ bool ret = housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE);
+
+ /*
+ * Pairs with the LOCK prefix in NOTE_CR3_SWITCH
+ *
+ * Ensures any previous operations are visible on a remote CPU
+ * entering the kernel and setting @kernel_cr3_loaded, if this one
+ * decides to defer the IPI.
+ */
+ smp_mb();
+ ret |= per_cpu(kernel_cr3_loaded, cpu);
+
+ return ret;
+}
+
+void flush_tlb_kernel_range_deferrable(unsigned long start, unsigned long end)
+{
+ __flush_tlb_kernel_range(flush_tlb_kernel_cond, start, end);
+}
+#endif
+
/*
* This can be used from process context to figure out what the value of
* CR3 is without needing to do a (slow) __read_cr3().
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index aa08651ec0df6..6276c8cb2be0d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -506,6 +506,26 @@ void vunmap_range_noflush(unsigned long start, unsigned long end)
__vunmap_range_noflush(start, end);
}
+/*
+ * !!! BIG FAT WARNING !!!
+ *
+ * The CPU is free to cache any part of the paging hierarchy it wants at any
+ * time. It's also free to set accessed and dirty bits at any time, even for
+ * instructions that may never execute architecturally.
+ *
+ * This means that deferring a TLB flush affecting freed page-table-pages (IOW,
+ * keeping them in a CPU's paging hierarchy cache) is a recipe for disaster.
+ *
+ * This isn't a problem for deferral of TLB flushes in vmalloc, because
+ * page-table-pages used for vmap() mappings are never freed - see how
+ * __vunmap_range_noflush() walks the whole mapping but only clears the leaf PTEs.
+ * If this ever changes, TLB flush deferral will cause misery.
+ */
+void __weak flush_tlb_kernel_range_deferrable(unsigned long start, unsigned long end)
+{
+ flush_tlb_kernel_range(start, end);
+}
+
/**
* vunmap_range - unmap kernel virtual addresses
* @addr: start of the VM area to unmap
@@ -519,7 +539,7 @@ void vunmap_range(unsigned long addr, unsigned long end)
{
flush_cache_vunmap(addr, end);
vunmap_range_noflush(addr, end);
- flush_tlb_kernel_range(addr, end);
+ flush_tlb_kernel_range_deferrable(addr, end);
}
static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
@@ -2373,7 +2393,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
nr_purge_nodes = cpumask_weight(&purge_nodes);
if (nr_purge_nodes > 0) {
- flush_tlb_kernel_range(start, end);
+ flush_tlb_kernel_range_deferrable(start, end);
/* One extra worker is per a lazy_max_pages() full set minus one. */
nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages();
@@ -2476,7 +2496,7 @@ static void free_unmap_vmap_area(struct vmap_area *va)
flush_cache_vunmap(va->va_start, va->va_end);
vunmap_range_noflush(va->va_start, va->va_end);
if (debug_pagealloc_enabled_static())
- flush_tlb_kernel_range(va->va_start, va->va_end);
+ flush_tlb_kernel_range_deferrable(va->va_start, va->va_end);
free_vmap_area_noflush(va);
}
@@ -2923,7 +2943,7 @@ static void vb_free(unsigned long addr, unsigned long size)
vunmap_range_noflush(addr, addr + size);
if (debug_pagealloc_enabled_static())
- flush_tlb_kernel_range(addr, addr + size);
+ flush_tlb_kernel_range_deferrable(addr, addr + size);
spin_lock(&vb->lock);
@@ -2988,7 +3008,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
free_purged_blocks(&purge_list);
if (!__purge_vmap_area_lazy(start, end, false) && flush)
- flush_tlb_kernel_range(start, end);
+ flush_tlb_kernel_range_deferrable(start, end);
mutex_unlock(&vmap_purge_lock);
}
--
2.52.0
^ permalink raw reply related [flat|nested] 11+ messages in thread