The Linux Kernel Mailing List
 help / color / mirror / Atom feed
* [PATCH] x86/entry: Read CR2 in asm entry stub to reduce NMI clobbering window
@ 2026-05-13 16:12 Rik van Riel
  2026-05-13 16:31 ` Dave Hansen
  0 siblings, 1 reply; 4+ messages in thread
From: Rik van Riel @ 2026-05-13 16:12 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Ingo Molnar, Borislav Petkov, x86, linux-kernel, kernel-team,
	Dave Hansen, Andy Lutomirski, Xin Li

exc_page_fault() reads CR2 in the C handler (fault.c), not in the asm
entry stub. An NMI can fire in the window between the CPU setting CR2
and the kernel calling read_cr2(). If the NMI handler runs
KASAN-instrumented code that touches a KASAN shadow for a CEA gap
address (unmapped, PMD=0), the nested fault overwrites CR2. When the
NMI returns, the original handler reads the wrong value.

This manifests as a consistent pattern: CR2 in fffffe0000xxxxxx (CEA
range) with page offset 0x028, regardless of the actual faulting
address. The real fault address is lost.

Fix: move read_cr2() into the asm entry stub via new idtentry_pf /
DEFINE_IDTENTRY_PF macros, reading %cr2 into %rdx immediately after
error_entry (SWAPGS done, registers saved), before any C or KASAN code
runs. Pass the address as a 3rd argument to exc_page_fault().

For FRED, hardware saves the faulting address in event data, so we use
fred_event_data(regs) instead, which has no race.

This reduces the NMI/CR2 race window by 90% -- from hundreds of C
instructions deep in the handler down to ~30 asm instructions in
error_entry.

Found with syzkaller.

Assisted-by: Claude:claude-opus-4.7 syzkaller
Signed-off-by: Rik van Riel <riel@surriel.com>
---
 arch/x86/coco/sev/vc-handle.c   |  3 +-
 arch/x86/entry/entry_64.S       | 61 +++++++++++++++++++++++++++++++++
 arch/x86/entry/entry_fred.c     |  2 +-
 arch/x86/include/asm/idtentry.h | 34 +++++++++++++++++-
 arch/x86/mm/fault.c             | 11 ++++--
 5 files changed, 104 insertions(+), 7 deletions(-)

diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c
index d98b5c08ef00..55362d017cc9 100644
--- a/arch/x86/coco/sev/vc-handle.c
+++ b/arch/x86/coco/sev/vc-handle.c
@@ -113,8 +113,7 @@ void vc_forward_exception(struct es_em_ctxt *ctxt)
 		exc_invalid_op(ctxt->regs);
 		break;
 	case X86_TRAP_PF:
-		write_cr2(ctxt->fi.cr2);
-		exc_page_fault(ctxt->regs, error_code);
+		exc_page_fault(ctxt->regs, error_code, ctxt->fi.cr2);
 		break;
 	case X86_TRAP_AC:
 		exc_alignment_check(ctxt->regs, error_code);
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 42447b1e1dff..a02313909516 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -364,6 +364,67 @@ _ASM_NOKPROBE(\asmsym)
 SYM_CODE_END(\asmsym)
 .endm
 
+/**
+ * idtentry_body_pf - Macro to emit code calling the #PF C function
+ * @cfunc:		C function to be called
+ *
+ * Reads CR2 immediately after error_entry completes (PUSH_AND_CLEAR_REGS
+ * and SWAPGS are done) and passes it as the 3rd argument to the C handler.
+ * This minimizes the window where an NMI can clobber CR2 by triggering a
+ * nested fault (e.g., KASAN shadow access on unmapped CEA gap addresses).
+ */
+.macro idtentry_body_pf cfunc
+
+	ALTERNATIVE "call error_entry; movq %rax, %rsp", \
+		    "call xen_error_entry", X86_FEATURE_XENPV
+
+	ENCODE_FRAME_POINTER
+	UNWIND_HINT_REGS
+
+	/*
+	 * Read CR2 immediately.  error_entry has completed SWAPGS (if
+	 * needed) so per-cpu state is accessible, but more importantly
+	 * we read CR2 before calling any C code that might be
+	 * KASAN-instrumented or otherwise trigger nested faults.
+	 */
+	movq	%cr2, %rdx
+
+	movq	%rsp, %rdi			/* pt_regs pointer into 1st argument*/
+	movq	ORIG_RAX(%rsp), %rsi		/* get error code into 2nd argument*/
+	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
+
+	/* %rdx already has CR2 value as 3rd argument */
+
+	/* For some configurations \cfunc ends up being a noreturn. */
+	ANNOTATE_REACHABLE
+	call	\cfunc
+
+	jmp	error_return
+.endm
+
+/**
+ * idtentry_pf - Entry point for #PF (page fault)
+ * @vector:		Vector number
+ * @asmsym:		ASM symbol for the entry point
+ * @cfunc:		C function to be called
+ *
+ * Page fault specific entry macro that reads CR2 as early as possible
+ * to avoid NMI-induced CR2 clobbering.  Hardware pushes the error code.
+ */
+.macro idtentry_pf vector asmsym cfunc
+SYM_CODE_START(\asmsym)
+
+	UNWIND_HINT_IRET_ENTRY offset=8
+	ENDBR
+	ASM_CLAC
+	cld
+
+	idtentry_body_pf \cfunc
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
+
 /*
  * Interrupt entry/exit.
  *
diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
index fbe2d10dd737..3a1fa438535b 100644
--- a/arch/x86/entry/entry_fred.c
+++ b/arch/x86/entry/entry_fred.c
@@ -191,7 +191,7 @@ static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code)
 {
 	/* Optimize for #PF. That's the only exception which matters performance wise */
 	if (likely(regs->fred_ss.vector == X86_TRAP_PF))
-		return exc_page_fault(regs, error_code);
+		return exc_page_fault(regs, error_code, fred_event_data(regs));
 
 	switch (regs->fred_ss.vector) {
 	case X86_TRAP_DE: return exc_divide_error(regs);
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 42bf6a58ec36..6c5cbb858817 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -180,6 +180,35 @@ noinstr void fred_##func(struct pt_regs *regs)
 #define DEFINE_IDTENTRY_RAW_ERRORCODE(func)				\
 __visible noinstr void func(struct pt_regs *regs, unsigned long error_code)
 
+/**
+ * DECLARE_IDTENTRY_PF - Declare functions for page fault IDT entry point
+ * @vector:	Vector number (ignored for C)
+ * @func:	Function name of the entry point
+ *
+ * Same as DECLARE_IDTENTRY_RAW_ERRORCODE() but with an extra address
+ * argument. The address (CR2 or FRED event data) is read as early as
+ * possible in the asm entry stub to minimize the window where an NMI
+ * can clobber CR2 by triggering a nested fault.
+ */
+#define DECLARE_IDTENTRY_PF(vector, func)				\
+	asmlinkage void asm_##func(void);				\
+	asmlinkage void xen_asm_##func(void);				\
+	__visible void func(struct pt_regs *regs,			\
+			    unsigned long error_code,			\
+			    unsigned long address)
+
+/**
+ * DEFINE_IDTENTRY_PF - Emit code for page fault IDT entry point
+ * @func:	Function name of the entry point
+ *
+ * Same as DEFINE_IDTENTRY_RAW_ERRORCODE() but with an extra address
+ * argument containing the faulting address (from CR2 or FRED event data).
+ */
+#define DEFINE_IDTENTRY_PF(func)					\
+__visible noinstr void func(struct pt_regs *regs,			\
+			    unsigned long error_code,			\
+			    unsigned long address)
+
 /**
  * DECLARE_IDTENTRY_IRQ - Declare functions for device interrupt IDT entry
  *			  points (common/spurious)
@@ -489,6 +518,9 @@ void fred_install_sysvec(unsigned int vector, const idtentry_t function);
 #define DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func)			\
 	DECLARE_IDTENTRY_ERRORCODE(vector, func)
 
+#define DECLARE_IDTENTRY_PF(vector, func)				\
+	idtentry_pf vector asm_##func func
+
 /* Entries for common/spurious (device) interrupts */
 #define DECLARE_IDTENTRY_IRQ(vector, func)				\
 	idtentry_irq vector func
@@ -615,7 +647,7 @@ DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_AC,	exc_alignment_check);
 /* Raw exception entries which need extra work */
 DECLARE_IDTENTRY_RAW(X86_TRAP_UD,		exc_invalid_op);
 DECLARE_IDTENTRY_RAW(X86_TRAP_BP,		exc_int3);
-DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF,	exc_page_fault);
+DECLARE_IDTENTRY_PF(X86_TRAP_PF,	exc_page_fault);
 
 #if defined(CONFIG_IA32_EMULATION)
 DECLARE_IDTENTRY_RAW(IA32_SYSCALL_VECTOR,	int80_emulation);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b83a06739b51..f0e2341e551f 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1480,12 +1480,17 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code,
 	local_irq_disable();
 }
 
-DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
+DEFINE_IDTENTRY_PF(exc_page_fault)
 {
 	irqentry_state_t state;
-	unsigned long address;
 
-	address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2();
+	/*
+	 * For FRED, the faulting address is saved atomically by hardware
+	 * as event data.  Override the asm-provided CR2 value which is
+	 * meaningless under FRED.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_FRED))
+		address = fred_event_data(regs);
 
 	/*
 	 * KVM uses #PF vector to deliver 'page not present' events to guests
-- 
2.53.0-Meta



^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2026-05-13 17:13 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-05-13 16:12 [PATCH] x86/entry: Read CR2 in asm entry stub to reduce NMI clobbering window Rik van Riel
2026-05-13 16:31 ` Dave Hansen
2026-05-13 16:43   ` Rik van Riel
2026-05-13 17:12     ` Dave Hansen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox