All of lore.kernel.org
 help / color / mirror / Atom feed
From: Rik van Riel <riel@surriel.com>
To: Thomas Gleixner <tglx@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
	x86@kernel.org, linux-kernel@vger.kernel.org,
	kernel-team@meta.com, Dave Hansen <dave.hansen@linux.intel.com>,
	Andy Lutomirski <luto@kernel.org>, Xin Li <xin@zytor.com>
Subject: [PATCH] x86/entry: Read CR2 in asm entry stub to redcue NMI clobbering window
Date: Wed, 13 May 2026 12:12:59 -0400	[thread overview]
Message-ID: <20260513121259.0aae7adc@fangorn> (raw)

exc_page_fault() reads CR2 in the C handler (fault.c), not in the asm
entry stub. An NMI can fire in the window between the CPU setting CR2
and the kernel calling read_cr2(). If the NMI handler runs
KASAN-instrumented code that touches a KASAN shadow for a CEA gap
address (unmapped, PMD=0), the nested fault overwrites CR2. When the
NMI returns, the original handler reads the wrong value.

This manifests as a consistent pattern: CR2 in fffffe0000xxxxxx (CEA
range) with page offset 0x028, regardless of the actual faulting
address. The real fault address is lost.

Fix: move read_cr2() into the asm entry stub via new idtentry_pf /
DEFINE_IDTENTRY_PF macros, reading %cr2 into %rdx immediately after
error_entry (SWAPGS done, registers saved), before any C or KASAN code
runs. Pass the address as a 3rd argument to exc_page_fault().

For FRED, hardware saves the faulting address in event data, so we use
fred_event_data(regs) instead, which has no race.

This reduces the NMI/RC2 race window by 90% -- from hundreds of C
instructions deep in the handler down to ~30 asm instructions in
error_entry.

Found with syzkaller

Assisted-by: Claude:claude-opus-4.7 syzkaller
Signed-off-by: Rik van Riel <riel@surriel.com>
---
 arch/x86/coco/sev/vc-handle.c   |  3 +-
 arch/x86/entry/entry_64.S       | 61 +++++++++++++++++++++++++++++++++
 arch/x86/entry/entry_fred.c     |  2 +-
 arch/x86/include/asm/idtentry.h | 34 +++++++++++++++++-
 arch/x86/mm/fault.c             | 11 ++++--
 5 files changed, 104 insertions(+), 7 deletions(-)

diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c
index d98b5c08ef00..55362d017cc9 100644
--- a/arch/x86/coco/sev/vc-handle.c
+++ b/arch/x86/coco/sev/vc-handle.c
@@ -113,8 +113,7 @@ void vc_forward_exception(struct es_em_ctxt *ctxt)
 		exc_invalid_op(ctxt->regs);
 		break;
 	case X86_TRAP_PF:
-		write_cr2(ctxt->fi.cr2);
-		exc_page_fault(ctxt->regs, error_code);
+		exc_page_fault(ctxt->regs, error_code, ctxt->fi.cr2);
 		break;
 	case X86_TRAP_AC:
 		exc_alignment_check(ctxt->regs, error_code);
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 42447b1e1dff..a02313909516 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -364,6 +364,67 @@ _ASM_NOKPROBE(\asmsym)
 SYM_CODE_END(\asmsym)
 .endm
 
+/**
+ * idtentry_body_pf - Macro to emit code calling the #PF C function
+ * @cfunc:		C function to be called
+ *
+ * Reads CR2 immediately after error_entry completes (PUSH_AND_CLEAR_REGS
+ * and SWAPGS are done) and passes it as the 3rd argument to the C handler.
+ * This minimizes the window where an NMI can clobber CR2 by triggering a
+ * nested fault (e.g., KASAN shadow access on unmapped CEA gap addresses).
+ */
+.macro idtentry_body_pf cfunc
+
+	ALTERNATIVE "call error_entry; movq %rax, %rsp", \
+		    "call xen_error_entry", X86_FEATURE_XENPV
+
+	ENCODE_FRAME_POINTER
+	UNWIND_HINT_REGS
+
+	/*
+	 * Read CR2 immediately.  error_entry has completed SWAPGS (if
+	 * needed) so per-cpu state is accessible, but more importantly
+	 * we read CR2 before calling any C code that might be
+	 * KASAN-instrumented or otherwise trigger nested faults.
+	 */
+	movq	%cr2, %rdx
+
+	movq	%rsp, %rdi			/* pt_regs pointer into 1st argument*/
+	movq	ORIG_RAX(%rsp), %rsi		/* get error code into 2nd argument*/
+	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
+
+	/* %rdx already has CR2 value as 3rd argument */
+
+	/* For some configurations \cfunc ends up being a noreturn. */
+	ANNOTATE_REACHABLE
+	call	\cfunc
+
+	jmp	error_return
+.endm
+
+/**
+ * idtentry_pf - Entry point for #PF (page fault)
+ * @vector:		Vector number
+ * @asmsym:		ASM symbol for the entry point
+ * @cfunc:		C function to be called
+ *
+ * Page fault specific entry macro that reads CR2 as early as possible
+ * to avoid NMI-induced CR2 clobbering.  Hardware pushes the error code.
+ */
+.macro idtentry_pf vector asmsym cfunc
+SYM_CODE_START(\asmsym)
+
+	UNWIND_HINT_IRET_ENTRY offset=8
+	ENDBR
+	ASM_CLAC
+	cld
+
+	idtentry_body_pf \cfunc
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
+
 /*
  * Interrupt entry/exit.
  *
diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
index fbe2d10dd737..3a1fa438535b 100644
--- a/arch/x86/entry/entry_fred.c
+++ b/arch/x86/entry/entry_fred.c
@@ -191,7 +191,7 @@ static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code)
 {
 	/* Optimize for #PF. That's the only exception which matters performance wise */
 	if (likely(regs->fred_ss.vector == X86_TRAP_PF))
-		return exc_page_fault(regs, error_code);
+		return exc_page_fault(regs, error_code, fred_event_data(regs));
 
 	switch (regs->fred_ss.vector) {
 	case X86_TRAP_DE: return exc_divide_error(regs);
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 42bf6a58ec36..6c5cbb858817 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -180,6 +180,35 @@ noinstr void fred_##func(struct pt_regs *regs)
 #define DEFINE_IDTENTRY_RAW_ERRORCODE(func)				\
 __visible noinstr void func(struct pt_regs *regs, unsigned long error_code)
 
+/**
+ * DECLARE_IDTENTRY_PF - Declare functions for page fault IDT entry point
+ * @vector:	Vector number (ignored for C)
+ * @func:	Function name of the entry point
+ *
+ * Same as DECLARE_IDTENTRY_RAW_ERRORCODE() but with an extra address
+ * argument. The address (CR2 or FRED event data) is read as early as
+ * possible in the asm entry stub to minimize the window where an NMI
+ * can clobber CR2 by triggering a nested fault.
+ */
+#define DECLARE_IDTENTRY_PF(vector, func)				\
+	asmlinkage void asm_##func(void);				\
+	asmlinkage void xen_asm_##func(void);				\
+	__visible void func(struct pt_regs *regs,			\
+			    unsigned long error_code,			\
+			    unsigned long address)
+
+/**
+ * DEFINE_IDTENTRY_PF - Emit code for page fault IDT entry point
+ * @func:	Function name of the entry point
+ *
+ * Same as DEFINE_IDTENTRY_RAW_ERRORCODE() but with an extra address
+ * argument containing the faulting address (from CR2 or FRED event data).
+ */
+#define DEFINE_IDTENTRY_PF(func)					\
+__visible noinstr void func(struct pt_regs *regs,			\
+			    unsigned long error_code,			\
+			    unsigned long address)
+
 /**
  * DECLARE_IDTENTRY_IRQ - Declare functions for device interrupt IDT entry
  *			  points (common/spurious)
@@ -489,6 +518,9 @@ void fred_install_sysvec(unsigned int vector, const idtentry_t function);
 #define DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func)			\
 	DECLARE_IDTENTRY_ERRORCODE(vector, func)
 
+#define DECLARE_IDTENTRY_PF(vector, func)				\
+	idtentry_pf vector asm_##func func
+
 /* Entries for common/spurious (device) interrupts */
 #define DECLARE_IDTENTRY_IRQ(vector, func)				\
 	idtentry_irq vector func
@@ -615,7 +647,7 @@ DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_AC,	exc_alignment_check);
 /* Raw exception entries which need extra work */
 DECLARE_IDTENTRY_RAW(X86_TRAP_UD,		exc_invalid_op);
 DECLARE_IDTENTRY_RAW(X86_TRAP_BP,		exc_int3);
-DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF,	exc_page_fault);
+DECLARE_IDTENTRY_PF(X86_TRAP_PF,	exc_page_fault);
 
 #if defined(CONFIG_IA32_EMULATION)
 DECLARE_IDTENTRY_RAW(IA32_SYSCALL_VECTOR,	int80_emulation);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b83a06739b51..f0e2341e551f 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1480,12 +1480,17 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code,
 	local_irq_disable();
 }
 
-DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
+DEFINE_IDTENTRY_PF(exc_page_fault)
 {
 	irqentry_state_t state;
-	unsigned long address;
 
-	address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2();
+	/*
+	 * For FRED, the faulting address is saved atomically by hardware
+	 * as event data.  Override the asm-provided CR2 value which is
+	 * meaningless under FRED.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_FRED))
+		address = fred_event_data(regs);
 
 	/*
 	 * KVM uses #PF vector to deliver 'page not present' events to guests
-- 
2.53.0-Meta



             reply	other threads:[~2026-05-13 16:13 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-13 16:12 Rik van Riel [this message]
2026-05-13 16:31 ` [PATCH] x86/entry: Read CR2 in asm entry stub to redcue NMI clobbering window Dave Hansen
2026-05-13 16:43   ` Rik van Riel
2026-05-13 17:12     ` Dave Hansen
2026-05-14 19:49       ` Andrew Cooper
2026-05-14 19:55         ` Dave Hansen
2026-05-14 18:09 ` kernel test robot
2026-05-14 22:54 ` kernel test robot

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260513121259.0aae7adc@fangorn \
    --to=riel@surriel.com \
    --cc=bp@alien8.de \
    --cc=dave.hansen@linux.intel.com \
    --cc=kernel-team@meta.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=mingo@redhat.com \
    --cc=tglx@kernel.org \
    --cc=x86@kernel.org \
    --cc=xin@zytor.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.