public inbox for linux-mm@kvack.org
 help / color / mirror / Atom feed
From: David Stevens <stevensd@google.com>
To: Pasha Tatashin <pasha.tatashin@soleen.com>,
	Linus Walleij <linus.walleij@linaro.org>,
	 Will Deacon <willdeacon@google.com>,
	Quentin Perret <qperret@google.com>,
	 Thomas Gleixner <tglx@kernel.org>,
	Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
	 Dave Hansen <dave.hansen@linux.intel.com>,
	x86@kernel.org,  "H. Peter Anvin" <hpa@zytor.com>,
	Andy Lutomirski <luto@kernel.org>, Xin Li <xin@zytor.com>,
	 Peter Zijlstra <peterz@infradead.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	 David Hildenbrand <david@kernel.org>,
	Lorenzo Stoakes <ljs@kernel.org>,
	 "Liam R. Howlett" <Liam.Howlett@oracle.com>,
	Vlastimil Babka <vbabka@kernel.org>,
	 Mike Rapoport <rppt@kernel.org>,
	Suren Baghdasaryan <surenb@google.com>,
	Michal Hocko <mhocko@suse.com>,
	 Uladzislau Rezki <urezki@gmail.com>, Kees Cook <kees@kernel.org>
Cc: David Stevens <stevensd@google.com>,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org
Subject: [PATCH v2 12/13] x86: Add support for dynamic kernel stacks via FRED
Date: Fri, 24 Apr 2026 12:14:55 -0700	[thread overview]
Message-ID: <20260424191456.2679717-13-stevensd@google.com> (raw)
In-Reply-To: <20260424191456.2679717-1-stevensd@google.com>

Add support for dynamic kernel stack faults by handling #PFs from CPL 0
on stack level 1. Since we can't sleep while on a per-CPU stack, any
page faults that didn't originate in an atomic context need to be
bounced back to the originating stack.

With dynamic kernel stacks, the processor pushing data onto the kernel
thread stack can cause a page fault. The SDM says in the #DF section
that the processor should be able to handle these exceptions serially.
However, this does not seem to actually be handled reliably.

With KVM, I've observed timer interrupts dropped. The corresponding bit
in VIRR is cleared and the ISR bit in the APIC is set before the #PF is
delivered, but the interrupt handler is not invoked after the kernel
stack fault is resolved. On bare metal, I've observed frequent hangs due
to threads getting stuck on folio_wait_bit_common. I haven't traced this
to an exact interrupt being lost, but moving interrupts to stack level 1
reduces boot failures from >10% to 0 in 1000s of attempts.

To work around this, external interrupts are also moved to stack level
1, and unconditionally bounced back to the originating stack.

Bouncing page faults and external interrupts through stack level 1 while
in CPL 0 adds a small but non-trivial overhead to those paths. The
shared entry point for events received in CPL 0 also becomes slightly
more expensive, due to the need to detect page faults and external
interrupts.

Since enabling HAVE_ARCH_DYNAMIC_STACK requires unconditional support,
the config is enabled in the next patch, which adds dynamic stack
support for traditional interrupt delivery.

Signed-off-by: David Stevens <stevensd@google.com>
---
 arch/x86/entry/entry_64_fred.S    | 55 +++++++++++++++++++++++++++++++
 arch/x86/include/asm/pgtable_64.h | 36 ++++++++++++++++++++
 arch/x86/include/asm/traps.h      |  5 +++
 arch/x86/kernel/fred.c            | 20 ++++++++---
 arch/x86/mm/dump_pagetables.c     | 14 +++++---
 arch/x86/mm/fault.c               | 53 +++++++++++++++++++++++++++++
 6 files changed, 174 insertions(+), 9 deletions(-)

diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
index 119b8214748e..7202655ef662 100644
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -54,7 +54,62 @@ SYM_CODE_END(asm_fred_entrypoint_user)
 	.org asm_fred_entrypoint_user + 256, 0xcc
 SYM_CODE_START_NOALIGN(asm_fred_entrypoint_kernel)
 	FRED_ENTER
+
+#ifdef CONFIG_DYNAMIC_STACK
+	/* Extract event type and vector from augmented SS. */
+	movl	(SS + 4)(%rsp), %esi
+	andl	$0x000f00ff, %esi
+
+	/* Check if event type is hardware exception and vector is #PF. */
+	cmpl	$0x0003000e, %esi
+	jne	.Lcheck_for_extint
+
+	/* %rax: 0 = all done, %rsp = stay on this stack, else stack to use. */
+	call	handle_dynamic_stack_kernel_faults
+	testq	%rax, %rax
+	jz	.Lentrypoint_done
+	cmpq	%rax, %rsp
+	je	.Lskip_stack_switch
+	jmp	.Ldo_stack_switch
+
+.Lcheck_for_extint:
+	/* External interrupts are event type 0; %esi holds type and vector. */
+	testl	$0x000f0000, %esi
+	jne	.Lcall_primary_entry
+	call	switch_to_kstack
+
+.Ldo_stack_switch:
+#ifdef CONFIG_DEBUG_ENTRY
+	/*
+	 * We should only do a stack switch for an external interrupt or a page
+	 * fault in a non-atomic context. These should only ever happen in user
+	 * space or from a regular kernel stack (i.e. CSL == 0). The saved
+	 * stack level is the low two bits of the byte at CS + 2.
+	 */
+	testb	$0x3, (CS + 2)(%rsp)
+	jz	.Lcsl_ok
+	ud2
+.Lcsl_ok:
+#endif
+	movq	%rax, %rsp
+
+	UNWIND_HINT_REGS
+	ENCODE_FRAME_POINTER
+
+	/* Reset the current stack level (FRED_CONFIG bits 1:0) to 0. */
+	mov	$MSR_IA32_FRED_CONFIG, %ecx
+	rdmsr
+	andl	$~0x3, %eax
+	wrmsr
+
+.Lskip_stack_switch:
+	/* The C helper clobbered %rdi; point it at pt_regs again. */
+	movq	%rsp, %rdi
+#endif
+.Lcall_primary_entry:
 	call	fred_entry_from_kernel
+
+.Lentrypoint_done:
 	FRED_EXIT
 	ERETS
 SYM_CODE_END(asm_fred_entrypoint_kernel)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index ce45882ccd07..fbb042c89d13 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -237,6 +237,42 @@ static inline void native_pgd_clear(pgd_t *pgd)
 #define __swp_entry_to_pte(x)		(__pte((x).val))
 #define __swp_entry_to_pmd(x)		(__pmd((x).val))
 
+#ifdef CONFIG_DYNAMIC_STACK
+
+/*
+ * Skip the present bit. Also skip the dirty and accessed bits due to an
+ * erratum where they can be incorrectly set on non-present ptes.
+ *
+ * Also skip bit 8, which is used for pte_present for PROT_NONE. This
+ * isn't necessary in the strictest sense since PROT_NONE doesn't apply
+ * to kernel PTEs, but it's easier to let pte_present just continue
+ * to work.
+ */
+#define KPTE_AVAILABLE_DATA_BITS 58
+
+static inline pte_t make_data_kpte(unsigned long val)
+{
+	unsigned long raw;
+
+	/* Spread @val around pte bits 0, 5, 6 and 8, which must stay clear. */
+	raw  = (val & 0xf) << 1;
+	raw |= (val & 0x10) << 3;
+	raw |= (val & ~0x1f) << 4;
+	return __pte(raw);
+}
+
+static inline unsigned long unpack_data_kpte(pte_t pte)
+{
+	unsigned long raw = pte_val(pte), data;
+
+	/* Inverse of make_data_kpte(): close the gaps at pte bits 0, 5, 6, 8. */
+	data  = (raw >> 1) & 0xf;
+	data |= (raw >> 3) & 0x10;
+	data |= (raw >> 4) & ~0x1f;
+	return data;
+}
+#endif /* CONFIG_DYNAMIC_STACK */
+
 extern void cleanup_highmap(void);
 
 #define HAVE_ARCH_UNMAPPED_AREA
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 3f24cc472ce9..6b55eb91aea6 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -15,6 +15,11 @@ asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
 asmlinkage __visible notrace
 struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs);
 asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs);
+
+#ifdef CONFIG_DYNAMIC_STACK
+asmlinkage __visible noinstr unsigned long switch_to_kstack(struct pt_regs *regs);
+asmlinkage __visible noinstr unsigned long handle_dynamic_stack_kernel_faults(struct pt_regs *regs);
+#endif
 #endif
 
 extern int ibt_selftest(void);
diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c
index e736b19e18de..01d727420d1f 100644
--- a/arch/x86/kernel/fred.c
+++ b/arch/x86/kernel/fred.c
@@ -9,6 +9,8 @@
 
 /* #DB in the kernel would imply the use of a kernel debugger. */
 #define FRED_DB_STACK_LEVEL		1UL
+#define FRED_PF_STACK_LEVEL		1UL
+#define FRED_INT_STACK_LEVEL		1UL
 #define FRED_NMI_STACK_LEVEL		2UL
 #define FRED_MC_STACK_LEVEL		2UL
 /*
@@ -25,6 +27,11 @@
 DEFINE_PER_CPU(unsigned long, fred_rsp0);
 EXPORT_PER_CPU_SYMBOL(fred_rsp0);
 
+#define FRED_CONFIG_VAL(int_stklvl) \
+	(FRED_CONFIG_REDZONE /* Reserve for CALL emulation */ | \
+	 FRED_CONFIG_INT_STKLVL(int_stklvl) | \
+	 FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user))
+
 void cpu_init_fred_exceptions(void)
 {
 	/* When FRED is enabled by default, remove this log message */
@@ -44,11 +51,7 @@ void cpu_init_fred_exceptions(void)
 	 */
 	loadsegment(ss, __KERNEL_DS);
 
-	wrmsrq(MSR_IA32_FRED_CONFIG,
-	       /* Reserve for CALL emulation */
-	       FRED_CONFIG_REDZONE |
-	       FRED_CONFIG_INT_STKLVL(0) |
-	       FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user));
+	wrmsrq(MSR_IA32_FRED_CONFIG, FRED_CONFIG_VAL(0));
 
 	wrmsrq(MSR_IA32_FRED_STKLVLS, 0);
 
@@ -84,8 +87,15 @@ void cpu_init_fred_rsps(void)
 	       FRED_STKLVL(X86_TRAP_DB,  FRED_DB_STACK_LEVEL) |
 	       FRED_STKLVL(X86_TRAP_NMI, FRED_NMI_STACK_LEVEL) |
 	       FRED_STKLVL(X86_TRAP_MC,  FRED_MC_STACK_LEVEL) |
+#ifdef CONFIG_DYNAMIC_STACK
+	       FRED_STKLVL(X86_TRAP_PF,  FRED_PF_STACK_LEVEL) |
+#endif
 	       FRED_STKLVL(X86_TRAP_DF,  FRED_DF_STACK_LEVEL));
 
+#ifdef CONFIG_DYNAMIC_STACK
+	wrmsrq(MSR_IA32_FRED_CONFIG, FRED_CONFIG_VAL(FRED_INT_STACK_LEVEL));
+#endif
+
 	/* The FRED equivalents to IST stacks... */
 	wrmsrq(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB));
 	wrmsrq(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI));
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2afa7a23340e..5c33c33e93fe 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -306,11 +306,17 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
 	static const char units[] = "BKMGTPE";
 	struct seq_file *m = st->seq;
 
-	new_prot = val & PTE_FLAGS_MASK;
-	if (!val)
+	/* Ignore prot/eff from data kptes. */
+	if (val & _PAGE_PRESENT || addr < address_markers[KERNEL_SPACE_NR].start_address) {
+		new_prot = val & PTE_FLAGS_MASK;
+		if (!val)
+			new_eff = 0;
+		else
+			new_eff = st->prot_levels[level];
+	} else {
+		new_prot = 0;
 		new_eff = 0;
-	else
-		new_eff = st->prot_levels[level];
+	}
 
 	/*
 	 * If we have a "break" in the series, we need to flush the state that
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b83a06739b51..40d518d9f562 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1480,6 +1480,59 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code,
 	local_irq_disable();
 }
 
+#ifdef CONFIG_DYNAMIC_STACK
+
+static noinstr unsigned long copy_stack_data(struct pt_regs *regs)
+{
+	unsigned long new_sp, data_len = sizeof(struct fred_frame);
+
+	/* Land below the saved sp, skipping the FRED red zone. */
+	new_sp = regs->sp - (FRED_CONFIG_REDZONE_AMOUNT << 6);
+	new_sp &= FRED_STACK_FRAME_RSP_MASK;
+	new_sp -= data_len;
+
+	/* noinstr: use the uninstrumented __memcpy(), not memcpy(). */
+	__memcpy((void *)new_sp, regs, data_len);
+	return new_sp;
+}
+
+/* Entry-asm helper: returns the address of the frame copied from @regs. */
+__visible noinstr unsigned long switch_to_kstack(struct pt_regs *regs)
+{
+	return copy_stack_data(regs);
+}
+
+#define ALIGN_TO_STACK(addr) ((addr) & ~(THREAD_ALIGN - 1))
+
+__visible noinstr unsigned long handle_dynamic_stack_kernel_faults(struct pt_regs *regs)
+{
+	unsigned long address = fred_event_data(regs);
+	struct task_struct *tsk = NULL;
+	bool on_stack;
+
+	/* Look up a stack owner only for kernel addresses outside NMI. */
+	if (fault_in_kernel_space(address) && !in_nmi())
+		tsk = task_from_stack_address(address);
+
+	/* Returning 0 makes the entry code skip fred_entry_from_kernel(). */
+	if (tsk && dynamic_stack_fault(tsk, address, &on_stack)) {
+		WARN_ON_ONCE(tsk != current &&
+			     ALIGN_TO_STACK(regs->sp) != ALIGN_TO_STACK(address));
+		return 0;
+	}
+
+	/*
+	 * The regular fault handler won't sleep when executing in an
+	 * atomic context, so we can complete the #PF directly on the
+	 * #PF stack.
+	 */
+	if (in_atomic())
+		return (unsigned long)regs;
+
+	return copy_stack_data(regs);
+}
+#endif
+
 DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
 {
 	irqentry_state_t state;
-- 
2.54.0.rc2.544.gc7ae2d5bb8-goog



  parent reply	other threads:[~2026-04-24 19:17 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-24 19:14 [PATCH v2 00/13] Dynamic Kernel Stacks David Stevens
2026-04-24 19:14 ` [PATCH v2 01/13] fork: Remove assumption that vm_area->nr_pages equals to THREAD_SIZE David Stevens
2026-04-24 19:14 ` [PATCH v2 02/13] fork: Don't assume fully populated stack during reuse David Stevens
2026-04-24 19:14 ` [PATCH v2 03/13] fork: Move vm_stack to the beginning of the stack David Stevens
2026-04-24 19:14 ` [PATCH v2 04/13] fork: separate vmap stack allocation and free calls David Stevens
2026-04-24 19:14 ` [PATCH v2 05/13] mm/vmalloc: Add a get_vm_area_node() and vmap_pages_range() public functions David Stevens
2026-04-24 19:14 ` [PATCH v2 06/13] fork: Move vmap stack freeing to work queue David Stevens
2026-04-24 19:14 ` [PATCH v2 07/13] fork: Dynamic Kernel Stacks David Stevens
2026-04-24 19:14 ` [PATCH v2 08/13] task_stack.h: Add stack_not_used() support for dynamic stack David Stevens
2026-04-24 19:14 ` [PATCH v2 09/13] fork: Dynamic Kernel Stack accounting David Stevens
2026-04-24 19:14 ` [PATCH v2 10/13] fork: Store task pointer in unpopulated stack ptes David Stevens
2026-04-24 19:14 ` [PATCH v2 11/13] x86/entry/fred: encode frame pointer on entry David Stevens
2026-04-24 19:14 ` David Stevens [this message]
2026-04-24 19:14 ` [PATCH v2 13/13] x86: Add support for dynamic kernel stacks via IST David Stevens
2026-04-24 19:41 ` [PATCH v2 00/13] Dynamic Kernel Stacks Dave Hansen
2026-04-24 21:35   ` Pasha Tatashin
2026-04-24 22:21     ` Dave Hansen
2026-04-24 22:49       ` David Stevens
2026-04-24 22:26     ` David Laight
2026-04-24 23:06       ` Pasha Tatashin
2026-04-25  9:19   ` H. Peter Anvin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260424191456.2679717-13-stevensd@google.com \
    --to=stevensd@google.com \
    --cc=Liam.Howlett@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=bp@alien8.de \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@kernel.org \
    --cc=hpa@zytor.com \
    --cc=kees@kernel.org \
    --cc=linus.walleij@linaro.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ljs@kernel.org \
    --cc=luto@kernel.org \
    --cc=mhocko@suse.com \
    --cc=mingo@redhat.com \
    --cc=pasha.tatashin@soleen.com \
    --cc=peterz@infradead.org \
    --cc=qperret@google.com \
    --cc=rppt@kernel.org \
    --cc=surenb@google.com \
    --cc=tglx@kernel.org \
    --cc=urezki@gmail.com \
    --cc=vbabka@kernel.org \
    --cc=willdeacon@google.com \
    --cc=x86@kernel.org \
    --cc=xin@zytor.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox