public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Lai Jiangshan <jiangshanlai@gmail.com>
To: linux-kernel@vger.kernel.org
Cc: Lai Jiangshan <jiangshan.ljs@antgroup.com>,
	"H. Peter Anvin" <hpa@linux.intel.com>,
	Andi Kleen <ak@linux.intel.com>,
	Andrew Cooper <andrew.cooper3@citrix.com>,
	Andy Lutomirski <luto@kernel.org>,
	Asit Mallick <asit.k.mallick@intel.com>,
	Cfir Cohen <cfir@google.com>,
	Dan Williams <dan.j.williams@intel.com>,
	Dave Hansen <dave.hansen@intel.com>,
	David Kaplan <David.Kaplan@amd.com>,
	David Rientjes <rientjes@google.com>,
	Dirk Hohndel <dirkhh@vmware.com>,
	Erdem Aktas <erdemaktas@google.com>,
	Jan Kiszka <jan.kiszka@siemens.com>, Jiri Slaby <jslaby@suse.cz>,
	Joerg Roedel <joro@8bytes.org>, Juergen Gross <jgross@suse.com>,
	Kees Cook <keescook@chromium.org>,
	Kirill Shutemov <kirill.shutemov@linux.intel.com>,
	Kuppuswamy Sathyanarayanan <knsathya@kernel.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Mike Stunes <mstunes@vmware.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Raj Ashok <ashok.raj@intel.com>,
	Sean Christopherson <seanjc@google.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Tom Lendacky <thomas.lendacky@amd.com>,
	Tony Luck <tony.luck@intel.com>,
	kvm@vger.kernel.org, linux-coco@lists.linux.dev, x86@kernel.org,
	Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	"H. Peter Anvin" <hpa@zytor.com>,
	Kuppuswamy Sathyanarayanan 
	<sathyanarayanan.kuppuswamy@linux.intel.com>
Subject: [RFC PATCH 4/7] x86/entry: Use atomic-IST-entry for NMI
Date: Mon,  3 Apr 2023 22:06:02 +0800	[thread overview]
Message-ID: <20230403140605.540512-5-jiangshanlai@gmail.com> (raw)
In-Reply-To: <20230403140605.540512-1-jiangshanlai@gmail.com>

From: Lai Jiangshan <jiangshan.ljs@antgroup.com>

Signed-off-by: Lai Jiangshan <jiangshan.ljs@antgroup.com>
---
 arch/x86/entry/entry_64.S       | 373 --------------------------------
 arch/x86/entry/ist_entry.c      |   2 +-
 arch/x86/include/asm/idtentry.h |   9 +-
 3 files changed, 7 insertions(+), 377 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 50a24cc83581..2bb7ab8512dc 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1341,379 +1341,6 @@ SYM_CODE_START_LOCAL(error_return)
 	jmp	swapgs_restore_regs_and_return_to_usermode
 SYM_CODE_END(error_return)
 
-/*
- * Runs on exception stack.  Xen PV does not go through this path at all,
- * so we can use real assembly here.
- *
- * Registers:
- *	%r14: Used to save/restore the CR3 of the interrupted context
- *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
- */
-SYM_CODE_START(asm_exc_nmi)
-	UNWIND_HINT_IRET_REGS
-	ENDBR
-
-	/*
-	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
-	 * the iretq it performs will take us out of NMI context.
-	 * This means that we can have nested NMIs where the next
-	 * NMI is using the top of the stack of the previous NMI. We
-	 * can't let it execute because the nested NMI will corrupt the
-	 * stack of the previous NMI. NMI handlers are not re-entrant
-	 * anyway.
-	 *
-	 * To handle this case we do the following:
-	 *  Check the a special location on the stack that contains
-	 *  a variable that is set when NMIs are executing.
-	 *  The interrupted task's stack is also checked to see if it
-	 *  is an NMI stack.
-	 *  If the variable is not set and the stack is not the NMI
-	 *  stack then:
-	 *    o Set the special variable on the stack
-	 *    o Copy the interrupt frame into an "outermost" location on the
-	 *      stack
-	 *    o Copy the interrupt frame into an "iret" location on the stack
-	 *    o Continue processing the NMI
-	 *  If the variable is set or the previous stack is the NMI stack:
-	 *    o Modify the "iret" location to jump to the repeat_nmi
-	 *    o return back to the first NMI
-	 *
-	 * Now on exit of the first NMI, we first clear the stack variable
-	 * The NMI stack will tell any nested NMIs at that point that it is
-	 * nested. Then we pop the stack normally with iret, and if there was
-	 * a nested NMI that updated the copy interrupt stack frame, a
-	 * jump will be made to the repeat_nmi code that will handle the second
-	 * NMI.
-	 *
-	 * However, espfix prevents us from directly returning to userspace
-	 * with a single IRET instruction.  Similarly, IRET to user mode
-	 * can fault.  We therefore handle NMIs from user space like
-	 * other IST entries.
-	 */
-
-	ASM_CLAC
-	cld
-
-	/* Use %rdx as our temp variable throughout */
-	pushq	%rdx
-
-	testb	$3, CS-RIP+8(%rsp)
-	jz	.Lnmi_from_kernel
-
-	/*
-	 * NMI from user mode.  We need to run on the thread stack, but we
-	 * can't go through the normal entry paths: NMIs are masked, and
-	 * we don't want to enable interrupts, because then we'll end
-	 * up in an awkward situation in which IRQs are on but NMIs
-	 * are off.
-	 *
-	 * We also must not push anything to the stack before switching
-	 * stacks lest we corrupt the "NMI executing" variable.
-	 */
-
-	swapgs
-	FENCE_SWAPGS_USER_ENTRY
-	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
-	movq	%rsp, %rdx
-	movq	PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
-	UNWIND_HINT_IRET_REGS base=%rdx offset=8
-	pushq	5*8(%rdx)	/* pt_regs->ss */
-	pushq	4*8(%rdx)	/* pt_regs->rsp */
-	pushq	3*8(%rdx)	/* pt_regs->flags */
-	pushq	2*8(%rdx)	/* pt_regs->cs */
-	pushq	1*8(%rdx)	/* pt_regs->rip */
-	UNWIND_HINT_IRET_REGS
-	pushq   $-1		/* pt_regs->orig_ax */
-	PUSH_AND_CLEAR_REGS rdx=(%rdx)
-	ENCODE_FRAME_POINTER
-
-	IBRS_ENTER
-	UNTRAIN_RET
-
-	/*
-	 * At this point we no longer need to worry about stack damage
-	 * due to nesting -- we're on the normal thread stack and we're
-	 * done with the NMI stack.
-	 */
-
-	movq	%rsp, %rdi
-	movq	$-1, %rsi
-	call	exc_nmi
-
-	/*
-	 * Return back to user mode.  We must *not* do the normal exit
-	 * work, because we don't want to enable interrupts.
-	 */
-	jmp	swapgs_restore_regs_and_return_to_usermode
-
-.Lnmi_from_kernel:
-	/*
-	 * Here's what our stack frame will look like:
-	 * +---------------------------------------------------------+
-	 * | original SS                                             |
-	 * | original Return RSP                                     |
-	 * | original RFLAGS                                         |
-	 * | original CS                                             |
-	 * | original RIP                                            |
-	 * +---------------------------------------------------------+
-	 * | temp storage for rdx                                    |
-	 * +---------------------------------------------------------+
-	 * | "NMI executing" variable                                |
-	 * +---------------------------------------------------------+
-	 * | iret SS          } Copied from "outermost" frame        |
-	 * | iret Return RSP  } on each loop iteration; overwritten  |
-	 * | iret RFLAGS      } by a nested NMI to force another     |
-	 * | iret CS          } iteration if needed.                 |
-	 * | iret RIP         }                                      |
-	 * +---------------------------------------------------------+
-	 * | outermost SS          } initialized in first_nmi;       |
-	 * | outermost Return RSP  } will not be changed before      |
-	 * | outermost RFLAGS      } NMI processing is done.         |
-	 * | outermost CS          } Copied to "iret" frame on each  |
-	 * | outermost RIP         } iteration.                      |
-	 * +---------------------------------------------------------+
-	 * | pt_regs                                                 |
-	 * +---------------------------------------------------------+
-	 *
-	 * The "original" frame is used by hardware.  Before re-enabling
-	 * NMIs, we need to be done with it, and we need to leave enough
-	 * space for the asm code here.
-	 *
-	 * We return by executing IRET while RSP points to the "iret" frame.
-	 * That will either return for real or it will loop back into NMI
-	 * processing.
-	 *
-	 * The "outermost" frame is copied to the "iret" frame on each
-	 * iteration of the loop, so each iteration starts with the "iret"
-	 * frame pointing to the final return target.
-	 */
-
-	/*
-	 * Determine whether we're a nested NMI.
-	 *
-	 * If we interrupted kernel code between repeat_nmi and
-	 * end_repeat_nmi, then we are a nested NMI.  We must not
-	 * modify the "iret" frame because it's being written by
-	 * the outer NMI.  That's okay; the outer NMI handler is
-	 * about to about to call exc_nmi() anyway, so we can just
-	 * resume the outer NMI.
-	 */
-
-	movq	$repeat_nmi, %rdx
-	cmpq	8(%rsp), %rdx
-	ja	1f
-	movq	$end_repeat_nmi, %rdx
-	cmpq	8(%rsp), %rdx
-	ja	nested_nmi_out
-1:
-
-	/*
-	 * Now check "NMI executing".  If it's set, then we're nested.
-	 * This will not detect if we interrupted an outer NMI just
-	 * before IRET.
-	 */
-	cmpl	$1, -8(%rsp)
-	je	nested_nmi
-
-	/*
-	 * Now test if the previous stack was an NMI stack.  This covers
-	 * the case where we interrupt an outer NMI after it clears
-	 * "NMI executing" but before IRET.  We need to be careful, though:
-	 * there is one case in which RSP could point to the NMI stack
-	 * despite there being no NMI active: naughty userspace controls
-	 * RSP at the very beginning of the SYSCALL targets.  We can
-	 * pull a fast one on naughty userspace, though: we program
-	 * SYSCALL to mask DF, so userspace cannot cause DF to be set
-	 * if it controls the kernel's RSP.  We set DF before we clear
-	 * "NMI executing".
-	 */
-	lea	6*8(%rsp), %rdx
-	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
-	cmpq	%rdx, 4*8(%rsp)
-	/* If the stack pointer is above the NMI stack, this is a normal NMI */
-	ja	first_nmi
-
-	subq	$EXCEPTION_STKSZ, %rdx
-	cmpq	%rdx, 4*8(%rsp)
-	/* If it is below the NMI stack, it is a normal NMI */
-	jb	first_nmi
-
-	/* Ah, it is within the NMI stack. */
-
-	testb	$(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
-	jz	first_nmi	/* RSP was user controlled. */
-
-	/* This is a nested NMI. */
-
-nested_nmi:
-	/*
-	 * Modify the "iret" frame to point to repeat_nmi, forcing another
-	 * iteration of NMI handling.
-	 */
-	subq	$8, %rsp
-	leaq	-10*8(%rsp), %rdx
-	pushq	$__KERNEL_DS
-	pushq	%rdx
-	pushfq
-	pushq	$__KERNEL_CS
-	pushq	$repeat_nmi
-
-	/* Put stack back */
-	addq	$(6*8), %rsp
-
-nested_nmi_out:
-	popq	%rdx
-
-	/* We are returning to kernel mode, so this cannot result in a fault. */
-	iretq
-
-first_nmi:
-	/* Restore rdx. */
-	movq	(%rsp), %rdx
-
-	/* Make room for "NMI executing". */
-	pushq	$0
-
-	/* Leave room for the "iret" frame */
-	subq	$(5*8), %rsp
-
-	/* Copy the "original" frame to the "outermost" frame */
-	.rept 5
-	pushq	11*8(%rsp)
-	.endr
-	UNWIND_HINT_IRET_REGS
-
-	/* Everything up to here is safe from nested NMIs */
-
-#ifdef CONFIG_DEBUG_ENTRY
-	/*
-	 * For ease of testing, unmask NMIs right away.  Disabled by
-	 * default because IRET is very expensive.
-	 */
-	pushq	$0		/* SS */
-	pushq	%rsp		/* RSP (minus 8 because of the previous push) */
-	addq	$8, (%rsp)	/* Fix up RSP */
-	pushfq			/* RFLAGS */
-	pushq	$__KERNEL_CS	/* CS */
-	pushq	$1f		/* RIP */
-	iretq			/* continues at repeat_nmi below */
-	UNWIND_HINT_IRET_REGS
-1:
-#endif
-
-repeat_nmi:
-	ANNOTATE_NOENDBR // this code
-	/*
-	 * If there was a nested NMI, the first NMI's iret will return
-	 * here. But NMIs are still enabled and we can take another
-	 * nested NMI. The nested NMI checks the interrupted RIP to see
-	 * if it is between repeat_nmi and end_repeat_nmi, and if so
-	 * it will just return, as we are about to repeat an NMI anyway.
-	 * This makes it safe to copy to the stack frame that a nested
-	 * NMI will update.
-	 *
-	 * RSP is pointing to "outermost RIP".  gsbase is unknown, but, if
-	 * we're repeating an NMI, gsbase has the same value that it had on
-	 * the first iteration.  paranoid_entry will load the kernel
-	 * gsbase if needed before we call exc_nmi().  "NMI executing"
-	 * is zero.
-	 */
-	movq	$1, 10*8(%rsp)		/* Set "NMI executing". */
-
-	/*
-	 * Copy the "outermost" frame to the "iret" frame.  NMIs that nest
-	 * here must not modify the "iret" frame while we're writing to
-	 * it or it will end up containing garbage.
-	 */
-	addq	$(10*8), %rsp
-	.rept 5
-	pushq	-6*8(%rsp)
-	.endr
-	subq	$(5*8), %rsp
-end_repeat_nmi:
-	ANNOTATE_NOENDBR // this code
-
-	/*
-	 * Everything below this point can be preempted by a nested NMI.
-	 * If this happens, then the inner NMI will change the "iret"
-	 * frame to point back to repeat_nmi.
-	 */
-	pushq	$-1				/* ORIG_RAX: no syscall to restart */
-
-	PUSH_AND_CLEAR_REGS
-	UNWIND_HINT_REGS
-	ENCODE_FRAME_POINTER
-
-	/*
-	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
-	 * as we should not be calling schedule in NMI context.
-	 * Even with normal interrupts enabled. An NMI should not be
-	 * setting NEED_RESCHED or anything that normal interrupts and
-	 * exceptions might do.
-	 */
-	call	paranoid_entry
-
-	movq	%rsp, %rdi
-	movq	$-1, %rsi
-	call	exc_nmi
-
-	/* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
-	IBRS_EXIT save_reg=%r15
-
-	/* Always restore stashed CR3 value (see paranoid_entry) */
-	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
-
-	/*
-	 * The above invocation of paranoid_entry stored the GSBASE
-	 * related information in R/EBX depending on the availability
-	 * of FSGSBASE.
-	 *
-	 * If FSGSBASE is enabled, restore the saved GSBASE value
-	 * unconditionally, otherwise take the conditional SWAPGS path.
-	 */
-	ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
-
-	wrgsbase	%rbx
-	jmp	nmi_restore
-
-nmi_no_fsgsbase:
-	/* EBX == 0 -> invoke SWAPGS */
-	testl	%ebx, %ebx
-	jnz	nmi_restore
-
-nmi_swapgs:
-	swapgs
-
-nmi_restore:
-	POP_REGS
-
-	/*
-	 * Skip orig_ax and the "outermost" frame to point RSP at the "iret"
-	 * at the "iret" frame.
-	 */
-	addq	$6*8, %rsp
-
-	/*
-	 * Clear "NMI executing".  Set DF first so that we can easily
-	 * distinguish the remaining code between here and IRET from
-	 * the SYSCALL entry and exit paths.
-	 *
-	 * We arguably should just inspect RIP instead, but I (Andy) wrote
-	 * this code when I had the misapprehension that Xen PV supported
-	 * NMIs, and Xen PV would break that approach.
-	 */
-	std
-	movq	$0, 5*8(%rsp)		/* clear "NMI executing" */
-
-	/*
-	 * iretq reads the "iret" frame and exits the NMI stack in a
-	 * single instruction.  We are returning to kernel mode, so this
-	 * cannot result in a fault.  Similarly, we don't need to worry
-	 * about espfix64 on the way back to kernel mode.
-	 */
-	iretq
-SYM_CODE_END(asm_exc_nmi)
-
 #ifndef CONFIG_IA32_EMULATION
 /*
  * This handles SYSCALL from 32-bit code.  There is no way to program
diff --git a/arch/x86/entry/ist_entry.c b/arch/x86/entry/ist_entry.c
index e1b06306ac51..407571cc4a8c 100644
--- a/arch/x86/entry/ist_entry.c
+++ b/arch/x86/entry/ist_entry.c
@@ -41,7 +41,7 @@ static __always_inline bool identify_ist_##sym_name(				\
 	return true;								\
 }
 
-DEFINE_IDENTIFY_IST(NMI, nmi, false)
+DEFINE_IDENTIFY_IST(NMI, nmi, true)
 DEFINE_IDENTIFY_IST(DB, debug, false)
 DEFINE_IDENTIFY_IST(MCE, machine_check, false)
 DEFINE_IDENTIFY_IST(VC, vmm_communication, false)
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index b241af4ce9b4..b568f1de6da6 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -450,6 +450,9 @@ __visible noinstr void func(struct pt_regs *regs,			\
 	idtentry_sysvec vector func
 
 #ifdef CONFIG_X86_64
+# define DECLARE_IDTENTRY_NMI(vector, func)				\
+	idtentry_ist vector asm_##func func func has_error_code=0 stack_offset=CEA_stacks_NMI
+
 # define DECLARE_IDTENTRY_MCE(vector, func)				\
 	idtentry_mce_db vector asm_##func func
 
@@ -475,11 +478,11 @@ __visible noinstr void func(struct pt_regs *regs,			\
 /* No ASM emitted for XEN hypervisor callback */
 # define DECLARE_IDTENTRY_XENCB(vector, func)
 
-#endif
-
-/* No ASM code emitted for NMI */
+/* No ASM code emitted for NMI for X86_32 */
 #define DECLARE_IDTENTRY_NMI(vector, func)
 
+#endif
+
 /*
  * ASM code to emit the common vector entry stubs where each stub is
  * packed into IDT_ALIGN bytes.
-- 
2.19.1.6.gb485710b


  parent reply	other threads:[~2023-04-03 14:06 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-04-03 14:05 [RFC PATCH 0/7] x86/entry: Atomic statck switching for IST Lai Jiangshan
2023-04-03 14:05 ` [RFC PATCH 1/7] x86/entry: Move PUSH_AND_CLEAR_REGS out of paranoid_entry Lai Jiangshan
2023-04-06 20:37   ` Peter Zijlstra
2023-04-03 14:06 ` [RFC PATCH 2/7] x86/entry: Add IST main stack Lai Jiangshan
2023-04-03 16:21   ` Linus Torvalds
2023-04-06 20:51   ` Peter Zijlstra
2023-04-03 14:06 ` [RFC PATCH 3/7] x86/entry: Implement atomic-IST-entry Lai Jiangshan
2023-04-06 21:01   ` Peter Zijlstra
2023-04-07  2:33     ` Lai Jiangshan
2023-04-06 21:58   ` Peter Zijlstra
2023-04-06 23:07     ` Andrew Cooper
2023-04-03 14:06 ` Lai Jiangshan [this message]
2023-04-03 14:06 ` [RFC PATCH 5/7] x86/entry: Use atomic-IST-entry for MCE and DB Lai Jiangshan
2023-04-03 14:06 ` [RFC PATCH 6/7] x86/entry: Use atomic-IST-entry for VC Lai Jiangshan
2023-04-03 14:06 ` [RFC PATCH 7/7] x86/entry: Test atomic-IST-entry via KVM Lai Jiangshan
2023-04-03 14:23 ` [RFC PATCH 0/7] x86/entry: Atomic statck switching for IST Dave Hansen
2023-04-03 16:32   ` Lai Jiangshan
2023-04-03 16:53 ` Dave Hansen
2023-04-04  3:17   ` Lai Jiangshan
2023-04-04 17:03     ` Paolo Bonzini
2023-04-06 10:12       ` Peter Zijlstra
2023-04-06 10:35         ` Jiri Slaby
2023-04-06 10:47           ` Peter Zijlstra
2023-04-06 11:04             ` Jiri Slaby
2023-04-06 12:37               ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230403140605.540512-5-jiangshanlai@gmail.com \
    --to=jiangshanlai@gmail.com \
    --cc=David.Kaplan@amd.com \
    --cc=ak@linux.intel.com \
    --cc=andrew.cooper3@citrix.com \
    --cc=ashok.raj@intel.com \
    --cc=asit.k.mallick@intel.com \
    --cc=bp@alien8.de \
    --cc=cfir@google.com \
    --cc=dan.j.williams@intel.com \
    --cc=dave.hansen@intel.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=dirkhh@vmware.com \
    --cc=erdemaktas@google.com \
    --cc=hpa@linux.intel.com \
    --cc=hpa@zytor.com \
    --cc=jan.kiszka@siemens.com \
    --cc=jgross@suse.com \
    --cc=jiangshan.ljs@antgroup.com \
    --cc=joro@8bytes.org \
    --cc=jslaby@suse.cz \
    --cc=keescook@chromium.org \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=knsathya@kernel.org \
    --cc=kvm@vger.kernel.org \
    --cc=linux-coco@lists.linux.dev \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=mingo@redhat.com \
    --cc=mstunes@vmware.com \
    --cc=peterz@infradead.org \
    --cc=rientjes@google.com \
    --cc=sathyanarayanan.kuppuswamy@linux.intel.com \
    --cc=seanjc@google.com \
    --cc=tglx@linutronix.de \
    --cc=thomas.lendacky@amd.com \
    --cc=tony.luck@intel.com \
    --cc=torvalds@linux-foundation.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox