From: Peter Zijlstra <peterz@infradead.org>
To: tglx@kernel.org
Cc: linux-kernel@vger.kernel.org, peterz@infradead.org,
Sean Christopherson <seanjc@google.com>,
Jim Mattson <jmattson@google.com>,
Binbin Wu <binbin.wu@linux.intel.com>,
Vishal L Verma <vishal.l.verma@intel.com>,
"kvm@vger.kernel.org" <kvm@vger.kernel.org>,
Rick P Edgecombe <rick.p.edgecombe@intel.com>,
Binbin Wu <binbin.wu@intel.com>,
"x86@kernel.org" <x86@kernel.org>,
Paolo Bonzini <bonzini@redhat.com>
Subject: [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
Date: Thu, 23 Apr 2026 17:56:12 +0200 [thread overview]
Message-ID: <20260423155936.843498069@infradead.org> (raw)
In-Reply-To: <20260423155611.216805954@infradead.org>
Move the VMX interrupt dispatch magic into the x86 core code. This
isolates KVM from the FRED/IDT decisions and reduces the amount of
EXPORT_SYMBOL_FOR_KVM().
Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
---
arch/x86/entry/Makefile | 2 -
arch/x86/entry/common.c | 48 ++++++++++++++++++++++++++++++++++++
arch/x86/entry/entry.S | 46 ++++++++++++++++++++++++++++++++++
arch/x86/entry/entry_64_fred.S | 1
arch/x86/include/asm/desc.h | 4 +++
arch/x86/include/asm/desc_defs.h | 2 -
arch/x86/include/asm/entry-common.h | 2 +
arch/x86/include/asm/fred.h | 1
arch/x86/include/asm/idtentry.h | 11 --------
arch/x86/kernel/idt.c | 13 +++++++++
arch/x86/kernel/nmi.c | 8 ------
arch/x86/kvm/vmx/vmenter.S | 46 ----------------------------------
arch/x86/kvm/vmx/vmx.c | 20 ++-------------
13 files changed, 118 insertions(+), 86 deletions(-)
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -13,7 +13,7 @@ CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_
CFLAGS_syscall_32.o += -fno-stack-protector
CFLAGS_syscall_64.o += -fno-stack-protector
-obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o
+obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o common.o
obj-y += vdso/
obj-y += vsyscall/
--- /dev/null
+++ b/arch/x86/entry/common.c
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/entry-common.h>
+#include <linux/kvm_types.h>
+#include <asm/fred.h>
+#include <asm/desc.h>
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+/*
+ * On VMX, NMIs and IRQs (as configured by KVM) are acknowledged by hardware as
+ * part of the VM-Exit, i.e. the event itself is consumed as part of the VM-Exit.
+ * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
+ * to the kernel for servicing. On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
+ * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
+ * the VM-Exit is held pending until it's unblocked in the host.
+ */
+noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
+{
+ if (event_type == EVENT_TYPE_EXTINT) {
+#ifdef CONFIG_X86_64
+ /*
+ * Use FRED dispatch, even when running IDT. The dispatch
+ * tables are kept in sync between FRED and IDT, and the FRED
+ * dispatch works well with CFI.
+ */
+ fred_entry_from_kvm(event_type, vector);
+#else
+ idt_entry_from_kvm(vector);
+#endif
+ return;
+ }
+
+ WARN_ON_ONCE(event_type != EVENT_TYPE_NMI);
+
+#ifdef CONFIG_X86_64
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ return fred_entry_from_kvm(event_type, vector);
+#endif
+
+ /*
+ * Notably, we must use IDT dispatch for NMI when running in IDT mode.
+ * The FRED NMI context is significantly different and will not work
+ * right (specifically FRED fixed the NMI recursion issue).
+ */
+ idt_entry_from_kvm(vector);
+}
+EXPORT_SYMBOL_FOR_KVM(x86_entry_from_kvm);
+#endif
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -75,3 +75,49 @@ THUNK warn_thunk_thunk, __warn_thunk
#if defined(CONFIG_STACKPROTECTOR) && defined(CONFIG_SMP)
EXPORT_SYMBOL(__ref_stack_chk_guard);
#endif
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+.macro IDT_DO_EVENT_IRQOFF call_insn call_target
+ /*
+ * Unconditionally create a stack frame, getting the correct RSP on the
+ * stack (for x86-64) would take two instructions anyways, and RBP can
+ * be used to restore RSP to make objtool happy (see below).
+ */
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+
+#ifdef CONFIG_X86_64
+ /*
+ * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
+ * creating the synthetic interrupt stack frame for the IRQ/NMI.
+ */
+ and $-16, %rsp
+ push $__KERNEL_DS
+ push %rbp
+#endif
+ pushf
+ push $__KERNEL_CS
+ \call_insn \call_target
+
+ /*
+ * "Restore" RSP from RBP, even though IRET has already unwound RSP to
+ * the correct value. objtool doesn't know the callee will IRET and,
+ * without the explicit restore, thinks the stack is getting walloped.
+ * Using an unwind hint is problematic due to x86-64's dynamic alignment.
+ */
+ leave
+ RET
+.endm
+
+.pushsection .text, "ax"
+SYM_FUNC_START(idt_do_interrupt_irqoff)
+ IDT_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
+SYM_FUNC_END(idt_do_interrupt_irqoff)
+.popsection
+
+.pushsection .noinstr.text, "ax"
+SYM_FUNC_START(idt_do_nmi_irqoff)
+ IDT_DO_EVENT_IRQOFF call asm_exc_nmi
+SYM_FUNC_END(idt_do_nmi_irqoff)
+.popsection
+#endif
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -147,5 +147,4 @@ SYM_FUNC_START(asm_fred_entry_from_kvm)
RET
SYM_FUNC_END(asm_fred_entry_from_kvm)
-EXPORT_SYMBOL_FOR_KVM(asm_fred_entry_from_kvm);
#endif
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -438,6 +438,10 @@ extern void idt_setup_traps(void);
extern void idt_setup_apic_and_irq_gates(void);
extern bool idt_is_f00f_address(unsigned long address);
+extern void idt_do_interrupt_irqoff(unsigned int vector);
+extern void idt_do_nmi_irqoff(void);
+extern void idt_entry_from_kvm(unsigned int vector);
+
#ifdef CONFIG_X86_64
extern void idt_setup_early_pf(void);
#else
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -145,7 +145,7 @@ struct gate_struct {
typedef struct gate_struct gate_desc;
#ifndef _SETUP
-static inline unsigned long gate_offset(const gate_desc *g)
+static __always_inline unsigned long gate_offset(const gate_desc *g)
{
#ifdef CONFIG_X86_64
return g->offset_low | ((unsigned long)g->offset_middle << 16) |
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -97,4 +97,6 @@ static __always_inline void arch_exit_to
}
#define arch_exit_to_user_mode arch_exit_to_user_mode
+extern void x86_entry_from_kvm(unsigned int entry_type, unsigned int vector);
+
#endif
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -110,7 +110,6 @@ static __always_inline unsigned long fre
static inline void cpu_init_fred_exceptions(void) { }
static inline void cpu_init_fred_rsps(void) { }
static inline void fred_complete_exception_setup(void) { }
-static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { }
static inline void fred_sync_rsp0(unsigned long rsp0) { }
static inline void fred_update_rsp0(void) { }
#endif /* CONFIG_X86_FRED */
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -633,17 +633,6 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_
#endif
/* NMI */
-
-#if IS_ENABLED(CONFIG_KVM_INTEL)
-/*
- * Special entry point for VMX which invokes this on the kernel stack, even for
- * 64-bit, i.e. without using an IST. asm_exc_nmi() requires an IST to work
- * correctly vs. the NMI 'executing' marker. Used for 32-bit kernels as well
- * to avoid more ifdeffery.
- */
-DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_kvm_vmx);
-#endif
-
DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);
#ifdef CONFIG_XEN_PV
DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi);
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -268,6 +268,19 @@ void __init idt_setup_early_pf(void)
}
#endif
+noinstr void idt_entry_from_kvm(unsigned int vector)
+{
+ if (vector == NMI_VECTOR)
+ return idt_do_nmi_irqoff();
+
+ /*
+ * Only the NMI path requires noinstr.
+ */
+ instrumentation_begin();
+ idt_do_interrupt_irqoff(gate_offset(idt_table + vector));
+ instrumentation_end();
+}
+
static void __init idt_map_in_cea(void)
{
/*
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -609,14 +609,6 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
goto nmi_restart;
}
-#if IS_ENABLED(CONFIG_KVM_INTEL)
-DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx)
-{
- exc_nmi(regs);
-}
-EXPORT_SYMBOL_FOR_KVM(asm_exc_nmi_kvm_vmx);
-#endif
-
#ifdef CONFIG_NMI_CHECK_CPU
static char *nmi_check_stall_msg[] = {
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -31,38 +31,6 @@
#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
#endif
-.macro VMX_DO_EVENT_IRQOFF call_insn call_target
- /*
- * Unconditionally create a stack frame, getting the correct RSP on the
- * stack (for x86-64) would take two instructions anyways, and RBP can
- * be used to restore RSP to make objtool happy (see below).
- */
- push %_ASM_BP
- mov %_ASM_SP, %_ASM_BP
-
-#ifdef CONFIG_X86_64
- /*
- * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
- * creating the synthetic interrupt stack frame for the IRQ/NMI.
- */
- and $-16, %rsp
- push $__KERNEL_DS
- push %rbp
-#endif
- pushf
- push $__KERNEL_CS
- \call_insn \call_target
-
- /*
- * "Restore" RSP from RBP, even though IRET has already unwound RSP to
- * the correct value. objtool doesn't know the callee will IRET and,
- * without the explicit restore, thinks the stack is getting walloped.
- * Using an unwind hint is problematic due to x86-64's dynamic alignment.
- */
- leave
- RET
-.endm
-
.section .noinstr.text, "ax"
/**
@@ -320,10 +288,6 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_
SYM_FUNC_END(__vmx_vcpu_run)
-SYM_FUNC_START(vmx_do_nmi_irqoff)
- VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
-SYM_FUNC_END(vmx_do_nmi_irqoff)
-
#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
/**
@@ -375,13 +339,3 @@ SYM_FUNC_START(vmread_error_trampoline)
RET
SYM_FUNC_END(vmread_error_trampoline)
#endif
-
-.section .text, "ax"
-
-#ifndef CONFIG_X86_FRED
-
-SYM_FUNC_START(vmx_do_interrupt_irqoff)
- VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
-SYM_FUNC_END(vmx_do_interrupt_irqoff)
-
-#endif
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7083,9 +7083,6 @@ void vmx_load_eoi_exitmap(struct kvm_vcp
vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}
-void vmx_do_interrupt_irqoff(unsigned long entry);
-void vmx_do_nmi_irqoff(void);
-
static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
{
/*
@@ -7127,17 +7124,9 @@ static void handle_external_interrupt_ir
"unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
- /*
- * Invoke the kernel's IRQ handler for the vector. Use the FRED path
- * when it's available even if FRED isn't fully enabled, e.g. even if
- * FRED isn't supported in hardware, in order to avoid the indirect
- * CALL in the non-FRED path.
- */
+ /* Forward the IRQ to the core kernel for processing. */
kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
- if (IS_ENABLED(CONFIG_X86_FRED))
- fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
- else
- vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
+ x86_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
kvm_after_interrupt(vcpu);
vcpu->arch.at_instruction_boundary = true;
@@ -7447,10 +7436,7 @@ noinstr void vmx_handle_nmi(struct kvm_v
return;
kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
- if (cpu_feature_enabled(X86_FEATURE_FRED))
- fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
- else
- vmx_do_nmi_irqoff();
+ x86_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
kvm_after_interrupt(vcpu);
}
next prev parent reply other threads:[~2026-04-23 16:00 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-23 15:56 [PATCH 0/2] x86/kvm/vmx: Fix VMX interrupt injection vs hrtimer_rearm_deferred() Peter Zijlstra
2026-04-23 15:56 ` Peter Zijlstra [this message]
2026-04-23 17:54 ` [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core Xin Li
2026-04-23 15:56 ` [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred() Peter Zijlstra
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260423155936.843498069@infradead.org \
--to=peterz@infradead.org \
--cc=binbin.wu@intel.com \
--cc=binbin.wu@linux.intel.com \
--cc=bonzini@redhat.com \
--cc=jmattson@google.com \
--cc=kvm@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=rick.p.edgecombe@intel.com \
--cc=seanjc@google.com \
--cc=tglx@kernel.org \
--cc=vishal.l.verma@intel.com \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox