[PATCH 0/2] x86/kvm/vmx: Fix VMX interrupt injection vs hrtimer_rearm

public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 0/2] x86/kvm/vmx: Fix VMX interrupt injection vs hrtimer_rearm_deferred()
@ 2026-04-23 15:56 Peter Zijlstra
  2026-04-23 15:56 ` [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core Peter Zijlstra
  2026-04-23 15:56 ` [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred() Peter Zijlstra
  0 siblings, 2 replies; 8+ messages in thread
From: Peter Zijlstra @ 2026-04-23 15:56 UTC (permalink / raw)
  To: tglx
  Cc: linux-kernel, peterz, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

Hi,

Nicely split out in two patches, but otherwise unchanged (except for a small
comment).


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-04-23 15:56 [PATCH 0/2] x86/kvm/vmx: Fix VMX interrupt injection vs hrtimer_rearm_deferred() Peter Zijlstra
@ 2026-04-23 15:56 ` Peter Zijlstra
  2026-04-23 17:54   ` Xin Li
                     ` (2 more replies)
  2026-04-23 15:56 ` [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred() Peter Zijlstra
  1 sibling, 3 replies; 8+ messages in thread
From: Peter Zijlstra @ 2026-04-23 15:56 UTC (permalink / raw)
  To: tglx
  Cc: linux-kernel, peterz, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

Move the VMX interrupt dispatch magic into the x86 core code. This
isolates KVM from the FRED/IDT decisions and reduces the amount of
EXPORT_SYMBOL_FOR_KVM().

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
---
 arch/x86/entry/Makefile             |    2 -
 arch/x86/entry/common.c             |   48 ++++++++++++++++++++++++++++++++++++
 arch/x86/entry/entry.S              |   46 ++++++++++++++++++++++++++++++++++
 arch/x86/entry/entry_64_fred.S      |    1 
 arch/x86/include/asm/desc.h         |    4 +++
 arch/x86/include/asm/desc_defs.h    |    2 -
 arch/x86/include/asm/entry-common.h |    2 +
 arch/x86/include/asm/fred.h         |    1 
 arch/x86/include/asm/idtentry.h     |   11 --------
 arch/x86/kernel/idt.c               |   13 +++++++++
 arch/x86/kernel/nmi.c               |    8 ------
 arch/x86/kvm/vmx/vmenter.S          |   46 ----------------------------------
 arch/x86/kvm/vmx/vmx.c              |   20 ++-------------
 13 files changed, 118 insertions(+), 86 deletions(-)

--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -13,7 +13,7 @@ CFLAGS_REMOVE_syscall_64.o	= $(CC_FLAGS_
 CFLAGS_syscall_32.o		+= -fno-stack-protector
 CFLAGS_syscall_64.o		+= -fno-stack-protector
 
-obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o
+obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o common.o
 
 obj-y				+= vdso/
 obj-y				+= vsyscall/
--- /dev/null
+++ b/arch/x86/entry/common.c
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/entry-common.h>
+#include <linux/kvm_types.h>
+#include <asm/fred.h>
+#include <asm/desc.h>
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+/*
+ * On VMX, NMIs and IRQs (as configured by KVM) are acknowledge by hardware as
+ * part of the VM-Exit, i.e. the event itself is consumed as part the VM-Exit.
+ * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
+ * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
+ * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
+ * the VM-Exit is held pending until it's unblocked in the host.
+ */
+noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
+{
+	if (event_type == EVENT_TYPE_EXTINT) {
+#ifdef CONFIG_X86_64
+		/*
+		 * Use FRED dispatch, even when running IDT. The dispatch
+		 * tables are kept in sync between FRED and IDT, and the FRED
+		 * dispatch works well with CFI.
+		 */
+		fred_entry_from_kvm(event_type, vector);
+#else
+		idt_entry_from_kvm(vector);
+#endif
+		return;
+	}
+
+	WARN_ON_ONCE(event_type != EVENT_TYPE_NMI);
+
+#ifdef CONFIG_X86_64
+	if (cpu_feature_enabled(X86_FEATURE_FRED))
+		return fred_entry_from_kvm(event_type, vector);
+#endif
+
+	/*
+	 * Notably, we must use IDT dispatch for NMI when running in IDT mode.
+	 * The FRED NMI context is significantly different and will not work
+	 * right (speficially FRED fixed the NMI recursion issue).
+	 */
+	idt_entry_from_kvm(vector);
+}
+EXPORT_SYMBOL_FOR_KVM(x86_entry_from_kvm);
+#endif
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -75,3 +75,49 @@ THUNK warn_thunk_thunk, __warn_thunk
 #if defined(CONFIG_STACKPROTECTOR) && defined(CONFIG_SMP)
 EXPORT_SYMBOL(__ref_stack_chk_guard);
 #endif
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+.macro IDT_DO_EVENT_IRQOFF call_insn call_target
+	/*
+	 * Unconditionally create a stack frame, getting the correct RSP on the
+	 * stack (for x86-64) would take two instructions anyways, and RBP can
+	 * be used to restore RSP to make objtool happy (see below).
+	 */
+	push %_ASM_BP
+	mov %_ASM_SP, %_ASM_BP
+
+#ifdef CONFIG_X86_64
+	/*
+	 * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
+	 * creating the synthetic interrupt stack frame for the IRQ/NMI.
+	 */
+	and  $-16, %rsp
+	push $__KERNEL_DS
+	push %rbp
+#endif
+	pushf
+	push $__KERNEL_CS
+	\call_insn \call_target
+
+	/*
+	 * "Restore" RSP from RBP, even though IRET has already unwound RSP to
+	 * the correct value.  objtool doesn't know the callee will IRET and,
+	 * without the explicit restore, thinks the stack is getting walloped.
+	 * Using an unwind hint is problematic due to x86-64's dynamic alignment.
+	 */
+	leave
+	RET
+.endm
+
+.pushsection .text, "ax"
+SYM_FUNC_START(idt_do_interrupt_irqoff)
+	IDT_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
+SYM_FUNC_END(idt_do_interrupt_irqoff)
+.popsection
+
+.pushsection .noinstr.text, "ax"
+SYM_FUNC_START(idt_do_nmi_irqoff)
+	IDT_DO_EVENT_IRQOFF call asm_exc_nmi
+SYM_FUNC_END(idt_do_nmi_irqoff)
+.popsection
+#endif
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -147,5 +147,4 @@ SYM_FUNC_START(asm_fred_entry_from_kvm)
 	RET
 
 SYM_FUNC_END(asm_fred_entry_from_kvm)
-EXPORT_SYMBOL_FOR_KVM(asm_fred_entry_from_kvm);
 #endif
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -438,6 +438,10 @@ extern void idt_setup_traps(void);
 extern void idt_setup_apic_and_irq_gates(void);
 extern bool idt_is_f00f_address(unsigned long address);
 
+extern void idt_do_interrupt_irqoff(unsigned int vector);
+extern void idt_do_nmi_irqoff(void);
+extern void idt_entry_from_kvm(unsigned int vector);
+
 #ifdef CONFIG_X86_64
 extern void idt_setup_early_pf(void);
 #else
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -145,7 +145,7 @@ struct gate_struct {
 typedef struct gate_struct gate_desc;
 
 #ifndef _SETUP
-static inline unsigned long gate_offset(const gate_desc *g)
+static __always_inline unsigned long gate_offset(const gate_desc *g)
 {
 #ifdef CONFIG_X86_64
 	return g->offset_low | ((unsigned long)g->offset_middle << 16) |
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -97,4 +97,6 @@ static __always_inline void arch_exit_to
 }
 #define arch_exit_to_user_mode arch_exit_to_user_mode
 
+extern void x86_entry_from_kvm(unsigned int entry_type, unsigned int vector);
+
 #endif
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -110,7 +110,6 @@ static __always_inline unsigned long fre
 static inline void cpu_init_fred_exceptions(void) { }
 static inline void cpu_init_fred_rsps(void) { }
 static inline void fred_complete_exception_setup(void) { }
-static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { }
 static inline void fred_sync_rsp0(unsigned long rsp0) { }
 static inline void fred_update_rsp0(void) { }
 #endif /* CONFIG_X86_FRED */
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -633,17 +633,6 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC,	xenpv_
 #endif
 
 /* NMI */
-
-#if IS_ENABLED(CONFIG_KVM_INTEL)
-/*
- * Special entry point for VMX which invokes this on the kernel stack, even for
- * 64-bit, i.e. without using an IST.  asm_exc_nmi() requires an IST to work
- * correctly vs. the NMI 'executing' marker.  Used for 32-bit kernels as well
- * to avoid more ifdeffery.
- */
-DECLARE_IDTENTRY(X86_TRAP_NMI,		exc_nmi_kvm_vmx);
-#endif
-
 DECLARE_IDTENTRY_NMI(X86_TRAP_NMI,	exc_nmi);
 #ifdef CONFIG_XEN_PV
 DECLARE_IDTENTRY_RAW(X86_TRAP_NMI,	xenpv_exc_nmi);
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -268,6 +268,19 @@ void __init idt_setup_early_pf(void)
 }
 #endif
 
+noinstr void idt_entry_from_kvm(unsigned int vector)
+{
+	if (vector == NMI_VECTOR)
+		return idt_do_nmi_irqoff();
+
+	/*
+	 * Only the NMI path requires noinstr.
+	 */
+	instrumentation_begin();
+	idt_do_interrupt_irqoff(gate_offset(idt_table + vector));
+	instrumentation_end();
+}
+
 static void __init idt_map_in_cea(void)
 {
 	/*
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -609,14 +609,6 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
 		goto nmi_restart;
 }
 
-#if IS_ENABLED(CONFIG_KVM_INTEL)
-DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx)
-{
-	exc_nmi(regs);
-}
-EXPORT_SYMBOL_FOR_KVM(asm_exc_nmi_kvm_vmx);
-#endif
-
 #ifdef CONFIG_NMI_CHECK_CPU
 
 static char *nmi_check_stall_msg[] = {
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -31,38 +31,6 @@
 #define VCPU_R15	__VCPU_REGS_R15 * WORD_SIZE
 #endif
 
-.macro VMX_DO_EVENT_IRQOFF call_insn call_target
-	/*
-	 * Unconditionally create a stack frame, getting the correct RSP on the
-	 * stack (for x86-64) would take two instructions anyways, and RBP can
-	 * be used to restore RSP to make objtool happy (see below).
-	 */
-	push %_ASM_BP
-	mov %_ASM_SP, %_ASM_BP
-
-#ifdef CONFIG_X86_64
-	/*
-	 * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
-	 * creating the synthetic interrupt stack frame for the IRQ/NMI.
-	 */
-	and  $-16, %rsp
-	push $__KERNEL_DS
-	push %rbp
-#endif
-	pushf
-	push $__KERNEL_CS
-	\call_insn \call_target
-
-	/*
-	 * "Restore" RSP from RBP, even though IRET has already unwound RSP to
-	 * the correct value.  objtool doesn't know the callee will IRET and,
-	 * without the explicit restore, thinks the stack is getting walloped.
-	 * Using an unwind hint is problematic due to x86-64's dynamic alignment.
-	 */
-	leave
-	RET
-.endm
-
 .section .noinstr.text, "ax"
 
 /**
@@ -320,10 +288,6 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_
 
 SYM_FUNC_END(__vmx_vcpu_run)
 
-SYM_FUNC_START(vmx_do_nmi_irqoff)
-	VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
-SYM_FUNC_END(vmx_do_nmi_irqoff)
-
 #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
 
 /**
@@ -375,13 +339,3 @@ SYM_FUNC_START(vmread_error_trampoline)
 	RET
 SYM_FUNC_END(vmread_error_trampoline)
 #endif
-
-.section .text, "ax"
-
-#ifndef CONFIG_X86_FRED
-
-SYM_FUNC_START(vmx_do_interrupt_irqoff)
-	VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
-SYM_FUNC_END(vmx_do_interrupt_irqoff)
-
-#endif
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7083,9 +7083,6 @@ void vmx_load_eoi_exitmap(struct kvm_vcp
 	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
 }
 
-void vmx_do_interrupt_irqoff(unsigned long entry);
-void vmx_do_nmi_irqoff(void);
-
 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -7127,17 +7124,9 @@ static void handle_external_interrupt_ir
 	    "unexpected VM-Exit interrupt info: 0x%x", intr_info))
 		return;
 
-	/*
-	 * Invoke the kernel's IRQ handler for the vector.  Use the FRED path
-	 * when it's available even if FRED isn't fully enabled, e.g. even if
-	 * FRED isn't supported in hardware, in order to avoid the indirect
-	 * CALL in the non-FRED path.
-	 */
+	/* For the IRQ to the core kernel for processing. */
 	kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
-	if (IS_ENABLED(CONFIG_X86_FRED))
-		fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
-	else
-		vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
+	x86_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
 	kvm_after_interrupt(vcpu);
 
 	vcpu->arch.at_instruction_boundary = true;
@@ -7447,10 +7436,7 @@ noinstr void vmx_handle_nmi(struct kvm_v
 		return;
 
 	kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
-	if (cpu_feature_enabled(X86_FEATURE_FRED))
-		fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
-	else
-		vmx_do_nmi_irqoff();
+	x86_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
 	kvm_after_interrupt(vcpu);
 }
 



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-04-23 15:56 ` [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core Peter Zijlstra
@ 2026-04-23 17:54   ` Xin Li
  2026-04-28  9:43   ` Binbin Wu
  2026-05-01 20:37   ` [PATCH v2 " Peter Zijlstra
  2 siblings, 0 replies; 8+ messages in thread
From: Xin Li @ 2026-04-23 17:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

Few nits below, but

Reviewed-by: Xin Li <xin@zytor.com <mailto:xin@zytor.com>>


> On Apr 23, 2026, at 8:56 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> Move the VMX interrupt dispatch magic into the x86 core code. This
> isolates KVM from the FRED/IDT decisions and reduces the amount of
> EXPORT_SYMBOL_FOR_KVM().
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
> --- /dev/null
> +++ b/arch/x86/entry/common.c
> @@ -0,0 +1,48 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#include <linux/entry-common.h>
> +#include <linux/kvm_types.h>
> +#include <asm/fred.h>
> +#include <asm/desc.h>

Swap the two header inclusion order?

> +
> +#if IS_ENABLED(CONFIG_KVM_INTEL)
> +/*
> + * On VMX, NMIs and IRQs (as configured by KVM) are acknowledge by hardware as
> + * part of the VM-Exit, i.e. the event itself is consumed as part the VM-Exit.
> + * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
> + * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
> + * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
> + * the VM-Exit is held pending until it's unblocked in the host.
> + */
> +noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)

Is u32 better than unsigned int as is close to hardware definition?

Currently, the event type and event vector are defined as 4-bit and 8-bit
fields, respectively; and they can be readily expanded to 8 and 16 bits by
utilizing adjacent reserved bits.

> +{
> + if (event_type == EVENT_TYPE_EXTINT) {
> +#ifdef CONFIG_X86_64
> + /*
> + * Use FRED dispatch, even when running IDT. The dispatch
> + * tables are kept in sync between FRED and IDT, and the FRED
> + * dispatch works well with CFI.
> + */
> + fred_entry_from_kvm(event_type, vector);

Then should we rename the “fred_” prefix, say “x86_64_"?

Sorry naming is hard to me ;)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-04-23 15:56 ` [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core Peter Zijlstra
  2026-04-23 17:54   ` Xin Li
@ 2026-04-28  9:43   ` Binbin Wu
  2026-04-28 11:25     ` Paolo Bonzini
  2026-05-01 20:37   ` [PATCH v2 " Peter Zijlstra
  2 siblings, 1 reply; 8+ messages in thread
From: Binbin Wu @ 2026-04-28  9:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini



On 4/23/2026 11:56 PM, Peter Zijlstra wrote:
> Move the VMX interrupt dispatch magic into the x86 core code. This
> isolates KVM from the FRED/IDT decisions and reduces the amount of
> EXPORT_SYMBOL_FOR_KVM().
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
> ---
>  arch/x86/entry/Makefile             |    2 -
>  arch/x86/entry/common.c             |   48 ++++++++++++++++++++++++++++++++++++
>  arch/x86/entry/entry.S              |   46 ++++++++++++++++++++++++++++++++++
>  arch/x86/entry/entry_64_fred.S      |    1 
>  arch/x86/include/asm/desc.h         |    4 +++
>  arch/x86/include/asm/desc_defs.h    |    2 -
>  arch/x86/include/asm/entry-common.h |    2 +
>  arch/x86/include/asm/fred.h         |    1 
>  arch/x86/include/asm/idtentry.h     |   11 --------
>  arch/x86/kernel/idt.c               |   13 +++++++++
>  arch/x86/kernel/nmi.c               |    8 ------
>  arch/x86/kvm/vmx/vmenter.S          |   46 ----------------------------------
>  arch/x86/kvm/vmx/vmx.c              |   20 ++-------------
>  13 files changed, 118 insertions(+), 86 deletions(-)
> 
> --- a/arch/x86/entry/Makefile
> +++ b/arch/x86/entry/Makefile
> @@ -13,7 +13,7 @@ CFLAGS_REMOVE_syscall_64.o	= $(CC_FLAGS_
>  CFLAGS_syscall_32.o		+= -fno-stack-protector
>  CFLAGS_syscall_64.o		+= -fno-stack-protector
>  
> -obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o
> +obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o common.o
>  
>  obj-y				+= vdso/
>  obj-y				+= vsyscall/
> --- /dev/null
> +++ b/arch/x86/entry/common.c
> @@ -0,0 +1,48 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#include <linux/entry-common.h>
> +#include <linux/kvm_types.h>
> +#include <asm/fred.h>
> +#include <asm/desc.h>
> +
> +#if IS_ENABLED(CONFIG_KVM_INTEL)
> +/*
> + * On VMX, NMIs and IRQs (as configured by KVM) are acknowledge by hardware as

s/acknowledge/acknowledged
 
> + * part of the VM-Exit, i.e. the event itself is consumed as part the VM-Exit.
> + * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
> + * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
> + * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
> + * the VM-Exit is held pending until it's unblocked in the host.
> + */

[...]

> -
> -#if IS_ENABLED(CONFIG_KVM_INTEL)
> -/*
> - * Special entry point for VMX which invokes this on the kernel stack, even for
> - * 64-bit, i.e. without using an IST.  asm_exc_nmi() requires an IST to work

Although it's being removed, I guess what it says is still true?

It says asm_exc_nmi() requires an IST to work correctly, and the new path for
handling NMI when FRED is disabled.

idt_entry_from_kvm
    idt_do_nmi_irqoff
        IDT_DO_EVENT_IRQOFF call asm_exc_nmi
            ...
            call asm_exc_nmi

It seems the stack before calling asm_exc_nmi is not an IST?
Does it matter?

> - * correctly vs. the NMI 'executing' marker.  Used for 32-bit kernels as well
> - * to avoid more ifdeffery.
> - */
> -DECLARE_IDTENTRY(X86_TRAP_NMI,		exc_nmi_kvm_vmx);
> -#endif
> -
>  DECLARE_IDTENTRY_NMI(X86_TRAP_NMI,	exc_nmi);
>  #ifdef CONFIG_XEN_PV
>  DECLARE_IDTENTRY_RAW(X86_TRAP_NMI,	xenpv_exc_nmi);

[...]

> @@ -7127,17 +7124,9 @@ static void handle_external_interrupt_ir
>  	    "unexpected VM-Exit interrupt info: 0x%x", intr_info))
>  		return;
>  
> -	/*
> -	 * Invoke the kernel's IRQ handler for the vector.  Use the FRED path
> -	 * when it's available even if FRED isn't fully enabled, e.g. even if
> -	 * FRED isn't supported in hardware, in order to avoid the indirect
> -	 * CALL in the non-FRED path.
> -	 */
> +	/* For the IRQ to the core kernel for processing. */

For -> Forward?

>  	kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
> -	if (IS_ENABLED(CONFIG_X86_FRED))
> -		fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
> -	else
> -		vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
> +	x86_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
>  	kvm_after_interrupt(vcpu);
>  
>  	vcpu->arch.at_instruction_boundary = true;
> @@ -7447,10 +7436,7 @@ noinstr void vmx_handle_nmi(struct kvm_v
>  		return;
>  
>  	kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
> -	if (cpu_feature_enabled(X86_FEATURE_FRED))
> -		fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
> -	else
> -		vmx_do_nmi_irqoff();
> +	x86_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
>  	kvm_after_interrupt(vcpu);
>  }
>  
> 
> 
> 


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-04-28  9:43   ` Binbin Wu
@ 2026-04-28 11:25     ` Paolo Bonzini
  2026-05-01 20:31       ` Peter Zijlstra
  0 siblings, 1 reply; 8+ messages in thread
From: Paolo Bonzini @ 2026-04-28 11:25 UTC (permalink / raw)
  To: Binbin Wu, Peter Zijlstra
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

On 4/28/26 11:43, Binbin Wu wrote:
> 
> 
> On 4/23/2026 11:56 PM, Peter Zijlstra wrote:
>> Move the VMX interrupt dispatch magic into the x86 core code. This
>> isolates KVM from the FRED/IDT decisions and reduces the amount of
>> EXPORT_SYMBOL_FOR_KVM().
>>
>> Suggested-by: Sean Christopherson <seanjc@google.com>
>> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
>> ---
>>   arch/x86/entry/Makefile             |    2 -
>>   arch/x86/entry/common.c             |   48 ++++++++++++++++++++++++++++++++++++
>>   arch/x86/entry/entry.S              |   46 ++++++++++++++++++++++++++++++++++
>>   arch/x86/entry/entry_64_fred.S      |    1
>>   arch/x86/include/asm/desc.h         |    4 +++
>>   arch/x86/include/asm/desc_defs.h    |    2 -
>>   arch/x86/include/asm/entry-common.h |    2 +
>>   arch/x86/include/asm/fred.h         |    1
>>   arch/x86/include/asm/idtentry.h     |   11 --------
>>   arch/x86/kernel/idt.c               |   13 +++++++++
>>   arch/x86/kernel/nmi.c               |    8 ------
>>   arch/x86/kvm/vmx/vmenter.S          |   46 ----------------------------------
>>   arch/x86/kvm/vmx/vmx.c              |   20 ++-------------
>>   13 files changed, 118 insertions(+), 86 deletions(-)
>>
>> --- a/arch/x86/entry/Makefile
>> +++ b/arch/x86/entry/Makefile
>> @@ -13,7 +13,7 @@ CFLAGS_REMOVE_syscall_64.o	= $(CC_FLAGS_
>>   CFLAGS_syscall_32.o		+= -fno-stack-protector
>>   CFLAGS_syscall_64.o		+= -fno-stack-protector
>>   
>> -obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o
>> +obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o common.o
>>   
>>   obj-y				+= vdso/
>>   obj-y				+= vsyscall/
>> --- /dev/null
>> +++ b/arch/x86/entry/common.c
>> @@ -0,0 +1,48 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +
>> +#include <linux/entry-common.h>
>> +#include <linux/kvm_types.h>
>> +#include <asm/fred.h>
>> +#include <asm/desc.h>
>> +
>> +#if IS_ENABLED(CONFIG_KVM_INTEL)
>> +/*
>> + * On VMX, NMIs and IRQs (as configured by KVM) are acknowledge by hardware as
> 
> s/acknowledge/acknowledged
>   
>> + * part of the VM-Exit, i.e. the event itself is consumed as part the VM-Exit.
>> + * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
>> + * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
>> + * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
>> + * the VM-Exit is held pending until it's unblocked in the host.
>> + */
> 
> [...]
> 
>> -
>> -#if IS_ENABLED(CONFIG_KVM_INTEL)
>> -/*
>> - * Special entry point for VMX which invokes this on the kernel stack, even for
>> - * 64-bit, i.e. without using an IST.  asm_exc_nmi() requires an IST to work
> 
> Although it's being removed, I guess what it says is still true?
> 
> It says asm_exc_nmi() requires an IST to work correctly, and the new path for
> handling NMI when FRED is disabled.
> 
> idt_entry_from_kvm
>      idt_do_nmi_irqoff
>          IDT_DO_EVENT_IRQOFF call asm_exc_nmi
>              ...
>              call asm_exc_nmi
> 
> It seems the stack before calling asm_exc_nmi is not an IST?
> Does it matter?

I think it does, the IST is needed because of all the stuff to detect 
recursive NMIs.  So asm_exc_nmi_kvm_vmx needs to remain.

By the way, here:

> +	/*
> +	 * Notably, we must use IDT dispatch for NMI when running in IDT mode.
> +	 * The FRED NMI context is significantly different and will not work
> +	 * right (speficially FRED fixed the NMI recursion issue).
> +	 */

It's even more important to note that NMIs need an IRET in order to 
unblock further NMIs.  This is even more important than the recursion 
issue, which does not affect KVM's non-IST entry into the NMI handler, 
and is the real reason to use IDT_DO_EVENT_IRQOFF to build the interrupt 
stack frame for NMIs.

Paolo


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-04-28 11:25     ` Paolo Bonzini
@ 2026-05-01 20:31       ` Peter Zijlstra
  0 siblings, 0 replies; 8+ messages in thread
From: Peter Zijlstra @ 2026-05-01 20:31 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Binbin Wu, tglx, linux-kernel, Sean Christopherson, Jim Mattson,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

On Tue, Apr 28, 2026 at 01:25:58PM +0200, Paolo Bonzini wrote:

> > > +#if IS_ENABLED(CONFIG_KVM_INTEL)
> > > +/*
> > > + * On VMX, NMIs and IRQs (as configured by KVM) are acknowledge by hardware as
> > 
> > s/acknowledge/acknowledged
> > > + * part of the VM-Exit, i.e. the event itself is consumed as part the VM-Exit.
> > > + * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
> > > + * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
> > > + * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
> > > + * the VM-Exit is held pending until it's unblocked in the host.
> > > + */
> > 
> > [...]
> > 
> > > -
> > > -#if IS_ENABLED(CONFIG_KVM_INTEL)
> > > -/*
> > > - * Special entry point for VMX which invokes this on the kernel stack, even for
> > > - * 64-bit, i.e. without using an IST.  asm_exc_nmi() requires an IST to work
> > 
> > Although it's being removed, I guess what it says is still true?
> > 
> > It says asm_exc_nmi() requires an IST to work correctly, and the new path for
> > handling NMI when FRED is disabled.
> > 
> > idt_entry_from_kvm
> >      idt_do_nmi_irqoff
> >          IDT_DO_EVENT_IRQOFF call asm_exc_nmi
> >              ...
> >              call asm_exc_nmi
> > 
> > It seems the stack before calling asm_exc_nmi is not an IST?
> > Does it matter?
> 
> I think it does, the IST is needed because of all the stuff to detect
> recursive NMIs.  So asm_exc_nmi_kvm_vmx needs to remain.
> 
> By the way, here:
> 
> > +	/*
> > +	 * Notably, we must use IDT dispatch for NMI when running in IDT mode.
> > +	 * The FRED NMI context is significantly different and will not work
> > +	 * right (speficially FRED fixed the NMI recursion issue).
> > +	 */
> 
> It's even more important to note that NMIs need an IRET in order to unblock
> further NMIs.  This is even more important than the recursion issue, which
> does not affect KVM's non-IST entry into the NMI handler, and is the real
> reason to use IDT_DO_EVENT_IRQOFF to build the interrupt stack frame for
> NMIs.

Durr, I missed that: DECLARE_IDTENTRY_NMI != DECLARE_IDTENTRY_RAW

Let me go rectify that.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v2 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-04-23 15:56 ` [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core Peter Zijlstra
  2026-04-23 17:54   ` Xin Li
  2026-04-28  9:43   ` Binbin Wu
@ 2026-05-01 20:37   ` Peter Zijlstra
  2 siblings, 0 replies; 8+ messages in thread
From: Peter Zijlstra @ 2026-05-01 20:37 UTC (permalink / raw)
  To: tglx
  Cc: linux-kernel, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini


Move the VMX interrupt dispatch magic into the x86 core code. This
isolates KVM from the FRED/IDT decisions and reduces the amount of
EXPORT_SYMBOL_FOR_KVM().

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
---
 arch/x86/entry/Makefile             |    2 -
 arch/x86/entry/common.c             |   48 ++++++++++++++++++++++++++++++++++++
 arch/x86/entry/entry.S              |   46 ++++++++++++++++++++++++++++++++++
 arch/x86/entry/entry_64_fred.S      |    1 
 arch/x86/include/asm/desc.h         |    4 +++
 arch/x86/include/asm/desc_defs.h    |    2 -
 arch/x86/include/asm/entry-common.h |    2 +
 arch/x86/include/asm/fred.h         |    1 
 arch/x86/kernel/idt.c               |   13 +++++++++
 arch/x86/kernel/nmi.c               |    1 
 arch/x86/kvm/vmx/vmenter.S          |   46 ----------------------------------
 arch/x86/kvm/vmx/vmx.c              |   20 ++-------------
 12 files changed, 118 insertions(+), 68 deletions(-)

--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -13,7 +13,7 @@ CFLAGS_REMOVE_syscall_64.o	= $(CC_FLAGS_
 CFLAGS_syscall_32.o		+= -fno-stack-protector
 CFLAGS_syscall_64.o		+= -fno-stack-protector
 
-obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o
+obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o common.o
 
 obj-y				+= vdso/
 obj-y				+= vsyscall/
--- /dev/null
+++ b/arch/x86/entry/common.c
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/entry-common.h>
+#include <linux/kvm_types.h>
+#include <asm/fred.h>
+#include <asm/desc.h>
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+/*
+ * On VMX, NMIs and IRQs (as configured by KVM) are acknowledged by hardware as
+ * part of the VM-Exit, i.e. the event itself is consumed as part the VM-Exit.
+ * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
+ * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
+ * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
+ * the VM-Exit is held pending until it's unblocked in the host.
+ */
+noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
+{
+	if (event_type == EVENT_TYPE_EXTINT) {
+#ifdef CONFIG_X86_64
+		/*
+		 * Use FRED dispatch, even when running IDT. The dispatch
+		 * tables are kept in sync between FRED and IDT, and the FRED
+		 * dispatch works well with CFI.
+		 */
+		fred_entry_from_kvm(event_type, vector);
+#else
+		idt_entry_from_kvm(vector);
+#endif
+		return;
+	}
+
+	WARN_ON_ONCE(event_type != EVENT_TYPE_NMI);
+
+#ifdef CONFIG_X86_64
+	if (cpu_feature_enabled(X86_FEATURE_FRED))
+		return fred_entry_from_kvm(event_type, vector);
+#endif
+
+	/*
+	 * Notably, we must use IDT dispatch for NMI when running in IDT mode.
+	 * The FRED NMI context is significantly different and will not work
+	 * right (speficially FRED fixed the NMI recursion issue).
+	 */
+	idt_entry_from_kvm(vector);
+}
+EXPORT_SYMBOL_FOR_KVM(x86_entry_from_kvm);
+#endif
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -75,3 +75,49 @@ THUNK warn_thunk_thunk, __warn_thunk
 #if defined(CONFIG_STACKPROTECTOR) && defined(CONFIG_SMP)
 EXPORT_SYMBOL(__ref_stack_chk_guard);
 #endif
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+.macro IDT_DO_EVENT_IRQOFF call_insn call_target
+	/*
+	 * Unconditionally create a stack frame, getting the correct RSP on the
+	 * stack (for x86-64) would take two instructions anyways, and RBP can
+	 * be used to restore RSP to make objtool happy (see below).
+	 */
+	push %_ASM_BP
+	mov %_ASM_SP, %_ASM_BP
+
+#ifdef CONFIG_X86_64
+	/*
+	 * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
+	 * creating the synthetic interrupt stack frame for the IRQ/NMI.
+	 */
+	and  $-16, %rsp
+	push $__KERNEL_DS
+	push %rbp
+#endif
+	pushf
+	push $__KERNEL_CS
+	\call_insn \call_target
+
+	/*
+	 * "Restore" RSP from RBP, even though IRET has already unwound RSP to
+	 * the correct value.  objtool doesn't know the callee will IRET and,
+	 * without the explicit restore, thinks the stack is getting walloped.
+	 * Using an unwind hint is problematic due to x86-64's dynamic alignment.
+	 */
+	leave
+	RET
+.endm
+
+.pushsection .text, "ax"
+SYM_FUNC_START(idt_do_interrupt_irqoff)
+	IDT_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
+SYM_FUNC_END(idt_do_interrupt_irqoff)
+.popsection
+
+.pushsection .noinstr.text, "ax"
+SYM_FUNC_START(idt_do_nmi_irqoff)
+	IDT_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
+SYM_FUNC_END(idt_do_nmi_irqoff)
+.popsection
+#endif
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -147,5 +147,4 @@ SYM_FUNC_START(asm_fred_entry_from_kvm)
 	RET
 
 SYM_FUNC_END(asm_fred_entry_from_kvm)
-EXPORT_SYMBOL_FOR_KVM(asm_fred_entry_from_kvm);
 #endif
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -438,6 +438,10 @@ extern void idt_setup_traps(void);
 extern void idt_setup_apic_and_irq_gates(void);
 extern bool idt_is_f00f_address(unsigned long address);
 
+extern void idt_do_interrupt_irqoff(unsigned int vector);
+extern void idt_do_nmi_irqoff(void);
+extern void idt_entry_from_kvm(unsigned int vector);
+
 #ifdef CONFIG_X86_64
 extern void idt_setup_early_pf(void);
 #else
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -145,7 +145,7 @@ struct gate_struct {
 typedef struct gate_struct gate_desc;
 
 #ifndef _SETUP
-static inline unsigned long gate_offset(const gate_desc *g)
+static __always_inline unsigned long gate_offset(const gate_desc *g)
 {
 #ifdef CONFIG_X86_64
 	return g->offset_low | ((unsigned long)g->offset_middle << 16) |
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -97,4 +97,6 @@ static __always_inline void arch_exit_to
 }
 #define arch_exit_to_user_mode arch_exit_to_user_mode
 
+extern void x86_entry_from_kvm(unsigned int entry_type, unsigned int vector);
+
 #endif
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -110,7 +110,6 @@ static __always_inline unsigned long fre
 static inline void cpu_init_fred_exceptions(void) { }
 static inline void cpu_init_fred_rsps(void) { }
 static inline void fred_complete_exception_setup(void) { }
-static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { }
 static inline void fred_sync_rsp0(unsigned long rsp0) { }
 static inline void fred_update_rsp0(void) { }
 #endif /* CONFIG_X86_FRED */
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -268,6 +268,19 @@ void __init idt_setup_early_pf(void)
 }
 #endif
 
+noinstr void idt_entry_from_kvm(unsigned int vector)
+{
+	if (vector == NMI_VECTOR)
+		return idt_do_nmi_irqoff();
+
+	/*
+	 * Only the NMI path requires noinstr.
+	 */
+	instrumentation_begin();
+	idt_do_interrupt_irqoff(gate_offset(idt_table + vector));
+	instrumentation_end();
+}
+
 static void __init idt_map_in_cea(void)
 {
 	/*
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -614,7 +614,6 @@ DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx)
 {
 	exc_nmi(regs);
 }
-EXPORT_SYMBOL_FOR_KVM(asm_exc_nmi_kvm_vmx);
 #endif
 
 #ifdef CONFIG_NMI_CHECK_CPU
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -31,38 +31,6 @@
 #define VCPU_R15	__VCPU_REGS_R15 * WORD_SIZE
 #endif
 
-.macro VMX_DO_EVENT_IRQOFF call_insn call_target
-	/*
-	 * Unconditionally create a stack frame, getting the correct RSP on the
-	 * stack (for x86-64) would take two instructions anyways, and RBP can
-	 * be used to restore RSP to make objtool happy (see below).
-	 */
-	push %_ASM_BP
-	mov %_ASM_SP, %_ASM_BP
-
-#ifdef CONFIG_X86_64
-	/*
-	 * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
-	 * creating the synthetic interrupt stack frame for the IRQ/NMI.
-	 */
-	and  $-16, %rsp
-	push $__KERNEL_DS
-	push %rbp
-#endif
-	pushf
-	push $__KERNEL_CS
-	\call_insn \call_target
-
-	/*
-	 * "Restore" RSP from RBP, even though IRET has already unwound RSP to
-	 * the correct value.  objtool doesn't know the callee will IRET and,
-	 * without the explicit restore, thinks the stack is getting walloped.
-	 * Using an unwind hint is problematic due to x86-64's dynamic alignment.
-	 */
-	leave
-	RET
-.endm
-
 .section .noinstr.text, "ax"
 
 /**
@@ -320,10 +288,6 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_
 
 SYM_FUNC_END(__vmx_vcpu_run)
 
-SYM_FUNC_START(vmx_do_nmi_irqoff)
-	VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
-SYM_FUNC_END(vmx_do_nmi_irqoff)
-
 #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
 
 /**
@@ -375,13 +339,3 @@ SYM_FUNC_START(vmread_error_trampoline)
 	RET
 SYM_FUNC_END(vmread_error_trampoline)
 #endif
-
-.section .text, "ax"
-
-#ifndef CONFIG_X86_FRED
-
-SYM_FUNC_START(vmx_do_interrupt_irqoff)
-	VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
-SYM_FUNC_END(vmx_do_interrupt_irqoff)
-
-#endif
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7083,9 +7083,6 @@ void vmx_load_eoi_exitmap(struct kvm_vcp
 	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
 }
 
-void vmx_do_interrupt_irqoff(unsigned long entry);
-void vmx_do_nmi_irqoff(void);
-
 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -7127,17 +7124,9 @@ static void handle_external_interrupt_ir
 	    "unexpected VM-Exit interrupt info: 0x%x", intr_info))
 		return;
 
-	/*
-	 * Invoke the kernel's IRQ handler for the vector.  Use the FRED path
-	 * when it's available even if FRED isn't fully enabled, e.g. even if
-	 * FRED isn't supported in hardware, in order to avoid the indirect
-	 * CALL in the non-FRED path.
-	 */
+	/* For the IRQ to the core kernel for processing. */
 	kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
-	if (IS_ENABLED(CONFIG_X86_FRED))
-		fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
-	else
-		vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
+	x86_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
 	kvm_after_interrupt(vcpu);
 
 	vcpu->arch.at_instruction_boundary = true;
@@ -7447,10 +7436,7 @@ noinstr void vmx_handle_nmi(struct kvm_v
 		return;
 
 	kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
-	if (cpu_feature_enabled(X86_FEATURE_FRED))
-		fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
-	else
-		vmx_do_nmi_irqoff();
+	x86_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
 	kvm_after_interrupt(vcpu);
 }
 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred()
  2026-04-23 15:56 [PATCH 0/2] x86/kvm/vmx: Fix VMX interrupt injection vs hrtimer_rearm_deferred() Peter Zijlstra
  2026-04-23 15:56 ` [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core Peter Zijlstra
@ 2026-04-23 15:56 ` Peter Zijlstra
  1 sibling, 0 replies; 8+ messages in thread
From: Peter Zijlstra @ 2026-04-23 15:56 UTC (permalink / raw)
  To: tglx
  Cc: linux-kernel, peterz, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

Vishal reported that KVM unit test 'x2apic' started failing after commit
0e98eb14814e ("entry: Prepare for deferred hrtimer rearming").

The reason is that KVM/VMX is injecting interrupts while it has IRQs disabled,
for a context that will enable IRQs, this means that regs->flags.X86_EFLAGS_IF
== 0 and the irqentry_exit() will not DTRT.

Notably, irqentry_exit() must not call hrtimer_rearm_deferred() when the return
context does not have IF set, because this will cause problems vs NMIs.

Therefore, fix up the state after the injection.

Fixes: 0e98eb14814e ("entry: Prepare for deferred hrtimer rearming")
Reported-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
Closes: https://lore.kernel.org/r/70cd3e97fbb796e2eb2ff8cd4b7614ada05a5f24.camel%40intel.com
Suggested-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
---
 arch/x86/entry/common.c |   13 +++++++++++++
 1 file changed, 13 insertions(+)

--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -2,6 +2,7 @@
 
 #include <linux/entry-common.h>
 #include <linux/kvm_types.h>
+#include <linux/hrtimer_rearm.h>
 #include <asm/fred.h>
 #include <asm/desc.h>
 
@@ -27,6 +28,18 @@ noinstr void x86_entry_from_kvm(unsigned
 #else
 		idt_entry_from_kvm(vector);
 #endif
+		/*
+		 * Strictly speaking, only the NMI path requires noinstr.
+		 */
+		instrumentation_begin();
+		/*
+		 * KVM/VMX will dispatch from IRQ-disabled but for a context
+		 * that will have IRQs-enabled. This confuses the entry code
+		 * and it will not have reprogrammed the timer. Do so now.
+		 */
+		hrtimer_rearm_deferred();
+		instrumentation_end();
+
 		return;
 	}
 



^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2026-05-01 20:37 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-04-23 15:56 [PATCH 0/2] x86/kvm/vmx: Fix VMX interrupt injection vs hrtimer_rearm_deferred() Peter Zijlstra
2026-04-23 15:56 ` [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core Peter Zijlstra
2026-04-23 17:54   ` Xin Li
2026-04-28  9:43   ` Binbin Wu
2026-04-28 11:25     ` Paolo Bonzini
2026-05-01 20:31       ` Peter Zijlstra
2026-05-01 20:37   ` [PATCH v2 " Peter Zijlstra
2026-04-23 15:56 ` [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred() Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox