Kernel KVM virtualization development
 help / color / mirror / Atom feed
* [PATCH 0/2] x86/kvm/vmx: Fix VMX interrupt injection vs hrtimer_rearm_deferred()
@ 2026-04-23 15:56 Peter Zijlstra
  2026-04-23 15:56 ` [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core Peter Zijlstra
  2026-04-23 15:56 ` [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred() Peter Zijlstra
  0 siblings, 2 replies; 28+ messages in thread
From: Peter Zijlstra @ 2026-04-23 15:56 UTC (permalink / raw)
  To: tglx
  Cc: linux-kernel, peterz, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

Hi,

Nicely split out in two patches, but otherwise unchanged (except for a small
comment).


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-04-23 15:56 [PATCH 0/2] x86/kvm/vmx: Fix VMX interrupt injection vs hrtimer_rearm_deferred() Peter Zijlstra
@ 2026-04-23 15:56 ` Peter Zijlstra
  2026-04-23 17:54   ` Xin Li
                     ` (3 more replies)
  2026-04-23 15:56 ` [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred() Peter Zijlstra
  1 sibling, 4 replies; 28+ messages in thread
From: Peter Zijlstra @ 2026-04-23 15:56 UTC (permalink / raw)
  To: tglx
  Cc: linux-kernel, peterz, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

Move the VMX interrupt dispatch magic into the x86 core code. This
isolates KVM from the FRED/IDT decisions and reduces the amount of
EXPORT_SYMBOL_FOR_KVM().

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
---
 arch/x86/entry/Makefile             |    2 -
 arch/x86/entry/common.c             |   48 ++++++++++++++++++++++++++++++++++++
 arch/x86/entry/entry.S              |   46 ++++++++++++++++++++++++++++++++++
 arch/x86/entry/entry_64_fred.S      |    1 
 arch/x86/include/asm/desc.h         |    4 +++
 arch/x86/include/asm/desc_defs.h    |    2 -
 arch/x86/include/asm/entry-common.h |    2 +
 arch/x86/include/asm/fred.h         |    1 
 arch/x86/include/asm/idtentry.h     |   11 --------
 arch/x86/kernel/idt.c               |   13 +++++++++
 arch/x86/kernel/nmi.c               |    8 ------
 arch/x86/kvm/vmx/vmenter.S          |   46 ----------------------------------
 arch/x86/kvm/vmx/vmx.c              |   20 ++-------------
 13 files changed, 118 insertions(+), 86 deletions(-)

--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -13,7 +13,7 @@ CFLAGS_REMOVE_syscall_64.o	= $(CC_FLAGS_
 CFLAGS_syscall_32.o		+= -fno-stack-protector
 CFLAGS_syscall_64.o		+= -fno-stack-protector
 
-obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o
+obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o common.o
 
 obj-y				+= vdso/
 obj-y				+= vsyscall/
--- /dev/null
+++ b/arch/x86/entry/common.c
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/entry-common.h>
+#include <linux/kvm_types.h>
+#include <asm/fred.h>
+#include <asm/desc.h>
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+/*
+ * On VMX, NMIs and IRQs (as configured by KVM) are acknowledge by hardware as
+ * part of the VM-Exit, i.e. the event itself is consumed as part the VM-Exit.
+ * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
+ * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
+ * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
+ * the VM-Exit is held pending until it's unblocked in the host.
+ */
+noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
+{
+	if (event_type == EVENT_TYPE_EXTINT) {
+#ifdef CONFIG_X86_64
+		/*
+		 * Use FRED dispatch, even when running IDT. The dispatch
+		 * tables are kept in sync between FRED and IDT, and the FRED
+		 * dispatch works well with CFI.
+		 */
+		fred_entry_from_kvm(event_type, vector);
+#else
+		idt_entry_from_kvm(vector);
+#endif
+		return;
+	}
+
+	WARN_ON_ONCE(event_type != EVENT_TYPE_NMI);
+
+#ifdef CONFIG_X86_64
+	if (cpu_feature_enabled(X86_FEATURE_FRED))
+		return fred_entry_from_kvm(event_type, vector);
+#endif
+
+	/*
+	 * Notably, we must use IDT dispatch for NMI when running in IDT mode.
+	 * The FRED NMI context is significantly different and will not work
+	 * right (speficially FRED fixed the NMI recursion issue).
+	 */
+	idt_entry_from_kvm(vector);
+}
+EXPORT_SYMBOL_FOR_KVM(x86_entry_from_kvm);
+#endif
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -75,3 +75,49 @@ THUNK warn_thunk_thunk, __warn_thunk
 #if defined(CONFIG_STACKPROTECTOR) && defined(CONFIG_SMP)
 EXPORT_SYMBOL(__ref_stack_chk_guard);
 #endif
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+.macro IDT_DO_EVENT_IRQOFF call_insn call_target
+	/*
+	 * Unconditionally create a stack frame, getting the correct RSP on the
+	 * stack (for x86-64) would take two instructions anyways, and RBP can
+	 * be used to restore RSP to make objtool happy (see below).
+	 */
+	push %_ASM_BP
+	mov %_ASM_SP, %_ASM_BP
+
+#ifdef CONFIG_X86_64
+	/*
+	 * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
+	 * creating the synthetic interrupt stack frame for the IRQ/NMI.
+	 */
+	and  $-16, %rsp
+	push $__KERNEL_DS
+	push %rbp
+#endif
+	pushf
+	push $__KERNEL_CS
+	\call_insn \call_target
+
+	/*
+	 * "Restore" RSP from RBP, even though IRET has already unwound RSP to
+	 * the correct value.  objtool doesn't know the callee will IRET and,
+	 * without the explicit restore, thinks the stack is getting walloped.
+	 * Using an unwind hint is problematic due to x86-64's dynamic alignment.
+	 */
+	leave
+	RET
+.endm
+
+.pushsection .text, "ax"
+SYM_FUNC_START(idt_do_interrupt_irqoff)
+	IDT_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
+SYM_FUNC_END(idt_do_interrupt_irqoff)
+.popsection
+
+.pushsection .noinstr.text, "ax"
+SYM_FUNC_START(idt_do_nmi_irqoff)
+	IDT_DO_EVENT_IRQOFF call asm_exc_nmi
+SYM_FUNC_END(idt_do_nmi_irqoff)
+.popsection
+#endif
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -147,5 +147,4 @@ SYM_FUNC_START(asm_fred_entry_from_kvm)
 	RET
 
 SYM_FUNC_END(asm_fred_entry_from_kvm)
-EXPORT_SYMBOL_FOR_KVM(asm_fred_entry_from_kvm);
 #endif
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -438,6 +438,10 @@ extern void idt_setup_traps(void);
 extern void idt_setup_apic_and_irq_gates(void);
 extern bool idt_is_f00f_address(unsigned long address);
 
+extern void idt_do_interrupt_irqoff(unsigned int vector);
+extern void idt_do_nmi_irqoff(void);
+extern void idt_entry_from_kvm(unsigned int vector);
+
 #ifdef CONFIG_X86_64
 extern void idt_setup_early_pf(void);
 #else
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -145,7 +145,7 @@ struct gate_struct {
 typedef struct gate_struct gate_desc;
 
 #ifndef _SETUP
-static inline unsigned long gate_offset(const gate_desc *g)
+static __always_inline unsigned long gate_offset(const gate_desc *g)
 {
 #ifdef CONFIG_X86_64
 	return g->offset_low | ((unsigned long)g->offset_middle << 16) |
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -97,4 +97,6 @@ static __always_inline void arch_exit_to
 }
 #define arch_exit_to_user_mode arch_exit_to_user_mode
 
+extern void x86_entry_from_kvm(unsigned int entry_type, unsigned int vector);
+
 #endif
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -110,7 +110,6 @@ static __always_inline unsigned long fre
 static inline void cpu_init_fred_exceptions(void) { }
 static inline void cpu_init_fred_rsps(void) { }
 static inline void fred_complete_exception_setup(void) { }
-static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { }
 static inline void fred_sync_rsp0(unsigned long rsp0) { }
 static inline void fred_update_rsp0(void) { }
 #endif /* CONFIG_X86_FRED */
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -633,17 +633,6 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC,	xenpv_
 #endif
 
 /* NMI */
-
-#if IS_ENABLED(CONFIG_KVM_INTEL)
-/*
- * Special entry point for VMX which invokes this on the kernel stack, even for
- * 64-bit, i.e. without using an IST.  asm_exc_nmi() requires an IST to work
- * correctly vs. the NMI 'executing' marker.  Used for 32-bit kernels as well
- * to avoid more ifdeffery.
- */
-DECLARE_IDTENTRY(X86_TRAP_NMI,		exc_nmi_kvm_vmx);
-#endif
-
 DECLARE_IDTENTRY_NMI(X86_TRAP_NMI,	exc_nmi);
 #ifdef CONFIG_XEN_PV
 DECLARE_IDTENTRY_RAW(X86_TRAP_NMI,	xenpv_exc_nmi);
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -268,6 +268,19 @@ void __init idt_setup_early_pf(void)
 }
 #endif
 
+noinstr void idt_entry_from_kvm(unsigned int vector)
+{
+	if (vector == NMI_VECTOR)
+		return idt_do_nmi_irqoff();
+
+	/*
+	 * Only the NMI path requires noinstr.
+	 */
+	instrumentation_begin();
+	idt_do_interrupt_irqoff(gate_offset(idt_table + vector));
+	instrumentation_end();
+}
+
 static void __init idt_map_in_cea(void)
 {
 	/*
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -609,14 +609,6 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
 		goto nmi_restart;
 }
 
-#if IS_ENABLED(CONFIG_KVM_INTEL)
-DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx)
-{
-	exc_nmi(regs);
-}
-EXPORT_SYMBOL_FOR_KVM(asm_exc_nmi_kvm_vmx);
-#endif
-
 #ifdef CONFIG_NMI_CHECK_CPU
 
 static char *nmi_check_stall_msg[] = {
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -31,38 +31,6 @@
 #define VCPU_R15	__VCPU_REGS_R15 * WORD_SIZE
 #endif
 
-.macro VMX_DO_EVENT_IRQOFF call_insn call_target
-	/*
-	 * Unconditionally create a stack frame, getting the correct RSP on the
-	 * stack (for x86-64) would take two instructions anyways, and RBP can
-	 * be used to restore RSP to make objtool happy (see below).
-	 */
-	push %_ASM_BP
-	mov %_ASM_SP, %_ASM_BP
-
-#ifdef CONFIG_X86_64
-	/*
-	 * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
-	 * creating the synthetic interrupt stack frame for the IRQ/NMI.
-	 */
-	and  $-16, %rsp
-	push $__KERNEL_DS
-	push %rbp
-#endif
-	pushf
-	push $__KERNEL_CS
-	\call_insn \call_target
-
-	/*
-	 * "Restore" RSP from RBP, even though IRET has already unwound RSP to
-	 * the correct value.  objtool doesn't know the callee will IRET and,
-	 * without the explicit restore, thinks the stack is getting walloped.
-	 * Using an unwind hint is problematic due to x86-64's dynamic alignment.
-	 */
-	leave
-	RET
-.endm
-
 .section .noinstr.text, "ax"
 
 /**
@@ -320,10 +288,6 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_
 
 SYM_FUNC_END(__vmx_vcpu_run)
 
-SYM_FUNC_START(vmx_do_nmi_irqoff)
-	VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
-SYM_FUNC_END(vmx_do_nmi_irqoff)
-
 #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
 
 /**
@@ -375,13 +339,3 @@ SYM_FUNC_START(vmread_error_trampoline)
 	RET
 SYM_FUNC_END(vmread_error_trampoline)
 #endif
-
-.section .text, "ax"
-
-#ifndef CONFIG_X86_FRED
-
-SYM_FUNC_START(vmx_do_interrupt_irqoff)
-	VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
-SYM_FUNC_END(vmx_do_interrupt_irqoff)
-
-#endif
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7083,9 +7083,6 @@ void vmx_load_eoi_exitmap(struct kvm_vcp
 	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
 }
 
-void vmx_do_interrupt_irqoff(unsigned long entry);
-void vmx_do_nmi_irqoff(void);
-
 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -7127,17 +7124,9 @@ static void handle_external_interrupt_ir
 	    "unexpected VM-Exit interrupt info: 0x%x", intr_info))
 		return;
 
-	/*
-	 * Invoke the kernel's IRQ handler for the vector.  Use the FRED path
-	 * when it's available even if FRED isn't fully enabled, e.g. even if
-	 * FRED isn't supported in hardware, in order to avoid the indirect
-	 * CALL in the non-FRED path.
-	 */
+	/* For the IRQ to the core kernel for processing. */
 	kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
-	if (IS_ENABLED(CONFIG_X86_FRED))
-		fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
-	else
-		vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
+	x86_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
 	kvm_after_interrupt(vcpu);
 
 	vcpu->arch.at_instruction_boundary = true;
@@ -7447,10 +7436,7 @@ noinstr void vmx_handle_nmi(struct kvm_v
 		return;
 
 	kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
-	if (cpu_feature_enabled(X86_FEATURE_FRED))
-		fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
-	else
-		vmx_do_nmi_irqoff();
+	x86_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
 	kvm_after_interrupt(vcpu);
 }
 



^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred()
  2026-04-23 15:56 [PATCH 0/2] x86/kvm/vmx: Fix VMX interrupt injection vs hrtimer_rearm_deferred() Peter Zijlstra
  2026-04-23 15:56 ` [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core Peter Zijlstra
@ 2026-04-23 15:56 ` Peter Zijlstra
  2026-05-11 12:59   ` David Woodhouse
                     ` (2 more replies)
  1 sibling, 3 replies; 28+ messages in thread
From: Peter Zijlstra @ 2026-04-23 15:56 UTC (permalink / raw)
  To: tglx
  Cc: linux-kernel, peterz, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

Vishal reported that KVM unit test 'x2apic' started failing after commit
0e98eb14814e ("entry: Prepare for deferred hrtimer rearming").

The reason is that KVM/VMX injects interrupts while it has IRQs disabled, but
for a context that will enable IRQs. This means that regs->flags.X86_EFLAGS_IF
== 0 and irqentry_exit() will not do the right thing.

Notably, irqentry_exit() must not call hrtimer_rearm_deferred() when the return
context does not have IF set, because this will cause problems vs NMIs.

Therefore, fix up the state after the injection.

Fixes: 0e98eb14814e ("entry: Prepare for deferred hrtimer rearming")
Reported-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
Closes: https://lore.kernel.org/r/70cd3e97fbb796e2eb2ff8cd4b7614ada05a5f24.camel%40intel.com
Suggested-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
---
 arch/x86/entry/common.c |   13 +++++++++++++
 1 file changed, 13 insertions(+)

--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -2,6 +2,7 @@
 
 #include <linux/entry-common.h>
 #include <linux/kvm_types.h>
+#include <linux/hrtimer_rearm.h>
 #include <asm/fred.h>
 #include <asm/desc.h>
 
@@ -27,6 +28,18 @@ noinstr void x86_entry_from_kvm(unsigned
 #else
 		idt_entry_from_kvm(vector);
 #endif
+		/*
+		 * Strictly speaking, only the NMI path requires noinstr.
+		 */
+		instrumentation_begin();
+		/*
+		 * KVM/VMX will dispatch from IRQ-disabled but for a context
+		 * that will have IRQs-enabled. This confuses the entry code
+		 * and it will not have reprogrammed the timer. Do so now.
+		 */
+		hrtimer_rearm_deferred();
+		instrumentation_end();
+
 		return;
 	}
 



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-04-23 15:56 ` [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core Peter Zijlstra
@ 2026-04-23 17:54   ` Xin Li
  2026-04-28  9:43   ` Binbin Wu
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 28+ messages in thread
From: Xin Li @ 2026-04-23 17:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

Few nits below, but

Reviewed-by: Xin Li <xin@zytor.com>


> On Apr 23, 2026, at 8:56 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> Move the VMX interrupt dispatch magic into the x86 core code. This
> isolates KVM from the FRED/IDT decisions and reduces the amount of
> EXPORT_SYMBOL_FOR_KVM().
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
> --- /dev/null
> +++ b/arch/x86/entry/common.c
> @@ -0,0 +1,48 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#include <linux/entry-common.h>
> +#include <linux/kvm_types.h>
> +#include <asm/fred.h>
> +#include <asm/desc.h>

Swap the two header inclusion order?

> +
> +#if IS_ENABLED(CONFIG_KVM_INTEL)
> +/*
> + * On VMX, NMIs and IRQs (as configured by KVM) are acknowledge by hardware as
> + * part of the VM-Exit, i.e. the event itself is consumed as part the VM-Exit.
> + * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
> + * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
> + * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
> + * the VM-Exit is held pending until it's unblocked in the host.
> + */
> +noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)

Is u32 better than unsigned int, as it is closer to the hardware definition?

Currently, the event type and event vector are defined as 4-bit and 8-bit
fields, respectively; and they can be readily expanded to 8 and 16 bits by
utilizing adjacent reserved bits.

> +{
> + if (event_type == EVENT_TYPE_EXTINT) {
> +#ifdef CONFIG_X86_64
> + /*
> + * Use FRED dispatch, even when running IDT. The dispatch
> + * tables are kept in sync between FRED and IDT, and the FRED
> + * dispatch works well with CFI.
> + */
> + fred_entry_from_kvm(event_type, vector);

Then should we rename the "fred_" prefix to, say, "x86_64_"?

Sorry, naming is hard for me ;)

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-04-23 15:56 ` [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core Peter Zijlstra
  2026-04-23 17:54   ` Xin Li
@ 2026-04-28  9:43   ` Binbin Wu
  2026-04-28 11:25     ` Paolo Bonzini
  2026-05-01 20:37   ` [PATCH v2 " Peter Zijlstra
  2026-05-08  9:18   ` [PATCH v3 " Peter Zijlstra
  3 siblings, 1 reply; 28+ messages in thread
From: Binbin Wu @ 2026-04-28  9:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini



On 4/23/2026 11:56 PM, Peter Zijlstra wrote:
> Move the VMX interrupt dispatch magic into the x86 core code. This
> isolates KVM from the FRED/IDT decisions and reduces the amount of
> EXPORT_SYMBOL_FOR_KVM().
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
> ---
>  arch/x86/entry/Makefile             |    2 -
>  arch/x86/entry/common.c             |   48 ++++++++++++++++++++++++++++++++++++
>  arch/x86/entry/entry.S              |   46 ++++++++++++++++++++++++++++++++++
>  arch/x86/entry/entry_64_fred.S      |    1 
>  arch/x86/include/asm/desc.h         |    4 +++
>  arch/x86/include/asm/desc_defs.h    |    2 -
>  arch/x86/include/asm/entry-common.h |    2 +
>  arch/x86/include/asm/fred.h         |    1 
>  arch/x86/include/asm/idtentry.h     |   11 --------
>  arch/x86/kernel/idt.c               |   13 +++++++++
>  arch/x86/kernel/nmi.c               |    8 ------
>  arch/x86/kvm/vmx/vmenter.S          |   46 ----------------------------------
>  arch/x86/kvm/vmx/vmx.c              |   20 ++-------------
>  13 files changed, 118 insertions(+), 86 deletions(-)
> 
> --- a/arch/x86/entry/Makefile
> +++ b/arch/x86/entry/Makefile
> @@ -13,7 +13,7 @@ CFLAGS_REMOVE_syscall_64.o	= $(CC_FLAGS_
>  CFLAGS_syscall_32.o		+= -fno-stack-protector
>  CFLAGS_syscall_64.o		+= -fno-stack-protector
>  
> -obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o
> +obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o common.o
>  
>  obj-y				+= vdso/
>  obj-y				+= vsyscall/
> --- /dev/null
> +++ b/arch/x86/entry/common.c
> @@ -0,0 +1,48 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#include <linux/entry-common.h>
> +#include <linux/kvm_types.h>
> +#include <asm/fred.h>
> +#include <asm/desc.h>
> +
> +#if IS_ENABLED(CONFIG_KVM_INTEL)
> +/*
> + * On VMX, NMIs and IRQs (as configured by KVM) are acknowledge by hardware as

s/acknowledge/acknowledged
 
> + * part of the VM-Exit, i.e. the event itself is consumed as part the VM-Exit.
> + * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
> + * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
> + * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
> + * the VM-Exit is held pending until it's unblocked in the host.
> + */

[...]

> -
> -#if IS_ENABLED(CONFIG_KVM_INTEL)
> -/*
> - * Special entry point for VMX which invokes this on the kernel stack, even for
> - * 64-bit, i.e. without using an IST.  asm_exc_nmi() requires an IST to work

Although it's being removed, I guess what it says is still true?

It says asm_exc_nmi() requires an IST to work correctly, and this is the new
path for handling NMIs when FRED is disabled:

idt_entry_from_kvm
    idt_do_nmi_irqoff
        IDT_DO_EVENT_IRQOFF call asm_exc_nmi
            ...
            call asm_exc_nmi

It seems the stack before calling asm_exc_nmi is not an IST?
Does it matter?

> - * correctly vs. the NMI 'executing' marker.  Used for 32-bit kernels as well
> - * to avoid more ifdeffery.
> - */
> -DECLARE_IDTENTRY(X86_TRAP_NMI,		exc_nmi_kvm_vmx);
> -#endif
> -
>  DECLARE_IDTENTRY_NMI(X86_TRAP_NMI,	exc_nmi);
>  #ifdef CONFIG_XEN_PV
>  DECLARE_IDTENTRY_RAW(X86_TRAP_NMI,	xenpv_exc_nmi);

[...]

> @@ -7127,17 +7124,9 @@ static void handle_external_interrupt_ir
>  	    "unexpected VM-Exit interrupt info: 0x%x", intr_info))
>  		return;
>  
> -	/*
> -	 * Invoke the kernel's IRQ handler for the vector.  Use the FRED path
> -	 * when it's available even if FRED isn't fully enabled, e.g. even if
> -	 * FRED isn't supported in hardware, in order to avoid the indirect
> -	 * CALL in the non-FRED path.
> -	 */
> +	/* For the IRQ to the core kernel for processing. */

For -> Forward?

>  	kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
> -	if (IS_ENABLED(CONFIG_X86_FRED))
> -		fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
> -	else
> -		vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
> +	x86_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
>  	kvm_after_interrupt(vcpu);
>  
>  	vcpu->arch.at_instruction_boundary = true;
> @@ -7447,10 +7436,7 @@ noinstr void vmx_handle_nmi(struct kvm_v
>  		return;
>  
>  	kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
> -	if (cpu_feature_enabled(X86_FEATURE_FRED))
> -		fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
> -	else
> -		vmx_do_nmi_irqoff();
> +	x86_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
>  	kvm_after_interrupt(vcpu);
>  }
>  
> 
> 
> 


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-04-28  9:43   ` Binbin Wu
@ 2026-04-28 11:25     ` Paolo Bonzini
  2026-05-01 20:31       ` Peter Zijlstra
  0 siblings, 1 reply; 28+ messages in thread
From: Paolo Bonzini @ 2026-04-28 11:25 UTC (permalink / raw)
  To: Binbin Wu, Peter Zijlstra
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

On 4/28/26 11:43, Binbin Wu wrote:
> 
> 
> On 4/23/2026 11:56 PM, Peter Zijlstra wrote:
>> Move the VMX interrupt dispatch magic into the x86 core code. This
>> isolates KVM from the FRED/IDT decisions and reduces the amount of
>> EXPORT_SYMBOL_FOR_KVM().
>>
>> Suggested-by: Sean Christopherson <seanjc@google.com>
>> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
>> ---
>>   arch/x86/entry/Makefile             |    2 -
>>   arch/x86/entry/common.c             |   48 ++++++++++++++++++++++++++++++++++++
>>   arch/x86/entry/entry.S              |   46 ++++++++++++++++++++++++++++++++++
>>   arch/x86/entry/entry_64_fred.S      |    1
>>   arch/x86/include/asm/desc.h         |    4 +++
>>   arch/x86/include/asm/desc_defs.h    |    2 -
>>   arch/x86/include/asm/entry-common.h |    2 +
>>   arch/x86/include/asm/fred.h         |    1
>>   arch/x86/include/asm/idtentry.h     |   11 --------
>>   arch/x86/kernel/idt.c               |   13 +++++++++
>>   arch/x86/kernel/nmi.c               |    8 ------
>>   arch/x86/kvm/vmx/vmenter.S          |   46 ----------------------------------
>>   arch/x86/kvm/vmx/vmx.c              |   20 ++-------------
>>   13 files changed, 118 insertions(+), 86 deletions(-)
>>
>> --- a/arch/x86/entry/Makefile
>> +++ b/arch/x86/entry/Makefile
>> @@ -13,7 +13,7 @@ CFLAGS_REMOVE_syscall_64.o	= $(CC_FLAGS_
>>   CFLAGS_syscall_32.o		+= -fno-stack-protector
>>   CFLAGS_syscall_64.o		+= -fno-stack-protector
>>   
>> -obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o
>> +obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o common.o
>>   
>>   obj-y				+= vdso/
>>   obj-y				+= vsyscall/
>> --- /dev/null
>> +++ b/arch/x86/entry/common.c
>> @@ -0,0 +1,48 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +
>> +#include <linux/entry-common.h>
>> +#include <linux/kvm_types.h>
>> +#include <asm/fred.h>
>> +#include <asm/desc.h>
>> +
>> +#if IS_ENABLED(CONFIG_KVM_INTEL)
>> +/*
>> + * On VMX, NMIs and IRQs (as configured by KVM) are acknowledge by hardware as
> 
> s/acknowledge/acknowledged
>   
>> + * part of the VM-Exit, i.e. the event itself is consumed as part the VM-Exit.
>> + * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
>> + * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
>> + * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
>> + * the VM-Exit is held pending until it's unblocked in the host.
>> + */
> 
> [...]
> 
>> -
>> -#if IS_ENABLED(CONFIG_KVM_INTEL)
>> -/*
>> - * Special entry point for VMX which invokes this on the kernel stack, even for
>> - * 64-bit, i.e. without using an IST.  asm_exc_nmi() requires an IST to work
> 
> Although it's being removed, I guess what it says is still true?
> 
> It says asm_exc_nmi() requires an IST to work correctly, and the new path for
> handling NMI when FRED is disabled.
> 
> idt_entry_from_kvm
>      idt_do_nmi_irqoff
>          IDT_DO_EVENT_IRQOFF call asm_exc_nmi
>              ...
>              call asm_exc_nmi
> 
> It seems the stack before calling asm_exc_nmi is not an IST?
> Does it matter?

I think it does, the IST is needed because of all the stuff to detect 
recursive NMIs.  So asm_exc_nmi_kvm_vmx needs to remain.

By the way, here:

> +	/*
> +	 * Notably, we must use IDT dispatch for NMI when running in IDT mode.
> +	 * The FRED NMI context is significantly different and will not work
> +	 * right (speficially FRED fixed the NMI recursion issue).
> +	 */

It's even more important to note that NMIs need an IRET in order to 
unblock further NMIs.  This is even more important than the recursion 
issue, which does not affect KVM's non-IST entry into the NMI handler, 
and is the real reason to use IDT_DO_EVENT_IRQOFF to build the interrupt 
stack frame for NMIs.

Paolo


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-04-28 11:25     ` Paolo Bonzini
@ 2026-05-01 20:31       ` Peter Zijlstra
  0 siblings, 0 replies; 28+ messages in thread
From: Peter Zijlstra @ 2026-05-01 20:31 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Binbin Wu, tglx, linux-kernel, Sean Christopherson, Jim Mattson,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

On Tue, Apr 28, 2026 at 01:25:58PM +0200, Paolo Bonzini wrote:

> > > +#if IS_ENABLED(CONFIG_KVM_INTEL)
> > > +/*
> > > + * On VMX, NMIs and IRQs (as configured by KVM) are acknowledge by hardware as
> > 
> > s/acknowledge/acknowledged
> > > + * part of the VM-Exit, i.e. the event itself is consumed as part the VM-Exit.
> > > + * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
> > > + * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
> > > + * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
> > > + * the VM-Exit is held pending until it's unblocked in the host.
> > > + */
> > 
> > [...]
> > 
> > > -
> > > -#if IS_ENABLED(CONFIG_KVM_INTEL)
> > > -/*
> > > - * Special entry point for VMX which invokes this on the kernel stack, even for
> > > - * 64-bit, i.e. without using an IST.  asm_exc_nmi() requires an IST to work
> > 
> > Although it's being removed, I guess what it says is still true?
> > 
> > It says asm_exc_nmi() requires an IST to work correctly, and the new path for
> > handling NMI when FRED is disabled.
> > 
> > idt_entry_from_kvm
> >      idt_do_nmi_irqoff
> >          IDT_DO_EVENT_IRQOFF call asm_exc_nmi
> >              ...
> >              call asm_exc_nmi
> > 
> > It seems the stack before calling asm_exc_nmi is not an IST?
> > Does it matter?
> 
> I think it does, the IST is needed because of all the stuff to detect
> recursive NMIs.  So asm_exc_nmi_kvm_vmx needs to remain.
> 
> By the way, here:
> 
> > +	/*
> > +	 * Notably, we must use IDT dispatch for NMI when running in IDT mode.
> > +	 * The FRED NMI context is significantly different and will not work
> > +	 * right (speficially FRED fixed the NMI recursion issue).
> > +	 */
> 
> It's even more important to note that NMIs need an IRET in order to unblock
> further NMIs.  This is even more important than the recursion issue, which
> does not affect KVM's non-IST entry into the NMI handler, and is the real
> reason to use IDT_DO_EVENT_IRQOFF to build the interrupt stack frame for
> NMIs.

Durr, I missed that: DECLARE_IDTENTRY_NMI != DECLARE_IDTENTRY_RAW

Let me go rectify that.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v2 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-04-23 15:56 ` [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core Peter Zijlstra
  2026-04-23 17:54   ` Xin Li
  2026-04-28  9:43   ` Binbin Wu
@ 2026-05-01 20:37   ` Peter Zijlstra
  2026-05-08  2:54     ` Yan Zhao
  2026-05-08  6:09     ` Binbin Wu
  2026-05-08  9:18   ` [PATCH v3 " Peter Zijlstra
  3 siblings, 2 replies; 28+ messages in thread
From: Peter Zijlstra @ 2026-05-01 20:37 UTC (permalink / raw)
  To: tglx
  Cc: linux-kernel, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini


Move the VMX interrupt dispatch magic into the x86 core code. This
isolates KVM from the FRED/IDT decisions and reduces the amount of
EXPORT_SYMBOL_FOR_KVM().

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
---
 arch/x86/entry/Makefile             |    2 -
 arch/x86/entry/common.c             |   48 ++++++++++++++++++++++++++++++++++++
 arch/x86/entry/entry.S              |   46 ++++++++++++++++++++++++++++++++++
 arch/x86/entry/entry_64_fred.S      |    1 
 arch/x86/include/asm/desc.h         |    4 +++
 arch/x86/include/asm/desc_defs.h    |    2 -
 arch/x86/include/asm/entry-common.h |    2 +
 arch/x86/include/asm/fred.h         |    1 
 arch/x86/kernel/idt.c               |   13 +++++++++
 arch/x86/kernel/nmi.c               |    1 
 arch/x86/kvm/vmx/vmenter.S          |   46 ----------------------------------
 arch/x86/kvm/vmx/vmx.c              |   20 ++-------------
 12 files changed, 118 insertions(+), 68 deletions(-)

--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -13,7 +13,7 @@ CFLAGS_REMOVE_syscall_64.o	= $(CC_FLAGS_
 CFLAGS_syscall_32.o		+= -fno-stack-protector
 CFLAGS_syscall_64.o		+= -fno-stack-protector
 
-obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o
+obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o common.o
 
 obj-y				+= vdso/
 obj-y				+= vsyscall/
--- /dev/null
+++ b/arch/x86/entry/common.c
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/entry-common.h>
+#include <linux/kvm_types.h>
+#include <asm/fred.h>
+#include <asm/desc.h>
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+/*
+ * On VMX, NMIs and IRQs (as configured by KVM) are acknowledged by hardware as
+ * part of the VM-Exit, i.e. the event itself is consumed as part of the VM-Exit.
+ * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
+ * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
+ * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
+ * the VM-Exit is held pending until it's unblocked in the host.
+ */
+noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
+{
+	if (event_type == EVENT_TYPE_EXTINT) {
+#ifdef CONFIG_X86_64
+		/*
+		 * Use FRED dispatch, even when running IDT. The dispatch
+		 * tables are kept in sync between FRED and IDT, and the FRED
+		 * dispatch works well with CFI.
+		 */
+		fred_entry_from_kvm(event_type, vector);
+#else
+		idt_entry_from_kvm(vector);
+#endif
+		return;
+	}
+
+	WARN_ON_ONCE(event_type != EVENT_TYPE_NMI);
+
+#ifdef CONFIG_X86_64
+	if (cpu_feature_enabled(X86_FEATURE_FRED))
+		return fred_entry_from_kvm(event_type, vector);
+#endif
+
+	/*
+	 * Notably, we must use IDT dispatch for NMI when running in IDT mode.
+	 * The FRED NMI context is significantly different and will not work
+	 * right (specifically FRED fixed the NMI recursion issue).
+	 */
+	idt_entry_from_kvm(vector);
+}
+EXPORT_SYMBOL_FOR_KVM(x86_entry_from_kvm);
+#endif
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -75,3 +75,49 @@ THUNK warn_thunk_thunk, __warn_thunk
 #if defined(CONFIG_STACKPROTECTOR) && defined(CONFIG_SMP)
 EXPORT_SYMBOL(__ref_stack_chk_guard);
 #endif
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+.macro IDT_DO_EVENT_IRQOFF call_insn call_target
+	/*
+	 * Unconditionally create a stack frame, getting the correct RSP on the
+	 * stack (for x86-64) would take two instructions anyways, and RBP can
+	 * be used to restore RSP to make objtool happy (see below).
+	 */
+	push %_ASM_BP
+	mov %_ASM_SP, %_ASM_BP
+
+#ifdef CONFIG_X86_64
+	/*
+	 * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
+	 * creating the synthetic interrupt stack frame for the IRQ/NMI.
+	 */
+	and  $-16, %rsp
+	push $__KERNEL_DS
+	push %rbp
+#endif
+	pushf
+	push $__KERNEL_CS
+	\call_insn \call_target
+
+	/*
+	 * "Restore" RSP from RBP, even though IRET has already unwound RSP to
+	 * the correct value.  objtool doesn't know the callee will IRET and,
+	 * without the explicit restore, thinks the stack is getting walloped.
+	 * Using an unwind hint is problematic due to x86-64's dynamic alignment.
+	 */
+	leave
+	RET
+.endm
+
+.pushsection .text, "ax"
+SYM_FUNC_START(idt_do_interrupt_irqoff)
+	IDT_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
+SYM_FUNC_END(idt_do_interrupt_irqoff)
+.popsection
+
+.pushsection .noinstr.text, "ax"
+SYM_FUNC_START(idt_do_nmi_irqoff)
+	IDT_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
+SYM_FUNC_END(idt_do_nmi_irqoff)
+.popsection
+#endif
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -147,5 +147,4 @@ SYM_FUNC_START(asm_fred_entry_from_kvm)
 	RET
 
 SYM_FUNC_END(asm_fred_entry_from_kvm)
-EXPORT_SYMBOL_FOR_KVM(asm_fred_entry_from_kvm);
 #endif
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -438,6 +438,10 @@ extern void idt_setup_traps(void);
 extern void idt_setup_apic_and_irq_gates(void);
 extern bool idt_is_f00f_address(unsigned long address);
 
+extern void idt_do_interrupt_irqoff(unsigned int vector);
+extern void idt_do_nmi_irqoff(void);
+extern void idt_entry_from_kvm(unsigned int vector);
+
 #ifdef CONFIG_X86_64
 extern void idt_setup_early_pf(void);
 #else
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -145,7 +145,7 @@ struct gate_struct {
 typedef struct gate_struct gate_desc;
 
 #ifndef _SETUP
-static inline unsigned long gate_offset(const gate_desc *g)
+static __always_inline unsigned long gate_offset(const gate_desc *g)
 {
 #ifdef CONFIG_X86_64
 	return g->offset_low | ((unsigned long)g->offset_middle << 16) |
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -97,4 +97,6 @@ static __always_inline void arch_exit_to
 }
 #define arch_exit_to_user_mode arch_exit_to_user_mode
 
+extern void x86_entry_from_kvm(unsigned int entry_type, unsigned int vector);
+
 #endif
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -110,7 +110,6 @@ static __always_inline unsigned long fre
 static inline void cpu_init_fred_exceptions(void) { }
 static inline void cpu_init_fred_rsps(void) { }
 static inline void fred_complete_exception_setup(void) { }
-static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { }
 static inline void fred_sync_rsp0(unsigned long rsp0) { }
 static inline void fred_update_rsp0(void) { }
 #endif /* CONFIG_X86_FRED */
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -268,6 +268,19 @@ void __init idt_setup_early_pf(void)
 }
 #endif
 
+noinstr void idt_entry_from_kvm(unsigned int vector)
+{
+	if (vector == NMI_VECTOR)
+		return idt_do_nmi_irqoff();
+
+	/*
+	 * Only the NMI path requires noinstr.
+	 */
+	instrumentation_begin();
+	idt_do_interrupt_irqoff(gate_offset(idt_table + vector));
+	instrumentation_end();
+}
+
 static void __init idt_map_in_cea(void)
 {
 	/*
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -614,7 +614,6 @@ DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx)
 {
 	exc_nmi(regs);
 }
-EXPORT_SYMBOL_FOR_KVM(asm_exc_nmi_kvm_vmx);
 #endif
 
 #ifdef CONFIG_NMI_CHECK_CPU
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -31,38 +31,6 @@
 #define VCPU_R15	__VCPU_REGS_R15 * WORD_SIZE
 #endif
 
-.macro VMX_DO_EVENT_IRQOFF call_insn call_target
-	/*
-	 * Unconditionally create a stack frame, getting the correct RSP on the
-	 * stack (for x86-64) would take two instructions anyways, and RBP can
-	 * be used to restore RSP to make objtool happy (see below).
-	 */
-	push %_ASM_BP
-	mov %_ASM_SP, %_ASM_BP
-
-#ifdef CONFIG_X86_64
-	/*
-	 * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
-	 * creating the synthetic interrupt stack frame for the IRQ/NMI.
-	 */
-	and  $-16, %rsp
-	push $__KERNEL_DS
-	push %rbp
-#endif
-	pushf
-	push $__KERNEL_CS
-	\call_insn \call_target
-
-	/*
-	 * "Restore" RSP from RBP, even though IRET has already unwound RSP to
-	 * the correct value.  objtool doesn't know the callee will IRET and,
-	 * without the explicit restore, thinks the stack is getting walloped.
-	 * Using an unwind hint is problematic due to x86-64's dynamic alignment.
-	 */
-	leave
-	RET
-.endm
-
 .section .noinstr.text, "ax"
 
 /**
@@ -320,10 +288,6 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_
 
 SYM_FUNC_END(__vmx_vcpu_run)
 
-SYM_FUNC_START(vmx_do_nmi_irqoff)
-	VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
-SYM_FUNC_END(vmx_do_nmi_irqoff)
-
 #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
 
 /**
@@ -375,13 +339,3 @@ SYM_FUNC_START(vmread_error_trampoline)
 	RET
 SYM_FUNC_END(vmread_error_trampoline)
 #endif
-
-.section .text, "ax"
-
-#ifndef CONFIG_X86_FRED
-
-SYM_FUNC_START(vmx_do_interrupt_irqoff)
-	VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
-SYM_FUNC_END(vmx_do_interrupt_irqoff)
-
-#endif
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7083,9 +7083,6 @@ void vmx_load_eoi_exitmap(struct kvm_vcp
 	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
 }
 
-void vmx_do_interrupt_irqoff(unsigned long entry);
-void vmx_do_nmi_irqoff(void);
-
 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -7127,17 +7124,9 @@ static void handle_external_interrupt_ir
 	    "unexpected VM-Exit interrupt info: 0x%x", intr_info))
 		return;
 
-	/*
-	 * Invoke the kernel's IRQ handler for the vector.  Use the FRED path
-	 * when it's available even if FRED isn't fully enabled, e.g. even if
-	 * FRED isn't supported in hardware, in order to avoid the indirect
-	 * CALL in the non-FRED path.
-	 */
+	/* Forward the IRQ to the core kernel for processing. */
 	kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
-	if (IS_ENABLED(CONFIG_X86_FRED))
-		fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
-	else
-		vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
+	x86_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
 	kvm_after_interrupt(vcpu);
 
 	vcpu->arch.at_instruction_boundary = true;
@@ -7447,10 +7436,7 @@ noinstr void vmx_handle_nmi(struct kvm_v
 		return;
 
 	kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
-	if (cpu_feature_enabled(X86_FEATURE_FRED))
-		fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
-	else
-		vmx_do_nmi_irqoff();
+	x86_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
 	kvm_after_interrupt(vcpu);
 }
 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-05-01 20:37   ` [PATCH v2 " Peter Zijlstra
@ 2026-05-08  2:54     ` Yan Zhao
  2026-05-08  8:54       ` Peter Zijlstra
  2026-05-08  6:09     ` Binbin Wu
  1 sibling, 1 reply; 28+ messages in thread
From: Yan Zhao @ 2026-05-08  2:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

On Fri, May 01, 2026 at 10:37:17PM +0200, Peter Zijlstra wrote:
> --- /dev/null
> +++ b/arch/x86/entry/common.c
> @@ -0,0 +1,48 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#include <linux/entry-common.h>
> +#include <linux/kvm_types.h>
> +#include <asm/fred.h>
> +#include <asm/desc.h>
> +
> +#if IS_ENABLED(CONFIG_KVM_INTEL)
> +/*
> + * On VMX, NMIs and IRQs (as configured by KVM) are acknowledged by hardware as
> + * part of the VM-Exit, i.e. the event itself is consumed as part the VM-Exit.
> + * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
> + * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
> + * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
> + * the VM-Exit is held pending until it's unblocked in the host.
> + */
> +noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
> +{
> +	if (event_type == EVENT_TYPE_EXTINT) {
> +#ifdef CONFIG_X86_64
> +		/*
> +		 * Use FRED dispatch, even when running IDT. The dispatch
> +		 * tables are kept in sync between FRED and IDT, and the FRED
> +		 * dispatch works well with CFI.
> +		 */
> +		fred_entry_from_kvm(event_type, vector);
> +#else
> +		idt_entry_from_kvm(vector);
FYI:
I hit the error below with "make ARCH=i386 allnoconfig" or
"make ARCH=x86_64 allnoconfig".

ld: vmlinux.o: in function `idt_entry_from_kvm':
(.noinstr.text+0x1618): undefined reference to `idt_do_interrupt_irqoff'
ld: (.noinstr.text+0x161f): undefined reference to `idt_do_nmi_irqoff'
make[2]: *** [scripts/Makefile.vmlinux:72: vmlinux.unstripped] Error 1
make[1]: *** [/home/yan/kernel/tdx-upstream/tdx-kvm-upstream/Makefile:1335: vmlinux] Error 2
make: *** [Makefile:248: __sub-make] Error 2

So, I added the fix below:

diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 0e8fb61f63ff..7bcf1decc034 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -268,6 +268,7 @@ void __init idt_setup_early_pf(void)
 }
 #endif

+#if IS_ENABLED(CONFIG_KVM_INTEL)
 noinstr void idt_entry_from_kvm(unsigned int vector)
 {
        if (vector == NMI_VECTOR)
@@ -280,6 +281,7 @@ noinstr void idt_entry_from_kvm(unsigned int vector)
        idt_do_interrupt_irqoff(gate_offset(idt_table + vector));
        instrumentation_end();
 }
+#endif

 static void __init idt_map_in_cea(void)
 {


> +#endif
> +		return;
> +	}
> +
> +	WARN_ON_ONCE(event_type != EVENT_TYPE_NMI);
> +
> +#ifdef CONFIG_X86_64
> +	if (cpu_feature_enabled(X86_FEATURE_FRED))
> +		return fred_entry_from_kvm(event_type, vector);
> +#endif
> +
> +	/*
> +	 * Notably, we must use IDT dispatch for NMI when running in IDT mode.
> +	 * The FRED NMI context is significantly different and will not work
> +	 * right (speficially FRED fixed the NMI recursion issue).
> +	 */
> +	idt_entry_from_kvm(vector);
> +}
> +EXPORT_SYMBOL_FOR_KVM(x86_entry_from_kvm);
> +#endif

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-05-01 20:37   ` [PATCH v2 " Peter Zijlstra
  2026-05-08  2:54     ` Yan Zhao
@ 2026-05-08  6:09     ` Binbin Wu
  2026-05-08  8:53       ` Peter Zijlstra
  1 sibling, 1 reply; 28+ messages in thread
From: Binbin Wu @ 2026-05-08  6:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini



On 5/2/2026 4:37 AM, Peter Zijlstra wrote:
[...]

> --- /dev/null
> +++ b/arch/x86/entry/common.c
> @@ -0,0 +1,48 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#include <linux/entry-common.h>
> +#include <linux/kvm_types.h>
> +#include <asm/fred.h>
> +#include <asm/desc.h>
> +
> +#if IS_ENABLED(CONFIG_KVM_INTEL)
> +/*
> + * On VMX, NMIs and IRQs (as configured by KVM) are acknowledged by hardware as
> + * part of the VM-Exit, i.e. the event itself is consumed as part the VM-Exit.
> + * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
> + * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
> + * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
> + * the VM-Exit is held pending until it's unblocked in the host.
> + */
> +noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
> +{
> +	if (event_type == EVENT_TYPE_EXTINT) {
> +#ifdef CONFIG_X86_64
> +		/*
> +		 * Use FRED dispatch, even when running IDT. The dispatch
> +		 * tables are kept in sync between FRED and IDT, and the FRED
> +		 * dispatch works well with CFI.
> +		 */
> +		fred_entry_from_kvm(event_type, vector);
> +#else
> +		idt_entry_from_kvm(vector);
> +#endif
> +		return;
> +	}
> +
> +	WARN_ON_ONCE(event_type != EVENT_TYPE_NMI);

Not sure if it's OK to use WARN_ON_ONCE() here.
If the warning is triggered, it could unblock NMI due to handling of #UD.

> +
> +#ifdef CONFIG_X86_64
> +	if (cpu_feature_enabled(X86_FEATURE_FRED))
> +		return fred_entry_from_kvm(event_type, vector);
> +#endif
> +
> +	/*
> +	 * Notably, we must use IDT dispatch for NMI when running in IDT mode.
> +	 * The FRED NMI context is significantly different and will not work
> +	 * right (speficially FRED fixed the NMI recursion issue).
> +	 */
> +	idt_entry_from_kvm(vector);
> +}
> +EXPORT_SYMBOL_FOR_KVM(x86_entry_from_kvm);
> +#endif

[...]


> --- a/arch/x86/include/asm/desc.h
> +++ b/arch/x86/include/asm/desc.h
> @@ -438,6 +438,10 @@ extern void idt_setup_traps(void);
>  extern void idt_setup_apic_and_irq_gates(void);
>  extern bool idt_is_f00f_address(unsigned long address);
>  
> +extern void idt_do_interrupt_irqoff(unsigned int vector);

In idt_entry_from_kvm() below, gate_offset() returns 'unsigned long', but here
it uses 'unsigned int'. It's not safe since there is no guarantee that the
address is within 32 bits for x86_64.

Also, the argument is not a vector.

[...]
> +noinstr void idt_entry_from_kvm(unsigned int vector)
> +{
> +	if (vector == NMI_VECTOR)
> +		return idt_do_nmi_irqoff();
> +
> +	/*
> +	 * Only the NMI path requires noinstr.
> +	 */
> +	instrumentation_begin();
> +	idt_do_interrupt_irqoff(gate_offset(idt_table + vector));
> +	instrumentation_end();
> +}
> +



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-05-08  6:09     ` Binbin Wu
@ 2026-05-08  8:53       ` Peter Zijlstra
  2026-05-08  8:56         ` Binbin Wu
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2026-05-08  8:53 UTC (permalink / raw)
  To: Binbin Wu
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

On Fri, May 08, 2026 at 02:09:09PM +0800, Binbin Wu wrote:
> On 5/2/2026 4:37 AM, Peter Zijlstra wrote:

> > +noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
> > +{
> > +	if (event_type == EVENT_TYPE_EXTINT) {
> > +#ifdef CONFIG_X86_64
> > +		/*
> > +		 * Use FRED dispatch, even when running IDT. The dispatch
> > +		 * tables are kept in sync between FRED and IDT, and the FRED
> > +		 * dispatch works well with CFI.
> > +		 */
> > +		fred_entry_from_kvm(event_type, vector);
> > +#else
> > +		idt_entry_from_kvm(vector);
> > +#endif
> > +		return;
> > +	}
> > +
> > +	WARN_ON_ONCE(event_type != EVENT_TYPE_NMI);
> 
> Not sure if it's OK to use WARN_ON_ONCE() here.
> If the warning is triggered, it could unblock NMI due to handling of #UD.

If that ever triggers you've got bigger problems.

> > --- a/arch/x86/include/asm/desc.h
> > +++ b/arch/x86/include/asm/desc.h
> > @@ -438,6 +438,10 @@ extern void idt_setup_traps(void);
> >  extern void idt_setup_apic_and_irq_gates(void);
> >  extern bool idt_is_f00f_address(unsigned long address);
> >  
> > +extern void idt_do_interrupt_irqoff(unsigned int vector);
> 
> In idt_entry_from_kvm() below, gate_offset() returns 'unsigned long', but here
> it uses 'unsigned int'. It's not safe since there is no guarantee that the
> address is within 32 bits for x86_64.
> 

Right you are, 'unsigned long address' it is.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-05-08  2:54     ` Yan Zhao
@ 2026-05-08  8:54       ` Peter Zijlstra
  0 siblings, 0 replies; 28+ messages in thread
From: Peter Zijlstra @ 2026-05-08  8:54 UTC (permalink / raw)
  To: Yan Zhao
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

On Fri, May 08, 2026 at 10:54:00AM +0800, Yan Zhao wrote:

> So, I added below fix:
> 
> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
> index 0e8fb61f63ff..7bcf1decc034 100644
> --- a/arch/x86/kernel/idt.c
> +++ b/arch/x86/kernel/idt.c
> @@ -268,6 +268,7 @@ void __init idt_setup_early_pf(void)
>  }
>  #endif
> 
> +#if IS_ENABLED(CONFIG_KVM_INTEL)
>  noinstr void idt_entry_from_kvm(unsigned int vector)
>  {
>         if (vector == NMI_VECTOR)
> @@ -280,6 +281,7 @@ noinstr void idt_entry_from_kvm(unsigned int vector)
>         idt_do_interrupt_irqoff(gate_offset(idt_table + vector));
>         instrumentation_end();
>  }
> +#endif
> 
>  static void __init idt_map_in_cea(void)
>  {

Thanks!

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-05-08  8:53       ` Peter Zijlstra
@ 2026-05-08  8:56         ` Binbin Wu
  0 siblings, 0 replies; 28+ messages in thread
From: Binbin Wu @ 2026-05-08  8:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini



On 5/8/2026 4:53 PM, Peter Zijlstra wrote:
> On Fri, May 08, 2026 at 02:09:09PM +0800, Binbin Wu wrote:
>> On 5/2/2026 4:37 AM, Peter Zijlstra wrote:
> 
>>> +noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
>>> +{
>>> +	if (event_type == EVENT_TYPE_EXTINT) {
>>> +#ifdef CONFIG_X86_64
>>> +		/*
>>> +		 * Use FRED dispatch, even when running IDT. The dispatch
>>> +		 * tables are kept in sync between FRED and IDT, and the FRED
>>> +		 * dispatch works well with CFI.
>>> +		 */
>>> +		fred_entry_from_kvm(event_type, vector);
>>> +#else
>>> +		idt_entry_from_kvm(vector);
>>> +#endif
>>> +		return;
>>> +	}
>>> +
>>> +	WARN_ON_ONCE(event_type != EVENT_TYPE_NMI);
>>
>> Not sure if it's OK to use WARN_ON_ONCE() here.
>> If the warning is triggered, it could unblock NMI due to handling of #UD.
> 
> If that ever triggers you've got bigger problems.

Agree. :)

> 
>>> --- a/arch/x86/include/asm/desc.h
>>> +++ b/arch/x86/include/asm/desc.h
>>> @@ -438,6 +438,10 @@ extern void idt_setup_traps(void);
>>>  extern void idt_setup_apic_and_irq_gates(void);
>>>  extern bool idt_is_f00f_address(unsigned long address);
>>>  
>>> +extern void idt_do_interrupt_irqoff(unsigned int vector);
>>
>> In idt_entry_from_kvm() below, gate_offset() returns 'unsigned long', but here
>> it uses 'unsigned int'. It's not safe since there is no guarantee that the
>> address is within 32 bits for x86_64.
>>
> 
> Right you are, 'unsigned long address' it is.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v3 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-04-23 15:56 ` [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core Peter Zijlstra
                     ` (2 preceding siblings ...)
  2026-05-01 20:37   ` [PATCH v2 " Peter Zijlstra
@ 2026-05-08  9:18   ` Peter Zijlstra
  2026-05-08  9:41     ` Binbin Wu
                       ` (4 more replies)
  3 siblings, 5 replies; 28+ messages in thread
From: Peter Zijlstra @ 2026-05-08  9:18 UTC (permalink / raw)
  To: tglx
  Cc: linux-kernel, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini


Move the VMX interrupt dispatch magic into the x86 core code. This
isolates KVM from the FRED/IDT decisions and reduces the amount of
EXPORT_SYMBOL_FOR_KVM().

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
---
Changes since v2:
 - one more IS_ENABLED(CONFIG_KVM_INTEL) (Yan Zhao)
 - fixed idt_do_interrupt_irqoff() prototype (Binbin Wu)

 arch/x86/entry/Makefile             |    2 -
 arch/x86/entry/common.c             |   48 ++++++++++++++++++++++++++++++++++++
 arch/x86/entry/entry.S              |   46 ++++++++++++++++++++++++++++++++++
 arch/x86/entry/entry_64_fred.S      |    1 
 arch/x86/include/asm/desc.h         |    4 +++
 arch/x86/include/asm/desc_defs.h    |    2 -
 arch/x86/include/asm/entry-common.h |    2 +
 arch/x86/include/asm/fred.h         |    1 
 arch/x86/kernel/idt.c               |   15 +++++++++++
 arch/x86/kernel/nmi.c               |    1 
 arch/x86/kvm/vmx/vmenter.S          |   46 ----------------------------------
 arch/x86/kvm/vmx/vmx.c              |   20 ++-------------
 12 files changed, 120 insertions(+), 68 deletions(-)

--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -13,7 +13,7 @@ CFLAGS_REMOVE_syscall_64.o	= $(CC_FLAGS_
 CFLAGS_syscall_32.o		+= -fno-stack-protector
 CFLAGS_syscall_64.o		+= -fno-stack-protector
 
-obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o
+obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o common.o
 
 obj-y				+= vdso/
 obj-y				+= vsyscall/
--- /dev/null
+++ b/arch/x86/entry/common.c
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/entry-common.h>
+#include <linux/kvm_types.h>
+#include <asm/fred.h>
+#include <asm/desc.h>
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+/*
+ * On VMX, NMIs and IRQs (as configured by KVM) are acknowledged by hardware as
+ * part of the VM-Exit, i.e. the event itself is consumed as part of the VM-Exit.
+ * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
+ * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
+ * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
+ * the VM-Exit is held pending until it's unblocked in the host.
+ */
+noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
+{
+	if (event_type == EVENT_TYPE_EXTINT) {
+#ifdef CONFIG_X86_64
+		/*
+		 * Use FRED dispatch, even when running IDT. The dispatch
+		 * tables are kept in sync between FRED and IDT, and the FRED
+		 * dispatch works well with CFI.
+		 */
+		fred_entry_from_kvm(event_type, vector);
+#else
+		idt_entry_from_kvm(vector);
+#endif
+		return;
+	}
+
+	WARN_ON_ONCE(event_type != EVENT_TYPE_NMI);
+
+#ifdef CONFIG_X86_64
+	if (cpu_feature_enabled(X86_FEATURE_FRED))
+		return fred_entry_from_kvm(event_type, vector);
+#endif
+
+	/*
+	 * Notably, we must use IDT dispatch for NMI when running in IDT mode.
+	 * The FRED NMI context is significantly different and will not work
+	 * right (specifically FRED fixed the NMI recursion issue).
+	 */
+	idt_entry_from_kvm(vector);
+}
+EXPORT_SYMBOL_FOR_KVM(x86_entry_from_kvm);
+#endif
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -75,3 +75,49 @@ THUNK warn_thunk_thunk, __warn_thunk
 #if defined(CONFIG_STACKPROTECTOR) && defined(CONFIG_SMP)
 EXPORT_SYMBOL(__ref_stack_chk_guard);
 #endif
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+.macro IDT_DO_EVENT_IRQOFF call_insn call_target
+	/*
+	 * Unconditionally create a stack frame, getting the correct RSP on the
+	 * stack (for x86-64) would take two instructions anyways, and RBP can
+	 * be used to restore RSP to make objtool happy (see below).
+	 */
+	push %_ASM_BP
+	mov %_ASM_SP, %_ASM_BP
+
+#ifdef CONFIG_X86_64
+	/*
+	 * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
+	 * creating the synthetic interrupt stack frame for the IRQ/NMI.
+	 */
+	and  $-16, %rsp
+	push $__KERNEL_DS
+	push %rbp
+#endif
+	pushf
+	push $__KERNEL_CS
+	\call_insn \call_target
+
+	/*
+	 * "Restore" RSP from RBP, even though IRET has already unwound RSP to
+	 * the correct value.  objtool doesn't know the callee will IRET and,
+	 * without the explicit restore, thinks the stack is getting walloped.
+	 * Using an unwind hint is problematic due to x86-64's dynamic alignment.
+	 */
+	leave
+	RET
+.endm
+
+.pushsection .text, "ax"
+SYM_FUNC_START(idt_do_interrupt_irqoff)
+	IDT_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
+SYM_FUNC_END(idt_do_interrupt_irqoff)
+.popsection
+
+.pushsection .noinstr.text, "ax"
+SYM_FUNC_START(idt_do_nmi_irqoff)
+	IDT_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
+SYM_FUNC_END(idt_do_nmi_irqoff)
+.popsection
+#endif
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -147,5 +147,4 @@ SYM_FUNC_START(asm_fred_entry_from_kvm)
 	RET
 
 SYM_FUNC_END(asm_fred_entry_from_kvm)
-EXPORT_SYMBOL_FOR_KVM(asm_fred_entry_from_kvm);
 #endif
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -438,6 +438,10 @@ extern void idt_setup_traps(void);
 extern void idt_setup_apic_and_irq_gates(void);
 extern bool idt_is_f00f_address(unsigned long address);
 
+extern void idt_do_interrupt_irqoff(unsigned long address);
+extern void idt_do_nmi_irqoff(void);
+extern void idt_entry_from_kvm(unsigned int vector);
+
 #ifdef CONFIG_X86_64
 extern void idt_setup_early_pf(void);
 #else
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -145,7 +145,7 @@ struct gate_struct {
 typedef struct gate_struct gate_desc;
 
 #ifndef _SETUP
-static inline unsigned long gate_offset(const gate_desc *g)
+static __always_inline unsigned long gate_offset(const gate_desc *g)
 {
 #ifdef CONFIG_X86_64
 	return g->offset_low | ((unsigned long)g->offset_middle << 16) |
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -97,4 +97,6 @@ static __always_inline void arch_exit_to
 }
 #define arch_exit_to_user_mode arch_exit_to_user_mode
 
+extern void x86_entry_from_kvm(unsigned int entry_type, unsigned int vector);
+
 #endif
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -110,7 +110,6 @@ static __always_inline unsigned long fre
 static inline void cpu_init_fred_exceptions(void) { }
 static inline void cpu_init_fred_rsps(void) { }
 static inline void fred_complete_exception_setup(void) { }
-static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { }
 static inline void fred_sync_rsp0(unsigned long rsp0) { }
 static inline void fred_update_rsp0(void) { }
 #endif /* CONFIG_X86_FRED */
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -268,6 +268,21 @@ void __init idt_setup_early_pf(void)
 }
 #endif
 
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+noinstr void idt_entry_from_kvm(unsigned int vector)
+{
+	if (vector == NMI_VECTOR)
+		return idt_do_nmi_irqoff();
+
+	/*
+	 * Only the NMI path requires noinstr.
+	 */
+	instrumentation_begin();
+	idt_do_interrupt_irqoff(gate_offset(idt_table + vector));
+	instrumentation_end();
+}
+#endif
+
 static void __init idt_map_in_cea(void)
 {
 	/*
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -614,7 +614,6 @@ DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx)
 {
 	exc_nmi(regs);
 }
-EXPORT_SYMBOL_FOR_KVM(asm_exc_nmi_kvm_vmx);
 #endif
 
 #ifdef CONFIG_NMI_CHECK_CPU
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -31,38 +31,6 @@
 #define VCPU_R15	__VCPU_REGS_R15 * WORD_SIZE
 #endif
 
-.macro VMX_DO_EVENT_IRQOFF call_insn call_target
-	/*
-	 * Unconditionally create a stack frame, getting the correct RSP on the
-	 * stack (for x86-64) would take two instructions anyways, and RBP can
-	 * be used to restore RSP to make objtool happy (see below).
-	 */
-	push %_ASM_BP
-	mov %_ASM_SP, %_ASM_BP
-
-#ifdef CONFIG_X86_64
-	/*
-	 * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
-	 * creating the synthetic interrupt stack frame for the IRQ/NMI.
-	 */
-	and  $-16, %rsp
-	push $__KERNEL_DS
-	push %rbp
-#endif
-	pushf
-	push $__KERNEL_CS
-	\call_insn \call_target
-
-	/*
-	 * "Restore" RSP from RBP, even though IRET has already unwound RSP to
-	 * the correct value.  objtool doesn't know the callee will IRET and,
-	 * without the explicit restore, thinks the stack is getting walloped.
-	 * Using an unwind hint is problematic due to x86-64's dynamic alignment.
-	 */
-	leave
-	RET
-.endm
-
 .section .noinstr.text, "ax"
 
 /**
@@ -320,10 +288,6 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_
 
 SYM_FUNC_END(__vmx_vcpu_run)
 
-SYM_FUNC_START(vmx_do_nmi_irqoff)
-	VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
-SYM_FUNC_END(vmx_do_nmi_irqoff)
-
 #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
 
 /**
@@ -375,13 +339,3 @@ SYM_FUNC_START(vmread_error_trampoline)
 	RET
 SYM_FUNC_END(vmread_error_trampoline)
 #endif
-
-.section .text, "ax"
-
-#ifndef CONFIG_X86_FRED
-
-SYM_FUNC_START(vmx_do_interrupt_irqoff)
-	VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
-SYM_FUNC_END(vmx_do_interrupt_irqoff)
-
-#endif
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7108,9 +7108,6 @@ void vmx_load_eoi_exitmap(struct kvm_vcp
 	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
 }
 
-void vmx_do_interrupt_irqoff(unsigned long entry);
-void vmx_do_nmi_irqoff(void);
-
 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -7152,17 +7149,9 @@ static void handle_external_interrupt_ir
 	    "unexpected VM-Exit interrupt info: 0x%x", intr_info))
 		return;
 
-	/*
-	 * Invoke the kernel's IRQ handler for the vector.  Use the FRED path
-	 * when it's available even if FRED isn't fully enabled, e.g. even if
-	 * FRED isn't supported in hardware, in order to avoid the indirect
-	 * CALL in the non-FRED path.
-	 */
+	/* Forward the IRQ to the core kernel for processing. */
 	kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
-	if (IS_ENABLED(CONFIG_X86_FRED))
-		fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
-	else
-		vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
+	x86_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
 	kvm_after_interrupt(vcpu);
 
 	vcpu->arch.at_instruction_boundary = true;
@@ -7472,10 +7461,7 @@ noinstr void vmx_handle_nmi(struct kvm_v
 		return;
 
 	kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
-	if (cpu_feature_enabled(X86_FEATURE_FRED))
-		fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
-	else
-		vmx_do_nmi_irqoff();
+	x86_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
 	kvm_after_interrupt(vcpu);
 }
 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-05-08  9:18   ` [PATCH v3 " Peter Zijlstra
@ 2026-05-08  9:41     ` Binbin Wu
  2026-05-12 22:31     ` Sean Christopherson
                       ` (3 subsequent siblings)
  4 siblings, 0 replies; 28+ messages in thread
From: Binbin Wu @ 2026-05-08  9:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini



On 5/8/2026 5:18 PM, Peter Zijlstra wrote:
> 
> Move the VMX interrupt dispatch magic into the x86 core code. This
> isolates KVM from the FRED/IDT decisions and reduces the amount of
> EXPORT_SYMBOL_FOR_KVM().
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>

Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>

One typo below.

[...]

> +
> +	/*
> +	 * Notably, we must use IDT dispatch for NMI when running in IDT mode.
> +	 * The FRED NMI context is significantly different and will not work
> +	 * right (speficially FRED fixed the NMI recursion issue).

speficially -> specifically

> +	 */
> +	idt_entry_from_kvm(vector);
> +}

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred()
  2026-04-23 15:56 ` [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred() Peter Zijlstra
@ 2026-05-11 12:59   ` David Woodhouse
  2026-05-12 22:32     ` Sean Christopherson
  2026-05-15 18:15     ` Marc Dionne
  2026-05-18  8:01   ` Zhao Liu
  2026-05-18  8:16   ` Binbin Wu
  2 siblings, 2 replies; 28+ messages in thread
From: David Woodhouse @ 2026-05-11 12:59 UTC (permalink / raw)
  To: Peter Zijlstra, tglx
  Cc: linux-kernel, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

[-- Attachment #1: Type: text/plain, Size: 1389 bytes --]

On Thu, 2026-04-23 at 17:56 +0200, Peter Zijlstra wrote:
> Vishal reported that KVM unit test 'x2apic' started failing after commit
> 0e98eb14814e ("entry: Prepare for deferred hrtimer rearming").
> 
> The reason is that KVM/VMX is injecting interrupts while it has IRQs disabled,
> for a context that will enable IRQs, this means that regs->flags.X86_EFLAGS_IF
> == 0 and the irqentry_exit() will not DTRT.
> 
> Notably, irqentry_exit() must not call hrtimer_rearm_deferred() when the return
> context does not have IF set, because this will cause problems vs NMIs.
> 
> Therefore, fix up the state after the injection.
> 
> Fixes: 0e98eb14814e ("entry: Prepare for deferred hrtimer rearming")
> Reported-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
> Closes: https://lore.kernel.org/r/70cd3e97fbb796e2eb2ff8cd4b7614ada05a5f24.camel%40intel.com
> Suggested-by: Thomas Gleixner <tglx@kernel.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>

Tested-by: David Woodhouse <dwmw@amazon.co.uk>

I don't see this being merged yet?

Without this, Xen timer delivery (tested by the xen_shinfo_test KVM
selftest) is failing. I think the Fixes: tag is wrong though; it
actually broke with commit 15dd3a948855 ("hrtimer: Push reprogramming
timers into the interrupt return path"), didn't it?

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-05-08  9:18   ` [PATCH v3 " Peter Zijlstra
  2026-05-08  9:41     ` Binbin Wu
@ 2026-05-12 22:31     ` Sean Christopherson
  2026-05-18  8:12     ` Zhao Liu
                       ` (2 subsequent siblings)
  4 siblings, 0 replies; 28+ messages in thread
From: Sean Christopherson @ 2026-05-12 22:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, linux-kernel, Jim Mattson, Binbin Wu, Vishal L Verma,
	kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu, x86@kernel.org,
	Paolo Bonzini

On Fri, May 08, 2026, Peter Zijlstra wrote:
> 
> Move the VMX interrupt dispatch magic into the x86 core code. This
> isolates KVM from the FRED/IDT decisions and reduces the amount of
> EXPORT_SYMBOL_FOR_KVM().
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: Vishal L Verma <vishal.l.verma@intel.com>
> ---

Acked-by: Sean Christopherson <seanjc@google.com>

> @@ -7152,17 +7149,9 @@ static void handle_external_interrupt_ir
>  	    "unexpected VM-Exit interrupt info: 0x%x", intr_info))
>  		return;
>  
> -	/*
> -	 * Invoke the kernel's IRQ handler for the vector.  Use the FRED path
> -	 * when it's available even if FRED isn't fully enabled, e.g. even if
> -	 * FRED isn't supported in hardware, in order to avoid the indirect
> -	 * CALL in the non-FRED path.
> -	 */
> +	/* For the IRQ to the core kernel for processing. */

Forward?  Or just delete this comment entirely, either way works for me.

>  	kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
> -	if (IS_ENABLED(CONFIG_X86_FRED))
> -		fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
> -	else
> -		vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
> +	x86_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
>  	kvm_after_interrupt(vcpu);
>  
>  	vcpu->arch.at_instruction_boundary = true;

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred()
  2026-05-11 12:59   ` David Woodhouse
@ 2026-05-12 22:32     ` Sean Christopherson
  2026-05-15 18:15     ` Marc Dionne
  1 sibling, 0 replies; 28+ messages in thread
From: Sean Christopherson @ 2026-05-12 22:32 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Peter Zijlstra, tglx, linux-kernel, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

On Mon, May 11, 2026, David Woodhouse wrote:
> On Thu, 2026-04-23 at 17:56 +0200, Peter Zijlstra wrote:
> > Vishal reported that KVM unit test 'x2apic' started failing after commit
> > 0e98eb14814e ("entry: Prepare for deferred hrtimer rearming").
> > 
> > The reason is that KVM/VMX is injecting interrupts while it has IRQs disabled,
> > for a context that will enable IRQs, this means that regs->flags.X86_EFLAGS_IF
> > == 0 and the irqentry_exit() will not DTRT.
> > 
> > Notably, irqentry_exit() must not call hrtimer_rearm_deferred() when the return
> > context does not have IF set, because this will cause problems vs NMIs.
> > 
> > Therefore, fix up the state after the injection.
> > 
> > Fixes: 0e98eb14814e ("entry: Prepare for deferred hrtimer rearming")
> > Reported-by: Vishal L Verma <vishal.l.verma@intel.com>
> > Closes: https://lore.kernel.org/r/70cd3e97fbb796e2eb2ff8cd4b7614ada05a5f24.camel%40intel.com
> > Suggested-by: Thomas Gleixner <tglx@kernel.org>
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > Tested-by: Vishal L Verma <vishal.l.verma@intel.com>
> 
> Tested-by: David Woodhouse <dwmw@amazon.co.uk>

Tested-by: Sean Christopherson <seanjc@google.com>

> I don't see this being merged yet?
> 
> Without this, Xen timer delivery (tested by the xen_shinfo_test KVM
> selftest) is failing.

And many, many other issues with KVM.  I.e. we need this or something like it
in 7.1.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred()
  2026-05-11 12:59   ` David Woodhouse
  2026-05-12 22:32     ` Sean Christopherson
@ 2026-05-15 18:15     ` Marc Dionne
  1 sibling, 0 replies; 28+ messages in thread
From: Marc Dionne @ 2026-05-15 18:15 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Peter Zijlstra, tglx, linux-kernel, Sean Christopherson,
	Jim Mattson, Binbin Wu, Vishal L Verma, kvm@vger.kernel.org,
	Rick P Edgecombe, Binbin Wu, x86@kernel.org, Paolo Bonzini

On Mon, May 11, 2026 at 2:52 PM David Woodhouse <dwmw2@infradead.org> wrote:
>
> On Thu, 2026-04-23 at 17:56 +0200, Peter Zijlstra wrote:
> > Vishal reported that KVM unit test 'x2apic' started failing after commit
> > 0e98eb14814e ("entry: Prepare for deferred hrtimer rearming").
> >
> > The reason is that KVM/VMX is injecting interrupts while it has IRQs disabled,
> > for a context that will enable IRQs, this means that regs->flags.X86_EFLAGS_IF
> > == 0 and the irqentry_exit() will not DTRT.
> >
> > Notably, irqentry_exit() must not call hrtimer_rearm_deferred() when the return
> > context does not have IF set, because this will cause problems vs NMIs.
> >
> > Therefore, fix up the state after the injection.
> >
> > Fixes: 0e98eb14814e ("entry: Prepare for deferred hrtimer rearming")
> > Reported-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
> > Closes: https://lore.kernel.org/r/70cd3e97fbb796e2eb2ff8cd4b7614ada05a5f24.camel%40intel.com
> > Suggested-by: Thomas Gleixner <tglx@kernel.org>
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
>
> Tested-by: David Woodhouse <dwmw@amazon.co.uk>
>
> I don't see this being merged yet?
>
> Without this, Xen timer delivery (tested by the xen_shinfo_test KVM
> selftest) is failing. I think the Fixes: tag is wrong though; it
> actually broke with commit 15dd3a948855 ("hrtimer: Push reprogramming
> timers into the interrupt return path"), didn't it?

Just to add another voice, without these patches I have a host
(current 7.1-rc mainline) running VM workloads that regularly gets
"watchdog: CPUx: Watchdog detected hard LOCKUP on cpu x" messages and
related stack traces.

A few weeks ago I had bisected the issue down to the same commit that
David suggests, 15dd3a948855 ("hrtimer: Push reprogramming timers into
the interrupt return path"), and verified that applying those patches
made the oopses go away.

Thanks,
Marc

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred()
  2026-04-23 15:56 ` [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred() Peter Zijlstra
  2026-05-11 12:59   ` David Woodhouse
@ 2026-05-18  8:01   ` Zhao Liu
  2026-05-18  8:16   ` Binbin Wu
  2 siblings, 0 replies; 28+ messages in thread
From: Zhao Liu @ 2026-05-18  8:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini, Yi Lai

On Thu, Apr 23, 2026 at 05:56:13PM +0200, Peter Zijlstra wrote:
> Date: Thu, 23 Apr 2026 17:56:13 +0200
> From: Peter Zijlstra <peterz@infradead.org>
> Subject: [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred()
> 
> Vishal reported that KVM unit test 'x2apic' started failing after commit
> 0e98eb14814e ("entry: Prepare for deferred hrtimer rearming").
> 
> The reason is that KVM/VMX is injecting interrupts while it has IRQs disabled,
> for a context that will enable IRQs, this means that regs->flags.X86_EFLAGS_IF
> == 0 and the irqentry_exit() will not DTRT.
> 
> Notably, irqentry_exit() must not call hrtimer_rearm_deferred() when the return
> context does not have IF set, because this will cause problems vs NMIs.
> 
> Therefore, fix up the state after the injection.
> 
> Fixes: 0e98eb14814e ("entry: Prepare for deferred hrtimer rearming")
> Reported-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
> Closes: https://lore.kernel.org/r/70cd3e97fbb796e2eb2ff8cd4b7614ada05a5f24.camel%40intel.com
> Suggested-by: Thomas Gleixner <tglx@kernel.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
> ---
>  arch/x86/entry/common.c |   13 +++++++++++++
>  1 file changed, 13 insertions(+)

I also found this issue for a normal VM on DMR (with 192 vCPUs). As
analyzed in the previous threads, this issue is not platform-specific, nor
is it specific to TD.

I applied this series and confirmed this could resolve the hard lockup
in my case, so,

Tested-by: Zhao Liu <zhao1.liu@intel.com>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-05-08  9:18   ` [PATCH v3 " Peter Zijlstra
  2026-05-08  9:41     ` Binbin Wu
  2026-05-12 22:31     ` Sean Christopherson
@ 2026-05-18  8:12     ` Zhao Liu
  2026-05-20 23:06     ` Nathan Chancellor
  2026-05-26 10:01     ` Bezdeka, Florian
  4 siblings, 0 replies; 28+ messages in thread
From: Zhao Liu @ 2026-05-18  8:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini, Yi Lai

On Fri, May 08, 2026 at 11:18:29AM +0200, Peter Zijlstra wrote:
> Date: Fri, 8 May 2026 11:18:29 +0200
> From: Peter Zijlstra <peterz@infradead.org>
> Subject: [PATCH v3 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into
>  x86 core
> 
> 
> Move the VMX interrupt dispatch magic into the x86 core code. This
> isolates KVM from the FRED/IDT decisions and reduces the amount of
> EXPORT_SYMBOL_FOR_KVM().
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
> ---
> Changes since v2:
>  - one more IS_ENABLED(CONFIG_KVM_INTEL) (Yan Zhao)
>  - fixed idt_do_interrupt_irqoff() prototype (Binbin Wu)

My guest is running well and there are no interrupt errors in the system, so,

Tested-by: Zhao Liu <zhao1.liu@intel.com>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred()
  2026-04-23 15:56 ` [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred() Peter Zijlstra
  2026-05-11 12:59   ` David Woodhouse
  2026-05-18  8:01   ` Zhao Liu
@ 2026-05-18  8:16   ` Binbin Wu
  2 siblings, 0 replies; 28+ messages in thread
From: Binbin Wu @ 2026-05-18  8:16 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

On 4/23/2026 11:56 PM, Peter Zijlstra wrote:
> Vishal reported that KVM unit test 'x2apic' started failing after commit
> 0e98eb14814e ("entry: Prepare for deferred hrtimer rearming").
> 
> The reason is that KVM/VMX is injecting interrupts while it has IRQs disabled,
> for a context that will enable IRQs, this means that regs->flags.X86_EFLAGS_IF
> == 0 and the irqentry_exit() will not DTRT.
> 
> Notably, irqentry_exit() must not call hrtimer_rearm_deferred() when the return
> context does not have IF set, because this will cause problems vs NMIs.
> 
> Therefore, fix up the state after the injection.
> 
> Fixes: 0e98eb14814e ("entry: Prepare for deferred hrtimer rearming")
> Reported-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
> Closes: https://lore.kernel.org/r/70cd3e97fbb796e2eb2ff8cd4b7614ada05a5f24.camel%40intel.com
> Suggested-by: Thomas Gleixner <tglx@kernel.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>

Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>

> ---
>  arch/x86/entry/common.c |   13 +++++++++++++
>  1 file changed, 13 insertions(+)
> 
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -2,6 +2,7 @@
>  
>  #include <linux/entry-common.h>
>  #include <linux/kvm_types.h>
> +#include <linux/hrtimer_rearm.h>
>  #include <asm/fred.h>
>  #include <asm/desc.h>
>  
> @@ -27,6 +28,18 @@ noinstr void x86_entry_from_kvm(unsigned
>  #else
>  		idt_entry_from_kvm(vector);
>  #endif
> +		/*
> +		 * Strictly speaking, only the NMI path requires noinstr.
> +		 */
> +		instrumentation_begin();
> +		/*
> +		 * KVM/VMX will dispatch from IRQ-disabled but for a context
> +		 * that will have IRQs-enabled. This confuses the entry code
> +		 * and it will not have reprogrammed the timer. Do so now.
> +		 */
> +		hrtimer_rearm_deferred();
> +		instrumentation_end();
> +
>  		return;
>  	}
>  
> 
> 
> 


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-05-08  9:18   ` [PATCH v3 " Peter Zijlstra
                       ` (2 preceding siblings ...)
  2026-05-18  8:12     ` Zhao Liu
@ 2026-05-20 23:06     ` Nathan Chancellor
  2026-05-26  9:06       ` Peter Zijlstra
  2026-05-26 10:01     ` Bezdeka, Florian
  4 siblings, 1 reply; 28+ messages in thread
From: Nathan Chancellor @ 2026-05-20 23:06 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini

On Fri, May 08, 2026 at 11:18:29AM +0200, Peter Zijlstra wrote:
> 
> Move the VMX interrupt dispatch magic into the x86 core code. This
> isolates KVM from the FRED/IDT decisions and reduces the amount of
> EXPORT_SYMBOL_FOR_KVM().
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>

I am seeing

  vmlinux.o: warning: objtool: idt_do_interrupt_irqoff+0xe: no-cfi indirect call!

after this landed in -next.

  $ cat arch/x86/configs/repro.config
  CONFIG_CFI=y
  CONFIG_KVM=y
  CONFIG_KVM_INTEL=y

  $ make -skj"$(nproc)" ARCH=x86_64 LLVM=1 mrproper defconfig repro.config vmlinux
  vmlinux.o: warning: objtool: idt_do_interrupt_irqoff+0xe: no-cfi indirect call!

  $ llvm-objdump -drS --disassemble-symbols=idt_do_interrupt_irqoff vmlinux.o

  vmlinux.o:	file format elf64-x86-64

  Disassembly of section .text:

  0000000000001540 <idt_do_interrupt_irqoff>:
      1540: 55                           	pushq	%rbp
      1541: 48 89 e5                     	movq	%rsp, %rbp
      1544: 48 83 e4 f0                  	andq	$-0x10, %rsp
      1548: 6a 18                        	pushq	$0x18
      154a: 55                           	pushq	%rbp
      154b: 9c                           	pushfq
      154c: 6a 10                        	pushq	$0x10
      154e: e8 00 00 00 00               	callq	0x1553 <idt_do_interrupt_irqoff+0x13>
  		000000000000154f:  R_X86_64_PLT32	__x86_indirect_thunk_rdi-0x4
      1553: c9                           	leave
      1554: e9 00 00 00 00               	jmp	0x1559 <idt_do_interrupt_irqoff+0x19>
  		0000000000001555:  R_X86_64_PLT32	__x86_return_thunk-0x4
      1559: cc                           	int3
      155a: cc                           	int3
      155b: cc                           	int3
      155c: cc                           	int3
      155d: cc                           	int3
      155e: cc                           	int3
      155f: cc                           	int3
      1560: 90                           	nop
      1561: 90                           	nop
      1562: 90                           	nop
      1563: 90                           	nop
      1564: 90                           	nop
      1565: 90                           	nop
      1566: 90                           	nop
      1567: 90                           	nop
      1568: 90                           	nop
      1569: 90                           	nop
      156a: 90                           	nop
      156b: 90                           	nop
      156c: 90                           	nop
      156d: 90                           	nop
      156e: 90                           	nop
      156f: 90                           	nop

Cheers,
Nathan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-05-20 23:06     ` Nathan Chancellor
@ 2026-05-26  9:06       ` Peter Zijlstra
  2026-05-26 19:35         ` Nathan Chancellor
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2026-05-26  9:06 UTC (permalink / raw)
  To: Nathan Chancellor
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini, Calvin Owens


Sorry, I missed this :/

On Wed, May 20, 2026 at 04:06:21PM -0700, Nathan Chancellor wrote:
> On Fri, May 08, 2026 at 11:18:29AM +0200, Peter Zijlstra wrote:
> > 
> > Move the VMX interrupt dispatch magic into the x86 core code. This
> > isolates KVM from the FRED/IDT decisions and reduces the amount of
> > EXPORT_SYMBOL_FOR_KVM().
> > 
> > Suggested-by: Sean Christopherson <seanjc@google.com>
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
> 
> I am seeing
> 
>   vmlinux.o: warning: objtool: idt_do_interrupt_irqoff+0xe: no-cfi indirect call!
> 
> after this landed in -next.
> 
>   $ cat arch/x86/configs/repro.config
>   CONFIG_CFI=y
>   CONFIG_KVM=y
>   CONFIG_KVM_INTEL=y
> 
>   $ make -skj"$(nproc)" ARCH=x86_64 LLVM=1 mrproper defconfig repro.config vmlinux
>   vmlinux.o: warning: objtool: idt_do_interrupt_irqoff+0xe: no-cfi indirect call!
> 

Durr.

---
Subject: x86/kvm/vmx: Fix x86_64 CFI build

I missed that idt_do_interrupt_irqoff() gets compiled on x86_64; this is
a problem for CFI builds because it includes an unadorned indirect call.
It is however completely dead code.

Rework things to not emit this function at all.

Fixes: 0701c9e17bd9 ("x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core")
Reported-by: Nathan Chancellor <nathan@kernel.org>
Reported-by: Calvin Owens <calvin@wbinvd.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 06c7c6ebd6f9..14cd43d4da6c 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -55,7 +55,7 @@ noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
 	 * The FRED NMI context is significantly different and will not work
 	 * right (specifically FRED fixed the NMI recursion issue).
 	 */
-	idt_entry_from_kvm(vector);
+	idt_do_nmi_irqoff();
 }
 EXPORT_SYMBOL_FOR_KVM(x86_entry_from_kvm);
 #endif
diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
index a56e043b266d..2bc217bb5475 100644
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -109,11 +109,13 @@ EXPORT_SYMBOL(__ref_stack_chk_guard);
 	RET
 .endm
 
+#ifndef CONFIG_X86_64
 .pushsection .text, "ax"
 SYM_FUNC_START(idt_do_interrupt_irqoff)
 	IDT_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
 SYM_FUNC_END(idt_do_interrupt_irqoff)
 .popsection
+#endif
 
 .pushsection .noinstr.text, "ax"
 SYM_FUNC_START(idt_do_nmi_irqoff)
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 7bcf1decc034..90a22e24a9eb 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -268,18 +268,10 @@ void __init idt_setup_early_pf(void)
 }
 #endif
 
-#if IS_ENABLED(CONFIG_KVM_INTEL)
-noinstr void idt_entry_from_kvm(unsigned int vector)
+#if IS_ENABLED(CONFIG_KVM_INTEL) && !defined(CONFIG_X86_64)
+void idt_entry_from_kvm(unsigned int vector)
 {
-	if (vector == NMI_VECTOR)
-		return idt_do_nmi_irqoff();
-
-	/*
-	 * Only the NMI path requires noinstr.
-	 */
-	instrumentation_begin();
 	idt_do_interrupt_irqoff(gate_offset(idt_table + vector));
-	instrumentation_end();
 }
 #endif
 

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-05-08  9:18   ` [PATCH v3 " Peter Zijlstra
                       ` (3 preceding siblings ...)
  2026-05-20 23:06     ` Nathan Chancellor
@ 2026-05-26 10:01     ` Bezdeka, Florian
  2026-05-26 10:21       ` Peter Zijlstra
  4 siblings, 1 reply; 28+ messages in thread
From: Bezdeka, Florian @ 2026-05-26 10:01 UTC (permalink / raw)
  To: peterz@infradead.org, tglx@kernel.org
  Cc: jmattson@google.com, rick.p.edgecombe@intel.com,
	binbin.wu@intel.com, seanjc@google.com, binbin.wu@linux.intel.com,
	bonzini@redhat.com, x86@kernel.org, linux-kernel@vger.kernel.org,
	vishal.l.verma@intel.com, kvm@vger.kernel.org, Kiszka, Jan,
	rpm@xenomai.org

Hi Peter,

On Fri, 2026-05-08 at 11:18 +0200, Peter Zijlstra wrote:
> Move the VMX interrupt dispatch magic into the x86 core code. This
> isolates KVM from the FRED/IDT decisions and reduces the amount of
> EXPORT_SYMBOL_FOR_KVM().
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: "Verma, Vishal L" <vishal.l.verma@intel.com>
> ---
> Changes since v2:
>  - one more IS_ENABLED(CONFIG_KVM_INTEL) (Yan Zhao)
>  - fixed idt_do_interrupt_irqoff() prototype (Binbin Wu)
> 
>  arch/x86/entry/Makefile             |    2 -
>  arch/x86/entry/common.c             |   48 ++++++++++++++++++++++++++++++++++++
>  arch/x86/entry/entry.S              |   46 ++++++++++++++++++++++++++++++++++
>  arch/x86/entry/entry_64_fred.S      |    1 
>  arch/x86/include/asm/desc.h         |    4 +++
>  arch/x86/include/asm/desc_defs.h    |    2 -
>  arch/x86/include/asm/entry-common.h |    2 +
>  arch/x86/include/asm/fred.h         |    1 
>  arch/x86/kernel/idt.c               |   15 +++++++++++
>  arch/x86/kernel/nmi.c               |    1 
>  arch/x86/kvm/vmx/vmenter.S          |   46 ----------------------------------
>  arch/x86/kvm/vmx/vmx.c              |   20 ++-------------
>  12 files changed, 120 insertions(+), 68 deletions(-)
> 
> --- a/arch/x86/entry/Makefile
> +++ b/arch/x86/entry/Makefile
> @@ -13,7 +13,7 @@ CFLAGS_REMOVE_syscall_64.o	= $(CC_FLAGS_
>  CFLAGS_syscall_32.o		+= -fno-stack-protector
>  CFLAGS_syscall_64.o		+= -fno-stack-protector
>  
> -obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o
> +obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o common.o
>  
>  obj-y				+= vdso/
>  obj-y				+= vsyscall/
> --- /dev/null
> +++ b/arch/x86/entry/common.c
> @@ -0,0 +1,48 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#include <linux/entry-common.h>
> +#include <linux/kvm_types.h>
> +#include <asm/fred.h>
> +#include <asm/desc.h>
> +
> +#if IS_ENABLED(CONFIG_KVM_INTEL)
> +/*
> + * On VMX, NMIs and IRQs (as configured by KVM) are acknowledged by hardware as
> + * part of the VM-Exit, i.e. the event itself is consumed as part the VM-Exit.
> + * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
> + * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
> + * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
> + * the VM-Exit is held pending until it's unblocked in the host.
> + */
> +noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
> +{
> +	if (event_type == EVENT_TYPE_EXTINT) {
> +#ifdef CONFIG_X86_64
> +		/*
> +		 * Use FRED dispatch, even when running IDT. The dispatch
> +		 * tables are kept in sync between FRED and IDT, and the FRED
> +		 * dispatch works well with CFI.
> +		 */
> +		fred_entry_from_kvm(event_type, vector);

Seems this landed in 7.1-rc5.

I'm seeing a build failure here:

arch/x86/entry/common.c: In function ‘x86_entry_from_kvm’:
arch/x86/entry/common.c:27:17: error: implicit declaration of function ‘fred_entry_from_kvm’; did you mean ‘idt_entry_from_kvm’? [-Wimplicit-function-declaration]
   27 |                 fred_entry_from_kvm(event_type, vector);
      |                 ^~~~~~~~~~~~~~~~~~~
      |                 idt_entry_from_kvm
arch/x86/entry/common.c:50:24: error: ‘return’ with a value, in function returning void [-Wreturn-mismatch]
   50 |                 return fred_entry_from_kvm(event_type, vector);
      |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
arch/x86/entry/common.c:18:14: note: declared here
   18 | noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
      |              ^~~~~~~~~~~~~~~~~~



> +#else
> +		idt_entry_from_kvm(vector);
> +#endif
> +		return;
> +	}
> +
> +	WARN_ON_ONCE(event_type != EVENT_TYPE_NMI);
> +
> +#ifdef CONFIG_X86_64
> +	if (cpu_feature_enabled(X86_FEATURE_FRED))
> +		return fred_entry_from_kvm(event_type, vector);
> +#endif
> +
> +	/*
> +	 * Notably, we must use IDT dispatch for NMI when running in IDT mode.
> +	 * The FRED NMI context is significantly different and will not work
> +	 * right (speficially FRED fixed the NMI recursion issue).
> +	 */
> +	idt_entry_from_kvm(vector);
> +}
> +EXPORT_SYMBOL_FOR_KVM(x86_entry_from_kvm);
> +#endif
> 

[snip]

> --- a/arch/x86/include/asm/entry-common.h
> +++ b/arch/x86/include/asm/entry-common.h
> @@ -97,4 +97,6 @@ static __always_inline void arch_exit_to
>  }
>  #define arch_exit_to_user_mode arch_exit_to_user_mode
>  
> +extern void x86_entry_from_kvm(unsigned int entry_type, unsigned int vector);
> +
>  #endif
> --- a/arch/x86/include/asm/fred.h
> +++ b/arch/x86/include/asm/fred.h
> @@ -110,7 +110,6 @@ static __always_inline unsigned long fre
>  static inline void cpu_init_fred_exceptions(void) { }
>  static inline void cpu_init_fred_rsps(void) { }
>  static inline void fred_complete_exception_setup(void) { }
> -static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { }

That seems still necessary for the !CONFIG_X86_FRED case.

>  static inline void fred_sync_rsp0(unsigned long rsp0) { }
>  static inline void fred_update_rsp0(void) { }
>  #endif /* CONFIG_X86_FRED */

[snip]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-05-26 10:01     ` Bezdeka, Florian
@ 2026-05-26 10:21       ` Peter Zijlstra
  2026-05-26 10:42         ` Florian Bezdeka
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2026-05-26 10:21 UTC (permalink / raw)
  To: Bezdeka, Florian
  Cc: tglx@kernel.org, jmattson@google.com, rick.p.edgecombe@intel.com,
	binbin.wu@intel.com, seanjc@google.com, binbin.wu@linux.intel.com,
	bonzini@redhat.com, x86@kernel.org, linux-kernel@vger.kernel.org,
	vishal.l.verma@intel.com, kvm@vger.kernel.org, Kiszka, Jan,
	rpm@xenomai.org

On Tue, May 26, 2026 at 10:01:41AM +0000, Bezdeka, Florian wrote:

> > +#if IS_ENABLED(CONFIG_KVM_INTEL)
> > +/*
> > + * On VMX, NMIs and IRQs (as configured by KVM) are acknowledged by hardware as
> > + * part of the VM-Exit, i.e. the event itself is consumed as part of the VM-Exit.
> > + * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
> > + * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
> > + * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
> > + * the VM-Exit is held pending until it's unblocked in the host.
> > + */
> > +noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
> > +{
> > +	if (event_type == EVENT_TYPE_EXTINT) {
> > +#ifdef CONFIG_X86_64
> > +		/*
> > +		 * Use FRED dispatch, even when running IDT. The dispatch
> > +		 * tables are kept in sync between FRED and IDT, and the FRED
> > +		 * dispatch works well with CFI.
> > +		 */
> > +		fred_entry_from_kvm(event_type, vector);
> 
> Seems this landed in 7.1-rc5.
> 
> I'm seeing a build failure here:
> 
> arch/x86/entry/common.c: In function ‘x86_entry_from_kvm’:
> arch/x86/entry/common.c:27:17: error: implicit declaration of function ‘fred_entry_from_kvm’; did you mean ‘idt_entry_from_kvm’? [-Wimplicit-function-declaration]
>    27 |                 fred_entry_from_kvm(event_type, vector);
>       |                 ^~~~~~~~~~~~~~~~~~~
>       |                 idt_entry_from_kvm
> arch/x86/entry/common.c:50:24: error: ‘return’ with a value, in function returning void [-Wreturn-mismatch]
>    50 |                 return fred_entry_from_kvm(event_type, vector);
>       |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> arch/x86/entry/common.c:18:14: note: declared here
>    18 | noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
>       |              ^~~~~~~~~~~~~~~~~~
> 
> 
> 
> > +#else
> > +		idt_entry_from_kvm(vector);
> > +#endif
> > +		return;
> > +	}
> > +
> > +	WARN_ON_ONCE(event_type != EVENT_TYPE_NMI);
> > +
> > +#ifdef CONFIG_X86_64
> > +	if (cpu_feature_enabled(X86_FEATURE_FRED))
> > +		return fred_entry_from_kvm(event_type, vector);
> > +#endif
> > +
> > +	/*
> > +	 * Notably, we must use IDT dispatch for NMI when running in IDT mode.
> > +	 * The FRED NMI context is significantly different and will not work
> > +	 * right (specifically FRED fixed the NMI recursion issue).
> > +	 */
> > +	idt_entry_from_kvm(vector);
> > +}
> > +EXPORT_SYMBOL_FOR_KVM(x86_entry_from_kvm);
> > +#endif
> > 
> 
> [snip]
> 
> > --- a/arch/x86/include/asm/entry-common.h
> > +++ b/arch/x86/include/asm/entry-common.h
> > @@ -97,4 +97,6 @@ static __always_inline void arch_exit_to
> >  }
> >  #define arch_exit_to_user_mode arch_exit_to_user_mode
> >  
> > +extern void x86_entry_from_kvm(unsigned int entry_type, unsigned int vector);
> > +
> >  #endif
> > --- a/arch/x86/include/asm/fred.h
> > +++ b/arch/x86/include/asm/fred.h
> > @@ -110,7 +110,6 @@ static __always_inline unsigned long fre
> >  static inline void cpu_init_fred_exceptions(void) { }
> >  static inline void cpu_init_fred_rsps(void) { }
> >  static inline void fred_complete_exception_setup(void) { }
> > -static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { }
> 
> That seems still necessary for the !CONFIG_X86_FRED case.

The thing is, KVM_INTEL should force X86_FRED, I'm not sure how you can
have both KVM_INTEL and !X86_FRED, that should be an invalid config.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-05-26 10:21       ` Peter Zijlstra
@ 2026-05-26 10:42         ` Florian Bezdeka
  0 siblings, 0 replies; 28+ messages in thread
From: Florian Bezdeka @ 2026-05-26 10:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx@kernel.org, jmattson@google.com, rick.p.edgecombe@intel.com,
	binbin.wu@intel.com, seanjc@google.com, binbin.wu@linux.intel.com,
	bonzini@redhat.com, x86@kernel.org, linux-kernel@vger.kernel.org,
	vishal.l.verma@intel.com, kvm@vger.kernel.org, Kiszka, Jan,
	rpm@xenomai.org

On Tue, 2026-05-26 at 12:21 +0200, Peter Zijlstra wrote:
> On Tue, May 26, 2026 at 10:01:41AM +0000, Bezdeka, Florian wrote:
> 
> > > +#if IS_ENABLED(CONFIG_KVM_INTEL)
> > > +/*
> > > + * On VMX, NMIs and IRQs (as configured by KVM) are acknowledged by hardware as
> > > + * part of the VM-Exit, i.e. the event itself is consumed as part of the VM-Exit.
> > > + * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
> > > + * to the kernel for servicing.  On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
> > > + * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
> > > + * the VM-Exit is held pending until it's unblocked in the host.
> > > + */
> > > +noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
> > > +{
> > > +	if (event_type == EVENT_TYPE_EXTINT) {
> > > +#ifdef CONFIG_X86_64
> > > +		/*
> > > +		 * Use FRED dispatch, even when running IDT. The dispatch
> > > +		 * tables are kept in sync between FRED and IDT, and the FRED
> > > +		 * dispatch works well with CFI.
> > > +		 */
> > > +		fred_entry_from_kvm(event_type, vector);
> > 
> > Seems this landed in 7.1-rc5.
> > 
> > I'm seeing a build failure here:
> > 
> > arch/x86/entry/common.c: In function ‘x86_entry_from_kvm’:
> > arch/x86/entry/common.c:27:17: error: implicit declaration of function ‘fred_entry_from_kvm’; did you mean ‘idt_entry_from_kvm’? [-Wimplicit-function-declaration]
> >    27 |                 fred_entry_from_kvm(event_type, vector);
> >       |                 ^~~~~~~~~~~~~~~~~~~
> >       |                 idt_entry_from_kvm
> > arch/x86/entry/common.c:50:24: error: ‘return’ with a value, in function returning void [-Wreturn-mismatch]
> >    50 |                 return fred_entry_from_kvm(event_type, vector);
> >       |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > arch/x86/entry/common.c:18:14: note: declared here
> >    18 | noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
> >       |              ^~~~~~~~~~~~~~~~~~
> > 
> > 
> > 
> > > +#else
> > > +		idt_entry_from_kvm(vector);
> > > +#endif
> > > +		return;
> > > +	}
> > > +
> > > +	WARN_ON_ONCE(event_type != EVENT_TYPE_NMI);
> > > +
> > > +#ifdef CONFIG_X86_64
> > > +	if (cpu_feature_enabled(X86_FEATURE_FRED))
> > > +		return fred_entry_from_kvm(event_type, vector);
> > > +#endif
> > > +
> > > +	/*
> > > +	 * Notably, we must use IDT dispatch for NMI when running in IDT mode.
> > > +	 * The FRED NMI context is significantly different and will not work
> > > +	 * right (specifically FRED fixed the NMI recursion issue).
> > > +	 */
> > > +	idt_entry_from_kvm(vector);
> > > +}
> > > +EXPORT_SYMBOL_FOR_KVM(x86_entry_from_kvm);
> > > +#endif
> > > 
> > 
> > [snip]
> > 
> > > --- a/arch/x86/include/asm/entry-common.h
> > > +++ b/arch/x86/include/asm/entry-common.h
> > > @@ -97,4 +97,6 @@ static __always_inline void arch_exit_to
> > >  }
> > >  #define arch_exit_to_user_mode arch_exit_to_user_mode
> > >  
> > > +extern void x86_entry_from_kvm(unsigned int entry_type, unsigned int vector);
> > > +
> > >  #endif
> > > --- a/arch/x86/include/asm/fred.h
> > > +++ b/arch/x86/include/asm/fred.h
> > > @@ -110,7 +110,6 @@ static __always_inline unsigned long fre
> > >  static inline void cpu_init_fred_exceptions(void) { }
> > >  static inline void cpu_init_fred_rsps(void) { }
> > >  static inline void fred_complete_exception_setup(void) { }
> > > -static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { }
> > 
> > That seems still necessary for the !CONFIG_X86_FRED case.
> 
> The thing is, KVM_INTEL should force X86_FRED, I'm not sure how you can
> have both KVM_INTEL and !X86_FRED, that should be an invalid config.

Right. Thanks for the super fast response. Let me double check what is
going on here...

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
  2026-05-26  9:06       ` Peter Zijlstra
@ 2026-05-26 19:35         ` Nathan Chancellor
  0 siblings, 0 replies; 28+ messages in thread
From: Nathan Chancellor @ 2026-05-26 19:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, linux-kernel, Sean Christopherson, Jim Mattson, Binbin Wu,
	Vishal L Verma, kvm@vger.kernel.org, Rick P Edgecombe, Binbin Wu,
	x86@kernel.org, Paolo Bonzini, Calvin Owens

On Tue, May 26, 2026 at 11:06:31AM +0200, Peter Zijlstra wrote:
> 
> Sorry, I missed this :/

No worries, I should have pinged about it elsewhere sooner.

> Subject: x86/kvm/vmx: Fix x86_64 CFI build
> 
> I missed that idt_do_interrupt_irqoff() gets compiled on x86_64; this is
> a problem for CFI builds because it includes an unadorned indirect call.
> It is however completely dead code.
> 
> Rework things to not emit this function at all.
> 
> Fixes: 0701c9e17bd9 ("x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core")
> Reported-by: Nathan Chancellor <nathan@kernel.org>
> Reported-by: Calvin Owens <calvin@wbinvd.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Tested-by: Nathan Chancellor <nathan@kernel.org>

> ---
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 06c7c6ebd6f9..14cd43d4da6c 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -55,7 +55,7 @@ noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
>  	 * The FRED NMI context is significantly different and will not work
>  	 * right (specifically FRED fixed the NMI recursion issue).
>  	 */
> -	idt_entry_from_kvm(vector);
> +	idt_do_nmi_irqoff();
>  }
>  EXPORT_SYMBOL_FOR_KVM(x86_entry_from_kvm);
>  #endif
> diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
> index a56e043b266d..2bc217bb5475 100644
> --- a/arch/x86/entry/entry.S
> +++ b/arch/x86/entry/entry.S
> @@ -109,11 +109,13 @@ EXPORT_SYMBOL(__ref_stack_chk_guard);
>  	RET
>  .endm
>  
> +#ifndef CONFIG_X86_64
>  .pushsection .text, "ax"
>  SYM_FUNC_START(idt_do_interrupt_irqoff)
>  	IDT_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
>  SYM_FUNC_END(idt_do_interrupt_irqoff)
>  .popsection
> +#endif
>  
>  .pushsection .noinstr.text, "ax"
>  SYM_FUNC_START(idt_do_nmi_irqoff)
> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
> index 7bcf1decc034..90a22e24a9eb 100644
> --- a/arch/x86/kernel/idt.c
> +++ b/arch/x86/kernel/idt.c
> @@ -268,18 +268,10 @@ void __init idt_setup_early_pf(void)
>  }
>  #endif
>  
> -#if IS_ENABLED(CONFIG_KVM_INTEL)
> -noinstr void idt_entry_from_kvm(unsigned int vector)
> +#if IS_ENABLED(CONFIG_KVM_INTEL) && !defined(CONFIG_X86_64)
> +void idt_entry_from_kvm(unsigned int vector)
>  {
> -	if (vector == NMI_VECTOR)
> -		return idt_do_nmi_irqoff();
> -
> -	/*
> -	 * Only the NMI path requires noinstr.
> -	 */
> -	instrumentation_begin();
>  	idt_do_interrupt_irqoff(gate_offset(idt_table + vector));
> -	instrumentation_end();
>  }
>  #endif
>  

-- 
Cheers,
Nathan

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2026-05-26 19:35 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-04-23 15:56 [PATCH 0/2] x86/kvm/vmx: Fix VMX interrupt injection vs hrtimer_rearm_deferred() Peter Zijlstra
2026-04-23 15:56 ` [PATCH 1/2] x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core Peter Zijlstra
2026-04-23 17:54   ` Xin Li
2026-04-28  9:43   ` Binbin Wu
2026-04-28 11:25     ` Paolo Bonzini
2026-05-01 20:31       ` Peter Zijlstra
2026-05-01 20:37   ` [PATCH v2 " Peter Zijlstra
2026-05-08  2:54     ` Yan Zhao
2026-05-08  8:54       ` Peter Zijlstra
2026-05-08  6:09     ` Binbin Wu
2026-05-08  8:53       ` Peter Zijlstra
2026-05-08  8:56         ` Binbin Wu
2026-05-08  9:18   ` [PATCH v3 " Peter Zijlstra
2026-05-08  9:41     ` Binbin Wu
2026-05-12 22:31     ` Sean Christopherson
2026-05-18  8:12     ` Zhao Liu
2026-05-20 23:06     ` Nathan Chancellor
2026-05-26  9:06       ` Peter Zijlstra
2026-05-26 19:35         ` Nathan Chancellor
2026-05-26 10:01     ` Bezdeka, Florian
2026-05-26 10:21       ` Peter Zijlstra
2026-05-26 10:42         ` Florian Bezdeka
2026-04-23 15:56 ` [PATCH 2/2] x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred() Peter Zijlstra
2026-05-11 12:59   ` David Woodhouse
2026-05-12 22:32     ` Sean Christopherson
2026-05-15 18:15     ` Marc Dionne
2026-05-18  8:01   ` Zhao Liu
2026-05-18  8:16   ` Binbin Wu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox