From: Peter Zijlstra <peterz@infradead.org>
To: Sean Christopherson <seanjc@google.com>
Cc: Thomas Gleixner <tglx@kernel.org>,
Jim Mattson <jmattson@google.com>,
Binbin Wu <binbin.wu@linux.intel.com>,
Vishal L Verma <vishal.l.verma@intel.com>,
"kvm@vger.kernel.org" <kvm@vger.kernel.org>,
Rick P Edgecombe <rick.p.edgecombe@intel.com>,
Binbin Wu <binbin.wu@intel.com>,
"x86@kernel.org" <x86@kernel.org>,
Paolo Bonzini <bonzini@redhat.com>
Subject: Re: CPU Lockups in KVM with deferred hrtimer rearming
Date: Wed, 22 Apr 2026 09:46:46 +0200 [thread overview]
Message-ID: <20260422074646.GO3126523@noisy.programming.kicks-ass.net> (raw)
In-Reply-To: <20260422065542.GN3126523@noisy.programming.kicks-ass.net>
On Wed, Apr 22, 2026 at 08:55:42AM +0200, Peter Zijlstra wrote:
> On Tue, Apr 21, 2026 at 02:42:46PM -0700, Sean Christopherson wrote:
> > On Tue, Apr 21, 2026, Peter Zijlstra wrote:
> > > On Tue, Apr 21, 2026 at 08:57:24PM +0000, Sean Christopherson wrote:
> > > > This as delta? (I had typed this all up before Peter posted a new verison, so
> > > > dammit I'm sending it!)
> > >
> > > :-)
> > >
> > > I'll go stare at it in the morning, I'm about to go crash out.
> >
> > New delta against your effective v2, builds all of my configs (which isn't _that_
> > many, but I think they cover most of the weirder ways to include KVM (or not)).
> >
> > I'll start testing the full thing to try and get early signal on the health.
>
> Oh, re-reading commit 28d11e4548b7, I think this wrecks NMIs. The patch
> will also use the FRED NMI path and that's not good when running IDT.
>
> I'll go fix that and stick in a few more comments to clarify this magic.
This should probably be at least two patches, one moving the code into
the x86 core and one adding that hrtimer fix on top. But kept as one for
now.
Irrespective of the hrtimer fix, I think the initial move into x86 core
part is a sane move.
---
Subject: x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86
Move the VMX interrupt dispatch magic into the x86 core code. This
isolates KVM from the FRED/IDT decisions and reduces the number of
EXPORT_SYMBOL_FOR_KVM() exports.
Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 72cae8e0ce85..83b4762d6ecb 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -13,7 +13,7 @@ CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE)
CFLAGS_syscall_32.o += -fno-stack-protector
CFLAGS_syscall_64.o += -fno-stack-protector
-obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o
+obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o common.o
obj-y += vdso/
obj-y += vsyscall/
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
new file mode 100644
index 000000000000..c8a9ddd3e809
--- /dev/null
+++ b/arch/x86/entry/common.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/entry-common.h>
+#include <linux/kvm_types.h>
+#include <linux/hrtimer_rearm.h>
+#include <asm/fred.h>
+#include <asm/desc.h>
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+/*
+ * On VMX, NMIs and IRQs (as configured by KVM) are acknowledged by hardware as
+ * part of the VM-Exit, i.e. the event itself is consumed as part of the VM-Exit.
+ * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs
+ * to the kernel for servicing. On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is
+ * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered
+ * the VM-Exit is held pending until it's unblocked in the host.
+ */
+noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector)
+{
+ if (event_type == EVENT_TYPE_EXTINT) {
+#ifdef CONFIG_X86_64
+ /*
+ * Use FRED dispatch, even when running IDT. The dispatch
+ * tables are kept in sync between FRED and IDT, and the FRED
+ * dispatch works well with CFI.
+ */
+ fred_entry_from_kvm(event_type, vector);
+#else
+ idt_entry_from_kvm(vector);
+#endif
+ /*
+ * Strictly speaking, only the NMI path requires noinstr.
+ */
+ instrumentation_begin();
+ /*
+ * KVM/VMX dispatches with IRQs disabled on behalf of a context
+ * that will have IRQs enabled. This confuses the entry code,
+ * which will not have reprogrammed the timer (or done
+ * preemption). Minimal fixup for now.
+ */
+ hrtimer_rearm_deferred();
+ instrumentation_end();
+
+ return;
+ }
+
+ WARN_ON_ONCE(event_type != EVENT_TYPE_NMI);
+
+#ifdef CONFIG_X86_64
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ return fred_entry_from_kvm(event_type, vector);
+#endif
+
+ /*
+ * Notably, we must use IDT dispatch for NMI when running in IDT mode.
+ * The FRED NMI context is significantly different and will not work
+ * right (specifically, FRED fixed the NMI recursion issue).
+ */
+ idt_entry_from_kvm(vector);
+}
+EXPORT_SYMBOL_FOR_KVM(x86_entry_from_kvm);
+#endif
diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
index 6ba2b3adcef0..16f0a5b253e2 100644
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -75,3 +75,49 @@ THUNK warn_thunk_thunk, __warn_thunk
#if defined(CONFIG_STACKPROTECTOR) && defined(CONFIG_SMP)
EXPORT_SYMBOL(__ref_stack_chk_guard);
#endif
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+.macro IDT_DO_EVENT_IRQOFF call_insn call_target
+ /*
+ * Unconditionally create a stack frame, getting the correct RSP on the
+ * stack (for x86-64) would take two instructions anyways, and RBP can
+ * be used to restore RSP to make objtool happy (see below).
+ */
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+
+#ifdef CONFIG_X86_64
+ /*
+ * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
+ * creating the synthetic interrupt stack frame for the IRQ/NMI.
+ */
+ and $-16, %rsp
+ push $__KERNEL_DS
+ push %rbp
+#endif
+ pushf
+ push $__KERNEL_CS
+ \call_insn \call_target
+
+ /*
+ * "Restore" RSP from RBP, even though IRET has already unwound RSP to
+ * the correct value. objtool doesn't know the callee will IRET and,
+ * without the explicit restore, thinks the stack is getting walloped.
+ * Using an unwind hint is problematic due to x86-64's dynamic alignment.
+ */
+ leave
+ RET
+.endm
+
+.pushsection .text, "ax"
+SYM_FUNC_START(idt_do_interrupt_irqoff)
+ IDT_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
+SYM_FUNC_END(idt_do_interrupt_irqoff)
+.popsection
+
+.pushsection .noinstr.text, "ax"
+SYM_FUNC_START(idt_do_nmi_irqoff)
+ IDT_DO_EVENT_IRQOFF call asm_exc_nmi
+SYM_FUNC_END(idt_do_nmi_irqoff)
+.popsection
+#endif
diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
index 894f7f16eb80..0d2768ab836c 100644
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -147,5 +147,4 @@ SYM_FUNC_START(asm_fred_entry_from_kvm)
RET
SYM_FUNC_END(asm_fred_entry_from_kvm)
-EXPORT_SYMBOL_FOR_KVM(asm_fred_entry_from_kvm);
#endif
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index ec95fe44fa3a..f44d6a606b4c 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -438,6 +438,10 @@ extern void idt_setup_traps(void);
extern void idt_setup_apic_and_irq_gates(void);
extern bool idt_is_f00f_address(unsigned long address);
+extern void idt_do_interrupt_irqoff(unsigned int vector);
+extern void idt_do_nmi_irqoff(void);
+extern void idt_entry_from_kvm(unsigned int vector);
+
#ifdef CONFIG_X86_64
extern void idt_setup_early_pf(void);
#else
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index 7e6b9314758a..2f2ce8aadf07 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -145,7 +145,7 @@ struct gate_struct {
typedef struct gate_struct gate_desc;
#ifndef _SETUP
-static inline unsigned long gate_offset(const gate_desc *g)
+static __always_inline unsigned long gate_offset(const gate_desc *g)
{
#ifdef CONFIG_X86_64
return g->offset_low | ((unsigned long)g->offset_middle << 16) |
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index 7535131c711b..eca24b5e07f4 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -97,4 +97,6 @@ static __always_inline void arch_exit_to_user_mode(void)
}
#define arch_exit_to_user_mode arch_exit_to_user_mode
+extern void x86_entry_from_kvm(unsigned int entry_type, unsigned int vector);
+
#endif
diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
index 2bb65677c079..18a2f811c358 100644
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -110,7 +110,6 @@ static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { ret
static inline void cpu_init_fred_exceptions(void) { }
static inline void cpu_init_fred_rsps(void) { }
static inline void fred_complete_exception_setup(void) { }
-static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { }
static inline void fred_sync_rsp0(unsigned long rsp0) { }
static inline void fred_update_rsp0(void) { }
#endif /* CONFIG_X86_FRED */
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 42bf6a58ec36..db4072875f5f 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -633,17 +633,6 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check);
#endif
/* NMI */
-
-#if IS_ENABLED(CONFIG_KVM_INTEL)
-/*
- * Special entry point for VMX which invokes this on the kernel stack, even for
- * 64-bit, i.e. without using an IST. asm_exc_nmi() requires an IST to work
- * correctly vs. the NMI 'executing' marker. Used for 32-bit kernels as well
- * to avoid more ifdeffery.
- */
-DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_kvm_vmx);
-#endif
-
DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);
#ifdef CONFIG_XEN_PV
DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi);
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 260456588756..0e8fb61f63ff 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -268,6 +268,19 @@ void __init idt_setup_early_pf(void)
}
#endif
+noinstr void idt_entry_from_kvm(unsigned int vector)
+{
+ if (vector == NMI_VECTOR)
+ return idt_do_nmi_irqoff();
+
+ /*
+ * Only the NMI path requires noinstr.
+ */
+ instrumentation_begin();
+ idt_do_interrupt_irqoff(gate_offset(idt_table + vector));
+ instrumentation_end();
+}
+
static void __init idt_map_in_cea(void)
{
/*
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 3d239ed12744..06fe225fb0a2 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -609,14 +609,6 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
goto nmi_restart;
}
-#if IS_ENABLED(CONFIG_KVM_INTEL)
-DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx)
-{
- exc_nmi(regs);
-}
-EXPORT_SYMBOL_FOR_KVM(asm_exc_nmi_kvm_vmx);
-#endif
-
#ifdef CONFIG_NMI_CHECK_CPU
static char *nmi_check_stall_msg[] = {
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 8a481dae9cae..ff1f254a0ef4 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -31,38 +31,6 @@
#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
#endif
-.macro VMX_DO_EVENT_IRQOFF call_insn call_target
- /*
- * Unconditionally create a stack frame, getting the correct RSP on the
- * stack (for x86-64) would take two instructions anyways, and RBP can
- * be used to restore RSP to make objtool happy (see below).
- */
- push %_ASM_BP
- mov %_ASM_SP, %_ASM_BP
-
-#ifdef CONFIG_X86_64
- /*
- * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
- * creating the synthetic interrupt stack frame for the IRQ/NMI.
- */
- and $-16, %rsp
- push $__KERNEL_DS
- push %rbp
-#endif
- pushf
- push $__KERNEL_CS
- \call_insn \call_target
-
- /*
- * "Restore" RSP from RBP, even though IRET has already unwound RSP to
- * the correct value. objtool doesn't know the callee will IRET and,
- * without the explicit restore, thinks the stack is getting walloped.
- * Using an unwind hint is problematic due to x86-64's dynamic alignment.
- */
- leave
- RET
-.endm
-
.section .noinstr.text, "ax"
/**
@@ -320,10 +288,6 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL)
SYM_FUNC_END(__vmx_vcpu_run)
-SYM_FUNC_START(vmx_do_nmi_irqoff)
- VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
-SYM_FUNC_END(vmx_do_nmi_irqoff)
-
#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
/**
@@ -375,13 +339,3 @@ SYM_FUNC_START(vmread_error_trampoline)
RET
SYM_FUNC_END(vmread_error_trampoline)
#endif
-
-.section .text, "ax"
-
-#ifndef CONFIG_X86_FRED
-
-SYM_FUNC_START(vmx_do_interrupt_irqoff)
- VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
-SYM_FUNC_END(vmx_do_interrupt_irqoff)
-
-#endif
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a29896a9ef14..753f0dbb9cf8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7083,9 +7083,6 @@ void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}
-void vmx_do_interrupt_irqoff(unsigned long entry);
-void vmx_do_nmi_irqoff(void);
-
static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
{
/*
@@ -7127,17 +7124,9 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
"unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
- /*
- * Invoke the kernel's IRQ handler for the vector. Use the FRED path
- * when it's available even if FRED isn't fully enabled, e.g. even if
- * FRED isn't supported in hardware, in order to avoid the indirect
- * CALL in the non-FRED path.
- */
+ /* Forward the IRQ to the core kernel for processing. */
kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
- if (IS_ENABLED(CONFIG_X86_FRED))
- fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
- else
- vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
+ x86_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
kvm_after_interrupt(vcpu);
vcpu->arch.at_instruction_boundary = true;
@@ -7447,10 +7436,7 @@ noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu)
return;
kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
- if (cpu_feature_enabled(X86_FEATURE_FRED))
- fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
- else
- vmx_do_nmi_irqoff();
+ x86_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
kvm_after_interrupt(vcpu);
}