public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: Thomas Gleixner <tglx@kernel.org>
Cc: Binbin Wu <binbin.wu@linux.intel.com>,
	"Verma, Vishal L" <vishal.l.verma@intel.com>,
	"kvm@vger.kernel.org" <kvm@vger.kernel.org>,
	"Edgecombe, Rick P" <rick.p.edgecombe@intel.com>,
	"Wu, Binbin" <binbin.wu@intel.com>,
	"x86@kernel.org" <x86@kernel.org>
Subject: Re: CPU Lockups in KVM with deferred hrtimer rearming
Date: Tue, 21 Apr 2026 15:29:13 +0200	[thread overview]
Message-ID: <20260421132913.GB1064669@noisy.programming.kicks-ass.net> (raw)
In-Reply-To: <20260421131953.GA1064669@noisy.programming.kicks-ass.net>

On Tue, Apr 21, 2026 at 03:19:53PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 21, 2026 at 02:05:31PM +0200, Peter Zijlstra wrote:
> > On Tue, Apr 21, 2026 at 01:49:40PM +0200, Peter Zijlstra wrote:
> > > On Tue, Apr 21, 2026 at 01:34:07PM +0200, Peter Zijlstra wrote:
> > > > On Tue, Apr 21, 2026 at 01:32:12PM +0200, Peter Zijlstra wrote:
> > > > > On Tue, Apr 21, 2026 at 01:18:58PM +0200, Peter Zijlstra wrote:
> > > > > > On Tue, Apr 21, 2026 at 09:39:14AM +0200, Thomas Gleixner wrote:
> > > > > > 
> > > > > > > ---
> > > > > > > Subject: entry: Enforce hrtimer rearming in the irqentry_exit path
> > > > > > > From: Thomas Gleixner <tglx@kernel.org>
> > > > > > > Date: Tue, 21 Apr 2026 09:00:52 +0200
> > > > > > > 
> > > > > > > irqentry_exit_to_kernel_mode_after_preempt() invokes
> > > > > > > hrtimer_rearm_deferred() only when the interrupted context had interrupts
> > > > > > > enabled. That's a correct decision because the timer interrupt can only be
> > > > > > > delivered in interrupt enabled contexts. The interrupt disabled path is
> > > > > > > used by exceptions and traps which never touch the hrtimer mechanics.
> > > > > > > 
> > > > > > > So much for the theory, but then there is VIRT which ruins everything.
> > > > > > > 
> > > > > > > KVM invokes regular interrupts with pt_regs which have interrupts
> > > > > > > disabled. That's correct from the KVM point of view, but completely
> > > > > > > violates the obviously correct expectations of the interrupt entry/exit
> > > > > > > code.
> > > > > > 
> > > > > > Mooo :-(
> > > > 
> > > > Also, is this a x86/KVM 'special' or is this true for all arch/KVM that
> > > > use GENERIC_ENTRY?
> > > 
> > > Should we not make asm_fred_entry_from_kvm()/VMX_DO_EVENT_IRQOFF fix IF
> > > on the fake frame instead? We know it will enable IRQs after doing
> > > handle_exit_irqoff() in vcpu_enter_guest().
> > 
> > Moo, you can't do that either, because it will ERETS/IRET and fuck up
> > the state :/
> 
> How insane is something like this?

Small matter of actually building...

---
diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
index 894f7f16eb80..cc2c961a5683 100644
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -98,6 +98,7 @@ SYM_FUNC_START(asm_fred_entry_from_kvm)
 	push %rdi			/* fred_ss handed in by the caller */
 	push %rbp
 	pushf
+	orq $X86_EFLAGS_KVM, (%rsp)
 	push $__KERNEL_CS
 
 	/*
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 0e8c611bc9e2..75568a85b2d3 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -43,6 +43,7 @@
 #define _ASM_SUB	__ASM_SIZE(sub)
 #define _ASM_XADD	__ASM_SIZE(xadd)
 #define _ASM_MUL	__ASM_SIZE(mul)
+#define _ASM_OR		__ASM_SIZE(or)
 
 #define _ASM_AX		__ASM_REG(ax)
 #define _ASM_BX		__ASM_REG(bx)
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index 7535131c711b..aab93f07e768 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -97,4 +97,16 @@ static __always_inline void arch_exit_to_user_mode(void)
 }
 #define arch_exit_to_user_mode arch_exit_to_user_mode
 
+static __always_inline void arch_exit_to_kernel_mode(struct pt_regs *regs)
+{
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+	/*
+	 * KVM is a reserved bit and must always be 0. Hardware will #GP on
+	 * IRET/ERETS with this bit set.
+	 */
+	regs->flags &= ~X86_EFLAGS_KVM;
+#endif
+}
+#define arch_exit_to_kernel_mode arch_exit_to_kernel_mode
+
 #endif
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 7bb7bd90355d..c31f7bc2eba2 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -311,7 +311,15 @@ void user_stack_pointer_set(struct pt_regs *regs, unsigned long val)
 
 static __always_inline bool regs_irqs_disabled(struct pt_regs *regs)
 {
-	return !(regs->flags & X86_EFLAGS_IF);
+	/*
+	 * return context | IF | KVM
+	 * ---------------+----+----
+	 * IRQ-off        |  0 |  0
+	 * IRQ-on         |  0 |  1
+	 * IRQ-on         |  1 |  0
+	 * invalid        |  1 |  1
+	 */
+	return (regs->flags & (X86_EFLAGS_IF | X86_EFLAGS_KVM)) == 0;
 }
 
 /* Query offset/name of register from its name/offset */
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 81d0c8bf1137..d32edefde587 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -14,6 +14,8 @@
 #define X86_EFLAGS_FIXED	_BITUL(X86_EFLAGS_FIXED_BIT)
 #define X86_EFLAGS_PF_BIT	2 /* Parity Flag */
 #define X86_EFLAGS_PF		_BITUL(X86_EFLAGS_PF_BIT)
+#define X86_EFLAGS_KVM_BIT	3 /* KVM Flag -- must be 0 */
+#define X86_EFLAGS_KVM		_BITUL(X86_EFLAGS_KVM_BIT)
 #define X86_EFLAGS_AF_BIT	4 /* Auxiliary carry Flag */
 #define X86_EFLAGS_AF		_BITUL(X86_EFLAGS_AF_BIT)
 #define X86_EFLAGS_ZF_BIT	6 /* Zero Flag */
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 8a481dae9cae..cb9ab3ce030b 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -6,6 +6,7 @@
 #include <asm/nospec-branch.h>
 #include <asm/percpu.h>
 #include <asm/segment.h>
+#include <asm/processor-flags.h>
 #include "kvm-asm-offsets.h"
 #include "run_flags.h"
 
@@ -50,6 +51,7 @@
 	push %rbp
 #endif
 	pushf
+	_ASM_OR $X86_EFLAGS_KVM, (%_ASM_SP)
 	push $__KERNEL_CS
 	\call_insn \call_target
 
diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index 167fba7dbf04..0acc20b63513 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -167,6 +167,10 @@ static __always_inline void arch_exit_to_user_mode(void);
 static __always_inline void arch_exit_to_user_mode(void) { }
 #endif
 
+#ifndef arch_exit_to_kernel_mode
+static __always_inline void arch_exit_to_kernel_mode(struct pt_regs *regs) { }
+#endif
+
 /**
  * arch_do_signal_or_restart -  Architecture specific signal delivery function
  * @regs:	Pointer to currents pt_regs
@@ -548,6 +552,7 @@ static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs,
 	instrumentation_end();
 
 	irqentry_exit_to_kernel_mode_after_preempt(regs, state);
+	arch_exit_to_kernel_mode(regs);
 }
 
 /**

  reply	other threads:[~2026-04-21 13:29 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-16 20:50 CPU Lockups in KVM with deferred hrtimer rearming Verma, Vishal L
2026-04-20 15:00 ` Thomas Gleixner
2026-04-20 15:22   ` Thomas Gleixner
2026-04-20 20:57   ` Verma, Vishal L
2026-04-20 22:19     ` Thomas Gleixner
2026-04-20 22:24       ` Verma, Vishal L
2026-04-21  6:29         ` Thomas Gleixner
2026-04-21  4:51   ` Binbin Wu
2026-04-21  7:39     ` Thomas Gleixner
2026-04-21 11:18       ` Peter Zijlstra
2026-04-21 11:32         ` Peter Zijlstra
2026-04-21 11:34           ` Peter Zijlstra
2026-04-21 11:49             ` Peter Zijlstra
2026-04-21 12:05               ` Peter Zijlstra
2026-04-21 13:19                 ` Peter Zijlstra
2026-04-21 13:29                   ` Peter Zijlstra [this message]
2026-04-21 16:36                     ` Thomas Gleixner
2026-04-21 17:11               ` Thomas Gleixner
2026-04-21 17:20                 ` Jim Mattson
2026-04-21 16:30           ` Thomas Gleixner
2026-04-21 16:11       ` Verma, Vishal L

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260421132913.GB1064669@noisy.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=binbin.wu@intel.com \
    --cc=binbin.wu@linux.intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=rick.p.edgecombe@intel.com \
    --cc=tglx@kernel.org \
    --cc=vishal.l.verma@intel.com \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox