All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: Thomas Gleixner <tglx@kernel.org>
Cc: Binbin Wu <binbin.wu@linux.intel.com>,
	"Verma, Vishal L" <vishal.l.verma@intel.com>,
	"kvm@vger.kernel.org" <kvm@vger.kernel.org>,
	"Edgecombe, Rick P" <rick.p.edgecombe@intel.com>,
	"Wu, Binbin" <binbin.wu@intel.com>,
	"x86@kernel.org" <x86@kernel.org>
Subject: Re: CPU Lockups in KVM with deferred hrtimer rearming
Date: Tue, 21 Apr 2026 15:19:53 +0200	[thread overview]
Message-ID: <20260421131953.GA1064669@noisy.programming.kicks-ass.net> (raw)
In-Reply-To: <20260421120531.GF3102924@noisy.programming.kicks-ass.net>

On Tue, Apr 21, 2026 at 02:05:31PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 21, 2026 at 01:49:40PM +0200, Peter Zijlstra wrote:
> > On Tue, Apr 21, 2026 at 01:34:07PM +0200, Peter Zijlstra wrote:
> > > On Tue, Apr 21, 2026 at 01:32:12PM +0200, Peter Zijlstra wrote:
> > > > On Tue, Apr 21, 2026 at 01:18:58PM +0200, Peter Zijlstra wrote:
> > > > > On Tue, Apr 21, 2026 at 09:39:14AM +0200, Thomas Gleixner wrote:
> > > > > 
> > > > > > ---
> > > > > > Subject: entry: Enforce hrtimer rearming in the irqentry_exit path
> > > > > > From: Thomas Gleixner <tglx@kernel.org>
> > > > > > Date: Tue, 21 Apr 2026 09:00:52 +0200
> > > > > > 
> > > > > > irqentry_exit_to_kernel_mode_after_preempt() invokes
> > > > > > hrtimer_rearm_deferred() only when the interrupted context had interrupts
> > > > > > enabled. That's a correct decision because the timer interrupt can only be
> > > > > > delivered in interrupt enabled contexts. The interrupt disabled path is
> > > > > > used by exceptions and traps which never touch the hrtimer mechanics.
> > > > > > 
> > > > > > So much for the theory, but then there is VIRT which ruins everything.
> > > > > > 
> > > > > > KVM invokes regular interrupts with pt_regs which have interrupts
> > > > > > disabled. That's correct from the KVM point of view, but completely
> > > > > > violates the obviously correct expectations of the interrupt entry/exit
> > > > > > code.
> > > > > 
> > > > > Mooo :-(
> > > 
> > > Also, is this a x86/KVM 'special' or is this true for all arch/KVM that
> > > use GENERIC_ENTRY?
> > 
> > Should we not make asm_fred_entry_from_kvm()/VMX_DO_EVENT_IRQOFF fix IF
> > on the fake frame instead? We know it will enable IRQs after doing
> > handle_exit_irqoff() in vcpu_enter_guest().
> 
> Moo, you can't do that either, because it will ERETS/IRET and fuck up
> the state :/

How insane is something like this?

---
diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
index 894f7f16eb80..f3e2a8fde1ab 100644
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -98,6 +98,7 @@ SYM_FUNC_START(asm_fred_entry_from_kvm)
 	push %rdi			/* fred_ss handed in by the caller */
 	push %rbp
 	pushf
+	or $X86_EFLAGS_KVM, (%rsp)
 	push $__KERNEL_CS
 
 	/*
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index 7535131c711b..aab93f07e768 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -97,4 +97,16 @@ static __always_inline void arch_exit_to_user_mode(void)
 }
 #define arch_exit_to_user_mode arch_exit_to_user_mode
 
+static __always_inline void arch_exit_to_kernel_mode(struct pt_regs *regs)
+{
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+	/*
+	 * KVM is a reserved bit and must always be 0. Hardware will #GP on
+	 * IRET/ERETS with this bit set.
+	 */
+	regs->flags &= ~X86_EFLAGS_KVM;
+#endif
+}
+#define arch_exit_to_kernel_mode arch_exit_to_kernel_mode
+
 #endif
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 7bb7bd90355d..c31f7bc2eba2 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -311,7 +311,15 @@ void user_stack_pointer_set(struct pt_regs *regs, unsigned long val)
 
 static __always_inline bool regs_irqs_disabled(struct pt_regs *regs)
 {
-	return !(regs->flags & X86_EFLAGS_IF);
+	/*
+	 * return context | IF | KVM
+	 * ---------------+----+----
+	 * IRQ-off        |  0 |  0
+	 * IRQ-on         |  0 |  1
+	 * IRQ-on         |  1 |  0
+	 * invalid        |  1 |  1
+	 */
+	return (regs->flags & (X86_EFLAGS_IF | X86_EFLAGS_KVM)) == 0;
 }
 
 /* Query offset/name of register from its name/offset */
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 81d0c8bf1137..d32edefde587 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -14,6 +14,8 @@
 #define X86_EFLAGS_FIXED	_BITUL(X86_EFLAGS_FIXED_BIT)
 #define X86_EFLAGS_PF_BIT	2 /* Parity Flag */
 #define X86_EFLAGS_PF		_BITUL(X86_EFLAGS_PF_BIT)
+#define X86_EFLAGS_KVM_BIT	3 /* KVM Flag -- must be 0 */
+#define X86_EFLAGS_KVM		_BITUL(X86_EFLAGS_KVM_BIT)
 #define X86_EFLAGS_AF_BIT	4 /* Auxiliary carry Flag */
 #define X86_EFLAGS_AF		_BITUL(X86_EFLAGS_AF_BIT)
 #define X86_EFLAGS_ZF_BIT	6 /* Zero Flag */
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 8a481dae9cae..3d0d0fb8de79 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -50,6 +50,7 @@
 	push %rbp
 #endif
 	pushf
+	or $X86_EFLAGS_KVM, (%_ASM_SP)
 	push $__KERNEL_CS
 	\call_insn \call_target
 
diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index 167fba7dbf04..0acc20b63513 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -167,6 +167,10 @@ static __always_inline void arch_exit_to_user_mode(void);
 static __always_inline void arch_exit_to_user_mode(void) { }
 #endif
 
+#ifndef arch_exit_to_kernel_mode
+static __always_inline void arch_exit_to_kernel_mode(struct pt_regs *regs) { }
+#endif
+
 /**
  * arch_do_signal_or_restart -  Architecture specific signal delivery function
  * @regs:	Pointer to currents pt_regs
@@ -548,6 +552,7 @@ static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs,
 	instrumentation_end();
 
 	irqentry_exit_to_kernel_mode_after_preempt(regs, state);
+	arch_exit_to_kernel_mode(regs);
 }
 
 /**

  reply	other threads:[~2026-04-21 13:19 UTC|newest]

Thread overview: 48+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-16 20:50 CPU Lockups in KVM with deferred hrtimer rearming Verma, Vishal L
2026-04-20 15:00 ` Thomas Gleixner
2026-04-20 15:22   ` Thomas Gleixner
2026-04-20 20:57   ` Verma, Vishal L
2026-04-20 22:19     ` Thomas Gleixner
2026-04-20 22:24       ` Verma, Vishal L
2026-04-21  6:29         ` Thomas Gleixner
2026-04-21  4:51   ` Binbin Wu
2026-04-21  7:39     ` Thomas Gleixner
2026-04-21 11:18       ` Peter Zijlstra
2026-04-21 11:32         ` Peter Zijlstra
2026-04-21 11:34           ` Peter Zijlstra
2026-04-21 11:49             ` Peter Zijlstra
2026-04-21 12:05               ` Peter Zijlstra
2026-04-21 13:19                 ` Peter Zijlstra [this message]
2026-04-21 13:29                   ` Peter Zijlstra
2026-04-21 16:36                     ` Thomas Gleixner
2026-04-21 18:11                     ` Verma, Vishal L
2026-04-21 17:11               ` Thomas Gleixner
2026-04-21 17:20                 ` Jim Mattson
2026-04-21 18:29                   ` Thomas Gleixner
2026-04-21 18:55                     ` Sean Christopherson
2026-04-21 20:06                       ` Peter Zijlstra
2026-04-21 20:46                         ` Peter Zijlstra
2026-04-21 20:57                         ` Sean Christopherson
2026-04-21 21:02                           ` Peter Zijlstra
2026-04-21 21:42                             ` Sean Christopherson
2026-04-22  6:55                               ` Peter Zijlstra
2026-04-22  7:46                                 ` Peter Zijlstra
2026-04-22 14:08                                   ` Peter Zijlstra
2026-04-22 15:26                                     ` Sean Christopherson
2026-04-22 19:13                                   ` Verma, Vishal L
2026-04-22 22:57                                   ` Thomas Gleixner
2026-04-23 15:23                                     ` Peter Zijlstra
2026-04-22 13:47                                 ` Sean Christopherson
2026-04-21 20:39                       ` Paolo Bonzini
2026-04-21 21:02                         ` Sean Christopherson
2026-04-21 22:48                         ` Thomas Gleixner
2026-04-21 23:15                           ` Paolo Bonzini
2026-04-21 23:34                             ` Jim Mattson
2026-04-21 23:37                               ` Paolo Bonzini
2026-04-22  2:10                             ` Thomas Gleixner
2026-04-21 21:49                       ` Thomas Gleixner
2026-04-21 22:07                         ` Sean Christopherson
2026-04-21 22:24                         ` Paolo Bonzini
2026-04-21 19:18                 ` Verma, Vishal L
2026-04-21 16:30           ` Thomas Gleixner
2026-04-21 16:11       ` Verma, Vishal L

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260421131953.GA1064669@noisy.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=binbin.wu@intel.com \
    --cc=binbin.wu@linux.intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=rick.p.edgecombe@intel.com \
    --cc=tglx@kernel.org \
    --cc=vishal.l.verma@intel.com \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.