From: Peter Zijlstra <peterz@infradead.org>
To: Thomas Gleixner <tglx@kernel.org>
Cc: Binbin Wu <binbin.wu@linux.intel.com>,
"Verma, Vishal L" <vishal.l.verma@intel.com>,
"kvm@vger.kernel.org" <kvm@vger.kernel.org>,
"Edgecombe, Rick P" <rick.p.edgecombe@intel.com>,
"Wu, Binbin" <binbin.wu@intel.com>,
"x86@kernel.org" <x86@kernel.org>
Subject: Re: CPU Lockups in KVM with deferred hrtimer rearming
Date: Tue, 21 Apr 2026 15:29:13 +0200 [thread overview]
Message-ID: <20260421132913.GB1064669@noisy.programming.kicks-ass.net> (raw)
In-Reply-To: <20260421131953.GA1064669@noisy.programming.kicks-ass.net>
On Tue, Apr 21, 2026 at 03:19:53PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 21, 2026 at 02:05:31PM +0200, Peter Zijlstra wrote:
> > On Tue, Apr 21, 2026 at 01:49:40PM +0200, Peter Zijlstra wrote:
> > > On Tue, Apr 21, 2026 at 01:34:07PM +0200, Peter Zijlstra wrote:
> > > > On Tue, Apr 21, 2026 at 01:32:12PM +0200, Peter Zijlstra wrote:
> > > > > On Tue, Apr 21, 2026 at 01:18:58PM +0200, Peter Zijlstra wrote:
> > > > > > On Tue, Apr 21, 2026 at 09:39:14AM +0200, Thomas Gleixner wrote:
> > > > > >
> > > > > > > ---
> > > > > > > Subject: entry: Enforce hrtimer rearming in the irqentry_exit path
> > > > > > > From: Thomas Gleixner <tglx@kernel.org>
> > > > > > > Date: Tue, 21 Apr 2026 09:00:52 +0200
> > > > > > >
> > > > > > > irqentry_exit_to_kernel_mode_after_preempt() invokes
> > > > > > > hrtimer_rearm_deferred() only when the interrupted context had interrupts
> > > > > > > enabled. That's a correct decision because the timer interrupt can only be
> > > > > > > delivered in interrupt enabled contexts. The interrupt disabled path is
> > > > > > > used by exceptions and traps which never touch the hrtimer mechanics.
> > > > > > >
> > > > > > > So much for the theory, but then there is VIRT which ruins everything.
> > > > > > >
> > > > > > > KVM invokes regular interrupts with pt_regs which have interrupts
> > > > > > > disabled. That's correct from the KVM point of view, but completely
> > > > > > > violates the obviously correct expectations of the interrupt entry/exit
> > > > > > > code.
> > > > > >
> > > > > > Mooo :-(
> > > >
> > > > Also, is this a x86/KVM 'special' or is this true for all arch/KVM that
> > > > use GENERIC_ENTRY?
> > >
> > > Should we not make asm_fred_entry_from_kvm()/VMX_DO_EVENT_IRQOFF fix IF
> > > on the fake frame instead? We know it will enable IRQs after doing
> > > handle_exit_irqoff() in vcpu_enter_guest().
> >
> > Moo, you can't do that either, because it will ERETS/IRET and fuck up
> > the state :/
>
> How insane is something like this?
Small matter of actually building...
---
diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
index 894f7f16eb80..cc2c961a5683 100644
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -98,6 +98,7 @@ SYM_FUNC_START(asm_fred_entry_from_kvm)
push %rdi /* fred_ss handed in by the caller */
push %rbp
pushf
+ orq $X86_EFLAGS_KVM, (%rsp)
push $__KERNEL_CS
/*
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 0e8c611bc9e2..75568a85b2d3 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -43,6 +43,7 @@
#define _ASM_SUB __ASM_SIZE(sub)
#define _ASM_XADD __ASM_SIZE(xadd)
#define _ASM_MUL __ASM_SIZE(mul)
+#define _ASM_OR __ASM_SIZE(or)
#define _ASM_AX __ASM_REG(ax)
#define _ASM_BX __ASM_REG(bx)
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index 7535131c711b..aab93f07e768 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -97,4 +97,16 @@ static __always_inline void arch_exit_to_user_mode(void)
}
#define arch_exit_to_user_mode arch_exit_to_user_mode
+static __always_inline void arch_exit_to_kernel_mode(struct pt_regs *regs)
+{
+#ifdef CONFIG_KVM_INTEL
+ /*
+ * KVM is a reserved bit and must always be 0. Hardware will #GP on
+ * IRET/ERETS with this bit set.
+ */
+ regs->flags &= ~X86_EFLAGS_KVM;
+#endif
+}
+#define arch_exit_to_kernel_mode arch_exit_to_kernel_mode
+
#endif
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 7bb7bd90355d..c31f7bc2eba2 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -311,7 +311,15 @@ void user_stack_pointer_set(struct pt_regs *regs, unsigned long val)
static __always_inline bool regs_irqs_disabled(struct pt_regs *regs)
{
- return !(regs->flags & X86_EFLAGS_IF);
+ /*
+ * return context | IF | KVM
+ * ---------------+----+----
+ * IRQ-off | 0 | 0
+ * IRQ-on | 0 | 1
+ * IRQ-on | 1 | 0
+ * invalid | 1 | 1
+ */
+ return (regs->flags & (X86_EFLAGS_IF | X86_EFLAGS_KVM)) == 0;
}
/* Query offset/name of register from its name/offset */
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 81d0c8bf1137..d32edefde587 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -14,6 +14,8 @@
#define X86_EFLAGS_FIXED _BITUL(X86_EFLAGS_FIXED_BIT)
#define X86_EFLAGS_PF_BIT 2 /* Parity Flag */
#define X86_EFLAGS_PF _BITUL(X86_EFLAGS_PF_BIT)
+#define X86_EFLAGS_KVM_BIT 3 /* KVM Flag -- must be 0 */
+#define X86_EFLAGS_KVM _BITUL(X86_EFLAGS_KVM_BIT)
#define X86_EFLAGS_AF_BIT 4 /* Auxiliary carry Flag */
#define X86_EFLAGS_AF _BITUL(X86_EFLAGS_AF_BIT)
#define X86_EFLAGS_ZF_BIT 6 /* Zero Flag */
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 8a481dae9cae..cb9ab3ce030b 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -6,6 +6,7 @@
#include <asm/nospec-branch.h>
#include <asm/percpu.h>
#include <asm/segment.h>
+#include <asm/processor-flags.h>
#include "kvm-asm-offsets.h"
#include "run_flags.h"
@@ -50,6 +51,7 @@
push %rbp
#endif
pushf
+ _ASM_OR $X86_EFLAGS_KVM, (%_ASM_SP)
push $__KERNEL_CS
\call_insn \call_target
diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index 167fba7dbf04..0acc20b63513 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -167,6 +167,10 @@ static __always_inline void arch_exit_to_user_mode(void);
static __always_inline void arch_exit_to_user_mode(void) { }
#endif
+#ifndef arch_exit_to_kernel_mode
+static __always_inline void arch_exit_to_kernel_mode(struct pt_regs *regs) { }
+#endif
+
/**
* arch_do_signal_or_restart - Architecture specific signal delivery function
* @regs: Pointer to currents pt_regs
@@ -548,6 +552,7 @@ static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs,
instrumentation_end();
irqentry_exit_to_kernel_mode_after_preempt(regs, state);
+ arch_exit_to_kernel_mode(regs);
}
/**
next prev parent reply other threads:[~2026-04-21 13:29 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-16 20:50 CPU Lockups in KVM with deferred hrtimer rearming Verma, Vishal L
2026-04-20 15:00 ` Thomas Gleixner
2026-04-20 15:22 ` Thomas Gleixner
2026-04-20 20:57 ` Verma, Vishal L
2026-04-20 22:19 ` Thomas Gleixner
2026-04-20 22:24 ` Verma, Vishal L
2026-04-21 6:29 ` Thomas Gleixner
2026-04-21 4:51 ` Binbin Wu
2026-04-21 7:39 ` Thomas Gleixner
2026-04-21 11:18 ` Peter Zijlstra
2026-04-21 11:32 ` Peter Zijlstra
2026-04-21 11:34 ` Peter Zijlstra
2026-04-21 11:49 ` Peter Zijlstra
2026-04-21 12:05 ` Peter Zijlstra
2026-04-21 13:19 ` Peter Zijlstra
2026-04-21 13:29 ` Peter Zijlstra [this message]
2026-04-21 16:36 ` Thomas Gleixner
2026-04-21 17:11 ` Thomas Gleixner
2026-04-21 17:20 ` Jim Mattson
2026-04-21 16:30 ` Thomas Gleixner
2026-04-21 16:11 ` Verma, Vishal L
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260421132913.GB1064669@noisy.programming.kicks-ass.net \
--to=peterz@infradead.org \
--cc=binbin.wu@intel.com \
--cc=binbin.wu@linux.intel.com \
--cc=kvm@vger.kernel.org \
--cc=rick.p.edgecombe@intel.com \
--cc=tglx@kernel.org \
--cc=vishal.l.verma@intel.com \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox