Linux-HyperV List
 help / color / mirror / Atom feed
* [PATCH v4 28/47] x86/paravirt: Move handling of unstable PV clocks into paravirt_set_sched_clock()
From: Sean Christopherson @ 2026-05-29 15:07 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

Move the handling of unstable PV clocks, of which kvmclock is the only
example, into paravirt_set_sched_clock().  This will allow modifying
paravirt_set_sched_clock() to keep using the TSC for sched_clock in
certain scenarios without unintentionally marking the TSC-based clock as
unstable.

No functional change intended.

Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/timer.h | 7 ++++++-
 arch/x86/kernel/kvmclock.c   | 5 +----
 arch/x86/kernel/tsc.c        | 5 ++++-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index c71b466d6ace..fe41d40a9ae6 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -14,7 +14,12 @@ extern int no_timer_check;
 extern bool using_native_sched_clock(void);
 
 #ifdef CONFIG_PARAVIRT
-void paravirt_set_sched_clock(u64 (*func)(void));
+void __paravirt_set_sched_clock(u64 (*func)(void), bool stable);
+
+static inline void paravirt_set_sched_clock(u64 (*func)(void))
+{
+	__paravirt_set_sched_clock(func, true);
+}
 #endif
 
 /*
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 13c4be3a7f0a..4e50e75ff43d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -12,7 +12,6 @@
 #include <linux/hardirq.h>
 #include <linux/cpuhotplug.h>
 #include <linux/sched.h>
-#include <linux/sched/clock.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/set_memory.h>
@@ -94,10 +93,8 @@ static noinstr u64 kvm_sched_clock_read(void)
 
 static inline void kvm_sched_clock_init(bool stable)
 {
-	if (!stable)
-		clear_sched_clock_stable();
 	kvm_sched_clock_offset = kvm_clock_read();
-	paravirt_set_sched_clock(kvm_sched_clock_read);
+	__paravirt_set_sched_clock(kvm_sched_clock_read, stable);
 
 	pr_info("kvm-clock: using sched offset of %llu cycles",
 		kvm_sched_clock_offset);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 888bd1cbd9bc..a9b6d3399c23 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -280,8 +280,11 @@ bool using_native_sched_clock(void)
 	return static_call_query(pv_sched_clock) == native_sched_clock;
 }
 
-void paravirt_set_sched_clock(u64 (*func)(void))
+void __paravirt_set_sched_clock(u64 (*func)(void), bool stable)
 {
+	if (!stable)
+		clear_sched_clock_stable();
+
 	static_call_update(pv_sched_clock, func);
 }
 #else
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 29/47] x86/kvmclock: Move sched_clock save/restore helpers up in kvmclock.c
From: Sean Christopherson @ 2026-05-29 15:07 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

Move kvmclock's sched_clock save/restore helper "up" so that they can
(eventually) be referenced by kvm_sched_clock_init().

No functional change intended.

Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kernel/kvmclock.c | 108 ++++++++++++++++++-------------------
 1 file changed, 54 insertions(+), 54 deletions(-)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 4e50e75ff43d..0d4f2cf97246 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -71,6 +71,25 @@ static int kvm_set_wallclock(const struct timespec64 *now)
 	return -ENODEV;
 }
 
+static void kvm_register_clock(char *txt)
+{
+	struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
+	u64 pa;
+
+	if (!src)
+		return;
+
+	pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;
+	wrmsrq(msr_kvm_system_time, pa);
+	pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
+}
+
+static void kvmclock_disable(void)
+{
+	if (msr_kvm_system_time)
+		native_write_msr(msr_kvm_system_time, 0);
+}
+
 static u64 kvm_clock_read(void)
 {
 	u64 ret;
@@ -91,6 +110,30 @@ static noinstr u64 kvm_sched_clock_read(void)
 	return pvclock_clocksource_read_nowd(this_cpu_pvti()) - kvm_sched_clock_offset;
 }
 
+static void kvm_save_sched_clock_state(void)
+{
+	/*
+	 * Stop host writes to kvmclock immediately prior to suspend/hibernate.
+	 * If the system is hibernating, then kvmclock will likely reside at a
+	 * different physical address when the system awakens, and host writes
+	 * to the old address prior to reconfiguring kvmclock would clobber
+	 * random memory.
+	 */
+	kvmclock_disable();
+}
+
+#ifdef CONFIG_SMP
+static void kvm_setup_secondary_clock(void)
+{
+	kvm_register_clock("secondary cpu clock");
+}
+#endif
+
+static void kvm_restore_sched_clock_state(void)
+{
+	kvm_register_clock("primary cpu clock, resume");
+}
+
 static inline void kvm_sched_clock_init(bool stable)
 {
 	kvm_sched_clock_offset = kvm_clock_read();
@@ -103,6 +146,17 @@ static inline void kvm_sched_clock_init(bool stable)
 		sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
 }
 
+void kvmclock_cpu_action(enum kvm_guest_cpu_action action)
+{
+	/*
+	 * Don't disable kvmclock on the BSP during suspend.  If kvmclock is
+	 * being used for sched_clock, then it needs to be kept alive until the
+	 * last minute, and restored as quickly as possible after resume.
+	 */
+	if (action != KVM_GUEST_BSP_SUSPEND)
+		kvmclock_disable();
+}
+
 /*
  * If we don't do that, there is the possibility that the guest
  * will calibrate under heavy load - thus, getting a lower lpj -
@@ -161,60 +215,6 @@ static struct clocksource kvm_clock = {
 	.enable	= kvm_cs_enable,
 };
 
-static void kvm_register_clock(char *txt)
-{
-	struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
-	u64 pa;
-
-	if (!src)
-		return;
-
-	pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;
-	wrmsrq(msr_kvm_system_time, pa);
-	pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
-}
-
-static void kvmclock_disable(void)
-{
-	if (msr_kvm_system_time)
-		native_write_msr(msr_kvm_system_time, 0);
-}
-
-static void kvm_save_sched_clock_state(void)
-{
-	/*
-	 * Stop host writes to kvmclock immediately prior to suspend/hibernate.
-	 * If the system is hibernating, then kvmclock will likely reside at a
-	 * different physical address when the system awakens, and host writes
-	 * to the old address prior to reconfiguring kvmclock would clobber
-	 * random memory.
-	 */
-	kvmclock_disable();
-}
-
-static void kvm_restore_sched_clock_state(void)
-{
-	kvm_register_clock("primary cpu clock, resume");
-}
-
-void kvmclock_cpu_action(enum kvm_guest_cpu_action action)
-{
-	/*
-	 * Don't disable kvmclock on the BSP during suspend.  If kvmclock is
-	 * being used for sched_clock, then it needs to be kept alive until the
-	 * last minute, and restored as quickly as possible after resume.
-	 */
-	if (action != KVM_GUEST_BSP_SUSPEND)
-		kvmclock_disable();
-}
-
-#ifdef CONFIG_SMP
-static void kvm_setup_secondary_clock(void)
-{
-	kvm_register_clock("secondary cpu clock");
-}
-#endif
-
 static void __init kvmclock_init_mem(void)
 {
 	unsigned long ncpus;
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 30/47] x86/xen/time: NOP-ify x86_platform's sched_clock save/restore hooks
From: Sean Christopherson @ 2026-05-29 15:07 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

NOP-ify the x86_platform sched_clock save/restore hooks when setting up
Xen's PV clock to make it somewhat obvious the hooks aren't used when
running as a Xen guest (Xen uses a paravirtualized suspend/resume flow).

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/xen/time.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 36d66abf5379..640b71d22d97 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -578,6 +578,12 @@ static void __init xen_init_time_common(void)
 	xen_sched_clock_offset = xen_clocksource_read();
 	static_call_update(pv_steal_clock, xen_steal_clock);
 	paravirt_set_sched_clock(xen_sched_clock);
+	/*
+	 * Xen has paravirtualized suspend/resume and so doesn't use the common
+	 * x86 sched_clock save/restore hooks.
+	 */
+	x86_platform.save_sched_clock_state = x86_init_noop;
+	x86_platform.restore_sched_clock_state = x86_init_noop;
 
 	x86_init.hyper.get_tsc_khz = xen_tsc_khz;
 	x86_platform.get_wallclock = xen_get_wallclock;
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 31/47] x86/vmware: NOP-ify save/restore hooks when using VMware's sched_clock
From: Sean Christopherson @ 2026-05-29 15:07 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

NOP-ify the sched_clock save/restore hooks when using VMware's version of
sched_clock.  This will allow extending paravirt_set_sched_clock() to set
the save/restore hooks, without having to simultaneously change the
behavior of VMware guests.

Note, it's not at all obvious that it's safe/correct for VMware guests to
do nothing on suspend/resume, but that's a pre-existing problem.  Leave it
for a VMware expert to sort out.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kernel/cpu/vmware.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 2d0624c66799..051ef89029a7 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -347,8 +347,11 @@ static void __init vmware_paravirt_ops_setup(void)
 
 	vmware_cyc2ns_setup();
 
-	if (vmw_sched_clock)
+	if (vmw_sched_clock) {
 		paravirt_set_sched_clock(vmware_sched_clock);
+		x86_platform.save_sched_clock_state = x86_init_noop;
+		x86_platform.restore_sched_clock_state = x86_init_noop;
+	}
 
 	if (vmware_is_stealclock_available()) {
 		has_steal_clock = true;
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 32/47] x86/tsc: WARN if TSC sched_clock save/restore used with PV sched_clock
From: Sean Christopherson @ 2026-05-29 15:07 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

Now that all PV clocksources override the sched_clock save/restore hooks
when overriding sched_clock, WARN if the "default" TSC hooks are invoked
when using a PV sched_clock, e.g. to guard against regressions.

Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kernel/tsc.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a9b6d3399c23..19da1a3d2126 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -972,9 +972,17 @@ EXPORT_SYMBOL_GPL(recalibrate_cpu_khz);
 
 static unsigned long long cyc2ns_suspend;
 
+static __always_inline bool tsc_is_save_restore_needed(void)
+{
+	if (WARN_ON_ONCE(!using_native_sched_clock()))
+		return false;
+
+	return static_branch_likely(&__use_tsc) || sched_clock_stable();
+}
+
 void tsc_save_sched_clock_state(void)
 {
-	if (!static_branch_likely(&__use_tsc) && !sched_clock_stable())
+	if (!tsc_is_save_restore_needed())
 		return;
 
 	cyc2ns_suspend = sched_clock();
@@ -994,7 +1002,7 @@ void tsc_restore_sched_clock_state(void)
 	unsigned long flags;
 	int cpu;
 
-	if (!static_branch_likely(&__use_tsc) && !sched_clock_stable())
+	if (!tsc_is_save_restore_needed())
 		return;
 
 	local_irq_save(flags);
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 33/47] x86/paravirt: Pass sched_clock save/restore helpers during registration
From: Sean Christopherson @ 2026-05-29 15:07 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

Pass in a PV clock's save/restore helpers when configuring sched_clock
instead of relying on each PV clock to manually set the save/restore hooks.
In addition to bringing sanity to the code, this will allow gracefully
"rejecting" a PV sched_clock, e.g. when running as a CoCo guest that has
access to a "secure" TSC.

No functional change intended.

Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/timer.h       | 9 ++++++---
 arch/x86/kernel/cpu/vmware.c       | 8 +++-----
 arch/x86/kernel/kvmclock.c         | 6 +++---
 arch/x86/kernel/tsc.c              | 5 ++++-
 arch/x86/xen/time.c                | 5 ++---
 drivers/clocksource/hyperv_timer.c | 6 ++----
 6 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index fe41d40a9ae6..e97cd1ae03d1 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -14,11 +14,14 @@ extern int no_timer_check;
 extern bool using_native_sched_clock(void);
 
 #ifdef CONFIG_PARAVIRT
-void __paravirt_set_sched_clock(u64 (*func)(void), bool stable);
+void __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
+				void (*save)(void), void (*restore)(void));
 
-static inline void paravirt_set_sched_clock(u64 (*func)(void))
+static inline void paravirt_set_sched_clock(u64 (*func)(void),
+					    void (*save)(void),
+					    void (*restore)(void))
 {
-	__paravirt_set_sched_clock(func, true);
+	__paravirt_set_sched_clock(func, true, save, restore);
 }
 #endif
 
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 051ef89029a7..f3ffc05c7c53 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -347,11 +347,9 @@ static void __init vmware_paravirt_ops_setup(void)
 
 	vmware_cyc2ns_setup();
 
-	if (vmw_sched_clock) {
-		paravirt_set_sched_clock(vmware_sched_clock);
-		x86_platform.save_sched_clock_state = x86_init_noop;
-		x86_platform.restore_sched_clock_state = x86_init_noop;
-	}
+	if (vmw_sched_clock)
+		paravirt_set_sched_clock(vmware_sched_clock,
+					 x86_init_noop, x86_init_noop);
 
 	if (vmware_is_stealclock_available()) {
 		has_steal_clock = true;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 0d4f2cf97246..05bca2be0df3 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -137,7 +137,9 @@ static void kvm_restore_sched_clock_state(void)
 static inline void kvm_sched_clock_init(bool stable)
 {
 	kvm_sched_clock_offset = kvm_clock_read();
-	__paravirt_set_sched_clock(kvm_sched_clock_read, stable);
+	__paravirt_set_sched_clock(kvm_sched_clock_read, stable,
+				   kvm_save_sched_clock_state,
+				   kvm_restore_sched_clock_state);
 
 	pr_info("kvm-clock: using sched offset of %llu cycles",
 		kvm_sched_clock_offset);
@@ -345,8 +347,6 @@ void __init kvmclock_init(bool prefer_tsc)
 #ifdef CONFIG_SMP
 	x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
 #endif
-	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
-	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
 	kvm_get_preset_lpj();
 
 	/*
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 19da1a3d2126..7fbcfc2efd1d 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -280,12 +280,15 @@ bool using_native_sched_clock(void)
 	return static_call_query(pv_sched_clock) == native_sched_clock;
 }
 
-void __paravirt_set_sched_clock(u64 (*func)(void), bool stable)
+void __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
+				void (*save)(void), void (*restore)(void))
 {
 	if (!stable)
 		clear_sched_clock_stable();
 
 	static_call_update(pv_sched_clock, func);
+	x86_platform.save_sched_clock_state = save;
+	x86_platform.restore_sched_clock_state = restore;
 }
 #else
 u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 640b71d22d97..91ef83b1e540 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -577,13 +577,12 @@ static void __init xen_init_time_common(void)
 {
 	xen_sched_clock_offset = xen_clocksource_read();
 	static_call_update(pv_steal_clock, xen_steal_clock);
-	paravirt_set_sched_clock(xen_sched_clock);
+
 	/*
 	 * Xen has paravirtualized suspend/resume and so doesn't use the common
 	 * x86 sched_clock save/restore hooks.
 	 */
-	x86_platform.save_sched_clock_state = x86_init_noop;
-	x86_platform.restore_sched_clock_state = x86_init_noop;
+	paravirt_set_sched_clock(xen_sched_clock, x86_init_noop, x86_init_noop);
 
 	x86_init.hyper.get_tsc_khz = xen_tsc_khz;
 	x86_platform.get_wallclock = xen_get_wallclock;
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index ac1d9f9c381c..dee59ce61c29 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -553,10 +553,8 @@ static void hv_restore_sched_clock_state(void)
 static __always_inline void hv_setup_sched_clock(void *sched_clock)
 {
 	/* We're on x86/x64 *and* using PV ops */
-	paravirt_set_sched_clock(sched_clock);
-
-	x86_platform.save_sched_clock_state = hv_save_sched_clock_state;
-	x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state;
+	paravirt_set_sched_clock(sched_clock, hv_save_sched_clock_state,
+				 hv_restore_sched_clock_state);
 }
 #else /* !CONFIG_GENERIC_SCHED_CLOCK && !CONFIG_PARAVIRT */
 static __always_inline void hv_setup_sched_clock(void *sched_clock) {}
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 34/47] x86/kvmclock: Move kvm_sched_clock_init() down in kvmclock.c
From: Sean Christopherson @ 2026-05-29 15:08 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

Move kvm_sched_clock_init() "down" so that it can reference the global
kvm_clock structure without needing a forward declaration.

Opportunistically mark the helper as "__init" instead of "inline" to make
its usage more obvious; modern compilers don't need a hint to inline a
single-use function, and an extra CALL+RET pair during boot is a complete
non-issue.  And, if the compiler ignores the hint and does NOT inline the
function, the resulting code may not get discarded after boot due lack of
an __init annotation.

No functional change intended.

Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kernel/kvmclock.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 05bca2be0df3..6372b4dc7b0c 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -134,20 +134,6 @@ static void kvm_restore_sched_clock_state(void)
 	kvm_register_clock("primary cpu clock, resume");
 }
 
-static inline void kvm_sched_clock_init(bool stable)
-{
-	kvm_sched_clock_offset = kvm_clock_read();
-	__paravirt_set_sched_clock(kvm_sched_clock_read, stable,
-				   kvm_save_sched_clock_state,
-				   kvm_restore_sched_clock_state);
-
-	pr_info("kvm-clock: using sched offset of %llu cycles",
-		kvm_sched_clock_offset);
-
-	BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
-		sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
-}
-
 void kvmclock_cpu_action(enum kvm_guest_cpu_action action)
 {
 	/*
@@ -303,6 +289,20 @@ static int kvmclock_setup_percpu(unsigned int cpu)
 	return p ? 0 : -ENOMEM;
 }
 
+static __init void kvm_sched_clock_init(bool stable)
+{
+	kvm_sched_clock_offset = kvm_clock_read();
+	__paravirt_set_sched_clock(kvm_sched_clock_read, stable,
+				   kvm_save_sched_clock_state,
+				   kvm_restore_sched_clock_state);
+
+	pr_info("kvm-clock: using sched offset of %llu cycles",
+		kvm_sched_clock_offset);
+
+	BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
+		sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
+}
+
 void __init kvmclock_init(bool prefer_tsc)
 {
 	u8 flags;
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 35/47] x86/xen/time: Mark xen_setup_vsyscall_time_info() as __init
From: Sean Christopherson @ 2026-05-29 15:08 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

Annotate xen_setup_vsyscall_time_info() as being used only during kernel
initialization; it's called only by xen_time_init(), which is already
tagged __init.

Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/xen/time.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 91ef83b1e540..8f4511f91d16 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -454,7 +454,7 @@ void xen_restore_time_memory_area(void)
 	xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved;
 }
 
-static void xen_setup_vsyscall_time_info(void)
+static void __init xen_setup_vsyscall_time_info(void)
 {
 	struct vcpu_register_time_memory_area t;
 	struct pvclock_vsyscall_time_info *ti;
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 36/47] x86/pvclock: Mark setup helpers and related various as __init/__ro_after_init
From: Sean Christopherson @ 2026-05-29 15:08 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

Now that Xen PV clock and kvmclock explicitly do setup only during init,
tag the common PV clock flags/vsyscall variables and their mutators with
__init.

Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kernel/pvclock.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index b3f81379c2fc..a51adce67f92 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -16,10 +16,10 @@
 #include <asm/pvclock.h>
 #include <asm/vgtod.h>
 
-static u8 valid_flags __read_mostly = 0;
-static struct pvclock_vsyscall_time_info *pvti_cpu0_va __read_mostly;
+static u8 valid_flags __ro_after_init = 0;
+static struct pvclock_vsyscall_time_info *pvti_cpu0_va __ro_after_init;
 
-void pvclock_set_flags(u8 flags)
+void __init pvclock_set_flags(u8 flags)
 {
 	valid_flags = flags;
 }
@@ -153,7 +153,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
 	set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec);
 }
 
-void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti)
+void __init pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti)
 {
 	WARN_ON(vclock_was_used(VDSO_CLOCKMODE_PVCLOCK));
 	pvti_cpu0_va = pvti;
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 37/47] x86/pvclock: WARN if pvclock's valid_flags are overwritten
From: Sean Christopherson @ 2026-05-29 15:08 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

WARN if the common PV clock valid_flags are overwritten; all PV clocks
expect that they are the one and only PV clock, i.e. don't guard against
another PV clock having modified the flags.

Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kernel/pvclock.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index a51adce67f92..8d098841a225 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -21,6 +21,7 @@ static struct pvclock_vsyscall_time_info *pvti_cpu0_va __ro_after_init;
 
 void __init pvclock_set_flags(u8 flags)
 {
+	WARN_ON(valid_flags);
 	valid_flags = flags;
 }
 
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 38/47] x86/kvmclock: Refactor handling of PVCLOCK_TSC_STABLE_BIT during kvmclock_init()
From: Sean Christopherson @ 2026-05-29 15:08 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

Clean up the setting of PVCLOCK_TSC_STABLE_BIT during kvmclock init to
make it somewhat obvious that pvclock_read_flags() must be called *after*
pvclock_set_flags().

Note, in theory, a different PV clock could have set PVCLOCK_TSC_STABLE_BIT
in the supported flags, i.e. reading flags only if
KVM_FEATURE_CLOCKSOURCE_STABLE_BIT is set could very, very theoretically
result in a change in behavior.  In practice, the kernel only supports a
single PV clock.

Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kernel/kvmclock.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 6372b4dc7b0c..4e304f1c887d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -305,7 +305,7 @@ static __init void kvm_sched_clock_init(bool stable)
 
 void __init kvmclock_init(bool prefer_tsc)
 {
-	u8 flags;
+	bool stable = false;
 
 	if (!kvm_para_available() || !kvmclock)
 		return;
@@ -332,11 +332,18 @@ void __init kvmclock_init(bool prefer_tsc)
 	kvm_register_clock("primary cpu clock");
 	pvclock_set_pvti_cpu0_va(hv_clock_boot);
 
-	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) {
 		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 
-	flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
-	kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
+		/*
+		 * Check if the clock is stable *after* marking TSC_STABLE as a
+		 * valid flag.
+		 */
+		stable = pvclock_read_flags(&hv_clock_boot[0].pvti) &
+			 PVCLOCK_TSC_STABLE_BIT;
+	}
+
+	kvm_sched_clock_init(stable);
 
 	if (!x86_init.hyper.get_tsc_khz)
 		x86_init.hyper.get_tsc_khz = kvmclock_get_tsc_khz;
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 39/47] timekeeping: Resume clocksources before reading persistent clock
From: Sean Christopherson @ 2026-05-29 15:08 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

When resuming timekeeping after suspend, restore clocksources prior to
reading the persistent clock.  Paravirt clocks, e.g. kvmclock, tie the
validity of a PV persistent clock to a clocksource, i.e. reading the PV
persistent clock will return garbage if the underlying PV clocksource
hasn't been enabled.  The flaw has gone unnoticed because kvmclock is a
mess and uses its own suspend/resume hooks instead of the clocksource
suspend/resume hooks, which happens to work by sheer dumb luck (the
kvmclock resume hook runs before timekeeping_resume()).

Note, there is no evidence that any clocksource supported by the kernel
depends on a persistent clock.

Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 kernel/time/timekeeping.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c493a4010305..26f3291a814d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2098,11 +2098,16 @@ void timekeeping_resume(void)
 	u64 cycle_now, nsec;
 	unsigned long flags;
 
-	read_persistent_clock64(&ts_new);
-
 	clockevents_resume();
 	clocksource_resume();
 
+	/*
+	 * Read persistent time after clocksources have been resumed.  Paravirt
+	 * clocks have a nasty habit of piggybacking a persistent clock on a
+	 * system clock, and may return garbage if the system clock is suspended.
+	 */
+	read_persistent_clock64(&ts_new);
+
 	raw_spin_lock_irqsave(&tk_core.lock, flags);
 
 	/*
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 40/47] x86/kvmclock: Hook clocksource.suspend/resume when kvmclock isn't sched_clock
From: Sean Christopherson @ 2026-05-29 15:08 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

Save/restore kvmclock across suspend/resume via clocksource hooks when
kvmclock isn't being used for sched_clock.  This will allow using kvmclock
as a clocksource (or for wallclock!) without also using it for sched_clock.

Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kernel/kvmclock.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 4e304f1c887d..5dfac79a5d30 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -131,7 +131,17 @@ static void kvm_setup_secondary_clock(void)
 
 static void kvm_restore_sched_clock_state(void)
 {
-	kvm_register_clock("primary cpu clock, resume");
+	kvm_register_clock("primary cpu, sched_clock resume");
+}
+
+static void kvmclock_suspend(struct clocksource *cs)
+{
+	kvmclock_disable();
+}
+
+static void kvmclock_resume(struct clocksource *cs)
+{
+	kvm_register_clock("primary cpu, clocksource resume");
 }
 
 void kvmclock_cpu_action(enum kvm_guest_cpu_action action)
@@ -201,6 +211,8 @@ static struct clocksource kvm_clock = {
 	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
 	.id     = CSID_X86_KVM_CLK,
 	.enable	= kvm_cs_enable,
+	.suspend = kvmclock_suspend,
+	.resume = kvmclock_resume,
 };
 
 static void __init kvmclock_init_mem(void)
@@ -296,6 +308,15 @@ static __init void kvm_sched_clock_init(bool stable)
 				   kvm_save_sched_clock_state,
 				   kvm_restore_sched_clock_state);
 
+	/*
+	 * The BSP's clock is managed via dedicated sched_clock save/restore
+	 * hooks when kvmclock is used as sched_clock, as sched_clock needs to
+	 * be kept alive until the very end of suspend entry, and restored as
+	 * quickly as possible after resume.
+	 */
+	kvm_clock.suspend = NULL;
+	kvm_clock.resume = NULL;
+
 	pr_info("kvm-clock: using sched offset of %llu cycles",
 		kvm_sched_clock_offset);
 
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 41/47] x86/kvmclock: WARN if wall clock is read while kvmclock is suspended
From: Sean Christopherson @ 2026-05-29 15:08 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

WARN if kvmclock is still suspended when its wallclock is read, i.e. when
the kernel reads its persistent clock.  The wallclock subtly depends on
the BSP's kvmclock being enabled, and returns garbage if kvmclock is
disabled.

Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kernel/kvmclock.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 5dfac79a5d30..73fabfac2bc9 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -53,6 +53,8 @@ static struct pvclock_vsyscall_time_info *hvclock_mem;
 DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
 EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu);
 
+static bool kvmclock_suspended;
+
 /*
  * The wallclock is the time of day when we booted. Since then, some time may
  * have elapsed since the hypervisor wrote the data. So we try to account for
@@ -60,6 +62,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu);
  */
 static void kvm_get_wallclock(struct timespec64 *now)
 {
+	WARN_ON_ONCE(kvmclock_suspended);
 	wrmsrq(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
 	preempt_disable();
 	pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
@@ -119,6 +122,7 @@ static void kvm_save_sched_clock_state(void)
 	 * to the old address prior to reconfiguring kvmclock would clobber
 	 * random memory.
 	 */
+	kvmclock_suspended = true;
 	kvmclock_disable();
 }
 
@@ -131,16 +135,19 @@ static void kvm_setup_secondary_clock(void)
 
 static void kvm_restore_sched_clock_state(void)
 {
+	kvmclock_suspended = false;
 	kvm_register_clock("primary cpu, sched_clock resume");
 }
 
 static void kvmclock_suspend(struct clocksource *cs)
 {
+	kvmclock_suspended = true;
 	kvmclock_disable();
 }
 
 static void kvmclock_resume(struct clocksource *cs)
 {
+	kvmclock_suspended = false;
 	kvm_register_clock("primary cpu, clocksource resume");
 }
 
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 42/47] x86/paravirt: Mark __paravirt_set_sched_clock() as __init
From: Sean Christopherson @ 2026-05-29 15:08 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

Annotate __paravirt_set_sched_clock() as __init, and make its wrapper
__always_inline to ensure sanitizers don't result in a non-inline version
hanging around.  All callers run during __init, and changing sched_clock
after boot would be all kinds of crazy.

No functional change intended.

Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/timer.h | 10 +++++-----
 arch/x86/kernel/tsc.c        |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index e97cd1ae03d1..96ae7feac47c 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -14,12 +14,12 @@ extern int no_timer_check;
 extern bool using_native_sched_clock(void);
 
 #ifdef CONFIG_PARAVIRT
-void __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
-				void (*save)(void), void (*restore)(void));
+void __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
+				       void (*save)(void), void (*restore)(void));
 
-static inline void paravirt_set_sched_clock(u64 (*func)(void),
-					    void (*save)(void),
-					    void (*restore)(void))
+static __always_inline void paravirt_set_sched_clock(u64 (*func)(void),
+						     void (*save)(void),
+						     void (*restore)(void))
 {
 	__paravirt_set_sched_clock(func, true, save, restore);
 }
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7fbcfc2efd1d..6da0a3ac05c2 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -280,8 +280,8 @@ bool using_native_sched_clock(void)
 	return static_call_query(pv_sched_clock) == native_sched_clock;
 }
 
-void __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
-				void (*save)(void), void (*restore)(void))
+void __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
+				       void (*save)(void), void (*restore)(void))
 {
 	if (!stable)
 		clear_sched_clock_stable();
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 43/47] x86/paravirt: Plumb a return code into __paravirt_set_sched_clock()
From: Sean Christopherson @ 2026-05-29 15:08 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

Add a return code to __paravirt_set_sched_clock() so that the kernel can
reject attempts to use a PV sched_clock without breaking the caller.  E.g.
when running as a CoCo VM with a secure TSC, using a PV clock is generally
undesirable.

Note, kvmclock is the only PV clock that does anything "extra" beyond
simply registering itself as sched_clock, i.e. is the only caller that
needs to check the new return value.

Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/timer.h | 6 +++---
 arch/x86/kernel/kvmclock.c   | 9 ++++++---
 arch/x86/kernel/tsc.c        | 5 +++--
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 96ae7feac47c..ca5c95d48c03 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -14,14 +14,14 @@ extern int no_timer_check;
 extern bool using_native_sched_clock(void);
 
 #ifdef CONFIG_PARAVIRT
-void __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
-				       void (*save)(void), void (*restore)(void));
+int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
+				      void (*save)(void), void (*restore)(void));
 
 static __always_inline void paravirt_set_sched_clock(u64 (*func)(void),
 						     void (*save)(void),
 						     void (*restore)(void))
 {
-	__paravirt_set_sched_clock(func, true, save, restore);
+	(void)__paravirt_set_sched_clock(func, true, save, restore);
 }
 #endif
 
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 73fabfac2bc9..1336c24f59cf 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -310,10 +310,13 @@ static int kvmclock_setup_percpu(unsigned int cpu)
 
 static __init void kvm_sched_clock_init(bool stable)
 {
+	/* Ensure the offset is configured before making kvmclock visible! */
 	kvm_sched_clock_offset = kvm_clock_read();
-	__paravirt_set_sched_clock(kvm_sched_clock_read, stable,
-				   kvm_save_sched_clock_state,
-				   kvm_restore_sched_clock_state);
+
+	if (__paravirt_set_sched_clock(kvm_sched_clock_read, stable,
+				       kvm_save_sched_clock_state,
+				       kvm_restore_sched_clock_state))
+		return;
 
 	/*
 	 * The BSP's clock is managed via dedicated sched_clock save/restore
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6da0a3ac05c2..7bcf757bf551 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -280,8 +280,8 @@ bool using_native_sched_clock(void)
 	return static_call_query(pv_sched_clock) == native_sched_clock;
 }
 
-void __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
-				       void (*save)(void), void (*restore)(void))
+int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
+				      void (*save)(void), void (*restore)(void))
 {
 	if (!stable)
 		clear_sched_clock_stable();
@@ -289,6 +289,7 @@ void __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
 	static_call_update(pv_sched_clock, func);
 	x86_platform.save_sched_clock_state = save;
 	x86_platform.restore_sched_clock_state = restore;
+	return 0;
 }
 #else
 u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 44/47] x86/paravirt: Don't use a PV sched_clock in CoCo guests with trusted TSC
From: Sean Christopherson @ 2026-05-29 15:08 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

Silently ignore attempts to switch to a paravirt sched_clock when running
as a CoCo guest with trusted TSC.  In hand-wavy theory, a misbehaving
hypervisor could attack the guest by manipulating the PV clock to affect
guest scheduling in some weird and/or predictable way.  More importantly,
reading TSC on such platforms is faster than any PV clock, and sched_clock
is all about speed.

Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kernel/tsc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7bcf757bf551..036916953f4a 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -283,6 +283,15 @@ bool using_native_sched_clock(void)
 int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
 				      void (*save)(void), void (*restore)(void))
 {
+	/*
+	 * Don't replace TSC with a PV clock when running as a CoCo guest and
+	 * the TSC is secure/trusted; PV clocks are emulated by the hypervisor,
+	 * which isn't in the guest's TCB.
+	 */
+	if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC) ||
+	    boot_cpu_has(X86_FEATURE_TDX_GUEST))
+		return -EPERM;
+
 	if (!stable)
 		clear_sched_clock_stable();
 
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 45/47] x86/kvmclock: Use TSC for sched_clock if it's constant and non-stop
From: Sean Christopherson @ 2026-05-29 15:08 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

Prefer the TSC over kvmclock for sched_clock if the TSC is constant,
nonstop, and not marked unstable via command line.  I.e. use the same
criteria as tweaking the clocksource rating so that TSC is preferred over
kvmclock.  Per the below comment from native_sched_clock(), sched_clock
is more tolerant of slop than clocksource; using TSC for clocksource but
not sched_clock makes little to no sense, especially now that KVM CoCo
guests with a trusted TSC use TSC, not kvmclock.

        /*
         * Fall back to jiffies if there's no TSC available:
         * ( But note that we still use it if the TSC is marked
         *   unstable. We do this because unlike Time Of Day,
         *   the scheduler clock tolerates small errors and it's
         *   very important for it to be as fast as the platform
         *   can achieve it. )
         */

The only advantage of using kvmclock is that doing so allows for early
and common detection of PVCLOCK_GUEST_STOPPED, but that code has been
broken for over two years with nary a complaint, i.e. it can't be
_that_ valuable.  And as above, certain types of KVM guests are losing
the functionality regardless, i.e. acknowledging PVCLOCK_GUEST_STOPPED
needs to be decoupled from sched_clock() no matter what.

Link: https://lore.kernel.org/all/Z4hDK27OV7wK572A@google.com
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kernel/kvmclock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1336c24f59cf..cd65ad328637 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -374,7 +374,6 @@ void __init kvmclock_init(bool prefer_tsc)
 			 PVCLOCK_TSC_STABLE_BIT;
 	}
 
-	kvm_sched_clock_init(stable);
 
 	if (!x86_init.hyper.get_tsc_khz)
 		x86_init.hyper.get_tsc_khz = kvmclock_get_tsc_khz;
@@ -394,6 +393,8 @@ void __init kvmclock_init(bool prefer_tsc)
 	 */
 	if (prefer_tsc)
 		kvm_clock.rating = 299;
+	else
+		kvm_sched_clock_init(stable);
 
 	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
 	pv_info.name = "KVM";
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 46/47] x86/kvmclock: Plumb in AP-online and BSP-resume to kvmlock, for documentation
From: Sean Christopherson @ 2026-05-29 15:08 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

Invoke kvmclock_cpu_action() with AP_ONLINE and BSP_RESUME, even though
kvmclock doesn't need to do anything in either case, so that the asymmetry
of kvmclock is a detail buried in kvmclock, and to explicitly document
that doing nothing during those phases is intentional and correct.

For all intents and purposes, no functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm_para.h |  2 ++
 arch/x86/kernel/kvm.c           | 22 +++++++++++++-------
 arch/x86/kernel/kvmclock.c      | 37 ++++++++++++++++++++++++++-------
 3 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 08686ff19caa..763ed017738a 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -120,6 +120,8 @@ static inline long kvm_sev_hypercall3(unsigned int nr, unsigned long p1,
 #ifdef CONFIG_KVM_GUEST
 enum kvm_guest_cpu_action {
 	KVM_GUEST_BSP_SUSPEND,
+	KVM_GUEST_BSP_RESUME,
+	KVM_GUEST_AP_ONLINE,
 	KVM_GUEST_AP_OFFLINE,
 	KVM_GUEST_SHUTDOWN,
 };
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index fd1c417b4f9b..2ed4bf13e3ed 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -474,18 +474,24 @@ static void kvm_guest_cpu_offline(enum kvm_guest_cpu_action action)
 	kvmclock_cpu_action(action);
 }
 
+static void __kvm_cpu_online(unsigned int cpu, enum kvm_guest_cpu_action action)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	kvmclock_cpu_action(action);
+	kvm_guest_cpu_init();
+	local_irq_restore(flags);
+}
+
+#ifdef CONFIG_SMP
+
 static int kvm_cpu_online(unsigned int cpu)
 {
-	unsigned long flags;
-
-	local_irq_save(flags);
-	kvm_guest_cpu_init();
-	local_irq_restore(flags);
+	__kvm_cpu_online(cpu, KVM_GUEST_AP_ONLINE);
 	return 0;
 }
 
-#ifdef CONFIG_SMP
-
 static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
 
 static bool pv_tlb_flush_supported(void)
@@ -750,7 +756,7 @@ static int kvm_suspend(void *data)
 
 static void kvm_resume(void *data)
 {
-	kvm_cpu_online(raw_smp_processor_id());
+	__kvm_cpu_online(raw_smp_processor_id(), KVM_GUEST_BSP_RESUME);
 
 #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
 	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index cd65ad328637..d122912b8856 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -129,7 +129,7 @@ static void kvm_save_sched_clock_state(void)
 #ifdef CONFIG_SMP
 static void kvm_setup_secondary_clock(void)
 {
-	kvm_register_clock("secondary cpu clock");
+	kvm_register_clock("secondary cpu, startup");
 }
 #endif
 
@@ -153,13 +153,34 @@ static void kvmclock_resume(struct clocksource *cs)
 
 void kvmclock_cpu_action(enum kvm_guest_cpu_action action)
 {
-	/*
-	 * Don't disable kvmclock on the BSP during suspend.  If kvmclock is
-	 * being used for sched_clock, then it needs to be kept alive until the
-	 * last minute, and restored as quickly as possible after resume.
-	 */
-	if (action != KVM_GUEST_BSP_SUSPEND)
+	switch (action) {
+		/*
+		 * The BSP's clock is managed via clocksource suspend/resume,
+		 * to ensure it's enabled/disabled when timekeeping needs it
+		 * to be, e.g. before reading wallclock (which uses kvmclock).
+		 */
+	case KVM_GUEST_BSP_SUSPEND:
+	case KVM_GUEST_BSP_RESUME:
+		break;
+	case KVM_GUEST_AP_ONLINE:
+		/*
+		 * Secondary CPUs use a dedicated hook to enable kvmclock early
+		 * during bringup, there's nothing to be done during CPU online
+		 * (which runs at CPUHP_AP_ONLINE_DYN).  When kvmclock is being
+		 * used as sched_clock, kvmclock must be enabled *very* early,
+		 * and even when kvmclock is "only" being used for the main
+		 * clocksource, it still needs to be enabled long before the
+		 * dynamic CPUHP calls are made.
+		 */
+		break;
+	case KVM_GUEST_AP_OFFLINE:
+	case KVM_GUEST_SHUTDOWN:
 		kvmclock_disable();
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
 }
 
 /*
@@ -360,7 +381,7 @@ void __init kvmclock_init(bool prefer_tsc)
 		msr_kvm_system_time, msr_kvm_wall_clock);
 
 	this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]);
-	kvm_register_clock("primary cpu clock");
+	kvm_register_clock("primary cpu, online");
 	pvclock_set_pvti_cpu0_va(hv_clock_boot);
 
 	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) {
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* [PATCH v4 47/47] x86/paravirt: Move using_native_sched_clock() stub into timer.h
From: Sean Christopherson @ 2026-05-29 15:08 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H . Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

Now that timer.h ended up with CONFIG_PARAVIRT #ifdeffery anyways, move the
PARAVIRT=n using_native_sched_clock() stub into timer.h as a "free"
optimization.

No functional change intended.

Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/timer.h | 6 ++++--
 arch/x86/kernel/tsc.c        | 2 --
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index ca5c95d48c03..a52388af6055 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -11,9 +11,9 @@ extern void recalibrate_cpu_khz(void);
 
 extern int no_timer_check;
 
-extern bool using_native_sched_clock(void);
-
 #ifdef CONFIG_PARAVIRT
+extern bool using_native_sched_clock(void);
+
 int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
 				      void (*save)(void), void (*restore)(void));
 
@@ -23,6 +23,8 @@ static __always_inline void paravirt_set_sched_clock(u64 (*func)(void),
 {
 	(void)__paravirt_set_sched_clock(func, true, save, restore);
 }
+#else
+static inline bool using_native_sched_clock(void) { return true; }
 #endif
 
 /*
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 036916953f4a..159d7d060204 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -302,8 +302,6 @@ int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
 }
 #else
 u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
-
-bool using_native_sched_clock(void) { return true; }
 #endif
 
 notrace u64 sched_clock(void)
-- 
2.54.0.823.g6e5bcc1fc9-goog


^ permalink raw reply related

* Re: [PATCH v4 00/47] x86: Try to wrangle PV clocks vs. TSC
From: Sean Christopherson @ 2026-05-29 15:10 UTC (permalink / raw)
  To: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, K. Y. Srinivasan,
	Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
	Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
	Juergen Gross, Daniel Lezcano, John Stultz, H. Peter Anvin,
	Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <20260529144435.704127-1-seanjc@google.com>

On Fri, May 29, 2026, Sean Christopherson wrote:
> Well, the number of patches in the series is going in the wrong direction,
> but I'm much happier with this version, which eschews the x86_platform
> overrides entirely in favor of a fixed sequence for selecting the TSC/CPU
> frequency "routine".

FYI, our internal mail server flamed out after sending patch 26 in the initial
go.  I'm pretty sure I managed to get the rest sent without screwing up the
threading.  Holler if something is wonky and I'll RESEND the whole pile if necessary.

^ permalink raw reply

* Re: [PATCH v4 00/47] x86: Try to wrangle PV clocks vs. TSC
From: Jürgen Groß @ 2026-05-29 15:17 UTC (permalink / raw)
  To: Sean Christopherson, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
	K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Daniel Lezcano, John Stultz, H. Peter Anvin,
	Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley,
	Thomas Gleixner
In-Reply-To: <ahmsZA8mHj9CPnd2@google.com>


[-- Attachment #1.1.1: Type: text/plain, Size: 649 bytes --]

On 29.05.26 17:10, Sean Christopherson wrote:
> On Fri, May 29, 2026, Sean Christopherson wrote:
>> Well, the number of patches in the series is going in the wrong direction,
>> but I'm much happier with this version, which eschews the x86_platform
>> overrides entirely in favor of a fixed sequence for selecting the TSC/CPU
>> frequency "routine".
> 
> FYI, our internal mail server flamed out after sending patch 26 in the initial
> go.  I'm pretty sure I managed to get the rest sent without screwing up the
> threading.  Holler if something is wonky and I'll RESEND the whole pile if necessary.

Looks fine on my side.


Juergen

[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 3743 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

^ permalink raw reply

* Re: [PATCH v4 04/47] x86/sev: Don't override CPU frequency calibration for SNP's Secure TSC
From: sashiko-bot @ 2026-05-29 15:44 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: linux-hyperv, kvm
In-Reply-To: <20260529144435.704127-5-seanjc@google.com>

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [High] Removing the `calibrate_cpu` override for SEV-SNP Secure TSC guests incorrectly assumes that `cpu_khz` and `tsc_khz` should be decoupled for APERF/MPERF calculations. On AMD architectures, the MPERF register architecturally increments at the P0 frequency, which matches the invariant TSC frequency. The kernel calculates effective frequency using `(cpu_khz * APERF) / MPERF`, so `cpu_khz` must exactly equal the MPERF tick rate (`tsc_khz`) for correct results. Furthermore, because `cpu_khz_from_cpuid()` is disabled for non-Intel CPUs, removing this override forces SEV-SNP guests to fall back to untrusted PV clocks or legacy PIT calibration. PIT calibration involves thousands of I/O port reads, causing `#VC` VMGEXITs that significantly increase boot latency, and forces the guest to trust the hypervisor's timer emulation, violating the security model of Secure TSC.
--

commit d8a832435bc58075d8e63ea3016b822abba9433c
Author: Sean Christopherson <seanjc@google.com>

x86/sev: Don't override CPU frequency calibration for SNP's Secure TSC

This commit removes the override of the CPU frequency calibration routine
for SEV-SNP guests using Secure TSC. It asserts that CPU frequency should
not be strictly tied to the TSC frequency when features like APERF/MPERF
are exposed to the guest.

> diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
> index ed0ac52a765e0..665de1aea0ee9 100644
> --- a/arch/x86/coco/sev/core.c
> +++ b/arch/x86/coco/sev/core.c
> @@ -2046,7 +2046,6 @@ void __init snp_secure_tsc_init(void)
>  
>  	snp_tsc_freq_khz = SNP_SCALE_TSC_FREQ(tsc_freq_mhz * 1000, secrets->tsc_factor);
>  
> -	x86_platform.calibrate_cpu = securetsc_get_tsc_khz;

[Severity: High]
Does removing this override lead to incorrect APERF/MPERF effective frequency
calculations and boot latency regressions?

On AMD architectures, the MPERF register architecturally increments at the P0
frequency, which matches the invariant TSC frequency. Because the kernel
calculates effective frequency using (cpu_khz * APERF) / MPERF, cpu_khz
must exactly equal the MPERF tick rate (the TSC frequency) for correct results.

Furthermore, because cpu_khz_from_cpuid() is disabled for non-Intel CPUs,
does removing this override force SEV-SNP guests to fall back to untrusted PV
clocks or legacy PIT calibration?

Legacy PIT calibration involves thousands of I/O port reads, which cause #VC
VMGEXITs. This significantly increases boot latency and forces the guest to
trust the hypervisor's timer emulation, which appears to violate the security
model of Secure TSC.

>  	x86_platform.calibrate_tsc = securetsc_get_tsc_khz;
>  
>  	early_memunmap(mem, PAGE_SIZE);

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260529144435.704127-1-seanjc@google.com?part=4

^ permalink raw reply

* Re: [PATCH v4 06/47] x86/sev: Shove SNP's secure/trusted TSC frequency directly into "calibration"
From: sashiko-bot @ 2026-05-29 16:14 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: kvm, linux-hyperv
In-Reply-To: <20260529144435.704127-7-seanjc@google.com>

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [High] Bypassing `snp_secure_tsc_init()` when `tsc_early_khz` is provided omits critical SEV-SNP CPU capabilities, breaking TSC stability and security.
--

commit 8e7d363bd0d5d42b37ec02afb4108ddb034a9988
Author: Sean Christopherson <seanjc@google.com>

x86/sev: Shove SNP's secure/trusted TSC frequency directly into "calibration"

This commit modifies early TSC initialization to directly pass the secure
TSC frequency obtained from SNP firmware to determine_cpu_tsc_frequencies().
It introduces logic to prioritize tsc_early_khz over other sources to
maintain existing behavior.

> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 833eed5c048a9..2b8f94c3fcc7f 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
[ ... ]
> @@ -1537,16 +1538,20 @@ static void __init tsc_enable_sched_clock(void)
>  
>  void __init tsc_early_init(void)
>  {
> +	unsigned int known_tsc_khz = 0;
> +
>  	if (!boot_cpu_has(X86_FEATURE_TSC))
>  		return;
>  	/* Don't change UV TSC multi-chassis synchronization */
>  	if (is_early_uv_system())
>  		return;
>  
> -	if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
> -		snp_secure_tsc_init();
> +	if (tsc_early_khz)
> +		known_tsc_khz = tsc_early_khz;
> +	else if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
> +		known_tsc_khz = snp_secure_tsc_init();

[Severity: High]
Does skipping snp_secure_tsc_init() when tsc_early_khz is provided omit
critical CPU capabilities?

In the pre-patch code, snp_secure_tsc_init() was called unconditionally to
set up hardware capabilities before determining frequencies:

arch/x86/coco/sev/core.c:snp_secure_tsc_init() {
	...
	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
	setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
	...
}

By chaining this into an else-if block, if tsc_early_khz is provided
(e.g., via the kernel command line), snp_secure_tsc_init() is bypassed.

Without X86_FEATURE_TSC_RELIABLE, the TSC clocksource watchdog remains active.
In a Confidential VM, an untrusted hypervisor could manipulate legacy external
timers (HPET/PIT) to trigger the watchdog, falsely marking the TSC as unstable
and forcing a fallback to untrusted time sources.

> -	if (!determine_cpu_tsc_frequencies(true))
> +	if (!determine_cpu_tsc_frequencies(true, known_tsc_khz))
>  		return;
>  	tsc_enable_sched_clock();
>  }

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260529144435.704127-1-seanjc@google.com?part=6

^ permalink raw reply

* Re: [PATCH v4 07/47] x86/tdx: Force TSC frequency with CPUID-based info provided by the TDX-Module
From: sashiko-bot @ 2026-05-29 16:21 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: linux-hyperv, kvm
In-Reply-To: <20260529144435.704127-8-seanjc@google.com>

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Medium] Providing the 'tsc_early_khz=' boot parameter silently bypasses TDX-specific APIC timer and TSC capability initializations, causing a feature regression.
--

commit 8a9e5abad4e72a5ef6c47191b8815b94bbed98c9
Author: Sean Christopherson <seanjc@google.com>

x86/tdx: Force TSC frequency with CPUID-based info provided by the TDX-Module

This commit updates the TSC frequency initialization for TDX guests to use
trustworthy CPUID-based information provided by the TDX-Module instead of
relying on the hypervisor. It also relocates the TSC_RELIABLE capability
setup from tdx_early_init() to a new tdx_tsc_init() routine.

> diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
> index 29b6f1ed59ec5..5d79763592201 100644
> --- a/arch/x86/coco/tdx/tdx.c
> +++ b/arch/x86/coco/tdx/tdx.c

[ ... ]

> @@ -1123,9 +1124,6 @@ void __init tdx_early_init(void)
>  
>  	setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
>  
> -	/* TSC is the only reliable clock in TDX guest */
> -	setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
> -
>  	cc_vendor = CC_VENDOR_INTEL;

[ ... ]

> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 2b8f94c3fcc7f..2603f136e29bd 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c

[ ... ]

> @@ -1550,6 +1551,8 @@ void __init tsc_early_init(void)
>  		known_tsc_khz = tsc_early_khz;
>  	else if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
>  		known_tsc_khz = snp_secure_tsc_init();
> +	else if (boot_cpu_has(X86_FEATURE_TDX_GUEST))
> +		known_tsc_khz = tdx_tsc_init();

[Severity: Medium]
Does this structure accidentally bypass the TDX APIC timer and TSC capability
initializations if the user provides the tsc_early_khz boot parameter?

Previously, setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE) was executed
unconditionally in tdx_early_init() for TDX guests. With this change, 
tdx_tsc_init() is skipped entirely when tsc_early_khz is provided due to
the if/else if chain.

While using tsc_early_khz is an explicit user override, skipping
tdx_tsc_init() completely drops the X86_FEATURE_TSC_RELIABLE capability.
Could this cause the clocksource watchdog to flag the TSC as unstable and
break timekeeping for configurations that rely on this parameter?

>  
>  	if (!determine_cpu_tsc_frequencies(true, known_tsc_khz))
>  		return;

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260529144435.704127-1-seanjc@google.com?part=7

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox