* [PATCH v5 01/51] x86/apic: Provide helpers to set local APIC timer period in hz and khz
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:46 ` sashiko-bot
2026-07-02 17:47 ` Michael Kelley
2026-07-01 19:31 ` [PATCH v5 02/51] x86/apic: Add CONFIG_X86_LOCAL_APIC=n stubs for apic_set_timer_period_{,k}hz() Sean Christopherson
` (49 subsequent siblings)
50 siblings, 2 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Add and use APIs to set the local APIC timer period instead of open coding
the subtle HZ math in all external callers, and make lapic_timer_period
local to apic.c. Provide APIs to specify the frequency in both hertz and
kilohertz so that Hyper-V and VMware code aren't forced to lose precision.
Opportunistically use mul_u64_u32_div() to harden against the possibility
that the period in Khz is greater than 4294967, i.e. if the APIC timer runs
at ~4.29 GHz. As pointed out by Sashiko, 4294968 * 1000 == 0x1_000002c0,
and thus a Khz period of 4294968 would silently overflow the 32-bit
unsigned integer used by most callers.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/apic.h | 3 ++-
arch/x86/kernel/apic/apic.c | 12 +++++++++++-
arch/x86/kernel/cpu/mshyperv.c | 5 +----
arch/x86/kernel/cpu/vmware.c | 4 +---
arch/x86/kernel/jailhouse.c | 2 +-
arch/x86/kernel/tsc.c | 2 +-
arch/x86/kernel/tsc_msr.c | 2 +-
7 files changed, 18 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 9cd493d467d4..cd84a94688a2 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -63,7 +63,6 @@ extern int apic_verbosity;
extern int local_apic_timer_c2_ok;
extern bool apic_is_disabled;
-extern unsigned int lapic_timer_period;
extern enum apic_intr_mode_id apic_intr_mode;
enum apic_intr_mode_id {
@@ -138,6 +137,8 @@ void register_lapic_address(unsigned long address);
extern void setup_boot_APIC_clock(void);
extern void setup_secondary_APIC_clock(void);
extern void lapic_update_tsc_freq(void);
+extern void apic_set_timer_period_hz(u64 period_hz, const char *source);
+extern void apic_set_timer_period_khz(u64 period_khz, const char *source);
#ifdef CONFIG_X86_64
static inline bool apic_force_enable(unsigned long addr)
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index aa1e19979aa8..8d3d930576fd 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -176,7 +176,7 @@ static struct resource lapic_resource = {
};
/* Measured in ticks per HZ. */
-unsigned int lapic_timer_period = 0;
+static unsigned int lapic_timer_period;
static void apic_pm_activate(void);
@@ -796,6 +796,16 @@ bool __init apic_needs_pit(void)
return lapic_timer_period == 0;
}
+void apic_set_timer_period_khz(u64 period_khz, const char *source)
+{
+ lapic_timer_period = mul_u64_u32_div(period_khz, 1000, HZ);
+}
+
+void apic_set_timer_period_hz(u64 period_hz, const char *source)
+{
+ lapic_timer_period = div_u64(period_hz, HZ);
+}
+
static int __init calibrate_APIC_clock(void)
{
struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 185d4f677ec0..87beecec76f0 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -646,10 +646,7 @@ static void __init ms_hyperv_init_platform(void)
u64 hv_lapic_frequency;
rdmsrq(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
- hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
- lapic_timer_period = hv_lapic_frequency;
- pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n",
- lapic_timer_period);
+ apic_set_timer_period_hz(hv_lapic_frequency, "Hyper-V hypervisor");
}
register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST,
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 34b73573b108..36f779dd311d 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -424,9 +424,7 @@ static void __init vmware_platform_setup(void)
#ifdef CONFIG_X86_LOCAL_APIC
/* Skip lapic calibration since we know the bus frequency. */
- lapic_timer_period = ecx / HZ;
- pr_info("Host bus clock speed read from hypervisor : %u Hz\n",
- ecx);
+ apic_set_timer_period_hz(ecx, "VMware hypervisor");
#endif
} else {
pr_warn("Failed to get TSC freq from the hypervisor\n");
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index f58ce9220e0f..f2d4ef89c085 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -65,7 +65,7 @@ static void jailhouse_get_wallclock(struct timespec64 *now)
static void __init jailhouse_timer_init(void)
{
- lapic_timer_period = setup_data.v1.apic_khz * (1000 / HZ);
+ apic_set_timer_period_khz(setup_data.v1.apic_khz, "Jailhouse hypervisor");
}
static unsigned long jailhouse_get_tsc(void)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index ce10ae4b298b..f9ecc9256863 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -717,7 +717,7 @@ unsigned long native_calibrate_tsc(void)
* lapic_timer_period here to avoid having to calibrate the APIC
* timer later.
*/
- lapic_timer_period = crystal_khz * 1000 / HZ;
+ apic_set_timer_period_khz(crystal_khz, "CPUID 0x15/0x16");
#endif
return crystal_khz * ebx_numerator / eax_denominator;
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 48e6cc1cb017..7e990871e041 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -211,7 +211,7 @@ unsigned long cpu_khz_from_msr(void)
pr_err("Error MSR_FSB_FREQ index %d is unknown\n", index);
#ifdef CONFIG_X86_LOCAL_APIC
- lapic_timer_period = (freq * 1000) / HZ;
+ apic_set_timer_period_khz(freq, "MSR_FSB_FREQ");
#endif
/*
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* Re: [PATCH v5 01/51] x86/apic: Provide helpers to set local APIC timer period in hz and khz
2026-07-01 19:31 ` [PATCH v5 01/51] x86/apic: Provide helpers to set local APIC timer period in hz and khz Sean Christopherson
@ 2026-07-01 19:46 ` sashiko-bot
2026-07-01 20:05 ` Sean Christopherson
2026-07-02 17:47 ` Michael Kelley
1 sibling, 1 reply; 72+ messages in thread
From: sashiko-bot @ 2026-07-01 19:46 UTC (permalink / raw)
To: Sean Christopherson; +Cc: kvm, linux-hyperv
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Medium] The `apic_set_timer_period_khz()` and `apic_set_timer_period_hz()` helper functions accept a `const char *source` argument which is never used, resulting in dead code and a silent loss of boot diagnostic logging on platforms where logging was removed.
--
commit 4fb576b0d909663c9b3e6628dbed45d329c08a5d
Author: Sean Christopherson <seanjc@google.com>
x86/apic: Provide helpers to set local APIC timer period in hz and khz
This commit adds APIs to set the local APIC timer period, abstracting the
HZ math previously open-coded by external callers. It also changes the
lapic_timer_period variable to be local to apic.c.
> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> index aa1e19979aa8e..8d3d930576fd7 100644
> --- a/arch/x86/kernel/apic/apic.c
> +++ b/arch/x86/kernel/apic/apic.c
[ ... ]
> @@ -796,6 +796,16 @@ bool __init apic_needs_pit(void)
> return lapic_timer_period == 0;
> }
>
> +void apic_set_timer_period_khz(u64 period_khz, const char *source)
> +{
> + lapic_timer_period = mul_u64_u32_div(period_khz, 1000, HZ);
> +}
> +
> +void apic_set_timer_period_hz(u64 period_hz, const char *source)
> +{
> + lapic_timer_period = div_u64(period_hz, HZ);
> +}
[Severity: Medium]
It looks like the source parameter is accepted by apic_set_timer_period_khz()
and apic_set_timer_period_hz() in arch/x86/kernel/apic/apic.c but is never
used. Was it intended to log the frequency using this source string?
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index 185d4f677ec0f..87beecec76f04 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -646,10 +646,7 @@ static void __init ms_hyperv_init_platform(void)
> u64 hv_lapic_frequency;
>
> rdmsrq(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
> - hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
> - lapic_timer_period = hv_lapic_frequency;
> - pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n",
> - lapic_timer_period);
> + apic_set_timer_period_hz(hv_lapic_frequency, "Hyper-V hypervisor");
> }
>
[ ... ]
> diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
> index 34b73573b108d..36f779dd311dc 100644
> --- a/arch/x86/kernel/cpu/vmware.c
> +++ b/arch/x86/kernel/cpu/vmware.c
> @@ -424,9 +424,7 @@ static void __init vmware_platform_setup(void)
>
> #ifdef CONFIG_X86_LOCAL_APIC
> /* Skip lapic calibration since we know the bus frequency. */
> - lapic_timer_period = ecx / HZ;
> - pr_info("Host bus clock speed read from hypervisor : %u Hz\n",
> - ecx);
> + apic_set_timer_period_hz(ecx, "VMware hypervisor");
> #endif
[Severity: Medium]
These changes remove the pr_info() logging of the APIC frequency during early
boot APIC initialization in ms_hyperv_init_platform()
(arch/x86/kernel/cpu/mshyperv.c) and vmware_platform_setup()
(arch/x86/kernel/cpu/vmware.c).
Since the new apic_set_timer_period_hz() helper does not log this information,
does this result in an unintended loss of boot diagnostic logging on these
platforms?
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260701193212.749551-1-seanjc@google.com?part=1
^ permalink raw reply [flat|nested] 72+ messages in thread
* Re: [PATCH v5 01/51] x86/apic: Provide helpers to set local APIC timer period in hz and khz
2026-07-01 19:46 ` sashiko-bot
@ 2026-07-01 20:05 ` Sean Christopherson
0 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 20:05 UTC (permalink / raw)
To: sashiko-reviews; +Cc: kvm, linux-hyperv
On Wed, Jul 01, 2026, sashiko-bot@kernel.org wrote:
> Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
> > diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> > index aa1e19979aa8e..8d3d930576fd7 100644
> > --- a/arch/x86/kernel/apic/apic.c
> > +++ b/arch/x86/kernel/apic/apic.c
>
> [ ... ]
>
> > @@ -796,6 +796,16 @@ bool __init apic_needs_pit(void)
> > return lapic_timer_period == 0;
> > }
> >
> > +void apic_set_timer_period_khz(u64 period_khz, const char *source)
> > +{
> > + lapic_timer_period = mul_u64_u32_div(period_khz, 1000, HZ);
> > +}
> > +
> > +void apic_set_timer_period_hz(u64 period_hz, const char *source)
> > +{
> > + lapic_timer_period = div_u64(period_hz, HZ);
> > +}
>
> [Severity: Medium]
> It looks like the source parameter is accepted by apic_set_timer_period_khz()
> and apic_set_timer_period_hz() in arch/x86/kernel/apic/apic.c but is never
> used. Was it intended to log the frequency using this source string?
/facepalm, yes.
^ permalink raw reply [flat|nested] 72+ messages in thread
* RE: [PATCH v5 01/51] x86/apic: Provide helpers to set local APIC timer period in hz and khz
2026-07-01 19:31 ` [PATCH v5 01/51] x86/apic: Provide helpers to set local APIC timer period in hz and khz Sean Christopherson
2026-07-01 19:46 ` sashiko-bot
@ 2026-07-02 17:47 ` Michael Kelley
1 sibling, 0 replies; 72+ messages in thread
From: Michael Kelley @ 2026-07-02 17:47 UTC (permalink / raw)
To: Sean Christopherson, Jonathan Corbet, Paolo Bonzini,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
x86@kernel.org, Kiryl Shutsemau, Rick Edgecombe, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc@vger.kernel.org, kvm@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-coco@lists.linux.dev,
linux-hyperv@vger.kernel.org, virtualization@lists.linux.dev,
xen-devel@lists.xenproject.org, Tom Lendacky, Nikunj A Dadhania,
David Woodhouse, David Woodhouse, Michael Kelley, Thomas Gleixner
From: Sean Christopherson <seanjc@google.com> Sent: Wednesday, July 1, 2026 12:31 PM
>
> Add and use APIs to set the local APIC timer period instead of open coding
> the subtle HZ math in a all external callers, and make lapic_timer_period
Spurious word "a".
> local to apic.c. Provide APIs to specify the frequency in both hertz and
> kilohertz so that Hyper-V and VMware code aren't forced to lose precision.
>
> Opportunistically use mul_u64_u32_div() to harden against the possibility
> that the period in Khz is greater than 4294967, i.e. if the APIC timer runs
> at ~4.29 GHz. As pointed out by Sashiko, 4294968 * 1000 == 0x1_000002c0,
> and thus a Khz period of 4294968 would silently overflow the 32-bit
> unsigned integer used by most callers.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> arch/x86/include/asm/apic.h | 3 ++-
> arch/x86/kernel/apic/apic.c | 12 +++++++++++-
> arch/x86/kernel/cpu/mshyperv.c | 5 +----
> arch/x86/kernel/cpu/vmware.c | 4 +---
> arch/x86/kernel/jailhouse.c | 2 +-
> arch/x86/kernel/tsc.c | 2 +-
> arch/x86/kernel/tsc_msr.c | 2 +-
> 7 files changed, 18 insertions(+), 12 deletions(-)
>
> diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
> index 9cd493d467d4..cd84a94688a2 100644
> --- a/arch/x86/include/asm/apic.h
> +++ b/arch/x86/include/asm/apic.h
> @@ -63,7 +63,6 @@ extern int apic_verbosity;
> extern int local_apic_timer_c2_ok;
>
> extern bool apic_is_disabled;
> -extern unsigned int lapic_timer_period;
>
> extern enum apic_intr_mode_id apic_intr_mode;
> enum apic_intr_mode_id {
> @@ -138,6 +137,8 @@ void register_lapic_address(unsigned long address);
> extern void setup_boot_APIC_clock(void);
> extern void setup_secondary_APIC_clock(void);
> extern void lapic_update_tsc_freq(void);
> +extern void apic_set_timer_period_hz(u64 period_hz, const char *source);
> +extern void apic_set_timer_period_khz(u64 period_khz, const char *source);
>
> #ifdef CONFIG_X86_64
> static inline bool apic_force_enable(unsigned long addr)
> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> index aa1e19979aa8..8d3d930576fd 100644
> --- a/arch/x86/kernel/apic/apic.c
> +++ b/arch/x86/kernel/apic/apic.c
> @@ -176,7 +176,7 @@ static struct resource lapic_resource = {
> };
>
> /* Measured in ticks per HZ. */
> -unsigned int lapic_timer_period = 0;
> +static unsigned int lapic_timer_period;
>
> static void apic_pm_activate(void);
>
> @@ -796,6 +796,16 @@ bool __init apic_needs_pit(void)
> return lapic_timer_period == 0;
> }
>
> +void apic_set_timer_period_khz(u64 period_khz, const char *source)
> +{
> + lapic_timer_period = mul_u64_u32_div(period_khz, 1000, HZ);
> +}
> +
> +void apic_set_timer_period_hz(u64 period_hz, const char *source)
> +{
> + lapic_timer_period = div_u64(period_hz, HZ);
> +}
A string "source" argument is passed in, but not used. Is there an
envisioned future use? Also, this function doesn't output a pr_info()
message like the existing Hyper-V and VMware code does. I don't
know that the message is all that useful, though I do remember
one case where I was debugging some clock/timer issue when I
looked at it.
Michael
> +
> static int __init calibrate_APIC_clock(void)
> {
> struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index 185d4f677ec0..87beecec76f0 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -646,10 +646,7 @@ static void __init ms_hyperv_init_platform(void)
> u64 hv_lapic_frequency;
>
> rdmsrq(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
> - hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
> - lapic_timer_period = hv_lapic_frequency;
> - pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n",
> - lapic_timer_period);
> + apic_set_timer_period_hz(hv_lapic_frequency, "Hyper-V hypervisor");
> }
>
> register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST,
> diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
> index 34b73573b108..36f779dd311d 100644
> --- a/arch/x86/kernel/cpu/vmware.c
> +++ b/arch/x86/kernel/cpu/vmware.c
> @@ -424,9 +424,7 @@ static void __init vmware_platform_setup(void)
>
> #ifdef CONFIG_X86_LOCAL_APIC
> /* Skip lapic calibration since we know the bus frequency. */
> - lapic_timer_period = ecx / HZ;
> - pr_info("Host bus clock speed read from hypervisor : %u Hz\n",
> - ecx);
> + apic_set_timer_period_hz(ecx, "VMware hypervisor");
> #endif
> } else {
> pr_warn("Failed to get TSC freq from the hypervisor\n");
> diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
> index f58ce9220e0f..f2d4ef89c085 100644
> --- a/arch/x86/kernel/jailhouse.c
> +++ b/arch/x86/kernel/jailhouse.c
> @@ -65,7 +65,7 @@ static void jailhouse_get_wallclock(struct timespec64 *now)
>
> static void __init jailhouse_timer_init(void)
> {
> - lapic_timer_period = setup_data.v1.apic_khz * (1000 / HZ);
> + apic_set_timer_period_khz(setup_data.v1.apic_khz, "Jailhouse hypervisor");
> }
>
> static unsigned long jailhouse_get_tsc(void)
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index ce10ae4b298b..f9ecc9256863 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -717,7 +717,7 @@ unsigned long native_calibrate_tsc(void)
> * lapic_timer_period here to avoid having to calibrate the APIC
> * timer later.
> */
> - lapic_timer_period = crystal_khz * 1000 / HZ;
> + apic_set_timer_period_khz(crystal_khz, "CPUID 0x15/0x16");
> #endif
>
> return crystal_khz * ebx_numerator / eax_denominator;
> diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
> index 48e6cc1cb017..7e990871e041 100644
> --- a/arch/x86/kernel/tsc_msr.c
> +++ b/arch/x86/kernel/tsc_msr.c
> @@ -211,7 +211,7 @@ unsigned long cpu_khz_from_msr(void)
> pr_err("Error MSR_FSB_FREQ index %d is unknown\n", index);
>
> #ifdef CONFIG_X86_LOCAL_APIC
> - lapic_timer_period = (freq * 1000) / HZ;
> + apic_set_timer_period_khz(freq, "MSR_FSB_FREQ");
> #endif
>
> /*
> --
> 2.55.0.rc0.799.gd6f94ed593-goog
>
^ permalink raw reply [flat|nested] 72+ messages in thread
* [PATCH v5 02/51] x86/apic: Add CONFIG_X86_LOCAL_APIC=n stubs for apic_set_timer_period_{,k}hz()
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 01/51] x86/apic: Provide helpers to set local APIC timer period in hz and khz Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:48 ` sashiko-bot
2026-07-01 19:31 ` [PATCH v5 03/51] x86/tsc: Ensure that TSC recalibration doesn't run if TSC frequency is known Sean Christopherson
` (48 subsequent siblings)
50 siblings, 1 reply; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Add stubs for the apic_set_timer_period_{,k}hz() APIs when the kernel is
built without support for a local APIC, and drop #ifdefs in callers that
don't need to check CONFIG_X86_LOCAL_APIC for other reasons.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/apic.h | 2 ++
arch/x86/kernel/cpu/vmware.c | 2 --
arch/x86/kernel/tsc.c | 2 --
arch/x86/kernel/tsc_msr.c | 2 --
4 files changed, 2 insertions(+), 6 deletions(-)
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index cd84a94688a2..035998555e99 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -189,6 +189,8 @@ static inline void disable_local_APIC(void) { }
# define setup_boot_APIC_clock x86_init_noop
# define setup_secondary_APIC_clock x86_init_noop
static inline void lapic_update_tsc_freq(void) { }
+static inline void apic_set_timer_period_hz(u64 period_hz, const char *source) { }
+static inline void apic_set_timer_period_khz(u64 period_khz, const char *source) { }
static inline void init_bsp_APIC(void) { }
static inline void apic_intr_mode_select(void) { }
static inline void apic_intr_mode_init(void) { }
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 36f779dd311d..13b97265c535 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -422,10 +422,8 @@ static void __init vmware_platform_setup(void)
x86_platform.calibrate_tsc = vmware_get_tsc_khz;
x86_platform.calibrate_cpu = vmware_get_tsc_khz;
-#ifdef CONFIG_X86_LOCAL_APIC
/* Skip lapic calibration since we know the bus frequency. */
apic_set_timer_period_hz(ecx, "VMware hypervisor");
-#endif
} else {
pr_warn("Failed to get TSC freq from the hypervisor\n");
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index f9ecc9256863..4d6a446645c0 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -710,7 +710,6 @@ unsigned long native_calibrate_tsc(void)
if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT)
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
-#ifdef CONFIG_X86_LOCAL_APIC
/*
* The local APIC appears to be fed by the core crystal clock
* (which sounds entirely sensible). We can set the global
@@ -718,7 +717,6 @@ unsigned long native_calibrate_tsc(void)
* timer later.
*/
apic_set_timer_period_khz(crystal_khz, "CPUID 0x15/0x16");
-#endif
return crystal_khz * ebx_numerator / eax_denominator;
}
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 7e990871e041..aece062aee7e 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -210,9 +210,7 @@ unsigned long cpu_khz_from_msr(void)
if (freq == 0)
pr_err("Error MSR_FSB_FREQ index %d is unknown\n", index);
-#ifdef CONFIG_X86_LOCAL_APIC
apic_set_timer_period_khz(freq, "MSR_FSB_FREQ");
-#endif
/*
* TSC frequency determined by MSR is always considered "known"
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* Re: [PATCH v5 02/51] x86/apic: Add CONFIG_X86_LOCAL_APIC=n stubs for apic_set_timer_period_{,k}hz()
2026-07-01 19:31 ` [PATCH v5 02/51] x86/apic: Add CONFIG_X86_LOCAL_APIC=n stubs for apic_set_timer_period_{,k}hz() Sean Christopherson
@ 2026-07-01 19:48 ` sashiko-bot
0 siblings, 0 replies; 72+ messages in thread
From: sashiko-bot @ 2026-07-01 19:48 UTC (permalink / raw)
To: Sean Christopherson; +Cc: linux-hyperv, kvm
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Medium] The `source` argument in `apic_set_timer_period_hz()` and `apic_set_timer_period_khz()` is completely unused, leading to dead code and lost debug logging.
--
commit a7b2ed4060ee2d44136d35fad056804351b581ca
Author: Sean Christopherson <seanjc@google.com>
x86/apic: Add CONFIG_X86_LOCAL_APIC=n stubs for apic_set_timer_period_{,k}hz()
This commit adds stubs for apic_set_timer_period_hz() and
apic_set_timer_period_khz() when the kernel is built without local APIC
support. It also removes the corresponding #ifdef guards from the calling
code in various x86 subsystems.
> diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
> index cd84a94688a2f..035998555e992 100644
> --- a/arch/x86/include/asm/apic.h
> +++ b/arch/x86/include/asm/apic.h
> @@ -189,6 +189,8 @@ static inline void disable_local_APIC(void) { }
> # define setup_boot_APIC_clock x86_init_noop
> # define setup_secondary_APIC_clock x86_init_noop
> static inline void lapic_update_tsc_freq(void) { }
> +static inline void apic_set_timer_period_hz(u64 period_hz, const char *source) { }
> +static inline void apic_set_timer_period_khz(u64 period_khz, const char *source) { }
[Severity: Medium]
Is it intentional that the source argument is completely unused here in the
stubs?
Looking at the main implementation in arch/x86/kernel/apic/apic.c:
void apic_set_timer_period_khz(u64 period_khz, const char *source)
{
lapic_timer_period = mul_u64_u32_div(period_khz, 1000, HZ);
}
The source parameter is discarded without being used for debug logging. Since
any code calling apic_set_timer_period_hz() or apic_set_timer_period_khz()
passes a string literal describing the source (such as "VMware hypervisor"
or "CPUID 0x15/0x16"), this results in dropped debug information.
Could the source parameter be removed from the API entirely to clean up the
dead code, or should the boot-time logging be restored?
> static inline void init_bsp_APIC(void) { }
> static inline void apic_intr_mode_select(void) { }
> static inline void apic_intr_mode_init(void) { }
[ ... ]
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260701193212.749551-1-seanjc@google.com?part=2
^ permalink raw reply [flat|nested] 72+ messages in thread
* [PATCH v5 03/51] x86/tsc: Ensure that TSC recalibration doesn't run if TSC frequency is known
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 01/51] x86/apic: Provide helpers to set local APIC timer period in hz and khz Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 02/51] x86/apic: Add CONFIG_X86_LOCAL_APIC=n stubs for apic_set_timer_period_{,k}hz() Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 04/51] x86/tsc: Restrict recalibrate_cpu_khz() export to p4-clockmod and powernow-k7 Sean Christopherson
` (47 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
When attempting TSC recalibration post-boot, which is only done for ancient
CPUs (P4 and K7) on SMP=n kernels, assert that the TSC frequency isn't
known (explicitly provided by hardware) by way of MSR or CPUID, and bail if
the impossible happens. In practice, recalibration and TSC_KNOWN_FREQ are
mutually exclusive, as TSC_KNOWN_FREQ will only be set when running on
hardware that was released decades after recalibration was obsoleted, but
it's hard to see that, especially when looking at just the TSC code.
Note, the WARN can likely be tripped by running in a virtual machine and
concocting an impossible CPU model, e.g. by combining a P4 signature with
CPUID 0x15. This is working as intended, as such a virtual CPU model is
wildly out-of-spec and is not supported.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/tsc.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 4d6a446645c0..4393902c0ddd 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -930,6 +930,9 @@ void recalibrate_cpu_khz(void)
if (!boot_cpu_has(X86_FEATURE_TSC))
return;
+ if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_TSC_KNOWN_FREQ)))
+ return;
+
cpu_khz = x86_platform.calibrate_cpu();
tsc_khz = x86_platform.calibrate_tsc();
if (tsc_khz == 0)
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 04/51] x86/tsc: Restrict recalibrate_cpu_khz() export to p4-clockmod and powernow-k7
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (2 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 03/51] x86/tsc: Ensure that TSC recalibration doesn't run if TSC frequency is known Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 05/51] x86/sev: Mark TSC as reliable when configuring Secure TSC Sean Christopherson
` (46 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Export recalibrate_cpu_khz() only for its two users, p4-clockmod.ko and
powernow-k7.ko, to help document that recalibration is relevant only to
ancient CPUs.
For all intents and purposes, no functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/tsc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 4393902c0ddd..482cc3a8999a 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -943,7 +943,7 @@ void recalibrate_cpu_khz(void)
cpu_khz_old, cpu_khz);
#endif
}
-EXPORT_SYMBOL_GPL(recalibrate_cpu_khz);
+EXPORT_SYMBOL_FOR_MODULES(recalibrate_cpu_khz, "p4-clockmod,powernow-k7");
static unsigned long long cyc2ns_suspend;
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 05/51] x86/sev: Mark TSC as reliable when configuring Secure TSC
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (3 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 04/51] x86/tsc: Restrict recalibrate_cpu_khz() export to p4-clockmod and powernow-k7 Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:56 ` sashiko-bot
2026-07-01 19:31 ` [PATCH v5 06/51] x86/sev: Don't override CPU frequency calibration for SNP's " Sean Christopherson
` (45 subsequent siblings)
50 siblings, 1 reply; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Move the code to mark the TSC as reliable from sme_early_init() to
snp_secure_tsc_init(). The only reader of TSC_RELIABLE is the aptly
named check_system_tsc_reliable(), which runs in tsc_init(), i.e.
after snp_secure_tsc_init().
This will allow consolidating the handling of TSC_KNOWN_FREQ and
TSC_RELIABLE when overriding the TSC calibration routine.
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Nikunj A Dadhania <nikunj@amd.com>
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/coco/sev/core.c | 2 ++
arch/x86/mm/mem_encrypt_amd.c | 3 ---
2 files changed, 2 insertions(+), 3 deletions(-)
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index ecd77d3217f3..ed0ac52a765e 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -2037,6 +2037,8 @@ void __init snp_secure_tsc_init(void)
secrets = (__force struct snp_secrets_page *)mem;
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
rdmsrq(MSR_AMD64_GUEST_TSC_FREQ, tsc_freq_mhz);
/* Extract the GUEST TSC MHZ from BIT[17:0], rest is reserved space */
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 2f8c32173972..6c3af974c7c2 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -535,9 +535,6 @@ void __init sme_early_init(void)
*/
x86_init.resources.dmi_setup = snp_dmi_setup;
}
-
- if (sev_status & MSR_AMD64_SNP_SECURE_TSC)
- setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
}
void __init mem_encrypt_free_decrypted_mem(void)
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* Re: [PATCH v5 05/51] x86/sev: Mark TSC as reliable when configuring Secure TSC
2026-07-01 19:31 ` [PATCH v5 05/51] x86/sev: Mark TSC as reliable when configuring Secure TSC Sean Christopherson
@ 2026-07-01 19:56 ` sashiko-bot
2026-07-01 20:07 ` Sean Christopherson
0 siblings, 1 reply; 72+ messages in thread
From: sashiko-bot @ 2026-07-01 19:56 UTC (permalink / raw)
To: Sean Christopherson; +Cc: linux-hyperv, kvm
Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
Pre-existing issues:
- [High] This problem wasn't introduced by this patch, but a malicious hypervisor can bypass Secure TSC calibration by injecting a fake UV system table, forcing the guest to fall back to untrusted legacy timers.
- [High] This problem wasn't introduced by this patch, but if the calculated Secure TSC frequency is 0, the kernel silently falls back to untrusted timers instead of terminating the VM.
--
Note: Due to system recitation restrictions, this report is provided as a free-form summary without direct diff quoting.
commit cdbd80de837f6d512a759452a18164ddd83b9b7b
Author: Sean Christopherson <seanjc@google.com>
x86/sev: Mark TSC as reliable when configuring Secure TSC
This commit moves the setting of the X86_FEATURE_TSC_RELIABLE flag from
sme_early_init() to snp_secure_tsc_init(). It consolidates the handling of
known-frequency and reliable TSC flags when overriding the TSC calibration
routine.
[Severity: High]
This isn't a bug introduced by this patch, but does the early return in
arch/x86/kernel/tsc.c:tsc_early_init() allow an untrusted hypervisor to
bypass Secure TSC initialization?
The tsc_early_init() function checks is_early_uv_system() before initializing
Secure TSC. The is_early_uv_system() check relies on the presence of
UV_SYSTEM_TABLE_GUID in the EFI configuration tables.
Because the hypervisor can provide tables to the guest firmware which may
expose them as EFI configuration tables, an untrusted hypervisor could inject
this specific GUID. Would this trick the kernel into returning early,
bypassing both Secure TSC and TDX TSC initialization, and causing the guest
to fall back to untrusted legacy timers?
[Severity: High]
This is also a pre-existing issue, but should the newly updated
arch/x86/coco/sev/core.c:snp_secure_tsc_init() validate that the calculated
frequency is non-zero before returning?
If the calculated frequency evaluates to 0, which could happen if the
hypervisor causes MSR_AMD64_GUEST_TSC_FREQ to read as 0 or if the secrets
factor is invalid, the function simply returns 0 instead of terminating
the VM.
The caller tsc_early_init() treats a return value of 0 as a failure to find
a trusted frequency and gracefully falls back to x86_init.hyper.get_tsc_khz()
or legacy timers. In a confidential computing threat model, should this
security-critical initialization fail closed and terminate the VM, rather
than falling back to the hypervisor-controlled time sources that Secure TSC
is designed to protect against?
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260701193212.749551-1-seanjc@google.com?part=5
^ permalink raw reply [flat|nested] 72+ messages in thread
* Re: [PATCH v5 05/51] x86/sev: Mark TSC as reliable when configuring Secure TSC
2026-07-01 19:56 ` sashiko-bot
@ 2026-07-01 20:07 ` Sean Christopherson
0 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 20:07 UTC (permalink / raw)
To: sashiko-reviews; +Cc: linux-hyperv, kvm
On Wed, Jul 01, 2026, sashiko-bot@kernel.org wrote:
> Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
> commit cdbd80de837f6d512a759452a18164ddd83b9b7b
> Author: Sean Christopherson <seanjc@google.com>
>
> x86/sev: Mark TSC as reliable when configuring Secure TSC
>
> This commit moves the setting of the X86_FEATURE_TSC_RELIABLE flag from
> sme_early_init() to snp_secure_tsc_init(). It consolidates the handling of
> known-frequency and reliable TSC flags when overriding the TSC calibration
> routine.
>
> [Severity: High]
> This isn't a bug introduced by this patch, but does the early return in
> arch/x86/kernel/tsc.c:tsc_early_init() allow an untrusted hypervisor to
> bypass Secure TSC initialization?
>
> The tsc_early_init() function checks is_early_uv_system() before initializing
> Secure TSC. The is_early_uv_system() check relies on the presence of
> UV_SYSTEM_TABLE_GUID in the EFI configuration tables.
>
> Because the hypervisor can provide tables to the guest firmware which may
> expose them as EFI configuration tables, an untrusted hypervisor could inject
> this specific GUID. Would this trick the kernel into returning early,
> bypassing both Secure TSC and TDX TSC initialization, and causing the guest
> to fall back to untrusted legacy timers?
I suppose so? But that's a problem best handled in a dedicated, separate patch.
>
> [Severity: High]
> This is also a pre-existing issue, but should the newly updated
> arch/x86/coco/sev/core.c:snp_secure_tsc_init() validate that the calculated
> frequency is non-zero before returning?
>
> If the calculated frequency evaluates to 0, which could happen if the
> hypervisor causes MSR_AMD64_GUEST_TSC_FREQ to read as 0 or if the secrets
> factor is invalid, the function simply returns 0 instead of terminating
> the VM.
The hypervisor can't do that? If it can, that too should be addressed separately.
> The caller tsc_early_init() treats a return value of 0 as a failure to find
> a trusted frequency and gracefully falls back to x86_init.hyper.get_tsc_khz()
> or legacy timers. In a confidential computing threat model, should this
> security-critical initialization fail closed and terminate the VM, rather
> than falling back to the hypervisor-controlled time sources that Secure TSC
> is designed to protect against?
>
> --
> Sashiko AI review · https://sashiko.dev/#/patchset/20260701193212.749551-1-seanjc@google.com?part=5
^ permalink raw reply [flat|nested] 72+ messages in thread
* [PATCH v5 06/51] x86/sev: Don't override CPU frequency calibration for SNP's Secure TSC
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (4 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 05/51] x86/sev: Mark TSC as reliable when configuring Secure TSC Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:53 ` sashiko-bot
2026-07-01 19:31 ` [PATCH v5 07/51] x86/sev: Move check for SNP Secure TSC support to tsc_early_init() Sean Christopherson
` (44 subsequent siblings)
50 siblings, 1 reply; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Don't override the kernel's CPU frequency calibration routine when
registering SNP's Secure TSC calibration routine. SNP (the architecture)
provides zero guarantees that the CPU runs at the same frequency as the
TSC. The justification for clobbering the CPU routine was:
Since the difference between CPU base and TSC frequency does not apply
in this case, the same callback is being used.
but that's simply not true. E.g. if APERF/MPERF is exposed to the VM, then
the CPU frequency absolutely does matter.
While relying on heuristics and/or the untrusted hypervisor to provide the
CPU frequency isn't ideal, it's at least not outright wrong.
Fixes: 73bbf3b0fbba ("x86/tsc: Init the TSC for Secure TSC guests")
Cc: Nikunj A Dadhania <nikunj@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/coco/sev/core.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index ed0ac52a765e..665de1aea0ee 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -2046,7 +2046,6 @@ void __init snp_secure_tsc_init(void)
snp_tsc_freq_khz = SNP_SCALE_TSC_FREQ(tsc_freq_mhz * 1000, secrets->tsc_factor);
- x86_platform.calibrate_cpu = securetsc_get_tsc_khz;
x86_platform.calibrate_tsc = securetsc_get_tsc_khz;
early_memunmap(mem, PAGE_SIZE);
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* Re: [PATCH v5 06/51] x86/sev: Don't override CPU frequency calibration for SNP's Secure TSC
2026-07-01 19:31 ` [PATCH v5 06/51] x86/sev: Don't override CPU frequency calibration for SNP's " Sean Christopherson
@ 2026-07-01 19:53 ` sashiko-bot
2026-07-01 20:09 ` Sean Christopherson
0 siblings, 1 reply; 72+ messages in thread
From: sashiko-bot @ 2026-07-01 19:53 UTC (permalink / raw)
To: Sean Christopherson; +Cc: kvm, linux-hyperv
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Medium] The patch removes the `calibrate_cpu` override to allow the CPU base frequency to differ from the TSC frequency, but fails to account for `determine_cpu_tsc_frequencies()` which will forcefully clobber `cpu_khz` if it diverges by more than 10%.
--
commit 6d3b9e81661ead5f25434666266aec6514dadaa8
Author: Sean Christopherson <seanjc@google.com>
x86/sev: Don't override CPU frequency calibration for SNP's Secure TSC
This commit removes the override of the kernel's CPU frequency calibration
routine when registering SNP's Secure TSC. It aims to decouple the CPU base
frequency from the TSC frequency, as exposing APERF/MPERF requires an
accurate CPU frequency.
> diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
> --- a/arch/x86/coco/sev/core.c
> +++ b/arch/x86/coco/sev/core.c
> @@ -2046,7 +2046,6 @@ void __init snp_secure_tsc_init(void)
>
> snp_tsc_freq_khz = SNP_SCALE_TSC_FREQ(tsc_freq_mhz * 1000, secrets->tsc_factor);
>
> - x86_platform.calibrate_cpu = securetsc_get_tsc_khz;
[Severity: Medium]
Will this removal be sufficient to decouple cpu_khz from tsc_khz if they
diverge significantly?
Looking at determine_cpu_tsc_frequencies() in arch/x86/kernel/tsc.c, there
is a generic sanity check that enforces a 10% limit on the difference
between the two frequencies:
arch/x86/kernel/tsc.c:determine_cpu_tsc_frequencies() {
...
else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
cpu_khz = tsc_khz;
...
}
If the true host CPU frequency diverges from the guest TSC frequency by more
than 10%, won't this logic forcefully overwrite the correct cpu_khz back to
tsc_khz, unintentionally undoing the fix for the APERF/MPERF calculation?
> x86_platform.calibrate_tsc = securetsc_get_tsc_khz;
>
> early_memunmap(mem, PAGE_SIZE);
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260701193212.749551-1-seanjc@google.com?part=6
^ permalink raw reply [flat|nested] 72+ messages in thread
* Re: [PATCH v5 06/51] x86/sev: Don't override CPU frequency calibration for SNP's Secure TSC
2026-07-01 19:53 ` sashiko-bot
@ 2026-07-01 20:09 ` Sean Christopherson
0 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 20:09 UTC (permalink / raw)
To: sashiko-reviews; +Cc: kvm, linux-hyperv
On Wed, Jul 01, 2026, sashiko-bot@kernel.org wrote:
> Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
> - [Medium] The patch removes the `calibrate_cpu` override to allow the CPU base frequency to differ from the TSC frequency, but fails to account for `determine_cpu_tsc_frequencies()` which will forcefully clobber `cpu_khz` if it diverges by more than 10%.
> --
>
> commit 6d3b9e81661ead5f25434666266aec6514dadaa8
> Author: Sean Christopherson <seanjc@google.com>
>
> x86/sev: Don't override CPU frequency calibration for SNP's Secure TSC
>
> This commit removes the override of the kernel's CPU frequency calibration
> routine when registering SNP's Secure TSC. It aims to decouple the CPU base
> frequency from the TSC frequency, as exposing APERF/MPERF requires an
> accurate CPU frequency.
>
> > diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
> > --- a/arch/x86/coco/sev/core.c
> > +++ b/arch/x86/coco/sev/core.c
> > @@ -2046,7 +2046,6 @@ void __init snp_secure_tsc_init(void)
> >
> > snp_tsc_freq_khz = SNP_SCALE_TSC_FREQ(tsc_freq_mhz * 1000, secrets->tsc_factor);
> >
> > - x86_platform.calibrate_cpu = securetsc_get_tsc_khz;
>
> [Severity: Medium]
> Will this removal be sufficient to decouple cpu_khz from tsc_khz if they
> diverge significantly?
No, but IMO it's a step in the right direction.
> Looking at determine_cpu_tsc_frequencies() in arch/x86/kernel/tsc.c, there
> is a generic sanity check that enforces a 10% limit on the difference
> between the two frequencies:
>
> arch/x86/kernel/tsc.c:determine_cpu_tsc_frequencies() {
> ...
> else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
> cpu_khz = tsc_khz;
> ...
> }
>
> If the true host CPU frequency diverges from the guest TSC frequency by more
> than 10%, won't this logic forcefully overwrite the correct cpu_khz back to
> tsc_khz, unintentionally undoing the fix for the APERF/MPERF calculation?
Probably, but this series is already boiling a pretty large lake; I think the
ocean can be left for a future cleanup.
^ permalink raw reply [flat|nested] 72+ messages in thread
* [PATCH v5 07/51] x86/sev: Move check for SNP Secure TSC support to tsc_early_init()
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (5 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 06/51] x86/sev: Don't override CPU frequency calibration for SNP's " Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 08/51] x86/sev: Shove SNP's secure/trusted TSC frequency directly into "calibration" Sean Christopherson
` (43 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Move the check on having a Secure TSC to the common tsc_early_init() so
that it's obvious that having a Secure TSC is conditional, and to prepare
for adding TDX to the mix (blindly initializing *both* SNP and TDX TSC
logic looks especially weird).
No functional change intended.
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Nikunj A Dadhania <nikunj@amd.com>
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/coco/sev/core.c | 3 ---
arch/x86/kernel/tsc.c | 3 ++-
2 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index 665de1aea0ee..403dcea86452 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -2025,9 +2025,6 @@ void __init snp_secure_tsc_init(void)
unsigned long tsc_freq_mhz;
void *mem;
- if (!cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
- return;
-
mem = early_memremap_encrypted(sev_secrets_pa, PAGE_SIZE);
if (!mem) {
pr_err("Unable to get TSC_FACTOR: failed to map the SNP secrets page.\n");
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 482cc3a8999a..8f1604ffe986 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1509,7 +1509,8 @@ void __init tsc_early_init(void)
if (is_early_uv_system())
return;
- snp_secure_tsc_init();
+ if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
+ snp_secure_tsc_init();
if (!determine_cpu_tsc_frequencies(true))
return;
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 08/51] x86/sev: Shove SNP's secure/trusted TSC frequency directly into "calibration"
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (6 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 07/51] x86/sev: Move check for SNP Secure TSC support to tsc_early_init() Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 09/51] x86/tsc: Add a standalone helper for getting TSC info from CPUID.0x15 Sean Christopherson
` (42 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
As a first step towards dropping .calibrate_{cpu,tsc}() and explicitly
defining precedence/priority for "calibration" routines, pass the secure
TSC frequency obtained from SNP firmware directly to
determine_cpu_tsc_frequencies() instead of overriding the .calibrate_tsc()
hook.
Unlike the native calibration routines, all of the paravirtual overrides,
including SNP and TDX, are constant in the sense that the frequency
provided by the hypervisor or trusted firmware is fixed, known, and always
available during early boot. More importantly, for CoCo (SNP and TDX) VMs,
it's imperative that the kernel uses the frequency provided by the trusted
firmware, not by the untrusted hypervisor. Enforcing the priority between
sources by carefully ordering seemingly unrelated init calls, so that the
trusted override "wins", is brittle and all but impossible to follow.
Explicitly ignore tsc_early_khz if the exact TSC frequency was obtained
from trusted firmware. Per commit bd35c77e32e4 ("x86/tsc: Add
tsc_early_khz command line parameter"), the goal of the param is to play
nice with setups that provide partial frequency information in CPUID, i.e.
it is NOT intended to be a hard override. Neither SNP's secure TSC nor TDX
was supported when commit bd35c77e32e4 landed back in 2020, i.e. the lack
of consideration for the interaction was purely due to oversight when SNP
and TDX support came along.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
.../admin-guide/kernel-parameters.txt | 4 +++
arch/x86/coco/sev/core.c | 14 +++--------
arch/x86/include/asm/sev.h | 4 +--
arch/x86/kernel/tsc.c | 25 ++++++++++++++-----
4 files changed, 29 insertions(+), 18 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index b5493a7f8f22..181149f633c3 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -7946,6 +7946,10 @@ Kernel parameters
with CPUID.16h support and partial CPUID.15h support.
Format: <unsigned int>
+ Note, tsc_early_khz is ignored if the TSC frequency is
+ provided by trusted firmware when running as an SNP
+ guest.
+
tsx= [X86] Control Transactional Synchronization
Extensions (TSX) feature in Intel processors that
support TSX control.
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index 403dcea86452..bc5ae9ef74da 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -99,7 +99,6 @@ static const char * const sev_status_feat_names[] = {
*/
static u64 snp_tsc_scale __ro_after_init;
static u64 snp_tsc_offset __ro_after_init;
-static unsigned long snp_tsc_freq_khz __ro_after_init;
DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
@@ -2014,15 +2013,10 @@ void __init snp_secure_tsc_prepare(void)
pr_debug("SecureTSC enabled");
}
-static unsigned long securetsc_get_tsc_khz(void)
-{
- return snp_tsc_freq_khz;
-}
-
-void __init snp_secure_tsc_init(void)
+unsigned int __init snp_secure_tsc_init(void)
{
+ unsigned long snp_tsc_freq_khz, tsc_freq_mhz;
struct snp_secrets_page *secrets;
- unsigned long tsc_freq_mhz;
void *mem;
mem = early_memremap_encrypted(sev_secrets_pa, PAGE_SIZE);
@@ -2043,7 +2037,7 @@ void __init snp_secure_tsc_init(void)
snp_tsc_freq_khz = SNP_SCALE_TSC_FREQ(tsc_freq_mhz * 1000, secrets->tsc_factor);
- x86_platform.calibrate_tsc = securetsc_get_tsc_khz;
-
early_memunmap(mem, PAGE_SIZE);
+
+ return snp_tsc_freq_khz;
}
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 594cfa19cbd4..05ebf0b73ef4 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -530,7 +530,7 @@ int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req
int snp_svsm_vtpm_send_command(u8 *buffer);
void __init snp_secure_tsc_prepare(void);
-void __init snp_secure_tsc_init(void);
+unsigned int snp_secure_tsc_init(void);
enum es_result savic_register_gpa(u64 gpa);
enum es_result savic_unregister_gpa(u64 *gpa);
u64 savic_ghcb_msr_read(u32 reg);
@@ -637,7 +637,7 @@ static inline int snp_send_guest_request(struct snp_msg_desc *mdesc,
struct snp_guest_req *req) { return -ENODEV; }
static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; }
static inline void __init snp_secure_tsc_prepare(void) { }
-static inline void __init snp_secure_tsc_init(void) { }
+static inline unsigned int __init snp_secure_tsc_init(void) { return 0; }
static inline void sev_evict_cache(void *va, int npages) {}
static inline enum es_result savic_register_gpa(u64 gpa) { return ES_UNSUPPORTED; }
static inline enum es_result savic_unregister_gpa(u64 *gpa) { return ES_UNSUPPORTED; }
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 8f1604ffe986..f049c126e47c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1440,15 +1440,16 @@ static int __init init_tsc_clocksource(void)
*/
device_initcall(init_tsc_clocksource);
-static bool __init determine_cpu_tsc_frequencies(bool early)
+static bool __init determine_cpu_tsc_frequencies(bool early,
+ unsigned int known_tsc_khz)
{
/* Make sure that cpu and tsc are not already calibrated */
WARN_ON(cpu_khz || tsc_khz);
if (early) {
cpu_khz = x86_platform.calibrate_cpu();
- if (tsc_early_khz)
- tsc_khz = tsc_early_khz;
+ if (known_tsc_khz)
+ tsc_khz = known_tsc_khz;
else
tsc_khz = x86_platform.calibrate_tsc();
} else {
@@ -1503,6 +1504,8 @@ static void __init tsc_enable_sched_clock(void)
void __init tsc_early_init(void)
{
+ unsigned int known_tsc_khz = 0;
+
if (!boot_cpu_has(X86_FEATURE_TSC))
return;
/* Don't change UV TSC multi-chassis synchronization */
@@ -1510,9 +1513,19 @@ void __init tsc_early_init(void)
return;
if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
- snp_secure_tsc_init();
+ known_tsc_khz = snp_secure_tsc_init();
- if (!determine_cpu_tsc_frequencies(true))
+ /*
+ * Ignore the user-provided TSC frequency if the exact frequency was
+ * obtained from trusted firmware, as the user-provided frequency is
+ * intended as a "starting point", not a known, guaranteed frequency.
+ */
+ if (!known_tsc_khz)
+ known_tsc_khz = tsc_early_khz;
+ else if (tsc_early_khz)
+ pr_err("Ignoring 'tsc_early_khz' in favor of trusted firmware.\n");
+
+ if (!determine_cpu_tsc_frequencies(true, known_tsc_khz))
return;
tsc_enable_sched_clock();
}
@@ -1533,7 +1546,7 @@ void __init tsc_init(void)
if (!tsc_khz) {
/* We failed to determine frequencies earlier, try again */
- if (!determine_cpu_tsc_frequencies(false)) {
+ if (!determine_cpu_tsc_frequencies(false, 0)) {
mark_tsc_unstable("could not calculate TSC khz");
setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
return;
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 09/51] x86/tsc: Add a standalone helper for getting TSC info from CPUID.0x15
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (7 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 08/51] x86/sev: Shove SNP's secure/trusted TSC frequency directly into "calibration" Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 10/51] x86/tdx: Force TSC frequency with CPUID-based info provided by the TDX-Module Sean Christopherson
` (41 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Extract retrieval of TSC frequency information from CPUID into a standalone
helper so that TDX guest support can reuse the logic.
Opportunistically drop native_calibrate_tsc()'s "== 0" and "!= 0" checks
in favor of the kernel's preferred style.
No functional change intended.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/tsc.c | 61 +++++++++++++++++++++++++++----------------
1 file changed, 38 insertions(+), 23 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index f049c126e47c..12043812c8f5 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -645,46 +645,62 @@ static unsigned long quick_pit_calibrate(void)
return delta;
}
+struct cpuid_tsc_info {
+ unsigned int denominator;
+ unsigned int numerator;
+ unsigned int crystal_khz;
+};
+
+static int cpuid_get_tsc_info(struct cpuid_tsc_info *info)
+{
+ unsigned int ecx_hz, edx;
+
+ if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC)
+ return -ENOENT;
+
+ /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
+ cpuid(CPUID_LEAF_TSC, &info->denominator, &info->numerator, &ecx_hz, &edx);
+
+ if (!info->denominator || !info->numerator)
+ return -ENOENT;
+
+ /*
+ * Note: some CPUs provide the multiplier information, but not the core
+ * crystal frequency. The multiplier information is still useful for
+ * such CPUs, as the crystal frequency can be gleaned from CPUID.0x16.
+ */
+ info->crystal_khz = ecx_hz / 1000;
+ return 0;
+}
+
/**
* native_calibrate_tsc - determine TSC frequency
* Determine TSC frequency via CPUID, else return 0.
*/
unsigned long native_calibrate_tsc(void)
{
- unsigned int eax_denominator, ebx_numerator, ecx_hz, edx;
- unsigned int crystal_khz;
+ struct cpuid_tsc_info info;
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 0;
- if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC)
+ if (cpuid_get_tsc_info(&info))
return 0;
- eax_denominator = ebx_numerator = ecx_hz = edx = 0;
-
- /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
- cpuid(CPUID_LEAF_TSC, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);
-
- if (ebx_numerator == 0 || eax_denominator == 0)
- return 0;
-
- crystal_khz = ecx_hz / 1000;
-
/*
* Denverton SoCs don't report crystal clock, and also don't support
* CPUID_LEAF_FREQ for the calculation below, so hardcode the 25MHz
* crystal clock.
*/
- if (crystal_khz == 0 &&
- boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D)
- crystal_khz = 25000;
+ if (!info.crystal_khz && boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D)
+ info.crystal_khz = 25000;
/*
* TSC frequency reported directly by CPUID is a "hardware reported"
* frequency and is the most accurate one so far we have. This
* is considered a known frequency.
*/
- if (crystal_khz != 0)
+ if (info.crystal_khz)
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
/*
@@ -692,15 +708,14 @@ unsigned long native_calibrate_tsc(void)
* clock, but we can easily calculate it to a high degree of accuracy
* by considering the crystal ratio and the CPU speed.
*/
- if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) {
+ if (!info.crystal_khz && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) {
unsigned int eax_base_mhz, ebx, ecx, edx;
cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx);
- crystal_khz = eax_base_mhz * 1000 *
- eax_denominator / ebx_numerator;
+ info.crystal_khz = eax_base_mhz * 1000 * info.denominator / info.numerator;
}
- if (crystal_khz == 0)
+ if (!info.crystal_khz)
return 0;
/*
@@ -716,9 +731,9 @@ unsigned long native_calibrate_tsc(void)
* lapic_timer_period here to avoid having to calibrate the APIC
* timer later.
*/
- apic_set_timer_period_khz(crystal_khz, "CPUID 0x15/0x16");
+ apic_set_timer_period_khz(info.crystal_khz, "CPUID 0x15/0x16");
- return crystal_khz * ebx_numerator / eax_denominator;
+ return info.crystal_khz * info.numerator / info.denominator;
}
static unsigned long cpu_khz_from_cpuid(void)
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread* [PATCH v5 10/51] x86/tdx: Force TSC frequency with CPUID-based info provided by the TDX-Module
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (8 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 09/51] x86/tsc: Add a standalone helper for getting TSC info from CPUID.0x15 Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 11/51] x86/tsc: Add dedicated hypervisor hooks for getting known TSC/CPU frequencies Sean Christopherson
` (40 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
When running as a TDX guest, explicitly set the TSC frequency to a known
value, using CPUID-based information, instead of potentially relying on a
hypervisor-controlled PV routine. For TDX guests, CPUID.0x15 is always
emulated by the TDX-Module, i.e. the information from CPUID is more
trustworthy than the information provided by the hypervisor.
To maintain backwards compatibility with TDX guest kernels that use native
calibration, and because it's the least awful option, retain
native_calibrate_tsc()'s stuffing of the local APIC bus period using the
core crystal frequency. While it's entirely possible for the hypervisor
to emulate the APIC timer at a different frequency than the core crystal
frequency, the commonly accepted interpretation of Intel's SDM is that the APIC
timer runs at the core crystal frequency when the latter is enumerated via
CPUID:
The APIC timer frequency will be the processor’s bus clock or core
crystal clock frequency (when TSC/core crystal clock ratio is enumerated
in CPUID leaf 0x15).
If the hypervisor is malicious and deliberately runs the APIC timer at the
wrong frequency, nothing would stop the hypervisor from modifying the
frequency at any time, i.e. attempting to manually calibrate the frequency
out of paranoia would be futile.
Deliberately leave CPU frequency calibration as is, since the TDX-Module
doesn't provide any guarantees with respect to CPUID.0x16.
Expose and use cpuid_get_tsc_info() instead of providing a wrapper to
get the TSC and core crystal frequency, as TDX is the only anticipated
user outside of the TSC code, i.e. adding a helper to dedup the math won't
actually dedup anything. Having TDX use "struct cpuid_tsc_info" also
avoids the temptation of declaring a local "tsc_khz" variable and thus
unintentionally creating a shadow of the global "tsc_khz".
Cc: Kiryl Shutsemau (Meta) <kas@kernel.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
.../admin-guide/kernel-parameters.txt | 4 ++--
arch/x86/coco/tdx/tdx.c | 20 ++++++++++++++++---
arch/x86/include/asm/tdx.h | 2 ++
arch/x86/include/asm/tsc.h | 7 +++++++
arch/x86/kernel/tsc.c | 11 ++++------
5 files changed, 32 insertions(+), 12 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 181149f633c3..490e6aa72fc2 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -7947,8 +7947,8 @@ Kernel parameters
Format: <unsigned int>
Note, tsc_early_khz is ignored if the TSC frequency is
- provided by trusted firmware when running as an SNP
- guest.
+ provided by trusted firmware when running as an SNP or
+ TDX guest.
tsx= [X86] Control Transactional Synchronization
Extensions (TSX) feature in Intel processors that
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 29b6f1ed59ec..ae2d35f2ef33 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -8,6 +8,7 @@
#include <linux/export.h>
#include <linux/io.h>
#include <linux/kexec.h>
+#include <asm/apic.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
@@ -1123,9 +1124,6 @@ void __init tdx_early_init(void)
setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
- /* TSC is the only reliable clock in TDX guest */
- setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
-
cc_vendor = CC_VENDOR_INTEL;
/* Configure the TD */
@@ -1195,3 +1193,19 @@ void __init tdx_early_init(void)
tdx_announce();
}
+
+unsigned int __init tdx_tsc_init(void)
+{
+ struct cpuid_tsc_info info;
+
+ if (WARN_ON_ONCE(cpuid_get_tsc_info(&info) || !info.crystal_khz))
+ return 0;
+
+ apic_set_timer_period_khz(info.crystal_khz, "TDX-Module via CPUID");
+
+ /* TSC is the only reliable clock in TDX guest */
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+ return info.crystal_khz * info.numerator / info.denominator;
+}
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 89e97d5761d8..d23ff06db41a 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -68,6 +68,7 @@ struct ve_info {
#ifdef CONFIG_INTEL_TDX_GUEST
void __init tdx_early_init(void);
+unsigned int __init tdx_tsc_init(void);
void tdx_get_ve_info(struct ve_info *ve);
@@ -89,6 +90,7 @@ void __init tdx_dump_td_ctls(u64 td_ctls);
#else
static inline void tdx_early_init(void) { };
+static inline unsigned int tdx_tsc_init(void) { return 0; }
static inline void tdx_halt(void) { };
static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; }
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 4d2d2f21ff06..b6b86e24e1bf 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -82,6 +82,13 @@ static inline cycles_t get_cycles(void)
}
#define get_cycles get_cycles
+struct cpuid_tsc_info {
+ unsigned int denominator;
+ unsigned int numerator;
+ unsigned int crystal_khz;
+};
+extern int cpuid_get_tsc_info(struct cpuid_tsc_info *info);
+
extern void tsc_early_init(void);
extern void tsc_init(void);
extern void mark_tsc_unstable(char *reason);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 12043812c8f5..86384a83a5f6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -34,6 +34,7 @@
#include <asm/topology.h>
#include <asm/uv/uv.h>
#include <asm/sev.h>
+#include <asm/tdx.h>
unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -645,13 +646,7 @@ static unsigned long quick_pit_calibrate(void)
return delta;
}
-struct cpuid_tsc_info {
- unsigned int denominator;
- unsigned int numerator;
- unsigned int crystal_khz;
-};
-
-static int cpuid_get_tsc_info(struct cpuid_tsc_info *info)
+int cpuid_get_tsc_info(struct cpuid_tsc_info *info)
{
unsigned int ecx_hz, edx;
@@ -1529,6 +1524,8 @@ void __init tsc_early_init(void)
if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
known_tsc_khz = snp_secure_tsc_init();
+ else if (boot_cpu_has(X86_FEATURE_TDX_GUEST))
+ known_tsc_khz = tdx_tsc_init();
/*
* Ignore the user-provided TSC frequency if the exact frequency was
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread* [PATCH v5 11/51] x86/tsc: Add dedicated hypervisor hooks for getting known TSC/CPU frequencies
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (9 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 10/51] x86/tdx: Force TSC frequency with CPUID-based info provided by the TDX-Module Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-02 17:47 ` Michael Kelley
2026-07-01 19:31 ` [PATCH v5 12/51] x86/acrn: Register TSC/CPU frequency callbacks iff frequency is actually in CPUID Sean Christopherson
` (39 subsequent siblings)
50 siblings, 1 reply; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Add dedicated hypervisor hooks for getting known TSC/CPU frequencies
instead of overriding seemingly generic platform hooks, and explicitly
prioritize hypervisor-provided frequencies over native methods, but do NOT
clobber the frequency obtained from trusted firmware. While shuffling the
hooks around is arguably "six of one, half dozen of the other", scoping
them to x86_hyper_init makes their purpose more obvious, and allows for
explicitly defining the priority of sources (as is done here).
As is already done when trusted firmware provides the TSC frequency, ignore
tsc_early_khz if the exact TSC frequency was obtained from the
hypervisor, as attempting to refine the TSC frequency when running in a VM
is all but guaranteed to cause problems sooner or later due to the
calibration sources being emulated devices in the vast majority of setups.
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
.../admin-guide/kernel-parameters.txt | 3 +-
arch/x86/include/asm/acrn.h | 5 ----
arch/x86/include/asm/x86_init.h | 4 +++
arch/x86/kernel/cpu/acrn.c | 10 +++++--
arch/x86/kernel/cpu/mshyperv.c | 6 ++--
arch/x86/kernel/cpu/vmware.c | 8 ++---
arch/x86/kernel/jailhouse.c | 6 ++--
arch/x86/kernel/kvmclock.c | 6 ++--
arch/x86/kernel/tsc.c | 29 ++++++++++++++-----
arch/x86/xen/time.c | 4 +--
10 files changed, 50 insertions(+), 31 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 490e6aa72fc2..a387bb2c47e2 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -7948,7 +7948,8 @@ Kernel parameters
Note, tsc_early_khz is ignored if the TSC frequency is
provided by trusted firmware when running as an SNP or
- TDX guest.
+ TDX guest, or when the hypervisor provides the exact
+ frequency via a paravirtual interface.
tsx= [X86] Control Transactional Synchronization
Extensions (TSX) feature in Intel processors that
diff --git a/arch/x86/include/asm/acrn.h b/arch/x86/include/asm/acrn.h
index db42b477c41d..a892179c61c6 100644
--- a/arch/x86/include/asm/acrn.h
+++ b/arch/x86/include/asm/acrn.h
@@ -32,11 +32,6 @@ static inline u32 acrn_cpuid_base(void)
return 0;
}
-static inline unsigned long acrn_get_tsc_khz(void)
-{
- return cpuid_eax(ACRN_CPUID_TIMING_INFO);
-}
-
/*
* Hypercalls for ACRN
*
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 953d3199408a..0c89bf40f507 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -123,6 +123,8 @@ struct x86_init_pci {
* @msi_ext_dest_id: MSI supports 15-bit APIC IDs
* @init_mem_mapping: setup early mappings during init_mem_mapping()
* @init_after_bootmem: guest init after boot allocator is finished
+ * @get_tsc_khz: get the TSC frequency (returns 0 if frequency is unknown)
+ * @get_cpu_khz: get the CPU frequency (returns 0 if frequency is unknown)
*/
struct x86_hyper_init {
void (*init_platform)(void);
@@ -131,6 +133,8 @@ struct x86_hyper_init {
bool (*msi_ext_dest_id)(void);
void (*init_mem_mapping)(void);
void (*init_after_bootmem)(void);
+ unsigned int (*get_tsc_khz)(void);
+ unsigned int (*get_cpu_khz)(void);
};
/**
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
index dc119af83524..ad8f2da8003b 100644
--- a/arch/x86/kernel/cpu/acrn.c
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -24,13 +24,15 @@ static u32 __init acrn_detect(void)
return acrn_cpuid_base();
}
+static unsigned int __init acrn_get_tsc_khz(void)
+{
+ return cpuid_eax(ACRN_CPUID_TIMING_INFO);
+}
+
static void __init acrn_init_platform(void)
{
/* Install system interrupt handler for ACRN hypervisor callback */
sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback);
-
- x86_platform.calibrate_tsc = acrn_get_tsc_khz;
- x86_platform.calibrate_cpu = acrn_get_tsc_khz;
}
static bool acrn_x2apic_available(void)
@@ -78,4 +80,6 @@ const __initconst struct hypervisor_x86 x86_hyper_acrn = {
.type = X86_HYPER_ACRN,
.init.init_platform = acrn_init_platform,
.init.x2apic_available = acrn_x2apic_available,
+ .init.get_tsc_khz = acrn_get_tsc_khz,
+ .init.get_cpu_khz = acrn_get_tsc_khz,
};
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 87beecec76f0..f9bc1c2d8c93 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -395,7 +395,7 @@ static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
}
#endif
-static unsigned long hv_get_tsc_khz(void)
+static unsigned int __init hv_get_tsc_khz(void)
{
unsigned long freq;
@@ -573,8 +573,8 @@ static void __init ms_hyperv_init_platform(void)
if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
- x86_platform.calibrate_tsc = hv_get_tsc_khz;
- x86_platform.calibrate_cpu = hv_get_tsc_khz;
+ x86_init.hyper.get_tsc_khz = hv_get_tsc_khz;
+ x86_init.hyper.get_cpu_khz = hv_get_tsc_khz;
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
}
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 13b97265c535..3cb473cae462 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -64,7 +64,7 @@ struct vmware_steal_time {
u64 reserved[7];
};
-static unsigned long vmware_tsc_khz __ro_after_init;
+static unsigned long vmware_tsc_khz __initdata;
static u8 vmware_hypercall_mode __ro_after_init;
unsigned long vmware_hypercall_slow(unsigned long cmd,
@@ -137,7 +137,7 @@ static inline int __vmware_platform(void)
return eax != UINT_MAX && ebx == VMWARE_HYPERVISOR_MAGIC;
}
-static unsigned long vmware_get_tsc_khz(void)
+static unsigned int __init vmware_get_tsc_khz(void)
{
return vmware_tsc_khz;
}
@@ -419,8 +419,8 @@ static void __init vmware_platform_setup(void)
}
vmware_tsc_khz = tsc_khz;
- x86_platform.calibrate_tsc = vmware_get_tsc_khz;
- x86_platform.calibrate_cpu = vmware_get_tsc_khz;
+ x86_init.hyper.get_tsc_khz = vmware_get_tsc_khz;
+ x86_init.hyper.get_cpu_khz = vmware_get_tsc_khz;
/* Skip lapic calibration since we know the bus frequency. */
apic_set_timer_period_hz(ecx, "VMware hypervisor");
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index f2d4ef89c085..e24c05ab4fae 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -68,7 +68,7 @@ static void __init jailhouse_timer_init(void)
apic_set_timer_period_khz(setup_data.v1.apic_khz, "Jailhouse hypervisor");
}
-static unsigned long jailhouse_get_tsc(void)
+static unsigned int __init jailhouse_get_tsc(void)
{
return precalibrated_tsc_khz;
}
@@ -210,8 +210,6 @@ static void __init jailhouse_init_platform(void)
x86_init.mpparse.parse_smp_cfg = jailhouse_parse_smp_config;
x86_init.pci.arch_init = jailhouse_pci_arch_init;
- x86_platform.calibrate_cpu = jailhouse_get_tsc;
- x86_platform.calibrate_tsc = jailhouse_get_tsc;
x86_platform.get_wallclock = jailhouse_get_wallclock;
x86_platform.legacy.rtc = 0;
x86_platform.legacy.warm_reset = 0;
@@ -293,5 +291,7 @@ const struct hypervisor_x86 x86_hyper_jailhouse __refconst = {
.detect = jailhouse_detect,
.init.init_platform = jailhouse_init_platform,
.init.x2apic_available = jailhouse_x2apic_available,
+ .init.get_tsc_khz = jailhouse_get_tsc,
+ .init.get_cpu_khz = jailhouse_get_tsc,
.ignore_nopv = true,
};
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index cb3d0ca1fa22..4f8299303a19 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -136,7 +136,7 @@ static inline void kvm_sched_clock_init(bool stable)
* poll of guests can be running and trouble each other. So we preset
* lpj here
*/
-static unsigned long kvm_get_tsc_khz(void)
+static unsigned int __init kvm_get_tsc_khz(void)
{
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
return pvclock_tsc_khz(this_cpu_pvti());
@@ -343,8 +343,8 @@ void __init kvmclock_init(void)
flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
- x86_platform.calibrate_tsc = kvm_get_tsc_khz;
- x86_platform.calibrate_cpu = kvm_get_tsc_khz;
+ x86_init.hyper.get_tsc_khz = kvm_get_tsc_khz;
+ x86_init.hyper.get_cpu_khz = kvm_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 86384a83a5f6..1dca9464b41c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1451,13 +1451,17 @@ static int __init init_tsc_clocksource(void)
device_initcall(init_tsc_clocksource);
static bool __init determine_cpu_tsc_frequencies(bool early,
+ unsigned int known_cpu_khz,
unsigned int known_tsc_khz)
{
/* Make sure that cpu and tsc are not already calibrated */
WARN_ON(cpu_khz || tsc_khz);
if (early) {
- cpu_khz = x86_platform.calibrate_cpu();
+ if (known_cpu_khz)
+ cpu_khz = known_cpu_khz;
+ else
+ cpu_khz = x86_platform.calibrate_cpu();
if (known_tsc_khz)
tsc_khz = known_tsc_khz;
else
@@ -1514,7 +1518,7 @@ static void __init tsc_enable_sched_clock(void)
void __init tsc_early_init(void)
{
- unsigned int known_tsc_khz = 0;
+ unsigned int known_cpu_khz = 0, known_tsc_khz = 0;
if (!boot_cpu_has(X86_FEATURE_TSC))
return;
@@ -1522,22 +1526,33 @@ void __init tsc_early_init(void)
if (is_early_uv_system())
return;
+ if (x86_init.hyper.get_cpu_khz)
+ known_cpu_khz = x86_init.hyper.get_cpu_khz();
+
if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
known_tsc_khz = snp_secure_tsc_init();
else if (boot_cpu_has(X86_FEATURE_TDX_GUEST))
known_tsc_khz = tdx_tsc_init();
+ /*
+ * If the TSC frequency wasn't provided by trusted firmware, try to get
+ * it from the hypervisor (which is untrusted when running as a CoCo guest).
+ */
+ if (!known_tsc_khz && x86_init.hyper.get_tsc_khz)
+ known_tsc_khz = x86_init.hyper.get_tsc_khz();
+
/*
* Ignore the user-provided TSC frequency if the exact frequency was
- * obtained from trusted firmware, as the user-provided frequency is
- * intended as a "starting point", not a known, guaranteed frequency.
+ * obtained from trusted firmware or the hypervisor, as the user-
+ * provided frequency is intended as a "starting point", not a known,
+ * guaranteed frequency.
*/
if (!known_tsc_khz)
known_tsc_khz = tsc_early_khz;
else if (tsc_early_khz)
- pr_err("Ignoring 'tsc_early_khz' in favor of trusted firmware.\n");
+ pr_err("Ignoring 'tsc_early_khz' in favor of firmware/hypervisor.\n");
- if (!determine_cpu_tsc_frequencies(true, known_tsc_khz))
+ if (!determine_cpu_tsc_frequencies(true, known_cpu_khz, known_tsc_khz))
return;
tsc_enable_sched_clock();
}
@@ -1558,7 +1573,7 @@ void __init tsc_init(void)
if (!tsc_khz) {
/* We failed to determine frequencies earlier, try again */
- if (!determine_cpu_tsc_frequencies(false, 0)) {
+ if (!determine_cpu_tsc_frequencies(false, 0, 0)) {
mark_tsc_unstable("could not calculate TSC khz");
setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
return;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index d62c14334b35..1adb44fdddb2 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -38,7 +38,7 @@
static u64 xen_sched_clock_offset __read_mostly;
/* Get the TSC speed from Xen */
-static unsigned long xen_tsc_khz(void)
+static unsigned int __init xen_tsc_khz(void)
{
struct pvclock_vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
@@ -569,7 +569,7 @@ static void __init xen_init_time_common(void)
static_call_update(pv_steal_clock, xen_steal_clock);
paravirt_set_sched_clock(xen_sched_clock);
- x86_platform.calibrate_tsc = xen_tsc_khz;
+ x86_init.hyper.get_tsc_khz = xen_tsc_khz;
x86_platform.get_wallclock = xen_get_wallclock;
}
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread* RE: [PATCH v5 11/51] x86/tsc: Add dedicated hypervisor hooks for getting known TSC/CPU frequencies
2026-07-01 19:31 ` [PATCH v5 11/51] x86/tsc: Add dedicated hypervisor hooks for getting known TSC/CPU frequencies Sean Christopherson
@ 2026-07-02 17:47 ` Michael Kelley
0 siblings, 0 replies; 72+ messages in thread
From: Michael Kelley @ 2026-07-02 17:47 UTC (permalink / raw)
To: Sean Christopherson, Jonathan Corbet, Paolo Bonzini,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
x86@kernel.org, Kiryl Shutsemau, Rick Edgecombe, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc@vger.kernel.org, kvm@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-coco@lists.linux.dev,
linux-hyperv@vger.kernel.org, virtualization@lists.linux.dev,
xen-devel@lists.xenproject.org, Tom Lendacky, Nikunj A Dadhania,
David Woodhouse, David Woodhouse, Michael Kelley, Thomas Gleixner
From: Sean Christopherson <seanjc@google.com> Sent: Wednesday, July 1, 2026 12:32 PM
>
> Add dedicated hypervisor hooks for getting known TSC/CPU frequencies
> instead of overriding seemingly generic platform hooks, and explicitly
> priotize hypervisor-provided frequencies over native methods, but do NOT
s/priotize/prioritize/
> clobber the frequency obtained from trusted firmware. While shuffling the
> hooks around is arguably "six of one, half dozen of the other", scoping
> them to x86_hyper_init makes their purpose more obvious, and allows for
> explicitly defining the priority of sources (as is done here).
>
> As is already done when trusted firmware provides the TSC frequency, ignore
Word "ignore" is duplicated.
> ignore tsc_early_khz if the exact TSC frequency was obtained from the
> hypervisor, as attempting to refine the TSC frequency when running in a VM
> is all but guaranteed to cause problems sooner or later due to the
> calibration sources being emulated devices in the vast majority of setups.
>
> Cc: David Woodhouse <dwmw2@infradead.org>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
For the Hyper-V changes,
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
> ---
> .../admin-guide/kernel-parameters.txt | 3 +-
> arch/x86/include/asm/acrn.h | 5 ----
> arch/x86/include/asm/x86_init.h | 4 +++
> arch/x86/kernel/cpu/acrn.c | 10 +++++--
> arch/x86/kernel/cpu/mshyperv.c | 6 ++--
> arch/x86/kernel/cpu/vmware.c | 8 ++---
> arch/x86/kernel/jailhouse.c | 6 ++--
> arch/x86/kernel/kvmclock.c | 6 ++--
> arch/x86/kernel/tsc.c | 29 ++++++++++++++-----
> arch/x86/xen/time.c | 4 +--
> 10 files changed, 50 insertions(+), 31 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-
> guide/kernel-parameters.txt
> index 490e6aa72fc2..a387bb2c47e2 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -7948,7 +7948,8 @@ Kernel parameters
>
> Note, tsc_early_khz is ignored if the TSC frequency is
> provided by trusted firmware when running as an SNP or
> - TDX guest.
> + TDX guest, or when the hypervisor provides the exact
> + frequency via a paravirtual interface.
>
> tsx= [X86] Control Transactional Synchronization
> Extensions (TSX) feature in Intel processors that
> diff --git a/arch/x86/include/asm/acrn.h b/arch/x86/include/asm/acrn.h
> index db42b477c41d..a892179c61c6 100644
> --- a/arch/x86/include/asm/acrn.h
> +++ b/arch/x86/include/asm/acrn.h
> @@ -32,11 +32,6 @@ static inline u32 acrn_cpuid_base(void)
> return 0;
> }
>
> -static inline unsigned long acrn_get_tsc_khz(void)
> -{
> - return cpuid_eax(ACRN_CPUID_TIMING_INFO);
> -}
> -
> /*
> * Hypercalls for ACRN
> *
> diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
> index 953d3199408a..0c89bf40f507 100644
> --- a/arch/x86/include/asm/x86_init.h
> +++ b/arch/x86/include/asm/x86_init.h
> @@ -123,6 +123,8 @@ struct x86_init_pci {
> * @msi_ext_dest_id: MSI supports 15-bit APIC IDs
> * @init_mem_mapping: setup early mappings during init_mem_mapping()
> * @init_after_bootmem: guest init after boot allocator is finished
> + * @get_tsc_khz: get the TSC frequency (returns 0 if frequency is unknown)
> + * @get_cpu_khz: get the CPU frequency (returns 0 if frequency is unknown)
> */
> struct x86_hyper_init {
> void (*init_platform)(void);
> @@ -131,6 +133,8 @@ struct x86_hyper_init {
> bool (*msi_ext_dest_id)(void);
> void (*init_mem_mapping)(void);
> void (*init_after_bootmem)(void);
> + unsigned int (*get_tsc_khz)(void);
> + unsigned int (*get_cpu_khz)(void);
> };
>
> /**
> diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
> index dc119af83524..ad8f2da8003b 100644
> --- a/arch/x86/kernel/cpu/acrn.c
> +++ b/arch/x86/kernel/cpu/acrn.c
> @@ -24,13 +24,15 @@ static u32 __init acrn_detect(void)
> return acrn_cpuid_base();
> }
>
> +static unsigned int __init acrn_get_tsc_khz(void)
> +{
> + return cpuid_eax(ACRN_CPUID_TIMING_INFO);
> +}
> +
> static void __init acrn_init_platform(void)
> {
> /* Install system interrupt handler for ACRN hypervisor callback */
> sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback);
> -
> - x86_platform.calibrate_tsc = acrn_get_tsc_khz;
> - x86_platform.calibrate_cpu = acrn_get_tsc_khz;
> }
>
> static bool acrn_x2apic_available(void)
> @@ -78,4 +80,6 @@ const __initconst struct hypervisor_x86 x86_hyper_acrn = {
> .type = X86_HYPER_ACRN,
> .init.init_platform = acrn_init_platform,
> .init.x2apic_available = acrn_x2apic_available,
> + .init.get_tsc_khz = acrn_get_tsc_khz,
> + .init.get_cpu_khz = acrn_get_tsc_khz,
> };
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index 87beecec76f0..f9bc1c2d8c93 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -395,7 +395,7 @@ static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
> }
> #endif
>
> -static unsigned long hv_get_tsc_khz(void)
> +static unsigned int __init hv_get_tsc_khz(void)
> {
> unsigned long freq;
>
> @@ -573,8 +573,8 @@ static void __init ms_hyperv_init_platform(void)
>
> if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
> ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
> - x86_platform.calibrate_tsc = hv_get_tsc_khz;
> - x86_platform.calibrate_cpu = hv_get_tsc_khz;
> + x86_init.hyper.get_tsc_khz = hv_get_tsc_khz;
> + x86_init.hyper.get_cpu_khz = hv_get_tsc_khz;
> setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
> }
>
> diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
> index 13b97265c535..3cb473cae462 100644
> --- a/arch/x86/kernel/cpu/vmware.c
> +++ b/arch/x86/kernel/cpu/vmware.c
> @@ -64,7 +64,7 @@ struct vmware_steal_time {
> u64 reserved[7];
> };
>
> -static unsigned long vmware_tsc_khz __ro_after_init;
> +static unsigned long vmware_tsc_khz __initdata;
> static u8 vmware_hypercall_mode __ro_after_init;
>
> unsigned long vmware_hypercall_slow(unsigned long cmd,
> @@ -137,7 +137,7 @@ static inline int __vmware_platform(void)
> return eax != UINT_MAX && ebx == VMWARE_HYPERVISOR_MAGIC;
> }
>
> -static unsigned long vmware_get_tsc_khz(void)
> +static unsigned int __init vmware_get_tsc_khz(void)
> {
> return vmware_tsc_khz;
> }
> @@ -419,8 +419,8 @@ static void __init vmware_platform_setup(void)
> }
>
> vmware_tsc_khz = tsc_khz;
> - x86_platform.calibrate_tsc = vmware_get_tsc_khz;
> - x86_platform.calibrate_cpu = vmware_get_tsc_khz;
> + x86_init.hyper.get_tsc_khz = vmware_get_tsc_khz;
> + x86_init.hyper.get_cpu_khz = vmware_get_tsc_khz;
>
> /* Skip lapic calibration since we know the bus frequency. */
> apic_set_timer_period_hz(ecx, "VMware hypervisor");
> diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
> index f2d4ef89c085..e24c05ab4fae 100644
> --- a/arch/x86/kernel/jailhouse.c
> +++ b/arch/x86/kernel/jailhouse.c
> @@ -68,7 +68,7 @@ static void __init jailhouse_timer_init(void)
> apic_set_timer_period_khz(setup_data.v1.apic_khz, "Jailhouse hypervisor");
> }
>
> -static unsigned long jailhouse_get_tsc(void)
> +static unsigned int __init jailhouse_get_tsc(void)
> {
> return precalibrated_tsc_khz;
> }
> @@ -210,8 +210,6 @@ static void __init jailhouse_init_platform(void)
> x86_init.mpparse.parse_smp_cfg = jailhouse_parse_smp_config;
> x86_init.pci.arch_init = jailhouse_pci_arch_init;
>
> - x86_platform.calibrate_cpu = jailhouse_get_tsc;
> - x86_platform.calibrate_tsc = jailhouse_get_tsc;
> x86_platform.get_wallclock = jailhouse_get_wallclock;
> x86_platform.legacy.rtc = 0;
> x86_platform.legacy.warm_reset = 0;
> @@ -293,5 +291,7 @@ const struct hypervisor_x86 x86_hyper_jailhouse __refconst = {
> .detect = jailhouse_detect,
> .init.init_platform = jailhouse_init_platform,
> .init.x2apic_available = jailhouse_x2apic_available,
> + .init.get_tsc_khz = jailhouse_get_tsc,
> + .init.get_cpu_khz = jailhouse_get_tsc,
> .ignore_nopv = true,
> };
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index cb3d0ca1fa22..4f8299303a19 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -136,7 +136,7 @@ static inline void kvm_sched_clock_init(bool stable)
> * poll of guests can be running and trouble each other. So we preset
> * lpj here
> */
> -static unsigned long kvm_get_tsc_khz(void)
> +static unsigned int __init kvm_get_tsc_khz(void)
> {
> setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
> return pvclock_tsc_khz(this_cpu_pvti());
> @@ -343,8 +343,8 @@ void __init kvmclock_init(void)
> flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
> kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
>
> - x86_platform.calibrate_tsc = kvm_get_tsc_khz;
> - x86_platform.calibrate_cpu = kvm_get_tsc_khz;
> + x86_init.hyper.get_tsc_khz = kvm_get_tsc_khz;
> + x86_init.hyper.get_cpu_khz = kvm_get_tsc_khz;
> x86_platform.get_wallclock = kvm_get_wallclock;
> x86_platform.set_wallclock = kvm_set_wallclock;
> #ifdef CONFIG_X86_LOCAL_APIC
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 86384a83a5f6..1dca9464b41c 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -1451,13 +1451,17 @@ static int __init init_tsc_clocksource(void)
> device_initcall(init_tsc_clocksource);
>
> static bool __init determine_cpu_tsc_frequencies(bool early,
> + unsigned int known_cpu_khz,
> unsigned int known_tsc_khz)
> {
> /* Make sure that cpu and tsc are not already calibrated */
> WARN_ON(cpu_khz || tsc_khz);
>
> if (early) {
> - cpu_khz = x86_platform.calibrate_cpu();
> + if (known_cpu_khz)
> + cpu_khz = known_cpu_khz;
> + else
> + cpu_khz = x86_platform.calibrate_cpu();
> if (known_tsc_khz)
> tsc_khz = known_tsc_khz;
> else
> @@ -1514,7 +1518,7 @@ static void __init tsc_enable_sched_clock(void)
>
> void __init tsc_early_init(void)
> {
> - unsigned int known_tsc_khz = 0;
> + unsigned int known_cpu_khz = 0, known_tsc_khz = 0;
>
> if (!boot_cpu_has(X86_FEATURE_TSC))
> return;
> @@ -1522,22 +1526,33 @@ void __init tsc_early_init(void)
> if (is_early_uv_system())
> return;
>
> + if (x86_init.hyper.get_cpu_khz)
> + known_cpu_khz = x86_init.hyper.get_cpu_khz();
> +
> if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
> known_tsc_khz = snp_secure_tsc_init();
> else if (boot_cpu_has(X86_FEATURE_TDX_GUEST))
> known_tsc_khz = tdx_tsc_init();
>
> + /*
> + * If the TSC frequency wasn't provided by trusted firmware, try to get
> + * it from the hypervisor (which is untrusted when running as a CoCo guest).
> + */
> + if (!known_tsc_khz && x86_init.hyper.get_tsc_khz)
> + known_tsc_khz = x86_init.hyper.get_tsc_khz();
> +
> /*
> * Ignore the user-provided TSC frequency if the exact frequency was
> - * obtained from trusted firmware, as the user-provided frequency is
> - * intended as a "starting point", not a known, guaranteed frequency.
> + * obtained from trusted firmware or the hypervisor, as the user-
> + * provided frequency is intended as a "starting point", not a known,
> + * guaranteed frequency.
> */
> if (!known_tsc_khz)
> known_tsc_khz = tsc_early_khz;
> else if (tsc_early_khz)
> - pr_err("Ignoring 'tsc_early_khz' in favor of trusted firmware.\n");
> + pr_err("Ignoring 'tsc_early_khz' in favor of firmware/hypervisor.\n");
>
> - if (!determine_cpu_tsc_frequencies(true, known_tsc_khz))
> + if (!determine_cpu_tsc_frequencies(true, known_cpu_khz, known_tsc_khz))
> return;
> tsc_enable_sched_clock();
> }
> @@ -1558,7 +1573,7 @@ void __init tsc_init(void)
>
> if (!tsc_khz) {
> /* We failed to determine frequencies earlier, try again */
> - if (!determine_cpu_tsc_frequencies(false, 0)) {
> + if (!determine_cpu_tsc_frequencies(false, 0, 0)) {
> mark_tsc_unstable("could not calculate TSC khz");
> setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
> return;
> diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
> index d62c14334b35..1adb44fdddb2 100644
> --- a/arch/x86/xen/time.c
> +++ b/arch/x86/xen/time.c
> @@ -38,7 +38,7 @@
> static u64 xen_sched_clock_offset __read_mostly;
>
> /* Get the TSC speed from Xen */
> -static unsigned long xen_tsc_khz(void)
> +static unsigned int __init xen_tsc_khz(void)
> {
> struct pvclock_vcpu_time_info *info =
> &HYPERVISOR_shared_info->vcpu_info[0].time;
> @@ -569,7 +569,7 @@ static void __init xen_init_time_common(void)
> static_call_update(pv_steal_clock, xen_steal_clock);
> paravirt_set_sched_clock(xen_sched_clock);
>
> - x86_platform.calibrate_tsc = xen_tsc_khz;
> + x86_init.hyper.get_tsc_khz = xen_tsc_khz;
> x86_platform.get_wallclock = xen_get_wallclock;
> }
>
> --
> 2.55.0.rc0.799.gd6f94ed593-goog
>
^ permalink raw reply [flat|nested] 72+ messages in thread
* [PATCH v5 12/51] x86/acrn: Register TSC/CPU frequency callbacks iff frequency is actually in CPUID
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (10 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 11/51] x86/tsc: Add dedicated hypervisor hooks for getting known TSC/CPU frequencies Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:52 ` sashiko-bot
2026-07-01 19:31 ` [PATCH v5 13/51] x86/acrn: Mark TSC frequency as known when using ACRN for calibration Sean Christopherson
` (38 subsequent siblings)
50 siblings, 1 reply; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Register ACRN's TSC/CPU frequency overrides if and only if the exact TSC
frequency is actually provided in CPUID. This will allow marking the TSC
as reliable as appropriate, and avoids relying on the caller to handle
"failure".
For all intents and purposes, no functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/cpu/acrn.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
index ad8f2da8003b..dc71a6fdd461 100644
--- a/arch/x86/kernel/cpu/acrn.c
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -19,6 +19,8 @@
#include <asm/idtentry.h>
#include <asm/irq_regs.h>
+static unsigned int acrn_tsc_khz_cpuid __initdata;
+
static u32 __init acrn_detect(void)
{
return acrn_cpuid_base();
@@ -26,13 +28,19 @@ static u32 __init acrn_detect(void)
static unsigned int __init acrn_get_tsc_khz(void)
{
- return cpuid_eax(ACRN_CPUID_TIMING_INFO);
+ return acrn_tsc_khz_cpuid;
}
static void __init acrn_init_platform(void)
{
/* Install system interrupt handler for ACRN hypervisor callback */
sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback);
+
+ acrn_tsc_khz_cpuid = cpuid_eax(ACRN_CPUID_TIMING_INFO);
+ if (acrn_tsc_khz_cpuid) {
+ x86_init.hyper.get_tsc_khz = acrn_get_tsc_khz;
+ x86_init.hyper.get_cpu_khz = acrn_get_tsc_khz;
+ }
}
static bool acrn_x2apic_available(void)
@@ -80,6 +88,4 @@ const __initconst struct hypervisor_x86 x86_hyper_acrn = {
.type = X86_HYPER_ACRN,
.init.init_platform = acrn_init_platform,
.init.x2apic_available = acrn_x2apic_available,
- .init.get_tsc_khz = acrn_get_tsc_khz,
- .init.get_cpu_khz = acrn_get_tsc_khz,
};
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* Re: [PATCH v5 12/51] x86/acrn: Register TSC/CPU frequency callbacks iff frequency is actually in CPUID
2026-07-01 19:31 ` [PATCH v5 12/51] x86/acrn: Register TSC/CPU frequency callbacks iff frequency is actually in CPUID Sean Christopherson
@ 2026-07-01 19:52 ` sashiko-bot
2026-07-01 20:10 ` Sean Christopherson
0 siblings, 1 reply; 72+ messages in thread
From: sashiko-bot @ 2026-07-01 19:52 UTC (permalink / raw)
To: Sean Christopherson; +Cc: linux-hyperv, kvm
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
Pre-existing issues:
- [High] ACRN hypervisor integration statically hardcodes its CPUID leaf to 0x40000010 without applying the dynamic base offset, breaking nested setups and potentially causing severe timekeeping corruption.
--
commit 8beac699ed1da5364ebdf7f1cc4475bed5521d60
Author: Sean Christopherson <seanjc@google.com>
x86/acrn: Register TSC/CPU frequency callbacks iff frequency is actually in CPUID
This commit modifies ACRN initialization to only register the TSC and CPU
frequency override callbacks if the hypervisor actually provides a non-zero
frequency in its CPUID leaf. It achieves this by reading the frequency
during platform initialization and caching the result.
> diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
> index ad8f2da8003b3..dc71a6fdd4613 100644
> --- a/arch/x86/kernel/cpu/acrn.c
> +++ b/arch/x86/kernel/cpu/acrn.c
[ ... ]
> @@ -26,13 +28,19 @@ static u32 __init acrn_detect(void)
>
> static unsigned int __init acrn_get_tsc_khz(void)
> {
> - return cpuid_eax(ACRN_CPUID_TIMING_INFO);
> + return acrn_tsc_khz_cpuid;
> }
>
> static void __init acrn_init_platform(void)
> {
> /* Install system interrupt handler for ACRN hypervisor callback */
> sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback);
> +
> + acrn_tsc_khz_cpuid = cpuid_eax(ACRN_CPUID_TIMING_INFO);
[Severity: High]
This is a pre-existing issue, but does hardcoding ACRN_CPUID_TIMING_INFO to
0x40000010 without dynamically applying the base offset break nested setups?
If Linux is booted as an L2 guest with an L0 hypervisor like KVM occupying
the base 0x40000000 leaf, and L1 ACRN resides at 0x40000100, acrn_detect()
correctly finds ACRN using cpuid_base_hypervisor() at the 0x40000100 base.
However, acrn_init_platform() hardcodes the fetch:
arch/x86/include/asm/acrn.h:
#define ACRN_CPUID_TIMING_INFO 0x40000010
Will this bypass the dynamic base detected earlier and read the timing info
from the wrong hypervisor's CPUID space (e.g., KVM's L0 frequency)?
If nested under a hypervisor that returns garbage for 0x40000010, could
acrn_tsc_khz_cpuid be populated with a non-zero bogus value and corrupt
guest timekeeping?
> + if (acrn_tsc_khz_cpuid) {
> + x86_init.hyper.get_tsc_khz = acrn_get_tsc_khz;
> + x86_init.hyper.get_cpu_khz = acrn_get_tsc_khz;
> + }
> }
>
> static bool acrn_x2apic_available(void)
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260701193212.749551-1-seanjc@google.com?part=12
^ permalink raw reply [flat|nested] 72+ messages in thread
* Re: [PATCH v5 12/51] x86/acrn: Register TSC/CPU frequency callbacks iff frequency is actually in CPUID
2026-07-01 19:52 ` sashiko-bot
@ 2026-07-01 20:10 ` Sean Christopherson
0 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 20:10 UTC (permalink / raw)
To: sashiko-reviews; +Cc: linux-hyperv, kvm
On Wed, Jul 01, 2026, sashiko-bot@kernel.org wrote:
> > diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
> > index ad8f2da8003b3..dc71a6fdd4613 100644
> > --- a/arch/x86/kernel/cpu/acrn.c
> > +++ b/arch/x86/kernel/cpu/acrn.c
> [ ... ]
> > @@ -26,13 +28,19 @@ static u32 __init acrn_detect(void)
> >
> > static unsigned int __init acrn_get_tsc_khz(void)
> > {
> > - return cpuid_eax(ACRN_CPUID_TIMING_INFO);
> > + return acrn_tsc_khz_cpuid;
> > }
> >
> > static void __init acrn_init_platform(void)
> > {
> > /* Install system interrupt handler for ACRN hypervisor callback */
> > sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback);
> > +
> > + acrn_tsc_khz_cpuid = cpuid_eax(ACRN_CPUID_TIMING_INFO);
>
> [Severity: High]
> This is a pre-existing issue, but does hardcoding ACRN_CPUID_TIMING_INFO to
> 0x40000010 without dynamically applying the base offset break nested setups?
>
> If Linux is booted as an L2 guest with an L0 hypervisor like KVM occupying
> the base 0x40000000 leaf, and L1 ACRN resides at 0x40000100, acrn_detect()
> correctly finds ACRN using cpuid_base_hypervisor() at the 0x40000100 base.
>
> However, acrn_init_platform() hardcodes the fetch:
>
> arch/x86/include/asm/acrn.h:
> #define ACRN_CPUID_TIMING_INFO 0x40000010
>
> Will this bypass the dynamic base detected earlier and read the timing info
> from the wrong hypervisor's CPUID space (e.g., KVM's L0 frequency)?
>
> If nested under a hypervisor that returns garbage for 0x40000010, could
> acrn_tsc_khz_cpuid be populated with a non-zero bogus value and corrupt
> guest timekeeping?
Yes, but it's not at all clear that ACRN supports advertising itself anywhere
but at 0x40000000. Regardless, this can and should be addressed separately.
^ permalink raw reply [flat|nested] 72+ messages in thread
* [PATCH v5 13/51] x86/acrn: Mark TSC frequency as known when using ACRN for calibration
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (11 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 12/51] x86/acrn: Register TSC/CPU frequency callbacks iff frequency is actually in CPUID Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 14/51] x86/tsc: Consolidate forcing of X86_FEATURE_TSC_KNOWN_FREQ for PV code Sean Christopherson
` (37 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Mark the TSC frequency as known when using ACRN's PV CPUID information.
Per commit 81a71f51b89e ("x86/acrn: Set up timekeeping") and common sense,
the TSC freq is explicitly provided by the hypervisor.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/cpu/acrn.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
index dc71a6fdd461..3818f6ae0629 100644
--- a/arch/x86/kernel/cpu/acrn.c
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -40,6 +40,7 @@ static void __init acrn_init_platform(void)
if (acrn_tsc_khz_cpuid) {
x86_init.hyper.get_tsc_khz = acrn_get_tsc_khz;
x86_init.hyper.get_cpu_khz = acrn_get_tsc_khz;
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
}
}
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 14/51] x86/tsc: Consolidate forcing of X86_FEATURE_TSC_KNOWN_FREQ for PV code
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (12 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 13/51] x86/acrn: Mark TSC frequency as known when using ACRN for calibration Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-02 17:47 ` Michael Kelley
2026-07-01 19:31 ` [PATCH v5 15/51] x86/tsc: Kill off x86_platform_ops.calibrate_{cpu,tsc}() hooks Sean Christopherson
` (36 subsequent siblings)
50 siblings, 1 reply; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Now that all paravirt code that explicitly specifies the TSC frequency
also sets X86_FEATURE_TSC_KNOWN_FREQ, replace all of the one-off code
and simply set X86_FEATURE_TSC_KNOWN_FREQ if the TSC frequency is known.
Do NOT force set TSC_KNOWN_FREQ if the "known" TSC frequency was provided
by the user. Per commit bd35c77e32e4 ("x86/tsc: Add tsc_early_khz command
line parameter"), one of the goals of the param is to allow the refined
calibration work "to do meaningful error checking".
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/coco/sev/core.c | 1 -
arch/x86/coco/tdx/tdx.c | 1 -
arch/x86/kernel/cpu/acrn.c | 1 -
arch/x86/kernel/cpu/mshyperv.c | 1 -
arch/x86/kernel/cpu/vmware.c | 2 --
arch/x86/kernel/jailhouse.c | 1 -
arch/x86/kernel/kvmclock.c | 1 -
arch/x86/kernel/tsc.c | 13 ++++++++++---
arch/x86/xen/time.c | 1 -
9 files changed, 10 insertions(+), 12 deletions(-)
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index bc5ae9ef74da..72313b36b6f5 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -2027,7 +2027,6 @@ unsigned int __init snp_secure_tsc_init(void)
secrets = (__force struct snp_secrets_page *)mem;
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
rdmsrq(MSR_AMD64_GUEST_TSC_FREQ, tsc_freq_mhz);
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index ae2d35f2ef33..94682aca188b 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -1205,7 +1205,6 @@ unsigned int __init tdx_tsc_init(void)
/* TSC is the only reliable clock in TDX guest */
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
return info.crystal_khz * info.numerator / info.denominator;
}
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
index 3818f6ae0629..dc71a6fdd461 100644
--- a/arch/x86/kernel/cpu/acrn.c
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -40,7 +40,6 @@ static void __init acrn_init_platform(void)
if (acrn_tsc_khz_cpuid) {
x86_init.hyper.get_tsc_khz = acrn_get_tsc_khz;
x86_init.hyper.get_cpu_khz = acrn_get_tsc_khz;
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
}
}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index f9bc1c2d8c93..e03c69a4db33 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -575,7 +575,6 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
x86_init.hyper.get_tsc_khz = hv_get_tsc_khz;
x86_init.hyper.get_cpu_khz = hv_get_tsc_khz;
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
}
if (ms_hyperv.priv_high & HV_ISOLATION) {
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 3cb473cae462..0a3bd90576d4 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -390,8 +390,6 @@ static void __init vmware_set_capabilities(void)
{
setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
- if (vmware_tsc_khz)
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL)
setup_force_cpu_cap(X86_FEATURE_VMCALL);
else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL)
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index e24c05ab4fae..ff173052cdce 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -255,7 +255,6 @@ static void __init jailhouse_init_platform(void)
pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport);
precalibrated_tsc_khz = setup_data.v1.tsc_khz;
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
pci_probe = 0;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 4f8299303a19..35a879d33e9e 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -138,7 +138,6 @@ static inline void kvm_sched_clock_init(bool stable)
*/
static unsigned int __init kvm_get_tsc_khz(void)
{
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
return pvclock_tsc_khz(this_cpu_pvti());
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 1dca9464b41c..676910292af7 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1541,11 +1541,18 @@ void __init tsc_early_init(void)
if (!known_tsc_khz && x86_init.hyper.get_tsc_khz)
known_tsc_khz = x86_init.hyper.get_tsc_khz();
+ /*
+ * Mark the TSC frequency as known if it was obtained from a hypervisor
+ * or trusted firmware.
+ */
+ if (known_tsc_khz)
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
/*
* Ignore the user-provided TSC frequency if the exact frequency was
- * obtained from trusted firmware or the hypervisor, as the user-
- * provided frequency is intended as a "starting point", not a known,
- * guaranteed frequency.
+ * obtained from trusted firmware or the hypervisor, and don't mark the
+ * frequency as known, as the user-provided frequency is intended as a
+ * "starting point", not a known, guaranteed frequency.
*/
if (!known_tsc_khz)
known_tsc_khz = tsc_early_khz;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 1adb44fdddb2..487ad838c441 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -43,7 +43,6 @@ static unsigned int __init xen_tsc_khz(void)
struct pvclock_vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
return pvclock_tsc_khz(info);
}
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* RE: [PATCH v5 14/51] x86/tsc: Consolidate forcing of X86_FEATURE_TSC_KNOWN_FREQ for PV code
2026-07-01 19:31 ` [PATCH v5 14/51] x86/tsc: Consolidate forcing of X86_FEATURE_TSC_KNOWN_FREQ for PV code Sean Christopherson
@ 2026-07-02 17:47 ` Michael Kelley
0 siblings, 0 replies; 72+ messages in thread
From: Michael Kelley @ 2026-07-02 17:47 UTC (permalink / raw)
To: Sean Christopherson, Jonathan Corbet, Paolo Bonzini,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
x86@kernel.org, Kiryl Shutsemau, Rick Edgecombe, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc@vger.kernel.org, kvm@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-coco@lists.linux.dev,
linux-hyperv@vger.kernel.org, virtualization@lists.linux.dev,
xen-devel@lists.xenproject.org, Tom Lendacky, Nikunj A Dadhania,
David Woodhouse, David Woodhouse, Michael Kelley, Thomas Gleixner
From: Sean Christopherson <seanjc@google.com> Sent: Wednesday, July 1, 2026 12:32 PM
>
> Now that all paravirt code that explicitly specifies the TSC frequency
> also sets X86_FEATURE_TSC_KNOWN_FREQ, replace all of the one-off code
> and simply set X86_FEATURE_TSC_KNOWN_FREQ if the TSC frequency is known.
>
> Do NOT force set TSC_KNOWN_FREQ if the "known" TSC frequency was provided
> by the user. Per commit bd35c77e32e4 ("x86/tsc: Add tsc_early_khz command
> line parameter"), one of the goals of the param is to allow the refined
> calibration work "to do meaningful error checking".
>
> No functional change intended.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
For the Hyper-V changes,
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
> ---
> arch/x86/coco/sev/core.c | 1 -
> arch/x86/coco/tdx/tdx.c | 1 -
> arch/x86/kernel/cpu/acrn.c | 1 -
> arch/x86/kernel/cpu/mshyperv.c | 1 -
> arch/x86/kernel/cpu/vmware.c | 2 --
> arch/x86/kernel/jailhouse.c | 1 -
> arch/x86/kernel/kvmclock.c | 1 -
> arch/x86/kernel/tsc.c | 13 ++++++++++---
> arch/x86/xen/time.c | 1 -
> 9 files changed, 10 insertions(+), 12 deletions(-)
>
> diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
> index bc5ae9ef74da..72313b36b6f5 100644
> --- a/arch/x86/coco/sev/core.c
> +++ b/arch/x86/coco/sev/core.c
> @@ -2027,7 +2027,6 @@ unsigned int __init snp_secure_tsc_init(void)
>
> secrets = (__force struct snp_secrets_page *)mem;
>
> - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
> setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
>
> rdmsrq(MSR_AMD64_GUEST_TSC_FREQ, tsc_freq_mhz);
> diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
> index ae2d35f2ef33..94682aca188b 100644
> --- a/arch/x86/coco/tdx/tdx.c
> +++ b/arch/x86/coco/tdx/tdx.c
> @@ -1205,7 +1205,6 @@ unsigned int __init tdx_tsc_init(void)
>
> /* TSC is the only reliable clock in TDX guest */
> setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
> - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
>
> return info.crystal_khz * info.numerator / info.denominator;
> }
> diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
> index 3818f6ae0629..dc71a6fdd461 100644
> --- a/arch/x86/kernel/cpu/acrn.c
> +++ b/arch/x86/kernel/cpu/acrn.c
> @@ -40,7 +40,6 @@ static void __init acrn_init_platform(void)
> if (acrn_tsc_khz_cpuid) {
> x86_init.hyper.get_tsc_khz = acrn_get_tsc_khz;
> x86_init.hyper.get_cpu_khz = acrn_get_tsc_khz;
> - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
> }
> }
>
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index f9bc1c2d8c93..e03c69a4db33 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -575,7 +575,6 @@ static void __init ms_hyperv_init_platform(void)
> ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
> x86_init.hyper.get_tsc_khz = hv_get_tsc_khz;
> x86_init.hyper.get_cpu_khz = hv_get_tsc_khz;
> - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
> }
>
> if (ms_hyperv.priv_high & HV_ISOLATION) {
> diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
> index 3cb473cae462..0a3bd90576d4 100644
> --- a/arch/x86/kernel/cpu/vmware.c
> +++ b/arch/x86/kernel/cpu/vmware.c
> @@ -390,8 +390,6 @@ static void __init vmware_set_capabilities(void)
> {
> setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC);
> setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
> - if (vmware_tsc_khz)
> - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
> if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL)
> setup_force_cpu_cap(X86_FEATURE_VMCALL);
> else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL)
> diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
> index e24c05ab4fae..ff173052cdce 100644
> --- a/arch/x86/kernel/jailhouse.c
> +++ b/arch/x86/kernel/jailhouse.c
> @@ -255,7 +255,6 @@ static void __init jailhouse_init_platform(void)
> pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport);
>
> precalibrated_tsc_khz = setup_data.v1.tsc_khz;
> - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
>
> pci_probe = 0;
>
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index 4f8299303a19..35a879d33e9e 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -138,7 +138,6 @@ static inline void kvm_sched_clock_init(bool stable)
> */
> static unsigned int __init kvm_get_tsc_khz(void)
> {
> - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
> return pvclock_tsc_khz(this_cpu_pvti());
> }
>
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 1dca9464b41c..676910292af7 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -1541,11 +1541,18 @@ void __init tsc_early_init(void)
> if (!known_tsc_khz && x86_init.hyper.get_tsc_khz)
> known_tsc_khz = x86_init.hyper.get_tsc_khz();
>
> + /*
> + * Mark the TSC frequency as known if it was obtained from a hypervisor
> + * or trusted firmware.
> + */
> + if (known_tsc_khz)
> + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
> +
> /*
> * Ignore the user-provided TSC frequency if the exact frequency was
> - * obtained from trusted firmware or the hypervisor, as the user-
> - * provided frequency is intended as a "starting point", not a known,
> - * guaranteed frequency.
> + * obtained from trusted firmware or the hypervisor, and don't mark the
> + * frequency as known, as the user-provided frequency is intended as a
> + * "starting point", not a known, guaranteed frequency
> */
> if (!known_tsc_khz)
> known_tsc_khz = tsc_early_khz;
> diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
> index 1adb44fdddb2..487ad838c441 100644
> --- a/arch/x86/xen/time.c
> +++ b/arch/x86/xen/time.c
> @@ -43,7 +43,6 @@ static unsigned int __init xen_tsc_khz(void)
> struct pvclock_vcpu_time_info *info =
> &HYPERVISOR_shared_info->vcpu_info[0].time;
>
> - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
> return pvclock_tsc_khz(info);
> }
>
> --
> 2.55.0.rc0.799.gd6f94ed593-goog
>
^ permalink raw reply [flat|nested] 72+ messages in thread
* [PATCH v5 15/51] x86/tsc: Kill off x86_platform_ops.calibrate_{cpu,tsc}() hooks
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (13 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 14/51] x86/tsc: Consolidate forcing of X86_FEATURE_TSC_KNOWN_FREQ for PV code Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 16/51] x86/tsc: Rename pit_hpet_ptimer_calibrate_cpu() => native_calibrate_cpu_late() Sean Christopherson
` (35 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Now that getting the CPU and/or TSC frequencies from the hypervisor uses
dedicated hooks, drop x86_platform_ops.calibrate_{cpu,tsc}() and instead
directly invoke the correct helper at each phase of (re)calibration. In
addition to eliminating unnecessary code, this makes it a bit more obvious
when the "late" path invokes pit_hpet_ptimer_calibrate_cpu() instead of
x86_platform_ops.calibrate_cpu().
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/tsc.h | 2 --
arch/x86/include/asm/x86_init.h | 4 ----
arch/x86/kernel/tsc.c | 28 ++++++++++++----------------
arch/x86/kernel/x86_init.c | 2 --
4 files changed, 12 insertions(+), 24 deletions(-)
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index b6b86e24e1bf..c09ec485abcd 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -95,8 +95,6 @@ extern void mark_tsc_unstable(char *reason);
extern int unsynchronized_tsc(void);
extern int check_tsc_unstable(void);
extern void mark_tsc_async_resets(char *reason);
-extern unsigned long native_calibrate_cpu_early(void);
-extern unsigned long native_calibrate_tsc(void);
extern unsigned long long native_sched_clock_from_tsc(u64 tsc);
extern int tsc_clocksource_reliable;
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 0c89bf40f507..e879e6e83428 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -295,8 +295,6 @@ struct x86_hyper_runtime {
/**
* struct x86_platform_ops - platform specific runtime functions
- * @calibrate_cpu: calibrate CPU
- * @calibrate_tsc: calibrate TSC, if different from CPU
* @get_wallclock: get time from HW clock like RTC etc.
* @set_wallclock: set time back to HW clock
* @iommu_shutdown: set by an IOMMU driver for shutdown if necessary
@@ -320,8 +318,6 @@ struct x86_hyper_runtime {
* @guest: guest incarnations callbacks
*/
struct x86_platform_ops {
- unsigned long (*calibrate_cpu)(void);
- unsigned long (*calibrate_tsc)(void);
void (*get_wallclock)(struct timespec64 *ts);
int (*set_wallclock)(const struct timespec64 *ts);
void (*iommu_shutdown)(void);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 676910292af7..a877b82d0991 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -672,7 +672,7 @@ int cpuid_get_tsc_info(struct cpuid_tsc_info *info)
* native_calibrate_tsc - determine TSC frequency
* Determine TSC frequency via CPUID, else return 0.
*/
-unsigned long native_calibrate_tsc(void)
+static unsigned long native_calibrate_tsc(void)
{
struct cpuid_tsc_info info;
@@ -904,7 +904,7 @@ static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
/**
* native_calibrate_cpu_early - can calibrate the cpu early in boot
*/
-unsigned long native_calibrate_cpu_early(void)
+static unsigned long native_calibrate_cpu_early(void)
{
unsigned long flags, fast_calibrate = cpu_khz_from_cpuid();
@@ -918,7 +918,7 @@ unsigned long native_calibrate_cpu_early(void)
return fast_calibrate;
}
-
+#ifndef CONFIG_SMP
/**
* native_calibrate_cpu - calibrate the cpu
*/
@@ -931,6 +931,7 @@ static unsigned long native_calibrate_cpu(void)
return tsc_freq;
}
+#endif
void recalibrate_cpu_khz(void)
{
@@ -943,8 +944,8 @@ void recalibrate_cpu_khz(void)
if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_TSC_KNOWN_FREQ)))
return;
- cpu_khz = x86_platform.calibrate_cpu();
- tsc_khz = x86_platform.calibrate_tsc();
+ cpu_khz = native_calibrate_cpu();
+ tsc_khz = native_calibrate_tsc();
if (tsc_khz == 0)
tsc_khz = cpu_khz;
else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
@@ -1458,17 +1459,19 @@ static bool __init determine_cpu_tsc_frequencies(bool early,
WARN_ON(cpu_khz || tsc_khz);
if (early) {
+ /*
+ * Early CPU calibration can only use methods that are available
+ * early in boot (obviously).
+ */
if (known_cpu_khz)
cpu_khz = known_cpu_khz;
else
- cpu_khz = x86_platform.calibrate_cpu();
+ cpu_khz = native_calibrate_cpu_early();
if (known_tsc_khz)
tsc_khz = known_tsc_khz;
else
- tsc_khz = x86_platform.calibrate_tsc();
+ tsc_khz = native_calibrate_tsc();
} else {
- /* We should not be here with non-native cpu calibration */
- WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu);
cpu_khz = pit_hpet_ptimer_calibrate_cpu();
}
@@ -1571,13 +1574,6 @@ void __init tsc_init(void)
return;
}
- /*
- * native_calibrate_cpu_early can only calibrate using methods that are
- * available early in boot.
- */
- if (x86_platform.calibrate_cpu == native_calibrate_cpu_early)
- x86_platform.calibrate_cpu = native_calibrate_cpu;
-
if (!tsc_khz) {
/* We failed to determine frequencies earlier, try again */
if (!determine_cpu_tsc_frequencies(false, 0, 0)) {
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 252c5827d063..b7a48e622f48 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -147,8 +147,6 @@ static void enc_kexec_finish_noop(void) {}
static bool is_private_mmio_noop(u64 addr) {return false; }
struct x86_platform_ops x86_platform __ro_after_init = {
- .calibrate_cpu = native_calibrate_cpu_early,
- .calibrate_tsc = native_calibrate_tsc,
.get_wallclock = mach_get_cmos_time,
.set_wallclock = mach_set_cmos_time,
.iommu_shutdown = iommu_shutdown_noop,
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 16/51] x86/tsc: Rename pit_hpet_ptimer_calibrate_cpu() => native_calibrate_cpu_late()
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (14 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 15/51] x86/tsc: Kill off x86_platform_ops.calibrate_{cpu,tsc}() hooks Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 17/51] x86/tsc: Fold native_calibrate_cpu() into recalibrate_cpu_khz() Sean Christopherson
` (34 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Rename the late CPU calibration routine so that its relationship to the
early routine is more obvious and intuitive.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/tsc.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a877b82d0991..9764ac758081 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -752,7 +752,7 @@ static unsigned long cpu_khz_from_cpuid(void)
* calibrate cpu using pit, hpet, and ptimer methods. They are available
* later in boot after acpi is initialized.
*/
-static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
+static unsigned long native_calibrate_cpu_late(void)
{
u64 tsc1, tsc2, delta, ref1, ref2;
unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
@@ -927,7 +927,7 @@ static unsigned long native_calibrate_cpu(void)
unsigned long tsc_freq = native_calibrate_cpu_early();
if (!tsc_freq)
- tsc_freq = pit_hpet_ptimer_calibrate_cpu();
+ tsc_freq = native_calibrate_cpu_late();
return tsc_freq;
}
@@ -1472,7 +1472,7 @@ static bool __init determine_cpu_tsc_frequencies(bool early,
else
tsc_khz = native_calibrate_tsc();
} else {
- cpu_khz = pit_hpet_ptimer_calibrate_cpu();
+ cpu_khz = native_calibrate_cpu_late();
}
/*
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread* [PATCH v5 17/51] x86/tsc: Fold native_calibrate_cpu() into recalibrate_cpu_khz()
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (15 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 16/51] x86/tsc: Rename pit_hpet_ptimer_calibrate_cpu() => native_calibrate_cpu_late() Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:50 ` sashiko-bot
2026-07-01 19:31 ` [PATCH v5 18/51] x86/kvmclock: Rename kvm_get_tsc_khz() to kvmclock_get_tsc_khz() Sean Christopherson
` (33 subsequent siblings)
50 siblings, 1 reply; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Fold the guts of native_calibrate_cpu() into its sole remaining caller,
recalibrate_cpu_khz(), to eliminate the extra SMP=n #ifdef, and so that it's
more obvious that directly invoking the early vs. late calibration routines
in determine_cpu_tsc_frequencies() is intentional.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/tsc.c | 19 +++----------------
1 file changed, 3 insertions(+), 16 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9764ac758081..6ed6f8f012eb 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -918,21 +918,6 @@ static unsigned long native_calibrate_cpu_early(void)
return fast_calibrate;
}
-#ifndef CONFIG_SMP
-/**
- * native_calibrate_cpu - calibrate the cpu
- */
-static unsigned long native_calibrate_cpu(void)
-{
- unsigned long tsc_freq = native_calibrate_cpu_early();
-
- if (!tsc_freq)
- tsc_freq = native_calibrate_cpu_late();
-
- return tsc_freq;
-}
-#endif
-
void recalibrate_cpu_khz(void)
{
#ifndef CONFIG_SMP
@@ -944,7 +929,9 @@ void recalibrate_cpu_khz(void)
if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_TSC_KNOWN_FREQ)))
return;
- cpu_khz = native_calibrate_cpu();
+ cpu_khz = native_calibrate_cpu_early();
+ if (!cpu_khz)
+ cpu_khz = native_calibrate_cpu_late();
tsc_khz = native_calibrate_tsc();
if (tsc_khz == 0)
tsc_khz = cpu_khz;
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread* Re: [PATCH v5 17/51] x86/tsc: Fold native_calibrate_cpu() into recalibrate_cpu_khz()
2026-07-01 19:31 ` [PATCH v5 17/51] x86/tsc: Fold native_calibrate_cpu() into recalibrate_cpu_khz() Sean Christopherson
@ 2026-07-01 19:50 ` sashiko-bot
0 siblings, 0 replies; 72+ messages in thread
From: sashiko-bot @ 2026-07-01 19:50 UTC (permalink / raw)
To: Sean Christopherson; +Cc: linux-hyperv, kvm
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Medium] The refactoring in `recalibrate_cpu_khz()` introduces a race condition where the global variable `cpu_khz` is temporarily set to 0 for up to 150ms during runtime CPU frequency recalibration. This exposes the system to invalid state reads.
--
commit 34cf1b98d726e6c329bdefbd5c9712901d8b4bc8
Author: Sean Christopherson <seanjc@google.com>
x86/tsc: Fold native_calibrate_cpu() into recalibrate_cpu_khz()
This commit folds the implementation of native_calibrate_cpu() directly into
recalibrate_cpu_khz() to remove an extra CONFIG_SMP=n ifdef block. This change
aims to clarify the intentional use of early versus late calibration routines.
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 9764ac758081f..6ed6f8f012ebf 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
[ ... ]
> @@ -944,7 +929,9 @@ void recalibrate_cpu_khz(void)
> if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_TSC_KNOWN_FREQ)))
> return;
>
> - cpu_khz = native_calibrate_cpu();
> + cpu_khz = native_calibrate_cpu_early();
> + if (!cpu_khz)
> + cpu_khz = native_calibrate_cpu_late();
[Severity: Medium]
Does this expose a race condition by temporarily assigning 0 to the global
cpu_khz variable?
On a UP kernel during cpufreq transitions, if early calibration fails, the
global cpu_khz is directly assigned 0. It remains 0 while
native_calibrate_cpu_late() executes, which can take ~150ms and enables
interrupts.
Could this lead to sysfs readers via arch_freq_get_on_cpu() observing a 0
frequency, or the NMI watchdog being misconfigured with a 0 sample period if
toggled during this window?
The previous implementation used a local variable inside native_calibrate_cpu()
to hold the intermediate state, preventing concurrent readers from seeing an
invalid 0 value before the slow fallback calibration completes.
> tsc_khz = native_calibrate_tsc();
> if (tsc_khz == 0)
> tsc_khz = cpu_khz;
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260701193212.749551-1-seanjc@google.com?part=17
^ permalink raw reply [flat|nested] 72+ messages in thread
* [PATCH v5 18/51] x86/kvmclock: Rename kvm_get_tsc_khz() to kvmclock_get_tsc_khz()
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (16 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 17/51] x86/tsc: Fold native_calibrate_cpu() into recalibrate_cpu_khz() Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 19/51] x86/kvmclock: Drop dead check on TSC being unstable during kvmclock_init() Sean Christopherson
` (32 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Rename kvm_get_tsc_khz() to kvmclock_get_tsc_khz() in anticipation of
adding support for getting TSC info from PV CPUID, i.e. in a KVM specific
way, but without relying on kvmclock.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/kvmclock.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 35a879d33e9e..061a22d31dea 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -136,7 +136,7 @@ static inline void kvm_sched_clock_init(bool stable)
* poll of guests can be running and trouble each other. So we preset
* lpj here
*/
-static unsigned int __init kvm_get_tsc_khz(void)
+static unsigned int __init kvmclock_get_tsc_khz(void)
{
return pvclock_tsc_khz(this_cpu_pvti());
}
@@ -146,7 +146,7 @@ static void __init kvm_get_preset_lpj(void)
unsigned long khz;
u64 lpj;
- khz = kvm_get_tsc_khz();
+ khz = kvmclock_get_tsc_khz();
lpj = ((u64)khz * 1000);
do_div(lpj, HZ);
@@ -342,8 +342,8 @@ void __init kvmclock_init(void)
flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
- x86_init.hyper.get_tsc_khz = kvm_get_tsc_khz;
- x86_init.hyper.get_cpu_khz = kvm_get_tsc_khz;
+ x86_init.hyper.get_tsc_khz = kvmclock_get_tsc_khz;
+ x86_init.hyper.get_cpu_khz = kvmclock_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread* [PATCH v5 19/51] x86/kvmclock: Drop dead check on TSC being unstable during kvmclock_init()
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (17 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 18/51] x86/kvmclock: Rename kvm_get_tsc_khz() to kvmclock_get_tsc_khz() Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 20/51] KVM: x86: Officially define CPUID 0x40000010 as PV Timing Info (TSC and Bus) Sean Christopherson
` (31 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
As pointed out by Sashiko[*], kvmclock_init() runs before __setup() and
thus before notsc_setup() or tsc_setup() can mark the TSC unstable.
kvmclock_init() also runs well before tsc_init(), and even before
tsc_early_init(). Simply delete the check, as it's been dead code since
it was introduced.
Note, odds are good the check_tsc_unstable() call was copied from Xen's
xen_time_init()+xen_tsc_safe_clocksource() logic (as so much of KVM's PV
code was). However, xen_time_init() runs via x86_init.timers.timer_init(),
which is invoked from x86_late_time_init(), and thus after params have been
parsed.
Alternatively, kvmclock could register itself later on, or tsc_setup()
could be parsed as an early param. Given that there's zero evidence there
was any meaningful intent or need to actually check for an unstable TSC,
go with the simplest option.
Fixes: 7539b174aef4 ("x86: kvmguest: use TSC clocksource if invariant TSC is exposed")
Link: https://lore.kernel.org/all/20260529181213.0B27A1F00893@smtp.kernel.org [*]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/kvmclock.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 061a22d31dea..29ca37e9a3bc 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -362,8 +362,7 @@ void __init kvmclock_init(void)
*
*/
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
- boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
- !check_tsc_unstable())
+ boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
kvm_clock.rating = 299;
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread* [PATCH v5 20/51] KVM: x86: Officially define CPUID 0x40000010 as PV Timing Info (TSC and Bus)
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (18 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 19/51] x86/kvmclock: Drop dead check on TSC being unstable during kvmclock_init() Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 21/51] x86/kvm: Obtain TSC frequency from PV CPUID if present Sean Christopherson
` (30 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
From: David Woodhouse <dwmw@amazon.co.uk>
Formally define and document CPUID 0x40000010 as providing TSC and local
APIC bus frequency information for KVM's PV CPUID range. Way back in
2008, VMware proposed (https://lkml.org/lkml/2008/10/1/246) carving out a
range of CPUID leaves for use by hypervisors. While the broader proposal
from VMware was mostly shot down in flames, use of CPUID 0x40000010 to
provide TSC and local APIC bus frequency information survived and made its
way into multiple guest operating systems.
XNU unconditionally assumes CPUID 0x40000010 contains the frequency
information, if it's present on any hypervisor:
https://github.com/apple/darwin-xnu/blob/main/osfmk/i386/cpuid.c
As does FreeBSD:
https://github.com/freebsd/freebsd-src/commit/4a432614f68
More importantly, QEMU (the de facto "reference" VMM for KVM) has
conditionally provided timing information in CPUID 0x40000010 for almost
9 years, since commit 9954a1582e ("x86-KVM: Supply TSC and APIC clock
rates to guest like VMWare").
So at this point it would be daft for KVM (or any hypervisor) to expose
0x40000010 for any *other* content. Officially carve out and define the
CPUID leaf so that Linux-as-a-guest can follow suit and pull TSC and Local
APIC Bus frequency information from CPUID.
Defer providing userspace with the necessary information needed to
precisely and accurately enumerate the _actual_ configured TSC frequency
to the guest (that exact information, along with the scaled ratio, isn't
exposed to userspace). As evidenced by QEMU, providing CPUID 0x40000010
without help from KVM is entirely possible, just not ideal.
Link: https://lore.kernel.org/all/ea0d7f43d910cee9600b254e303f468722fa355b.camel@infradead.org
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
[sean: drop KVM filling of CPUID, add documentation, massage changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
Documentation/virt/kvm/x86/cpuid.rst | 12 ++++++++++++
arch/x86/include/uapi/asm/kvm_para.h | 11 +++++++++++
2 files changed, 23 insertions(+)
diff --git a/Documentation/virt/kvm/x86/cpuid.rst b/Documentation/virt/kvm/x86/cpuid.rst
index bda3e3e737d7..a5ee8ff052ce 100644
--- a/Documentation/virt/kvm/x86/cpuid.rst
+++ b/Documentation/virt/kvm/x86/cpuid.rst
@@ -122,3 +122,15 @@ KVM_HINTS_REALTIME 0 guest checks this feature bit to
preempted for an unlimited time
allowing optimizations
================== ============ =================================
+
+function: KVM_CPUID_TIMING_INFO (0x40000010)
+
+returns::
+
+ eax = (Virtual) TSC frequency in kHz
+ ebx = (Virtual) Bus (local APIC timer) frequency in kHz
+ ecx = 0 (Reserved)
+ edx = 0 (Reserved)
+
+Note, KVM only defines the semantics of KVM_CPUID_TIMING_INFO; KVM does NOT
+advertise support via KVM_GET_SUPPORTED_CPUID.
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index a1efa7907a0b..c3a384711f3a 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -44,6 +44,17 @@
*/
#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24
+/*
+ * The timing information leaf provides TSC and local APIC timer frequency
+ * information to the guest. Note, userspace is responsible for filling the
+ * leaf with the correct information.
+ *
+ * # EAX: (Virtual) TSC frequency in kHz.
+ * # EBX: (Virtual) Bus (local APIC timer) frequency in kHz.
+ * # ECX, EDX: Reserved (must be zero).
+ */
+#define KVM_CPUID_TIMING_INFO 0x40000010
+
#define MSR_KVM_WALL_CLOCK 0x11
#define MSR_KVM_SYSTEM_TIME 0x12
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread* [PATCH v5 21/51] x86/kvm: Obtain TSC frequency from PV CPUID if present
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (19 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 20/51] KVM: x86: Officially define CPUID 0x40000010 as PV Timing Info (TSC and Bus) Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 22/51] x86/kvm: Mark TSC as reliable when it's constant and nonstop Sean Christopherson
` (29 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
From: David Woodhouse <dwmw@amazon.co.uk>
In https://lkml.org/lkml/2008/10/1/246 a proposal was made for generic
CPUID conventions across hypervisors. It was mostly shot down in flames,
but the leaf at 0x40000010 containing timing information didn't die.
It's used by XNU and FreeBSD guests under all hypervisors¹² to determine
the TSC frequency, and also exposed by the EC2 Nitro hypervisor (as
well as, presumably, VMware). FreeBSD's Bhyve is probably just about
to start exposing it too.
Use it under KVM to obtain the TSC frequency more accurately, instead of
reverse-calculating the frequency from the mul/shift values in the KVM
clock. Use the information to get the CPU frequency as well (kvmclock
feeds in kvmclock_get_tsc_khz() for both TSC and CPU calibration), as the info
from CPUID is superior in every way; whether or not kvmclock should be
overriding CPU calibration in the first place is an entirely different
question.
Use the info from CPUID even if the user explicitly disables kvmclock, or
if it's unsupported. The PV CPUID leaf has no dependency on kvmclock, and
is in fact more useful if kvmclock is disabled since the kernel won't be
able to use kvmclock to derive the TSC frequency.
Before:
[ 0.000020] tsc: Detected 2900.014 MHz processor
After:
[ 0.000020] tsc: Detected 2900.015 MHz processor
$ cpuid -1 -l 0x40000010
CPU:
hypervisor generic timing information (0x40000010):
TSC frequency (Hz) = 2900015
bus frequency (Hz) = 1000000
Note! *Independently* query for non-null get_{cpu,tsc}_khz() overrides so
that kvmclock doesn't clobber x86_init.hyper.get_cpu_khz() if/when KVM adds
support for getting the CPU frequency separately from the TSC frequency.
¹ https://github.com/apple/darwin-xnu/blob/main/osfmk/i386/cpuid.c
² https://github.com/freebsd/freebsd-src/commit/4a432614f68
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/kvm.c | 33 +++++++++++++++++++++++++++++++++
arch/x86/kernel/kvmclock.c | 6 ++++--
2 files changed, 37 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index dcef84da304b..909d3e5e5bcd 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -49,6 +49,8 @@
#include <asm/svm.h>
#include <asm/e820/api.h>
+static unsigned int kvm_tsc_khz_cpuid __initdata;
+
DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled);
static int kvmapf = 1;
@@ -911,6 +913,21 @@ bool kvm_para_available(void)
}
EXPORT_SYMBOL_GPL(kvm_para_available);
+static u32 __init kvm_cpuid_timing_info_leaf(void)
+{
+ u32 base = kvm_cpuid_base();
+
+ if (!base || cpuid_eax(base) < (base | KVM_CPUID_TIMING_INFO))
+ return 0;
+
+ return base | KVM_CPUID_TIMING_INFO;
+}
+
+static unsigned int __init kvm_get_tsc_khz(void)
+{
+ return kvm_tsc_khz_cpuid;
+}
+
unsigned int kvm_arch_para_features(void)
{
return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
@@ -960,6 +977,7 @@ static void __init kvm_init_platform(void)
.mask_lo = (u32)(~(SZ_4G - tolud - 1)) | MTRR_PHYSMASK_V,
.mask_hi = (BIT_ULL(boot_cpu_data.x86_phys_bits) - 1) >> 32,
};
+ u32 timing_info_leaf;
if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
@@ -1007,6 +1025,21 @@ static void __init kvm_init_platform(void)
wrmsrq(MSR_KVM_MIGRATION_CONTROL,
KVM_MIGRATION_READY);
}
+
+ /*
+ * If KVM advertises the frequency directly in CPUID, use that instead
+ * of reverse-calculating it from the KVM clock data, or worse, trying
+ * to calibrate the TSC using an emulated device.
+ */
+ timing_info_leaf = kvm_cpuid_timing_info_leaf();
+ if (timing_info_leaf) {
+ kvm_tsc_khz_cpuid = cpuid_eax(timing_info_leaf);
+ if (kvm_tsc_khz_cpuid) {
+ x86_init.hyper.get_tsc_khz = kvm_get_tsc_khz;
+ x86_init.hyper.get_cpu_khz = kvm_get_tsc_khz;
+ }
+ }
+
kvmclock_init();
x86_platform.apic_post_init = kvm_apic_init;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 29ca37e9a3bc..f55d0305d1f3 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -342,8 +342,10 @@ void __init kvmclock_init(void)
flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
- x86_init.hyper.get_tsc_khz = kvmclock_get_tsc_khz;
- x86_init.hyper.get_cpu_khz = kvmclock_get_tsc_khz;
+ if (!x86_init.hyper.get_tsc_khz)
+ x86_init.hyper.get_tsc_khz = kvmclock_get_tsc_khz;
+ if (!x86_init.hyper.get_cpu_khz)
+ x86_init.hyper.get_cpu_khz = kvmclock_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread* [PATCH v5 22/51] x86/kvm: Mark TSC as reliable when it's constant and nonstop
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (20 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 21/51] x86/kvm: Obtain TSC frequency from PV CPUID if present Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 20:03 ` sashiko-bot
2026-07-01 19:31 ` [PATCH v5 23/51] x86/tsc: Add standalone helper for getting CPU frequency from CPUID Sean Christopherson
` (28 subsequent siblings)
50 siblings, 1 reply; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Mark the TSC as reliable if the hypervisor (KVM) has enumerated the TSC
as constant and nonstop. Like most (all?) virtualization setups, any
secondary clocksource that's used as a watchdog is guaranteed to be less
reliable than a constant, nonstop TSC, as all clocksources the kernel uses
as a watchdog are all but guaranteed to be emulated when running as a KVM
guest. I.e. any observed discrepancies between the TSC and watchdog will
be due to jitter in the watchdog.
This is especially true for KVM, as the watchdog clocksource is usually
emulated in host userspace, i.e. reading the clock incurs a roundtrip
cost of thousands of cycles.
Marking the TSC reliable addresses a flaw where the TSC will occasionally
be marked unstable if the host is under moderate/heavy load.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/kvm_para.h | 2 +-
arch/x86/kernel/kvm.c | 12 +++++++++++-
arch/x86/kernel/kvmclock.c | 14 +++++---------
3 files changed, 17 insertions(+), 11 deletions(-)
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 4a47c16e2df8..4a49fc286b4c 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -118,7 +118,7 @@ static inline long kvm_sev_hypercall3(unsigned int nr, unsigned long p1,
}
#ifdef CONFIG_KVM_GUEST
-void kvmclock_init(void);
+void kvmclock_init(bool prefer_tsc);
void kvmclock_disable(void);
bool kvm_para_available(void);
unsigned int kvm_arch_para_features(void);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 909d3e5e5bcd..1cef54e1e7d9 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -978,6 +978,7 @@ static void __init kvm_init_platform(void)
.mask_hi = (BIT_ULL(boot_cpu_data.x86_phys_bits) - 1) >> 32,
};
u32 timing_info_leaf;
+ bool tsc_is_reliable;
if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
@@ -1040,7 +1041,16 @@ static void __init kvm_init_platform(void)
}
}
- kvmclock_init();
+ /*
+ * If the TSC counts at a constant frequency across P/T states and in
+ * deep C-states, treat the TSC reliable, as guaranteed by KVM.
+ */
+ tsc_is_reliable = boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ boot_cpu_has(X86_FEATURE_NONSTOP_TSC);
+ if (tsc_is_reliable)
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
+ kvmclock_init(tsc_is_reliable);
x86_platform.apic_post_init = kvm_apic_init;
/*
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f55d0305d1f3..2e7ab54cb9dc 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -307,7 +307,7 @@ static int kvmclock_setup_percpu(unsigned int cpu)
return p ? 0 : -ENOMEM;
}
-void __init kvmclock_init(void)
+void __init kvmclock_init(bool prefer_tsc)
{
u8 flags;
@@ -356,15 +356,11 @@ void __init kvmclock_init(void)
kvm_get_preset_lpj();
/*
- * X86_FEATURE_NONSTOP_TSC is TSC runs at constant rate
- * with P/T states and does not stop in deep C-states.
- *
- * Invariant TSC exposed by host means kvmclock is not necessary:
- * can use TSC as clocksource.
- *
+ * If TSC is preferred over kvmclock, drop kvmclock's rating so that TSC
+ * is chosen as the clocksource (but still register kvmclock in case
+ * the kernel doesn't want to use TSC for whatever reason).
*/
- if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
- boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+ if (prefer_tsc)
kvm_clock.rating = 299;
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread* Re: [PATCH v5 22/51] x86/kvm: Mark TSC as reliable when it's constant and nonstop
2026-07-01 19:31 ` [PATCH v5 22/51] x86/kvm: Mark TSC as reliable when it's constant and nonstop Sean Christopherson
@ 2026-07-01 20:03 ` sashiko-bot
2026-07-01 20:13 ` Sean Christopherson
0 siblings, 1 reply; 72+ messages in thread
From: sashiko-bot @ 2026-07-01 20:03 UTC (permalink / raw)
To: Sean Christopherson; +Cc: linux-hyperv, kvm
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [High] Unconditionally forcing X86_FEATURE_TSC_RELIABLE bypasses SMP cross-CPU TSC synchronization checks, which will cause time to go backwards for guests running on hosts with unsynchronized TSCs.
--
commit fb8fd2f8d18aabaf24f424391748d58e9427bdf7
Author: Sean Christopherson <seanjc@google.com>
x86/kvm: Mark TSC as reliable when it's constant and nonstop
This commit marks the TSC as reliable if KVM enumerates it as constant
and nonstop. This aims to prevent the TSC from occasionally being marked
unstable when the host is under moderate or heavy load.
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index 909d3e5e5bcd5..1cef54e1e7d9e 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
[ ... ]
> @@ -1040,7 +1041,16 @@ static void __init kvm_init_platform(void)
> }
> }
>
> - kvmclock_init();
> + /*
> + * If the TSC counts at a constant frequency across P/T states and in
> + * deep C-states, treat the TSC reliable, as guaranteed by KVM.
> + */
> + tsc_is_reliable = boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
> + boot_cpu_has(X86_FEATURE_NONSTOP_TSC);
> + if (tsc_is_reliable)
> + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
[Severity: High]
Does setting X86_FEATURE_TSC_RELIABLE here bypass SMP cross-CPU TSC
synchronization checks?
Forcing X86_FEATURE_TSC_RELIABLE based only on CONSTANT_TSC and NONSTOP_TSC,
without verifying if the hypervisor actually guarantees cross-CPU
synchronization (e.g., via KVM_FEATURE_CLOCKSOURCE_STABLE_BIT), sets
tsc_clocksource_reliable = 1.
This bypasses the synchronization check in check_tsc_sync_source():
arch/x86/kernel/tsc_sync.c:check_tsc_sync_source() {
...
if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable)
return;
...
}
Could this cause time to go backwards for guests running on hosts with
unsynchronized TSCs when threads migrate between vCPUs?
> +
> + kvmclock_init(tsc_is_reliable);
> x86_platform.apic_post_init = kvm_apic_init;
>
> /*
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260701193212.749551-1-seanjc@google.com?part=22
^ permalink raw reply [flat|nested] 72+ messages in thread* Re: [PATCH v5 22/51] x86/kvm: Mark TSC as reliable when it's constant and nonstop
2026-07-01 20:03 ` sashiko-bot
@ 2026-07-01 20:13 ` Sean Christopherson
0 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 20:13 UTC (permalink / raw)
To: sashiko-reviews; +Cc: linux-hyperv, kvm
On Wed, Jul 01, 2026, sashiko-bot@kernel.org wrote:
> > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> > index 909d3e5e5bcd5..1cef54e1e7d9e 100644
> > --- a/arch/x86/kernel/kvm.c
> > +++ b/arch/x86/kernel/kvm.c
> [ ... ]
> > @@ -1040,7 +1041,16 @@ static void __init kvm_init_platform(void)
> > }
> > }
> >
> > - kvmclock_init();
> > + /*
> > + * If the TSC counts at a constant frequency across P/T states and in
> > + * deep C-states, treat the TSC reliable, as guaranteed by KVM.
> > + */
> > + tsc_is_reliable = boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
> > + boot_cpu_has(X86_FEATURE_NONSTOP_TSC);
> > + if (tsc_is_reliable)
> > + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
>
> [Severity: High]
> Does setting X86_FEATURE_TSC_RELIABLE here bypass SMP cross-CPU TSC
> synchronization checks?
No, KVM_FEATURE_CLOCKSOURCE_STABLE_BIT is for kvmclock, not for TSC. The
CONSTANT+NONSTOP combination is KVM's way of guaranteeing cross-CPU synchronization.
> Forcing X86_FEATURE_TSC_RELIABLE based only on CONSTANT_TSC and NONSTOP_TSC,
> without verifying if the hypervisor actually guarantees cross-CPU
> synchronization (e.g., via KVM_FEATURE_CLOCKSOURCE_STABLE_BIT), sets
> tsc_clocksource_reliable = 1.
>
> This bypasses the synchronization check in check_tsc_sync_source():
>
> arch/x86/kernel/tsc_sync.c:check_tsc_sync_source() {
> ...
> if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable)
> return;
> ...
> }
>
> Could this cause time to go backwards for guests running on hosts with
> unsynchronized TSCs when threads migrate between vCPUs?
^ permalink raw reply [flat|nested] 72+ messages in thread
* [PATCH v5 23/51] x86/tsc: Add standalone helper for getting CPU frequency from CPUID
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (21 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 22/51] x86/kvm: Mark TSC as reliable when it's constant and nonstop Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 24/51] x86/kvm: Get CPU base frequency from CPUID when it's available Sean Christopherson
` (27 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Extract the guts of cpu_khz_from_cpuid() to a standalone helper that
doesn't restrict the usage to Intel CPUs. This will allow sharing the
core logic with KVM-as-a-guest, as KVM generally doesn't restrict CPUID
based on vendor.
No functional change intended.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/tsc.h | 1 +
arch/x86/kernel/tsc.c | 31 +++++++++++++++----------------
2 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index c09ec485abcd..cb682f097ea7 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -88,6 +88,7 @@ struct cpuid_tsc_info {
unsigned int crystal_khz;
};
extern int cpuid_get_tsc_info(struct cpuid_tsc_info *info);
+extern unsigned int __cpu_khz_from_cpuid(void);
extern void tsc_early_init(void);
extern void tsc_init(void);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6ed6f8f012eb..56e73e96920a 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -668,6 +668,18 @@ int cpuid_get_tsc_info(struct cpuid_tsc_info *info)
return 0;
}
+unsigned int __cpu_khz_from_cpuid(void)
+{
+ unsigned int eax_base_mhz, ebx, ecx, edx;
+
+ if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ)
+ return 0;
+
+ cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx);
+
+ return eax_base_mhz * 1000;
+}
+
/**
* native_calibrate_tsc - determine TSC frequency
* Determine TSC frequency via CPUID, else return 0.
@@ -703,12 +715,8 @@ static unsigned long native_calibrate_tsc(void)
* clock, but we can easily calculate it to a high degree of accuracy
* by considering the crystal ratio and the CPU speed.
*/
- if (!info.crystal_khz && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) {
- unsigned int eax_base_mhz, ebx, ecx, edx;
-
- cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx);
- info.crystal_khz = eax_base_mhz * 1000 * info.denominator / info.numerator;
- }
+ if (!info.crystal_khz)
+ info.crystal_khz = __cpu_khz_from_cpuid() * info.denominator / info.numerator;
if (!info.crystal_khz)
return 0;
@@ -733,19 +741,10 @@ static unsigned long native_calibrate_tsc(void)
static unsigned long cpu_khz_from_cpuid(void)
{
- unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx;
-
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 0;
- if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ)
- return 0;
-
- eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0;
-
- cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx);
-
- return eax_base_mhz * 1000;
+ return __cpu_khz_from_cpuid();
}
/*
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 24/51] x86/kvm: Get CPU base frequency from CPUID when it's available
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (22 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 23/51] x86/tsc: Add standalone helper for getting CPU frequency from CPUID Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:54 ` sashiko-bot
2026-07-01 19:31 ` [PATCH v5 25/51] clocksource: hyper-v: Register sched_clock save/restore iff it's necessary Sean Christopherson
` (26 subsequent siblings)
50 siblings, 1 reply; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
If CPUID.0x16 is present and valid, use the CPU frequency provided by
CPUID instead of assuming that the virtual CPU runs at the same
frequency as TSC and/or kvmclock. Back before constant TSCs were a
thing, treating the TSC and CPU frequencies as one and the same was
somewhat reasonable, but now it's nonsensical, especially if the
hypervisor explicitly enumerates the CPU frequency.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/kvm.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 1cef54e1e7d9..6c7011ff7bd1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -50,6 +50,7 @@
#include <asm/e820/api.h>
static unsigned int kvm_tsc_khz_cpuid __initdata;
+static unsigned int kvm_cpu_khz_cpuid __initdata;
DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled);
@@ -928,6 +929,11 @@ static unsigned int __init kvm_get_tsc_khz(void)
return kvm_tsc_khz_cpuid;
}
+static unsigned int __init kvm_get_cpu_khz(void)
+{
+ return kvm_cpu_khz_cpuid;
+}
+
unsigned int kvm_arch_para_features(void)
{
return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
@@ -1041,6 +1047,14 @@ static void __init kvm_init_platform(void)
}
}
+ /*
+ * Prefer CPUID.0x16 over KVM's PV CPUID when possible, as the base CPU
+ * frequency isn't necessarily the same as the TSC frequency.
+ */
+ kvm_cpu_khz_cpuid = __cpu_khz_from_cpuid();
+ if (kvm_cpu_khz_cpuid)
+ x86_init.hyper.get_cpu_khz = kvm_get_cpu_khz;
+
/*
* If the TSC counts at a constant frequency across P/T states and in
* deep C-states, treat the TSC reliable, as guaranteed by KVM.
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* Re: [PATCH v5 24/51] x86/kvm: Get CPU base frequency from CPUID when it's available
2026-07-01 19:31 ` [PATCH v5 24/51] x86/kvm: Get CPU base frequency from CPUID when it's available Sean Christopherson
@ 2026-07-01 19:54 ` sashiko-bot
0 siblings, 0 replies; 72+ messages in thread
From: sashiko-bot @ 2026-07-01 19:54 UTC (permalink / raw)
To: Sean Christopherson; +Cc: kvm, linux-hyperv
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Medium] The base CPU frequency obtained from CPUID.0x16 is unconditionally clamped back to the TSC frequency if it differs by >10%, defeating the purpose of the patch.
--
commit 1331572140e49219bdcc6ceabc0370555476adde
Author: Sean Christopherson <seanjc@google.com>
x86/kvm: Get CPU base frequency from CPUID when it's available
This commit reads CPUID.0x16 to fetch the CPU base frequency if valid.
It then sets x86_init.hyper.get_cpu_khz to kvm_get_cpu_khz to avoid
assuming the CPU base frequency matches the TSC frequency.
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index 1cef54e1e7d9e..6c7011ff7bd13 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
[ ... ]
> @@ -1041,6 +1047,14 @@ static void __init kvm_init_platform(void)
> }
> }
>
> + /*
> + * Prefer CPUID.0x16 over KVM's PV CPUID when possible, as the base CPU
> + * frequency isn't necessarily the same as the TSC frequency.
> + */
> + kvm_cpu_khz_cpuid = __cpu_khz_from_cpuid();
> + if (kvm_cpu_khz_cpuid)
> + x86_init.hyper.get_cpu_khz = kvm_get_cpu_khz;
[Severity: Medium]
Will this new CPU frequency actually be used if it diverges significantly
from the TSC frequency?
Looking at determine_cpu_tsc_frequencies() in arch/x86/kernel/tsc.c, there
is existing logic that unconditionally clamps the CPU frequency back to the
TSC frequency if they differ by more than 10%:
else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
cpu_khz = tsc_khz;
Does this clamping defeat the purpose of this patch for guests running on
hosts where the base CPU frequency and TSC frequency differ by >10%?
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260701193212.749551-1-seanjc@google.com?part=24
^ permalink raw reply [flat|nested] 72+ messages in thread
* [PATCH v5 25/51] clocksource: hyper-v: Register sched_clock save/restore iff it's necessary
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (23 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 24/51] x86/kvm: Get CPU base frequency from CPUID when it's available Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 26/51] clocksource: hyper-v: Drop wrappers to sched_clock save/restore helpers Sean Christopherson
` (25 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Register the Hyper-V reference counter (refcounter) callbacks for saving
and restoring its PV sched_clock, if and only if the refcounter is
actually being used for sched_clock. Currently, Hyper-V overrides the
save/restore hooks if the reference TSC is available, whereas the Hyper-V
refcounter code only overrides sched_clock if the reference TSC is
available *and* it's not invariant. The flaw is effectively papered over
by invoking the "old" save/restore callbacks as part of save/restore, but
that's unnecessary and fragile.
To avoid introducing more complexity, and to allow for additional cleanups
of the PV sched_clock code, move the save/restore hooks and logic into
hyperv_timer.c and simply wire up the hooks when overriding sched_clock
itself.
Note, while the Hyper-V refcounter code is intended to be architecture
neutral, CONFIG_PARAVIRT is firmly x86-only, i.e. adding a small amount of
x86 specific code (which will be reduced in future cleanups) doesn't
meaningfully pollute generic code.
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>
Acked-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/cpu/mshyperv.c | 58 ------------------------------
drivers/clocksource/hyperv_timer.c | 50 ++++++++++++++++++++++++++
2 files changed, 50 insertions(+), 58 deletions(-)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index e03c69a4db33..c4da8e35319a 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -275,63 +275,6 @@ static void hv_guest_crash_shutdown(struct pt_regs *regs)
}
#endif /* CONFIG_CRASH_DUMP */
-static u64 hv_ref_counter_at_suspend;
-static void (*old_save_sched_clock_state)(void);
-static void (*old_restore_sched_clock_state)(void);
-
-/*
- * Hyper-V clock counter resets during hibernation. Save and restore clock
- * offset during suspend/resume, while also considering the time passed
- * before suspend. This is to make sure that sched_clock using hv tsc page
- * based clocksource, proceeds from where it left off during suspend and
- * it shows correct time for the timestamps of kernel messages after resume.
- */
-static void save_hv_clock_tsc_state(void)
-{
- hv_ref_counter_at_suspend = hv_read_reference_counter();
-}
-
-static void restore_hv_clock_tsc_state(void)
-{
- /*
- * Adjust the offsets used by hv tsc clocksource to
- * account for the time spent before hibernation.
- * adjusted value = reference counter (time) at suspend
- * - reference counter (time) now.
- */
- hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter());
-}
-
-/*
- * Functions to override save_sched_clock_state and restore_sched_clock_state
- * functions of x86_platform. The Hyper-V clock counter is reset during
- * suspend-resume and the offset used to measure time needs to be
- * corrected, post resume.
- */
-static void hv_save_sched_clock_state(void)
-{
- old_save_sched_clock_state();
- save_hv_clock_tsc_state();
-}
-
-static void hv_restore_sched_clock_state(void)
-{
- restore_hv_clock_tsc_state();
- old_restore_sched_clock_state();
-}
-
-static void __init x86_setup_ops_for_tsc_pg_clock(void)
-{
- if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
- return;
-
- old_save_sched_clock_state = x86_platform.save_sched_clock_state;
- x86_platform.save_sched_clock_state = hv_save_sched_clock_state;
-
- old_restore_sched_clock_state = x86_platform.restore_sched_clock_state;
- x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state;
-}
-
#ifdef CONFIG_X86_64
DEFINE_STATIC_CALL(hv_hypercall, hv_std_hypercall);
EXPORT_STATIC_CALL_TRAMP_GPL(hv_hypercall);
@@ -736,7 +679,6 @@ static void __init ms_hyperv_init_platform(void)
/* Register Hyper-V specific clocksource */
hv_init_clocksource();
- x86_setup_ops_for_tsc_pg_clock();
hv_vtl_init_platform();
#endif
/*
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index df567795d175..4293173c3a27 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -554,10 +554,60 @@ static __always_inline void hv_setup_sched_clock(void *sched_clock)
#elif defined CONFIG_PARAVIRT
#include <asm/timer.h>
+static u64 hv_ref_counter_at_suspend;
+static void (*old_save_sched_clock_state)(void);
+static void (*old_restore_sched_clock_state)(void);
+
+/*
+ * Hyper-V clock counter resets during hibernation. Save and restore clock
+ * offset during suspend/resume, while also considering the time passed
+ * before suspend. This is to make sure that sched_clock using hv tsc page
+ * based clocksource, proceeds from where it left off during suspend and
+ * it shows correct time for the timestamps of kernel messages after resume.
+ */
+static void save_hv_clock_tsc_state(void)
+{
+ hv_ref_counter_at_suspend = hv_read_reference_counter();
+}
+
+static void restore_hv_clock_tsc_state(void)
+{
+ /*
+ * Adjust the offsets used by hv tsc clocksource to
+ * account for the time spent before hibernation.
+ * adjusted value = reference counter (time) at suspend
+ * - reference counter (time) now.
+ */
+ hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter());
+}
+/*
+ * Functions to override save_sched_clock_state and restore_sched_clock_state
+ * functions of x86_platform. The Hyper-V clock counter is reset during
+ * suspend-resume and the offset used to measure time needs to be
+ * corrected, post resume.
+ */
+static void hv_save_sched_clock_state(void)
+{
+ old_save_sched_clock_state();
+ save_hv_clock_tsc_state();
+}
+
+static void hv_restore_sched_clock_state(void)
+{
+ restore_hv_clock_tsc_state();
+ old_restore_sched_clock_state();
+}
+
static __always_inline void hv_setup_sched_clock(void *sched_clock)
{
/* We're on x86/x64 *and* using PV ops */
paravirt_set_sched_clock(sched_clock);
+
+ old_save_sched_clock_state = x86_platform.save_sched_clock_state;
+ x86_platform.save_sched_clock_state = hv_save_sched_clock_state;
+
+ old_restore_sched_clock_state = x86_platform.restore_sched_clock_state;
+ x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state;
}
#else /* !CONFIG_GENERIC_SCHED_CLOCK && !CONFIG_PARAVIRT */
static __always_inline void hv_setup_sched_clock(void *sched_clock) {}
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 26/51] clocksource: hyper-v: Drop wrappers to sched_clock save/restore helpers
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (24 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 25/51] clocksource: hyper-v: Register sched_clock save/restore iff it's necessary Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 27/51] clocksource: hyper-v: Don't save/restore TSC offset when using HV sched_clock Sean Christopherson
` (24 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Now that all of the Hyper-V reference counter sched_clock code is located
in a single file, drop the superfluous wrappers for the save/restore flows.
No functional change intended.
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>
Acked-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
drivers/clocksource/hyperv_timer.c | 34 +++++-------------------------
include/clocksource/hyperv_timer.h | 2 --
2 files changed, 5 insertions(+), 31 deletions(-)
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index 4293173c3a27..daa8cbfe61ee 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -488,17 +488,6 @@ static void resume_hv_clock_tsc(struct clocksource *arg)
hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
}
-/*
- * Called during resume from hibernation, from overridden
- * x86_platform.restore_sched_clock_state routine. This is to adjust offsets
- * used to calculate time for hv tsc page based sched_clock, to account for
- * time spent before hibernation.
- */
-void hv_adj_sched_clock_offset(u64 offset)
-{
- hv_sched_clock_offset -= offset;
-}
-
#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK
static int hv_cs_enable(struct clocksource *cs)
{
@@ -565,12 +554,14 @@ static void (*old_restore_sched_clock_state)(void);
* based clocksource, proceeds from where it left off during suspend and
* it shows correct time for the timestamps of kernel messages after resume.
*/
-static void save_hv_clock_tsc_state(void)
+static void hv_save_sched_clock_state(void)
{
+ old_save_sched_clock_state();
+
hv_ref_counter_at_suspend = hv_read_reference_counter();
}
-static void restore_hv_clock_tsc_state(void)
+static void hv_restore_sched_clock_state(void)
{
/*
* Adjust the offsets used by hv tsc clocksource to
@@ -578,23 +569,8 @@ static void restore_hv_clock_tsc_state(void)
* adjusted value = reference counter (time) at suspend
* - reference counter (time) now.
*/
- hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter());
-}
-/*
- * Functions to override save_sched_clock_state and restore_sched_clock_state
- * functions of x86_platform. The Hyper-V clock counter is reset during
- * suspend-resume and the offset used to measure time needs to be
- * corrected, post resume.
- */
-static void hv_save_sched_clock_state(void)
-{
- old_save_sched_clock_state();
- save_hv_clock_tsc_state();
-}
+ hv_sched_clock_offset -= (hv_ref_counter_at_suspend - hv_read_reference_counter());
-static void hv_restore_sched_clock_state(void)
-{
- restore_hv_clock_tsc_state();
old_restore_sched_clock_state();
}
diff --git a/include/clocksource/hyperv_timer.h b/include/clocksource/hyperv_timer.h
index d48dd4176fd3..a4c81a60f53d 100644
--- a/include/clocksource/hyperv_timer.h
+++ b/include/clocksource/hyperv_timer.h
@@ -38,8 +38,6 @@ extern void hv_remap_tsc_clocksource(void);
extern unsigned long hv_get_tsc_pfn(void);
extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void);
-extern void hv_adj_sched_clock_offset(u64 offset);
-
static __always_inline bool
hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
u64 *cur_tsc, u64 *time)
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 27/51] clocksource: hyper-v: Don't save/restore TSC offset when using HV sched_clock
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (25 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 26/51] clocksource: hyper-v: Drop wrappers to sched_clock save/restore helpers Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 28/51] x86/kvmclock: Setup kvmclock for secondary CPUs iff CONFIG_SMP=y Sean Christopherson
` (23 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Now that Hyper-V overrides the sched_clock save/restore hooks if and only
if sched_clock itself is set to the Hyper-V reference counter, drop the
invocation of the "old" save/restore callbacks. When the registration of
the PV sched_clock was done separately from overriding the save/restore
hooks, it was possible for Hyper-V to clobber the TSC save/restore
callbacks without actually switching to the Hyper-V refcounter.
Enabling a PV sched_clock is a one-way street, i.e. the kernel will never
revert to using TSC for sched_clock, and so there is no need to invoke the
TSC save/restore hooks (and if there was, it belongs in common PV code).
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Acked-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
drivers/clocksource/hyperv_timer.c | 10 ----------
1 file changed, 10 deletions(-)
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index daa8cbfe61ee..220668207d19 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -544,9 +544,6 @@ static __always_inline void hv_setup_sched_clock(void *sched_clock)
#include <asm/timer.h>
static u64 hv_ref_counter_at_suspend;
-static void (*old_save_sched_clock_state)(void);
-static void (*old_restore_sched_clock_state)(void);
-
/*
* Hyper-V clock counter resets during hibernation. Save and restore clock
* offset during suspend/resume, while also considering the time passed
@@ -556,8 +553,6 @@ static void (*old_restore_sched_clock_state)(void);
*/
static void hv_save_sched_clock_state(void)
{
- old_save_sched_clock_state();
-
hv_ref_counter_at_suspend = hv_read_reference_counter();
}
@@ -570,8 +565,6 @@ static void hv_restore_sched_clock_state(void)
* - reference counter (time) now.
*/
hv_sched_clock_offset -= (hv_ref_counter_at_suspend - hv_read_reference_counter());
-
- old_restore_sched_clock_state();
}
static __always_inline void hv_setup_sched_clock(void *sched_clock)
@@ -579,10 +572,7 @@ static __always_inline void hv_setup_sched_clock(void *sched_clock)
/* We're on x86/x64 *and* using PV ops */
paravirt_set_sched_clock(sched_clock);
- old_save_sched_clock_state = x86_platform.save_sched_clock_state;
x86_platform.save_sched_clock_state = hv_save_sched_clock_state;
-
- old_restore_sched_clock_state = x86_platform.restore_sched_clock_state;
x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state;
}
#else /* !CONFIG_GENERIC_SCHED_CLOCK && !CONFIG_PARAVIRT */
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 28/51] x86/kvmclock: Setup kvmclock for secondary CPUs iff CONFIG_SMP=y
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (26 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 27/51] clocksource: hyper-v: Don't save/restore TSC offset when using HV sched_clock Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 29/51] x86/kvm: Don't disable kvmclock on BSP in syscore_suspend() Sean Christopherson
` (22 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Gate kvmclock's secondary CPU code on CONFIG_SMP, not CONFIG_X86_LOCAL_APIC.
Originally, kvmclock piggybacked on PV APIC ops to set up secondary CPUs.
When that wart was fixed by commit df156f90a0f9 ("x86: Introduce
x86_cpuinit.early_percpu_clock_init hook"), the dependency on a local APIC
got carried forward unnecessarily.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/kvmclock.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 2e7ab54cb9dc..b0c871ba8232 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -208,7 +208,7 @@ static void kvm_restore_sched_clock_state(void)
kvm_register_clock("primary cpu clock, resume");
}
-#ifdef CONFIG_X86_LOCAL_APIC
+#ifdef CONFIG_SMP
static void kvm_setup_secondary_clock(void)
{
kvm_register_clock("secondary cpu clock");
@@ -348,7 +348,7 @@ void __init kvmclock_init(bool prefer_tsc)
x86_init.hyper.get_cpu_khz = kvmclock_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
-#ifdef CONFIG_X86_LOCAL_APIC
+#ifdef CONFIG_SMP
x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
#endif
x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 29/51] x86/kvm: Don't disable kvmclock on BSP in syscore_suspend()
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (27 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 28/51] x86/kvmclock: Setup kvmclock for secondary CPUs iff CONFIG_SMP=y Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 20:03 ` sashiko-bot
2026-07-01 19:31 ` [PATCH v5 30/51] x86/paravirt: Remove unnecessary PARAVIRT=n stub for paravirt_set_sched_clock() Sean Christopherson
` (21 subsequent siblings)
50 siblings, 1 reply; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Don't disable kvmclock on the BSP during syscore_suspend(), as the BSP's
clock is NOT restored during syscore_resume(), but is instead restored
earlier via the sched_clock restore callback. If suspend is aborted, e.g.
due to a late wakeup, the BSP will run without its clock enabled, which
"works" only because KVM-the-hypervisor is kind enough to not clobber the
shared memory when the clock is disabled. But over time, the BSP's view
of time will drift from APs.
Plumb in an "action" to KVM-as-a-guest and kvmclock code in preparation
for additional cleanups to kvmclock's suspend/resume logic.
Fixes: c02027b5742b ("x86/kvm: Disable kvmclock on all CPUs on shutdown")
Cc: stable@vger.kernel.org
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/kvm_para.h | 8 +++++++-
arch/x86/kernel/kvm.c | 15 ++++++++-------
arch/x86/kernel/kvmclock.c | 31 +++++++++++++++++++++++++------
3 files changed, 40 insertions(+), 14 deletions(-)
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 4a49fc286b4c..08686ff19caa 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -118,8 +118,14 @@ static inline long kvm_sev_hypercall3(unsigned int nr, unsigned long p1,
}
#ifdef CONFIG_KVM_GUEST
+enum kvm_guest_cpu_action {
+ KVM_GUEST_BSP_SUSPEND,
+ KVM_GUEST_AP_OFFLINE,
+ KVM_GUEST_SHUTDOWN,
+};
+
void kvmclock_init(bool prefer_tsc);
-void kvmclock_disable(void);
+void kvmclock_cpu_action(enum kvm_guest_cpu_action action);
bool kvm_para_available(void);
unsigned int kvm_arch_para_features(void);
unsigned int kvm_arch_para_hints(void);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 6c7011ff7bd1..604b52f233aa 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -460,7 +460,7 @@ static void __init sev_map_percpu_data(void)
}
}
-static void kvm_guest_cpu_offline(bool shutdown)
+static void kvm_guest_cpu_offline(enum kvm_guest_cpu_action action)
{
kvm_disable_steal_time();
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@ -468,9 +468,10 @@ static void kvm_guest_cpu_offline(bool shutdown)
if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
wrmsrq(MSR_KVM_MIGRATION_CONTROL, 0);
kvm_pv_disable_apf();
- if (!shutdown)
+ if (action != KVM_GUEST_SHUTDOWN)
apf_task_wake_all();
- kvmclock_disable();
+
+ kvmclock_cpu_action(action);
}
static int kvm_cpu_online(unsigned int cpu)
@@ -726,7 +727,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
unsigned long flags;
local_irq_save(flags);
- kvm_guest_cpu_offline(false);
+ kvm_guest_cpu_offline(KVM_GUEST_AP_OFFLINE);
local_irq_restore(flags);
return 0;
}
@@ -737,7 +738,7 @@ static int kvm_suspend(void *data)
{
u64 val = 0;
- kvm_guest_cpu_offline(false);
+ kvm_guest_cpu_offline(KVM_GUEST_BSP_SUSPEND);
#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
@@ -768,7 +769,7 @@ static struct syscore kvm_syscore = {
static void kvm_pv_guest_cpu_reboot(void *unused)
{
- kvm_guest_cpu_offline(true);
+ kvm_guest_cpu_offline(KVM_GUEST_SHUTDOWN);
}
static int kvm_pv_reboot_notify(struct notifier_block *nb,
@@ -792,7 +793,7 @@ static struct notifier_block kvm_pv_reboot_nb = {
#ifdef CONFIG_CRASH_DUMP
static void kvm_crash_shutdown(struct pt_regs *regs)
{
- kvm_guest_cpu_offline(true);
+ kvm_guest_cpu_offline(KVM_GUEST_SHUTDOWN);
native_machine_crash_shutdown(regs);
}
#endif
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index b0c871ba8232..a3ec298d56d7 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -199,8 +199,22 @@ static void kvm_register_clock(char *txt)
pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
}
+static void kvmclock_disable(void)
+{
+ if (msr_kvm_system_time)
+ native_write_msr(msr_kvm_system_time, 0);
+}
+
static void kvm_save_sched_clock_state(void)
{
+ /*
+ * Stop host writes to kvmclock immediately prior to suspend/hibernate.
+ * If the system is hibernating, then kvmclock will likely reside at a
+ * different physical address when the system awakens, and host writes
+ * to the old address prior to reconfiguring kvmclock would clobber
+ * random memory.
+ */
+ kvmclock_disable();
}
static void kvm_restore_sched_clock_state(void)
@@ -208,6 +222,17 @@ static void kvm_restore_sched_clock_state(void)
kvm_register_clock("primary cpu clock, resume");
}
+void kvmclock_cpu_action(enum kvm_guest_cpu_action action)
+{
+ /*
+ * Don't disable kvmclock on the BSP during suspend. If kvmclock is
+ * being used for sched_clock, then it needs to be kept alive until the
+ * last minute, and restored as quickly as possible after resume.
+ */
+ if (action != KVM_GUEST_BSP_SUSPEND)
+ kvmclock_disable();
+}
+
#ifdef CONFIG_SMP
static void kvm_setup_secondary_clock(void)
{
@@ -215,12 +240,6 @@ static void kvm_setup_secondary_clock(void)
}
#endif
-void kvmclock_disable(void)
-{
- if (msr_kvm_system_time)
- native_write_msr(msr_kvm_system_time, 0);
-}
-
static void __init kvmclock_init_mem(void)
{
unsigned long ncpus;
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* Re: [PATCH v5 29/51] x86/kvm: Don't disable kvmclock on BSP in syscore_suspend()
2026-07-01 19:31 ` [PATCH v5 29/51] x86/kvm: Don't disable kvmclock on BSP in syscore_suspend() Sean Christopherson
@ 2026-07-01 20:03 ` sashiko-bot
2026-07-01 20:43 ` Sean Christopherson
0 siblings, 1 reply; 72+ messages in thread
From: sashiko-bot @ 2026-07-01 20:03 UTC (permalink / raw)
To: Sean Christopherson; +Cc: kvm, linux-hyperv
Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
New issues:
- [High] The patch inadvertently leaves the BSP's kvmclock enabled when powering off the machine for ACPI S4 hibernation, leading to potential memory corruption when the VM awakens.
Pre-existing issues:
- [Critical] Stack-allocated waitqueue head use-after-free in async page fault handling.
--
Note: The standard inline diff format has been altered to a free-form
summary due to patch recitation restrictions.
commit 03858970f2d7a88b91bdd5b16c1573166126d3c4
Author: Sean Christopherson <seanjc@google.com>
Subject: x86/kvm: Don't disable kvmclock on BSP in syscore_suspend()
This patch modifies the guest suspend logic so that kvmclock is not disabled
on the BSP during syscore_suspend(), relying instead on the sched_clock
restore callback.
[Severity: High]
Does this change inadvertently leave the BSP's kvmclock enabled when
powering off the machine for ACPI S4 hibernation?
Looking at the final S4 power-off path in hibernation_platform_enter() in
kernel/power/hibernate.c, it calls syscore_suspend() but does not call
save_processor_state(). Because this patch removes the unconditional BSP
kvmclock_disable() from syscore_suspend() and relies on
kvm_save_sched_clock_state() (which only fires during image creation),
kvmclock appears to remain fully enabled during the S4 power-off.
Could this lead to memory corruption when the VM awakens, since the
hypervisor might continue writing to the old kvmclock address?
[Severity: Critical]
This isn't a bug introduced by this patch, but is there a use-after-free
risk with the stack-allocated waitqueue head in the async page fault
handling in arch/x86/kernel/kvm.c?
When a guest task waits for an async page fault in
kvm_async_pf_task_wait_schedule(), it waits on a stack-allocated struct
kvm_task_sleep_node. Concurrently, when a waker CPU executes
apf_task_wake_one(), it unhashes the node via hlist_del_init().
The waiter checks hlist_unhashed() locklessly. If it observes the unhashed
state before the waker is finished, the waiter can break its loop, return,
and destroy the stack frame. The waker might then dereference the
deallocated waitqueue head to call swake_up_one(), potentially corrupting
stack memory.
Can this sequence occur during concurrent page fault completions?
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260701193212.749551-1-seanjc@google.com?part=29
^ permalink raw reply [flat|nested] 72+ messages in thread
* Re: [PATCH v5 29/51] x86/kvm: Don't disable kvmclock on BSP in syscore_suspend()
2026-07-01 20:03 ` sashiko-bot
@ 2026-07-01 20:43 ` Sean Christopherson
0 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 20:43 UTC (permalink / raw)
To: sashiko-reviews; +Cc: kvm, linux-hyperv
On Wed, Jul 01, 2026, sashiko-bot@kernel.org wrote:
> [Severity: High]
> Does this change inadvertently leave the BSP's kvmclock enabled when
> powering off the machine for ACPI S4 hibernation?
>
> Looking at the final S4 power-off path in hibernation_platform_enter() in
> kernel/power/hibernate.c, it calls syscore_suspend() but does not call
> save_processor_state(). Because this patch removes the unconditional BSP
> kvmclock_disable() from syscore_suspend() and relies on
> kvm_save_sched_clock_state() (which only fires during image creation),
> kvmclock appears to remain fully enabled during the S4 power-off.
No?
hibernate()
|
-> hibernation_snapshot()
| |
| -> create_image()
| |
| -> save_processor_state()
|
-> power_down()
|
-> hibernation_platform_enter()
I'm not familiar with the /dev/snapshot interface, but I gotta imagine userspace
needs to do SNAPSHOT_CREATE_IMAGE before SNAPSHOT_POWER_OFF if it actually wants
to restore state after S4.
> [Severity: Critical]
> This isn't a bug introduced by this patch, but is there a use-after-free
> risk with the stack-allocated waitqueue head in the async page fault
> handling in arch/x86/kernel/kvm.c?
And I thought I had a problem with scope creep during review...
> When a guest task waits for an async page fault in
> kvm_async_pf_task_wait_schedule(), it waits on a stack-allocated struct
> kvm_task_sleep_node. Concurrently, when a waker CPU executes
> apf_task_wake_one(), it unhashes the node via hlist_del_init().
>
> The waiter checks hlist_unhashed() locklessly. If it observes the unhashed
> state before the waker is finished, the waiter can break its loop, return,
> and destroy the stack frame. The waker might then dereference the
> deallocated waitqueue head to call swake_up_one(), potentially corrupting
> stack memory.
>
> Can this sequence occur during concurrent page fault completions?
No idea, definitely something to sort out at a different time.
^ permalink raw reply [flat|nested] 72+ messages in thread
* [PATCH v5 30/51] x86/paravirt: Remove unnecessary PARAVIRT=n stub for paravirt_set_sched_clock()
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (28 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 29/51] x86/kvm: Don't disable kvmclock on BSP in syscore_suspend() Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 31/51] x86/paravirt: Move handling of unstable PV clocks into paravirt_set_sched_clock() Sean Christopherson
` (20 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Remove the unnecessary paravirt_set_sched_clock() stub for PARAVIRT=n, as
all callers are gated by PARAVIRT=y. Eliminating the stub will avoid a
pile of pointless churn as the "real" implementation evolves.
No functional change intended.
Fixes: 39965afb1151 ("x86/paravirt: Move paravirt_sched_clock() related code into tsc.c")
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/timer.h | 3 +++
arch/x86/kernel/tsc.c | 1 -
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index fda18bcb19b4..c71b466d6ace 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -12,7 +12,10 @@ extern void recalibrate_cpu_khz(void);
extern int no_timer_check;
extern bool using_native_sched_clock(void);
+
+#ifdef CONFIG_PARAVIRT
void paravirt_set_sched_clock(u64 (*func)(void));
+#endif
/*
* We use the full linear equation: f(x) = a + b*x, in order to allow
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 56e73e96920a..375b0279df66 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -288,7 +288,6 @@ void paravirt_set_sched_clock(u64 (*func)(void))
u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
bool using_native_sched_clock(void) { return true; }
-void paravirt_set_sched_clock(u64 (*func)(void)) { }
#endif
notrace u64 sched_clock(void)
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 31/51] x86/paravirt: Move handling of unstable PV clocks into paravirt_set_sched_clock()
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (29 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 30/51] x86/paravirt: Remove unnecessary PARAVIRT=n stub for paravirt_set_sched_clock() Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 32/51] x86/kvmclock: Move sched_clock save/restore helpers up in kvmclock.c Sean Christopherson
` (19 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Move the handling of unstable PV clocks, of which kvmclock is the only
example, into paravirt_set_sched_clock(). This will allow modifying
paravirt_set_sched_clock() to keep using the TSC for sched_clock in
certain scenarios without unintentionally marking the TSC-based clock as
unstable.
No functional change intended.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/timer.h | 7 ++++++-
arch/x86/kernel/kvmclock.c | 5 +----
arch/x86/kernel/tsc.c | 5 ++++-
3 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index c71b466d6ace..fe41d40a9ae6 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -14,7 +14,12 @@ extern int no_timer_check;
extern bool using_native_sched_clock(void);
#ifdef CONFIG_PARAVIRT
-void paravirt_set_sched_clock(u64 (*func)(void));
+void __paravirt_set_sched_clock(u64 (*func)(void), bool stable);
+
+static inline void paravirt_set_sched_clock(u64 (*func)(void))
+{
+ __paravirt_set_sched_clock(func, true);
+}
#endif
/*
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index a3ec298d56d7..4bc0495f1f9e 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -12,7 +12,6 @@
#include <linux/hardirq.h>
#include <linux/cpuhotplug.h>
#include <linux/sched.h>
-#include <linux/sched/clock.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/set_memory.h>
@@ -115,10 +114,8 @@ static noinstr u64 kvm_sched_clock_read(void)
static inline void kvm_sched_clock_init(bool stable)
{
- if (!stable)
- clear_sched_clock_stable();
kvm_sched_clock_offset = kvm_clock_read();
- paravirt_set_sched_clock(kvm_sched_clock_read);
+ __paravirt_set_sched_clock(kvm_sched_clock_read, stable);
pr_info("kvm-clock: using sched offset of %llu cycles",
kvm_sched_clock_offset);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 375b0279df66..a762cb5cec0f 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -280,8 +280,11 @@ bool using_native_sched_clock(void)
return static_call_query(pv_sched_clock) == native_sched_clock;
}
-void paravirt_set_sched_clock(u64 (*func)(void))
+void __paravirt_set_sched_clock(u64 (*func)(void), bool stable)
{
+ if (!stable)
+ clear_sched_clock_stable();
+
static_call_update(pv_sched_clock, func);
}
#else
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 32/51] x86/kvmclock: Move sched_clock save/restore helpers up in kvmclock.c
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (30 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 31/51] x86/paravirt: Move handling of unstable PV clocks into paravirt_set_sched_clock() Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 33/51] x86/xen/time: NOP-ify x86_platform's sched_clock save/restore hooks Sean Christopherson
` (18 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Move kvmclock's sched_clock save/restore helpers "up" so that they can
(eventually) be referenced by kvm_sched_clock_init().
No functional change intended.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/kvmclock.c | 108 ++++++++++++++++++-------------------
1 file changed, 54 insertions(+), 54 deletions(-)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 4bc0495f1f9e..07e875738c39 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -71,6 +71,25 @@ static int kvm_set_wallclock(const struct timespec64 *now)
return -ENODEV;
}
+static void kvm_register_clock(char *txt)
+{
+ struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
+ u64 pa;
+
+ if (!src)
+ return;
+
+ pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;
+ wrmsrq(msr_kvm_system_time, pa);
+ pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
+}
+
+static void kvmclock_disable(void)
+{
+ if (msr_kvm_system_time)
+ native_write_msr(msr_kvm_system_time, 0);
+}
+
static u64 kvm_clock_read(void)
{
u64 ret;
@@ -112,6 +131,30 @@ static noinstr u64 kvm_sched_clock_read(void)
return pvclock_clocksource_read_nowd(this_cpu_pvti()) - kvm_sched_clock_offset;
}
+static void kvm_save_sched_clock_state(void)
+{
+ /*
+ * Stop host writes to kvmclock immediately prior to suspend/hibernate.
+ * If the system is hibernating, then kvmclock will likely reside at a
+ * different physical address when the system awakens, and host writes
+ * to the old address prior to reconfiguring kvmclock would clobber
+ * random memory.
+ */
+ kvmclock_disable();
+}
+
+#ifdef CONFIG_SMP
+static void kvm_setup_secondary_clock(void)
+{
+ kvm_register_clock("secondary cpu clock");
+}
+#endif
+
+static void kvm_restore_sched_clock_state(void)
+{
+ kvm_register_clock("primary cpu clock, resume");
+}
+
static inline void kvm_sched_clock_init(bool stable)
{
kvm_sched_clock_offset = kvm_clock_read();
@@ -124,6 +167,17 @@ static inline void kvm_sched_clock_init(bool stable)
sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
}
+void kvmclock_cpu_action(enum kvm_guest_cpu_action action)
+{
+ /*
+ * Don't disable kvmclock on the BSP during suspend. If kvmclock is
+ * being used for sched_clock, then it needs to be kept alive until the
+ * last minute, and restored as quickly as possible after resume.
+ */
+ if (action != KVM_GUEST_BSP_SUSPEND)
+ kvmclock_disable();
+}
+
/*
* If we don't do that, there is the possibility that the guest
* will calibrate under heavy load - thus, getting a lower lpj -
@@ -183,60 +237,6 @@ static struct clocksource kvm_clock = {
.enable = kvm_cs_enable,
};
-static void kvm_register_clock(char *txt)
-{
- struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
- u64 pa;
-
- if (!src)
- return;
-
- pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;
- wrmsrq(msr_kvm_system_time, pa);
- pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
-}
-
-static void kvmclock_disable(void)
-{
- if (msr_kvm_system_time)
- native_write_msr(msr_kvm_system_time, 0);
-}
-
-static void kvm_save_sched_clock_state(void)
-{
- /*
- * Stop host writes to kvmclock immediately prior to suspend/hibernate.
- * If the system is hibernating, then kvmclock will likely reside at a
- * different physical address when the system awakens, and host writes
- * to the old address prior to reconfiguring kvmclock would clobber
- * random memory.
- */
- kvmclock_disable();
-}
-
-static void kvm_restore_sched_clock_state(void)
-{
- kvm_register_clock("primary cpu clock, resume");
-}
-
-void kvmclock_cpu_action(enum kvm_guest_cpu_action action)
-{
- /*
- * Don't disable kvmclock on the BSP during suspend. If kvmclock is
- * being used for sched_clock, then it needs to be kept alive until the
- * last minute, and restored as quickly as possible after resume.
- */
- if (action != KVM_GUEST_BSP_SUSPEND)
- kvmclock_disable();
-}
-
-#ifdef CONFIG_SMP
-static void kvm_setup_secondary_clock(void)
-{
- kvm_register_clock("secondary cpu clock");
-}
-#endif
-
static void __init kvmclock_init_mem(void)
{
unsigned long ncpus;
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 33/51] x86/xen/time: NOP-ify x86_platform's sched_clock save/restore hooks
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (31 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 32/51] x86/kvmclock: Move sched_clock save/restore helpers up in kvmclock.c Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 34/51] x86/vmware: NOP-ify save/restore hooks when using VMware's sched_clock Sean Christopherson
` (17 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
NOP-ify the x86_platform sched_clock save/restore hooks when setting up
Xen's PV clock to make it somewhat obvious the hooks aren't used when
running as a Xen guest (Xen uses a paravirtualized suspend/resume flow).
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/xen/time.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 487ad838c441..477441752f40 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -567,6 +567,12 @@ static void __init xen_init_time_common(void)
xen_sched_clock_offset = xen_clocksource_read();
static_call_update(pv_steal_clock, xen_steal_clock);
paravirt_set_sched_clock(xen_sched_clock);
+ /*
+ * Xen has paravirtualized suspend/resume and so doesn't use the common
+ * x86 sched_clock save/restore hooks.
+ */
+ x86_platform.save_sched_clock_state = x86_init_noop;
+ x86_platform.restore_sched_clock_state = x86_init_noop;
x86_init.hyper.get_tsc_khz = xen_tsc_khz;
x86_platform.get_wallclock = xen_get_wallclock;
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 34/51] x86/vmware: NOP-ify save/restore hooks when using VMware's sched_clock
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (32 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 33/51] x86/xen/time: NOP-ify x86_platform's sched_clock save/restore hooks Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 35/51] x86/tsc: WARN if TSC sched_clock save/restore used with PV sched_clock Sean Christopherson
` (16 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
NOP-ify the sched_clock save/restore hooks when using VMware's version of
sched_clock. This will allow extending paravirt_set_sched_clock() to set
the save/restore hooks, without having to simultaneously change the
behavior of VMware guests.
Note, it's not at all obvious that it's safe/correct for VMware guests to
do nothing on suspend/resume, but that's a pre-existing problem. Leave it
for a VMware expert to sort out.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/cpu/vmware.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 0a3bd90576d4..5c1ccaf4a25e 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -347,8 +347,11 @@ static void __init vmware_paravirt_ops_setup(void)
vmware_cyc2ns_setup();
- if (vmw_sched_clock)
+ if (vmw_sched_clock) {
paravirt_set_sched_clock(vmware_sched_clock);
+ x86_platform.save_sched_clock_state = x86_init_noop;
+ x86_platform.restore_sched_clock_state = x86_init_noop;
+ }
if (vmware_is_stealclock_available()) {
has_steal_clock = true;
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 35/51] x86/tsc: WARN if TSC sched_clock save/restore used with PV sched_clock
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (33 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 34/51] x86/vmware: NOP-ify save/restore hooks when using VMware's sched_clock Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 36/51] x86/paravirt: Pass sched_clock save/restore helpers during registration Sean Christopherson
` (15 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Now that all PV clocksources override the sched_clock save/restore hooks
when overriding sched_clock, WARN if the "default" TSC hooks are invoked
when using a PV sched_clock, e.g. to guard against regressions.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/tsc.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a762cb5cec0f..7473dcab4775 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -947,9 +947,17 @@ EXPORT_SYMBOL_FOR_MODULES(recalibrate_cpu_khz, "p4-clockmod,powernow-k7");
static unsigned long long cyc2ns_suspend;
+static __always_inline bool tsc_is_save_restore_needed(void)
+{
+ if (WARN_ON_ONCE(!using_native_sched_clock()))
+ return false;
+
+ return static_branch_likely(&__use_tsc) || sched_clock_stable();
+}
+
void tsc_save_sched_clock_state(void)
{
- if (!static_branch_likely(&__use_tsc) && !sched_clock_stable())
+ if (!tsc_is_save_restore_needed())
return;
cyc2ns_suspend = sched_clock();
@@ -969,7 +977,7 @@ void tsc_restore_sched_clock_state(void)
unsigned long flags;
int cpu;
- if (!static_branch_likely(&__use_tsc) && !sched_clock_stable())
+ if (!tsc_is_save_restore_needed())
return;
local_irq_save(flags);
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 36/51] x86/paravirt: Pass sched_clock save/restore helpers during registration
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (34 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 35/51] x86/tsc: WARN if TSC sched_clock save/restore used with PV sched_clock Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-02 17:48 ` Michael Kelley
2026-07-01 19:31 ` [PATCH v5 37/51] x86/kvmclock: Move kvm_sched_clock_init() down in kvmclock.c Sean Christopherson
` (14 subsequent siblings)
50 siblings, 1 reply; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Pass in a PV clock's save/restore helpers when configuring sched_clock
instead of relying on each PV clock to manually set the save/restore hooks.
In addition to bringing sanity to the code, this will allow gracefully
"rejecting" a PV sched_clock, e.g. when running as a CoCo guest that has
access to a "secure" TSC.
No functional change intended.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/timer.h | 9 ++++++---
arch/x86/kernel/cpu/vmware.c | 8 +++-----
arch/x86/kernel/kvmclock.c | 6 +++---
arch/x86/kernel/tsc.c | 5 ++++-
arch/x86/xen/time.c | 5 ++---
drivers/clocksource/hyperv_timer.c | 6 ++----
6 files changed, 20 insertions(+), 19 deletions(-)
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index fe41d40a9ae6..e97cd1ae03d1 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -14,11 +14,14 @@ extern int no_timer_check;
extern bool using_native_sched_clock(void);
#ifdef CONFIG_PARAVIRT
-void __paravirt_set_sched_clock(u64 (*func)(void), bool stable);
+void __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
+ void (*save)(void), void (*restore)(void));
-static inline void paravirt_set_sched_clock(u64 (*func)(void))
+static inline void paravirt_set_sched_clock(u64 (*func)(void),
+ void (*save)(void),
+ void (*restore)(void))
{
- __paravirt_set_sched_clock(func, true);
+ __paravirt_set_sched_clock(func, true, save, restore);
}
#endif
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 5c1ccaf4a25e..232255279a6e 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -347,11 +347,9 @@ static void __init vmware_paravirt_ops_setup(void)
vmware_cyc2ns_setup();
- if (vmw_sched_clock) {
- paravirt_set_sched_clock(vmware_sched_clock);
- x86_platform.save_sched_clock_state = x86_init_noop;
- x86_platform.restore_sched_clock_state = x86_init_noop;
- }
+ if (vmw_sched_clock)
+ paravirt_set_sched_clock(vmware_sched_clock,
+ x86_init_noop, x86_init_noop);
if (vmware_is_stealclock_available()) {
has_steal_clock = true;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 07e875738c39..5b9955343199 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -158,7 +158,9 @@ static void kvm_restore_sched_clock_state(void)
static inline void kvm_sched_clock_init(bool stable)
{
kvm_sched_clock_offset = kvm_clock_read();
- __paravirt_set_sched_clock(kvm_sched_clock_read, stable);
+ __paravirt_set_sched_clock(kvm_sched_clock_read, stable,
+ kvm_save_sched_clock_state,
+ kvm_restore_sched_clock_state);
pr_info("kvm-clock: using sched offset of %llu cycles",
kvm_sched_clock_offset);
@@ -367,8 +369,6 @@ void __init kvmclock_init(bool prefer_tsc)
#ifdef CONFIG_SMP
x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
#endif
- x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
- x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
kvm_get_preset_lpj();
/*
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7473dcab4775..83353d643150 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -280,12 +280,15 @@ bool using_native_sched_clock(void)
return static_call_query(pv_sched_clock) == native_sched_clock;
}
-void __paravirt_set_sched_clock(u64 (*func)(void), bool stable)
+void __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
+ void (*save)(void), void (*restore)(void))
{
if (!stable)
clear_sched_clock_stable();
static_call_update(pv_sched_clock, func);
+ x86_platform.save_sched_clock_state = save;
+ x86_platform.restore_sched_clock_state = restore;
}
#else
u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 477441752f40..8cd8bfaf1320 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -566,13 +566,12 @@ static void __init xen_init_time_common(void)
{
xen_sched_clock_offset = xen_clocksource_read();
static_call_update(pv_steal_clock, xen_steal_clock);
- paravirt_set_sched_clock(xen_sched_clock);
+
/*
* Xen has paravirtualized suspend/resume and so doesn't use the common
* x86 sched_clock save/restore hooks.
*/
- x86_platform.save_sched_clock_state = x86_init_noop;
- x86_platform.restore_sched_clock_state = x86_init_noop;
+ paravirt_set_sched_clock(xen_sched_clock, x86_init_noop, x86_init_noop);
x86_init.hyper.get_tsc_khz = xen_tsc_khz;
x86_platform.get_wallclock = xen_get_wallclock;
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index 220668207d19..8ee7a9de0f4f 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -570,10 +570,8 @@ static void hv_restore_sched_clock_state(void)
static __always_inline void hv_setup_sched_clock(void *sched_clock)
{
/* We're on x86/x64 *and* using PV ops */
- paravirt_set_sched_clock(sched_clock);
-
- x86_platform.save_sched_clock_state = hv_save_sched_clock_state;
- x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state;
+ paravirt_set_sched_clock(sched_clock, hv_save_sched_clock_state,
+ hv_restore_sched_clock_state);
}
#else /* !CONFIG_GENERIC_SCHED_CLOCK && !CONFIG_PARAVIRT */
static __always_inline void hv_setup_sched_clock(void *sched_clock) {}
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* RE: [PATCH v5 36/51] x86/paravirt: Pass sched_clock save/restore helpers during registration
2026-07-01 19:31 ` [PATCH v5 36/51] x86/paravirt: Pass sched_clock save/restore helpers during registration Sean Christopherson
@ 2026-07-02 17:48 ` Michael Kelley
0 siblings, 0 replies; 72+ messages in thread
From: Michael Kelley @ 2026-07-02 17:48 UTC (permalink / raw)
To: Sean Christopherson, Jonathan Corbet, Paolo Bonzini,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
x86@kernel.org, Kiryl Shutsemau, Rick Edgecombe, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc@vger.kernel.org, kvm@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-coco@lists.linux.dev,
linux-hyperv@vger.kernel.org, virtualization@lists.linux.dev,
xen-devel@lists.xenproject.org, Tom Lendacky, Nikunj A Dadhania,
David Woodhouse, David Woodhouse, Michael Kelley, Thomas Gleixner
From: Sean Christopherson <seanjc@google.com> Sent: Wednesday, July 1, 2026 12:32 PM
>
> Pass in a PV clock's save/restore helpers when configuring sched_clock
> instead of relying on each PV clock to manually set the save/restore hooks.
> In addition to bringing sanity to the code, this will allow gracefully
> "rejecting" a PV sched_clock, e.g. when running as a CoCo guest that has
> access to a "secure" TSC.
>
> No functional change intended.
>
> Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
For the Hyper-V changes,
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
> ---
> arch/x86/include/asm/timer.h | 9 ++++++---
> arch/x86/kernel/cpu/vmware.c | 8 +++-----
> arch/x86/kernel/kvmclock.c | 6 +++---
> arch/x86/kernel/tsc.c | 5 ++++-
> arch/x86/xen/time.c | 5 ++---
> drivers/clocksource/hyperv_timer.c | 6 ++----
> 6 files changed, 20 insertions(+), 19 deletions(-)
>
> diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
> index fe41d40a9ae6..e97cd1ae03d1 100644
> --- a/arch/x86/include/asm/timer.h
> +++ b/arch/x86/include/asm/timer.h
> @@ -14,11 +14,14 @@ extern int no_timer_check;
> extern bool using_native_sched_clock(void);
>
> #ifdef CONFIG_PARAVIRT
> -void __paravirt_set_sched_clock(u64 (*func)(void), bool stable);
> +void __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
> + void (*save)(void), void (*restore)(void));
>
> -static inline void paravirt_set_sched_clock(u64 (*func)(void))
> +static inline void paravirt_set_sched_clock(u64 (*func)(void),
> + void (*save)(void),
> + void (*restore)(void))
> {
> - __paravirt_set_sched_clock(func, true);
> + __paravirt_set_sched_clock(func, true, save, restore);
> }
> #endif
>
> diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
> index 5c1ccaf4a25e..232255279a6e 100644
> --- a/arch/x86/kernel/cpu/vmware.c
> +++ b/arch/x86/kernel/cpu/vmware.c
> @@ -347,11 +347,9 @@ static void __init vmware_paravirt_ops_setup(void)
>
> vmware_cyc2ns_setup();
>
> - if (vmw_sched_clock) {
> - paravirt_set_sched_clock(vmware_sched_clock);
> - x86_platform.save_sched_clock_state = x86_init_noop;
> - x86_platform.restore_sched_clock_state = x86_init_noop;
> - }
> + if (vmw_sched_clock)
> + paravirt_set_sched_clock(vmware_sched_clock,
> + x86_init_noop, x86_init_noop);
>
> if (vmware_is_stealclock_available()) {
> has_steal_clock = true;
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index 07e875738c39..5b9955343199 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -158,7 +158,9 @@ static void kvm_restore_sched_clock_state(void)
> static inline void kvm_sched_clock_init(bool stable)
> {
> kvm_sched_clock_offset = kvm_clock_read();
> - __paravirt_set_sched_clock(kvm_sched_clock_read, stable);
> + __paravirt_set_sched_clock(kvm_sched_clock_read, stable,
> + kvm_save_sched_clock_state,
> + kvm_restore_sched_clock_state);
>
> pr_info("kvm-clock: using sched offset of %llu cycles",
> kvm_sched_clock_offset);
> @@ -367,8 +369,6 @@ void __init kvmclock_init(bool prefer_tsc)
> #ifdef CONFIG_SMP
> x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
> #endif
> - x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
> - x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
> kvm_get_preset_lpj();
>
> /*
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 7473dcab4775..83353d643150 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -280,12 +280,15 @@ bool using_native_sched_clock(void)
> return static_call_query(pv_sched_clock) == native_sched_clock;
> }
>
> -void __paravirt_set_sched_clock(u64 (*func)(void), bool stable)
> +void __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
> + void (*save)(void), void (*restore)(void))
> {
> if (!stable)
> clear_sched_clock_stable();
>
> static_call_update(pv_sched_clock, func);
> + x86_platform.save_sched_clock_state = save;
> + x86_platform.restore_sched_clock_state = restore;
> }
> #else
> u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
> diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
> index 477441752f40..8cd8bfaf1320 100644
> --- a/arch/x86/xen/time.c
> +++ b/arch/x86/xen/time.c
> @@ -566,13 +566,12 @@ static void __init xen_init_time_common(void)
> {
> xen_sched_clock_offset = xen_clocksource_read();
> static_call_update(pv_steal_clock, xen_steal_clock);
> - paravirt_set_sched_clock(xen_sched_clock);
> +
> /*
> * Xen has paravirtualized suspend/resume and so doesn't use the common
> * x86 sched_clock save/restore hooks.
> */
> - x86_platform.save_sched_clock_state = x86_init_noop;
> - x86_platform.restore_sched_clock_state = x86_init_noop;
> + paravirt_set_sched_clock(xen_sched_clock, x86_init_noop, x86_init_noop);
>
> x86_init.hyper.get_tsc_khz = xen_tsc_khz;
> x86_platform.get_wallclock = xen_get_wallclock;
> diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
> index 220668207d19..8ee7a9de0f4f 100644
> --- a/drivers/clocksource/hyperv_timer.c
> +++ b/drivers/clocksource/hyperv_timer.c
> @@ -570,10 +570,8 @@ static void hv_restore_sched_clock_state(void)
> static __always_inline void hv_setup_sched_clock(void *sched_clock)
> {
> /* We're on x86/x64 *and* using PV ops */
> - paravirt_set_sched_clock(sched_clock);
> -
> - x86_platform.save_sched_clock_state = hv_save_sched_clock_state;
> - x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state;
> + paravirt_set_sched_clock(sched_clock, hv_save_sched_clock_state,
> + hv_restore_sched_clock_state);
> }
> #else /* !CONFIG_GENERIC_SCHED_CLOCK && !CONFIG_PARAVIRT */
> static __always_inline void hv_setup_sched_clock(void *sched_clock) {}
> --
> 2.55.0.rc0.799.gd6f94ed593-goog
>
^ permalink raw reply [flat|nested] 72+ messages in thread
* [PATCH v5 37/51] x86/kvmclock: Move kvm_sched_clock_init() down in kvmclock.c
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (35 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 36/51] x86/paravirt: Pass sched_clock save/restore helpers during registration Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:31 ` [PATCH v5 38/51] x86/xen/time: Mark xen_setup_vsyscall_time_info() as __init Sean Christopherson
` (13 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Move kvm_sched_clock_init() "down" so that it can reference the global
kvm_clock structure without needing a forward declaration.
Opportunistically mark the helper as "__init" instead of "inline" to make
its usage more obvious; modern compilers don't need a hint to inline a
single-use function, and an extra CALL+RET pair during boot is a complete
non-issue. And, if the compiler ignores the hint and does NOT inline the
function, the resulting code may not get discarded after boot due to the lack
of an __init annotation.
No functional change intended.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/kvmclock.c | 28 ++++++++++++++--------------
1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 5b9955343199..5220d205abc7 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -155,20 +155,6 @@ static void kvm_restore_sched_clock_state(void)
kvm_register_clock("primary cpu clock, resume");
}
-static inline void kvm_sched_clock_init(bool stable)
-{
- kvm_sched_clock_offset = kvm_clock_read();
- __paravirt_set_sched_clock(kvm_sched_clock_read, stable,
- kvm_save_sched_clock_state,
- kvm_restore_sched_clock_state);
-
- pr_info("kvm-clock: using sched offset of %llu cycles",
- kvm_sched_clock_offset);
-
- BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
- sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
-}
-
void kvmclock_cpu_action(enum kvm_guest_cpu_action action)
{
/*
@@ -325,6 +311,20 @@ static int kvmclock_setup_percpu(unsigned int cpu)
return p ? 0 : -ENOMEM;
}
+static __init void kvm_sched_clock_init(bool stable)
+{
+ kvm_sched_clock_offset = kvm_clock_read();
+ __paravirt_set_sched_clock(kvm_sched_clock_read, stable,
+ kvm_save_sched_clock_state,
+ kvm_restore_sched_clock_state);
+
+ pr_info("kvm-clock: using sched offset of %llu cycles",
+ kvm_sched_clock_offset);
+
+ BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
+ sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
+}
+
void __init kvmclock_init(bool prefer_tsc)
{
u8 flags;
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 38/51] x86/xen/time: Mark xen_setup_vsyscall_time_info() as __init
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (36 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 37/51] x86/kvmclock: Move kvm_sched_clock_init() down in kvmclock.c Sean Christopherson
@ 2026-07-01 19:31 ` Sean Christopherson
2026-07-01 19:32 ` [PATCH v5 39/51] x86/pvclock: Mark setup helpers and related various as __init/__ro_after_init Sean Christopherson
` (12 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:31 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Annotate xen_setup_vsyscall_time_info() as being used only during kernel
initialization; it's called only by xen_time_init(), which is already
tagged __init.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/xen/time.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 8cd8bfaf1320..bc26f00fc53e 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -443,7 +443,7 @@ void xen_restore_time_memory_area(void)
xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved;
}
-static void xen_setup_vsyscall_time_info(void)
+static void __init xen_setup_vsyscall_time_info(void)
{
struct vcpu_register_time_memory_area t;
struct pvclock_vsyscall_time_info *ti;
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 39/51] x86/pvclock: Mark setup helpers and related various as __init/__ro_after_init
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (37 preceding siblings ...)
2026-07-01 19:31 ` [PATCH v5 38/51] x86/xen/time: Mark xen_setup_vsyscall_time_info() as __init Sean Christopherson
@ 2026-07-01 19:32 ` Sean Christopherson
2026-07-01 19:32 ` [PATCH v5 40/51] x86/pvclock: WARN if pvclock's valid_flags are overwritten Sean Christopherson
` (11 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:32 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Now that Xen PV clock and kvmclock explicitly do setup only during init,
tag the common PV clock flags/vsyscall variables with __ro_after_init and
their mutators with __init.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/pvclock.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index b3f81379c2fc..a51adce67f92 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -16,10 +16,10 @@
#include <asm/pvclock.h>
#include <asm/vgtod.h>
-static u8 valid_flags __read_mostly = 0;
-static struct pvclock_vsyscall_time_info *pvti_cpu0_va __read_mostly;
+static u8 valid_flags __ro_after_init = 0;
+static struct pvclock_vsyscall_time_info *pvti_cpu0_va __ro_after_init;
-void pvclock_set_flags(u8 flags)
+void __init pvclock_set_flags(u8 flags)
{
valid_flags = flags;
}
@@ -153,7 +153,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec);
}
-void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti)
+void __init pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti)
{
WARN_ON(vclock_was_used(VDSO_CLOCKMODE_PVCLOCK));
pvti_cpu0_va = pvti;
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 40/51] x86/pvclock: WARN if pvclock's valid_flags are overwritten
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (38 preceding siblings ...)
2026-07-01 19:32 ` [PATCH v5 39/51] x86/pvclock: Mark setup helpers and related various as __init/__ro_after_init Sean Christopherson
@ 2026-07-01 19:32 ` Sean Christopherson
2026-07-01 19:32 ` [PATCH v5 41/51] x86/kvmclock: Refactor handling of PVCLOCK_TSC_STABLE_BIT during kvmclock_init() Sean Christopherson
` (10 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:32 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
WARN if the common PV clock valid_flags are overwritten; all PV clocks
expect that they are the one and only PV clock, i.e. don't guard against
another PV clock having modified the flags.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/pvclock.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index a51adce67f92..8d098841a225 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -21,6 +21,7 @@ static struct pvclock_vsyscall_time_info *pvti_cpu0_va __ro_after_init;
void __init pvclock_set_flags(u8 flags)
{
+ WARN_ON(valid_flags);
valid_flags = flags;
}
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 41/51] x86/kvmclock: Refactor handling of PVCLOCK_TSC_STABLE_BIT during kvmclock_init()
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (39 preceding siblings ...)
2026-07-01 19:32 ` [PATCH v5 40/51] x86/pvclock: WARN if pvclock's valid_flags are overwritten Sean Christopherson
@ 2026-07-01 19:32 ` Sean Christopherson
2026-07-01 19:32 ` [PATCH v5 42/51] timekeeping: Resume clocksources before reading persistent clock Sean Christopherson
` (9 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:32 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Clean up the setting of PVCLOCK_TSC_STABLE_BIT during kvmclock init to
make it somewhat obvious that pvclock_read_flags() must be called *after*
pvclock_set_flags().
Note, in theory, a different PV clock could have set PVCLOCK_TSC_STABLE_BIT
in the supported flags, i.e. reading flags only if
KVM_FEATURE_CLOCKSOURCE_STABLE_BIT is set could very, very theoretically
result in a change in behavior. In practice, the kernel only supports a
single PV clock.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/kvmclock.c | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 5220d205abc7..61d4d943fe74 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -327,7 +327,7 @@ static __init void kvm_sched_clock_init(bool stable)
void __init kvmclock_init(bool prefer_tsc)
{
- u8 flags;
+ bool stable = false;
if (!kvm_para_available() || !kvmclock)
return;
@@ -354,11 +354,18 @@ void __init kvmclock_init(bool prefer_tsc)
kvm_register_clock("primary cpu clock");
pvclock_set_pvti_cpu0_va(hv_clock_boot);
- if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+ if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) {
pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
- flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
- kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
+ /*
+ * Check if the clock is stable *after* marking TSC_STABLE as a
+ * valid flag.
+ */
+ stable = pvclock_read_flags(&hv_clock_boot[0].pvti) &
+ PVCLOCK_TSC_STABLE_BIT;
+ }
+
+ kvm_sched_clock_init(stable);
if (!x86_init.hyper.get_tsc_khz)
x86_init.hyper.get_tsc_khz = kvmclock_get_tsc_khz;
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 42/51] timekeeping: Resume clocksources before reading persistent clock
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (40 preceding siblings ...)
2026-07-01 19:32 ` [PATCH v5 41/51] x86/kvmclock: Refactor handling of PVCLOCK_TSC_STABLE_BIT during kvmclock_init() Sean Christopherson
@ 2026-07-01 19:32 ` Sean Christopherson
2026-07-01 19:32 ` [PATCH v5 43/51] x86/kvmclock: Hook clocksource.suspend/resume when kvmclock isn't sched_clock Sean Christopherson
` (8 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:32 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
When resuming timekeeping after suspend, restore clocksources prior to
reading the persistent clock. Paravirt clocks, e.g. kvmclock, tie the
validity of a PV persistent clock to a clocksource, i.e. reading the PV
persistent clock will return garbage if the underlying PV clocksource
hasn't been enabled. The flaw has gone unnoticed because kvmclock is a
mess and uses its own suspend/resume hooks instead of the clocksource
suspend/resume hooks, which happens to work by sheer dumb luck (the
kvmclock resume hook runs before timekeeping_resume()).
Note, there is no evidence that any clocksource supported by the kernel
depends on a persistent clock.
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
kernel/time/timekeeping.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b1b5ec43c0f2..5bc77d36c7a3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2180,11 +2180,16 @@ void timekeeping_resume(void)
u64 cycle_now, nsec;
unsigned long flags;
- read_persistent_clock64(&ts_new);
-
clockevents_resume();
clocksource_resume();
+ /*
+ * Read persistent time after clocksources have been resumed. Paravirt
+ * clocks have a nasty habit of piggybacking a persistent clock on a
+ * system clock, and may return garbage if the system clock is suspended.
+ */
+ read_persistent_clock64(&ts_new);
+
raw_spin_lock_irqsave(&tk_core.lock, flags);
/*
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 43/51] x86/kvmclock: Hook clocksource.suspend/resume when kvmclock isn't sched_clock
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (41 preceding siblings ...)
2026-07-01 19:32 ` [PATCH v5 42/51] timekeeping: Resume clocksources before reading persistent clock Sean Christopherson
@ 2026-07-01 19:32 ` Sean Christopherson
2026-07-01 19:32 ` [PATCH v5 44/51] x86/kvmclock: WARN if wall clock is read while kvmclock is suspended Sean Christopherson
` (7 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:32 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Save/restore kvmclock across suspend/resume via clocksource hooks when
kvmclock isn't being used for sched_clock. This will allow using kvmclock
as a clocksource (or for wallclock!) without also using it for sched_clock.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/kvmclock.c | 23 ++++++++++++++++++++++-
1 file changed, 22 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 61d4d943fe74..41aff709b90a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -152,7 +152,17 @@ static void kvm_setup_secondary_clock(void)
static void kvm_restore_sched_clock_state(void)
{
- kvm_register_clock("primary cpu clock, resume");
+ kvm_register_clock("primary cpu, sched_clock resume");
+}
+
+static void kvmclock_suspend(struct clocksource *cs)
+{
+ kvmclock_disable();
+}
+
+static void kvmclock_resume(struct clocksource *cs)
+{
+ kvm_register_clock("primary cpu, clocksource resume");
}
void kvmclock_cpu_action(enum kvm_guest_cpu_action action)
@@ -223,6 +233,8 @@ static struct clocksource kvm_clock = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.id = CSID_X86_KVM_CLK,
.enable = kvm_cs_enable,
+ .suspend = kvmclock_suspend,
+ .resume = kvmclock_resume,
};
static void __init kvmclock_init_mem(void)
@@ -318,6 +330,15 @@ static __init void kvm_sched_clock_init(bool stable)
kvm_save_sched_clock_state,
kvm_restore_sched_clock_state);
+ /*
+ * The BSP's clock is managed via dedicated sched_clock save/restore
+ * hooks when kvmclock is used as sched_clock, as sched_clock needs to
+ * be kept alive until the very end of suspend entry, and restored as
+ * quickly as possible after resume.
+ */
+ kvm_clock.suspend = NULL;
+ kvm_clock.resume = NULL;
+
pr_info("kvm-clock: using sched offset of %llu cycles",
kvm_sched_clock_offset);
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 44/51] x86/kvmclock: WARN if wall clock is read while kvmclock is suspended
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (42 preceding siblings ...)
2026-07-01 19:32 ` [PATCH v5 43/51] x86/kvmclock: Hook clocksource.suspend/resume when kvmclock isn't sched_clock Sean Christopherson
@ 2026-07-01 19:32 ` Sean Christopherson
2026-07-01 19:32 ` [PATCH v5 45/51] x86/paravirt: Mark __paravirt_set_sched_clock() as __init Sean Christopherson
` (6 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:32 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
WARN if kvmclock is still suspended when its wallclock is read, i.e. when
the kernel reads its persistent clock. The wallclock subtly depends on
the BSP's kvmclock being enabled, and returns garbage if kvmclock is
disabled.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/kvmclock.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 41aff709b90a..2cc3dd2ba355 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -53,6 +53,8 @@ static struct pvclock_vsyscall_time_info *hvclock_mem;
DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu);
+static bool kvmclock_suspended;
+
/*
* The wallclock is the time of day when we booted. Since then, some time may
* have elapsed since the hypervisor wrote the data. So we try to account for
@@ -60,6 +62,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu);
*/
static void kvm_get_wallclock(struct timespec64 *now)
{
+ WARN_ON_ONCE(kvmclock_suspended);
wrmsrq(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
preempt_disable();
pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
@@ -140,6 +143,7 @@ static void kvm_save_sched_clock_state(void)
* to the old address prior to reconfiguring kvmclock would clobber
* random memory.
*/
+ kvmclock_suspended = true;
kvmclock_disable();
}
@@ -152,16 +156,19 @@ static void kvm_setup_secondary_clock(void)
static void kvm_restore_sched_clock_state(void)
{
+ kvmclock_suspended = false;
kvm_register_clock("primary cpu, sched_clock resume");
}
static void kvmclock_suspend(struct clocksource *cs)
{
+ kvmclock_suspended = true;
kvmclock_disable();
}
static void kvmclock_resume(struct clocksource *cs)
{
+ kvmclock_suspended = false;
kvm_register_clock("primary cpu, clocksource resume");
}
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 45/51] x86/paravirt: Mark __paravirt_set_sched_clock() as __init
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (43 preceding siblings ...)
2026-07-01 19:32 ` [PATCH v5 44/51] x86/kvmclock: WARN if wall clock is read while kvmclock is suspended Sean Christopherson
@ 2026-07-01 19:32 ` Sean Christopherson
2026-07-01 19:32 ` [PATCH v5 46/51] x86/paravirt: Plumb a return code into __paravirt_set_sched_clock() Sean Christopherson
` (5 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:32 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Annotate __paravirt_set_sched_clock() as __init, and make its wrapper
__always_inline to ensure sanitizers don't result in a non-inline version
hanging around. All callers run during __init, and changing sched_clock
after boot would be all kinds of crazy.
No functional change intended.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/timer.h | 10 +++++-----
arch/x86/kernel/tsc.c | 4 ++--
2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index e97cd1ae03d1..96ae7feac47c 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -14,12 +14,12 @@ extern int no_timer_check;
extern bool using_native_sched_clock(void);
#ifdef CONFIG_PARAVIRT
-void __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
- void (*save)(void), void (*restore)(void));
+void __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
+ void (*save)(void), void (*restore)(void));
-static inline void paravirt_set_sched_clock(u64 (*func)(void),
- void (*save)(void),
- void (*restore)(void))
+static __always_inline void paravirt_set_sched_clock(u64 (*func)(void),
+ void (*save)(void),
+ void (*restore)(void))
{
__paravirt_set_sched_clock(func, true, save, restore);
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 83353d643150..0f92b29adecc 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -280,8 +280,8 @@ bool using_native_sched_clock(void)
return static_call_query(pv_sched_clock) == native_sched_clock;
}
-void __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
- void (*save)(void), void (*restore)(void))
+void __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
+ void (*save)(void), void (*restore)(void))
{
if (!stable)
clear_sched_clock_stable();
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 46/51] x86/paravirt: Plumb a return code into __paravirt_set_sched_clock()
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (44 preceding siblings ...)
2026-07-01 19:32 ` [PATCH v5 45/51] x86/paravirt: Mark __paravirt_set_sched_clock() as __init Sean Christopherson
@ 2026-07-01 19:32 ` Sean Christopherson
2026-07-01 19:32 ` [PATCH v5 47/51] x86/paravirt: Don't use a PV sched_clock in CoCo guests with trusted TSC Sean Christopherson
` (4 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:32 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Add a return code to __paravirt_set_sched_clock() so that the kernel can
reject attempts to use a PV sched_clock without breaking the caller. E.g.
when running as a CoCo VM with a secure TSC, using a PV clock is generally
undesirable.
Note, kvmclock is the only PV clock that does anything "extra" beyond
simply registering itself as sched_clock, i.e. is the only caller that
needs to check the new return value.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/timer.h | 6 +++---
arch/x86/kernel/kvmclock.c | 9 ++++++---
arch/x86/kernel/tsc.c | 5 +++--
3 files changed, 12 insertions(+), 8 deletions(-)
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 96ae7feac47c..ca5c95d48c03 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -14,14 +14,14 @@ extern int no_timer_check;
extern bool using_native_sched_clock(void);
#ifdef CONFIG_PARAVIRT
-void __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
- void (*save)(void), void (*restore)(void));
+int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
+ void (*save)(void), void (*restore)(void));
static __always_inline void paravirt_set_sched_clock(u64 (*func)(void),
void (*save)(void),
void (*restore)(void))
{
- __paravirt_set_sched_clock(func, true, save, restore);
+ (void)__paravirt_set_sched_clock(func, true, save, restore);
}
#endif
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 2cc3dd2ba355..22e8855fcd4d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -332,10 +332,13 @@ static int kvmclock_setup_percpu(unsigned int cpu)
static __init void kvm_sched_clock_init(bool stable)
{
+ /* Ensure the offset is configured before making kvmclock visible! */
kvm_sched_clock_offset = kvm_clock_read();
- __paravirt_set_sched_clock(kvm_sched_clock_read, stable,
- kvm_save_sched_clock_state,
- kvm_restore_sched_clock_state);
+
+ if (__paravirt_set_sched_clock(kvm_sched_clock_read, stable,
+ kvm_save_sched_clock_state,
+ kvm_restore_sched_clock_state))
+ return;
/*
* The BSP's clock is managed via dedicated sched_clock save/restore
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 0f92b29adecc..012321fed5e5 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -280,8 +280,8 @@ bool using_native_sched_clock(void)
return static_call_query(pv_sched_clock) == native_sched_clock;
}
-void __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
- void (*save)(void), void (*restore)(void))
+int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
+ void (*save)(void), void (*restore)(void))
{
if (!stable)
clear_sched_clock_stable();
@@ -289,6 +289,7 @@ void __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
static_call_update(pv_sched_clock, func);
x86_platform.save_sched_clock_state = save;
x86_platform.restore_sched_clock_state = restore;
+ return 0;
}
#else
u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 47/51] x86/paravirt: Don't use a PV sched_clock in CoCo guests with trusted TSC
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (45 preceding siblings ...)
2026-07-01 19:32 ` [PATCH v5 46/51] x86/paravirt: Plumb a return code into __paravirt_set_sched_clock() Sean Christopherson
@ 2026-07-01 19:32 ` Sean Christopherson
2026-07-02 17:48 ` Michael Kelley
2026-07-01 19:32 ` [PATCH v5 48/51] x86/kvmclock: Use TSC for sched_clock if it's constant and non-stop Sean Christopherson
` (3 subsequent siblings)
50 siblings, 1 reply; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:32 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Silently ignore attempts to switch to a paravirt sched_clock when running
as a CoCo guest with trusted TSC. In hand-wavy theory, a misbehaving
hypervisor could attack the guest by manipulating the PV clock to affect
guest scheduling in some weird and/or predictable way. More importantly,
reading TSC on such platforms is faster than any PV clock, and sched_clock
is all about speed.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/tsc.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 012321fed5e5..a146fc7b5e74 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -283,6 +283,15 @@ bool using_native_sched_clock(void)
int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
void (*save)(void), void (*restore)(void))
{
+ /*
+ * Don't replace TSC with a PV clock when running as a CoCo guest and
+ * the TSC is secure/trusted; PV clocks are emulated by the hypervisor,
+ * which isn't in the guest's TCB.
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC) ||
+ boot_cpu_has(X86_FEATURE_TDX_GUEST))
+ return -EPERM;
+
if (!stable)
clear_sched_clock_stable();
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* RE: [PATCH v5 47/51] x86/paravirt: Don't use a PV sched_clock in CoCo guests with trusted TSC
2026-07-01 19:32 ` [PATCH v5 47/51] x86/paravirt: Don't use a PV sched_clock in CoCo guests with trusted TSC Sean Christopherson
@ 2026-07-02 17:48 ` Michael Kelley
0 siblings, 0 replies; 72+ messages in thread
From: Michael Kelley @ 2026-07-02 17:48 UTC (permalink / raw)
To: Sean Christopherson, Jonathan Corbet, Paolo Bonzini,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
x86@kernel.org, Kiryl Shutsemau, Rick Edgecombe, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc@vger.kernel.org, kvm@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-coco@lists.linux.dev,
linux-hyperv@vger.kernel.org, virtualization@lists.linux.dev,
xen-devel@lists.xenproject.org, Tom Lendacky, Nikunj A Dadhania,
David Woodhouse, David Woodhouse, Michael Kelley, Thomas Gleixner
From: Sean Christopherson <seanjc@google.com> Sent: Wednesday, July 1, 2026 12:32 PM
>
> Silently ignore attempts to switch to a paravirt sched_clock when running
> as a CoCo guest with trusted TSC. In hand-wavy theory, a misbehaving
> hypervisor could attack the guest by manipulating the PV clock to affect
> guest scheduling in some weird and/or predictable way. More importantly,
> reading TSC on such platforms is faster than any PV clock, and sched_clock
> is all about speed.
>
> Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> arch/x86/kernel/tsc.c | 9 +++++++++
> 1 file changed, 9 insertions(+)
>
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 012321fed5e5..a146fc7b5e74 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -283,6 +283,15 @@ bool using_native_sched_clock(void)
> int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
> void (*save)(void), void (*restore)(void))
> {
> + /*
> + * Don't replace TSC with a PV clock when running as a CoCo guest and
> + * the TSC is secure/trusted; PV clocks are emulated by the hypervisor,
> + * which isn't in the guest's TCB.
> + */
> + if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC) ||
> + boot_cpu_has(X86_FEATURE_TDX_GUEST))
> + return -EPERM;
Do a pr_warn() in the error case? Your commit message says to
do the ignore silently, but I wonder if that's a good idea. At least
for Hyper-V, the error case shouldn't happen.
Michael
> +
> if (!stable)
> clear_sched_clock_stable();
>
> --
> 2.55.0.rc0.799.gd6f94ed593-goog
>
^ permalink raw reply [flat|nested] 72+ messages in thread
* [PATCH v5 48/51] x86/kvmclock: Use TSC for sched_clock if it's constant and non-stop
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (46 preceding siblings ...)
2026-07-01 19:32 ` [PATCH v5 47/51] x86/paravirt: Don't use a PV sched_clock in CoCo guests with trusted TSC Sean Christopherson
@ 2026-07-01 19:32 ` Sean Christopherson
2026-07-01 19:32 ` [PATCH v5 49/51] x86/kvmclock: Plumb in AP-online and BSP-resume to kvmlock, for documentation Sean Christopherson
` (2 subsequent siblings)
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:32 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Prefer the TSC over kvmclock for sched_clock if the TSC is constant and
nonstop. I.e. use the same criteria as tweaking the clocksource rating so
that TSC is preferred over kvmclock. Per the below comment from
native_sched_clock(), sched_clock is more tolerant of slop than
clocksource; using TSC for clocksource but not sched_clock makes little to
no sense, especially now that KVM CoCo guests with a trusted TSC use TSC,
not kvmclock.
/*
* Fall back to jiffies if there's no TSC available:
* ( But note that we still use it if the TSC is marked
* unstable. We do this because unlike Time Of Day,
* the scheduler clock tolerates small errors and it's
* very important for it to be as fast as the platform
* can achieve it. )
*/
The only advantage of using kvmclock is that doing so allows for early
and common detection of PVCLOCK_GUEST_STOPPED, but that code has been
broken for over two years with nary a complaint, i.e. it can't be
_that_ valuable. And as above, certain types of KVM guests are losing
the functionality regardless, i.e. acknowledging PVCLOCK_GUEST_STOPPED
needs to be decoupled from sched_clock() no matter what.
Link: https://lore.kernel.org/all/Z4hDK27OV7wK572A@google.com
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/kvmclock.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 22e8855fcd4d..bc98ebb8587d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -396,7 +396,6 @@ void __init kvmclock_init(bool prefer_tsc)
PVCLOCK_TSC_STABLE_BIT;
}
- kvm_sched_clock_init(stable);
if (!x86_init.hyper.get_tsc_khz)
x86_init.hyper.get_tsc_khz = kvmclock_get_tsc_khz;
@@ -416,6 +415,8 @@ void __init kvmclock_init(bool prefer_tsc)
*/
if (prefer_tsc)
kvm_clock.rating = 299;
+ else
+ kvm_sched_clock_init(stable);
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
pv_info.name = "KVM";
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 49/51] x86/kvmclock: Plumb in AP-online and BSP-resume to kvmlock, for documentation
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (47 preceding siblings ...)
2026-07-01 19:32 ` [PATCH v5 48/51] x86/kvmclock: Use TSC for sched_clock if it's constant and non-stop Sean Christopherson
@ 2026-07-01 19:32 ` Sean Christopherson
2026-07-01 19:32 ` [PATCH v5 50/51] x86/paravirt: Move using_native_sched_clock() stub into timer.h Sean Christopherson
2026-07-01 19:32 ` [PATCH v5 51/51] x86/kvm: Get local APIC bus frequency from PV CPUID Timing Info Sean Christopherson
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:32 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Invoke kvmclock_cpu_action() with AP_ONLINE and BSP_RESUME, even though
kvmclock doesn't need to do anything in either case, so that the asymmetry
of kvmclock is a detail buried in kvmclock, and to explicitly document
that doing nothing during those phases is intentional and correct.
For all intents and purposes, no functional change intended.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/kvm_para.h | 2 ++
arch/x86/kernel/kvm.c | 22 +++++++++++++-------
arch/x86/kernel/kvmclock.c | 37 ++++++++++++++++++++++++++-------
3 files changed, 45 insertions(+), 16 deletions(-)
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 08686ff19caa..763ed017738a 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -120,6 +120,8 @@ static inline long kvm_sev_hypercall3(unsigned int nr, unsigned long p1,
#ifdef CONFIG_KVM_GUEST
enum kvm_guest_cpu_action {
KVM_GUEST_BSP_SUSPEND,
+ KVM_GUEST_BSP_RESUME,
+ KVM_GUEST_AP_ONLINE,
KVM_GUEST_AP_OFFLINE,
KVM_GUEST_SHUTDOWN,
};
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 604b52f233aa..f9a6346077b0 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -474,18 +474,24 @@ static void kvm_guest_cpu_offline(enum kvm_guest_cpu_action action)
kvmclock_cpu_action(action);
}
+static void __kvm_cpu_online(unsigned int cpu, enum kvm_guest_cpu_action action)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ kvmclock_cpu_action(action);
+ kvm_guest_cpu_init();
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_SMP
+
static int kvm_cpu_online(unsigned int cpu)
{
- unsigned long flags;
-
- local_irq_save(flags);
- kvm_guest_cpu_init();
- local_irq_restore(flags);
+ __kvm_cpu_online(cpu, KVM_GUEST_AP_ONLINE);
return 0;
}
-#ifdef CONFIG_SMP
-
static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
static bool pv_tlb_flush_supported(void)
@@ -750,7 +756,7 @@ static int kvm_suspend(void *data)
static void kvm_resume(void *data)
{
- kvm_cpu_online(raw_smp_processor_id());
+ __kvm_cpu_online(raw_smp_processor_id(), KVM_GUEST_BSP_RESUME);
#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index bc98ebb8587d..842f38c5f6ca 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -150,7 +150,7 @@ static void kvm_save_sched_clock_state(void)
#ifdef CONFIG_SMP
static void kvm_setup_secondary_clock(void)
{
- kvm_register_clock("secondary cpu clock");
+ kvm_register_clock("secondary cpu, startup");
}
#endif
@@ -174,13 +174,34 @@ static void kvmclock_resume(struct clocksource *cs)
void kvmclock_cpu_action(enum kvm_guest_cpu_action action)
{
- /*
- * Don't disable kvmclock on the BSP during suspend. If kvmclock is
- * being used for sched_clock, then it needs to be kept alive until the
- * last minute, and restored as quickly as possible after resume.
- */
- if (action != KVM_GUEST_BSP_SUSPEND)
+ switch (action) {
+ /*
+ * The BSP's clock is managed via clocksource suspend/resume,
+ * to ensure it's enabled/disabled when timekeeping needs it
+ * to be, e.g. before reading wallclock (which uses kvmclock).
+ */
+ case KVM_GUEST_BSP_SUSPEND:
+ case KVM_GUEST_BSP_RESUME:
+ break;
+ case KVM_GUEST_AP_ONLINE:
+ /*
+ * Secondary CPUs use a dedicated hook to enable kvmclock early
+ * during bringup, there's nothing to be done during CPU online
+ * (which runs at CPUHP_AP_ONLINE_DYN). When kvmclock is being
+ * used as sched_clock, kvmclock must be enabled *very* early,
+ * and even when kvmclock is "only" being used for the main
+ * clocksource, it still needs to be enabled long before the
+ * dynamic CPUHP calls are made.
+ */
+ break;
+ case KVM_GUEST_AP_OFFLINE:
+ case KVM_GUEST_SHUTDOWN:
kvmclock_disable();
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
}
/*
@@ -382,7 +403,7 @@ void __init kvmclock_init(bool prefer_tsc)
msr_kvm_system_time, msr_kvm_wall_clock);
this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]);
- kvm_register_clock("primary cpu clock");
+ kvm_register_clock("primary cpu, online");
pvclock_set_pvti_cpu0_va(hv_clock_boot);
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) {
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 50/51] x86/paravirt: Move using_native_sched_clock() stub into timer.h
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (48 preceding siblings ...)
2026-07-01 19:32 ` [PATCH v5 49/51] x86/kvmclock: Plumb in AP-online and BSP-resume to kvmlock, for documentation Sean Christopherson
@ 2026-07-01 19:32 ` Sean Christopherson
2026-07-01 19:32 ` [PATCH v5 51/51] x86/kvm: Get local APIC bus frequency from PV CPUID Timing Info Sean Christopherson
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:32 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
Now that timer.h ended up with CONFIG_PARAVIRT #ifdeffery anyways, move the
PARAVIRT=n using_native_sched_clock() stub into timer.h as a "free"
optimization.
No functional change intended.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/timer.h | 6 ++++--
arch/x86/kernel/tsc.c | 2 --
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index ca5c95d48c03..a52388af6055 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -11,9 +11,9 @@ extern void recalibrate_cpu_khz(void);
extern int no_timer_check;
-extern bool using_native_sched_clock(void);
-
#ifdef CONFIG_PARAVIRT
+extern bool using_native_sched_clock(void);
+
int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
void (*save)(void), void (*restore)(void));
@@ -23,6 +23,8 @@ static __always_inline void paravirt_set_sched_clock(u64 (*func)(void),
{
(void)__paravirt_set_sched_clock(func, true, save, restore);
}
+#else
+static inline bool using_native_sched_clock(void) { return true; }
#endif
/*
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a146fc7b5e74..564be4faa5a0 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -302,8 +302,6 @@ int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
}
#else
u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
-
-bool using_native_sched_clock(void) { return true; }
#endif
notrace u64 sched_clock(void)
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread
* [PATCH v5 51/51] x86/kvm: Get local APIC bus frequency from PV CPUID Timing Info
2026-07-01 19:31 [PATCH v5 00/51] x86: Try to wrangle PV clocks vs. TSC Sean Christopherson
` (49 preceding siblings ...)
2026-07-01 19:32 ` [PATCH v5 50/51] x86/paravirt: Move using_native_sched_clock() stub into timer.h Sean Christopherson
@ 2026-07-01 19:32 ` Sean Christopherson
50 siblings, 0 replies; 72+ messages in thread
From: Sean Christopherson @ 2026-07-01 19:32 UTC (permalink / raw)
To: Jonathan Corbet, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
Rick Edgecombe, Sean Christopherson, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
Juergen Gross, Daniel Lezcano, John Stultz
Cc: Shuah Khan, H. Peter Anvin, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, linux-doc, kvm, linux-kernel, linux-coco,
linux-hyperv, virtualization, xen-devel, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, David Woodhouse,
Michael Kelley, Thomas Gleixner
When running as a KVM guest with PV timing info provided by the host,
stuff the APIC timer period/frequency with the local APIC bus frequency
reported in CPUID.0x40000010.EBX instead of trying to calibrate/guess the
frequency.
See Documentation/virt/kvm/x86/cpuid.rst for details.
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/kvm.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index f9a6346077b0..beea0b6aa78e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -990,7 +990,7 @@ static void __init kvm_init_platform(void)
.mask_lo = (u32)(~(SZ_4G - tolud - 1)) | MTRR_PHYSMASK_V,
.mask_hi = (BIT_ULL(boot_cpu_data.x86_phys_bits) - 1) >> 32,
};
- u32 timing_info_leaf;
+ u32 timing_info_leaf, apic_khz;
bool tsc_is_reliable;
if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
@@ -1052,6 +1052,11 @@ static void __init kvm_init_platform(void)
x86_init.hyper.get_tsc_khz = kvm_get_tsc_khz;
x86_init.hyper.get_cpu_khz = kvm_get_tsc_khz;
}
+
+ /* The leaf also includes the local APIC bus/timer frequency. */
+ apic_khz = cpuid_ebx(timing_info_leaf);
+ if (apic_khz)
+ apic_set_timer_period_khz(apic_khz, "KVM hypervisor");
}
/*
--
2.55.0.rc0.799.gd6f94ed593-goog
^ permalink raw reply related [flat|nested] 72+ messages in thread