* [PATCH v3 05/41] x86/tdx: Override PV calibration routines with CPUID-based calibration
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
When running as a TDX guest, explicitly override the TSC frequency
calibration routine with CPUID-based calibration instead of potentially
relying on a hypervisor-controlled PV routine. For TDX guests, CPUID.0x15
is always emulated by the TDX-Module, i.e. the information from CPUID is
more trustworthy than the information provided by the hypervisor.
To maintain backwards compatibility with TDX guest kernels that use native
calibration, and because it's the least awful option, retain
native_calibrate_tsc()'s stuffing of the local APIC bus period using the
core crystal frequency. While it's entirely possible for the hypervisor
to emulate the APIC timer at a different frequency than the core crystal
frequency, the commonly accepted interpretation of Intel's SDM is that APIC
timer runs at the core crystal frequency when that latter is enumerated via
CPUID:
The APIC timer frequency will be the processor’s bus clock or core
crystal clock frequency (when TSC/core crystal clock ratio is enumerated
in CPUID leaf 0x15).
If the hypervisor is malicious and deliberately runs the APIC timer at the
wrong frequency, nothing would stop the hypervisor from modifying the
frequency at any time, i.e. attempting to manually calibrate the frequency
out of paranoia would be futile.
Deliberately leave the CPU frequency calibration routine as is, since the
TDX-Module doesn't provide any guarantees with respect to CPUID.0x16.
Opportunistically add a comment explaining that CoCo TSC initialization
needs to come after hypervisor specific initialization.
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/coco/tdx/tdx.c | 30 +++++++++++++++++++++++++++---
arch/x86/include/asm/tdx.h | 2 ++
arch/x86/kernel/tsc.c | 8 ++++++++
3 files changed, 37 insertions(+), 3 deletions(-)
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 29b6f1ed59ec..26890cea790b 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -8,6 +8,7 @@
#include <linux/export.h>
#include <linux/io.h>
#include <linux/kexec.h>
+#include <asm/apic.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
@@ -1123,9 +1124,6 @@ void __init tdx_early_init(void)
setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
- /* TSC is the only reliable clock in TDX guest */
- setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
-
cc_vendor = CC_VENDOR_INTEL;
/* Configure the TD */
@@ -1195,3 +1193,29 @@ void __init tdx_early_init(void)
tdx_announce();
}
+
+static unsigned long tdx_get_tsc_khz(void)
+{
+ struct cpuid_tsc_info info;
+
+ if (WARN_ON_ONCE(cpuid_get_tsc_freq(&info)))
+ return 0;
+
+ lapic_timer_period = info.crystal_khz * 1000 / HZ;
+
+ return info.tsc_khz;
+}
+
+void __init tdx_tsc_init(void)
+{
+ /* TSC is the only reliable clock in TDX guest */
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+ /*
+ * Override the PV calibration routines (if set) with more trustworthy
+ * CPUID-based calibration. The TDX module emulates CPUID, whereas any
+ * PV information is provided by the hypervisor.
+ */
+ tsc_register_calibration_routines(tdx_get_tsc_khz, NULL);
+}
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 15eac89b0afb..60deab0ed979 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -57,6 +57,7 @@ struct ve_info {
#ifdef CONFIG_INTEL_TDX_GUEST
void __init tdx_early_init(void);
+void __init tdx_tsc_init(void);
void tdx_get_ve_info(struct ve_info *ve);
@@ -78,6 +79,7 @@ void __init tdx_dump_td_ctls(u64 td_ctls);
#else
static inline void tdx_early_init(void) { };
+static inline void tdx_tsc_init(void) { }
static inline void tdx_halt(void) { };
static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; }
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 243999692aea..e00f53e3dd8d 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -34,6 +34,7 @@
#include <asm/topology.h>
#include <asm/uv/uv.h>
#include <asm/sev.h>
+#include <asm/tdx.h>
unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -1559,8 +1560,15 @@ void __init tsc_early_init(void)
if (is_early_uv_system())
return;
+ /*
+ * Do CoCo specific "secure" TSC initialization *after* hypervisor
+ * platform initialization so that the secure variant can override the
+ * hypervisor's PV calibration routine with a more trusted method.
+ */
if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
snp_secure_tsc_init();
+ else if (boot_cpu_has(X86_FEATURE_TDX_GUEST))
+ tdx_tsc_init();
if (!determine_cpu_tsc_frequencies(true))
return;
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v3 04/41] x86/sev: Move check for SNP Secure TSC support to tsc_early_init()
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
Move the check on having a Secure TSC to the common tsc_early_init() so
that it's obvious that having a Secure TSC is conditional, and to prepare
for adding TDX to the mix (blindly initializing *both* SNP and TDX TSC
logic looks especially weird).
No functional change intended.
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Nikunj A Dadhania <nikunj@amd.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/coco/sev/core.c | 3 ---
arch/x86/kernel/tsc.c | 3 ++-
2 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index 14ced854cd83..39fb50697542 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -2029,9 +2029,6 @@ void __init snp_secure_tsc_init(void)
unsigned long tsc_freq_mhz;
void *mem;
- if (!cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
- return;
-
mem = early_memremap_encrypted(sev_secrets_pa, PAGE_SIZE);
if (!mem) {
pr_err("Unable to get TSC_FACTOR: failed to map the SNP secrets page.\n");
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7e639c0a94a2..243999692aea 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1559,7 +1559,8 @@ void __init tsc_early_init(void)
if (is_early_uv_system())
return;
- snp_secure_tsc_init();
+ if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
+ snp_secure_tsc_init();
if (!determine_cpu_tsc_frequencies(true))
return;
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v3 03/41] x86/sev: Mark TSC as reliable when configuring Secure TSC
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
Move the code to mark the TSC as reliable from sme_early_init() to
snp_secure_tsc_init(). The only reader of TSC_RELIABLE is the aptly
named check_system_tsc_reliable(), which runs in tsc_init(), i.e.
after snp_secure_tsc_init().
This will allow consolidating the handling of TSC_KNOWN_FREQ and
TSC_RELIABLE when overriding the TSC calibration routine.
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Nikunj A Dadhania <nikunj@amd.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/coco/sev/core.c | 2 ++
arch/x86/mm/mem_encrypt_amd.c | 3 ---
2 files changed, 2 insertions(+), 3 deletions(-)
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index d27cf8f8b025..14ced854cd83 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -2041,6 +2041,8 @@ void __init snp_secure_tsc_init(void)
secrets = (__force struct snp_secrets_page *)mem;
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
rdmsrq(MSR_AMD64_GUEST_TSC_FREQ, tsc_freq_mhz);
/* Extract the GUEST TSC MHZ from BIT[17:0], rest is reserved space */
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 2f8c32173972..6c3af974c7c2 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -535,9 +535,6 @@ void __init sme_early_init(void)
*/
x86_init.resources.dmi_setup = snp_dmi_setup;
}
-
- if (sev_status & MSR_AMD64_SNP_SECURE_TSC)
- setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
}
void __init mem_encrypt_free_decrypted_mem(void)
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v3 02/41] x86/tsc: Add helper to register CPU and TSC freq calibration routines
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
Add a helper to register non-native, i.e. PV and CoCo, CPU and TSC
frequency calibration routines. This will allow consolidating handling
of common TSC properties that are forced by hypervisor (PV routines),
and will also allow adding sanity checks to guard against overriding a
TSC calibration routine with a routine that is less robust/trusted.
Make the CPU calibration routine optional, as Xen (very sanely) doesn't
assume the CPU runs as the same frequency as the TSC.
Wrap the helper in an #ifdef to document that the kernel overrides
the native routines when running as a VM, and to guard against unwanted
usage. Add a TODO to call out that AMD_MEM_ENCRYPT is a mess and doesn't
depend on HYPERVISOR_GUEST because it gates both guest and host code.
No functional change intended.
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/coco/sev/core.c | 4 ++--
arch/x86/include/asm/tsc.h | 4 ++++
arch/x86/kernel/cpu/acrn.c | 4 ++--
arch/x86/kernel/cpu/mshyperv.c | 3 +--
arch/x86/kernel/cpu/vmware.c | 4 ++--
arch/x86/kernel/jailhouse.c | 4 ++--
arch/x86/kernel/kvmclock.c | 4 ++--
arch/x86/kernel/tsc.c | 17 +++++++++++++++++
arch/x86/xen/time.c | 2 +-
9 files changed, 33 insertions(+), 13 deletions(-)
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index 7ed3da998489..d27cf8f8b025 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -2048,8 +2048,8 @@ void __init snp_secure_tsc_init(void)
snp_tsc_freq_khz = SNP_SCALE_TSC_FREQ(tsc_freq_mhz * 1000, secrets->tsc_factor);
- x86_platform.calibrate_cpu = securetsc_get_tsc_khz;
- x86_platform.calibrate_tsc = securetsc_get_tsc_khz;
+ tsc_register_calibration_routines(securetsc_get_tsc_khz,
+ securetsc_get_tsc_khz);
early_memunmap(mem, PAGE_SIZE);
}
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 0c57fadc4a39..bae709f5f44d 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -94,6 +94,10 @@ extern int cpuid_get_tsc_freq(struct cpuid_tsc_info *info);
extern void tsc_early_init(void);
extern void tsc_init(void);
+#if defined(CONFIG_HYPERVISOR_GUEST) || defined(CONFIG_AMD_MEM_ENCRYPT)
+extern void tsc_register_calibration_routines(unsigned long (*calibrate_tsc)(void),
+ unsigned long (*calibrate_cpu)(void));
+#endif
extern void mark_tsc_unstable(char *reason);
extern int unsynchronized_tsc(void);
extern int check_tsc_unstable(void);
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
index 2c5b51aad91a..c1506cb87d8c 100644
--- a/arch/x86/kernel/cpu/acrn.c
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -29,8 +29,8 @@ static void __init acrn_init_platform(void)
/* Install system interrupt handler for ACRN hypervisor callback */
sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback);
- x86_platform.calibrate_tsc = acrn_get_tsc_khz;
- x86_platform.calibrate_cpu = acrn_get_tsc_khz;
+ tsc_register_calibration_routines(acrn_get_tsc_khz,
+ acrn_get_tsc_khz);
}
static bool acrn_x2apic_available(void)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 640e6b223c2d..8d2401be420c 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -573,8 +573,7 @@ static void __init ms_hyperv_init_platform(void)
if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
- x86_platform.calibrate_tsc = hv_get_tsc_khz;
- x86_platform.calibrate_cpu = hv_get_tsc_khz;
+ tsc_register_calibration_routines(hv_get_tsc_khz, hv_get_tsc_khz);
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
}
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 34b73573b108..b88d9ca01202 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -419,8 +419,8 @@ static void __init vmware_platform_setup(void)
}
vmware_tsc_khz = tsc_khz;
- x86_platform.calibrate_tsc = vmware_get_tsc_khz;
- x86_platform.calibrate_cpu = vmware_get_tsc_khz;
+ tsc_register_calibration_routines(vmware_get_tsc_khz,
+ vmware_get_tsc_khz);
#ifdef CONFIG_X86_LOCAL_APIC
/* Skip lapic calibration since we know the bus frequency. */
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index f58ce9220e0f..db8f31fdb480 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -210,8 +210,6 @@ static void __init jailhouse_init_platform(void)
x86_init.mpparse.parse_smp_cfg = jailhouse_parse_smp_config;
x86_init.pci.arch_init = jailhouse_pci_arch_init;
- x86_platform.calibrate_cpu = jailhouse_get_tsc;
- x86_platform.calibrate_tsc = jailhouse_get_tsc;
x86_platform.get_wallclock = jailhouse_get_wallclock;
x86_platform.legacy.rtc = 0;
x86_platform.legacy.warm_reset = 0;
@@ -221,6 +219,8 @@ static void __init jailhouse_init_platform(void)
machine_ops.emergency_restart = jailhouse_no_restart;
+ tsc_register_calibration_routines(jailhouse_get_tsc, jailhouse_get_tsc);
+
while (pa_data) {
mapping = early_memremap(pa_data, sizeof(header));
memcpy(&header, mapping, sizeof(header));
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index b5991d53fc0e..e9e7394140dd 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -321,8 +321,8 @@ void __init kvmclock_init(void)
flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
- x86_platform.calibrate_tsc = kvm_get_tsc_khz;
- x86_platform.calibrate_cpu = kvm_get_tsc_khz;
+ tsc_register_calibration_routines(kvm_get_tsc_khz, kvm_get_tsc_khz);
+
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index f92236f40cbc..7e639c0a94a2 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1281,6 +1281,23 @@ static void __init check_system_tsc_reliable(void)
tsc_disable_clocksource_watchdog();
}
+/*
+ * TODO: Disentangle AMD_MEM_ENCRYPT and make SEV guest support depend on
+ * HYPERVISOR_GUEST.
+ */
+#if defined(CONFIG_HYPERVISOR_GUEST) || defined(CONFIG_AMD_MEM_ENCRYPT)
+void tsc_register_calibration_routines(unsigned long (*calibrate_tsc)(void),
+ unsigned long (*calibrate_cpu)(void))
+{
+ if (WARN_ON_ONCE(!calibrate_tsc))
+ return;
+
+ x86_platform.calibrate_tsc = calibrate_tsc;
+ if (calibrate_cpu)
+ x86_platform.calibrate_cpu = calibrate_cpu;
+}
+#endif
+
/*
* Make an educated guess if the TSC is trustworthy and synchronized
* over all CPUs.
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index d62c14334b35..3d3165eef821 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -569,7 +569,7 @@ static void __init xen_init_time_common(void)
static_call_update(pv_steal_clock, xen_steal_clock);
paravirt_set_sched_clock(xen_sched_clock);
- x86_platform.calibrate_tsc = xen_tsc_khz;
+ tsc_register_calibration_routines(xen_tsc_khz, NULL);
x86_platform.get_wallclock = xen_get_wallclock;
}
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v3 01/41] x86/tsc: Add a standalone helpers for getting TSC info from CPUID.0x15
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
Extract retrieval of TSC frequency information from CPUID into standalone
helpers so that TDX guest support can reuse the logic. Provide a version
that includes the multiplier math as TDX does NOT want to use
native_calibrate_tsc()'s fallback logic that derives the TSC frequency
based on CPUID.0x16, when the core crystal frequency isn't known.
Opportunsitically drop native_calibrate_tsc()'s "== 0" and "!= 0" checks
in favor of the kernel's preferred style.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/tsc.h | 9 +++++
arch/x86/kernel/tsc.c | 67 +++++++++++++++++++++++++-------------
2 files changed, 53 insertions(+), 23 deletions(-)
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 4f7f09f50552..0c57fadc4a39 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -83,6 +83,15 @@ static inline cycles_t get_cycles(void)
}
#define get_cycles get_cycles
+struct cpuid_tsc_info {
+ unsigned int denominator;
+ unsigned int numerator;
+ unsigned int crystal_khz;
+ unsigned int tsc_khz;
+};
+extern int cpuid_get_tsc_info(struct cpuid_tsc_info *info);
+extern int cpuid_get_tsc_freq(struct cpuid_tsc_info *info);
+
extern void tsc_early_init(void);
extern void tsc_init(void);
extern void mark_tsc_unstable(char *reason);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index c5110eb554bc..f92236f40cbc 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -658,46 +658,67 @@ static unsigned long quick_pit_calibrate(void)
return delta;
}
+int cpuid_get_tsc_info(struct cpuid_tsc_info *info)
+{
+ unsigned int ecx_hz, edx;
+
+ memset(info, 0, sizeof(*info));
+
+ if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC)
+ return -ENOENT;
+
+ /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
+ cpuid(CPUID_LEAF_TSC, &info->denominator, &info->numerator, &ecx_hz, &edx);
+
+ if (!info->denominator || !info->numerator)
+ return -ENOENT;
+
+ /*
+ * Note, some CPUs provide the multiplier information, but not the core
+ * crystal frequency. The multiplier information is still useful for
+ * such CPUs, as the crystal frequency can be gleaned from CPUID.0x16.
+ */
+ info->crystal_khz = ecx_hz / 1000;
+ return 0;
+}
+
+int cpuid_get_tsc_freq(struct cpuid_tsc_info *info)
+{
+ if (cpuid_get_tsc_info(info) || !info->crystal_khz)
+ return -ENOENT;
+
+ info->tsc_khz = info->crystal_khz * info->numerator / info->denominator;
+ return 0;
+}
+
/**
* native_calibrate_tsc - determine TSC frequency
* Determine TSC frequency via CPUID, else return 0.
*/
unsigned long native_calibrate_tsc(void)
{
- unsigned int eax_denominator, ebx_numerator, ecx_hz, edx;
- unsigned int crystal_khz;
+ struct cpuid_tsc_info info;
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 0;
- if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC)
+ if (cpuid_get_tsc_info(&info))
return 0;
- eax_denominator = ebx_numerator = ecx_hz = edx = 0;
-
- /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
- cpuid(CPUID_LEAF_TSC, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);
-
- if (ebx_numerator == 0 || eax_denominator == 0)
- return 0;
-
- crystal_khz = ecx_hz / 1000;
-
/*
* Denverton SoCs don't report crystal clock, and also don't support
* CPUID_LEAF_FREQ for the calculation below, so hardcode the 25MHz
* crystal clock.
*/
- if (crystal_khz == 0 &&
- boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D)
- crystal_khz = 25000;
+ if (!info.crystal_khz && boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D)
+ info.crystal_khz = 25000;
/*
* TSC frequency reported directly by CPUID is a "hardware reported"
* frequency and is the most accurate one so far we have. This
* is considered a known frequency.
*/
- if (crystal_khz != 0)
+ if (info.crystal_khz)
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
/*
@@ -705,15 +726,15 @@ unsigned long native_calibrate_tsc(void)
* clock, but we can easily calculate it to a high degree of accuracy
* by considering the crystal ratio and the CPU speed.
*/
- if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) {
+ if (!info.crystal_khz && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) {
unsigned int eax_base_mhz, ebx, ecx, edx;
cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx);
- crystal_khz = eax_base_mhz * 1000 *
- eax_denominator / ebx_numerator;
+ info.crystal_khz = eax_base_mhz * 1000 *
+ info.denominator / info.numerator;
}
- if (crystal_khz == 0)
+ if (!info.crystal_khz)
return 0;
/*
@@ -730,10 +751,10 @@ unsigned long native_calibrate_tsc(void)
* lapic_timer_period here to avoid having to calibrate the APIC
* timer later.
*/
- lapic_timer_period = crystal_khz * 1000 / HZ;
+ lapic_timer_period = info.crystal_khz * 1000 / HZ;
#endif
- return crystal_khz * ebx_numerator / eax_denominator;
+ return info.crystal_khz * info.numerator / info.denominator;
}
static unsigned long cpu_khz_from_cpuid(void)
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v3 00/41] x86: Try to wrangle PV clocks vs. TSC
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
Dave/Thomas/Peter/Boris, what's the going rate for bribes to take something
like this through the tip tree?
The bulk of the changes are in kvmclock and TSC, but pretty much every
hypervisor's guest-side code gets touched at some point. I am reaonsably
confident in the correctness of the KVM changes. Michael tested Hyper-V in
v2, and while there were conflicts when rebasing, they were largely
superficial (and I've just jinxed myself). For all other hypervisors, assume
the code is compile-tested only, but those changes are all quite small and
straightforward.
The only changes that are questionable/contentious are the last two patches,
which have KVM-as-a-guest use CPUID 0x16 to get the CPU frequency, even on
AMD (that's the dubious part). I very deliberately put them last, so that
they can be dropped at will (I don't care terribly if those patches land).
To merge them, I would want explicit Acks from Paolo and David W.
So, except for the last two patches, to get the stuff I really care about
landed, I think/hope it's just the TSC and guest-side CoCo changes that need
reviews/acks?
The primary goal of this series is (or at least was, when I started) to
fix flaws with SNP and TDX guests where a PV clock provided by the untrusted
hypervisor is used instead of the secure/trusted TSC that is controlled by
trusted firmware.
The secondary goal is to draft off of the SNP and TDX changes to slightly
modernize running under KVM. Currently, KVM guests will use TSC for
clocksource, but not sched_clock. And they ignore Intel's CPUID-based TSC
and CPU frequency enumeration, even when using the TSC instead of kvmclock.
And if the host provides the core crystal frequency in CPUID.0x15, then KVM
guests can use that for the APIC timer period instead of manually calibrating
the frequency.
The tertiary goal is to clean up all of the PV clock code to deduplicate logic
across hypervisors, and to hopefully make it all easier to maintain going
forward.
Lots more background on the SNP/TDX motiviation:
https://lore.kernel.org/all/20250106124633.1418972-13-nikunj@amd.com
Note, I deliberately omitted:
jailhouse-dev@googlegroups.com
from the To/Cc, as those emails bounced on v1, AFAICT nothing has changed, and
I have zero desire to get 41 emails telling me an email couldn't be delivered.
v3:
- Collect reviews. [Michael, Thomas]
- Use Hyper-V reference counter / refcounter instead of Hyper-V timer. [Michael]
- Use the paravirt CPUID interface first proposed by VMware for KVM's
"official" mechanism for communicating frequency to KVM-aware guests,
instead of abusing Intel's CPUID leafs. [David]
- Deal with paravirt code being moved into asm/timers.h and
arch/x86/kernel/tsc.c.
v2:
- https://lore.kernel.org/all/Z8YWttWDtvkyCtdJ@google.com
- Add struct to hold the TSC CPUID output. [Boris]
- Don't pointlessly inline the TSC CPUID helpers. [Boris]
- Fix a variable goof in a helper, hopefully for real this time. [Dan]
- Collect reviews. [Nikunj]
- Override the sched_clock save/restore hooks if and only if a PV clock
is successfully registered.
- During resome, restore clocksources before reading persistent time.
- Clean up more warts created by kvmclock.
- Fix more bugs in kvmclock's suspend/resume handling.
- Try to harden kvmclock against future bugs.
v1: https://lore.kernel.org/all/20250201021718.699411-1-seanjc@google.com
David Woodhouse (2):
KVM: x86: Officially define CPUID 0x40000010 as PV Timing Info (TSC
and Bus)
x86/kvmclock: Obtain TSC frequency from CPUID if present
Sean Christopherson (39):
x86/tsc: Add a standalone helpers for getting TSC info from CPUID.0x15
x86/tsc: Add helper to register CPU and TSC freq calibration routines
x86/sev: Mark TSC as reliable when configuring Secure TSC
x86/sev: Move check for SNP Secure TSC support to tsc_early_init()
x86/tdx: Override PV calibration routines with CPUID-based calibration
x86/acrn: Mark TSC frequency as known when using ACRN for calibration
clocksource: hyper-v: Register sched_clock save/restore iff it's
necessary
clocksource: hyper-v: Drop wrappers to sched_clock save/restore
helpers
clocksource: hyper-v: Don't save/restore TSC offset when using HV
sched_clock
x86/kvmclock: Setup kvmclock for secondary CPUs iff CONFIG_SMP=y
x86/kvm: Don't disable kvmclock on BSP in syscore_suspend()
x86/paravirt: Remove unnecessary PARAVIRT=n stub for
paravirt_set_sched_clock()
x86/paravirt: Move handling of unstable PV clocks into
paravirt_set_sched_clock()
x86/kvmclock: Move sched_clock save/restore helpers up in kvmclock.c
x86/xen/time: Nullify x86_platform's sched_clock save/restore hooks
x86/vmware: Nullify save/restore hooks when using VMware's sched_clock
x86/tsc: WARN if TSC sched_clock save/restore used with PV sched_clock
x86/paravirt: Pass sched_clock save/restore helpers during
registration
x86/kvmclock: Move kvm_sched_clock_init() down in kvmclock.c
x86/xen/time: Mark xen_setup_vsyscall_time_info() as __init
x86/pvclock: Mark setup helpers and related various as
__init/__ro_after_init
x86/pvclock: WARN if pvclock's valid_flags are overwritten
x86/kvmclock: Refactor handling of PVCLOCK_TSC_STABLE_BIT during
kvmclock_init()
timekeeping: Resume clocksources before reading persistent clock
x86/kvmclock: Hook clocksource.suspend/resume when kvmclock isn't
sched_clock
x86/kvmclock: WARN if wall clock is read while kvmclock is suspended
x86/kvmclock: Enable kvmclock on APs during onlining if kvmclock isn't
sched_clock
x86/paravirt: Mark __paravirt_set_sched_clock() as __init
x86/paravirt: Plumb a return code into __paravirt_set_sched_clock()
x86/paravirt: Don't use a PV sched_clock in CoCo guests with trusted
TSC
x86/tsc: Pass KNOWN_FREQ and RELIABLE as params to registration
x86/tsc: Rejects attempts to override TSC calibration with lesser
routine
x86/kvmclock: Mark TSC as reliable when it's constant and nonstop
x86/kvmclock: Get local APIC bus frequency from PV CPUID Timing Info
x86/kvmclock: Use TSC for sched_clock if it's constant and non-stop
x86/paravirt: kvmclock: Setup kvmclock early iff it's sched_clock
x86/paravirt: Move using_native_sched_clock() stub into timer.h
x86/tsc: Add standalone helper for getting CPU frequency from CPUID
x86/kvmclock: Get CPU base frequency from CPUID when it's available
Documentation/virt/kvm/x86/cpuid.rst | 12 ++
arch/x86/coco/sev/core.c | 9 +-
arch/x86/coco/tdx/tdx.c | 27 ++-
arch/x86/include/asm/kvm_para.h | 12 +-
arch/x86/include/asm/tdx.h | 2 +
arch/x86/include/asm/timer.h | 18 +-
arch/x86/include/asm/tsc.h | 20 +++
arch/x86/include/asm/x86_init.h | 2 -
arch/x86/include/uapi/asm/kvm_para.h | 11 ++
arch/x86/kernel/cpu/acrn.c | 5 +-
arch/x86/kernel/cpu/mshyperv.c | 69 +-------
arch/x86/kernel/cpu/vmware.c | 11 +-
arch/x86/kernel/jailhouse.c | 6 +-
arch/x86/kernel/kvm.c | 62 +++++--
arch/x86/kernel/kvmclock.c | 250 +++++++++++++++++++--------
arch/x86/kernel/pvclock.c | 9 +-
arch/x86/kernel/smpboot.c | 3 +-
arch/x86/kernel/tsc.c | 184 +++++++++++++++-----
arch/x86/kernel/x86_init.c | 1 -
arch/x86/mm/mem_encrypt_amd.c | 3 -
arch/x86/xen/time.c | 13 +-
drivers/clocksource/hyperv_timer.c | 38 ++--
include/clocksource/hyperv_timer.h | 2 -
kernel/time/timekeeping.c | 9 +-
24 files changed, 537 insertions(+), 241 deletions(-)
base-commit: 1196e304db58189264bb5953b4e8da7e90cda615
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply
* Re: [PATCH V3 09/11] x86/hyperv: Implement Hyper-V virtual IOMMU
From: Jason Gunthorpe @ 2026-05-15 18:23 UTC (permalink / raw)
To: Mukesh R
Cc: hpa, robin.murphy, robh, wei.liu, mhklinux, muislam, namjain,
magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch, kys, haiyangz, decui, longli, tglx, mingo,
bp, dave.hansen, x86, joro, will, lpieralisi, kwilczynski,
bhelgaas, arnd, jacob.pan
In-Reply-To: <20260512020259.1678627-10-mrathor@linux.microsoft.com>
On Mon, May 11, 2026 at 07:02:57PM -0700, Mukesh R wrote:
> +static struct iommu_domain *hv_iommu_domain_alloc_paging(struct device *dev)
> +{
> + struct hv_domain *hvdom;
> + int rc;
> +
> + if (hv_l1vh_partition() && !hv_curr_thread_is_vmm()) {
> + pr_err("Hyper-V: l1vh iommu does not support host devices\n");
> + return NULL;
> + }
> +
> + hvdom = kzalloc(sizeof(struct hv_domain), GFP_KERNEL);
> + if (hvdom == NULL)
> + return NULL;
> +
> + spin_lock_init(&hvdom->mappings_lock);
> + hvdom->mappings_tree = RB_ROOT_CACHED;
> +
> + /* Called under iommu group mutex, so single threaded */
> + if (++unique_id == HV_DEVICE_DOMAIN_ID_S2_NULL) /* ie, UINTMAX */
> + goto out_err;
> +
> + hvdom->domid_num = unique_id;
> + hvdom->partid = hv_get_current_partid();
> + hvdom->iommu_dom.geometry = default_geometry;
> + hvdom->iommu_dom.pgsize_bitmap = HV_IOMMU_PGSIZES;
> +
> + /* For guests, by default we do direct attaches, so no domain in hyp */
> + if (hv_dom_owner_is_vmm(hvdom) && !hv_no_attdev)
> + hvdom->attached_dom = true;
What are you thinking sending something like this?!?!?
The function is called *alloc domain PAGING*, it does not, and can not
allocate weird "special" domains that are not PAGING domains. I just
spent a long time removing all this kind of crazyness from drivers.
There is alot of other things I don't like in this patch, but this is
too much.
You have to drop this "direct attach" idea from the first iteration,
Linux can't do it without alot more work, you should start with the
basic paging domain mode.
Jason
^ permalink raw reply
* RE: [PATCH v1 4/4] iommu/hyperv: Add page-selective IOTLB flush support
From: Michael Kelley @ 2026-05-15 18:00 UTC (permalink / raw)
To: Yu Zhang, Michael Kelley
Cc: linux-kernel@vger.kernel.org, linux-hyperv@vger.kernel.org,
iommu@lists.linux.dev, linux-pci@vger.kernel.org,
linux-arch@vger.kernel.org, wei.liu@kernel.org, kys@microsoft.com,
haiyangz@microsoft.com, decui@microsoft.com, longli@microsoft.com,
joro@8bytes.org, will@kernel.org, robin.murphy@arm.com,
bhelgaas@google.com, kwilczynski@kernel.org,
lpieralisi@kernel.org, mani@kernel.org, robh@kernel.org,
arnd@arndb.de, jgg@ziepe.ca, jacob.pan@linux.microsoft.com,
tgopinath@linux.microsoft.com,
easwar.hariharan@linux.microsoft.com
In-Reply-To: <7wil6tzqp74gdvhyjvpv47zhfernncs42wnfoueznneluz5zrp@pzr63lhy7s5f>
From: Yu Zhang <zhangyu1@linux.microsoft.com> Sent: Friday, May 15, 2026 9:24 AM
>
> On Thu, May 14, 2026 at 06:14:22PM +0000, Michael Kelley wrote:
> > From: Yu Zhang <zhangyu1@linux.microsoft.com> Sent: Monday, May 11, 2026 9:24 AM
> > >
[....]
> > > + unsigned long nr_pages = end_pfn - start_pfn;
> > > + u16 count = 0;
> > > +
> > > + while (nr_pages > 0) {
> > > + unsigned long flush_pages;
> > > + int order;
> > > + unsigned long pfn_align;
> > > + unsigned long size_align;
> > > +
> > > + if (count >= HV_IOMMU_MAX_FLUSH_VA_COUNT) {
> > > + count = HV_IOMMU_FLUSH_VA_OVERFLOW;
> > > + break;
> > > + }
> > > +
> > > + if (start_pfn)
> > > + pfn_align = __ffs(start_pfn);
> >
> > I don't understand why __ffs() is correct here. I would expect
> > __fls() so it is consistent with the calculation of size_align. But I
> > can only surmise how the hypercall works since there's no
> > documentation, so maybe my understanding of the hypercall is
> > wrong. If __ffs really is correct, a comment explaining why
> > would help. :-)
> >
>
> The use of __ffs() is intentional. Each flush entry invalidates a
> naturally aligned 2^N page block, and the hypervisor requires the
> page_number to be aligned to 2^page_mask_shift.
>
> Here __ffs() and __fls() serve different purposes:
> - __ffs(start_pfn) is about the alignment constraint, e.g., how
> large a block can this address support?
> - __fls(nr_pages) is about the size constraint, e.g., how large
> a block can the remaining range hold?
>
> Taking min() of both ensures each entry is both properly aligned
> and within bounds.
>
> Thanks for raising this - it definitely deserves a comment. I had to
> stare at it for a while myself to remember why. :)
Hmmm. Something about this still nags at me. I'll run some
experiments to either convince myself that you are right, or to
come up with a counterexample.
A related thought occurred to me. If each flush entry that is passed
to Hyper-V describes a naturally aligned 2^N page block, I don't
think the HV_IOMMU_MAX_FLUSH_VA_COUNT can ever
be reached. The number of entries is limited by the number of
bits in a PFN and the pages count, both of which are 64. And with
52 bit physical addressing and 4KiB pages, the actual size of
a PFN and pages count is even smaller than 64.
HV_IOMMU_MAX_FLUSH_VA_COUNT is the number of 8 byte
union hv_iommu_flush_va entries that fit in a 4KiB page, so
it's ~500.
My statement applies to a single flush range. If multiple flush
ranges were strung together in a single hypercall, a larger count
could be reached, but hv_flush_device_domain_list() only does
a single range. So I don't think the overflow case in
hv_flush_device_domain_list() can ever happen. But let me
do my experiments, and I will also look at this aspect to confirm
if it's right.
>
> > > + else
> > > + pfn_align = BITS_PER_LONG - 1;
> > > +
> > > + size_align = __fls(nr_pages);
> > > + order = min(pfn_align, size_align);
> > > + iova_list[count].page_mask_shift = order;
> > > + iova_list[count].page_number = start_pfn;
> > > +
> > > + flush_pages = 1UL << order;
> > > + start_pfn += flush_pages;
> > > + nr_pages -= flush_pages;
> > > + count++;
> > > + }
> > > +
> > > + return count;
> > > +}
> > > +
> > > +static void hv_flush_device_domain_list(struct hv_iommu_domain *hv_domain,
> > > + struct iommu_iotlb_gather *iotlb_gather)
> > > +{
> > > + u64 status;
> > > + u16 count;
> > > + unsigned long flags;
> > > + struct hv_input_flush_device_domain_list *input;
> > > +
> > > + local_irq_save(flags);
> > > +
> > > + input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > > + memset(input, 0, sizeof(*input));
> > > +
> > > + input->device_domain = hv_domain->device_domain;
> > > + input->flags |= HV_FLUSH_DEVICE_DOMAIN_LIST_IOMMU_FORMAT;
> >
> > I would suggest moving the memset() and setting the input fields down
> > under the "else" below so that they are parallel with the flush all case.
> >
>
> I agree the structure should be more symmetric. Yet I guess the memset and
> hv_iommu_fill_iova_list() need to stay before the branch since the fill
> writes directly into input->iova_list[]. :)
Agreed.
>
> > > + count = hv_iommu_fill_iova_list(input->iova_list,
> > > + iotlb_gather->start,
> > > + iotlb_gather->end);
> > > + if (count == HV_IOMMU_FLUSH_VA_OVERFLOW) {
> > > + /*
> > > + * Range exceeds hypercall page capacity. Fall back to a full
> > > + * domain flush.
> > > + */
> > > + struct hv_input_flush_device_domain *flush_all = (void *)input;
> > > +
> > > + memset(flush_all, 0, sizeof(*flush_all));
> > > + flush_all->device_domain = hv_domain->device_domain;
> > > + status = hv_do_hypercall(HVCALL_FLUSH_DEVICE_DOMAIN,
> > > + flush_all, NULL);
> > > + } else {
> > > + status = hv_do_rep_hypercall(
> > > + HVCALL_FLUSH_DEVICE_DOMAIN_LIST,
> > > + count, 0, input, NULL);
> > > + }
> > > +
> > > + local_irq_restore(flags);
> > > +
> > > + if (!hv_result_success(status))
> > > + pr_err("HVCALL_FLUSH_DEVICE_DOMAIN_LIST failed, status %lld\n", status);
> >
> > As Sashiko pointed out, a failure here can lead to all kinds of trouble because
> > of leaving unflushed entries. Maybe a WARN() is more appropriate? Also, maybe
> > a failure in the list flush should try a flush all as a fallback, with the WARN()
> > only if the flush all fails.
> >
>
> Good idea. How about we restructure this routine to sth. like this:
>
>
> memset(input, 0, sizeof(*input));
> count = hv_iommu_fill_iova_list(...);
>
> if (count != HV_IOMMU_FLUSH_VA_OVERFLOW) {
> input->device_domain = ...;
> ...
> status = hv_do_rep_hypercall(FLUSH_DEVICE_DOMAIN_LIST, ...);
> if (hv_result_success(status))
> goto out;
> }
>
> /* overflow or list flush failed: fallback to full domain flush */
> flush_all = (void *)input;
> memset(flush_all, 0, sizeof(*flush_all));
> flush_all->device_domain = ...;
> status = hv_do_hypercall(FLUSH_DEVICE_DOMAIN, ...);
> WARN(!hv_result_success(status), "IOTLB flush failed, status %lld\n", status);
>
> out:
> local_irq_restore(flags);
>
Yes, I think this works. But per my earlier comment, if I'm right that
the overflow case never occurs, it could be simplified further to just
do the list flush with the full flush as the error fallback. Then WARN
if the full flush fails.
Michael
^ permalink raw reply
* RE: [PATCH v1 3/4] iommu/hyperv: Add para-virtualized IOMMU support for Hyper-V guest
From: Michael Kelley @ 2026-05-15 17:36 UTC (permalink / raw)
To: Yu Zhang, Michael Kelley
Cc: linux-kernel@vger.kernel.org, linux-hyperv@vger.kernel.org,
iommu@lists.linux.dev, linux-pci@vger.kernel.org,
linux-arch@vger.kernel.org, wei.liu@kernel.org, kys@microsoft.com,
haiyangz@microsoft.com, decui@microsoft.com, longli@microsoft.com,
joro@8bytes.org, will@kernel.org, robin.murphy@arm.com,
bhelgaas@google.com, kwilczynski@kernel.org,
lpieralisi@kernel.org, mani@kernel.org, robh@kernel.org,
arnd@arndb.de, jgg@ziepe.ca, jacob.pan@linux.microsoft.com,
tgopinath@linux.microsoft.com,
easwar.hariharan@linux.microsoft.com, Mukesh R
In-Reply-To: <fw2pruvjgo7yigtcxssf3xv27soibsj6hmw2ls5wj4rylfhdha@e63f32cwu2x5>
From: Yu Zhang <zhangyu1@linux.microsoft.com> Sent: Friday, May 15, 2026 9:54 AM
>
> On Fri, May 15, 2026 at 02:51:38PM +0000, Michael Kelley wrote:
> > From: Yu Zhang <zhangyu1@linux.microsoft.com> Sent: Friday, May 15, 2026 7:00 AM
> > >
> > > On Thu, May 14, 2026 at 06:13:24PM +0000, Michael Kelley wrote:
> > > > From: Yu Zhang <zhangyu1@linux.microsoft.com> Sent: Monday, May 11, 2026 9:24 AM
[....]
> > > >
> > > > Previous versions of this function did hv_iommu_detach_dev(). With that call
> > > > removed from here, hv_iommu_detach_dev() is only called when attaching a
> > > > domain to a device that already has a domain attached. Is it the case that
> > > > Hyper-V doesn't require the detach as a cleanup step?
> > > >
> > >
> > > The IOMMU core attaches the device to release_domain (our blocking domain)
> > > before calling release_device(), so I believe the explicit detach in the RFC
> > > was redundant. I simply didn't realize that at the time.
> > >
> >
> > Got it. But after the IOMMU core attaches the device to the blocking
> > domain, there's the possibility that the vPCI device is rescinded by
> > Hyper-V and it goes away entirely. Or the device might be subjected
> > to an "unbind/bind" cycle in Linux. Does the detach need to be done
> > on the blocking domain in such cases? In this version of the patches, the
> > Hyper-V "attach" and "detach" hypercalls still end up unbalanced. That
> > seems a bit untidy at best, and I wonder if there are scenarios where
> > Hyper-V will complain about the lack of balance.
> >
>
> Thank you, Michael. May I ask what "the vPCI device is rescinded by
> Hyper-V and it goes away entirely" mean?
>
See the documentation at Documentation/virt/hyperv/vpci.rst in a
kernel source code tree, and particularly the section entitled "PCI Device
Removal". Such removals can and do happen in running Azure guest
VMs. Start with that info and then I'll do my best to answer follow-up
questions you may have.
The unbind/bind case is separate, but has some of the same effects in
that Linux should be removing all setup of the PCI device. There's actually
two unbind steps -- one to unbind the device-specific driver (e.g., the
Mellanox MLX5 driver or the NMVe driver) driver from the device, and
potentially a second to unbind the VMBus vPCI driver from the device.
These unbind/bind sequences can be done in the Linux guest without
the Hyper-V host rescinding the device.
Michael
^ permalink raw reply
* Re: [PATCH 0/9] drm: Limit DRM_IOCTL_WAIT_VBLANK to vblank interrupts
From: Michel Dänzer @ 2026-05-15 16:56 UTC (permalink / raw)
To: Thomas Zimmermann, simona, airlied, pekka.paalanen, jadahl,
contact, maarten.lankhorst, mripard
Cc: amd-gfx, dri-devel, linux-hyperv, virtualization, spice-devel,
wayland-devel
In-Reply-To: <b5d03921-1e6f-4c4f-900e-fc9e28222176@mailbox.org>
[ Adding the wayland-devel list for awareness ]
On 5/15/26 17:12, Michel Dänzer wrote:
> On 5/15/26 13:55, Thomas Zimmermann wrote:
>> DRM's WAIT_VBLANK ioctl synchronizes user-space clients to display
>> refresh. This is meaningless with vblank timers, which run unrelated
>> to the hardware's vblank.
>>
>> Disable the ioctl for simulated vblanks. Set DRM_VBLANK_FLAG_SIMULATED
>> for CRTCs with simulated vblank events in all such drivers. The vblank
>> timers of these devices still rate-limit the number of page-flip events
>> to match the display refresh.
>>
>> According to maintainers, user-space compositors do not require the ioctl
>> for rate-limitting display output. Weston and Kwin rely on page-flip
>> events. Mutter uses and internal timer to limit the number of display
>> updates per second.
>
> Actually mutter fundamentally relies on atomic commit completion events for that, same as Weston & KWin. Mutter uses the WAIT_VBLANK ioctl only for minimizing input → output latency (which can hide issues when completion of atomic commits isn't properly throttled).
>
>
> (Just a side not on the cover letter, no objections to the patches themselves)
After more discussion on IRC, I have some concerns.
The big one first: For drivers with no strict refresh cycle (i.e. an atomic commit can take effect more or less anytime after at least one "refresh cycle" has passed since the last one), does this change really make sense / what's the actual benefit?
In the case of the asahi & nvidia drivers, the problem with exposing this functionality to user space is that if the timestamps aren't accurate, it can result in missing display refresh cycles, which are dictated by hardware. That's why it makes sense to reject it there.
When there's no strict refresh cycle, that issue doesn't apply though.
Any changes made to the WAIT_VBLANK ioctl should also be made to the CRTC_GET_SEQUENCE / CRTC_QUEUE_SEQUENCE ioctls, which are slightly different UAPI for the same functionality.
--
Earthling Michel Dänzer \ GNOME / Xwayland / Mesa developer
https://redhat.com \ Libre software enthusiast
^ permalink raw reply
* Re: [PATCH v1 3/4] iommu/hyperv: Add para-virtualized IOMMU support for Hyper-V guest
From: Yu Zhang @ 2026-05-15 16:53 UTC (permalink / raw)
To: Michael Kelley
Cc: linux-kernel@vger.kernel.org, linux-hyperv@vger.kernel.org,
iommu@lists.linux.dev, linux-pci@vger.kernel.org,
linux-arch@vger.kernel.org, wei.liu@kernel.org, kys@microsoft.com,
haiyangz@microsoft.com, decui@microsoft.com, longli@microsoft.com,
joro@8bytes.org, will@kernel.org, robin.murphy@arm.com,
bhelgaas@google.com, kwilczynski@kernel.org,
lpieralisi@kernel.org, mani@kernel.org, robh@kernel.org,
arnd@arndb.de, jgg@ziepe.ca, jacob.pan@linux.microsoft.com,
tgopinath@linux.microsoft.com,
easwar.hariharan@linux.microsoft.com, Mukesh R
In-Reply-To: <SN6PR02MB415734108A86BDFB66AEE4CED4042@SN6PR02MB4157.namprd02.prod.outlook.com>
On Fri, May 15, 2026 at 02:51:38PM +0000, Michael Kelley wrote:
> From: Yu Zhang <zhangyu1@linux.microsoft.com> Sent: Friday, May 15, 2026 7:00 AM
> >
> > On Thu, May 14, 2026 at 06:13:24PM +0000, Michael Kelley wrote:
> > > From: Yu Zhang <zhangyu1@linux.microsoft.com> Sent: Monday, May 11, 2026 9:24 AM
> > > >
> > > > Add a para-virtualized IOMMU driver for Linux guests running on Hyper-V.
> > > > This driver implements stage-1 IO translation within the guest OS.
> > > > It integrates with the Linux IOMMU core, utilizing Hyper-V hypercalls
> > > > for:
> > > > - Capability discovery
> > > > - Domain allocation, configuration, and deallocation
> > > > - Device attachment and detachment
> > > > - IOTLB invalidation
> > > >
> > > > The driver constructs x86-compatible stage-1 IO page tables in the
> > > > guest memory using consolidated IO page table helpers. This allows
> > > > the guest to manage stage-1 translations independently of vendor-
> > > > specific drivers (like Intel VT-d or AMD IOMMU).
> > > >
> > > > Hyper-V consumes this stage-1 IO page table when a device domain is
> > > > created and configured, and nests it with the host's stage-2 IO page
> > > > tables, therefore eliminating the VM exits for guest IOMMU mapping
> > > > operations. For unmapping operations, VM exits to perform the IOTLB
> > > > flush are still unavoidable.
> > > >
> > > > Hyper-V identifies each PCI pass-thru device by a logical device ID
> > > > in its hypercall interface. The vPCI driver (pci-hyperv) registers the
> > > > per-bus portion of this ID with the pvIOMMU driver during bus probe.
> > > > The pvIOMMU driver stores this mapping and combines it with the function
> > > > number of the endpoint PCI device to form the complete ID for hypercalls.
> > >
> > > As you are probably aware, Mukesh's patch series to support PCI
> > > pass-thru devices also needs to get the logical device ID. Maybe the
> > > registration mechanism needs to move somewhere that can be shared
> > > with his code.
> > >
> >
> > Thank you so much for the review, Michael!
> >
> > Yes, I looked at Mukesh's series and noticed his hv_pci_vmbus_device_id()
> > in pci-hyperv.c has the same dev_instance byte manipulation. We do need
> > a common registration mechanism.
> >
> > Any suggestion on where to put it? drivers/hv/hv_common.c seems like a
> > natural place, but the register/lookup functions are currently only
> > meaningful when CONFIG_HYPERV_PVIOMMU is set. If Mukesh's pass-thru
> > code also needs them, we might need a new shared Kconfig option that
> > both can select. Open to better ideas.
>
> Unfortunately, I have not looked at Mukesh's series in detail yet, so
> I don't have enough knowledge of the full situation to offer a good
> recommendation.
>
Sorry I forgot to Cc Mukesh in the previous reply. :(
@Mukesh, any thoughts on sharing the logical device ID registration mechanism?
> >
> > [...]
> >
> > > > +static void hv_flush_device_domain(struct hv_iommu_domain *hv_domain)
> > > > +{
> > > > + u64 status;
> > > > + unsigned long flags;
> > > > + struct hv_input_flush_device_domain *input;
> > > > +
> > > > + local_irq_save(flags);
> > > > +
> > > > + input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > > > + memset(input, 0, sizeof(*input));
> > > > + input->device_domain = hv_domain->device_domain;
> > >
> > > The previous version of this patch had code to set several other fields in
> > > the input. I wanted to confirm that not setting them in this version is
> > > intentional. Were they not needed?
> > >
> >
> > Oh. The RFC v1 set partition_id, owner_vtl, domain_id.type, and domain_id.id
> > individually. In this version, I just simplified it to a struct assignment.
> > No functional change.
>
> Of course! I should have looked more closely at the details before making
> this comment. :-(
>
> [...]
>
> > >
> > > Previous versions of this function did hv_iommu_detach_dev(). With that call
> > > removed from here, hv_iommu_detach_dev() is only called when attaching a
> > > domain to a device that already has a domain attached. Is it the case that
> > > Hyper-V doesn't require the detach as a cleanup step?
> > >
> >
> > The IOMMU core attaches the device to release_domain (our blocking domain)
> > before calling release_device(), so I believe the explicit detach in the RFC
> > was redundant. I simply didn't realize that at the time.
> >
>
> Got it. But after the IOMMU core attaches the device to the blocking
> domain, there's the possibility that the vPCI device is rescinded by
> Hyper-V and it goes away entirely. Or the device might be subjected
> to an "unbind/bind" cycle in Linux. Does the detach need to be done
> on the blocking domain in such cases? In this version of the patches, the
> Hyper-V "attach" and "detach" hypercalls still end up unbalanced. That
> seems a bit untidy at best, and I wonder if there are scenarios where
> Hyper-V will complain about the lack of balance.
>
Thank you, Michael. May I ask what "the vPCI device is rescinded by
Hyper-V and it goes away entirely" mean?
I realized it's a bit untidy. But I want to understand this issue more
clearly first. :)
B.R.
Yu
^ permalink raw reply
* Re: [PATCH v1 4/4] iommu/hyperv: Add page-selective IOTLB flush support
From: Yu Zhang @ 2026-05-15 16:23 UTC (permalink / raw)
To: Michael Kelley
Cc: linux-kernel@vger.kernel.org, linux-hyperv@vger.kernel.org,
iommu@lists.linux.dev, linux-pci@vger.kernel.org,
linux-arch@vger.kernel.org, wei.liu@kernel.org, kys@microsoft.com,
haiyangz@microsoft.com, decui@microsoft.com, longli@microsoft.com,
joro@8bytes.org, will@kernel.org, robin.murphy@arm.com,
bhelgaas@google.com, kwilczynski@kernel.org,
lpieralisi@kernel.org, mani@kernel.org, robh@kernel.org,
arnd@arndb.de, jgg@ziepe.ca, jacob.pan@linux.microsoft.com,
tgopinath@linux.microsoft.com,
easwar.hariharan@linux.microsoft.com
In-Reply-To: <SN6PR02MB41577D5EEC884EAE8AF5E14ED4072@SN6PR02MB4157.namprd02.prod.outlook.com>
On Thu, May 14, 2026 at 06:14:22PM +0000, Michael Kelley wrote:
> From: Yu Zhang <zhangyu1@linux.microsoft.com> Sent: Monday, May 11, 2026 9:24 AM
> >
> > Add page-selective IOTLB flush using HVCALL_FLUSH_DEVICE_DOMAIN_LIST.
> > This hypercall accepts a list of (page_number, page_mask_shift) entries,
> > enabling finer-grained IOTLB invalidation compared to the domain-wide
> > HVCALL_FLUSH_DEVICE_DOMAIN used by hv_iommu_flush_iotlb_all().
> >
> > hv_iommu_fill_iova_list() decomposes a contiguous IOVA range into a
> > minimal set of aligned power-of-two regions that fit in a single
> > hypercall input page. When the range exceeds the page capacity, the
> > code falls back to a full domain flush automatically.
> >
> > Signed-off-by: Yu Zhang <zhangyu1@linux.microsoft.com>
> > Signed-off-by: Easwar Hariharan <easwar.hariharan@linux.microsoft.com>
> > ---
> > drivers/iommu/hyperv/iommu.c | 91 +++++++++++++++++++++++++++++++++++-
> > include/hyperv/hvgdk_mini.h | 1 +
> > include/hyperv/hvhdk_mini.h | 17 +++++++
> > 3 files changed, 108 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/iommu/hyperv/iommu.c b/drivers/iommu/hyperv/iommu.c
> > index e5fc625314b5..3bca362b7815 100644
> > --- a/drivers/iommu/hyperv/iommu.c
> > +++ b/drivers/iommu/hyperv/iommu.c
> > @@ -486,10 +486,98 @@ static void hv_iommu_flush_iotlb_all(struct iommu_domain *domain)
> > hv_flush_device_domain(to_hv_iommu_domain(domain));
> > }
> >
> > +/* Max number of iova_list entries in a single hypercall input page. */
> > +#define HV_IOMMU_MAX_FLUSH_VA_COUNT \
> > + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_flush_device_domain_list)) / \
> > + sizeof(union hv_iommu_flush_va))
> > +
> > +/* Returned by hv_iommu_fill_iova_list() when the range exceeds the capacity */
> > +#define HV_IOMMU_FLUSH_VA_OVERFLOW U16_MAX
> > +
> > +static inline u16 hv_iommu_fill_iova_list(union hv_iommu_flush_va *iova_list,
> > + unsigned long start,
> > + unsigned long end)
> > +{
> > + unsigned long start_pfn = start >> PAGE_SHIFT;
> > + unsigned long end_pfn = PAGE_ALIGN(end) >> PAGE_SHIFT;
>
> "end" is an inclusive end address per comment in struct iommu_iotlb_gather.
> So a page aligned value would typically have 0xFFF as the low order 12 bits,
> and PAGE_ALIGN() will do the right thing. But I don't think the value is
> *required* to be page aligned. If the value of "end" had 0x000 as the
> low order 12 bits, the above calculation would fail to include the page
> that has the address ending in 0x000. I think it needs to be
> PAGE_ALIGN(end + 1) in order to work correctly for this corner case.
>
Good catch! Will use HVPFN_DOWN(start) and HVPFN_UP(end + 1) as you
suggested in your follow-up mail.
> > + unsigned long nr_pages = end_pfn - start_pfn;
> > + u16 count = 0;
> > +
> > + while (nr_pages > 0) {
> > + unsigned long flush_pages;
> > + int order;
> > + unsigned long pfn_align;
> > + unsigned long size_align;
> > +
> > + if (count >= HV_IOMMU_MAX_FLUSH_VA_COUNT) {
> > + count = HV_IOMMU_FLUSH_VA_OVERFLOW;
> > + break;
> > + }
> > +
> > + if (start_pfn)
> > + pfn_align = __ffs(start_pfn);
>
> I don't understand why __ffs() is correct here. I would expect
> __fls() so it is consistent with the calculation of size_align. But I
> can only surmise how the hypercall works since there's no
> documentation, so maybe my understanding of the hypercall is
> wrong. If __ffs really is correct, a comment explaining why
> would help. :-)
>
The use of __ffs() is intentional. Each flush entry invalidates a
naturally aligned 2^N page block, and the hypervisor requires the
page_number to be aligned to 2^page_mask_shift.
Here __ffs() and __fls() serve different purposes:
- __ffs(start_pfn) is about the alignment constraint, e.g., how
large a block can this address support?
- __fls(nr_pages) is about the size constraint, e.g., how large
a block can the remaining range hold?
Taking min() of both ensures each entry is both properly aligned
and within bounds.
Thanks for raising this — it definitely deserves a comment. I had to
stare at it for a while myself to remember why. :)
> > + else
> > + pfn_align = BITS_PER_LONG - 1;
> > +
> > + size_align = __fls(nr_pages);
> > + order = min(pfn_align, size_align);
> > + iova_list[count].page_mask_shift = order;
> > + iova_list[count].page_number = start_pfn;
> > +
> > + flush_pages = 1UL << order;
> > + start_pfn += flush_pages;
> > + nr_pages -= flush_pages;
> > + count++;
> > + }
> > +
> > + return count;
> > +}
> > +
> > +static void hv_flush_device_domain_list(struct hv_iommu_domain *hv_domain,
> > + struct iommu_iotlb_gather *iotlb_gather)
> > +{
> > + u64 status;
> > + u16 count;
> > + unsigned long flags;
> > + struct hv_input_flush_device_domain_list *input;
> > +
> > + local_irq_save(flags);
> > +
> > + input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > + memset(input, 0, sizeof(*input));
> > +
> > + input->device_domain = hv_domain->device_domain;
> > + input->flags |= HV_FLUSH_DEVICE_DOMAIN_LIST_IOMMU_FORMAT;
>
> I would suggest moving the memset() and setting the input fields down
> under the "else" below so that they are parallel with the flush all case.
>
I agree the structure should be more symmetric. Yet I guess the memset and
hv_iommu_fill_iova_list() need to stay before the branch since the fill
writes directly into input->iova_list[]. :)
> > + count = hv_iommu_fill_iova_list(input->iova_list,
> > + iotlb_gather->start,
> > + iotlb_gather->end);
> > + if (count == HV_IOMMU_FLUSH_VA_OVERFLOW) {
> > + /*
> > + * Range exceeds hypercall page capacity. Fall back to a full
> > + * domain flush.
> > + */
> > + struct hv_input_flush_device_domain *flush_all = (void *)input;
> > +
> > + memset(flush_all, 0, sizeof(*flush_all));
> > + flush_all->device_domain = hv_domain->device_domain;
> > + status = hv_do_hypercall(HVCALL_FLUSH_DEVICE_DOMAIN,
> > + flush_all, NULL);
> > + } else {
> > + status = hv_do_rep_hypercall(
> > + HVCALL_FLUSH_DEVICE_DOMAIN_LIST,
> > + count, 0, input, NULL);
> > + }
> > +
> > + local_irq_restore(flags);
> > +
> > + if (!hv_result_success(status))
> > + pr_err("HVCALL_FLUSH_DEVICE_DOMAIN_LIST failed, status %lld\n", status);
>
> As Sashiko pointed out, a failure here can lead to all kinds of trouble because
> of leaving unflushed entries. Maybe a WARN() is more appropriate? Also, maybe
> a failure in the list flush should try a flush all as a fallback, with the WARN()
> only if the flush all fails.
>
Good idea. How about we restructure this routine to sth. like this:
memset(input, 0, sizeof(*input));
count = hv_iommu_fill_iova_list(...);
if (count != HV_IOMMU_FLUSH_VA_OVERFLOW) {
input->device_domain = ...;
...
status = hv_do_rep_hypercall(FLUSH_DEVICE_DOMAIN_LIST, ...);
if (hv_result_success(status))
goto out;
}
/* overflow or list flush failed: fallback to full domain flush */
flush_all = (void *)input;
memset(flush_all, 0, sizeof(*flush_all));
flush_all->device_domain = ...;
status = hv_do_hypercall(FLUSH_DEVICE_DOMAIN, ...);
WARN(!hv_result_success(status), "IOTLB flush failed, status %lld\n", status);
out:
local_irq_restore(flags);
B.R.
Yu
^ permalink raw reply
* Re: [PATCH 0/9] drm: Limit DRM_IOCTL_WAIT_VBLANK to vblank interrupts
From: Michel Dänzer @ 2026-05-15 15:12 UTC (permalink / raw)
To: Thomas Zimmermann, simona, airlied, pekka.paalanen, jadahl,
contact, maarten.lankhorst, mripard
Cc: amd-gfx, dri-devel, linux-hyperv, virtualization, spice-devel
In-Reply-To: <20260515120916.333614-1-tzimmermann@suse.de>
On 5/15/26 13:55, Thomas Zimmermann wrote:
> DRM's WAIT_VBLANK ioctl synchronizes user-space clients to display
> refresh. This is meaningless with vblank timers, which run unrelated
> to the hardware's vblank.
>
> Disable the ioctl for simulated vblanks. Set DRM_VBLANK_FLAG_SIMULATED
> for CRTCs with simulated vblank events in all such drivers. The vblank
> timers of these devices still rate-limit the number of page-flip events
> to match the display refresh.
>
> According to maintainers, user-space compositors do not require the ioctl
> for rate-limitting display output. Weston and Kwin rely on page-flip
> events. Mutter uses and internal timer to limit the number of display
> updates per second.
Actually mutter fundamentally relies on atomic commit completion events for that, same as Weston & KWin. Mutter uses the WAIT_VBLANK ioctl only for minimizing input → output latency (which can hide issues when completion of atomic commits isn't properly throttled).
(Just a side not on the cover letter, no objections to the patches themselves)
--
Earthling Michel Dänzer \ GNOME / Xwayland / Mesa developer
https://redhat.com \ Libre software enthusiast
^ permalink raw reply
* RE: [PATCH v1 3/4] iommu/hyperv: Add para-virtualized IOMMU support for Hyper-V guest
From: Michael Kelley @ 2026-05-15 14:51 UTC (permalink / raw)
To: Yu Zhang, Michael Kelley
Cc: linux-kernel@vger.kernel.org, linux-hyperv@vger.kernel.org,
iommu@lists.linux.dev, linux-pci@vger.kernel.org,
linux-arch@vger.kernel.org, wei.liu@kernel.org, kys@microsoft.com,
haiyangz@microsoft.com, decui@microsoft.com, longli@microsoft.com,
joro@8bytes.org, will@kernel.org, robin.murphy@arm.com,
bhelgaas@google.com, kwilczynski@kernel.org,
lpieralisi@kernel.org, mani@kernel.org, robh@kernel.org,
arnd@arndb.de, jgg@ziepe.ca, jacob.pan@linux.microsoft.com,
tgopinath@linux.microsoft.com,
easwar.hariharan@linux.microsoft.com
In-Reply-To: <qeyycsdnejwrqle4zwrvkjvkvrpjifeanwxjaa7i7y2ab7rnt2@b6gvugqayarg>
From: Yu Zhang <zhangyu1@linux.microsoft.com> Sent: Friday, May 15, 2026 7:00 AM
>
> On Thu, May 14, 2026 at 06:13:24PM +0000, Michael Kelley wrote:
> > From: Yu Zhang <zhangyu1@linux.microsoft.com> Sent: Monday, May 11, 2026 9:24 AM
> > >
> > > Add a para-virtualized IOMMU driver for Linux guests running on Hyper-V.
> > > This driver implements stage-1 IO translation within the guest OS.
> > > It integrates with the Linux IOMMU core, utilizing Hyper-V hypercalls
> > > for:
> > > - Capability discovery
> > > - Domain allocation, configuration, and deallocation
> > > - Device attachment and detachment
> > > - IOTLB invalidation
> > >
> > > The driver constructs x86-compatible stage-1 IO page tables in the
> > > guest memory using consolidated IO page table helpers. This allows
> > > the guest to manage stage-1 translations independently of vendor-
> > > specific drivers (like Intel VT-d or AMD IOMMU).
> > >
> > > Hyper-V consumes this stage-1 IO page table when a device domain is
> > > created and configured, and nests it with the host's stage-2 IO page
> > > tables, therefore eliminating the VM exits for guest IOMMU mapping
> > > operations. For unmapping operations, VM exits to perform the IOTLB
> > > flush are still unavoidable.
> > >
> > > Hyper-V identifies each PCI pass-thru device by a logical device ID
> > > in its hypercall interface. The vPCI driver (pci-hyperv) registers the
> > > per-bus portion of this ID with the pvIOMMU driver during bus probe.
> > > The pvIOMMU driver stores this mapping and combines it with the function
> > > number of the endpoint PCI device to form the complete ID for hypercalls.
> >
> > As you are probably aware, Mukesh's patch series to support PCI
> > pass-thru devices also needs to get the logical device ID. Maybe the
> > registration mechanism needs to move somewhere that can be shared
> > with his code.
> >
>
> Thank you so much for the review, Michael!
>
> Yes, I looked at Mukesh's series and noticed his hv_pci_vmbus_device_id()
> in pci-hyperv.c has the same dev_instance byte manipulation. We do need
> a common registration mechanism.
>
> Any suggestion on where to put it? drivers/hv/hv_common.c seems like a
> natural place, but the register/lookup functions are currently only
> meaningful when CONFIG_HYPERV_PVIOMMU is set. If Mukesh's pass-thru
> code also needs them, we might need a new shared Kconfig option that
> both can select. Open to better ideas.
Unfortunately, I have not looked at Mukesh's series in detail yet, so
I don't have enough knowledge of the full situation to offer a good
recommendation.
>
> [...]
>
> > > +static void hv_flush_device_domain(struct hv_iommu_domain *hv_domain)
> > > +{
> > > + u64 status;
> > > + unsigned long flags;
> > > + struct hv_input_flush_device_domain *input;
> > > +
> > > + local_irq_save(flags);
> > > +
> > > + input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > > + memset(input, 0, sizeof(*input));
> > > + input->device_domain = hv_domain->device_domain;
> >
> > The previous version of this patch had code to set several other fields in
> > the input. I wanted to confirm that not setting them in this version is
> > intentional. Were they not needed?
> >
>
> Oh. The RFC v1 set partition_id, owner_vtl, domain_id.type, and domain_id.id
> individually. In this version, I just simplified it to a struct assignment.
> No functional change.
Of course! I should have looked more closely at the details before making
this comment. :-(
[...]
> >
> > Previous versions of this function did hv_iommu_detach_dev(). With that call
> > removed from here, hv_iommu_detach_dev() is only called when attaching a
> > domain to a device that already has a domain attached. Is it the case that
> > Hyper-V doesn't require the detach as a cleanup step?
> >
>
> The IOMMU core attaches the device to release_domain (our blocking domain)
> before calling release_device(), so I believe the explicit detach in the RFC
> was redundant. I simply didn't realize that at the time.
>
Got it. But after the IOMMU core attaches the device to the blocking
domain, there's the possibility that the vPCI device is rescinded by
Hyper-V and it goes away entirely. Or the device might be subjected
to an "unbind/bind" cycle in Linux. Does the detach need to be done
on the blocking domain in such cases? In this version of the patches, the
Hyper-V "attach" and "detach" hypercalls still end up unbalanced. That
seems a bit untidy at best, and I wonder if there are scenarios where
Hyper-V will complain about the lack of balance.
Michael
^ permalink raw reply
* Re: [PATCH v1 3/4] iommu/hyperv: Add para-virtualized IOMMU support for Hyper-V guest
From: Yu Zhang @ 2026-05-15 13:59 UTC (permalink / raw)
To: Michael Kelley
Cc: linux-kernel@vger.kernel.org, linux-hyperv@vger.kernel.org,
iommu@lists.linux.dev, linux-pci@vger.kernel.org,
linux-arch@vger.kernel.org, wei.liu@kernel.org, kys@microsoft.com,
haiyangz@microsoft.com, decui@microsoft.com, longli@microsoft.com,
joro@8bytes.org, will@kernel.org, robin.murphy@arm.com,
bhelgaas@google.com, kwilczynski@kernel.org,
lpieralisi@kernel.org, mani@kernel.org, robh@kernel.org,
arnd@arndb.de, jgg@ziepe.ca, jacob.pan@linux.microsoft.com,
tgopinath@linux.microsoft.com,
easwar.hariharan@linux.microsoft.com
In-Reply-To: <SN6PR02MB4157FB81CC9B6347DCCC8C56D4072@SN6PR02MB4157.namprd02.prod.outlook.com>
On Thu, May 14, 2026 at 06:13:24PM +0000, Michael Kelley wrote:
> From: Yu Zhang <zhangyu1@linux.microsoft.com> Sent: Monday, May 11, 2026 9:24 AM
> >
> > Add a para-virtualized IOMMU driver for Linux guests running on Hyper-V.
> > This driver implements stage-1 IO translation within the guest OS.
> > It integrates with the Linux IOMMU core, utilizing Hyper-V hypercalls
> > for:
> > - Capability discovery
> > - Domain allocation, configuration, and deallocation
> > - Device attachment and detachment
> > - IOTLB invalidation
> >
> > The driver constructs x86-compatible stage-1 IO page tables in the
> > guest memory using consolidated IO page table helpers. This allows
> > the guest to manage stage-1 translations independently of vendor-
> > specific drivers (like Intel VT-d or AMD IOMMU).
> >
> > Hyper-V consumes this stage-1 IO page table when a device domain is
> > created and configured, and nests it with the host's stage-2 IO page
> > tables, therefore eliminating the VM exits for guest IOMMU mapping
> > operations. For unmapping operations, VM exits to perform the IOTLB
> > flush are still unavoidable.
> >
> > Hyper-V identifies each PCI pass-thru device by a logical device ID
> > in its hypercall interface. The vPCI driver (pci-hyperv) registers the
> > per-bus portion of this ID with the pvIOMMU driver during bus probe.
> > The pvIOMMU driver stores this mapping and combines it with the function
> > number of the endpoint PCI device to form the complete ID for hypercalls.
>
> As you are probably aware, Mukesh's patch series to support PCI
> pass-thru devices also needs to get the logical device ID. Maybe the
> registration mechanism needs to move somewhere that can be shared
> with his code.
>
Thank you so much for the review, Michael!
Yes, I looked at Mukesh's series and noticed his hv_pci_vmbus_device_id()
in pci-hyperv.c has the same dev_instance byte manipulation. We do need
a common registration mechanism.
Any suggestion on where to put it? drivers/hv/hv_common.c seems like a
natural place, but the register/lookup functions are currently only
meaningful when CONFIG_HYPERV_PVIOMMU is set. If Mukesh's pass-thru
code also needs them, we might need a new shared Kconfig option that
both can select. Open to better ideas.
Adding Mukesh to the loop. :)
> >
> > Co-developed-by: Wei Liu <wei.liu@kernel.org>
> > Signed-off-by: Wei Liu <wei.liu@kernel.org>
> > Co-developed-by: Easwar Hariharan <easwar.hariharan@linux.microsoft.com>
> > Signed-off-by: Easwar Hariharan <easwar.hariharan@linux.microsoft.com>
> > Signed-off-by: Yu Zhang <zhangyu1@linux.microsoft.com>
> > ---
> > arch/x86/hyperv/hv_init.c | 4 +
> > arch/x86/include/asm/mshyperv.h | 4 +
> > drivers/iommu/hyperv/Kconfig | 17 +
> > drivers/iommu/hyperv/Makefile | 1 +
> > drivers/iommu/hyperv/iommu.c | 705 ++++++++++++++++++++++++++++
> > drivers/iommu/hyperv/iommu.h | 54 +++
> > drivers/pci/controller/pci-hyperv.c | 19 +-
> > include/asm-generic/mshyperv.h | 12 +
> > 8 files changed, 815 insertions(+), 1 deletion(-)
> > create mode 100644 drivers/iommu/hyperv/iommu.c
> > create mode 100644 drivers/iommu/hyperv/iommu.h
> >
> > diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> > index 323adc93f2dc..2c8ff8e06249 100644
> > --- a/arch/x86/hyperv/hv_init.c
> > +++ b/arch/x86/hyperv/hv_init.c
> > @@ -578,6 +578,10 @@ void __init hyperv_init(void)
> > old_setup_percpu_clockev = x86_init.timers.setup_percpu_clockev;
> > x86_init.timers.setup_percpu_clockev = hv_stimer_setup_percpu_clockev;
> >
> > +#ifdef CONFIG_HYPERV_PVIOMMU
> > + x86_init.iommu.iommu_init = hv_iommu_init;
> > +#endif
> > +
> > hv_apic_init();
> >
> > x86_init.pci.arch_init = hv_pci_init;
> > diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
> > index f64393e853ee..20d947c2c758 100644
> > --- a/arch/x86/include/asm/mshyperv.h
> > +++ b/arch/x86/include/asm/mshyperv.h
> > @@ -313,6 +313,10 @@ static inline void mshv_vtl_return_hypercall(void) {}
> > static inline void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {}
> > #endif
> >
> > +#ifdef CONFIG_HYPERV_PVIOMMU
> > +int __init hv_iommu_init(void);
> > +#endif
> > +
> > #include <asm-generic/mshyperv.h>
> >
> > #endif
> > diff --git a/drivers/iommu/hyperv/Kconfig b/drivers/iommu/hyperv/Kconfig
> > index 30f40d867036..9e658d5c9a77 100644
> > --- a/drivers/iommu/hyperv/Kconfig
> > +++ b/drivers/iommu/hyperv/Kconfig
> > @@ -8,3 +8,20 @@ config HYPERV_IOMMU
> > help
> > Stub IOMMU driver to handle IRQs to support Hyper-V Linux
> > guest and root partitions.
> > +
> > +if HYPERV_IOMMU
> > +config HYPERV_PVIOMMU
> > + bool "Microsoft Hypervisor para-virtualized IOMMU support"
> > + depends on X86 && HYPERV
>
> What is the intent w.r.t. 32-bit builds? Using X86 instead of X86_64
> allows it. I did a 32-bit build and didn't get any build failures, which is
> good. But I can't run it to see if the pvIOMMU actually works in a
> 32-bit build. I don't know how building X86_64 generic PT entries
> would fare.
>
Sorry, no intent to support 32-bit. I'll change to `depends on X86_64 && HYPERV`
[...]
> > +static void hv_flush_device_domain(struct hv_iommu_domain *hv_domain)
> > +{
> > + u64 status;
> > + unsigned long flags;
> > + struct hv_input_flush_device_domain *input;
> > +
> > + local_irq_save(flags);
> > +
> > + input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > + memset(input, 0, sizeof(*input));
> > + input->device_domain = hv_domain->device_domain;
>
> The previous version of this patch had code to set several other fields in
> the input. I wanted to confirm that not setting them in this version is
> intentional. Were they not needed?
>
Oh. The RFC v1 set partition_id, owner_vtl, domain_id.type, and domain_id.id
individually. In this version, I just simplified it to a struct assignment.
No functional change.
> > + status = hv_do_hypercall(HVCALL_FLUSH_DEVICE_DOMAIN, input, NULL);
> > +
> > + local_irq_restore(flags);
> > +
> > + if (!hv_result_success(status))
> > + pr_err("HVCALL_FLUSH_DEVICE_DOMAIN failed, status %lld\n", status);
> > +}
> > +
> > +static void hv_iommu_detach_dev(struct iommu_domain *domain, struct device *dev)
> > +{
> > + u64 status;
> > + unsigned long flags;
> > + struct hv_input_detach_device_domain *input;
> > + struct pci_dev *pdev;
> > + struct hv_iommu_domain *hv_domain = to_hv_iommu_domain(domain);
> > + struct hv_iommu_endpoint *vdev = dev_iommu_priv_get(dev);
> > +
> > + /* See the attach function, only PCI devices for now */
> > + if (!dev_is_pci(dev) || vdev->hv_domain != hv_domain)
> > + return;
>
> Are these sanity checks necessary? The only caller is hv_iommu_attach_dev()
> and it has already done the checks.
You're right, they're redundant.
> > +
> > + pdev = to_pci_dev(dev);
> > +
> > + dev_dbg(dev, "detaching from domain %d\n", hv_domain->device_domain.domain_id.id);
> > +
> > + local_irq_save(flags);
> > +
> > + input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > + memset(input, 0, sizeof(*input));
> > + input->partition_id = HV_PARTITION_ID_SELF;
> > + if (hv_iommu_lookup_logical_dev_id(pdev, &input->device_id.as_uint64)) {
>
> As Sashiko and Jacob Pan pointed out, doing the lookup while interrupts are disabled
> is problematic. My suggestion would be to just do the lookup into a local variable
> before disabling interrupts (rather than using a raw spin lock as Jacob suggested).
>
> Same situation occurs in hv_iommu_attach_dev() and
> hv_iommu_get_logical_device_property().
>
Thanks! I would also prefer to look up before disabling interrupts.
> > + local_irq_restore(flags);
> > + dev_warn(&pdev->dev, "no IOMMU registration for vPCI bus on detach\n");
> > + return;
> > + }
> > + status = hv_do_hypercall(HVCALL_DETACH_DEVICE_DOMAIN, input, NULL);
> > +
> > + local_irq_restore(flags);
> > +
> > + if (!hv_result_success(status))
> > + pr_err("HVCALL_DETACH_DEVICE_DOMAIN failed, status %lld\n", status);
> > +
> > + hv_flush_device_domain(hv_domain);
> > +
> > + vdev->hv_domain = NULL;
> > +}
> > +
[...]
> > +static int hv_iommu_get_logical_device_property(struct device *dev,
> > + u32 code,
> > + struct hv_output_get_logical_device_property *property)
> > +{
> > + u64 status, lid;
> > + unsigned long flags;
> > + int ret;
> > + struct hv_input_get_logical_device_property *input;
> > + struct hv_output_get_logical_device_property *output;
> > +
> > + local_irq_save(flags);
> > +
> > + input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > + output = *this_cpu_ptr(hyperv_pcpu_input_arg) + sizeof(*input);
>
> Nit: The other way to set output is:
>
> output = input + 1;
>
> I think this produces slightly better code because of not needing to
> reference the per-cpu variable hyperv_pcpu_input_arg a 2nd time.
>
>
Indeed! It's more elegant. :)
> > + memset(input, 0, sizeof(*input));
> > + input->partition_id = HV_PARTITION_ID_SELF;
> > + ret = hv_iommu_lookup_logical_dev_id(to_pci_dev(dev), &lid);
> > + if (ret) {
> > + local_irq_restore(flags);
> > + return ret;
> > + }
> > + input->logical_device_id = lid;
> > + input->code = code;
> > + status = hv_do_hypercall(HVCALL_GET_LOGICAL_DEVICE_PROPERTY, input, output);
> > + *property = *output;
> > +
> > + local_irq_restore(flags);
> > +
> > + if (!hv_result_success(status))
> > + pr_err("HVCALL_GET_LOGICAL_DEVICE_PROPERTY failed, status %lld\n", status);
> > +
> > + return hv_result_to_errno(status);
> > +}
> > +
[...]
> > +static void hv_iommu_release_device(struct device *dev)
> > +{
> > + struct hv_iommu_endpoint *vdev = dev_iommu_priv_get(dev);
> > + struct pci_dev *pdev = to_pci_dev(dev);
> > +
> > + if (pdev->ats_enabled)
> > + pci_disable_ats(pdev);
> > +
> > + dev_iommu_priv_set(dev, NULL);
> > + set_dma_ops(dev, NULL);
>
> Previous versions of this function did hv_iommu_detach_dev(). With that call
> removed from here, hv_iommu_detach_dev() is only called when attaching a
> domain to a device that already has a domain attached. Is it the case that
> Hyper-V doesn't require the detach as a cleanup step?
>
The IOMMU core attaches the device to release_domain (our blocking domain)
before calling release_device(), so I believe the explicit detach in the RFC
was redundant. I simply didn't realize that at the time.
Sorry I forgot to mention this in the changelog.
[...]
> > +static struct iommu_domain *hv_iommu_domain_alloc_paging(struct device *dev)
> > +{
> > + int ret;
> > + struct hv_iommu_domain *hv_domain;
> > + struct pt_iommu_x86_64_cfg cfg = {};
> > +
> > + hv_domain = kzalloc_obj(*hv_domain, GFP_KERNEL);
> > + if (!hv_domain)
> > + return ERR_PTR(-ENOMEM);
> > +
> > + ret = hv_create_device_domain(hv_domain, HV_DEVICE_DOMAIN_TYPE_S1);
> > + if (ret) {
> > + kfree(hv_domain);
> > + return ERR_PTR(ret);
> > + }
> > +
> > + hv_domain->domain.geometry = hv_iommu_device->geometry;
> > + hv_domain->pt_iommu.nid = dev_to_node(dev);
> > +
> > + cfg.common.hw_max_vasz_lg2 = hv_iommu_device->max_iova_width;
> > + cfg.common.hw_max_oasz_lg2 = 52;
> > + cfg.top_level = (hv_iommu_device->max_iova_width > 48) ? 4 : 3;
> > +
> > + ret = pt_iommu_x86_64_init(&hv_domain->pt_iommu_x86_64, &cfg, GFP_KERNEL);
> > + if (ret) {
> > + hv_delete_device_domain(hv_domain);
> > + kfree(hv_domain);
> > + return ERR_PTR(ret);
> > + }
> > +
> > + /* Constrain to page sizes the hypervisor supports */
> > + hv_domain->domain.pgsize_bitmap &= hv_iommu_device->pgsize_bitmap;
> > +
> > + hv_domain->domain.ops = &hv_iommu_paging_domain_ops;
> > +
> > + ret = hv_configure_device_domain(hv_domain, __IOMMU_DOMAIN_PAGING);
> > + if (ret) {
> > + pt_iommu_deinit(&hv_domain->pt_iommu);
> > + hv_delete_device_domain(hv_domain);
> > + kfree(hv_domain);
> > + return ERR_PTR(ret);
> > + }
> > +
> > + return &hv_domain->domain;
>
> I think this function would be better if the error paths did "goto"
> a cascading set of error labels. That's the typical pattern, and it's what you
> use in hv_iommu_init(), for example.
>
Good point. Will restructure to use goto-based error labels
> > +}
> > +
> > +static struct iommu_ops hv_iommu_ops = {
> > + .capable = hv_iommu_capable,
> > + .domain_alloc_paging = hv_iommu_domain_alloc_paging,
> > + .probe_device = hv_iommu_probe_device,
> > + .release_device = hv_iommu_release_device,
> > + .device_group = hv_iommu_device_group,
> > + .get_resv_regions = hv_iommu_get_resv_regions,
> > + .owner = THIS_MODULE,
> > + .identity_domain = &hv_identity_domain.domain,
> > + .blocked_domain = &hv_blocking_domain.domain,
> > + .release_domain = &hv_blocking_domain.domain,
> > +};
> > +
> > +static int hv_iommu_detect(struct hv_output_get_iommu_capabilities *hv_iommu_cap)
> > +{
> > + u64 status;
> > + unsigned long flags;
> > + struct hv_input_get_iommu_capabilities *input;
> > + struct hv_output_get_iommu_capabilities *output;
> > +
> > + local_irq_save(flags);
> > +
> > + input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > + output = *this_cpu_ptr(hyperv_pcpu_input_arg) + sizeof(*input);
>
> Potentially use "output = input + 1" here as well.
>
Yes. Thanks!
[...]
> > @@ -3857,13 +3858,25 @@ static int hv_pci_probe(struct hv_device *hdev,
> >
> > hbus->state = hv_pcibus_probed;
> >
> > - ret = create_root_hv_pci_bus(hbus);
> > + /* Notify pvIOMMU before any device on the bus is scanned. */
> > + prefix = (hdev->dev_instance.b[5] << 24) |
> > + (hdev->dev_instance.b[4] << 16) |
> > + (hdev->dev_instance.b[7] << 8) |
> > + (hdev->dev_instance.b[6] & 0xf8);
>
> This assembling of the logical device id prefix duplicates the
> code in hv_irq_retarget_interrupt(). Could this code save the
> prefix in struct hv_pcibus_device, and then have
> hv_irq_retarget_interrupt() use it? Then it would be clear
> that HVCALL_RETARGET_INTERRUPT is using exactly the same
> logical device id as the IOMMU hypercalls.
>
Good point. I think we can do it. :)
> > +
> > + ret = hv_iommu_register_pci_bus(dom, prefix);
> > if (ret)
> > goto free_windows;
> >
> > + ret = create_root_hv_pci_bus(hbus);
> > + if (ret)
> > + goto unregister_pviommu;
> > +
> > mutex_unlock(&hbus->state_lock);
> > return 0;
> >
> > +unregister_pviommu:
> > + hv_iommu_unregister_pci_bus(dom);
> > free_windows:
> > hv_pci_free_bridge_windows(hbus);
> > exit_d0:
> > @@ -3974,8 +3987,10 @@ static int hv_pci_bus_exit(struct hv_device *hdev, bool
> > keep_devs)
> > static void hv_pci_remove(struct hv_device *hdev)
> > {
> > struct hv_pcibus_device *hbus;
> > + int dom;
> >
> > hbus = hv_get_drvdata(hdev);
> > + dom = hbus->bridge->domain_nr;
>
> Nit: Setting "dom" here feels a little weird because the value is only needed
> under the "if" statement. The value must be read before the root bus is
> removed, but even so moving it under the "if" statement would make more
> sense to me.
>
Sure. Thanks!
> > if (hbus->state == hv_pcibus_installed) {
> > tasklet_disable(&hdev->channel->callback_event);
> > hbus->state = hv_pcibus_removing;
> > @@ -3994,6 +4009,8 @@ static void hv_pci_remove(struct hv_device *hdev)
> > hv_pci_remove_slots(hbus);
> > pci_remove_root_bus(hbus->bridge->bus);
> > pci_unlock_rescan_remove();
> > +
> > + hv_iommu_unregister_pci_bus(dom);
> > }
> >
> > hv_pci_bus_exit(hdev, false);
B.R.
Yu
^ permalink raw reply
* Re: [PATCH V3 01/11] iommu/hyperv: Rename hyperv-iommu.c to hyperv-irq.c
From: Yu Zhang @ 2026-05-15 13:58 UTC (permalink / raw)
To: Michael Kelley
Cc: Jacob Pan, Mukesh R, hpa@zytor.com, robin.murphy@arm.com,
robh@kernel.org, wei.liu@kernel.org, muislam@microsoft.com,
namjain@linux.microsoft.com, magnuskulke@linux.microsoft.com,
anbelski@linux.microsoft.com, linux-kernel@vger.kernel.org,
linux-hyperv@vger.kernel.org, iommu@lists.linux.dev,
linux-pci@vger.kernel.org, linux-arch@vger.kernel.org,
kys@microsoft.com, haiyangz@microsoft.com, decui@microsoft.com,
longli@microsoft.com, tglx@kernel.org, mingo@redhat.com,
bp@alien8.de, dave.hansen@linux.intel.com, x86@kernel.org,
joro@8bytes.org, will@kernel.org, lpieralisi@kernel.org,
kwilczynski@kernel.org, bhelgaas@google.com, arnd@arndb.de
In-Reply-To: <SN6PR02MB4157371A1A3931F582F9A2E9D4062@SN6PR02MB4157.namprd02.prod.outlook.com>
On Wed, May 13, 2026 at 03:15:52AM +0000, Michael Kelley wrote:
> From: Jacob Pan <jacob.pan@linux.microsoft.com> Sent: Tuesday, May 12, 2026 4:46 PM
> >
> > Hi Mukesh,
> >
> > On Mon, 11 May 2026 19:02:49 -0700
> > Mukesh R <mrathor@linux.microsoft.com> wrote:
> >
> > > This file actually implements irq remapping, so rename to more
> > > appropriate hyperv-irq.c. A new file to implement hyperv iommu will
> > > be introduced later. Also, it should not be tied to HYPERV_IOMMU,
> > > but to CONFIG_HYPERV and IRQ_REMAP. The file already has #ifdef
> > > CONFIG_IRQ_REMAP.
> > >
> > > Reviewed-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> > > Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
> > > ---
> > > MAINTAINERS | 2 +-
> > > drivers/iommu/Makefile | 2 +-
> > > drivers/iommu/{hyperv-iommu.c => hyperv-irq.c} | 6 +++---
> >
> > Given that we have multiple Hyper-V IOMMU-related files — this renamed
> > hyperv-irq.c, the existing hyperv-iommu code, iommu-root (this
> > series) and the recently posted guest pvIOMMU driver — should we create
> > a drivers/iommu/hyperv/ directory to consolidate them?
>
> Patch 1/4 in the guest pvIOMMU driver [1] that was recently posted by
> Yu Zhang does as you suggest.
>
> Michael
>
> [1] https://lore.kernel.org/linux-hyperv/20260511162408.1180069-1-zhangyu1@linux.microsoft.com/
>
Maybe we can send a standalone patch and get it merged
first (move it to drivers/iommu/hyperv/irq_remapping.c)?
The rename itself is a meaningful cleanup regardless of
either series, and in the future, both Mukesh and I can
can then build on top of it without conflicts. :)
B.R.
Yu
^ permalink raw reply
* Re: [PATCH v1 3/4] iommu/hyperv: Add para-virtualized IOMMU support for Hyper-V guest
From: Yu Zhang @ 2026-05-15 12:38 UTC (permalink / raw)
To: Jacob Pan
Cc: linux-kernel, linux-hyperv, iommu, linux-pci, linux-arch, wei.liu,
kys, haiyangz, decui, longli, joro, will, robin.murphy, bhelgaas,
kwilczynski, lpieralisi, mani, robh, arnd, jgg, mhklinux,
tgopinath, easwar.hariharan
In-Reply-To: <20260513113952.00005b20@linux.microsoft.com>
[...]
> > diff --git a/drivers/iommu/hyperv/Kconfig
> > b/drivers/iommu/hyperv/Kconfig index 30f40d867036..9e658d5c9a77 100644
> > --- a/drivers/iommu/hyperv/Kconfig
> > +++ b/drivers/iommu/hyperv/Kconfig
> > @@ -8,3 +8,20 @@ config HYPERV_IOMMU
> > help
> > Stub IOMMU driver to handle IRQs to support Hyper-V Linux
> > guest and root partitions.
> > +
> > +if HYPERV_IOMMU
> > +config HYPERV_PVIOMMU
> > + bool "Microsoft Hypervisor para-virtualized IOMMU support"
> > + depends on X86 && HYPERV
> > + select IOMMU_API
> > + select GENERIC_PT
> > + select IOMMU_PT
> > + select IOMMU_PT_X86_64
> nit:
> If HYPERV_PVIOMMU is enabled on a (hypothetical) platform with
> GENERIC_ATOMIC64=y, the select would force-enable IOMMU_PT_X86_64 even
> though its depends on is unsatisfied — leading to a build failure.
>
> In practice this can't happen today because HYPERV_PVIOMMU already
> depends on X86 && HYPERV, and x86 never sets GENERIC_ATOMIC64. But
> adding the explicit guard is more defensive.
> i.e.
> depends on !GENERIC_ATOMIC64 # for cmpxchg64 in IOMMU_PT
>
Good point. Will add "depends on !GENERIC_ATOMIC64".
[...]
> > +
> > +/*
> > + * Look up the logical device ID for a vPCI device. Returns 0 on
> > success
> > + * with *logical_id filled in; -ENODEV if no entry registered for
> > this
> > + * device's vPCI bus.
> > + */
> > +static int hv_iommu_lookup_logical_dev_id(struct pci_dev *pdev, u64
> > *logical_id) +{
> > + struct hv_pci_busdata *bus;
> > + int domain = pci_domain_nr(pdev->bus);
> > + int ret = -ENODEV;
> > +
> > + spin_lock(&hv_iommu_pci_bus_lock);
> this is called under local_irq_save, should use raw_spinlock_t for RT
> kernel?
>
Yes, this is problematic on PREEMPT_RT. Michael also suggested hoisting
the lookup before the local_irq_save() section instead of using a raw
spinlock, which I think is a great idea - all 3 call sites (detach_dev,
attach_dev, get_logical_device_property) will be changed.
> > + list_for_each_entry(bus, &hv_iommu_pci_bus_list, list) {
> > + if (bus->pci_domain_nr == domain) {
> > + *logical_id =
> > (u64)bus->logical_dev_id_prefix |
> > + PCI_FUNC(pdev->devfn);
> > + ret = 0;
> > + break;
> > + }
> > + }
> > + spin_unlock(&hv_iommu_pci_bus_lock);
> > + return ret;
> > +}
> > +
[...]
> > +static void hv_iommu_release_device(struct device *dev)
> > +{
> > + struct hv_iommu_endpoint *vdev = dev_iommu_priv_get(dev);
> > + struct pci_dev *pdev = to_pci_dev(dev);
> > +
> > + if (pdev->ats_enabled)
> > + pci_disable_ats(pdev);
> > +
> > + dev_iommu_priv_set(dev, NULL);
> > + set_dma_ops(dev, NULL);
> I don't think this is necessary.
>
Oh, yes. Thanks!
> > +
> > + kfree(vdev);
> > +}
> > +
> > +static struct iommu_group *hv_iommu_device_group(struct device *dev)
> > +{
> > + if (dev_is_pci(dev))
> > + return pci_device_group(dev);
> > + else
> > + return generic_device_group(dev);
> non pci device already rejected during attach, maybe we should warn
> here?
> WARN_ON_ONCE(1);
> return generic_device_group(dev);
>
Good idea. Will add WARN_ON_ONCE(1).
> > +}
> > +
> > +static int hv_configure_device_domain(struct hv_iommu_domain
> > *hv_domain, u32 domain_type) +{
> > + u64 status;
> > + unsigned long flags;
> > + struct pt_iommu_x86_64_hw_info pt_info;
> > + struct hv_input_configure_device_domain *input;
> > +
> > + local_irq_save(flags);
> > +
> > + input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > + memset(input, 0, sizeof(*input));
> > + input->device_domain = hv_domain->device_domain;
> > + input->settings.flags.blocked = (domain_type ==
> > IOMMU_DOMAIN_BLOCKED);
> > + input->settings.flags.translation_enabled = (domain_type !=
> > IOMMU_DOMAIN_IDENTITY); +
> Should this be:
> input->settings.flags.translation_enabled =
> (domain_type & __IOMMU_DOMAIN_PAGING);
> Otherwise, blocked domain will have translation enabled. Maybe add some
> explanation of what HV expects.
>
I do agree this is not intuitive, but current hypervisor implementation
requires "blocked == 1" to be paired with "translation_enabled = 1",
otherwise it returns HV_STATUS_INVALID_PARAMETER. But I can add some
comment at least.
Thanks for the thorough review, Jacob!
B.R.
Yu
^ permalink raw reply
* [PATCH 8/9] drm/virtgpu: Set DRM_VBLANK_FLAG_SIMULATED
From: Thomas Zimmermann @ 2026-05-15 11:55 UTC (permalink / raw)
To: simona, airlied, mdaenzer, pekka.paalanen, jadahl, contact,
maarten.lankhorst, mripard
Cc: amd-gfx, dri-devel, linux-hyperv, virtualization, spice-devel,
Thomas Zimmermann
In-Reply-To: <20260515120916.333614-1-tzimmermann@suse.de>
Mark the vblank event on virtgpu as simulated, so that the WAIT_VBLANK
ioctl fails with an error. The ioctl should not be supported because
the output is not synchronized to a display refresh.
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
---
drivers/gpu/drm/virtio/virtgpu_display.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/virtio/virtgpu_display.c b/drivers/gpu/drm/virtio/virtgpu_display.c
index 44ffffec550f..558d8001c54f 100644
--- a/drivers/gpu/drm/virtio/virtgpu_display.c
+++ b/drivers/gpu/drm/virtio/virtgpu_display.c
@@ -381,7 +381,7 @@ int virtio_gpu_modeset_init(struct virtio_gpu_device *vgdev)
for (i = 0 ; i < vgdev->num_scanouts; ++i)
vgdev_output_init(vgdev, i);
- ret = drm_vblank_init(vgdev->ddev, vgdev->num_scanouts);
+ ret = drmm_vblank_init(vgdev->ddev, vgdev->num_scanouts, DRM_VBLANK_FLAG_SIMULATED);
if (ret)
return ret;
--
2.54.0
^ permalink raw reply related
* [PATCH 7/9] drm/qxl: Set DRM_VBLANK_FLAG_SIMULATED
From: Thomas Zimmermann @ 2026-05-15 11:55 UTC (permalink / raw)
To: simona, airlied, mdaenzer, pekka.paalanen, jadahl, contact,
maarten.lankhorst, mripard
Cc: amd-gfx, dri-devel, linux-hyperv, virtualization, spice-devel,
Thomas Zimmermann
In-Reply-To: <20260515120916.333614-1-tzimmermann@suse.de>
Mark the vblank event on qxl as simulated, so that the WAIT_VBLANK
ioctl fails with an error. The ioctl should not be supported because
the output is not synchronized to a display refresh.
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
---
drivers/gpu/drm/qxl/qxl_display.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/qxl/qxl_display.c b/drivers/gpu/drm/qxl/qxl_display.c
index a026bd35ef48..b808fdebbd89 100644
--- a/drivers/gpu/drm/qxl/qxl_display.c
+++ b/drivers/gpu/drm/qxl/qxl_display.c
@@ -1300,7 +1300,7 @@ int qxl_modeset_init(struct qxl_device *qdev)
qxl_display_read_client_monitors_config(qdev);
- ret = drm_vblank_init(&qdev->ddev, qxl_num_crtc);
+ ret = drmm_vblank_init(&qdev->ddev, qxl_num_crtc, DRM_VBLANK_FLAG_SIMULATED);
if (ret)
return ret;
--
2.54.0
^ permalink raw reply related
* [PATCH 9/9] drm/vkms: Set DRM_VBLANK_FLAG_SIMULATED
From: Thomas Zimmermann @ 2026-05-15 11:55 UTC (permalink / raw)
To: simona, airlied, mdaenzer, pekka.paalanen, jadahl, contact,
maarten.lankhorst, mripard
Cc: amd-gfx, dri-devel, linux-hyperv, virtualization, spice-devel,
Thomas Zimmermann
In-Reply-To: <20260515120916.333614-1-tzimmermann@suse.de>
Mark the vblank event on vkms as simulated, so that the WAIT_VBLANK
ioctl fails with an error. The ioctl should not be supported because
the output is not synchronized to a display refresh.
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
---
drivers/gpu/drm/vkms/vkms_drv.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/vkms/vkms_drv.c b/drivers/gpu/drm/vkms/vkms_drv.c
index 5a640b531d88..c4cfa1e5ab01 100644
--- a/drivers/gpu/drm/vkms/vkms_drv.c
+++ b/drivers/gpu/drm/vkms/vkms_drv.c
@@ -192,8 +192,8 @@ int vkms_create(struct vkms_config *config)
goto out_devres;
}
- ret = drm_vblank_init(&vkms_device->drm,
- vkms_config_get_num_crtcs(config));
+ ret = drmm_vblank_init(&vkms_device->drm, vkms_config_get_num_crtcs(config),
+ DRM_VBLANK_FLAG_SIMULATED);
if (ret) {
DRM_ERROR("Failed to vblank\n");
goto out_devres;
--
2.54.0
^ permalink raw reply related
* [PATCH 5/9] drm/cirrus: Set DRM_VBLANK_FLAG_SIMULATED
From: Thomas Zimmermann @ 2026-05-15 11:55 UTC (permalink / raw)
To: simona, airlied, mdaenzer, pekka.paalanen, jadahl, contact,
maarten.lankhorst, mripard
Cc: amd-gfx, dri-devel, linux-hyperv, virtualization, spice-devel,
Thomas Zimmermann
In-Reply-To: <20260515120916.333614-1-tzimmermann@suse.de>
Mark the vblank event on cirrus as simulated, so that the WAIT_VBLANK
ioctl fails with an error. The ioctl should not be supported because
the output is not synchronized to a display refresh.
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
---
drivers/gpu/drm/tiny/cirrus-qemu.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/tiny/cirrus-qemu.c b/drivers/gpu/drm/tiny/cirrus-qemu.c
index 075221b431d3..01522d1158b2 100644
--- a/drivers/gpu/drm/tiny/cirrus-qemu.c
+++ b/drivers/gpu/drm/tiny/cirrus-qemu.c
@@ -501,7 +501,7 @@ static int cirrus_pipe_init(struct cirrus_device *cirrus)
if (ret)
return ret;
- ret = drm_vblank_init(dev, 1);
+ ret = drmm_vblank_init(dev, 1, DRM_VBLANK_FLAG_SIMULATED);
if (ret)
return ret;
--
2.54.0
^ permalink raw reply related
* [PATCH 6/9] drm/hypervdrm: Set DRM_VBLANK_FLAG_SIMULATED
From: Thomas Zimmermann @ 2026-05-15 11:55 UTC (permalink / raw)
To: simona, airlied, mdaenzer, pekka.paalanen, jadahl, contact,
maarten.lankhorst, mripard
Cc: amd-gfx, dri-devel, linux-hyperv, virtualization, spice-devel,
Thomas Zimmermann
In-Reply-To: <20260515120916.333614-1-tzimmermann@suse.de>
Mark the vblank event on hypervdrm as simulated, so that the WAIT_VBLANK
ioctl fails with an error. The ioctl should not be supported because the
output is not synchronized to a display refresh.
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
---
drivers/gpu/drm/hyperv/hyperv_drm_modeset.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/hyperv/hyperv_drm_modeset.c b/drivers/gpu/drm/hyperv/hyperv_drm_modeset.c
index 1bbb7de5ab49..24bed31c35e7 100644
--- a/drivers/gpu/drm/hyperv/hyperv_drm_modeset.c
+++ b/drivers/gpu/drm/hyperv/hyperv_drm_modeset.c
@@ -329,7 +329,7 @@ int hyperv_mode_config_init(struct hyperv_drm_device *hv)
return ret;
}
- ret = drm_vblank_init(dev, 1);
+ ret = drmm_vblank_init(dev, 1, DRM_VBLANK_FLAG_SIMULATED);
if (ret)
return ret;
--
2.54.0
^ permalink raw reply related
* [PATCH 4/9] drm/bochs: Set DRM_VBLANK_FLAG_SIMULATED
From: Thomas Zimmermann @ 2026-05-15 11:55 UTC (permalink / raw)
To: simona, airlied, mdaenzer, pekka.paalanen, jadahl, contact,
maarten.lankhorst, mripard
Cc: amd-gfx, dri-devel, linux-hyperv, virtualization, spice-devel,
Thomas Zimmermann
In-Reply-To: <20260515120916.333614-1-tzimmermann@suse.de>
Mark the vblank event on bochs as simulated, so that the WAIT_VBLANK
ioctl fails with an error. The ioctl should not be supported because
the output is not synchronized to a display refresh.
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
---
drivers/gpu/drm/tiny/bochs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/tiny/bochs.c b/drivers/gpu/drm/tiny/bochs.c
index e2d957e51505..b5955ef39e31 100644
--- a/drivers/gpu/drm/tiny/bochs.c
+++ b/drivers/gpu/drm/tiny/bochs.c
@@ -677,7 +677,7 @@ static int bochs_kms_init(struct bochs_device *bochs)
drm_connector_attach_edid_property(connector);
drm_connector_attach_encoder(connector, encoder);
- ret = drm_vblank_init(dev, 1);
+ ret = drmm_vblank_init(dev, 1, DRM_VBLANK_FLAG_SIMULATED);
if (ret)
return ret;
--
2.54.0
^ permalink raw reply related
* [PATCH 3/9] drm/amdgpu: vkms: Set DRM_VBLANK_FLAG_SIMULATED
From: Thomas Zimmermann @ 2026-05-15 11:55 UTC (permalink / raw)
To: simona, airlied, mdaenzer, pekka.paalanen, jadahl, contact,
maarten.lankhorst, mripard
Cc: amd-gfx, dri-devel, linux-hyperv, virtualization, spice-devel,
Thomas Zimmermann
In-Reply-To: <20260515120916.333614-1-tzimmermann@suse.de>
Mark the vblank event on amdgpu's vkms as simulated, so that the
WAIT_VBLANK ioctl fails with an error. The ioctl should not be
supported because the output is not synchronized to a display refresh.
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
index 170adaf7e76a..bc88acc819a6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
@@ -413,7 +413,8 @@ static int amdgpu_vkms_sw_init(struct amdgpu_ip_block *ip_block)
return r;
}
- r = drm_vblank_init(adev_to_drm(adev), adev->mode_info.num_crtc);
+ r = drmm_vblank_init(adev_to_drm(adev), adev->mode_info.num_crtc,
+ DRM_VBLANK_FLAG_SIMULATED);
if (r)
return r;
--
2.54.0
^ permalink raw reply related
* [PATCH 2/9] drm/vblank: Add DRM_VBLANK_FLAG_SIMULATED
From: Thomas Zimmermann @ 2026-05-15 11:55 UTC (permalink / raw)
To: simona, airlied, mdaenzer, pekka.paalanen, jadahl, contact,
maarten.lankhorst, mripard
Cc: amd-gfx, dri-devel, linux-hyperv, virtualization, spice-devel,
Thomas Zimmermann
In-Reply-To: <20260515120916.333614-1-tzimmermann@suse.de>
Add DRM_VBLANK_FLAG_SIMULATED for CRTCs that do not have a hardware
vblank interrupt. Setting the flag tells DRM to not report vblank
capabilities from the WAIT_VBLANK ioctl.
DRM_IOCTL_WAIT_VBLANK queries timestamps from a vblank event or waits
for the next vblank event to occur. DRM clients use this functionality
to synchronize their output with the display's vblank phase. Hence this
is only supported for hardware implementations.
Software implementations are not synchronized to the display and merely
act as a rate limiter for page-flip events. The WAIT_VBLANK ioctl thus
should fail with an error.
Suggested-by: Simona Vetter <simona@ffwll.ch>
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
---
drivers/gpu/drm/drm_vblank.c | 3 +++
include/drm/drm_vblank.h | 5 +++++
2 files changed, 8 insertions(+)
diff --git a/drivers/gpu/drm/drm_vblank.c b/drivers/gpu/drm/drm_vblank.c
index 21ca91b4c014..92b699a4e8be 100644
--- a/drivers/gpu/drm/drm_vblank.c
+++ b/drivers/gpu/drm/drm_vblank.c
@@ -1794,6 +1794,9 @@ int drm_wait_vblank_ioctl(struct drm_device *dev, void *data,
vblank = drm_vblank_crtc(dev, pipe);
+ if (vblank->flags & DRM_VBLANK_FLAG_SIMULATED)
+ return -EOPNOTSUPP;
+
/* If the counter is currently enabled and accurate, short-circuit
* queries to return the cached timestamp of the last vblank.
*/
diff --git a/include/drm/drm_vblank.h b/include/drm/drm_vblank.h
index 39a201b83781..03fa7259b6ac 100644
--- a/include/drm/drm_vblank.h
+++ b/include/drm/drm_vblank.h
@@ -37,6 +37,11 @@ struct drm_device;
struct drm_crtc;
struct drm_vblank_work;
+/**
+ * DRM_VBLANK_FLAG_SIMULATED - vblank uses a software timer
+ */
+#define DRM_VBLANK_FLAG_SIMULATED BIT(1)
+
/**
* struct drm_pending_vblank_event - pending vblank event tracking
*/
--
2.54.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox