From: "Xin Li (Intel)" <xin@zytor.com>
To: linux-kernel@vger.kernel.org, kvm@vger.kernel.org,
linux-pm@vger.kernel.org
Cc: seanjc@google.com, pbonzini@redhat.com, tglx@linutronix.de,
mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com,
x86@kernel.org, hpa@zytor.com, rafael@kernel.org,
pavel@kernel.org, brgerst@gmail.com, xin@zytor.com,
david.kaplan@amd.com, peterz@infradead.org,
andrew.cooper3@citrix.com, kprateek.nayak@amd.com,
arjan@linux.intel.com, chao.gao@intel.com,
rick.p.edgecombe@intel.com, dan.j.williams@intel.com
Subject: [RFC PATCH v1 1/5] x86/boot: Shift VMXON from KVM init to CPU startup phase
Date: Tue, 9 Sep 2025 11:28:21 -0700 [thread overview]
Message-ID: <20250909182828.1542362-2-xin@zytor.com> (raw)
In-Reply-To: <20250909182828.1542362-1-xin@zytor.com>
Move the VMXON setup from the KVM initialization path to the CPU startup
phase to guarantee that hardware virtualization is enabled early and
without interruption.
As a result, KVM, often loaded as a kernel module, no longer needs to worry
about whether or not VMXON has been executed on a CPU (e.g., CPU offline
events or system reboots while KVM is loading).
Signed-off-by: Xin Li (Intel) <xin@zytor.com>
---
arch/x86/include/asm/processor.h | 1 +
arch/x86/include/asm/vmx.h | 5 ++
arch/x86/kernel/cpu/common.c | 91 ++++++++++++++++++++++++
arch/x86/kvm/vmx/vmcs.h | 5 --
arch/x86/kvm/vmx/vmx.c | 117 ++-----------------------------
arch/x86/power/cpu.c | 7 +-
6 files changed, 107 insertions(+), 119 deletions(-)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index bde58f6510ac..59660428f46d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -230,6 +230,7 @@ void init_cpu_devs(void);
void get_cpu_vendor(struct cpuinfo_x86 *c);
extern void early_cpu_init(void);
extern void identify_secondary_cpu(unsigned int cpu);
+extern void cpu_enable_virtualization(void);
extern void print_cpu_info(struct cpuinfo_x86 *);
void print_cpu_msr(struct cpuinfo_x86 *);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index cca7d6641287..736d04c1b2fc 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -20,6 +20,11 @@
#include <asm/trapnr.h>
#include <asm/vmxfeatures.h>
+struct vmcs_hdr {
+ u32 revision_id:31; /* 31-bit revision id; cpu_enable_virtualization() fills it from vmx_basic_vmcs_revision_id() */
+ u32 shadow_vmcs:1; /* 1-bit flag; presumably marks a shadow VMCS - TODO confirm against the SDM */
+};
+
#define VMCS_CONTROL_BIT(x) BIT(VMX_FEATURE_##x & 0x1f)
/*
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 34a054181c4d..e36877b5a240 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -72,6 +72,7 @@
#include <asm/tdx.h>
#include <asm/posted_intr.h>
#include <asm/runtime-const.h>
+#include <asm/vmx.h>
#include "cpu.h"
@@ -1923,6 +1924,84 @@ static void generic_identify(struct cpuinfo_x86 *c)
#endif
}
+/*
+ * Report whether VMX can be used on the CPU executing this function.
+ *
+ * Two checks are made, in order: the VMX bit in CPUID leaf 1 ECX, then the
+ * kernel's per-CPU feature flags for MSR_IA32_FEAT_CTL and VMX.  Each failure
+ * logs a message naming the current CPU.
+ *
+ * Return: true if both checks pass, false otherwise.
+ */
+static bool is_vmx_supported(void)
+{
+	int this_cpu = raw_smp_processor_id();
+	u32 leaf1_ecx = cpuid_ecx(1);
+
+	/* May not be an Intel CPU */
+	if (!(leaf1_ecx & (1 << (X86_FEATURE_VMX & 31)))) {
+		pr_info("VMX not supported by CPU%d\n", this_cpu);
+		return false;
+	}
+
+	if (this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) &&
+	    this_cpu_has(X86_FEATURE_VMX))
+		return true;
+
+	pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU%d\n", this_cpu);
+	return false;
+}
+
+/*
+ * One page of storage whose physical address cpu_enable_virtualization()
+ * passes to VMXON.  The vmcs_hdr member overlays the start of the page so the
+ * revision id can be written in place; data[] pads the union to PAGE_SIZE.
+ * A page-aligned instance is defined per CPU below.
+ *
+ * IA-32 SDM Vol 3B: VMCS size is never greater than 4kB.
+ */
+union vmxon_vmcs {
+ struct vmcs_hdr hdr;
+ char data[PAGE_SIZE];
+};
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(union vmxon_vmcs, vmxon_vmcs);
+
+/*
+ * Executed during the CPU startup phase to execute VMXON to enable VMX. This
+ * ensures that KVM, often loaded as a kernel module, no longer needs to worry
+ * about whether or not VMXON has been executed on a CPU (e.g., CPU offline
+ * events or system reboots while KVM is loading).
+ *
+ * Nothing is done when is_vmx_supported() fails or when CR4.VMXE is already
+ * set in the CR4 shadow.  Otherwise this CPU's VMXON region is zeroed and
+ * stamped with the VMCS revision id, Intel PT is notified, CR4.VMXE is set
+ * and VMXON is executed.
+ *
+ * VMXON is not expected to fault, but fault handling is kept as a precaution
+ * against any unexpected code paths that might trigger it and can be removed
+ * later if unnecessary.  On a fault, CR4.VMXE is cleared again and the Intel
+ * PT notification is undone.
+ */
+void cpu_enable_virtualization(void)
+{
+	union vmxon_vmcs *vmxon_region = this_cpu_ptr(&vmxon_vmcs);
+	int cpu = raw_smp_processor_id();
+	u64 vmxon_pointer;
+	u64 basic_msr;
+
+	if (!is_vmx_supported())
+		return;
+
+	if (cr4_read_shadow() & X86_CR4_VMXE) {
+		pr_err("VMX already enabled on CPU%d\n", cpu);
+		return;
+	}
+
+	/*
+	 * Static per-CPU data is not guaranteed to live in the kernel direct
+	 * mapping: the first per-CPU chunk may be set up by the page-based
+	 * allocator and mapped through vmalloc space, where __pa() yields a
+	 * bogus address.  per_cpu_ptr_to_phys() handles both layouts.
+	 */
+	vmxon_pointer = per_cpu_ptr_to_phys(vmxon_region);
+
+	memset(vmxon_region, 0, sizeof(*vmxon_region));
+
+	/*
+	 * The region passed as the VMXON operand must be marked with the VMCS
+	 * revision id reported by the physical CPU in MSR_IA32_VMX_BASIC.
+	 */
+	rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);
+	vmxon_region->hdr.revision_id = vmx_basic_vmcs_revision_id(basic_msr);
+
+	intel_pt_handle_vmx(1);
+
+	/* CR4.VMXE must be set before VMXON is executed. */
+	cr4_set_bits(X86_CR4_VMXE);
+
+	asm goto("1: vmxon %[vmxon_pointer]\n\t"
+		 _ASM_EXTABLE(1b, %l[fault])
+		 : : [vmxon_pointer] "m"(vmxon_pointer)
+		 : : fault);
+
+	return;
+
+fault:
+	/* Undo the two steps above in reverse order. */
+	pr_err("VMXON faulted on CPU%d\n", cpu);
+	cr4_clear_bits(X86_CR4_VMXE);
+	intel_pt_handle_vmx(0);
+}
+
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
@@ -2120,6 +2199,12 @@ void identify_secondary_cpu(unsigned int cpu)
tsx_ap_init();
c->initialized = true;
+
+ /*
+ * Enable AP virtualization immediately after initializing the per-CPU
+ * cpuinfo_x86 structure, ensuring that this_cpu_has() operates correctly.
+ */
+ cpu_enable_virtualization();
}
void print_cpu_info(struct cpuinfo_x86 *c)
@@ -2551,6 +2636,12 @@ void __init arch_cpu_finalize_init(void)
*c = boot_cpu_data;
c->initialized = true;
+ /*
+ * Enable BSP virtualization right after the BSP cpuinfo_x86 structure
+ * is initialized to ensure this_cpu_has() works as expected.
+ */
+ cpu_enable_virtualization();
+
alternative_instructions();
if (IS_ENABLED(CONFIG_X86_64)) {
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index b25625314658..da5631924432 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -13,11 +13,6 @@
#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
-struct vmcs_hdr {
- u32 revision_id:31;
- u32 shadow_vmcs:1;
-};
-
struct vmcs {
struct vmcs_hdr hdr;
u32 abort;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index aa157fe5b7b3..f6742df0c4ff 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -468,7 +468,6 @@ noinline void invept_error(unsigned long ext, u64 eptp)
vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp);
}
-static DEFINE_PER_CPU(struct vmcs *, vmxarea);
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
* We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
@@ -2736,43 +2735,14 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return 0;
}
-static bool __kvm_is_vmx_supported(void)
-{
- int cpu = smp_processor_id();
-
- if (!(cpuid_ecx(1) & feature_bit(VMX))) {
- pr_err("VMX not supported by CPU %d\n", cpu);
- return false;
- }
-
- if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
- !this_cpu_has(X86_FEATURE_VMX)) {
- pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
- return false;
- }
-
- return true;
-}
-
-static bool kvm_is_vmx_supported(void)
-{
- bool supported;
-
- migrate_disable();
- supported = __kvm_is_vmx_supported();
- migrate_enable();
-
- return supported;
-}
-
int vmx_check_processor_compat(void)
{
int cpu = raw_smp_processor_id();
struct vmcs_config vmcs_conf;
struct vmx_capability vmx_cap;
- if (!__kvm_is_vmx_supported())
- return -EIO;
+ if (!(cr4_read_shadow() & X86_CR4_VMXE))
+ return -EOPNOTSUPP;
if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
@@ -2787,34 +2757,12 @@ int vmx_check_processor_compat(void)
return 0;
}
-static int kvm_cpu_vmxon(u64 vmxon_pointer)
-{
- u64 msr;
-
- cr4_set_bits(X86_CR4_VMXE);
-
- asm goto("1: vmxon %[vmxon_pointer]\n\t"
- _ASM_EXTABLE(1b, %l[fault])
- : : [vmxon_pointer] "m"(vmxon_pointer)
- : : fault);
- return 0;
-
-fault:
- WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
- rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
- cr4_clear_bits(X86_CR4_VMXE);
-
- return -EFAULT;
-}
-
int vmx_enable_virtualization_cpu(void)
{
int cpu = raw_smp_processor_id();
- u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- int r;
- if (cr4_read_shadow() & X86_CR4_VMXE)
- return -EBUSY;
+ if (!(cr4_read_shadow() & X86_CR4_VMXE))
+ return -EOPNOTSUPP;
/*
* This can happen if we hot-added a CPU but failed to allocate
@@ -2823,14 +2771,6 @@ int vmx_enable_virtualization_cpu(void)
if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
return -EFAULT;
- intel_pt_handle_vmx(1);
-
- r = kvm_cpu_vmxon(phys_addr);
- if (r) {
- intel_pt_handle_vmx(0);
- return r;
- }
-
return 0;
}
@@ -2931,47 +2871,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
return -ENOMEM;
}
-static void free_kvm_area(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- free_vmcs(per_cpu(vmxarea, cpu));
- per_cpu(vmxarea, cpu) = NULL;
- }
-}
-
-static __init int alloc_kvm_area(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct vmcs *vmcs;
-
- vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
- if (!vmcs) {
- free_kvm_area();
- return -ENOMEM;
- }
-
- /*
- * When eVMCS is enabled, alloc_vmcs_cpu() sets
- * vmcs->revision_id to KVM_EVMCS_VERSION instead of
- * revision_id reported by MSR_IA32_VMX_BASIC.
- *
- * However, even though not explicitly documented by
- * TLFS, VMXArea passed as VMXON argument should
- * still be marked with revision_id reported by
- * physical CPU.
- */
- if (kvm_is_using_evmcs())
- vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
-
- per_cpu(vmxarea, cpu) = vmcs;
- }
- return 0;
-}
-
static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
struct kvm_segment *save)
{
@@ -8204,8 +8103,6 @@ void vmx_hardware_unsetup(void)
if (nested)
nested_vmx_hardware_unsetup();
-
- free_kvm_area();
}
void vmx_vm_destroy(struct kvm *kvm)
@@ -8499,10 +8396,6 @@ __init int vmx_hardware_setup(void)
vmx_set_cpu_caps();
- r = alloc_kvm_area();
- if (r && nested)
- nested_vmx_hardware_unsetup();
-
kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
/*
@@ -8554,7 +8447,7 @@ int __init vmx_init(void)
KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);
- if (!kvm_is_vmx_supported())
+ if (!(cr4_read_shadow() & X86_CR4_VMXE))
return -EOPNOTSUPP;
/*
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 916441f5e85c..0eec314b79c2 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -206,11 +206,11 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
/* cr4 was introduced in the Pentium CPU */
#ifdef CONFIG_X86_32
if (ctxt->cr4)
- __write_cr4(ctxt->cr4);
+ __write_cr4(ctxt->cr4 & ~X86_CR4_VMXE);
#else
/* CONFIG X86_64 */
wrmsrq(MSR_EFER, ctxt->efer);
- __write_cr4(ctxt->cr4);
+ __write_cr4(ctxt->cr4 & ~X86_CR4_VMXE);
#endif
write_cr3(ctxt->cr3);
write_cr2(ctxt->cr2);
@@ -291,6 +291,9 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
* because some of the MSRs are "emulated" in microcode.
*/
msr_restore_context(ctxt);
+
+ if (ctxt->cr4 & X86_CR4_VMXE)
+ cpu_enable_virtualization();
}
/* Needed by apm.c */
--
2.51.0
next prev parent reply other threads:[~2025-09-09 18:31 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-09-09 18:28 [RFC PATCH v1 0/5] x86/boot, KVM: Move VMXON/VMXOFF handling from KVM to CPU lifecycle Xin Li (Intel)
2025-09-09 18:28 ` Xin Li (Intel) [this message]
2025-09-10 5:37 ` [RFC PATCH v1 1/5] x86/boot: Shift VMXON from KVM init to CPU startup phase Adrian Hunter
2025-09-10 7:25 ` Chao Gao
2025-09-11 6:57 ` Xin Li
2025-09-10 8:02 ` Huang, Kai
2025-09-10 11:10 ` Chao Gao
2025-09-10 11:35 ` Huang, Kai
2025-09-10 13:13 ` Arjan van de Ven
2025-09-10 20:52 ` Huang, Kai
2025-09-09 18:28 ` [RFC PATCH v1 2/5] x86/boot: Move VMXOFF from KVM teardown to CPU shutdown phase Xin Li (Intel)
2025-09-09 18:28 ` [RFC PATCH v1 3/5] x86/shutdown, KVM: VMX: Move VMCLEAR of VMCSs to cpu_disable_virtualization() Xin Li (Intel)
2025-09-09 18:28 ` [RFC PATCH v1 4/5] x86/reboot: Remove emergency_reboot_disable_virtualization() Xin Li (Intel)
2025-09-09 18:28 ` [RFC PATCH v1 5/5] KVM: Remove kvm_rebooting and its references Xin Li (Intel)
2025-09-16 17:56 ` Sean Christopherson
2025-09-17 16:51 ` Xin Li
2025-09-17 23:02 ` Sean Christopherson
2025-09-11 14:20 ` [RFC PATCH v1 0/5] x86/boot, KVM: Move VMXON/VMXOFF handling from KVM to CPU lifecycle Sean Christopherson
2025-09-11 15:20 ` Dave Hansen
2025-09-16 17:29 ` Sean Christopherson
2025-09-11 17:04 ` Arjan van de Ven
2025-09-16 17:54 ` Sean Christopherson
2025-09-16 18:25 ` Jim Mattson
2025-09-17 13:48 ` Arjan van de Ven
2025-09-17 17:30 ` Xin Li
2025-09-17 22:40 ` Sean Christopherson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250909182828.1542362-2-xin@zytor.com \
--to=xin@zytor.com \
--cc=andrew.cooper3@citrix.com \
--cc=arjan@linux.intel.com \
--cc=bp@alien8.de \
--cc=brgerst@gmail.com \
--cc=chao.gao@intel.com \
--cc=dan.j.williams@intel.com \
--cc=dave.hansen@linux.intel.com \
--cc=david.kaplan@amd.com \
--cc=hpa@zytor.com \
--cc=kprateek.nayak@amd.com \
--cc=kvm@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-pm@vger.kernel.org \
--cc=mingo@redhat.com \
--cc=pavel@kernel.org \
--cc=pbonzini@redhat.com \
--cc=peterz@infradead.org \
--cc=rafael@kernel.org \
--cc=rick.p.edgecombe@intel.com \
--cc=seanjc@google.com \
--cc=tglx@linutronix.de \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox