public inbox for linux-pm@vger.kernel.org
 help / color / mirror / Atom feed
From: "Xin Li (Intel)" <xin@zytor.com>
To: linux-kernel@vger.kernel.org, kvm@vger.kernel.org,
	linux-pm@vger.kernel.org
Cc: seanjc@google.com, pbonzini@redhat.com, tglx@linutronix.de,
	mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com,
	x86@kernel.org, hpa@zytor.com, rafael@kernel.org,
	pavel@kernel.org, brgerst@gmail.com, xin@zytor.com,
	david.kaplan@amd.com, peterz@infradead.org,
	andrew.cooper3@citrix.com, kprateek.nayak@amd.com,
	arjan@linux.intel.com, chao.gao@intel.com,
	rick.p.edgecombe@intel.com, dan.j.williams@intel.com
Subject: [RFC PATCH v1 2/5] x86/boot: Move VMXOFF from KVM teardown to CPU shutdown phase
Date: Tue,  9 Sep 2025 11:28:22 -0700	[thread overview]
Message-ID: <20250909182828.1542362-3-xin@zytor.com> (raw)
In-Reply-To: <20250909182828.1542362-1-xin@zytor.com>

Relocate VMXOFF from the KVM module unload path to the CPU shutdown phase.
This simplifies proper virtualization cleanup during system shutdown, CPU
hotplug (online/offline cycles), and suspend-to-disk (S4) transitions.

Since INIT interrupts are blocked during VMX operation, VMXOFF must run
just before a CPU shuts down to allow it to be brought back online later.

As a result, VMX instructions are no longer expected to fault.

Signed-off-by: Xin Li (Intel) <xin@zytor.com>
---
 arch/x86/include/asm/processor.h |  1 +
 arch/x86/kernel/cpu/common.c     | 37 ++++++++++++++++++++++++++++++++
 arch/x86/kernel/crash.c          |  4 ++++
 arch/x86/kernel/process.c        |  3 +++
 arch/x86/kernel/reboot.c         | 11 ++++++----
 arch/x86/kernel/smp.c            |  5 +++++
 arch/x86/kernel/smpboot.c        |  6 ++++++
 arch/x86/kvm/vmx/vmx.c           | 30 --------------------------
 arch/x86/power/cpu.c             |  3 +++
 9 files changed, 66 insertions(+), 34 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 59660428f46d..0bfd4eb1e9e2 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -231,6 +231,7 @@ void get_cpu_vendor(struct cpuinfo_x86 *c);
 extern void early_cpu_init(void);
 extern void identify_secondary_cpu(unsigned int cpu);
 extern void cpu_enable_virtualization(void);
+extern void cpu_disable_virtualization(void);
 extern void print_cpu_info(struct cpuinfo_x86 *);
 void print_cpu_msr(struct cpuinfo_x86 *);
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e36877b5a240..39b9be9a2fb1 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2002,6 +2002,43 @@ void cpu_enable_virtualization(void)
 	intel_pt_handle_vmx(0);
 }
 
+/*
+ * Because INIT interrupts are blocked during VMX operation, this function
+ * must be called just before a CPU shuts down to ensure it can be brought
+ * back online later.
+ *
+ * Consequently, VMX instructions are no longer expected to fault.
+ *
+ * Although VMXOFF should not fault, fault handling is retained as a
+ * precaution against any unexpected code paths that might trigger it and
+ * can be removed later if unnecessary.
+ */
+void cpu_disable_virtualization(void)
+{
+	int cpu = raw_smp_processor_id();
+
+	if (!is_vmx_supported())
+		return;
+
+	if (!(cr4_read_shadow() & X86_CR4_VMXE)) {
+		pr_err("VMX not enabled or already disabled on CPU%d\n", cpu);
+		return;
+	}
+
+	asm goto("1: vmxoff\n\t"
+		 _ASM_EXTABLE(1b, %l[fault])
+		 ::: "cc", "memory" : fault);
+
+exit:
+	cr4_clear_bits(X86_CR4_VMXE);
+	intel_pt_handle_vmx(0);
+	return;
+
+fault:
+	pr_err("VMXOFF faulted on CPU%d\n", cpu);
+	goto exit;
+}
+
 /*
  * This does the hard work of actually picking apart the CPU stuff...
  */
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c6b12bed173d..772c6d350b50 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -111,6 +111,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
 
 	crash_smp_send_stop();
 
+	/* Kept to VMCLEAR loaded VMCSs */
 	cpu_emergency_disable_virtualization();
 
 	/*
@@ -141,6 +142,9 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
 	x86_platform.guest.enc_kexec_finish();
 
 	crash_save_cpu(regs, smp_processor_id());
+
+	/* Disable virtualization on the last running CPU, usually the BSP */
+	cpu_disable_virtualization();
 }
 
 #if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1b7960cf6eb0..a0f6397b81ab 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -827,6 +827,9 @@ void __noreturn stop_this_cpu(void *dummy)
 	disable_local_APIC();
 	mcheck_cpu_clear(c);
 
+	/* Disable virtualization, usually this is an AP */
+	cpu_disable_virtualization();
+
 	/*
 	 * Use wbinvd on processors that support SME. This provides support
 	 * for performing a successful kexec when going from SME inactive
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 964f6b0a3d68..7433e634018f 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -764,6 +764,9 @@ void native_machine_shutdown(void)
 
 	if (kexec_in_progress)
 		x86_platform.guest.enc_kexec_finish();
+
+	/* Disable virtualization on the last running CPU, usually the BSP */
+	cpu_disable_virtualization();
 }
 
 static void __machine_emergency_restart(int emergency)
@@ -873,14 +876,14 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
 	if (shootdown_callback)
 		shootdown_callback(cpu, regs);
 
-	/*
-	 * Prepare the CPU for reboot _after_ invoking the callback so that the
-	 * callback can safely use virtualization instructions, e.g. VMCLEAR.
-	 */
+	/* Kept to VMCLEAR loaded VMCSs */
 	cpu_emergency_disable_virtualization();
 
 	atomic_dec(&waiting_for_crash_ipi);
 
+	/* Disable virtualization, usually this is an AP */
+	cpu_disable_virtualization();
+
 	if (smp_ops.stop_this_cpu) {
 		smp_ops.stop_this_cpu();
 		BUG();
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index b014e6d229f9..eb6a389ba1a9 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -124,7 +124,9 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
 	if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
 		return NMI_HANDLED;
 
+	/* Kept to VMCLEAR loaded VMCSs */
 	cpu_emergency_disable_virtualization();
+
 	stop_this_cpu(NULL);
 
 	return NMI_HANDLED;
@@ -136,7 +138,10 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
 DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
 {
 	apic_eoi();
+
+	/* Kept to VMCLEAR loaded VMCSs */
 	cpu_emergency_disable_virtualization();
+
 	stop_this_cpu(NULL);
 }
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 33e166f6ab12..fe3b04f33b3f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1229,6 +1229,12 @@ int native_cpu_disable(void)
          */
 	apic_soft_disable();
 
+	/*
+	 * IPIs have been disabled as mentioned above, so virtualization
+	 * can now be safely shut down.
+	 */
+	cpu_disable_virtualization();
+
 	return 0;
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f6742df0c4ff..26af0a8ae08f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -674,29 +674,6 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
 	return ret;
 }
 
-/*
- * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
- *
- * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
- * atomically track post-VMXON state, e.g. this may be called in NMI context.
- * Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
- * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
- * magically in RM, VM86, compat mode, or at CPL>0.
- */
-static int kvm_cpu_vmxoff(void)
-{
-	asm goto("1: vmxoff\n\t"
-			  _ASM_EXTABLE(1b, %l[fault])
-			  ::: "cc", "memory" : fault);
-
-	cr4_clear_bits(X86_CR4_VMXE);
-	return 0;
-
-fault:
-	cr4_clear_bits(X86_CR4_VMXE);
-	return -EIO;
-}
-
 void vmx_emergency_disable_virtualization_cpu(void)
 {
 	int cpu = raw_smp_processor_id();
@@ -719,8 +696,6 @@ void vmx_emergency_disable_virtualization_cpu(void)
 		if (v->shadow_vmcs)
 			vmcs_clear(v->shadow_vmcs);
 	}
-
-	kvm_cpu_vmxoff();
 }
 
 static void __loaded_vmcs_clear(void *arg)
@@ -2788,12 +2763,7 @@ void vmx_disable_virtualization_cpu(void)
 {
 	vmclear_local_loaded_vmcss();
 
-	if (kvm_cpu_vmxoff())
-		kvm_spurious_fault();
-
 	hv_reset_evmcs();
-
-	intel_pt_handle_vmx(0);
 }
 
 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 0eec314b79c2..d2c865fdb069 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -129,6 +129,9 @@ static void __save_processor_state(struct saved_context *ctxt)
 	ctxt->misc_enable_saved = !rdmsrq_safe(MSR_IA32_MISC_ENABLE,
 					       &ctxt->misc_enable);
 	msr_save_context(ctxt);
+
+	/* Now CR4 is saved, disable VMX and clear CR4.VMXE */
+	cpu_disable_virtualization();
 }
 
 /* Needed by apm.c */
-- 
2.51.0


  parent reply	other threads:[~2025-09-09 18:31 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-09-09 18:28 [RFC PATCH v1 0/5] x86/boot, KVM: Move VMXON/VMXOFF handling from KVM to CPU lifecycle Xin Li (Intel)
2025-09-09 18:28 ` [RFC PATCH v1 1/5] x86/boot: Shift VMXON from KVM init to CPU startup phase Xin Li (Intel)
2025-09-10  5:37   ` Adrian Hunter
2025-09-10  7:25   ` Chao Gao
2025-09-11  6:57     ` Xin Li
2025-09-10  8:02   ` Huang, Kai
2025-09-10 11:10     ` Chao Gao
2025-09-10 11:35       ` Huang, Kai
2025-09-10 13:13         ` Arjan van de Ven
2025-09-10 20:52           ` Huang, Kai
2025-09-09 18:28 ` Xin Li (Intel) [this message]
2025-09-09 18:28 ` [RFC PATCH v1 3/5] x86/shutdown, KVM: VMX: Move VMCLEAR of VMCSs to cpu_disable_virtualization() Xin Li (Intel)
2025-09-09 18:28 ` [RFC PATCH v1 4/5] x86/reboot: Remove emergency_reboot_disable_virtualization() Xin Li (Intel)
2025-09-09 18:28 ` [RFC PATCH v1 5/5] KVM: Remove kvm_rebooting and its references Xin Li (Intel)
2025-09-16 17:56   ` Sean Christopherson
2025-09-17 16:51     ` Xin Li
2025-09-17 23:02       ` Sean Christopherson
2025-09-11 14:20 ` [RFC PATCH v1 0/5] x86/boot, KVM: Move VMXON/VMXOFF handling from KVM to CPU lifecycle Sean Christopherson
2025-09-11 15:20   ` Dave Hansen
2025-09-16 17:29     ` Sean Christopherson
2025-09-11 17:04   ` Arjan van de Ven
2025-09-16 17:54     ` Sean Christopherson
2025-09-16 18:25       ` Jim Mattson
2025-09-17 13:48       ` Arjan van de Ven
2025-09-17 17:30       ` Xin Li
2025-09-17 22:40         ` Sean Christopherson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250909182828.1542362-3-xin@zytor.com \
    --to=xin@zytor.com \
    --cc=andrew.cooper3@citrix.com \
    --cc=arjan@linux.intel.com \
    --cc=bp@alien8.de \
    --cc=brgerst@gmail.com \
    --cc=chao.gao@intel.com \
    --cc=dan.j.williams@intel.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=david.kaplan@amd.com \
    --cc=hpa@zytor.com \
    --cc=kprateek.nayak@amd.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-pm@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=pavel@kernel.org \
    --cc=pbonzini@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rafael@kernel.org \
    --cc=rick.p.edgecombe@intel.com \
    --cc=seanjc@google.com \
    --cc=tglx@linutronix.de \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox