* [PATCH 6.17.y 2/3] KVM: x86: Add support for RDMSR/WRMSRNS w/ immediate on Intel
2025-11-20 19:07 ` [PATCH 6.17.y 1/3] KVM: x86: Rename local "ecx" variables to "msr" and "pmc" as appropriate Sasha Levin
@ 2025-11-20 19:07 ` Sasha Levin
2025-11-20 19:07 ` [PATCH 6.17.y 3/3] KVM: VMX: Inject #UD if guest tries to execute SEAMCALL or TDCALL Sasha Levin
1 sibling, 0 replies; 4+ messages in thread
From: Sasha Levin @ 2025-11-20 19:07 UTC (permalink / raw)
To: stable; +Cc: Xin Li, Sean Christopherson, Sasha Levin
From: Xin Li <xin@zytor.com>
[ Upstream commit 885df2d2109a60f84d84639ce6d95a91045f6c45 ]
Add support for the immediate forms of RDMSR and WRMSRNS (currently
Intel-only). The immediate variants are only valid in 64-bit mode, and
use a single general purpose register for the data (the register is also
encoded in the instruction, i.e. not implicit like regular RDMSR/WRMSR).
The immediate variants are primarily motivated by performance, not code
size: by having the MSR index in an immediate, it is available *much*
earlier in the CPU pipeline, which allows hardware much more leeway about
how a particular MSR is handled.
Intel VMX support for the immediate forms of MSR accesses communicates
exit information to the host as follows:
1) The immediate form of RDMSR uses VM-Exit Reason 84.
2) The immediate form of WRMSRNS uses VM-Exit Reason 85.
3) For both VM-Exit reasons 84 and 85, the Exit Qualification field is
set to the MSR index that triggered the VM-Exit.
4) Bits 3 ~ 6 of the VM-Exit Instruction Information field are set to
the register encoding used by the immediate form of the instruction,
i.e. the destination register for RDMSR, and the source for WRMSRNS.
5) The VM-Exit Instruction Length field records the size of the
immediate form of the MSR instruction.
To deal with userspace RDMSR exits, stash the destination register in a
new kvm_vcpu_arch field, similar to cui_linear_rip, pio, etc.
Alternatively, the register could be saved in kvm_run.msr or re-retrieved
from the VMCS, but the former would require sanitizing the value to ensure
userspace doesn't clobber the value to an out-of-bounds index, and the
latter would require a new one-off kvm_x86_ops hook.
Don't bother adding support for the instructions in KVM's emulator, as the
only way for RDMSR/WRMSR to be encountered is if KVM is emulating large
swaths of code due to invalid guest state, and a vCPU cannot have invalid
guest state while in 64-bit mode.
Signed-off-by: Xin Li (Intel) <xin@zytor.com>
[sean: minor tweaks, massage and expand changelog]
Link: https://lore.kernel.org/r/20250805202224.1475590-5-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Stable-dep-of: 9d7dfb95da2c ("KVM: VMX: Inject #UD if guest tries to execute SEAMCALL or TDCALL")
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
arch/x86/include/asm/kvm_host.h | 3 ++
arch/x86/include/uapi/asm/vmx.h | 6 +++-
arch/x86/kvm/vmx/nested.c | 13 ++++++--
arch/x86/kvm/vmx/vmx.c | 21 +++++++++++++
arch/x86/kvm/vmx/vmx.h | 5 +++
arch/x86/kvm/x86.c | 55 +++++++++++++++++++++++++++------
6 files changed, 90 insertions(+), 13 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a35ee44ec70ad..7e87e7d9ba5ae 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -926,6 +926,7 @@ struct kvm_vcpu_arch {
bool emulate_regs_need_sync_from_vcpu;
int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
unsigned long cui_linear_rip;
+ int cui_rdmsr_imm_reg;
gpa_t time;
s8 pvclock_tsc_shift;
@@ -2155,7 +2156,9 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiat
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data);
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu);
+int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg);
int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu);
+int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg);
int kvm_emulate_as_nop(struct kvm_vcpu *vcpu);
int kvm_emulate_invd(struct kvm_vcpu *vcpu);
int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index f0f4a4cf84a72..9792e329343e8 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -94,6 +94,8 @@
#define EXIT_REASON_BUS_LOCK 74
#define EXIT_REASON_NOTIFY 75
#define EXIT_REASON_TDCALL 77
+#define EXIT_REASON_MSR_READ_IMM 84
+#define EXIT_REASON_MSR_WRITE_IMM 85
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -158,7 +160,9 @@
{ EXIT_REASON_TPAUSE, "TPAUSE" }, \
{ EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \
{ EXIT_REASON_NOTIFY, "NOTIFY" }, \
- { EXIT_REASON_TDCALL, "TDCALL" }
+ { EXIT_REASON_TDCALL, "TDCALL" }, \
+ { EXIT_REASON_MSR_READ_IMM, "MSR_READ_IMM" }, \
+ { EXIT_REASON_MSR_WRITE_IMM, "MSR_WRITE_IMM" }
#define VMX_EXIT_REASON_FLAGS \
{ VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" }
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index b8ea1969113df..4e6352ef95201 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -6216,19 +6216,26 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12,
union vmx_exit_reason exit_reason)
{
- u32 msr_index = kvm_rcx_read(vcpu);
+ u32 msr_index;
gpa_t bitmap;
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
return true;
+ if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM ||
+ exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
+ msr_index = vmx_get_exit_qual(vcpu);
+ else
+ msr_index = kvm_rcx_read(vcpu);
+
/*
* The MSR_BITMAP page is divided into four 1024-byte bitmaps,
* for the four combinations of read/write and low/high MSR numbers.
* First we need to figure out which of the four to use:
*/
bitmap = vmcs12->msr_bitmap;
- if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
+ if (exit_reason.basic == EXIT_REASON_MSR_WRITE ||
+ exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
bitmap += 2048;
if (msr_index >= 0xc0000000) {
msr_index -= 0xc0000000;
@@ -6527,6 +6534,8 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
case EXIT_REASON_MSR_READ:
case EXIT_REASON_MSR_WRITE:
+ case EXIT_REASON_MSR_READ_IMM:
+ case EXIT_REASON_MSR_WRITE_IMM:
return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
case EXIT_REASON_INVALID_STATE:
return true;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index aa157fe5b7b31..4d1af365f5845 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6003,6 +6003,23 @@ static int handle_notify(struct kvm_vcpu *vcpu)
return 1;
}
+static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO));
+}
+
+static int handle_rdmsr_imm(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_rdmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_get_msr_imm_reg(vcpu));
+}
+
+static int handle_wrmsr_imm(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_get_msr_imm_reg(vcpu));
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -6061,6 +6078,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_ENCLS] = handle_encls,
[EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
[EXIT_REASON_NOTIFY] = handle_notify,
+ [EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm,
+ [EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm,
};
static const int kvm_vmx_max_exit_handlers =
@@ -6495,6 +6514,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
#ifdef CONFIG_MITIGATION_RETPOLINE
if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
return kvm_emulate_wrmsr(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
+ return handle_wrmsr_imm(vcpu);
else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
return handle_preemption_timer(vcpu);
else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index d3389baf3ab3d..24d65dac5e897 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -706,6 +706,11 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu)
void dump_vmcs(struct kvm_vcpu *vcpu);
+static inline int vmx_get_instr_info_reg(u32 vmx_instr_info)
+{
+ return (vmx_instr_info >> 3) & 0xf;
+}
+
static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info)
{
return (vmx_instr_info >> 28) & 0xf;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f98b801d9efdf..0bfab634c5912 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1997,6 +1997,15 @@ static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
return complete_fast_msr_access(vcpu);
}
+static int complete_fast_rdmsr_imm(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->run->msr.error)
+ kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg,
+ vcpu->run->msr.data);
+
+ return complete_fast_msr_access(vcpu);
+}
+
static u64 kvm_msr_reason(int r)
{
switch (r) {
@@ -2031,39 +2040,53 @@ static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
return 1;
}
-int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
+static int __kvm_emulate_rdmsr(struct kvm_vcpu *vcpu, u32 msr, int reg,
+ int (*complete_rdmsr)(struct kvm_vcpu *))
{
- u32 msr = kvm_rcx_read(vcpu);
u64 data;
int r;
r = kvm_get_msr_with_filter(vcpu, msr, &data);
-
if (!r) {
trace_kvm_msr_read(msr, data);
- kvm_rax_write(vcpu, data & -1u);
- kvm_rdx_write(vcpu, (data >> 32) & -1u);
+ if (reg < 0) {
+ kvm_rax_write(vcpu, data & -1u);
+ kvm_rdx_write(vcpu, (data >> 32) & -1u);
+ } else {
+ kvm_register_write(vcpu, reg, data);
+ }
} else {
/* MSR read failed? See if we should ask user space */
if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_RDMSR, 0,
- complete_fast_rdmsr, r))
+ complete_rdmsr, r))
return 0;
trace_kvm_msr_read_ex(msr);
}
return kvm_x86_call(complete_emulated_msr)(vcpu, r);
}
+
+int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
+{
+ return __kvm_emulate_rdmsr(vcpu, kvm_rcx_read(vcpu), -1,
+ complete_fast_rdmsr);
+}
EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
-int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
+int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
+{
+ vcpu->arch.cui_rdmsr_imm_reg = reg;
+
+ return __kvm_emulate_rdmsr(vcpu, msr, reg, complete_fast_rdmsr_imm);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr_imm);
+
+static int __kvm_emulate_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
- u32 msr = kvm_rcx_read(vcpu);
- u64 data = kvm_read_edx_eax(vcpu);
int r;
r = kvm_set_msr_with_filter(vcpu, msr, data);
-
if (!r) {
trace_kvm_msr_write(msr, data);
} else {
@@ -2079,8 +2102,20 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
return kvm_x86_call(complete_emulated_msr)(vcpu, r);
}
+
+int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
+{
+ return __kvm_emulate_wrmsr(vcpu, kvm_rcx_read(vcpu),
+ kvm_read_edx_eax(vcpu));
+}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
+int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
+{
+ return __kvm_emulate_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg));
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr_imm);
+
int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
{
return kvm_skip_emulated_instruction(vcpu);
--
2.51.0
^ permalink raw reply related [flat|nested] 4+ messages in thread* [PATCH 6.17.y 3/3] KVM: VMX: Inject #UD if guest tries to execute SEAMCALL or TDCALL
2025-11-20 19:07 ` [PATCH 6.17.y 1/3] KVM: x86: Rename local "ecx" variables to "msr" and "pmc" as appropriate Sasha Levin
2025-11-20 19:07 ` [PATCH 6.17.y 2/3] KVM: x86: Add support for RDMSR/WRMSRNS w/ immediate on Intel Sasha Levin
@ 2025-11-20 19:07 ` Sasha Levin
1 sibling, 0 replies; 4+ messages in thread
From: Sasha Levin @ 2025-11-20 19:07 UTC (permalink / raw)
To: stable
Cc: Sean Christopherson, Kai Huang, Xiaoyao Li, Rick Edgecombe,
Dan Williams, Binbin Wu, Sasha Levin
From: Sean Christopherson <seanjc@google.com>
[ Upstream commit 9d7dfb95da2cb5c1287df2f3468bcb70d8b31087 ]
Add VMX exit handlers for SEAMCALL and TDCALL to inject a #UD if a non-TD
guest attempts to execute SEAMCALL or TDCALL. Neither SEAMCALL nor TDCALL
is gated by any software enablement other than VMXON, and so will generate
a VM-Exit instead of e.g. a native #UD when executed from the guest kernel.
Note! No unprivileged DoS of the L1 kernel is possible as TDCALL and
SEAMCALL #GP at CPL > 0, and the CPL check is performed prior to the VMX
non-root (VM-Exit) check, i.e. userspace can't crash the VM. And for a
nested guest, KVM forwards unknown exits to L1, i.e. an L2 kernel can
crash itself, but not L1.
Note #2! The Intel® Trust Domain CPU Architectural Extensions spec's
pseudocode shows the CPL > 0 check for SEAMCALL coming _after_ the VM-Exit,
but that appears to be a documentation bug (likely because the CPL > 0
check was incorrectly bundled with other lower-priority #GP checks).
Testing on SPR and EMR shows that the CPL > 0 check is performed before
the VMX non-root check, i.e. SEAMCALL #GPs when executed in usermode.
Note #3! The aforementioned Trust Domain spec uses confusing pseudocode
that says that SEAMCALL will #UD if executed "inSEAM", but "inSEAM"
specifically means in SEAM Root Mode, i.e. in the TDX-Module. The long-
form description explicitly states that SEAMCALL generates an exit when
executed in "SEAM VMX non-root operation". But that's a moot point as the
TDX-Module injects #UD if the guest attempts to execute SEAMCALL, as
documented in the "Unconditionally Blocked Instructions" section of the
TDX-Module base specification.
Cc: stable@vger.kernel.org
Cc: Kai Huang <kai.huang@intel.com>
Cc: Xiaoyao Li <xiaoyao.li@intel.com>
Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Binbin Wu <binbin.wu@linux.intel.com>
Reviewed-by: Kai Huang <kai.huang@intel.com>
Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Link: https://lore.kernel.org/r/20251016182148.69085-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
arch/x86/include/uapi/asm/vmx.h | 1 +
arch/x86/kvm/vmx/nested.c | 8 ++++++++
arch/x86/kvm/vmx/vmx.c | 8 ++++++++
3 files changed, 17 insertions(+)
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 9792e329343e8..1baa86dfe0293 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -93,6 +93,7 @@
#define EXIT_REASON_TPAUSE 68
#define EXIT_REASON_BUS_LOCK 74
#define EXIT_REASON_NOTIFY 75
+#define EXIT_REASON_SEAMCALL 76
#define EXIT_REASON_TDCALL 77
#define EXIT_REASON_MSR_READ_IMM 84
#define EXIT_REASON_MSR_WRITE_IMM 85
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 4e6352ef95201..c66145aca2d8d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -6587,6 +6587,14 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
case EXIT_REASON_NOTIFY:
/* Notify VM exit is not exposed to L1 */
return false;
+ case EXIT_REASON_SEAMCALL:
+ case EXIT_REASON_TDCALL:
+ /*
+ * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't
+ * virtualized by KVM for L1 hypervisors, i.e. L1 should
+ * never want or expect such an exit.
+ */
+ return false;
default:
return true;
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4d1af365f5845..7bd1679634e93 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5953,6 +5953,12 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_tdx_instruction(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
#ifndef CONFIG_X86_SGX_KVM
static int handle_encls(struct kvm_vcpu *vcpu)
{
@@ -6078,6 +6084,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_ENCLS] = handle_encls,
[EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
[EXIT_REASON_NOTIFY] = handle_notify,
+ [EXIT_REASON_SEAMCALL] = handle_tdx_instruction,
+ [EXIT_REASON_TDCALL] = handle_tdx_instruction,
[EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm,
[EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm,
};
--
2.51.0
^ permalink raw reply related [flat|nested] 4+ messages in thread