Linux Confidential Computing Development

Linux Confidential Computing Development
 help / color / mirror / Atom feed

* [PATCH v2 4/6] KVM/x86: Return -errno instead of "1" for VMX related MSR emulation
From: Juergen Gross @ 2026-05-28 11:36 UTC (permalink / raw)
  To: linux-kernel, x86, kvm, linux-coco
  Cc: Juergen Gross, Sean Christopherson, Paolo Bonzini,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Kiryl Shutsemau, Rick Edgecombe
In-Reply-To: <20260528113605.267111-1-jgross@suse.com>

Instead of returning a literal "1" to signal an error, return a negative
errno value in the emulation code for VMX-related MSRs.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
V2:
- use -errno instead of KVM_MSR_RET_ERR
---
 arch/x86/kvm/vmx/nested.c    |  2 +-
 arch/x86/kvm/vmx/pmu_intel.c | 16 +++---
 arch/x86/kvm/vmx/tdx.c       | 10 ++--
 arch/x86/kvm/vmx/vmx.c       | 96 ++++++++++++++++++------------------
 4 files changed, 62 insertions(+), 62 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 3fe88f29be7a..2236f15ffab2 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1611,7 +1611,7 @@ int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
 		*pdata = msrs->vmfunc_controls;
 		break;
 	default:
-		return 1;
+		return -EINVAL;
 	}
 
 	return 0;
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 27eb76e6b6a0..4f7e354c4b50 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -362,7 +362,7 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		} else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) {
 			break;
 		}
-		return 1;
+		return -EINVAL;
 	}
 
 	return 0;
@@ -379,14 +379,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	switch (msr) {
 	case MSR_CORE_PERF_FIXED_CTR_CTRL:
 		if (data & pmu->fixed_ctr_ctrl_rsvd)
-			return 1;
+			return -EINVAL;
 
 		if (pmu->fixed_ctr_ctrl != data)
 			reprogram_fixed_counters(pmu, data);
 		break;
 	case MSR_IA32_PEBS_ENABLE:
 		if (data & pmu->pebs_enable_rsvd)
-			return 1;
+			return -EINVAL;
 
 		if (pmu->pebs_enable != data) {
 			diff = pmu->pebs_enable ^ data;
@@ -396,13 +396,13 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	case MSR_IA32_DS_AREA:
 		if (is_noncanonical_msr_address(data, vcpu))
-			return 1;
+			return -EINVAL;
 
 		pmu->ds_area = data;
 		break;
 	case MSR_PEBS_DATA_CFG:
 		if (data & pmu->pebs_data_cfg_rsvd)
-			return 1;
+			return -EINVAL;
 
 		pmu->pebs_data_cfg = data;
 		break;
@@ -411,7 +411,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		    (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
 			if ((msr & MSR_PMC_FULL_WIDTH_BIT) &&
 			    (data & ~pmu->counter_bitmask[KVM_PMC_GP]))
-				return 1;
+				return -EINVAL;
 
 			if (!msr_info->host_initiated &&
 			    !(msr & MSR_PMC_FULL_WIDTH_BIT))
@@ -427,7 +427,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			    (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED))
 				reserved_bits ^= HSW_IN_TX_CHECKPOINTED;
 			if (data & reserved_bits)
-				return 1;
+				return -EINVAL;
 
 			if (data != pmc->eventsel) {
 				pmc->eventsel = data;
@@ -439,7 +439,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			break;
 		}
 		/* Not a known PMU MSR. */
-		return 1;
+		return -EINVAL;
 	}
 
 	return 0;
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 04ce321ebdf3..acc3242af4f4 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2158,12 +2158,12 @@ int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 		return 0;
 	case MSR_IA32_MCG_EXT_CTL:
 		if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
-			return 1;
+			return -EINVAL;
 		msr->data = vcpu->arch.mcg_ext_ctl;
 		return 0;
 	default:
 		if (!tdx_has_emulated_msr(msr->index))
-			return 1;
+			return -EACCES;
 
 		return kvm_get_msr_common(vcpu, msr);
 	}
@@ -2175,15 +2175,15 @@ int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	case MSR_IA32_MCG_EXT_CTL:
 		if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
 		    (msr->data & ~MCG_EXT_CTL_LMCE_EN))
-			return 1;
+			return -EINVAL;
 		vcpu->arch.mcg_ext_ctl = msr->data;
 		return 0;
 	default:
 		if (tdx_is_read_only_msr(msr->index))
-			return 1;
+			return -EACCES;
 
 		if (!tdx_has_emulated_msr(msr->index))
-			return 1;
+			return -EACCES;
 
 		return kvm_set_msr_common(vcpu, msr);
 	}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b9103de01428..2eee599fca30 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2076,7 +2076,7 @@ int vmx_get_feature_msr(u32 msr, u64 *data)
 	switch (msr) {
 	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
 		if (!nested)
-			return 1;
+			return -EINVAL;
 		return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
 	default:
 		return KVM_MSR_RET_UNSUPPORTED;
@@ -2111,18 +2111,18 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_TSX_CTRL:
 		if (!msr_info->host_initiated &&
 		    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
-			return 1;
+			return -EINVAL;
 		goto find_uret_msr;
 	case MSR_IA32_UMWAIT_CONTROL:
 		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
-			return 1;
+			return -EINVAL;
 
 		msr_info->data = vmx->msr_ia32_umwait_control;
 		break;
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr_info->host_initiated &&
 		    !guest_has_spec_ctrl_msr(vcpu))
-			return 1;
+			return -EINVAL;
 
 		msr_info->data = to_vmx(vcpu)->spec_ctrl;
 		break;
@@ -2139,14 +2139,14 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (!kvm_mpx_supported() ||
 		    (!msr_info->host_initiated &&
 		     !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
-			return 1;
+			return -EINVAL;
 		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
 		break;
 	case MSR_IA32_MCG_EXT_CTL:
 		if (!msr_info->host_initiated &&
 		    !(vmx->msr_ia32_feature_control &
 		      FEAT_CTL_LMCE_ENABLED))
-			return 1;
+			return -EINVAL;
 		msr_info->data = vcpu->arch.mcg_ext_ctl;
 		break;
 	case MSR_IA32_FEAT_CTL:
@@ -2155,16 +2155,16 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
 		if (!msr_info->host_initiated &&
 		    !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
-			return 1;
+			return -EINVAL;
 		msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
 			[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
 		break;
 	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
 		if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
-			return 1;
+			return -EINVAL;
 		if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
 				    &msr_info->data))
-			return 1;
+			return -EINVAL;
 #ifdef CONFIG_KVM_HYPERV
 		/*
 		 * Enlightened VMCS v1 doesn't have certain VMCS fields but
@@ -2180,19 +2180,19 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	case MSR_IA32_RTIT_CTL:
 		if (!vmx_pt_mode_is_host_guest())
-			return 1;
+			return -EINVAL;
 		msr_info->data = vmx->pt_desc.guest.ctl;
 		break;
 	case MSR_IA32_RTIT_STATUS:
 		if (!vmx_pt_mode_is_host_guest())
-			return 1;
+			return -EINVAL;
 		msr_info->data = vmx->pt_desc.guest.status;
 		break;
 	case MSR_IA32_RTIT_CR3_MATCH:
 		if (!vmx_pt_mode_is_host_guest() ||
 			!intel_pt_validate_cap(vmx->pt_desc.caps,
 						PT_CAP_cr3_filtering))
-			return 1;
+			return -EINVAL;
 		msr_info->data = vmx->pt_desc.guest.cr3_match;
 		break;
 	case MSR_IA32_RTIT_OUTPUT_BASE:
@@ -2201,7 +2201,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 					PT_CAP_topa_output) &&
 			 !intel_pt_validate_cap(vmx->pt_desc.caps,
 					PT_CAP_single_range_output)))
-			return 1;
+			return -EINVAL;
 		msr_info->data = vmx->pt_desc.guest.output_base;
 		break;
 	case MSR_IA32_RTIT_OUTPUT_MASK:
@@ -2210,14 +2210,14 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 					PT_CAP_topa_output) &&
 			 !intel_pt_validate_cap(vmx->pt_desc.caps,
 					PT_CAP_single_range_output)))
-			return 1;
+			return -EINVAL;
 		msr_info->data = vmx->pt_desc.guest.output_mask;
 		break;
 	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
 		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
 		if (!vmx_pt_mode_is_host_guest() ||
 		    (index >= 2 * vmx->pt_desc.num_address_ranges))
-			return 1;
+			return -EINVAL;
 		if (index % 2)
 			msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
 		else
@@ -2359,7 +2359,7 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	case MSR_IA32_DEBUGCTLMSR:
 		if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
-			return 1;
+			return -EINVAL;
 
 		data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
 
@@ -2377,10 +2377,10 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (!kvm_mpx_supported() ||
 		    (!msr_info->host_initiated &&
 		     !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
-			return 1;
+			return -EINVAL;
 		if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
 		    (data & MSR_IA32_BNDCFGS_RSVD))
-			return 1;
+			return -EINVAL;
 
 		if (is_guest_mode(vcpu) &&
 		    ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
@@ -2391,21 +2391,21 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	case MSR_IA32_UMWAIT_CONTROL:
 		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
-			return 1;
+			return -EINVAL;
 
 		/* The reserved bit 1 and non-32 bit [63:32] should be zero */
 		if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
-			return 1;
+			return -EINVAL;
 
 		vmx->msr_ia32_umwait_control = data;
 		break;
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr_info->host_initiated &&
 		    !guest_has_spec_ctrl_msr(vcpu))
-			return 1;
+			return -EINVAL;
 
 		if (kvm_spec_ctrl_test_value(data))
-			return 1;
+			return -EINVAL;
 
 		vmx->spec_ctrl = data;
 		if (!data)
@@ -2430,9 +2430,9 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_TSX_CTRL:
 		if (!msr_info->host_initiated &&
 		    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
-			return 1;
+			return -EINVAL;
 		if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
-			return 1;
+			return -EINVAL;
 		goto find_uret_msr;
 	case MSR_IA32_CR_PAT:
 		ret = kvm_set_msr_common(vcpu, msr_info);
@@ -2451,12 +2451,12 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		     !(to_vmx(vcpu)->msr_ia32_feature_control &
 		       FEAT_CTL_LMCE_ENABLED)) ||
 		    (data & ~MCG_EXT_CTL_LMCE_EN))
-			return 1;
+			return -EINVAL;
 		vcpu->arch.mcg_ext_ctl = data;
 		break;
 	case MSR_IA32_FEAT_CTL:
 		if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
-			return 1;
+			return -EINVAL;
 
 		vmx->msr_ia32_feature_control = data;
 		if (msr_info->host_initiated && data == 0)
@@ -2481,70 +2481,70 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		    (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) ||
 		    ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
 		    !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
-			return 1;
+			return -EINVAL;
 		vmx->msr_ia32_sgxlepubkeyhash
 			[msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
 		break;
 	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
 		if (!msr_info->host_initiated)
-			return 1; /* they are read-only */
+			return -EINVAL; /* they are read-only */
 		if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
-			return 1;
+			return -EINVAL;
 		return vmx_set_vmx_msr(vcpu, msr_index, data);
 	case MSR_IA32_RTIT_CTL:
 		if (!vmx_pt_mode_is_host_guest() ||
 			vmx_rtit_ctl_check(vcpu, data) ||
 			vmx->nested.vmxon)
-			return 1;
+			return -EINVAL;
 		vmcs_write64(GUEST_IA32_RTIT_CTL, data);
 		vmx->pt_desc.guest.ctl = data;
 		pt_update_intercept_for_msr(vcpu);
 		break;
 	case MSR_IA32_RTIT_STATUS:
 		if (!pt_can_write_msr(vmx))
-			return 1;
+			return -EINVAL;
 		if (data & MSR_IA32_RTIT_STATUS_MASK)
-			return 1;
+			return -EINVAL;
 		vmx->pt_desc.guest.status = data;
 		break;
 	case MSR_IA32_RTIT_CR3_MATCH:
 		if (!pt_can_write_msr(vmx))
-			return 1;
+			return -EINVAL;
 		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
 					   PT_CAP_cr3_filtering))
-			return 1;
+			return -EINVAL;
 		vmx->pt_desc.guest.cr3_match = data;
 		break;
 	case MSR_IA32_RTIT_OUTPUT_BASE:
 		if (!pt_can_write_msr(vmx))
-			return 1;
+			return -EINVAL;
 		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
 					   PT_CAP_topa_output) &&
 		    !intel_pt_validate_cap(vmx->pt_desc.caps,
 					   PT_CAP_single_range_output))
-			return 1;
+			return -EINVAL;
 		if (!pt_output_base_valid(vcpu, data))
-			return 1;
+			return -EINVAL;
 		vmx->pt_desc.guest.output_base = data;
 		break;
 	case MSR_IA32_RTIT_OUTPUT_MASK:
 		if (!pt_can_write_msr(vmx))
-			return 1;
+			return -EINVAL;
 		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
 					   PT_CAP_topa_output) &&
 		    !intel_pt_validate_cap(vmx->pt_desc.caps,
 					   PT_CAP_single_range_output))
-			return 1;
+			return -EINVAL;
 		vmx->pt_desc.guest.output_mask = data;
 		break;
 	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
 		if (!pt_can_write_msr(vmx))
-			return 1;
+			return -EINVAL;
 		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
 		if (index >= 2 * vmx->pt_desc.num_address_ranges)
-			return 1;
+			return -EINVAL;
 		if (is_noncanonical_msr_address(data, vcpu))
-			return 1;
+			return -EINVAL;
 		if (index % 2)
 			vmx->pt_desc.guest.addr_b[index / 2] = data;
 		else
@@ -2563,20 +2563,20 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (data & PERF_CAP_LBR_FMT) {
 			if ((data & PERF_CAP_LBR_FMT) !=
 			    (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT))
-				return 1;
+				return -EINVAL;
 			if (!cpuid_model_is_consistent(vcpu))
-				return 1;
+				return -EINVAL;
 		}
 		if (data & PERF_CAP_PEBS_FORMAT) {
 			if ((data & PERF_CAP_PEBS_MASK) !=
 			    (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
-				return 1;
+				return -EINVAL;
 			if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS))
-				return 1;
+				return -EINVAL;
 			if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64))
-				return 1;
+				return -EINVAL;
 			if (!cpuid_model_is_consistent(vcpu))
-				return 1;
+				return -EINVAL;
 		}
 		ret = kvm_set_msr_common(vcpu, msr_info);
 		break;
-- 
2.54.0


^ permalink raw reply related

* COCONUT-SVSM Development Release v2026.05-devel
From: Jörg Rödel @ 2026-05-28 11:44 UTC (permalink / raw)
  To: coconut-svsm, linux-coco

Hi all,

The month is almost over and it is time for a new COCONUT-SVSM development
release. This one turned out bigger than usual with 33 merges that brought in
91 non-merge commits from 9 contributors.

The changes include:

  - Boot flow: stage2 was removed and replaced by the new simpler boot/bldr
    loader. Build, xbuild, IGVM builder, configs, and launch paths now
    prefer/consume bldr.

  - Platform/CPU feature model: CPUID handling was routed through the platform
    abstraction, with a feature lookup table added for x2APIC, physical address
    size, Hyper-V discovery, CET, FPU/SSE, INVLPGB, C-bit, and related SNP
    features.

  - Attestation: Added vsock transport support with serial fallback, refactored
    aproxy transport handling, added read_exact / write_all helpers, and
    documented the vsock transport option.

  - Protocol hardening: Core and attestation protocol handlers gained stricter
    region validation, reserved-bit checks, request validation, mutually
    exclusive core calls, safer CAA/VMSA handling, and better guest fault
    forwarding.
  
  - Memory and guest handling: Shared pages are made private on SharedBox drop,
    guest memory reads now require FromBytes, VMSA registration checks
    overlaps, and CAA/VMSA tracking was tightened.

  - Virtio/vTPM fixes: Virtio owning queue validation now checks tokens and
    lengths before indexing/slicing. vTPM failure mode no longer returns
    uninitialized heap bytes.

  - ACPI/fw_cfg cleanup: Removed leftover fw_cfg-based ACPI/MADT logic and
    dropped the ACPI fuzz target.

  - Common architecture code: MSR, CR0/CR4, SEV status, x86, and APIC
    definitions moved into cpuarch.

  - Scripts and CI: The QEMU launch script no longer invokes sudo, gained a
    --tcg option, and dropped the nocc object for QEMU >= 11. Test
    timeout/error reporting was improved, workflow triggers were fixed, Verus
    caching was added, dependency review was updated to Node 24, and host
    dmesg is now dumped on QEMU/test failures.

  - Verification: Documentation and workflows now reflect cargo-verus usage and
    Verus installation changes.

  - Misc fixes: ELF symbol/buffer bounds fixes, IPI race fixes, CPU vendor
    display, kernel version display during guest launch, and IGVM target VTL
    selection based on firmware presence.

As usual, the full shortlog since the last release is attached.

Have fun!

Regards,

	Joerg

Carlos López (18):
      kernel: platform: add platform_method!() macro
      kernel: platform: do not take &self to query CPUID
      kernel: cpu/vc: simplify snp_cpuid()
      kernel: platform/snp: properly handle CPUID leaf 0xd
      kernel: platform: add default CPUID implementation
      kernel: always route CPUID through the platform abstraction
      kernel: cpu/features: create feature lookup table
      kernel: cet: make CET discovery into a feature
      kernel: cpu/sse: make FPU feature detection into a feature
      kernel: hyperv: make Hyper-V discovery into a feature
      kernel: sev/tlb: make INVLPGB max entry detection into a feature
      kernel: platform: make phys address sizes into a feature
      kernel: platform: make x2apic into a feature
      kernel: platform: make platform statics read-only after init
      kernel: platform: remove trivial FIXME for SvsmPlatformCell
      kernel: platform/snp: make C-bit into a feature
      kernel: platform/snp: get physical address size through CPU features
      kernel: platform: remove setup_guest_host_comm()

Joerg Roedel (23):
      kernel/guestmem: Require FromBytes for read_from_guest()
      kernel/mm: Make all pages private when SharedBox is dropped
      kernel/protocols: Forward GuestPtr faults to the guest
      kernel/protocols: Make sure memory regions are valid for attestation protocol
      kernel/greq: Round extended guest request size up to page-size
      kernel/percpu: Always page_align CAA address before mapping
      kernel/percpu: Track CAA address in PERCPU_VMSAS
      kernel/percpu: Check for region overlap in VMSA registration
      kernel/percpu: Return SvsmError from PERCPU_VMSAS.unregister()
      kernel/snp: Register initial guest VMSA
      kernel/protocols: Check for valid regions in core_pvalidate
      kernel/protocols: Update PERCPU_VMSAS in core_remap_caa()
      kernel/protocols: Do not deregister VMSA before updating RMP state
      kernel/protocols: Check for reserved bits in core_pvalidate_one()
      kernel/protocols: Check for region validity in core_pvalidate_one()
      kernel/protocols: Check whether attestation requests are valid
      kernel/protocols: Remove try_from_as_ref() from attestation structures
      kernel/protocols: Use valid_phys_region() where needed
      kernel/protocols: Make some core protocol calls mutually exclusive
      kernel/protocols: Use MemoryRegion::checked_new() in core protocol handlers
      Elf: Make sure buffer length is multiple of 16
      Elf: Do not read symbols beyond symbol table
      COCONUT-SVSM Release 2026.05-devel

Jon Lange (22):
      cpuarch: move MSR and CR0/CR4 definitions to common crate
      igvmbuilder: reshape the initial low-mem page tables
      bldr: implement a simpler boot loader
      igvmbuild: support bldr
      xbuild: support bldr
      build: consume bldr instead of stage2
      Merge pull request #1029 from msft-jlange/bldr
      svsm: prefer bldr to stage2
      xbuild: remove stage2 support
      igvmbuilder: remove support for stage2
      stage2: remove stage2
      bldr: clear temporary mapping PTEs after use
      Merge pull request #1064 from MelodyHuibo/init_vgif
      Merge pull request #1068 from 00xc/platform/fixme
      error: avoid `SvsmReqError` outside of SVSM-specific paths
      svsm: detect and display CPU vendor
      scripts: display kernel version when launching guest
      cpu/ipi: fix race conditions
      Merge pull request #1076 from msft-jlange/cpu_vendor
      Merge pull request #1077 from msft-jlange/kernel_info
      igvmbuilder: configure target VTL based on the presence of firmware
      cpuarch: move APIC constants to a common location

Jörg Rödel (24):
      Merge pull request #1048 from luigix25/fix_ci
      Merge pull request #1052 from n-ramacciotti/ci/remove_unmaintained_action
      Merge pull request #1054 from msft-jlange/bldr_ptes
      Merge pull request #1043 from mvanhorn/feat/1042-tools-check
      Merge pull request #1050 from n-ramacciotti/ci/simplify-test-in-svsm
      Merge pull request #1053 from msft-jlange/remove_stage2
      Merge pull request #1055 from n-ramacciotti/ci/update_dep_review_node_24
      Merge pull request #1030 from 00xc/platform/cpuid-v2
      Merge pull request #1059 from ziqiaozhou/fix-broken-alloc-proof
      Merge pull request #1067 from 00xc/ro-after-init
      Merge pull request #1070 from MelodyHuibo/enable_alternate_injection
      Merge pull request #1071 from 00xc/platform/remove-host-comm
      Merge pull request #1072 from stefano-garzarella/virtio-fix-owning-pop
      Merge pull request #1074 from stefano-garzarella/fix-tpm-allocation
      Merge pull request #1078 from msft-jlange/ipi_fix
      Merge pull request #1038 from ziqiaozhou/cargo-verus
      Merge pull request #1079 from stefano-garzarella/verus-cache
      Merge pull request #1080 from stefano-garzarella/ci-fix-verification-label-trigger
      Merge pull request #1082 from stefano-garzarella/ci-dmesg
      Merge pull request #1066 from luigix25/remove_sudo
      Merge pull request #1090 from luigix25/fw_cfg_cleanup
      Merge pull request #1085 from msft-jlange/igvm_vtl
      Merge pull request #1087 from msft-jlange/cpu_apic
      Merge pull request #1069 from 00xc/platform/missing-features

Luigi Leonardi (14):
      github/workflows: add apt update before apt install in publish-docs
      io: add `read_exact` and `write_all` to Read and Write trait
      aproxy: use read/write traits
      aproxy: factor out accept loop to a separate function
      aproxy: enable vsock for attestation
      kernel/attest: abstract transport implementation
      kernel/attest: switch to write_all/read_exact
      kernel/attest: add vsock transport with serial fallback
      Documentation/ATTESTATION: document vsock transport option
      scripts/launch_guest: drop nocc object for QEMU >= 11.0
      scripts/launch_guest: add --tcg option to use TCG acceleration
      github/workflows: set up /dev/kvm permissions
      scripts/launch_guest: remove sudo from QEMU invocation
      acpi: remove fw_cfg-based ACPI/MADT leftover

Matt Van Horn (1):
      testing/scripts: Check required host tools before launching guest

Melody Wang (2):
      cpu: Make sure guest's GIF is set
      boot: Allow Alternate Injection to be configured via boot params

Nicola Ramacciotti (5):
      github/workflows: Remove unmaintained action
      scripts/test-in-svsm: Print exit code when failing
      scripts/test-in-svsm: Add optional timeout handling
      github/workflows: Use the test-in-svsm script directly
      github/workflows: update dependency review to node 24

Stefano Garzarella (10):
      Merge pull request #879 from luigix25/add_attestation_to_vsock
      Merge pull request #1058 from luigix25/qemu_11_launch
      Merge pull request #1065 from msft-jlange/svsm_req_error
      virtio-drivers: queue/owning: validate the token before indexing buffer table
      virtio-drivers: queue/owning: validate len before slicing buffer
      kernel/vtpm: fix uninitialized heap bytes returned in TPM failure mode
      github/manual-verify: fix triggering on 'verification' label
      github/manual-verify: cache verus toolchain
      Merge pull request #1073 from joergroedel/fixes
      github/qemu: dump host kernel messages on QEMU or test failure

Ziqiao Zhou (5):
      mm/alloc.verus: update phys_to_virt proof after stage2 removal
      verification: Support Verus's verita test via cargo-verus.
      scripts: Update vsinstall.sh to directly install verus.
      workflow: Revert "github/manual-verify: check cargo-v output for errors"
      doc: Update verification.md to reflect the use of cargo-verus



^ permalink raw reply

* Re: [PATCH v2 0/6] KVM/x86: Drop "1" as MSR emulation return value
From: Juergen Gross @ 2026-05-28 11:58 UTC (permalink / raw)
  To: linux-kernel, x86, kvm, linux-coco
  Cc: Sean Christopherson, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin, Vitaly Kuznetsov,
	Kiryl Shutsemau, Rick Edgecombe, David Woodhouse, Paul Durrant
In-Reply-To: <20260528113605.267111-1-jgross@suse.com>


[-- Attachment #1.1.1: Type: text/plain, Size: 1788 bytes --]

Please disregard this series; there is one complication Sashiko made me
aware of.


Juergen

On 28.05.26 13:35, Juergen Gross wrote:
> Get rid of the literal "1" used as general error return value in KVM
> MSR emulation. It can easily be replaced by negative errno values
> instead.
> 
> This is meant to avoid confusion with the literal "1" used as return
> value for "return to guest".
> 
> Changes in V2:
> - series carved out from initial "KVM: Avoid literal numbers as return
>    values" series
> - don't use new KVM_MSR_RET_* defines, but 0 and -errno
> 
> Juergen Gross (6):
>    KVM/x86: Change comment before KVM_MSR_RET_* defines
>    KVM/x86: Return -errno instead of "1" for APIC related MSR emulation
>    KVM/x86: Return -errno instead of "1" for Hyper-V related MSR
>      emulation
>    KVM/x86: Return -errno instead of "1" for VMX related MSR emulation
>    KVM/x86: Return -errno instead of "1" for SVM related MSR emulation
>    KVM/x86: Return -errno instead of "1" for common MSR emulation
> 
>   arch/x86/kvm/hyperv.c        |  72 ++++++++++++-------------
>   arch/x86/kvm/lapic.c         |  39 +++++++-------
>   arch/x86/kvm/mtrr.c          |   6 +--
>   arch/x86/kvm/pmu.c           |   8 +--
>   arch/x86/kvm/svm/pmu.c       |   4 +-
>   arch/x86/kvm/svm/svm.c       |  36 ++++++-------
>   arch/x86/kvm/vmx/nested.c    |   2 +-
>   arch/x86/kvm/vmx/pmu_intel.c |  16 +++---
>   arch/x86/kvm/vmx/tdx.c       |  10 ++--
>   arch/x86/kvm/vmx/vmx.c       |  96 ++++++++++++++++-----------------
>   arch/x86/kvm/x86.c           | 102 +++++++++++++++++------------------
>   arch/x86/kvm/x86.h           |   4 +-
>   arch/x86/kvm/xen.c           |  10 ++--
>   13 files changed, 202 insertions(+), 203 deletions(-)
> 


[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 3743 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

^ permalink raw reply

* Re: [PATCH v2 0/6] KVM/x86: Drop "1" as MSR emulation return value
From: Sean Christopherson @ 2026-05-28 13:09 UTC (permalink / raw)
  To: Juergen Gross
  Cc: linux-kernel, x86, kvm, linux-coco, Paolo Bonzini,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Vitaly Kuznetsov, Kiryl Shutsemau, Rick Edgecombe,
	David Woodhouse, Paul Durrant
In-Reply-To: <a8f495f0-b4a9-42b6-be74-4fa9d83c7346@suse.com>

On Thu, May 28, 2026, Juergen Gross wrote:
> Please disregard this series, there is one complication sashiko made me
> aware of.

Sashiko beat me to the punch. :-)

See commit 2368048bf5c2 ("KVM: x86: Signal #GP, not -EPERM, on bad WRMSR(MCi_CTL/STATUS)")
for a real world example of how things can and will go wrong.

^ permalink raw reply

* Re: [PATCH v2 0/6] KVM/x86: Drop "1" as MSR emulation return value
From: Jürgen Groß @ 2026-05-28 13:18 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: linux-kernel, x86, kvm, linux-coco, Paolo Bonzini,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Vitaly Kuznetsov, Kiryl Shutsemau, Rick Edgecombe,
	David Woodhouse, Paul Durrant
In-Reply-To: <ahg-bEiwyqYTdWOD@google.com>


[-- Attachment #1.1.1: Type: text/plain, Size: 832 bytes --]

On 28.05.26 15:09, Sean Christopherson wrote:
> On Thu, May 28, 2026, Juergen Gross wrote:
>> Please disregard this series, there is one complication sashiko made me
>> aware of.
> 
> Sashiko beat me to the punch. :-)
> 
> See commit 2368048bf5c2 ("KVM: x86: Signal #GP, not -EPERM, on bad WRMSR(MCi_CTL/STATUS)")
> for a real world example of how things can and will go wrong.

Yeah, with Sashiko's pointer it was easy to spot.

The question now is whether the already existing cases of -errno being passed
as the return value are wrong or intentional. If the latter, there should be a
comment saying so; otherwise they need to be fixed.

Disentangling the MSR emulation return values from the "normal" ones ("return
to guest"/"return to user mode") will be quite interesting with the overloaded
semantics of "1".


Juergen

[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 3743 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

^ permalink raw reply

* Re: [PATCH v2 0/6] KVM/x86: Drop "1" as MSR emulation return value
From: Sean Christopherson @ 2026-05-28 13:21 UTC (permalink / raw)
  To: Jürgen Groß
  Cc: linux-kernel, x86, kvm, linux-coco, Paolo Bonzini,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Vitaly Kuznetsov, Kiryl Shutsemau, Rick Edgecombe,
	David Woodhouse, Paul Durrant
In-Reply-To: <476964db-837a-45d8-a647-c5a2ea8b0cb6@suse.com>

On Thu, May 28, 2026, Jürgen Groß wrote:
> On 28.05.26 15:09, Sean Christopherson wrote:
> > On Thu, May 28, 2026, Juergen Gross wrote:
> > > Please disregard this series, there is one complication sashiko made me
> > > aware of.
> > 
> > Sashiko beat me to the punch. :-)
> > 
> > See commit 2368048bf5c2 ("KVM: x86: Signal #GP, not -EPERM, on bad WRMSR(MCi_CTL/STATUS)")
> > for a real world example of how things can and will go wrong.
> 
> Yeah, with Sashiko's pointer it was easy to spot.
> 
> Question now is whether the already existing cases of -errno passed as return
> value are wrong or on purpose. 

What are the existing cases?

> If the latter, there should be a comment for
> that, otherwise they need to be fixed..
> 
> Disentangling the MSR emulation return values from the "normal" ones ("return
> to guest"/"return to user mode") will be quite interesting with the overloaded
> semantics of "1".

LOL, "interesting".

^ permalink raw reply

* Re: [PATCH v2 0/6] KVM/x86: Drop "1" as MSR emulation return value
From: Jürgen Groß @ 2026-05-28 14:01 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: linux-kernel, x86, kvm, linux-coco, Paolo Bonzini,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Vitaly Kuznetsov, Kiryl Shutsemau, Rick Edgecombe,
	David Woodhouse, Paul Durrant
In-Reply-To: <ahhBQDvyzHQfTCBD@google.com>


[-- Attachment #1.1.1: Type: text/plain, Size: 834 bytes --]

On 28.05.26 15:21, Sean Christopherson wrote:
> On Thu, May 28, 2026, Jürgen Groß wrote:
>> On 28.05.26 15:09, Sean Christopherson wrote:
>>> On Thu, May 28, 2026, Juergen Gross wrote:
>>>> Please disregard this series, there is one complication sashiko made me
>>>> aware of.
>>>
>>> Sashiko beat me to the punch. :-)
>>>
>>> See commit 2368048bf5c2 ("KVM: x86: Signal #GP, not -EPERM, on bad WRMSR(MCi_CTL/STATUS)")
>>> for a real world example of how things can and will go wrong.
>>
>> Yeah, with Sashiko's pointer it was easy to spot.
>>
>> Question now is whether the already existing cases of -errno passed as return
>> value are wrong or on purpose.
> 
> What are the existing cases?

Have a look at:

kvm_hv_msr_get_crash_data()
kvm_hv_msr_set_crash_data()
svm_get_msr()
svm_set_msr()


Juergen

[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 3743 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

^ permalink raw reply

* Re: [PATCH v2 0/6] KVM/x86: Drop "1" as MSR emulation return value
From: Jürgen Groß @ 2026-05-28 14:33 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: linux-kernel, x86, kvm, linux-coco, Paolo Bonzini,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Vitaly Kuznetsov, Kiryl Shutsemau, Rick Edgecombe,
	David Woodhouse, Paul Durrant
In-Reply-To: <ahhBQDvyzHQfTCBD@google.com>


[-- Attachment #1.1.1: Type: text/plain, Size: 814 bytes --]

On 28.05.26 15:21, Sean Christopherson wrote:
> On Thu, May 28, 2026, Jürgen Groß wrote:
>> On 28.05.26 15:09, Sean Christopherson wrote:
>>> On Thu, May 28, 2026, Juergen Gross wrote:
>>>> Please disregard this series, there is one complication sashiko made me
>>>> aware of.
>>>
>>> Sashiko beat me to the punch. :-)
>>>
>>> See commit 2368048bf5c2 ("KVM: x86: Signal #GP, not -EPERM, on bad WRMSR(MCi_CTL/STATUS)")
>>> for a real world example of how things can and will go wrong.
>>
>> Yeah, with Sashiko's pointer it was easy to spot.
>>
>> Question now is whether the already existing cases of -errno passed as return
>> value are wrong or on purpose.
> 
> What are the existing cases?

Found another one:

kvm_xen_write_hypercall_page() (called by kvm_set_msr_common())


Juergen

[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 3743 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

^ permalink raw reply

* Re: [PATCH v5 4/7] x86/sev: Add support to perform RMP optimizations asynchronously
From: Ackerley Tng @ 2026-05-28 14:45 UTC (permalink / raw)
  To: Ashish Kalra, tglx, mingo, bp, dave.hansen, x86, hpa, seanjc,
	peterz, thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
	Nathan.Fontenot, jackyli, pgonda, rientjes, jacobhxu, xin,
	pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen, darwi,
	linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <6f1ec3d8ebcf3aaceccc099c07d0deb545dd4ab9.1779133590.git.ashish.kalra@amd.com>

Ashish Kalra <Ashish.Kalra@amd.com> writes:

Thank you Ashish!

> From: Ashish Kalra <ashish.kalra@amd.com>
>
> When SEV-SNP is enabled, all writes to memory are checked to ensure
> integrity of SNP guest memory. This imposes performance overhead on the
> whole system.
>
> RMPOPT is a new instruction that minimizes the performance overhead of
> RMP checks on the hypervisor and on non-SNP guests by allowing RMP
> checks to be skipped for 1GB regions of memory that are known not to
> contain any SEV-SNP guest memory.
>
> Add support for performing RMP optimizations asynchronously using a
> dedicated workqueue.
>
> Enable RMPOPT optimizations globally for all system RAM up to 2TB at

This should also be updated to say "Enable RMPOPT optimizations for up
to 2TB worth of system RAM at..."

The current phrasing sounds like only addresses [0, 2TB) are allowed to
be optimized, but actually any address [start, start + 2TB) can be
optimized?

> RMP initialization time. RMP checks can initially be skipped for 1GB
> memory ranges that do not contain SEV-SNP guest memory (excluding
> preassigned pages such as the RMP table and firmware pages). As SNP
> guests are launched, RMPUPDATE will disable the corresponding RMPOPT
> optimizations.
>
> Suggested-by: Thomas Lendacky <thomas.lendacky@amd.com>
> Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
> Reviewed-by: Ackerley Tng <ackerleytng@google.com>
> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
> ---
>  arch/x86/virt/svm/sev.c | 167 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 164 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
> index 82f9dc7a57c3..8876cac052d5 100644
> --- a/arch/x86/virt/svm/sev.c
> +++ b/arch/x86/virt/svm/sev.c
> @@ -19,6 +19,7 @@
>  #include <linux/iommu.h>
>  #include <linux/amd-iommu.h>
>  #include <linux/nospec.h>
> +#include <linux/workqueue.h>
>
>  #include <asm/sev.h>
>  #include <asm/processor.h>
> @@ -125,7 +126,18 @@ static void *rmp_bookkeeping __ro_after_init;
>  static u64 probed_rmp_base, probed_rmp_size;
>
>  static cpumask_t rmpopt_cpumask;
> -static phys_addr_t rmpopt_pa_start;
> +static phys_addr_t rmpopt_pa_start, rmpopt_pa_end;
> +
> +enum rmpopt_function {
> +	RMPOPT_FUNC_VERIFY_AND_REPORT_STATUS,
> +	RMPOPT_FUNC_REPORT_STATUS
> +};
> +
> +#define RMPOPT_WORK_TIMEOUT	10000
> +
> +static struct workqueue_struct *rmpopt_wq;
> +static struct delayed_work rmpopt_delayed_work;
> +static DEFINE_MUTEX(rmpopt_wq_mutex);
>
>  static LIST_HEAD(snp_leaked_pages_list);
>  static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
> @@ -564,12 +576,21 @@ EXPORT_SYMBOL_FOR_MODULES(snp_prepare, "ccp");
>
>  static void rmpopt_cleanup(void)
>  {
> +	guard(mutex)(&rmpopt_wq_mutex);
> +
> +	if (!rmpopt_wq)
> +		return;
> +
> +	cancel_delayed_work_sync(&rmpopt_delayed_work);
> +	destroy_workqueue(rmpopt_wq);
> +
>  	cpus_read_lock();
>  	wrmsrq_on_cpus(&rmpopt_cpumask, MSR_AMD64_RMPOPT_BASE, 0);
>  	cpus_read_unlock();
>
>  	cpumask_clear(&rmpopt_cpumask);
> -	rmpopt_pa_start = 0;
> +	rmpopt_pa_start = rmpopt_pa_end = 0;
> +	rmpopt_wq = NULL;
>  }
>
>  void snp_shutdown(void)
> @@ -587,6 +608,105 @@ void snp_shutdown(void)
>  }
>  EXPORT_SYMBOL_FOR_MODULES(snp_shutdown, "ccp");
>
> +static inline bool __rmpopt(u64 rax, u64 rcx)

Perhaps use pa_start instead of rax and op_type for rcx?

> +{
> +	bool optimized;
> +
> +	asm volatile(".byte 0xf2, 0x0f, 0x01, 0xfc"
> +		     : "=@ccc" (optimized)
> +		     : "a" (rax), "c" (rcx)
> +		     : "memory", "cc");
> +
> +	return optimized;
> +}
> +
> +static void rmpopt(u64 pa)
> +{
> +	u64 rax = ALIGN_DOWN(pa, SZ_1G);
> +	u64 rcx = RMPOPT_FUNC_VERIFY_AND_REPORT_STATUS;
> +

And pa_start and op_type here too.

> +	__rmpopt(rax, rcx);
> +}
> +
> +/*
> + * 'val' is a system physical address.
> + */
> +static void rmpopt_smp(void *val)
> +{
> +	rmpopt((u64)val);
> +}
> +
> +/*
> + * RMPOPT optimizations skip RMP checks at 1GB granularity if this
> + * range of memory does not contain any SNP guest memory.
> + */
> +static void rmpopt_work_handler(struct work_struct *work)
> +{
> +	bool current_cpu_cleared = false;
> +	phys_addr_t pa;
> +	int this_cpu;
> +
> +	pr_info("Attempt RMP optimizations on physical address range @1GB alignment [0x%016llx - 0x%016llx]\n",
> +		rmpopt_pa_start, rmpopt_pa_end);
> +
> +	/*
> +	 * RMPOPT scans the RMP table, stores the result of the scan in the
> +	 * reserved processor memory. The RMP scan is the most expensive
> +	 * part. If a second RMPOPT occurs, it can skip the expensive scan
> +	 * if they can see a cached result in the reserved processor memory.
> +	 *
> +	 * Do RMPOPT on one CPU alone. Then, follow that up with RMPOPT
> +	 * on every other primary thread. This potentially allows the

I like the leader and follower comments below, thanks! With this
leader/follower setup, will the followers definitely see the cached scan
results, or might the followers still potentially not benefit from the
caching? If it's still only "potentially", why?

> +	 * followers to use the "cached" scan results to avoid repeating
> +	 * full scans.
> +	 */
> +
> +	/*
> +	 * Pin the worker to the current CPU for the leader loop so that
> +	 * this_cpu remains valid and the RMPOPT instruction executes on
> +	 * the CPU that was cleared from the cpumask.  The workqueue is
> +	 * WQ_UNBOUND, so without pinning, the scheduler could migrate
> +	 * the worker between the cpumask manipulation and the leader
> +	 * loop, causing the leader to run on a different CPU while
> +	 * this_cpu's core is skipped entirely.
> +	 *
> +	 * Use migrate_disable() rather than get_cpu() to prevent
> +	 * migration while still allowing preemption.
> +	 *
> +	 * Note: rmpopt_cpumask is modified here without holding
> +	 * rmpopt_wq_mutex.  This is safe because the delayed_work
> +	 * mechanism guarantees single-threaded execution of this
> +	 * handler, and rmpopt_cleanup() calls cancel_delayed_work_sync()
> +	 * to ensure handler completion before tearing down the cpumask.
> +	 */
> +	migrate_disable();
> +	this_cpu = smp_processor_id();
> +	if (cpumask_test_cpu(this_cpu, &rmpopt_cpumask)) {
> +		cpumask_clear_cpu(this_cpu, &rmpopt_cpumask);
> +		current_cpu_cleared = true;
> +	}
> +

Instead of reusing the global rmpopt_cpumask, why not make a copy of
rmpopt_cpumask for this function? Then this function won't have to
figure out current_cpu_cleared or restore rmpopt_cpumask at the end.

I'm thinking to also drop the test and clear, this function can just
always clear, like

  cpumask_clear_cpu(smp_processor_id(), followers_cpumask);

and later

  on_each_cpu_mask(&followers_cpumask, ...);

Actually, if for whatever reason cpumask_test_cpu(this_cpu,
&rmpopt_cpumask) above returns false, would that mean somehow some cpu
exists that wasn't enabled right when rmpopt was initialized? If yes,
what happens if we call rmpopt() on a cpu where it wasn't initialized?

> +	/* Leader: prime the RMPOPT cache on this CPU */
> +	for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
> +		rmpopt(pa);
> +
> +	migrate_enable();
> +
> +	/* Followers: run RMPOPT on all other cores */
> +	cpus_read_lock();
> +	for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
> +		on_each_cpu_mask(&rmpopt_cpumask, rmpopt_smp,
> +				 (void *)pa, true);
> +
> +		 /* Give a chance for other threads to run */
> +		cond_resched();
> +	}
> +	cpus_read_unlock();
> +
> +	if (current_cpu_cleared)
> +		cpumask_set_cpu(this_cpu, &rmpopt_cpumask);
> +}
> +
>
> [...snip...]
>

^ permalink raw reply

* Re: [PATCH v2 0/6] KVM/x86: Drop "1" as MSR emulation return value
From: David Woodhouse @ 2026-05-28 15:32 UTC (permalink / raw)
  To: Jürgen Groß, Sean Christopherson
  Cc: linux-kernel, x86, kvm, linux-coco, Paolo Bonzini,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Vitaly Kuznetsov, Kiryl Shutsemau, Rick Edgecombe,
	Paul Durrant
In-Reply-To: <8d3f3cbe-89ca-44df-b35f-90c2724e9154@suse.com>

[-- Attachment #1: Type: text/plain, Size: 1168 bytes --]

On Thu, 2026-05-28 at 16:33 +0200, Jürgen Groß wrote:
> On 28.05.26 15:21, Sean Christopherson wrote:
> > On Thu, May 28, 2026, Jürgen Groß wrote:
> > > On 28.05.26 15:09, Sean Christopherson wrote:
> > > > On Thu, May 28, 2026, Juergen Gross wrote:
> > > > > Please disregard this series, there is one complication sashiko made me
> > > > > aware of.
> > > > 
> > > > Sashiko beat me to the punch. :-)
> > > > 
> > > > See commit 2368048bf5c2 ("KVM: x86: Signal #GP, not -EPERM, on bad WRMSR(MCi_CTL/STATUS)")
> > > > for a real world example of how things can and will go wrong.
> > > 
> > > Yeah, with Sashiko's pointer it was easy to spot.
> > > 
> > > Question now is whether the already existing cases of -errno passed as return
> > > value are wrong or on purpose.
> > 
> > What are the existing cases?
> 
> Found another one:
> 
> kvm_xen_write_hypercall_page() (called by kvm_set_msr_common())

You mean in the case where it's using the user-provided hypercall page,
and can't copy from the buffer that the VMM provided?

I think that's correct to return -errno via PTR_ERR() and let the guest
die?

The rest return 0 or 1.

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]

^ permalink raw reply

* Re: [PATCH v2 0/6] KVM/x86: Drop "1" as MSR emulation return value
From: Jürgen Groß @ 2026-05-28 15:36 UTC (permalink / raw)
  To: David Woodhouse, Sean Christopherson
  Cc: linux-kernel, x86, kvm, linux-coco, Paolo Bonzini,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Vitaly Kuznetsov, Kiryl Shutsemau, Rick Edgecombe,
	Paul Durrant
In-Reply-To: <a6fdbcc655ea24209ed03e377f7e4a2a4d43bd9c.camel@infradead.org>


[-- Attachment #1.1.1: Type: text/plain, Size: 1337 bytes --]

On 28.05.26 17:32, David Woodhouse wrote:
> On Thu, 2026-05-28 at 16:33 +0200, Jürgen Groß wrote:
>> On 28.05.26 15:21, Sean Christopherson wrote:
>>> On Thu, May 28, 2026, Jürgen Groß wrote:
>>>> On 28.05.26 15:09, Sean Christopherson wrote:
>>>>> On Thu, May 28, 2026, Juergen Gross wrote:
>>>>>> Please disregard this series, there is one complication sashiko made me
>>>>>> aware of.
>>>>>
>>>>> Sashiko beat me to the punch. :-)
>>>>>
>>>>> See commit 2368048bf5c2 ("KVM: x86: Signal #GP, not -EPERM, on bad WRMSR(MCi_CTL/STATUS)")
>>>>> for a real world example of how things can and will go wrong.
>>>>
>>>> Yeah, with Sashiko's pointer it was easy to spot.
>>>>
>>>> Question now is whether the already existing cases of -errno passed as return
>>>> value are wrong or on purpose.
>>>
>>> What are the existing cases?
>>
>> Found another one:
>>
>> kvm_xen_write_hypercall_page() (called by kvm_set_msr_common())
> 
> You mean in the case where it's using the user-provided hypercall page,
> and can't copy from the buffer that the VMM provided?

Yes.

> 
> I think that's correct to return -errno via PTR_ERR() and let the guest
> die?

In this case I think a comment in this regard would be nice, as it would
prevent others stumbling over it asking the same question again.


Juergen

[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 3743 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

^ permalink raw reply

* Re: [PATCH v2 0/6] KVM/x86: Drop "1" as MSR emulation return value
From: Jürgen Groß @ 2026-05-28 15:50 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: linux-kernel, x86, kvm, linux-coco, Paolo Bonzini,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Vitaly Kuznetsov, Kiryl Shutsemau, Rick Edgecombe,
	David Woodhouse, Paul Durrant
In-Reply-To: <ahhBQDvyzHQfTCBD@google.com>


[-- Attachment #1.1.1: Type: text/plain, Size: 1487 bytes --]

On 28.05.26 15:21, Sean Christopherson wrote:
> On Thu, May 28, 2026, Jürgen Groß wrote:
>> On 28.05.26 15:09, Sean Christopherson wrote:
>>> On Thu, May 28, 2026, Juergen Gross wrote:
>>>> Please disregard this series, there is one complication sashiko made me
>>>> aware of.
>>>
>>> Sashiko beat me to the punch. :-)
>>>
>>> See commit 2368048bf5c2 ("KVM: x86: Signal #GP, not -EPERM, on bad WRMSR(MCi_CTL/STATUS)")
>>> for a real world example of how things can and will go wrong.
>>
>> Yeah, with Sashiko's pointer it was easy to spot.
>>
>> Question now is whether the already existing cases of -errno passed as return
>> value are wrong or on purpose.
> 
> What are the existing cases?
> 
>> If the latter, there should be a comment for
>> that, otherwise they need to be fixed..
>>
>> Disentangling the MSR emulation return values from the "normal" ones ("return
>> to guest"/"return to user mode") will be quite interesting with the overloaded
>> semantics of "1".
> 
> LOL, "interesting".

What do you think about the following idea:

Let's pass struct msr_info * down to all functions which get their return
value passed up. Then extend msr_info with a bool "return_to_guest" (valid
only if !host_initiated), which should be set instead of passing "1" up to
the caller (probably using an inline helper). Then the return value could
be 0 or -errno, and after MSR emulation the return_to_guest indicator can
be tested if needed.


Juergen

[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 3743 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

^ permalink raw reply

* Re: [PATCH v3 2/2] x86/tdx: Fix zero-extension for 32-bit port I/O
From: Dave Hansen @ 2026-05-28 16:43 UTC (permalink / raw)
  To: Kiryl Shutsemau
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H . Peter Anvin, Rick Edgecombe, Kuppuswamy Sathyanarayanan,
	Kai Huang, Sean Christopherson, Borys Tsyrulnikov, linux-kernel,
	linux-coco, kvm, stable
In-Reply-To: <ahgUBLjBRGhxULu3@thinkstation>

On 5/28/26 03:14, Kiryl Shutsemau wrote:
> +	switch (size) {
> +	case 1:
> +		*(u8 *)&regs->ax = (u8)val;
> +		break;
> +	case 2:
> +		*(u16 *)&regs->ax = (u16)val;
> +		break;

Is this intentionally clever to only work on little endian, or
accidentally clever? This seems like a great IOCCC thing to do, but it's
far too clever for my taste.

I mean, it's making a pointer to a 64-bit value with an 8-bit name and
casting that to an 8-bit pointer and then assigning that to a 32-bit
value cast to a u8.

Is it just my tiny brain that thinks this will be unintelligible on Monday?

How about we just make the CPU do the thinking for us? IN[BWL] and
MOV[BWL] have the same semantics here, right? So even if 'rax' and 'val'
are 64-bit values here, the following should have all the right
behaviors, I think.

I generally loathe inline assembly. But we have a CPU that kinda knows
the rules already. No need for us to laboriously reimplement it. Right?

Thanks to the friendly LLM that knows inline assembly better than I do.
The resulting compiled assembly looks right to me.

/*
 * Use MOV[BWL] to/from registers to match the IN[BWL] behavior
 * including the fact that INL zeros the upper 32 bits of RAX while
 * IN[BW] don't zero anything.
 */

    switch (size) {
    case 1:
	// Just write 1 byte of RAX:
        __asm__ volatile ("movb %b1, %b0" : "+q"(rax)
                                          : "q"(val));
        break;
    case 2:
	// Write 2 bytes of RAX:
        __asm__ volatile ("movw %w1, %w0" : "+r"(rax)
                                          : "r"(val));
        break;
    case 4:
	// Write 'val' into lower 32 bits. Zero the upper 32 bits:
        __asm__ volatile ("movl %k1, %k0" : "=r"(rax)
                                          : "r"(val));
        break;
    default:
	// WARN
    }

Thoughts?

^ permalink raw reply

* Re: [PATCH v3 2/2] x86/tdx: Fix zero-extension for 32-bit port I/O
From: Dave Hansen @ 2026-05-28 17:25 UTC (permalink / raw)
  To: Kiryl Shutsemau
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H . Peter Anvin, Rick Edgecombe, Kuppuswamy Sathyanarayanan,
	Kai Huang, Sean Christopherson, Borys Tsyrulnikov, linux-kernel,
	linux-coco, kvm, stable
In-Reply-To: <ahgUBLjBRGhxULu3@thinkstation>

On 5/28/26 03:14, Kiryl Shutsemau wrote:
> What about the patch below. Inspired by kvm's assign_register().

I think I could stand this if it consolidated this site with kvm's
assign_register(). The copy/paste is too much to bear.



^ permalink raw reply

* RE: [PATCH v5 05/20] dma-pool: track decrypted atomic pools and select them via attrs
From: Michael Kelley @ 2026-05-28 18:29 UTC (permalink / raw)
  To: Aneesh Kumar K.V (Arm), iommu@lists.linux.dev,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-coco@lists.linux.dev
  Cc: Robin Murphy, Marek Szyprowski, Will Deacon, Marc Zyngier,
	Steven Price, Suzuki K Poulose, Catalin Marinas, Jiri Pirko,
	Jason Gunthorpe, Mostafa Saleh, Petr Tesarik,
	Alexey Kardashevskiy, Dan Williams, Xu Yilun,
	linuxppc-dev@lists.ozlabs.org, linux-s390@vger.kernel.org,
	Madhavan Srinivasan, Michael Ellerman, Nicholas Piggin,
	Christophe Leroy (CS GROUP), Alexander Gordeev, Gerald Schaefer,
	Heiko Carstens, Vasily Gorbik, Christian Borntraeger,
	Sven Schnelle, x86@kernel.org, Jiri Pirko
In-Reply-To: <20260522042815.370873-6-aneesh.kumar@kernel.org>

From: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org> Sent: Thursday, May 21, 2026 9:28 PM
> 
> Teach the atomic DMA pool code to distinguish between encrypted and
> unencrypted pools, and make pool allocation select the matching pool based
> on DMA attributes.
> 
> Introduce a dma_gen_pool wrapper that records whether a pool is
> unencrypted, initialize that state when the atomic pools are created, and
> use it when expanding and resizing the pools. Update dma_alloc_from_pool()
> to take attrs and skip pools whose encrypted state does not match
> DMA_ATTR_CC_SHARED. Update dma_free_from_pool() accordingly.
> 
> Also pass DMA_ATTR_CC_SHARED from the swiotlb atomic allocation path so
> decrypted swiotlb allocations are taken from the correct atomic pool.
> 
> Tested-by: Jiri Pirko <jiri@nvidia.com>
> Reviewed-by: Mostafa Saleh <smostafa@google.com>
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> ---
>  drivers/iommu/dma-iommu.c   |   2 +-
>  include/linux/dma-map-ops.h |   2 +-
>  kernel/dma/direct.c         |  11 ++-
>  kernel/dma/pool.c           | 167 +++++++++++++++++++++++-------------
>  kernel/dma/swiotlb.c        |   7 +-
>  5 files changed, 123 insertions(+), 66 deletions(-)
>

[snip]

> +static __init struct dma_gen_pool *__dma_atomic_pool_init(struct dma_gen_pool *dma_pool,
> +		size_t pool_size, gfp_t gfp)
>  {
> -	struct gen_pool *pool;
>  	int ret;
> 
> -	pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE);
> -	if (!pool)
> +	dma_pool->pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE);
> +	if (!dma_pool->pool)
>  		return NULL;
> 
> -	gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL);
> +	gen_pool_set_algo(dma_pool->pool, gen_pool_first_fit_order_align, NULL);
> +
> +	/* if platform is using memory encryption atomic pools are by default decrypted. */
> +	if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
> +		dma_pool->unencrypted = true;
> +	else
> +		dma_pool->unencrypted = false;

I'm curious about the name of the "unencrypted" field in struct dma_gen_pool,
and similarly in Patch 7 of the series for the swiotlb struct io_tlb_pool and
struct io_tlb_mem. Up through v3 of this series, you used "decrypted", but
starting in v4 switched to "unencrypted".

To me, the above "if" statement has some cognitive dissonance in that if
CC_ATTR_MEM_ENCRYPT is false (i.e., a normal VM), "unencrypted" is set
to false. But I think of memory in a normal VM as "unencrypted" since it
was never encrypted. A similar "if" statement occurs in your swiotlb changes.

Two related concepts are captured by the field:
1) Is some action needed to put the memory into the unencrypted state,
and to remove it from that state? This applies when assigning memory to the
pool, or freeing the memory in the pool.
2) Is the memory currently in the unencrypted state? This applies when
allocating memory from the pool to a caller.

It's hard to capture all that in a short field name. But I think I prefer "decrypted"
over "unencrypted".  The former implies that some action was taken. It's a
little easier to think of a normal VM as *not* having decrypted memory. The
memory was never encrypted in the first place, so no decryption action was taken.

Throughout the kernel, "decrypted" occurs much more frequently than
"unencrypted".  We have set_memory_encrypted() and set_memory_decrypted()
that are "take action" names.  But we also have force_dma_unencrypted(),
phys_to_dma_unencrypted(), and dma_addr_unencrypted(). So it's a bit
of a mess.

But maybe there's more background here that led to the change
between your v3 and v4.

Michael

^ permalink raw reply

* Re: [PATCH v13 07/22] KVM: selftests: Introduce structures for TDX guest boot parameters
From: Yosry Ahmed @ 2026-05-28 19:25 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Lisa Wang, Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao,
	Chenyi Qiang, Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
	Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
	Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
	Sagi Shahar, Shuah Khan, Oliver Upton, Jeremiah McReynolds, kvm,
	linux-coco, linux-kernel, x86
In-Reply-To: <CAO9r8zMaiGL8v=f72EAwWbwofoUHOkH8r6Se22k2TVxnUCQLOQ@mail.gmail.com>

On Fri, May 22, 2026 at 04:50:07PM -0700, Yosry Ahmed wrote:
> > > Sean, is this the preferred way to expose offsets to asm files (or asm
> > > code blocks) -- as opposed to say using .equ [*]?
> >
> > For actual .S assembly, yes.  For inline asm, maybe?  If it looks prettier, go
> > for it.
> >
> > > If yes, I can rework my nVMX GPR fixes to use the same approach for
> > > register offsets. I wonder if the non-TDX part of this patch (i.e.
> > > Makefile stuff) can be split, then patch 6 and the Makefile stuff can
> > > land independently and allow development on top.
> > >
> > > I can also split them out and include them in the next version of my
> > > series, then whichever series lands first will land the offsets
> > > support.
> > >
> > > WDYT?
> >
> > Hmm, I'd say keep your series as-is for now.  The OFFSET() infrastructure really
> > shines for proper assembly.  For what you're doing, AFAICT it's only marginally
> > better.  So I don't think it's worth juggling dependencies to use it right away,
> > we can always convert if/when the TDX series lands the fancy stuff.
> 
> Ack. We can do the switch later like you say.

I take this back. My series builds with the internal toolchain, but not
when I just use make with LLVM. Probably different compiler versions or
build options, but the fact the .equ thing doesn't always work means I
can't use it.

I would paste the error here, but the compiler literally spits out
incomprehensible garbage.

Lisa, if you will send a new version of this series for other reasons,
do you mind splitting out the non-TDX parts of this patch? Ideally we'd
have 1-2 patches that introduce the OFFSET() infrastructure without any
TDX parts, which should make it easier to pick up separately or include
with other series.

If a new version won't be needed anyway, I will just wait for this to
land before refreshing my series on top.

^ permalink raw reply

* Re: [PATCH v5 2/7] x86/msr: add wrmsrq_on_cpus helper
From: Kalra, Ashish @ 2026-05-28 19:37 UTC (permalink / raw)
  To: Borislav Petkov, Dave Hansen
  Cc: tglx, mingo, dave.hansen, x86, hpa, seanjc, peterz,
	thomas.lendacky, herbert, davem, ardb, pbonzini, aik,
	Michael.Roth, KPrateek.Nayak, Tycho.Andersen, Nathan.Fontenot,
	ackerleytng, jackyli, pgonda, rientjes, jacobhxu, xin,
	pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen, darwi,
	linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <20260528004332.GDahePtGqVp2boiEJL@fat_crate.local>

Hello Boris and Dave,

On 5/27/2026 7:43 PM, Borislav Petkov wrote:
> On Wed, May 27, 2026 at 02:38:05PM -0700, Dave Hansen wrote:
>> This one is my doing.
> 
> I know.
> 
> But hey, maybe we should not disagree on the public ML because the submitter
> might disappear like the last one. :-P
> 
>> wrmsr_on_cpus() is kinda a mess. I think it only has a single user. It's
>> also not very flexible because it needs a 'struct msr __percpu *msrs'
>> argument where each MSR has a value in memory.
> 
> Right, we did that a looong time ago.
> 
> The only reason I'd have for per-CPU MSR structs is reading different MSR
> values on different cores, modifying only the bits you need and then *keeping*
> the remaining values as they were. And that interface allows you to do that
> while this new thing won't.
> 
> And I'm going to venture a guess here that adding a simpler interface which
> simply forces a new value ontop of a whole MSR could cause a lot of subtle
> bugs when people don't pay attention to keep the old values.
> 
>> The use case for RMPOPT is that all CPUs get the same value. It'd be a
>> little awkward to go create a percpu data structure to duplicate the same
>> value to call wrmsr_on_cpus(). The RMPOPT case is also arguably
>> performance sensitive since it's done during boot. It should do the IPIs
>> in parallel.
> 
> Oh sure, my meaning was to create something that serves both purposes.
> 
>> toggle_ecc_err_reporting(), on the other hand, is done at module init
>> time. It's not really performance sensitive. It's probably pretty easy
>> to zap wrmsr_on_cpus() and just have toggle_ecc_err_reporting() do
>> something slightly less efficient.
> 
> Sure. That's fine.
> 
>> Yeah, the
>>
>> 	wrmsr_on_cpus()
>> 	wrmsrq_on_cpus()
>>
>> naming pain is real. There's little chance of bugs coming from it
>> because the function signatures are *SO* different. But, it certainly
>> could confuse humans for a minute.
> 
> Yap.
> 
>> But the real solution to this is axing wrmsr_on_cpus(). 
> 
> Yap, for example. Basically reengineering the whole
> write-MSRs-on-multiple-CPUs functionality is what I meant.
> 
>> Which I think we could do after killing its one user which the attached
>> (completely untested) patch does. The only downside of the patch is that it
>> does RDMSR via IPIs one CPU at a time. But, looking at the code, I'm not
>> sure anyone would care. If anyone did, I _think_ all those MSRs have the
>> same value and the code could be simplified further. But that would take
>> more than 3 minutes.
>>
>> It's also possible that my grepping was bad or I'm completely
>> misunderstanding amd64_edac.c. Cluebat welcome if I'm being dense.
> 
> Looks ok to me, we can surely do that. I even have hw to test it. I think...
> 
>> BTW, I also don't feel the need to make Ashish go do any of this edac
>> cleanup. I think it can just be done in parallel. But I wouldn't stop
>> him if he volunteered.
> 
> Why not?
> 
> It has always been the case: cleanups and bug fixes first, new features ontop.
> 
> So yeah, modulo figuring out how to redefine the *msr_on_cpus() interface,
> I think this all makes sense.

snp_setup_rmpopt() runs once during init and rmpopt_cleanup() runs once during shutdown. The batch IPI optimization
is irrelevant here. This RMPOPT_BASE MSR setup/programming is not in a performance critical path.

A simple loop would be perfectly fine and avoids the need for the wrmsrq_on_cpus() helper entirely:

  for_each_cpu(cpu, &rmpopt_cpumask)
      wrmsrq_on_cpu(cpu, MSR_AMD64_RMPOPT_BASE, rmpopt_base);

That is, replace the wrmsrq_on_cpus() call that programs the RMPOPT_BASE MSR with the loop:

-       wrmsrq_on_cpus(&rmpopt_cpumask, MSR_AMD64_RMPOPT_BASE, rmpopt_base);
+       for_each_cpu(cpu, &rmpopt_cpumask)
+               wrmsrq_on_cpu(cpu, MSR_AMD64_RMPOPT_BASE, rmpopt_base);

So I will drop this helper patch.

Thanks,
Ashish

> 
> Thx.
> 

^ permalink raw reply

* Re: [PATCH v5 2/7] x86/msr: add wrmsrq_on_cpus helper
From: Dave Hansen @ 2026-05-28 19:50 UTC (permalink / raw)
  To: Kalra, Ashish, Borislav Petkov
  Cc: tglx, mingo, dave.hansen, x86, hpa, seanjc, peterz,
	thomas.lendacky, herbert, davem, ardb, pbonzini, aik,
	Michael.Roth, KPrateek.Nayak, Tycho.Andersen, Nathan.Fontenot,
	ackerleytng, jackyli, pgonda, rientjes, jacobhxu, xin,
	pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen, darwi,
	linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <2d164e19-5cc6-47ca-9150-f4d432dd10c4@amd.com>

On 5/28/26 12:37, Kalra, Ashish wrote:
> A simple loop would be perfectly fine and avoids the need for the wrmsrq_on_cpus() helper entirely:
> 
>   for_each_cpu(cpu, &rmpopt_cpumask)
>       wrmsrq_on_cpu(cpu, MSR_AMD64_RMPOPT_BASE, rmpopt_base);

I'm glad we're on the same page finally. I just hope we can get to this
point more quickly next time. I started off with exactly this
suggestion, but someone chimed in to the thread and said it was "slower":

> https://lore.kernel.org/lkml/6a50d050-f602-43fd-a44a-cecedd9823eb@amd.com/


^ permalink raw reply

* Re: [PATCH 00/15] Enable TDX Module Extensions and DICE-based TDX Quoting
From: Sohil Mehta @ 2026-05-28 19:50 UTC (permalink / raw)
  To: Xu Yilun
  Cc: kas, djbw, rick.p.edgecombe, x86, peter.fang, linux-coco,
	linux-kernel, kvm, yilun.xu, baolu.lu, zhenzhong.duan, xiaoyao.li
In-Reply-To: <ahfJ9MW+kJp+kE6A@yilunxu-OptiPlex-7050>

On 5/27/2026 9:52 PM, Xu Yilun wrote:

> No, the memory needed varies depending on the feature or the number of
> features. But currently I see the total requirement is ~50MB.
> 
This is an important consideration when defining the default policy. Could
you please elaborate on how this will scale in the future?

How are the memory requirements expected to grow with additional features?

Let's say a future platform has a lot more features and needs
significantly more memory. Wouldn't loading a legacy kernel with this
default policy lead to excessive wastage?

Maybe I am missing something obvious. The struct in patch 1,
memory_pool_required_pages is u16. So, will the Extensions support never
require more than 256MB?

^ permalink raw reply

* Re: [PATCH v5 2/7] x86/msr: add wrmsrq_on_cpus helper
From: Kalra, Ashish @ 2026-05-28 19:55 UTC (permalink / raw)
  To: Dave Hansen, Borislav Petkov
  Cc: tglx, mingo, dave.hansen, x86, hpa, seanjc, peterz,
	thomas.lendacky, herbert, davem, ardb, pbonzini, aik,
	Michael.Roth, KPrateek.Nayak, Tycho.Andersen, Nathan.Fontenot,
	ackerleytng, jackyli, pgonda, rientjes, jacobhxu, xin,
	pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen, darwi,
	linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <c40dcb8c-5706-4c0f-ac85-c22957b9e192@intel.com>

Hello Dave,

On 5/28/2026 2:50 PM, Dave Hansen wrote:
> On 5/28/26 12:37, Kalra, Ashish wrote:
>> A simple loop would be perfectly fine and avoids the need for the wrmsrq_on_cpus() helper entirely:
>>
>>   for_each_cpu(cpu, &rmpopt_cpumask)
>>       wrmsrq_on_cpu(cpu, MSR_AMD64_RMPOPT_BASE, rmpopt_base);
> 
> I'm glad we're on the same page finally. I just hope we can get to this
> point more quickly next time. I started off with exactly this
> suggestion, but someone chimed in to the thread and said it was "slower":
> 
>> https://lore.kernel.org/lkml/6a50d050-f602-43fd-a44a-cecedd9823eb@amd.com/
> 

Yes, actually I should have made it explicitly clear that we need to do it in
parallel especially for issuing the RMPOPT instruction itself, as that is
in a performance critical path (and for that we are using on_each_cpu_mask()).

Thanks,
Ashish

^ permalink raw reply

* Re: [PATCH v3 2/2] x86/tdx: Fix zero-extension for 32-bit port I/O
From: David Laight @ 2026-05-28 19:58 UTC (permalink / raw)
  To: Kiryl Shutsemau
  Cc: Dave Hansen, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Rick Edgecombe,
	Kuppuswamy Sathyanarayanan, Kai Huang, Sean Christopherson,
	Borys Tsyrulnikov, linux-kernel, linux-coco, kvm, stable
In-Reply-To: <ahgUBLjBRGhxULu3@thinkstation>

On Thu, 28 May 2026 11:14:38 +0100
Kiryl Shutsemau <kas@kernel.org> wrote:

> On Wed, May 27, 2026 at 10:45:28AM -0700, Dave Hansen wrote:
> > On 5/27/26 05:05, Kiryl Shutsemau (Meta) wrote:
> > ...  
> > > -	/* Update part of the register affected by the emulated instruction */
> > > -	regs->ax &= ~mask;
> > > +	/*
> > > +	 * IN writes the result into a sub-register of RAX. Only the
> > > +	 * 32-bit form zero-extends; the smaller forms leave the upper
> > > +	 * bits untouched:
> > > +	 *
> > > +	 *   insn  dest  size  bits written     bits preserved
> > > +	 *   inb   AL    1     RAX[ 7: 0]       RAX[63: 8]
> > > +	 *   inw   AX    2     RAX[15: 0]       RAX[63:16]
> > > +	 *   inl   EAX   4     RAX[63: 0]       (none, zero-extended)
> > > +	 *
> > > +	 * 'mask' only covers the low 'size' bytes, which is exactly the
> > > +	 * range affected for size 1 and 2. For size 4 the write also
> > > +	 * clears RAX[63:32], so widen the clear-mask.
> > > +	 */
> > > +	if (size == 4)
> > > +		regs->ax = 0;
> > > +	else
> > > +		regs->ax &= ~mask;
> > > +  
> > 
> > Is there any way we could do this with fewer comments and more code?
> > 
> > > I mean, there are only three cases. Why have:
> > 
> > 	u64 mask = GENMASK(BITS_PER_BYTE * size - 1, 0);
> > 
> > When there are only 3 possible cases:
> > 
> > > 	1 => 0xff
> > > 	2 => 0xffff
> > > 	4 => 0xffffffff
> > 
> > and one of those cases needs a special case on top of it.
> > 
> > Maybe something like this?
> > 
> > 	/* Clear out part of RAX so part of args.r11 can be OR'd in: */
> > 	switch (size) {
> > 	case 1:
> > 		/* inb consumes lower 8 bits of r11: */
> > 		regs->ax &= ~GENMASK_ULL(7, 0);
> > 		args.r11 &=  GENMASK_ULL(7, 0);
> > 		break;
> > 	case 2:
> > 		/* inw consumes lower 16 bits of r11: */
> > 		regs->ax &= ~GENMASK_ULL(15, 0);
> > 		args.r11 &=  GENMASK_ULL(15, 0);
> > 		break;
> > 	case 4:
> > 		/* inl is weird and zeros the whole register: */
> > 		regs->ax &= ~GENMASK_ULL(63, 0);
> > 		/* But only consumes 32-bits from r11: */
> > 		args.r11 &=  GENMASK_ULL(31, 0);
> > 		break;
> > 	default:
> > 		/* Probable TDX module bug. Illegal in[bwl] size: */
> > 		WARN_ON_ONCE(1);
> > 		success = 0;
> > 	}
> > 
> > 	if (success)
> > 		regs->ax |= args.r11;
> > 
> > It might need a temporary variable for args.r11, but you get the point.
> > That's basically the data from the comment but written as code.  
> 
> I hate how verbose it is. All these GENMASK_ULL() make it hard to
> follow.
> 
> What about the patch below. Inspired by kvm's assign_register().
> 
> diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
> index 65119362f9a2..460b9fbabf14 100644
> --- a/arch/x86/coco/tdx/tdx.c
> +++ b/arch/x86/coco/tdx/tdx.c
> @@ -693,8 +693,8 @@ static bool handle_in(struct pt_regs *regs, int size, int port)
>  		.r13 = PORT_READ,
>  		.r14 = port,
>  	};
> -	u64 mask = GENMASK(BITS_PER_BYTE * size - 1, 0);
>  	bool success;
> +	u32 val;
>  
>  	/*
>  	 * Emulate the I/O read via hypercall. More info about ABI can be found
> @@ -703,10 +703,33 @@ static bool handle_in(struct pt_regs *regs, int size, int port)
>  	 */
>  	success = !__tdx_hypercall(&args);
>  
> -	/* Update part of the register affected by the emulated instruction */
> -	regs->ax &= ~mask;
>  	if (success)
> -		regs->ax |= args.r11 & mask;
> +		val = args.r11;
> +	else
> +		val = 0;
> +
> +	/*
> +	 * IN writes the result into a sub-register of RAX.
> +	 *
> +	 * Only the 32-bit form zero-extends; the smaller forms leave
> +	 * the upper bits untouched.
> +	 */
> +	switch (size) {
> +	case 1:
> +		*(u8 *)&regs->ax = (u8)val;
> +		break;
> +	case 2:
> +		*(u16 *)&regs->ax = (u16)val;
> +		break;
> +	case 4:
> +		/* zero-extended */
> +		regs->ax = val;
> +		break;
> +	default:
> +		/* Probable TDX module bug. Illegal in[bwl] size. */
> +		WARN_ON_ONCE(1);
> +		break;
> +	}

Just write it as normal arithmetic code:

	/* IN writes the result into a sub-register of RAX. */
	switch (size) {
	case 1:
		regs->ax = (regs->ax & ~0xfful) | (val & 0xff);
		break;
	case 2:
		regs->ax = (regs->ax & ~0xfffful) | (val & 0xffff);
		break;
	case 4:
	default:
		/* 32bit 'INL' will zero the high bits. */
		regs->ax = val;
		break;
	}

Succinct, obvious and readable.

-- David


>  
>  	return success;
>  }


^ permalink raw reply

* Re: [PATCH 01/15] x86/virt/tdx: Read global metadata for TDX Module Extensions
From: Edgecombe, Rick P @ 2026-05-28 21:00 UTC (permalink / raw)
  To: Fang, Peter, kas@kernel.org, djbw@kernel.org,
	yilun.xu@linux.intel.com, x86@kernel.org
  Cc: Xu, Yilun, Duan, Zhenzhong, baolu.lu@linux.intel.com, Li, Xiaoyao,
	linux-kernel@vger.kernel.org, Mehta, Sohil, kvm@vger.kernel.org,
	linux-coco@lists.linux.dev
In-Reply-To: <20260522034128.3144354-2-yilun.xu@linux.intel.com>

On Fri, 2026-05-22 at 11:41 +0800, Xu Yilun wrote:
> +struct tdx_sys_info_ext {
> +	u16 memory_pool_required_pages;

> +	u8 ext_required;

The docs say this is a bool.

> +};
> +


^ permalink raw reply

* Re: [PATCH 01/15] x86/virt/tdx: Read global metadata for TDX Module Extensions
From: Edgecombe, Rick P @ 2026-05-28 21:17 UTC (permalink / raw)
  To: kas@kernel.org, yilun.xu@linux.intel.com
  Cc: Xu, Yilun, x86@kernel.org, baolu.lu@linux.intel.com, Li, Xiaoyao,
	djbw@kernel.org, linux-kernel@vger.kernel.org, Duan, Zhenzhong,
	Mehta, Sohil, kvm@vger.kernel.org, linux-coco@lists.linux.dev,
	Fang, Peter
In-Reply-To: <ahfD1KMYnTrXJziq@yilunxu-OptiPlex-7050>

On Thu, 2026-05-28 at 12:25 +0800, Xu Yilun wrote:
> > > 
> > > If I read the TDX module base spec correctly, the amount of memory for
> > > extensions and EXT_REQUIRED field depends on the enabled features, which
> > > is
> > > determined by TDH.SYS.CONFIG/TDH.SYS.UPDATE ?
> 
> Yes.
> 
> > 
> > This is my read too. Looks like we need a separate step after
> > config_tdx_module() to read out config-dependent metadata.
> 
> 
> The timing for when metadata becomes valid is now variable, e.g., the
> TDX QUOTING metadata is only valid after TDH.QUOTE.INIT [1].
> 
> Based on recent discussion, I think we should introduce runtime metadata
> reading interfaces for specific metadata sets as needed, rather than
> another catch-all step right after config_tdx_module(). See [2] for the
> proposed approach for Extensions metadata.
> 
> [1]:
> https://lore.kernel.org/all/20260522034128.3144354-7-yilun.xu@linux.intel.com/
> [2]: https://lore.kernel.org/all/ahXAL41ZmIDHmgfu@yilunxu-OptiPlex-7050/

Yeah, it is going to get confusing as to which metadata is populated at which
step. And if anything updates it.

I'm not sure we need to have all the metadata stored permanently. Some of the
metadata is needed for KVM and someday TSM. But a lot of it is onetime internal
use. There is some handiness in referring to a global var, but also those
references add confusion as to when it got populated.

We only use ext_required, max_quote_size and memory_pool_required_pages each
once. So why not just read them to the stack and leave them out of struct
tdx_sys_info? That way there is no confusion about when it was read. And also
saving a global var that is never used again is a bit wrong.

How about for struct tdx_sys_info_ext read it to the stack in init_tdx_ext() and
pass it into init_tdx_ext_features(). For max_quote_size read it where it is
already read, but not into the global struct.

Do you see a problem?

^ permalink raw reply

* Re: [PATCH 04/15] x86/virt/tdx: Enable the Extensions right after basic TDX Module init
From: Edgecombe, Rick P @ 2026-05-28 21:32 UTC (permalink / raw)
  To: Fang, Peter, kas@kernel.org, djbw@kernel.org,
	yilun.xu@linux.intel.com, x86@kernel.org
  Cc: Xu, Yilun, Duan, Zhenzhong, baolu.lu@linux.intel.com, Li, Xiaoyao,
	linux-kernel@vger.kernel.org, Mehta, Sohil, kvm@vger.kernel.org,
	linux-coco@lists.linux.dev
In-Reply-To: <20260522034128.3144354-5-yilun.xu@linux.intel.com>

On Fri, 2026-05-22 at 11:41 +0800, Xu Yilun wrote:
> The detailed initialization flow for TDX Module Extensions has been
> fully implemented.
> 

I'm not sure what this means exactly. Why "detailed"? Is that important?

>  Enable the flow after basic TDX Module
> initialization.
> 
> Theoretically, the Extensions doesn't need to be enabled right after
> basic TDX initialization. It could be enabled right before the first
> Extension SEAMCALL is issued. That would save or postpone memory usage.
> But it isn't worth the complexity, the needs for the Extensions are vast
> but the savings are little for a typical TDX capable system (about
> 0.001% of memory). So the Linux decision is to just enable it along with
> the basic TDX.

The Linux decision is whatever this patch turns out to be after community
review. So for the patch log we just need to justify why it's a good idea, not
make an argument to defer to authority.

> 
> Note that the Extensions initialization flow will still not start if no
> add-on features require Extensions. The enabling of add-on features will
> be in later patches. Until then, the system hasn't consumed extra memory.

Hmm, this patch reads like we are finally doing the initialization up until this
point. Then it turns out we don't actually light up the new code yet... 

A lot of this diff is adding __init to the function added in the earlier
patches. Do we need to do this? Why not add them as __init in the original
patches?

I think we maybe want to say instead that we are setting up to enable extensions
at TDX module init time, and do the explanation of why. Then without the __init
stuff, the patch is just about the init time decision. Which seems about right
sized.

^ permalink raw reply

* Re: [RFC PATCH 05/15] x86/virt/tdx: Move tdx_tdr_pa() up in the file
From: Edgecombe, Rick P @ 2026-05-28 21:32 UTC (permalink / raw)
  To: Fang, Peter, kas@kernel.org, djbw@kernel.org,
	yilun.xu@linux.intel.com, x86@kernel.org
  Cc: Xu, Yilun, Duan, Zhenzhong, baolu.lu@linux.intel.com, Li, Xiaoyao,
	linux-kernel@vger.kernel.org, Mehta, Sohil, kvm@vger.kernel.org,
	linux-coco@lists.linux.dev
In-Reply-To: <20260522034128.3144354-6-yilun.xu@linux.intel.com>

On Fri, 2026-05-22 at 11:41 +0800, Xu Yilun wrote:
> From: Peter Fang <peter.fang@intel.com>
> 
> Move tdx_tdr_pa() in preparation for upcoming changes to use it
> during TDX bringup.
> 
> No functional change intended.
> 
> Signed-off-by: Peter Fang <peter.fang@intel.com>
> Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>

Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox