Linux Confidential Computing Development

Linux Confidential Computing Development
 help / color / mirror / Atom feed

* [PATCH v2 4/4] x86/virt/tdx: Move mk_keyed_paddr() to tdx.c due to no external users
From: Yan Zhao @ 2026-04-30  1:50 UTC (permalink / raw)
  To: dave.hansen, pbonzini, seanjc
  Cc: tglx, mingo, bp, kas, x86, linux-kernel, kvm, linux-coco,
	kai.huang, rick.p.edgecombe, yan.y.zhao, yilun.xu, vannapurve,
	ackerleytng, sagis, binbin.wu, xiaoyao.li, isaku.yamahata
In-Reply-To: <20260430014852.24183-1-yan.y.zhao@intel.com>

Move mk_keyed_paddr() from tdx.h to tdx.c to avoid unnecessary header
inclusion and improve encapsulation since there are no users outside of
tdx.c.

No functional change intended.

Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
 arch/x86/include/asm/tdx.h  | 6 ------
 arch/x86/virt/vmx/tdx/tdx.c | 6 ++++++
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 9c63deaa0e8f..503f9a3f46d6 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -177,12 +177,6 @@ struct tdx_vp {
 	struct page **tdcx_pages;
 };
 
-static inline u64 mk_keyed_paddr(u16 hkid, kvm_pfn_t pfn)
-{
-	/* KeyID bits are just above the physical address bits. */
-	return PFN_PHYS(pfn) | ((u64)hkid << boot_cpu_data.x86_phys_bits);
-}
-
 u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args);
 u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page);
 u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, kvm_pfn_t pfn, struct page *source,
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index deb67e68f85f..967482ae3c80 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1911,6 +1911,12 @@ u64 tdh_phymem_cache_wb(bool resume)
 }
 EXPORT_SYMBOL_FOR_KVM(tdh_phymem_cache_wb);
 
+static inline u64 mk_keyed_paddr(u16 hkid, kvm_pfn_t pfn)
+{
+	/* KeyID bits are just above the physical address bits. */
+	return PFN_PHYS(pfn) | ((u64)hkid << boot_cpu_data.x86_phys_bits);
+}
+
 u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td)
 {
 	struct tdx_module_args args = {};
-- 
2.43.2


^ permalink raw reply related

* [PATCH v2 3/4] x86/tdx: Drop exported function tdx_quirk_reset_page()
From: Yan Zhao @ 2026-04-30  1:50 UTC (permalink / raw)
  To: dave.hansen, pbonzini, seanjc
  Cc: tglx, mingo, bp, kas, x86, linux-kernel, kvm, linux-coco,
	kai.huang, rick.p.edgecombe, yan.y.zhao, yilun.xu, vannapurve,
	ackerleytng, sagis, binbin.wu, xiaoyao.li, isaku.yamahata
In-Reply-To: <20260430014852.24183-1-yan.y.zhao@intel.com>

KVM invokes tdx_quirk_reset_page() to reset TDX control pages (including
S-EPT pages, TDR page, etc.), as all those pages are allocated by KVM TDX
and thus always have struct page.

However, it's also reasonable for KVM to reset those TDX control pages via
tdx_quirk_reset_paddr() directly, eliminating the need to export two
parallel APIs. Keeping tdx_quirk_reset_page() as a one-line helper in the
header file is also unnecessary.

No functional change intended.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Suggested-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
 arch/x86/include/asm/tdx.h  | 1 -
 arch/x86/kvm/vmx/tdx.c      | 4 ++--
 arch/x86/virt/vmx/tdx/tdx.c | 6 ------
 3 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 65f7d874fb5a..9c63deaa0e8f 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -153,7 +153,6 @@ int tdx_guest_keyid_alloc(void);
 u32 tdx_get_nr_guest_keyids(void);
 void tdx_guest_keyid_free(unsigned int keyid);
 
-void tdx_quirk_reset_page(struct page *page);
 void tdx_quirk_reset_paddr(unsigned long base, unsigned long size);
 
 struct tdx_td {
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index a2aadc6d0174..9bd4fd748e2a 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -343,7 +343,7 @@ static int tdx_reclaim_page(struct page *page)
 
 	r = __tdx_reclaim_page(page);
 	if (!r)
-		tdx_quirk_reset_page(page);
+		tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE);
 	return r;
 }
 
@@ -597,7 +597,7 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm)
 	if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
 		return;
 
-	tdx_quirk_reset_page(kvm_tdx->td.tdr_page);
+	tdx_quirk_reset_paddr(page_to_phys(kvm_tdx->td.tdr_page), PAGE_SIZE);
 
 	__free_page(kvm_tdx->td.tdr_page);
 	kvm_tdx->td.tdr_page = NULL;
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index e5a37ea2d4a0..deb67e68f85f 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -731,12 +731,6 @@ void tdx_quirk_reset_paddr(unsigned long base, unsigned long size)
 }
 EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_paddr);
 
-void tdx_quirk_reset_page(struct page *page)
-{
-	tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE);
-}
-EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_page);
-
 static __init void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr)
 
 {
-- 
2.43.2


^ permalink raw reply related

* [PATCH v2 2/4] x86/tdx: Use PFN directly for unmapping guest private memory
From: Yan Zhao @ 2026-04-30  1:49 UTC (permalink / raw)
  To: dave.hansen, pbonzini, seanjc
  Cc: tglx, mingo, bp, kas, x86, linux-kernel, kvm, linux-coco,
	kai.huang, rick.p.edgecombe, yan.y.zhao, yilun.xu, vannapurve,
	ackerleytng, sagis, binbin.wu, xiaoyao.li, isaku.yamahata
In-Reply-To: <20260430014852.24183-1-yan.y.zhao@intel.com>

From: Sean Christopherson <seanjc@google.com>

Remove struct page assumptions/constraints in APIs for unmapping guest
private memory and have them take physical address directly.

Having core TDX make assumptions that guest private memory must be backed
by struct page (and/or folio) will create subtle dependencies on how
KVM/guest_memfd allocates/manages memory (e.g., whether it uses memory
allocated from core MM, if the memory is refcounted, or if the folio is
split) that are easily avoided. [1].

KVM's MMUs work with PFNs. This is very much an intentional design choice.
It ensures that the KVM MMUs remain flexible and are not too tightly tied
to the regular CPU MMUs and the kernel code around them. Using
"struct page" for TDX guest memory is not a good fit anywhere near the KVM
MMU code [2].

Therefore, for unmapping guest private memory: export
tdx_quirk_reset_paddr() for direct KVM invocation, and convert the SEAMCALL
wrapper API tdh_phymem_page_wbinvd_hkid() to take PFN as input (thus
updating mk_keyed_paddr() and tdh_phymem_page_wbinvd_tdr()).

Intentionally have KVM pass PAGE_SIZE (rather than KVM_HPAGE_SIZE(level))
to tdx_quirk_reset_paddr() in tdx_sept_remove_private_spte() to avoid
mixing in huge page changes. The KVM_BUG_ON() check for !PG_LEVEL_4K in
tdx_sept_remove_private_spte() justifies using PAGE_SIZE.

Do not convert tdx_reclaim_page() to use PFN as input since it currently
does not remove guest private memory.

Use "kvm_pfn_t pfn" for type safety. Using this KVM type is appropriate
since APIs tdh_phymem_page_wbinvd_hkid() and tdx_quirk_reset_paddr() are
exported to KVM only.

[Yan: Use kvm_pfn_t, exclude tdx_reclaim_page(), use tdx_quirk_reset_paddr()]

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
Link: https://lore.kernel.org/all/aWgyhmTJphGQqO0Y@google.com [1]
Link: https://lore.kernel.org/all/ac7V0g2q2hN3dU5u@google.com [2]
---
 arch/x86/include/asm/tdx.h  | 14 +++++---------
 arch/x86/kvm/vmx/tdx.c      |  6 +++---
 arch/x86/virt/vmx/tdx/tdx.c |  9 +++++----
 3 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 619aed134c83..65f7d874fb5a 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -154,6 +154,7 @@ u32 tdx_get_nr_guest_keyids(void);
 void tdx_guest_keyid_free(unsigned int keyid);
 
 void tdx_quirk_reset_page(struct page *page);
+void tdx_quirk_reset_paddr(unsigned long base, unsigned long size);
 
 struct tdx_td {
 	/* TD root structure: */
@@ -177,15 +178,10 @@ struct tdx_vp {
 	struct page **tdcx_pages;
 };
 
-static inline u64 mk_keyed_paddr(u16 hkid, struct page *page)
+static inline u64 mk_keyed_paddr(u16 hkid, kvm_pfn_t pfn)
 {
-	u64 ret;
-
-	ret = page_to_phys(page);
-	/* KeyID bits are just above the physical address bits: */
-	ret |= (u64)hkid << boot_cpu_data.x86_phys_bits;
-
-	return ret;
+	/* KeyID bits are just above the physical address bits. */
+	return PFN_PHYS(pfn) | ((u64)hkid << boot_cpu_data.x86_phys_bits);
 }
 
 u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args);
@@ -218,7 +214,7 @@ u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, enum pg_level level,
 			u64 *ext_err1, u64 *ext_err2);
 u64 tdh_phymem_cache_wb(bool resume);
 u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td);
-u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page);
+u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, kvm_pfn_t pfn);
 #else
 static inline void tdx_init(void) { }
 static inline u32 tdx_get_nr_guest_keyids(void) { return 0; }
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 9b47dd257ff4..a2aadc6d0174 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1774,8 +1774,8 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
 static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
 					 enum pg_level level, u64 mirror_spte)
 {
-	struct page *page = pfn_to_page(spte_to_pfn(mirror_spte));
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	kvm_pfn_t pfn = spte_to_pfn(mirror_spte);
 	gpa_t gpa = gfn_to_gpa(gfn);
 	u64 err, entry, level_state;
 
@@ -1814,11 +1814,11 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
 	if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm))
 		return;
 
-	err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
+	err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, pfn);
 	if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
 		return;
 
-	tdx_quirk_reset_page(page);
+	tdx_quirk_reset_paddr(PFN_PHYS(pfn), PAGE_SIZE);
 }
 
 void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index b24b81cea5ea..e5a37ea2d4a0 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -710,7 +710,7 @@ static __init int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
  * to normal kernel memory. Systems with the X86_BUG_TDX_PW_MCE erratum need to
  * do the conversion explicitly via MOVDIR64B.
  */
-static void tdx_quirk_reset_paddr(unsigned long base, unsigned long size)
+void tdx_quirk_reset_paddr(unsigned long base, unsigned long size)
 {
 	const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
 	unsigned long phys, end;
@@ -729,6 +729,7 @@ static void tdx_quirk_reset_paddr(unsigned long base, unsigned long size)
 	 */
 	mb();
 }
+EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_paddr);
 
 void tdx_quirk_reset_page(struct page *page)
 {
@@ -1920,17 +1921,17 @@ u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td)
 {
 	struct tdx_module_args args = {};
 
-	args.rcx = mk_keyed_paddr(tdx_global_keyid, td->tdr_page);
+	args.rcx = mk_keyed_paddr(tdx_global_keyid, page_to_pfn(td->tdr_page));
 
 	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
 }
 EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_tdr);
 
-u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
+u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, kvm_pfn_t pfn)
 {
 	struct tdx_module_args args = {};
 
-	args.rcx = mk_keyed_paddr(hkid, page);
+	args.rcx = mk_keyed_paddr(hkid, pfn);
 
 	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
 }
-- 
2.43.2


^ permalink raw reply related

* [PATCH v2 1/4] x86/tdx: Use PFN directly for mapping guest private memory
From: Yan Zhao @ 2026-04-30  1:49 UTC (permalink / raw)
  To: dave.hansen, pbonzini, seanjc
  Cc: tglx, mingo, bp, kas, x86, linux-kernel, kvm, linux-coco,
	kai.huang, rick.p.edgecombe, yan.y.zhao, yilun.xu, vannapurve,
	ackerleytng, sagis, binbin.wu, xiaoyao.li, isaku.yamahata
In-Reply-To: <20260430014852.24183-1-yan.y.zhao@intel.com>

From: Sean Christopherson <seanjc@google.com>

Remove struct page assumptions/constraints in the SEAMCALL wrapper APIs for
mapping guest private memory and have them take PFN directly.

Having core TDX make assumptions that guest private memory must be backed
by struct page (and/or folio) will create subtle dependencies on how
KVM/guest_memfd allocates/manages memory (e.g., whether it uses memory
allocated from core MM, if the memory is refcounted, or if the folio is
split) that are easily avoided. [1].

KVM's MMUs work with PFNs. This is very much an intentional design choice.
It ensures that the KVM MMUs remain flexible and are not too tied to the
regular CPU MMUs and the kernel code around them. Using 'struct page' for
TDX guest memory is not a good fit anywhere near the KVM MMU code [2].

Use "kvm_pfn_t pfn" for type safety. Using this KVM type is appropriate
since APIs tdh_mem_page_add() and tdh_mem_page_aug() are exported to KVM
only.

[ Yan: Replace "u64 pfn" with "kvm_pfn_t pfn" ]

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
Link: https://lore.kernel.org/all/aWgyhmTJphGQqO0Y@google.com [1]
Link: https://lore.kernel.org/all/ac7V0g2q2hN3dU5u@google.com [2]
---
 arch/x86/include/asm/tdx.h  |  6 ++++--
 arch/x86/kvm/vmx/tdx.c      |  7 +++----
 arch/x86/virt/vmx/tdx/tdx.c | 19 ++++++++++++-------
 3 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 0cb77ed4adc5..619aed134c83 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -6,6 +6,7 @@
 #include <linux/init.h>
 #include <linux/bits.h>
 #include <linux/mmzone.h>
+#include <linux/kvm_types.h>
 
 #include <asm/errno.h>
 #include <asm/ptrace.h>
@@ -189,11 +190,12 @@ static inline u64 mk_keyed_paddr(u16 hkid, struct page *page)
 
 u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args);
 u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page);
-u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, kvm_pfn_t pfn, struct page *source,
+		     u64 *ext_err1, u64 *ext_err2);
 u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, enum pg_level level, struct page *page,
 		     u64 *ext_err1, u64 *ext_err2);
 u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page);
-u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, enum pg_level level, struct page *page,
+u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, enum pg_level level, kvm_pfn_t pfn,
 		     u64 *ext_err1, u64 *ext_err2);
 u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, enum pg_level level, u64 *ext_err1,
 			u64 *ext_err2);
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 77aea8920a4a..9b47dd257ff4 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1624,8 +1624,8 @@ static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
 	    KVM_BUG_ON(!kvm_tdx->page_add_src, kvm))
 		return -EIO;
 
-	err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
-			       kvm_tdx->page_add_src, &entry, &level_state);
+	err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn, kvm_tdx->page_add_src,
+			       &entry, &level_state);
 	if (unlikely(tdx_operand_busy(err)))
 		return -EBUSY;
 
@@ -1639,12 +1639,11 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
 			    enum pg_level level, kvm_pfn_t pfn)
 {
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-	struct page *page = pfn_to_page(pfn);
 	gpa_t gpa = gfn_to_gpa(gfn);
 	u64 entry, level_state;
 	u64 err;
 
-	err = tdh_mem_page_aug(&kvm_tdx->td, gpa, level, page, &entry, &level_state);
+	err = tdh_mem_page_aug(&kvm_tdx->td, gpa, level, pfn, &entry, &level_state);
 	if (unlikely(tdx_operand_busy(err)))
 		return -EBUSY;
 
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index a6e77afafa79..b24b81cea5ea 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -30,7 +30,6 @@
 #include <linux/suspend.h>
 #include <linux/syscore_ops.h>
 #include <linux/idr.h>
-#include <linux/kvm_types.h>
 #include <asm/page.h>
 #include <asm/special_insns.h>
 #include <asm/msr-index.h>
@@ -1568,6 +1567,11 @@ static void tdx_clflush_page(struct page *page)
 	clflush_cache_range(page_to_virt(page), PAGE_SIZE);
 }
 
+static void tdx_clflush_pfn(kvm_pfn_t pfn)
+{
+	clflush_cache_range(__va(PFN_PHYS(pfn)), PAGE_SIZE);
+}
+
 static int pg_level_to_tdx_sept_level(enum pg_level level)
 {
 	WARN_ON_ONCE(level == PG_LEVEL_NONE);
@@ -1594,17 +1598,18 @@ u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page)
 }
 EXPORT_SYMBOL_FOR_KVM(tdh_mng_addcx);
 
-u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2)
+u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, kvm_pfn_t pfn, struct page *source,
+		     u64 *ext_err1, u64 *ext_err2)
 {
 	struct tdx_module_args args = {
 		.rcx = gpa,
 		.rdx = tdx_tdr_pa(td),
-		.r8 = page_to_phys(page),
+		.r8 = PFN_PHYS(pfn),
 		.r9 = page_to_phys(source),
 	};
 	u64 ret;
 
-	tdx_clflush_page(page);
+	tdx_clflush_pfn(pfn);
 	ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args);
 
 	*ext_err1 = args.rcx;
@@ -1647,16 +1652,16 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
 EXPORT_SYMBOL_FOR_KVM(tdh_vp_addcx);
 
 u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, enum pg_level level,
-		     struct page *page, u64 *ext_err1, u64 *ext_err2)
+		     kvm_pfn_t pfn, u64 *ext_err1, u64 *ext_err2)
 {
 	struct tdx_module_args args = {
 		.rcx = gpa | pg_level_to_tdx_sept_level(level),
 		.rdx = tdx_tdr_pa(td),
-		.r8 = page_to_phys(page),
+		.r8 = PFN_PHYS(pfn),
 	};
 	u64 ret;
 
-	tdx_clflush_page(page);
+	tdx_clflush_pfn(pfn);
 	ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args);
 
 	*ext_err1 = args.rcx;
-- 
2.43.2


^ permalink raw reply related

* [PATCH v2 0/4] struct page to PFN conversion for TDX guest private memory
From: Yan Zhao @ 2026-04-30  1:48 UTC (permalink / raw)
  To: dave.hansen, pbonzini, seanjc
  Cc: tglx, mingo, bp, kas, x86, linux-kernel, kvm, linux-coco,
	kai.huang, rick.p.edgecombe, yan.y.zhao, yilun.xu, vannapurve,
	ackerleytng, sagis, binbin.wu, xiaoyao.li, isaku.yamahata

Hi

This is v2 of the struct page to PFN conversion series, which converts TDX
guest private memory mapping/unmapping APIs from taking struct page to
taking PFN as input.

v2 is based on v7.1.0-rc1 + Sean's 4 cleanup patches (see details in
section "Base" below). The purpose is to get Dave's Ack, so Sean can take
it from the KVM x86 tree. The full stack of v2 is available at [14].

Compared to v1, v2:
- Rewrote commit messages of patches 1/2 (the conversion patches for
  mapping and unmapping) by specifically explaining the downside of
  assuming guest private memory must be backed by struct page, and
  incorporating Dave's rewording that also works for Sean.

- Updated patch 2 (which is for unmapping) to use tdx_quirk_reset_paddr()
  directly for unmapping guest private memory, and added patch 3 to drop
  the exported function tdx_quirk_reset_page() by having KVM invoke
  tdx_quirk_reset_paddr() in all scenarios, as suggested by Paolo and
  Xiaoyao.

- Split patch 4 (moving mk_keyed_paddr() to .c) out of patch 2, so patch 2
  can focus on the struct page to PFN conversion for unmapping.

Note: as agreed in v1, Kirill's concern about AUG "level" will be addressed
in a separate patch later.

Background
----------
TDX SEAMCALL wrappers take struct page as input, which provides:
1. Type safety
2. Misuse resistance: the API declaration alone makes it obvious that
   physical pages in RAM are expected [2][3][4][5].

This is appropriate for SEAMCALL wrappers for TDX control pages (e.g., TDR
page, TDCS pages, TDX SEPT pages), since KVM manages and allocates those
pages explicitly from core MM.

However, unlike TDX control pages, KVM guest memory is not necessarily
backed by refcounted struct page or even struct page (e.g., VM_PFNMAP
memory [6]). Taking struct page as input for SEAMCALL wrappers for
mapping/unmapping guest private memory imposes unnecessary assumptions on
how KVM and guest_memfd manage memory [7]. So, Sean suggested converting
from using struct page to PFN for SEAMCALL wrappers operating on guest
private memory [8].

This series therefore converts struct page to PFN for guest private memory
while keeping struct page for TDX control pages, and uses kvm_pfn_t for
type safety.

Sanity check
------------
Reasonable PFN sanity checks in the guest private memory mapping/unmapping
APIs are still agreed upon [9][10], such as checking TDX convertibility to
avoid SEAMCALL failure.

However, we decided not to provide any in-kernel sanity checks to avoid
introducing unnecessary overhead, both because those failures are supposed
to only occur when there are kernel bugs, and due to the lack of
satisfactory tiny checks to ensure convertibility. When unexpected
non-TDX-convertible PFNs are passed in, just let SEAMCALLs fail or have
#MCs or #PFs generated, which are obvious enough in themselves.

Base:
----
This v2 is rebased on top of v7.1.0-rc1 (kvm/next, commit 39f1c201b93f) +
the first 4 patches from Sean's v5 "TDX: Dynamic PAMT + S-EPT Hugepage"
series [11].

Note: due to the instability of v7.1.0-rc1, I also applied series [12] and
[13] to pass CI.

Changelogs:
-----------
v1 [1] --> v2:
    1. Updated patch logs of patches 1/2. (Dave).
    2. Added patch 3 to drop tdx_quirk_reset_page() and export
       tdx_quirk_reset_paddr() only. (Paolo, Xiaoyao)
    3. Split out patch 4 to move mk_keyed_paddr() from .h to .c.
    4. Rebased to v7.1.0-rc1 + Sean's 4 cleanup patches.

Sean's original patch [0] --> v1:
    1. Rebased to kvm-x86-next-2026.03.13.
    2. Split to 2 patches for easy review.  (Rick)
    3. Replaced "u64 pfn" with "kvm_pfn_t pfn"  (Rick)
    4. Dropped using PFN as input to tdx_reclaim_page(). (Rick)
    5. Moved mk_keyed_paddr() from tdx.h to tdx.c.

Thanks
Yan

[0] https://lore.kernel.org/kvm/20260129011517.3545883-26-seanjc@google.com
[1] https://lore.kernel.org/all/20260319005605.8965-1-yan.y.zhao@intel.com
[2] https://lore.kernel.org/all/30d0cef5-82d5-4325-b149-0e99833b8785@intel.com
[3] https://lore.kernel.org/kvm/f4240495-120b-4124-b91a-b365e45bf50a@intel.com
[4] https://lore.kernel.org/kvm/435b8d81-b4de-4933-b0ae-357dea311488@intel.com
[5] https://lore.kernel.org/kvm/1b236a64-d511-49a2-9962-55f4b1eb08e3@intel.com
[6] https://lore.kernel.org/all/20241010182427.1434605-1-seanjc@google.com
[7] https://lore.kernel.org/all/aWgyhmTJphGQqO0Y@google.com
[8] https://lore.kernel.org/all/aWe1tKpFw-As6VKg@google.com
[9] https://lore.kernel.org/all/aWkVLViKBgiVGgaI@google.com
[10] https://lore.kernel.org/all/d119c824-4770-41d2-a926-4ab5268ea3a6@intel.com
[11] https://lore.kernel.org/all/20260129011517.3545883-1-seanjc@google.com
[12] https://lore.kernel.org/all/20260423155611.216805954@infradead.org
[13] https://lore.kernel.org/all/20260428024746.1040531-1-binbin.wu@linux.intel.com
[14] https://github.com/intel-staging/tdx/tree/struct_page_to_pfn_v2

Sean Christopherson (2):
  x86/tdx: Use PFN directly for mapping guest private memory
  x86/tdx: Use PFN directly for unmapping guest private memory

Yan Zhao (2):
  x86/tdx: Drop exported function tdx_quirk_reset_page()
  x86/virt/tdx: Move mk_keyed_paddr() to tdx.c due to no external users

 arch/x86/include/asm/tdx.h  | 21 ++++++-------------
 arch/x86/kvm/vmx/tdx.c      | 17 ++++++++--------
 arch/x86/virt/vmx/tdx/tdx.c | 40 +++++++++++++++++++++----------------
 3 files changed, 37 insertions(+), 41 deletions(-)

-- 
2.43.2

^ permalink raw reply

* Re: [PATCH v8 08/21] x86/virt/seamldr: Allocate and populate a module update request
From: Dave Hansen @ 2026-04-30  0:45 UTC (permalink / raw)
  To: Chao Gao, kvm, linux-coco, linux-kernel, x86
  Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
	nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
	sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
	yilun.xu, xiaoyao.li, yan.y.zhao, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, H. Peter Anvin
In-Reply-To: <20260427152854.101171-9-chao.gao@intel.com>

On 4/27/26 08:28, Chao Gao wrote:
> P-SEAMLDR uses the SEAMLDR_PARAMS structure to describe TDX module
> update requests. This structure contains physical addresses pointing to
> the module binary and its signature file (or sigstruct), along with an
> update scenario field.
> 
> TDX modules are distributed in the tdx_blob format defined in
> blob_structure.txt from the "Intel TDX module Binaries Repository". A
> tdx_blob contains a header, sigstruct, and module binary. This is also the
> format supplied by the userspace to the kernel.
> 
> Parse the tdx_blob format and populate a SEAMLDR_PARAMS structure. The
> header is consumed solely by the kernel to extract the sigstruct and
> module, so validate it before processing to protect the kernel ABI. The
> sigstruct and module are passed to and validated by P-SEAMLDR, so don't
> duplicate any validation in the kernel.
> 
> Note: the sigstruct_pa field in SEAMLDR_PARAMS has been extended to
> a 4-element array. The updated "SEAM Loader (SEAMLDR) Interface
> Specification" will be published separately.

These changelogs have all the right info, but I find them really hard to
parse. For instance, if you're going to have a 'struct seamldr_params',
then just stick with that name. Don't use the "SEAMLDR_PARAMS" name too.

Start with the data structures:

There are two important ABIs here:

'struct tdx_blob'       - the on-disk and in-memory format for a TDX
 		          module update image.
'struct seamldr_params' - The in-memory ABI passed to the TDX module
			  loader. Points to a single 'struct tdx_blob'

> diff --git a/arch/x86/virt/vmx/tdx/seamldr.c b/arch/x86/virt/vmx/tdx/seamldr.c
> index 650c0f097aac..f70be8e2a07b 100644
> --- a/arch/x86/virt/vmx/tdx/seamldr.c
> +++ b/arch/x86/virt/vmx/tdx/seamldr.c
> @@ -7,6 +7,7 @@
>  #define pr_fmt(fmt)	"seamldr: " fmt
>  
>  #include <linux/mm.h>
> +#include <linux/slab.h>
>  #include <linux/spinlock.h>
>  
>  #include <asm/seamldr.h>
> @@ -16,6 +17,33 @@
>  /* P-SEAMLDR SEAMCALL leaf function */
>  #define P_SEAMLDR_INFO			0x8000000000000000
>  
> +#define SEAMLDR_MAX_NR_MODULE_PAGES	496
> +#define SEAMLDR_MAX_NR_SIG_PAGES	4

Gah. All this complexity for the variable-length sigstruct to save a
maximum of 4 pages. Wow.

This whole thing could have been:

struct tdx_image {
	u16	version; // This ABI is always 0x100
	u16	checksum;
	u8	signature[8];
	u32	length;
	u8	reserved[4076];
	u8	sigstruct[SIGSTRUCT_SIZE];
	u8	module[];
}

One variable array. No module offset calculations or munging.

Why do we do this to ourselves for 3 measly pages? ;)

> +/*
> + * The seamldr_params "scenario" field specifies the operation mode:
> + * 0: Install TDX module from scratch (not used by kernel)
> + * 1: Update existing TDX module to a compatible version
> + */
> +#define SEAMLDR_SCENARIO_UPDATE		1
> +
> +/*
> + * This is called the "SEAMLDR_PARAMS" data structure and is defined
> + * in "SEAM Loader (SEAMLDR) Interface Specification".
> + *
> + * It describes the TDX module that will be installed.
> + */
> +struct seamldr_params {
> +	u32	version;
> +	u32	scenario;
> +	u64	sigstruct_pa[SEAMLDR_MAX_NR_SIG_PAGES];
> +	u8	reserved[80];
> +	u64	num_module_pages;
> +	u64	mod_pages_pa_list[SEAMLDR_MAX_NR_MODULE_PAGES];
> +} __packed;
> +
> +static_assert(sizeof(struct seamldr_params) == 4096);
> +
>  /*
>   * Serialize P-SEAMLDR calls since the hardware only allows a single CPU to
>   * interact with P-SEAMLDR simultaneously. Use raw version as the calls can
> @@ -43,6 +71,128 @@ int seamldr_get_info(struct seamldr_info *seamldr_info)
>  }
>  EXPORT_SYMBOL_FOR_MODULES(seamldr_get_info, "tdx-host");
>  
> +/*
> + * Intel TDX module blob. Its format is defined at:
> + * https://github.com/intel/tdx-module-binaries/blob/main/blob_structure.txt

Heh, so URLs are not OK in changelogs because they go stale, but they're
fine in the code?

> + * Note this structure differs from the reference above: the two variable-length
> + * fields "@sigstruct" and "@module" are represented as a single "@data" field
> + * here and split programmatically using the offset_of_module value.

This is good info. But, it's copied and pasted between the changelog and
here. I'd choose one, honestly.

> + * Note @offset_of_module is relative to the start of struct tdx_blob, not
> + * @data, and @length is the total length of the blob, not the length of
> + * @data.
> + */

Out of line comments aren't great. Do these in the data structure if at
all possible. Or, in the code. For instance:

> +struct tdx_blob {
> +	u16	version; // This ABI is always 0x100
> +	u16	checksum;
> +	u32	offset_of_module; // from start of tdx_blob
> +	u8	signature[8];
> +	u32	length;
> +	u32	reserved0;
> +	u64	reserved1[509];
> +	u8	data[]; // contains sigstruct[] and module[]
> +} __packed;

That's probably _better_ than the two duplicated comments that are there
now.

Also, why bother having two reserved arrays instead of:

	u8 reserved[4076];

?

> +/* Supported versions of the tdx_blob */
> +#define TDX_BLOB_VERSION_1	0x100

The comment here doesn't help much.

> +/*
> + * Blob fields are processed by the kernel and the payloads
> + * are passed to the TDX module. Do normal user input type
> + * check for any fields that don't get passed to the TDX module.
> + */

I made it this far, but I rather despise the 'blob' terminology. It's
just bad naming. We should really just call it 'tdx_update_image' or
'tdx_image' everywhere and stop saying 'blob'. Blob is one of those
names that people throw at things when they give up on naming.

> +static const struct tdx_blob *get_and_check_blob(const u8 *data, u32 size)
> +{
> +	const struct tdx_blob *blob = (const void *)data;
> +
> +	/*
> +	 * Ensure the size is valid otherwise reading any field from the
> +	 * blob may overflow.
> +	 */
> +	if (size <= sizeof(struct tdx_blob))
> +		return ERR_PTR(-EINVAL);

Couple of things here:

First, using sizeof() on a type with a variable-length array is a big
warning sign. It needs commenting. It's especially subtle because this
will go on and parse patently invalid 'data' images that don't even have
room for sigstruct[] or module[].

This is *specifically* about the pre-data[] fields that are going to be
read below.

> +	/*
> +	 * Don't care about user passing the wrong file, but protect
> +	 * kernel ABI by preventing accepting garbage.
> +	 */
> +	if (memcmp(blob->signature, "TDX-BLOB", 8))
> +		return ERR_PTR(-EINVAL);

Is there really no helper in the kernel anywhere that can safely do the
8-byte compare against two known-to-the-compiler 8-byte-wide fields
without hard-coding the 8?

> +	/*
> +	 * Ensure the offset of the module is within valid bounds and
> +	 * page-aligned.
> +	 */
> +	if (blob->offset_of_module >= size || blob->offset_of_module <= sizeof(struct tdx_blob))
> +		return ERR_PTR(-EINVAL);

Again, the sizeof(struct tdx_blob) is wonky. Why does this disallow
pointing blob->offset_of_module at reserved1[508] but not sigstruct[]?

> +	if (!IS_ALIGNED(blob->offset_of_module, PAGE_SIZE))
> +		return ERR_PTR(-EINVAL);

Wait a sec. Unless blob->offset_of_module==0, how could this check pass
and "blob->offset_of_module <= sizeof(struct tdx_blob)" fail?

> +	if (blob->version != TDX_BLOB_VERSION_1)
> +		return ERR_PTR(-EINVAL);

This should be the first check, IMNHO. If this doesn't pass then the
rest of the fields are invalid. No?

> +	if (blob->reserved0 || memchr_inv(blob->reserved1, 0, sizeof(blob->reserved1)))
> +		return ERR_PTR(-EINVAL);

There should not be two reserved, must-be-0 fields. There should be 1.
There must be 1.

Also I don't like the proposed data structure. It would make a lot more
sense to me if it were:

struct tdx_image_header {
	u16	version; // This ABI is always 0x100
	u16	checksum;
	u32	offset_of_module; // from start of the header
	u8	signature[8];
	u32	length;
	u8	reserved[4076];
};

struct p {
	u8 data[PAGE_SIZE];
};

struct tdx_image {
	struct tdx_image_header h;
	struct p pages[];
};

Then you can do things like check if sizeof(struct tdx_image_header) ==
PAGE_SIZE. Or whether offset_of_module points past the header.

That stuff only makes sense if you separate out the header structure
from the payloads which are the page-aligned sigstruct and module image
itself.

But exposing the double-variable-length arrays seems really wonky to me.

> +	return blob;
> +}
> +
> +static struct seamldr_params *alloc_seamldr_params(const struct tdx_blob *blob, unsigned int blob_size)
> +{

This does far more than "alloc" something.

> +	struct seamldr_params *params;
> +	int module_pg_cnt, sig_pg_cnt;
> +	const u8 *sig, *module;
> +	int i;
> +
> +	params = (struct seamldr_params *)get_zeroed_page(GFP_KERNEL);
> +	if (!params)
> +		return ERR_PTR(-ENOMEM);

kzalloc(PAGE_SIZE, GFP_KERNEL) will save you a cast.

> +	/*
> +	 * Split the blob into a sigstruct and a module. Assume all
> +	 * size/offsets are within bounds of blob_size due to prior checks.
> +	 */
> +	sig		= blob->data;
> +	sig_pg_cnt	= (blob->offset_of_module - sizeof(struct tdx_blob)) >> PAGE_SHIFT;

Of course, the size of the first thing is defined by the offset of the
second thing.

This really should just be called ->end_of_sig.

> +	module		= (const u8 *)blob + blob->offset_of_module;
> +	module_pg_cnt	= (blob_size - blob->offset_of_module) >> PAGE_SHIFT;

This looks halfway sane:

	 /* adjust for size of the header: */
	sig_size    = blob->end_of_sig - PAGE_SIZE;
	module_size = module_image_size - blob->end_of_sig;

Then, page-adjust it later. One bit of magic at a time, please.

> +	/*
> +	 * Only use version 1 when required (sigstruct > 4KB) for backward
> +	 * compatibility with P-SEAMLDR that lacks version 1 support.
> +	 */
> +	params->version = sig_pg_cnt > 1;

Ewwww.

But what do we do if we're on an old P-SEAMLDR but get a big sigstruct?
It'll just fail?

How many old P-SEAMLDRs are there in the wild? Do we even care about this?

> +	params->scenario = SEAMLDR_SCENARIO_UPDATE;
> +
> +	for (i = 0; i < MIN(sig_pg_cnt, SEAMLDR_MAX_NR_SIG_PAGES); i++) {

Same for the MIN(). Do all the calculations separate from the loop.

> +		params->sigstruct_pa[i] = vmalloc_to_pfn(sig) << PAGE_SHIFT;
> +		sig += PAGE_SIZE;
> +	}
> +
> +	params->num_module_pages = MIN(module_pg_cnt, SEAMLDR_MAX_NR_MODULE_PAGES);
> +	for (i = 0; i < params->num_module_pages; i++) {
> +		params->mod_pages_pa_list[i] = vmalloc_to_pfn(module) << PAGE_SHIFT;
> +		module += PAGE_SIZE;
> +	}

Really what you want here is a helper. Have it take the module or
sigstruct pointer, a pointer to the pa_list[] and a maximum size.

Then call the helper twice.

> +	return params;
> +}
> +
> +static struct seamldr_params *init_seamldr_params(const u8 *data, u32 size)
> +{
> +	const struct tdx_blob *blob;
> +
> +	blob = get_and_check_blob(data, size);
> +	if (IS_ERR(blob))
> +		return ERR_CAST(blob);
> +
> +	return alloc_seamldr_params(blob, size);
> +}
> +
> +DEFINE_FREE(free_seamldr_params, struct seamldr_params *,
> +	    if (!IS_ERR_OR_NULL(_T)) free_page((unsigned long)_T))

Is this really worth it?

>  /**
>   * seamldr_install_module - Install a new TDX module.
>   * @data: Pointer to the TDX module update blob.
> @@ -52,6 +202,11 @@ EXPORT_SYMBOL_FOR_MODULES(seamldr_get_info, "tdx-host");
>   */
>  int seamldr_install_module(const u8 *data, u32 size)
>  {
> +	struct seamldr_params *params __free(free_seamldr_params) =
> +						init_seamldr_params(data, size);
> +	if (IS_ERR(params))
> +		return PTR_ERR(params);
> +
>  	/* TODO: Update TDX module here */
>  	return 0;
>  }

IMNHO, this patch has way too much going on. It took well over an hour
to go through it. That's problematic.

^ permalink raw reply

* Re: [PATCH RFC v5 00/53] guest_memfd: In-place conversion support
From: Michael Roth @ 2026-04-29 23:51 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, Paolo Bonzini, Sean Christopherson, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Jonathan Corbet, Shuah Khan, Shuah Khan, Vishal Annapurve,
	Andrew Morton, Chris Li, Kairui Song, Kemeng Shi, Nhat Pham,
	Baoquan He, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
	Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
	Jason Gunthorpe, Vlastimil Babka, kvm, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest, linux-mm,
	linux-coco
In-Reply-To: <20260428-gmem-inplace-conversion-v5-0-d8608ccfca22@google.com>

On Tue, Apr 28, 2026 at 04:24:55PM -0700, Ackerley Tng via B4 Relay wrote:
> [Some people who received this message don't often get email from devnull+ackerleytng.google.com@kernel.org. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
> 
> This is RFC v5 of guest_memfd in-place conversion support.
> 
> Up till now, guest_memfd supports the entire inode worth of memory being
> used as all-shared, or all-private. CoCo VMs may request guest memory to be
> converted between private and shared states, and the only way to support
> that currently would be to have the userspace VMM provide two sources of
> backing memory from completely different areas of physical memory.
> 
> pKVM has a use case for in-place sharing: the guest and host may be
> cooperating on given data, and pKVM doesn't protect data through
> encryption, so copying that given data between different areas of physical
> memory as part of conversions would be unnecessary work.
> 
> This series also serves as a foundation for guest_memfd huge page
> support. Now, guest_memfd only supports PAGE_SIZE pages, so if two sources
> of backing memory are used, the userspace VMM could maintain a steady total
> memory utilized by punching out the pages that are not used. When huge
> pages are available in guest_memfd, even if the backing memory source
> supports hole punching within a huge page, punching out pages to maintain
> the total memory utilized by a VM would be introducing lots of
> fragmentation.
> 
> In-place conversion avoids fragmentation by allowing the same physical
> memory to be used for both shared and private memory, with guest_memfd
> tracking the shared/private status of all the pages at a per-page
> granularity.
> 
> The central principle, which guest_memfd continues to uphold, is that any
> guest-private page will not be mappable to host userspace. All pages will
> be mmap()-able in host userspace, but accesses to guest-private pages (as
> tracked by guest_memfd) will result in a SIGBUS.
> 
> This series introduces a guest_memfd ioctl (not kvm, vm or vcpu, but
> guest_memfd ioctl) that allows userspace to set memory
> attributes (shared/private) directly through the guest_memfd. This is the
> appropriate interface because shared/private-ness is a property of memory
> and hence the request should be sent directly to the memory provider -
> guest_memfd.
> 
> Tested with both CONFIG_KVM_VM_MEMORY_ATTRIBUTES enabled and disabled:
> 
> + tools/testing/selftests/kvm/guest_memfd_test.c
> + tools/testing/selftests/kvm/pre_fault_memory_test.c
> + tools/testing/selftests/kvm/x86/guest_memfd_conversions_test.c
> + tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
> + tools/testing/selftests/kvm/x86/private_mem_conversions_test.sh
> + tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c
> 
> Updates for this revision:
> 
> + For TDX and SNP, PRESERVE is supported only before the VM is finalized,
>   and only for to_private conversions.
>     + This allows PRESERVE to be used as part of the VM memory
>       loading/encryption flow
>     + Only support PRESERVE for to_private conversions (to_shared on
>       populated memory on TDX would cause zeroing)
>     + Relaxed constraints for SNP and TDX to allow NULL to be passed as
>       source address.
> + Dropped KVM_CAP_MEMORY_ATTRIBUTES2. KVM_CAP_MEMORY_ATTRIBUTES reports
>   attributes supported by the KVM_SET_MEMORY_ATTRIBUTES VM ioctl, and
>   KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES reports attributes supported by the
>   KVM_SET_MEMORY_ATTRIBUTES2 guest_memfd ioctl.
>     + KVM_SET_MEMORY_ATTRIBUTES2 is not supported by the VM ioctl
> + Resolve locking issue when kvm_gmem_get_attribute() is called from
>   kvm_mmu_zap_collapsible_spte() by bugging the VM. guest_memfd memslots
>   don't support dirty tracking, so the locking issue is not on an
>   accessible code path.
> + Moved guest_memfd_conversions_test.c to only be compiled and tested for
>   x86, since it depends so heavily on KVM_X86_SW_PROTECTED_VM's as a
>   testing vehicle
> 
> TODOs
> 
> + Perhaps further clarify PRESERVE flag: [8]

I made a super-long-winded reply to that thread, but to summarize:

PRESERVE flag has different enumeration/behavior/enforcement for pre-launch
vs. post-launch, and similar considerations might come into play for
other flags, so to make it easier to enumerate what flags are available
for pre-launch/post-launch, maybe we could have 2 capabilities instead
of 1:

  KVM_CAP_MEMORY_ATTRIBUTES2_PRE_LAUNCH_FLAGS
  KVM_CAP_MEMORY_ATTRIBUTES2_FLAGS

where SNP/TDX would only advertise PRESERVE for PRE_LAUNCH, and pKVM I
guess would enumerate it for both (or maybe just POST_LAUNCH?)

That lets us keep the flags definitions more straightforward but still
allows userspace to easily enumerate what exactly should be available at
pre vs. post launch time, and give us some flexibility to detail
variations in behavior between the 2 phases without documenting
edge-cases in terms of VM types.

> + Resolve issue where guest_memfd_conversions_test, which uses the
>   kselftest framework, doesn't perform teardown on assertion
>   failure. Please see proposal at [9]
> + Test with TDX selftests. We're in the process of rebasing TDX selftests
>   on this series and will post updates when that's tested.
> 
> I would like feedback on:
> 
> + Content modes: 0 (MODE_UNSPECIFIED), ZERO, and PRESERVE. Is that all
>   good, or does anyone think there is a use case for something else?
> + Should the content modes apply even if no attribute changes are required?
>     + See notes added in "KVM: guest_memfd: Apply content modes while
>       setting memory attributes"

Looking at the example you have there:

  + Note: These content modes apply to the entire requested range, not
  + just the parts of the range that underwent conversion. For example, if
  + this was the initial state:
  + 
  +   * [0x0000, 0x1000): shared
  +   * [0x1000, 0x2000): private
  +   * [0x2000, 0x3000): shared
  + and range [0x0000, 0x3000) was set to shared, the content mode would
  + apply to all memory in [0x0000, 0x3000), not just the range that
  + underwent conversion [0x1000, 0x2000).

Userspace would be aware of whether the range contains pages that were
already set to private, so if it really wants to set just the
[0x1000, 0x2000) range to shared with appropriate content mode, it is
fully able to do so by just issuing the ioctl for that specific range.
If it attempts to issue it for the entire range, it only seems like it
would defy normal expectations and cause confusion to skip ranges, and
I'm not sure it gains us anything useful in exchange for that potential
confusion.

>     + Possibly related: should setting attributes be allowed if some
>       sub-range requested already has the requested attribute?

As it is now, userspace has that capability (to use finer-grained ranges
if it doesn't want to re-issue unnecessary/unwanted conversions), similar
to above. And KVM internally will just issue kvm_arch_gmem_prepare()
calls so that architecture-specific handling can deal with this case
(e.g. SNP's sev_gmem_prepare() already checks if the corresponding
attribute is set in the RMP table and just skips it otherwise). So I
don't think we really gain anything but added complexity if we try to
make gmem more selective about it.

-Mike

> + Structure of how various content modes are checked for support or
>   applied? I used overridable weak functions for architectures that haven't
>   defined support, and defined overrides for x86 to show how I think it would
>   work. For CoCo platforms, I only implemented TDX for illustration purposes
>   and might need help with the other platforms. Should I have used
>   kvm_x86_ops? I tried and found myself defining lots of boilerplate.
> + The use of private_mem_conversions_test.sh to run different options in
>   private_mem_conversions_test. If this makes sense, I'll adjust the
>   Makefile to have private_mem_conversions_test tested only via the script.
> 
> This series is based on kvm/next, and here's the tree for your convenience:
> 
> https://github.com/googleprodkernel/linux-cc/commits/guest_memfd-inplace-conversion-v5
> 
> Older series:
> 
> + RFCv4 is at [7]
> + RFCv3 is at [6]
> + RFCv2 is at [5]
> + RFCv1 is at [4]
> + Previous versions of this feature, part of other series, are available at
>   [1][2][3].
> 
> [1] https://lore.kernel.org/all/bd163de3118b626d1005aa88e71ef2fb72f0be0f.1726009989.git.ackerleytng@google.com/
> [2] https://lore.kernel.org/all/20250117163001.2326672-6-tabba@google.com/
> [3] https://lore.kernel.org/all/b784326e9ccae6a08388f1bf39db70a2204bdc51.1747264138.git.ackerleytng@google.com/
> [4] https://lore.kernel.org/all/cover.1760731772.git.ackerleytng@google.com/T/
> [5] https://lore.kernel.org/all/cover.1770071243.git.ackerleytng@google.com/T/
> [6] https://lore.kernel.org/r/20260313-gmem-inplace-conversion-v3-0-5fc12a70ec89@google.com/T/
> [7] https://lore.kernel.org/all/20260326-gmem-inplace-conversion-v4-0-e202fe950ffd@google.com/T/
> [8] https://lore.kernel.org/all/CAEvNRgGbMhkX310CkFY_M5x-zod=BDTiuznrZ0XvFPUK7weL1A@mail.gmail.com/
> [9] https://lore.kernel.org/all/20260414-selftest-global-metadata-v1-0-fd223922bc57@google.com/T/
> 
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> ---
> Ackerley Tng (34):
>       KVM: x86/mmu: Bug the VM if gmem attributes are queried to determine max mapping level
>       KVM: guest_memfd: Update kvm_gmem_populate() to use gmem attributes
>       KVM: guest_memfd: Only prepare folios for private pages
>       KVM: Move kvm_supported_mem_attributes() to kvm_host.h
>       KVM: guest_memfd: Add basic support for KVM_SET_MEMORY_ATTRIBUTES2
>       KVM: guest_memfd: Ensure pages are not in use before conversion
>       KVM: guest_memfd: Call arch invalidate hooks on conversion
>       KVM: guest_memfd: Return early if range already has requested attributes
>       KVM: guest_memfd: Advertise KVM_SET_MEMORY_ATTRIBUTES2 ioctl
>       KVM: guest_memfd: Handle lru_add fbatch refcounts during conversion safety check
>       KVM: guest_memfd: Use actual size for invalidation in kvm_gmem_release()
>       KVM: guest_memfd: Determine invalidation filter from memory attributes
>       KVM: guest_memfd: Introduce default handlers for content modes
>       KVM: guest_memfd: Apply content modes while setting memory attributes
>       KVM: x86: Support SW_PROTECTED_VM in applying content modes
>       KVM: TDX: Make source page optional for KVM_TDX_INIT_MEM_REGION
>       KVM: x86: Support SNP and TDX applying content modes
>       KVM: x86: Bug CoCo VM on page fault before finalizing
>       KVM: Add CAP to enumerate supported SET_MEMORY_ATTRIBUTES2 flags
>       KVM: selftests: Test basic single-page conversion flow
>       KVM: selftests: Test conversion flow when INIT_SHARED
>       KVM: selftests: Test conversion precision in guest_memfd
>       KVM: selftests: Test conversion before allocation
>       KVM: selftests: Convert with allocated folios in different layouts
>       KVM: selftests: Test that truncation does not change shared/private status
>       KVM: selftests: Test conversion with elevated page refcount
>       KVM: selftests: Test that conversion to private does not support ZERO
>       KVM: selftests: Support checking that data not equal expected
>       KVM: selftests: Test that not specifying a conversion flag scrambles memory contents
>       KVM: selftests: Reset shared memory after hole-punching
>       KVM: selftests: Provide function to look up guest_memfd details from gpa
>       KVM: selftests: Make TEST_EXPECT_SIGBUS thread-safe
>       KVM: selftests: Update private_mem_conversions_test to mmap() guest_memfd
>       KVM: selftests: Add script to exercise private_mem_conversions_test
> 
> Michael Roth (1):
>       KVM: SEV: Make 'uaddr' parameter optional for KVM_SEV_SNP_LAUNCH_UPDATE
> 
> Sean Christopherson (18):
>       KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
>       KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES
>       KVM: Enumerate support for PRIVATE memory iff kvm_arch_has_private_mem is defined
>       KVM: Stub in ability to disable per-VM memory attribute tracking
>       KVM: guest_memfd: Wire up kvm_get_memory_attributes() to per-gmem attributes
>       KVM: Move KVM_VM_MEMORY_ATTRIBUTES config definition to x86
>       KVM: Let userspace disable per-VM mem attributes, enable per-gmem attributes
>       KVM: guest_memfd: Enable INIT_SHARED on guest_memfd for x86 Coco VMs
>       KVM: selftests: Create gmem fd before "regular" fd when adding memslot
>       KVM: selftests: Rename guest_memfd{,_offset} to gmem_{fd,offset}
>       KVM: selftests: Add support for mmap() on guest_memfd in core library
>       KVM: selftests: Add selftests global for guest memory attributes capability
>       KVM: selftests: Add helpers for calling ioctls on guest_memfd
>       KVM: selftests: Test that shared/private status is consistent across processes
>       KVM: selftests: Provide common function to set memory attributes
>       KVM: selftests: Check fd/flags provided to mmap() when setting up memslot
>       KVM: selftests: Update pre-fault test to work with per-guest_memfd attributes
>       KVM: selftests: Update private memory exits test to work with per-gmem attributes
> 
>  Documentation/virt/kvm/api.rst                     | 139 ++++-
>  .../virt/kvm/x86/amd-memory-encryption.rst         |  19 +-
>  Documentation/virt/kvm/x86/intel-tdx.rst           |   4 +
>  arch/x86/include/asm/kvm_host.h                    |   2 +-
>  arch/x86/kvm/Kconfig                               |  15 +-
>  arch/x86/kvm/mmu/mmu.c                             |  20 +-
>  arch/x86/kvm/svm/sev.c                             |  18 +-
>  arch/x86/kvm/vmx/tdx.c                             |   8 +-
>  arch/x86/kvm/x86.c                                 | 145 ++++-
>  include/linux/kvm_host.h                           |  74 ++-
>  include/trace/events/kvm.h                         |   4 +-
>  include/uapi/linux/kvm.h                           |  21 +
>  mm/swap.c                                          |   2 +
>  tools/testing/selftests/kvm/Makefile.kvm           |   5 +
>  tools/testing/selftests/kvm/include/kvm_util.h     | 141 ++++-
>  tools/testing/selftests/kvm/include/test_util.h    |  34 +-
>  .../selftests/kvm/kvm_has_gmem_attributes.c        |  17 +
>  tools/testing/selftests/kvm/lib/kvm_util.c         | 130 +++--
>  tools/testing/selftests/kvm/lib/test_util.c        |   7 -
>  tools/testing/selftests/kvm/lib/x86/sev.c          |   2 +-
>  .../testing/selftests/kvm/pre_fault_memory_test.c  |   4 +-
>  .../kvm/x86/guest_memfd_conversions_test.c         | 552 +++++++++++++++++++
>  .../kvm/x86/private_mem_conversions_test.c         |  55 +-
>  .../kvm/x86/private_mem_conversions_test.sh        | 128 +++++
>  .../selftests/kvm/x86/private_mem_kvm_exits_test.c |  38 +-
>  virt/kvm/Kconfig                                   |   3 +-
>  virt/kvm/guest_memfd.c                             | 591 ++++++++++++++++++++-
>  virt/kvm/kvm_main.c                                |  87 ++-
>  28 files changed, 2075 insertions(+), 190 deletions(-)
> ---
> base-commit: 39f1c201b93f4ff71631bac72cff6eb155f976a4
> change-id: 20260225-gmem-inplace-conversion-bd0dbd39753a
> 
> Best regards,
> --
> Ackerley Tng <ackerleytng@google.com>
> 
> 

^ permalink raw reply

* Re: [PATCH v8 07/21] coco/tdx-host: Implement firmware upload sysfs ABI for TDX module updates
From: Dave Hansen @ 2026-04-29 23:17 UTC (permalink / raw)
  To: Chao Gao, kvm, linux-coco, linux-kernel, x86
  Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
	nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
	sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
	yilun.xu, xiaoyao.li, yan.y.zhao, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, H. Peter Anvin
In-Reply-To: <20260427152854.101171-8-chao.gao@intel.com>

On 4/27/26 08:28, Chao Gao wrote:
> Linux kernel supports two primary firmware update mechanisms:
>   - request_firmware()
>   - firmware upload (or fw_upload)

All the stuff here is good info, but it was hard to extract the
implementation information from the background.

I think this would do:

	Select fw_upload for doing TDX module updates. The process of
	selecting among available update images is complicated and
	nuanced. Punt the selection policy out to userspace.

...
> +static int seamldr_init(struct device *dev)
> +{
> +	struct fw_upload *tdx_fwl;
> +
> +	if (!can_expose_seamldr())
> +		return 0;

can_expose_seamldr() has a not great name.

Why not just have naming that says:

	if (supports_runtime_update())
		...

Why abstract this out to what can or can't be exposed?



^ permalink raw reply

* Re: [PATCH v4 0/7] Add RMPOPT support.
From: Kalra, Ashish @ 2026-04-29 23:07 UTC (permalink / raw)
  To: tglx, mingo, bp, dave.hansen, x86, hpa, seanjc, peterz,
	thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
	Nathan.Fontenot, jackyli, pgonda, rientjes, jacobhxu, xin,
	pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen, darwi,
	linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <cover.1775874970.git.ashish.kalra@amd.com>

Hello Dave, Sean,

Looking forward to your feedback, comments, thoughts on RMPOPT v4 patch series.

Thanks,
Ashish

On 4/13/2026 2:42 PM, Ashish Kalra wrote:
> From: Ashish Kalra <ashish.kalra@amd.com>
> 
> In the SEV-SNP architecture, hypervisor and non-SNP guests are subject
> to RMP checks on writes to provide integrity of SEV-SNP guest memory.
> 
> The RMPOPT architecture enables optimizations whereby the RMP checks
> can be skipped if 1GB regions of memory are known to not contain any
> SNP guest memory.
> 
> RMPOPT is a new instruction designed to minimize the performance
> overhead of RMP checks for the hypervisor and non-SNP guests.
> 
> RMPOPT instruction currently supports two functions. In case of the
> verify and report status function the CPU will read the RMP contents,
> verify the entire 1GB region starting at the provided SPA is HV-owned.
> For the entire 1GB region it checks that all RMP entries in this region
> are HV-owned (i.e., not in assigned state) and then accordingly updates
> the RMPOPT table to indicate if optimization has been enabled and
> provide indication to software if the optimization was successful.
> 
> In case of report status function, the CPU returns the optimization
> status for the 1GB region.
> 
> The RMPOPT table is managed by a combination of software and hardware.
> Software uses the RMPOPT instruction to set bits in the table,
> indicating that regions of memory are entirely HV-owned.  Hardware
> automatically clears bits in the RMPOPT table when RMP contents are
> changed during RMPUPDATE instruction.
> 
> For more information on the RMPOPT instruction, see the AMD64 RMPOPT
> technical documentation.
> 
> As SNP is enabled by default the hypervisor and non-SNP guests are
> subject to RMP write checks to provide integrity of SNP guest memory.
> 
> This patch-series adds support to enable RMP optimizations for up to
> 2TB of system RAM across the system and allow RMPUPDATE to disable
> those optimizations as SNP guests are launched.
> 
> Support for RAM larger than 2 TB will be added in follow-on series.
> 
> This series also introduces support to re-enable RMP optimizations
> during SNP guest termination, after guest pages have been converted
> back to shared.
> 
> RMP optimizations are performed asynchronously by queuing work on a
> dedicated workqueue after a 10 second delay.
> 
> Delaying work allows batching of multiple SNP guest terminations.
> 
> Once 1GB hugetlb guest_memfd support is merged, support for
> re-enabling RMPOPT optimizations during 1GB page cleanup will be added
> in follow-on series.
> 
> Additionally add debugfs interface to report per-CPU RMPOPT status
> across all system RAM.
> 
> v4:
> - Add new wrmsrq_on_cpus() helper to write same u64 value to a
>   per-CPU MSR across a cpumask without per-cpu struct allocation
>   overhead. 
> - Rename configure_and_enable_rmpopt() to snp_setup_rmpopt().
> - Use wrmsrq_on_cpus() instead of wrmsrq_on_cpu() loop for
>   programming RMPOPT_BASE MSRs.
> - Add setup_clear_cpu_cap(X86_FEATURE_RMPOPT) if segmented RMP
>   setup fails or workqueue allocation fails.
> - Add X86_FEATURE_RMPOPT feature clear logic in amd_cc_platform_clear()
>   for CC_ATTR_HOST_SEV_SNP.
> - All of the above allow checking for only X86_FEATURE_RMPOPT for both
>   RMPOPT setup/enable and RMP re-optimizations.
> - Rename snp_perform_rmp_optimization() to snp_rmpopt_all_physmem().
> - Split rmpopt() into rmpopt() and rmpopt_smp() for SMP callback use.
> - Introduce separate rmpopt_report_cpumask for debugfs reporting,
>   distinct from rmpopt_cpumask used for primary thread tracking.
> - Remove snp_perform_rmp_optimization() call from __sev_snp_init_locked() 
>   and instead setup and enable RMPOPT after SNP is enabled and 
>   initialized.
> 
> v3:
> - Drop all RMPOPT kthread support and introduce adding custom and
>   dedicated workqueue to schedule delayed and asynchronous RMPOPT work.
> - Drop the guest_memfd inode cleanup interface and add support to
>   re-enable RMP optimizations during guest shutdown using the
>   asynchronous and delayed workqueue interface.
> - Introduce new __rmpopt() helper and rmpopt() and
>   rmpopt_report_status() wrappers on top which use rax and rcx
>   parameters to closely match RMPOPT specs.
> - Use new optimized RMPOPT loop to issue RMPOPT instructions on all
>   system RAM up to 2TB and all CPUs, by optimizing each range on one CPU
>   first, then let other CPUs execute RMPOPT in parallel so they can skip
>   most work as the range has already been optimized.
> - Also add support for running the optimized RMPOPT loop only on
>   one thread per core.
> - Replace all PUD_SIZE references with SZ_1G to conform to 1GB regions
>   as specified by RMPOPT specifications and not be dependent on PUD_SIZE
>   which makes the RMPOPT patch-set independent of x86 page table sizes.
> - Use wrmsrq_on_cpu() to program the RMPOPT_BASE MSR registers on
>   all CPUs that removes all ugly casting to use on_each_cpu_mask().
> - Fix inline commits and patch commit messages
> 
> 
> v2:
> - Drop all NUMA and Socket configuration and enablement support and
>   enable RMPOPT support for up to 2TB of system RAM.
> - Drop get_cpumask_of_primary_threads() and enable per-core RMPOPT
>   base MSRs and issue RMPOPT instruction on all CPUs.
> - Drop the configfs interface to manually re-enable RMP optimizations.
> - Add new guest_memfd cleanup interface to automatically re-enable
>   RMP optimizations during guest shutdown.
> - Include references to the public RMPOPT documentation.
> - Move debugfs directory for RMPOPT under architecture specific
>   parent directory.
> 
> Ashish Kalra (7):
>   x86/cpufeatures: Add X86_FEATURE_AMD_RMPOPT feature flag
>   x86/msr: add wrmsrq_on_cpus helper
>   x86/sev: Initialize RMPOPT configuration MSRs
>   x86/sev: Add support to perform RMP optimizations asynchronously
>   x86/sev: Add interface to re-enable RMP optimizations.
>   KVM: SEV: Perform RMP optimizations on SNP guest shutdown
>   x86/sev: Add debugfs support for RMPOPT
> 
>  arch/x86/coco/core.c               |   1 +
>  arch/x86/include/asm/cpufeatures.h |   2 +-
>  arch/x86/include/asm/msr-index.h   |   3 +
>  arch/x86/include/asm/msr.h         |   5 +
>  arch/x86/include/asm/sev.h         |   4 +
>  arch/x86/kernel/cpu/scattered.c    |   1 +
>  arch/x86/kvm/svm/sev.c             |   2 +
>  arch/x86/lib/msr-smp.c             |  20 +++
>  arch/x86/virt/svm/sev.c            | 271 ++++++++++++++++++++++++++++-
>  drivers/crypto/ccp/sev-dev.c       |   3 +
>  10 files changed, 310 insertions(+), 2 deletions(-)
> 
> --
> 2.43.0
> 
> 

^ permalink raw reply

* Re: [PATCH RFC v5 00/53] guest_memfd: In-place conversion support
From: Sean Christopherson @ 2026-04-29 15:06 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Baoquan He, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka, kvm,
	linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260428-gmem-inplace-conversion-v5-0-d8608ccfca22@google.com>

On Tue, Apr 28, 2026, Ackerley Tng wrote:
> This is RFC v5 of guest_memfd in-place conversion support.

...

> TODOs
> 
> + Perhaps further clarify PRESERVE flag: [8]
> + Resolve issue where guest_memfd_conversions_test, which uses the
>   kselftest framework, doesn't perform teardown on assertion
>   failure. Please see proposal at [9]
> + Test with TDX selftests. We're in the process of rebasing TDX selftests
>   on this series and will post updates when that's tested.

Why exactly is this still RFC?  The TODOs here don't strike me as things that
would make this RFC.  Blockers for merge, yes/maybe/probably, but at a glance,
it feels like we've moved beyond RFC for the code itself.

^ permalink raw reply

* COCONUT-SVSM Development Release v2026.04-devel
From: Jörg Rödel @ 2026-04-29 12:03 UTC (permalink / raw)
  To: coconut-svsm, linux-coco

Hi all,

It is my pleasure to announce the COCONUT-SVSM development release
v2026.04-devel. It features 75 non-merge commits since the last release, the
highlights are:

  - Added initial VirtIO-VSOCK support in SVSM.

  - Reworked task scheduling and execution control.

  - Refactored virtual memory and page fault handling.

  - Added support for partial TLB flushes, including range-based CPU TLB
    flushing.

  - Updated the IGVM/stage2 boot flow.

  - Expanded kernel test coverage, including compile-time layout assertions
    plus simple stacktrace and symbol-resolution tests.
  
  - Refreshed documentation, especially around attestation, installation
    requirements, and debugging guidance.

All the details are in the shortlog below. A big thanks again to the
COCONUT-SVSM community for all the effort in driving the project forward!

Best,

	Joerg

Shortlog:

Carlos López (32):
      kernel: debug/symbols: fix symtab slice length
      kernel: debug/symbols: add a simple symbol resolution test
      kernel: debug/stacktrace: add a simple stacktrace test
      kernel: cpu/tlb: rename TlbFlushScope::flush_all() to flush_all_cpus()
      kernel: cpu/tlb: allow specifying address ranges in TlbFlushScope
      kernel: cpu/tlb: support partial TLB flushes
      kernel: sev/tlb: strongly type INVLPGB RAX
      kernel: sev/tlb: merge flush_tlb_global_sync and flush_tlb_sync()
      kernel: sev/tlb: hide helper TLB functions
      kernel: sev/tlb: support partial TLB flushes
      kernel: cpu/tlb: expose partial TLB flush functions
      kernel: protocols/core: flush individual PVALIDATE entries
      kernel: sev/tlb: add additional inline assembly flags
      kernel: tests: convert struct layout tests to compile-time assertions
      kernel: mm/address_space: const-compute temporary mapping area size
      kernel: cpu/percpu: initialize vrange allocators in initialize_vm_ranges()
      kernel: mm/vm: remove Mapping type
      kernel: mm/vm/range: remove mapping in one go
      kernel: mm/vm/mapping: remove incorrect documentation
      kernel: mm/vm/range: make VMR::virt_range() return a MemoryRegion
      kernel: task: remove unnecessary unwrap()
      kernel: task: remove unused Task::handle_pf()
      kernel: mm/vm/range: always populate page table as part of #PF handling
      kernel: mm/vm: make handle_page_fault() take &self
      kernel: mm/vm: rewire page faults
      kernel: task/mm: ensure VMR addresses are VMR_GRANULE-aligned
      kernel: mm/vm: check VMR invariants upfront
      libtcgtpm: do not link libcrt for userspace targets
      xbuild: use workspace dependencies
      packit: update to latest commit
      xbuild: use packit as a library
      cpuarch: add CPUID page accessors

Gerd Hoffmann (1):
      libtcgtpm: remove libtcgtpm.a

Joerg Roedel (1):
      COCONUT-SVSM Release 2026.04-devel

Jon Lange (12):
      bootimg: implement `Error` for boot image errors
      igvmbuilder: use `Error::Display` for error messages
      stage2: build boot image in igvmbuilder
      stage2: remove secrets page from stage2
      svsm: validate lowmem in kernel if not done in stage2
      igvmbuilder: define abstraction for IGVM parameter layout
      igvmbuilder: make stage2 optional
      task: remove lock around `TaskSchedState`
      task: enforce mutual exclusion of execution
      task: add task termination test
      task/schedule: modify task rescheduling
      task/schedule: permit scheduler callers to disable interrupts

Luigi Leonardi (14):
      kernel: introduce VsockTransport trait for driver abstraction
      Revert "virtio: remove vsock support"
      virtio-drivers/socket: apply clippy lints
      virtio-drivers/connectionmanager: track connection established state
      virtio-drivers/connectionmanager: reject operations after peer shutdown
      virtio-drivers/connectionmanager: add `is_local_port_used`
      kernel/vsock: introduce virtio-vsock support
      kernel/vsock: introduce VsockStream
      scripts/launch_guest: add vsock to launch_guest
      kernel/vsock: add in-svsm tests for VsockStream
      Makefile: fix clippy with CARGO_HACK
      kernel/vsock: fix SVSM crash in VsockStream when device is not available
      Documentation/ATTESTATION: fix wrong protocol and path in example description
      Documentation/ATTESTATION: correct wrong parameter name

Nicola Ramacciotti (8):
      docs: Move debugging information to developer section
      docs: Improve debugging guide
      docs: Clarify gdb usage
      gitignore: Avoid tracking gdb_history
      github/workflows: Update some actions to Node.js 24
      github/workflows: Fix cargo audit installation
      github/workflows: Unify and always run both compliance checks
      github/workflows: enable vhost support in QEMU build and install netcat

Nihal (1):
      github/workflows: cargo audit

Oliver Steffen (1):
      kernel: Verify ACPI table checksum

Stefano Garzarella (3):
      kernel/ghcb: add missing register offset assertions
      docs/INSTALL: update guest image requirement
      github/workflows: drop explicit checkout ref in PR workflows

Tanya Agarwal (1):
      Actions: add SPDX header check script

Vaishali Thakkar (1):
      igvmbuilder: remove obsolete debug_swap CLI options


^ permalink raw reply

* Re: [PATCH v4 3/3] coco: guest: arm64: Query host IPA-change alignment via RHI
From: Aneesh Kumar K.V @ 2026-04-29  9:03 UTC (permalink / raw)
  To: Will Deacon
  Cc: linux-kernel, iommu, linux-coco, linux-arm-kernel, kvmarm,
	Catalin Marinas, Jason Gunthorpe, Marc Zyngier, Marek Szyprowski,
	Robin Murphy, Steven Price, Suzuki K Poulose, Thomas Gleixner,
	sebastianene
In-Reply-To: <afC8hZTJY6Cx8Liz@willie-the-truck>

Will Deacon <will@kernel.org> writes:

> [+Seb for the ITS]
>
> On Mon, Apr 27, 2026 at 12:01:08PM +0530, Aneesh Kumar K.V (Arm) wrote:
>> Add the Realm Host Interface support needed to query host configuration
>> from a Realm guest. Define the RHI hostconf SMCs, add rsi_host_call(), and
>> use them during Realm initialization to retrieve the host IPA-change
>> alignment size.
>> 
>> Expose that alignment through realm_get_hyp_pagesize() and
>> mem_decrypt_granule_size() so shared-buffer allocation and
>> encryption/decryption paths can honor the ipa change page-size requirement.
>> 
>> If the host reports an invalid alignment (when alignment value is not
>> multiple of 4K), do not enable Realm support.
>> 
>> This provides the host alignment information required by the shared buffer
>> alignment changes.
>> 
>> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
>> ---
>>  arch/arm64/include/asm/mem_encrypt.h |  3 ++
>>  arch/arm64/include/asm/rhi.h         | 24 +++++++++++++
>>  arch/arm64/include/asm/rsi.h         |  2 ++
>>  arch/arm64/include/asm/rsi_cmds.h    | 10 ++++++
>>  arch/arm64/include/asm/rsi_smc.h     |  7 ++++
>>  arch/arm64/kernel/Makefile           |  2 +-
>>  arch/arm64/kernel/rhi.c              | 54 ++++++++++++++++++++++++++++
>>  arch/arm64/kernel/rsi.c              | 13 +++++++
>>  arch/arm64/mm/mem_encrypt.c          |  8 +++++
>>  9 files changed, 122 insertions(+), 1 deletion(-)
>>  create mode 100644 arch/arm64/include/asm/rhi.h
>>  create mode 100644 arch/arm64/kernel/rhi.c
>
> [...]
>
>> diff --git a/arch/arm64/mm/mem_encrypt.c b/arch/arm64/mm/mem_encrypt.c
>> index 38c62c9e4e74..f5d64bc29c20 100644
>> --- a/arch/arm64/mm/mem_encrypt.c
>> +++ b/arch/arm64/mm/mem_encrypt.c
>> @@ -59,3 +59,11 @@ int set_memory_decrypted(unsigned long addr, int numpages)
>>  	return crypt_ops->decrypt(addr, numpages);
>>  }
>>  EXPORT_SYMBOL_GPL(set_memory_decrypted);
>> +
>> +size_t mem_decrypt_granule_size(void)
>> +{
>> +	if (is_realm_world())
>> +		return max(PAGE_SIZE, realm_get_hyp_pagesize());
>> +	return PAGE_SIZE;
>
> No, this should be indirected via 'struct arm64_mem_crypt_ops' because
> there's nothing particularly unique to realms here. For pKVM protected
> guests using a smaller page-size than the host, we'd presumably need
> something similar for the ITS (where restricted-dma isn't used).
>

Sure, I will rework this to use struct arm64_mem_crypt_ops in the next revision.

-aneesh

^ permalink raw reply

* Re: [PATCH v4 3/3] coco: guest: arm64: Query host IPA-change alignment via RHI
From: Aneesh Kumar K.V @ 2026-04-29  9:01 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: linux-kernel, iommu, linux-coco, linux-arm-kernel, kvmarm,
	Catalin Marinas, Jason Gunthorpe, Marek Szyprowski, Robin Murphy,
	Steven Price, Suzuki K Poulose, Thomas Gleixner, Will Deacon
In-Reply-To: <86tssvyz2v.wl-maz@kernel.org>

Marc Zyngier <maz@kernel.org> writes:

> On Tue, 28 Apr 2026 13:49:46 +0100,
> Aneesh Kumar K.V <aneesh.kumar@kernel.org> wrote:
>> 
>> Marc Zyngier <maz@kernel.org> writes:
>> 
>> > On Mon, 27 Apr 2026 07:31:08 +0100,
>> > "Aneesh Kumar K.V (Arm)" <aneesh.kumar@kernel.org> wrote:
>> >> 
>> >> Add the Realm Host Interface support needed to query host configuration
>> >> from a Realm guest. Define the RHI hostconf SMCs, add rsi_host_call(), and
>> >> use them during Realm initialization to retrieve the host IPA-change
>> >> alignment size.
>> >
>> > I don't understand what "IPA-change" means. What you are after is the
>> > host's sharing granule size.
>> >
>> 
>> This is part of the RHI specification, and the call is named
>> RHI_HOSTCONF_GET_IPA_CHANGE_ALIGNMENT. The intent is to determine the
>> alignment requirements for changing IPA attributes (protected vs.
>> unprotected IPA
>
> This really is a terrible name. Why the 'change' part? It doesn't
> change, it is a constant.
>
> Oh well...
>
> [...]
>
>> >> +static inline unsigned long rsi_host_call(struct rsi_host_call *rhi_call)
>> >> +{
>> >> +	phys_addr_t addr = virt_to_phys(rhi_call);
>> >> +	struct arm_smccc_res res;
>> >> +
>> >> +	arm_smccc_1_1_invoke(SMC_RSI_HOST_CALL, addr, &res);
>> >
>> > Errr... What guarantees that *rhi_call is *IPA contiguous*? This is
>> > incredibly fragile. You should at the very least check that this isn't
>> > vmalloc'd.
>> >
>> 
>> 
>> I didn’t quite follow that. We have other RSI calls (even RMI calls)
>> that do similar things, and the caller understands that the address
>> should be IPA-contiguous.
>
> Does it? Where is it documented?  All you get is a pointer, so all
> bets are off.
>
>> Are you suggesting that all RSI calls should
>> add checks for this? Or are you suggesting to update the API to
>> 
>> unsigned long rsi_host_call(unsigned long rhi_call_phys) ?
>
> I'm suggesting that this API is subtly broken because it makes random
> assumptions about the physical contiguity of the VA space. It does so
> without any check, without any documentation.
>
> Simply changing the parameter to phys_addr_t could at the very least
> capture some of the requirements, but I'd like something in big bold
> letters.
>
>>
>> >> +
>> >> +	return res.a0;
>> >> +}
>> >> +
>> >>  #endif /* __ASM_RSI_CMDS_H */
>> >> diff --git a/arch/arm64/include/asm/rsi_smc.h b/arch/arm64/include/asm/rsi_smc.h
>> >> index e19253f96c94..9ee8b5c7612e 100644
>> >> --- a/arch/arm64/include/asm/rsi_smc.h
>> >> +++ b/arch/arm64/include/asm/rsi_smc.h
>> >> @@ -182,6 +182,13 @@ struct realm_config {
>> >>   */
>> >>  #define SMC_RSI_IPA_STATE_GET			SMC_RSI_FID(0x198)
>> >>  
>> >> +struct rsi_host_call {
>> >> +	union {
>> >> +		u16 imm;
>> >> +		u64 padding0;
>> >> +	};
>> >> +	u64 gprs[31];
>> >> +} __aligned(0x100);
>> >>  /*
>> >>   * Make a Host call.
>> >>   *
>> >> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
>> >> index fe627100d199..3e72dd9584ed 100644
>> >> --- a/arch/arm64/kernel/Makefile
>> >> +++ b/arch/arm64/kernel/Makefile
>> >> @@ -34,7 +34,7 @@ obj-y			:= debug-monitors.o entry.o irq.o fpsimd.o		\
>> >>  			   cpufeature.o alternative.o cacheinfo.o		\
>> >>  			   smp.o smp_spin_table.o topology.o smccc-call.o	\
>> >>  			   syscall.o proton-pack.o idle.o patching.o pi/	\
>> >> -			   rsi.o jump_label.o
>> >> +			   rsi.o jump_label.o rhi.o
>> >>  
>> >>  obj-$(CONFIG_COMPAT)			+= sys32.o signal32.o			\
>> >>  					   sys_compat.o
>> >> diff --git a/arch/arm64/kernel/rhi.c b/arch/arm64/kernel/rhi.c
>> >> new file mode 100644
>> >> index 000000000000..7cd6c5102464
>> >> --- /dev/null
>> >> +++ b/arch/arm64/kernel/rhi.c
>> >> @@ -0,0 +1,54 @@
>> >> +// SPDX-License-Identifier: GPL-2.0-only
>> >> +/*
>> >> + * Copyright (C) 2026 ARM Ltd.
>> >> + */
>> >> +
>> >> +#include <linux/mm.h>
>> >> +#include <asm/rsi.h>
>> >> +#include <asm/rhi.h>
>> >> +
>> >> +/* we need an aligned rhicall for rsi_host_call. slab is not yet ready */
>> >> +static struct rsi_host_call hyp_pagesize_rhicall;
>> >
>> > Why the "hyp_" prefix? This has absolutely nothing to with the
>> > hypervisor.
>> >
>> 
>> Sure will update "hyp_" reference to host. 
>> 
>> 
>> >> +unsigned long rhi_get_ipa_change_alignment(void)
>> >> +{
>> >> +	long ret;
>> >> +	unsigned long ipa_change_align;
>> >> +
>> >> +	hyp_pagesize_rhicall.imm = 0;
>> >> +	hyp_pagesize_rhicall.gprs[0] = RHI_HOSTCONF_VERSION;
>> >> +	ret = rsi_host_call(lm_alias(&hyp_pagesize_rhicall));
>> >> +	if (ret != RSI_SUCCESS)
>> >> +		goto err_out;
>> >> +
>> >> +	if (hyp_pagesize_rhicall.gprs[0] != RHI_HOSTCONF_VER_1_0)
>> >> +		goto err_out;
>> >> +
>> >> +	hyp_pagesize_rhicall.imm = 0;
>> >> +	hyp_pagesize_rhicall.gprs[0] = RHI_HOSTCONF_FEATURES;
>> >> +	ret = rsi_host_call(lm_alias(&hyp_pagesize_rhicall));
>> >> +	if (ret != RSI_SUCCESS)
>> >> +		goto err_out;
>> >> +
>> >> +	if (!(hyp_pagesize_rhicall.gprs[0] & __RHI_HOSTCONF_GET_IPA_CHANGE_ALIGNMENT))
>> >> +		goto err_out;
>> >> +
>> >> +	hyp_pagesize_rhicall.imm = 0;
>> >> +	hyp_pagesize_rhicall.gprs[0] = RHI_HOSTCONF_GET_IPA_CHANGE_ALIGNMENT;
>> >> +	ret = rsi_host_call(lm_alias(&hyp_pagesize_rhicall));
>> >> +	if (ret != RSI_SUCCESS)
>> >> +		goto err_out;
>> >> +
>> >> +	ipa_change_align = hyp_pagesize_rhicall.gprs[0];
>> >> +	/* This error needs special handling in the caller */
>> >> +	if (ipa_change_align & (SZ_4K - 1))
>> >> +		return 0;
>> >> +
>> >> +	return ipa_change_align;
>> >> +
>> >> +err_out:
>> >> +	/*
>> >> +	 * For failure condition assume host is built with 4K page size
>> >> +	 * and hence ipa change alignment can be guest PAGE_SIZE.
>> >> +	 */
>> >> +	return PAGE_SIZE;
>> >> +}
>> >
>> > Why can't this be part of rsi.c? This is an RSI call, and it should be
>> > part of the RSI initialisation.
>> >
>> 
>> This is an RHI call as per the specification, hence it has been added to
>> rhi.c.
>
> News flash: this is the Linux kernel, not an ARM spec. We organise
> things based on the logical use, not on the TLA associated with it.
>
> And RHI is implemented in terms of RSI. In rsi.c it goes. We don't
> need this pointless proliferation of helper files that only result in
> equally pointless global symbols.
>
>> 
>> >> diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c
>> >> index 9e846ce4ef9c..ff735c04e236 100644
>> >> --- a/arch/arm64/kernel/rsi.c
>> >> +++ b/arch/arm64/kernel/rsi.c
>> >> @@ -14,8 +14,10 @@
>> >>  #include <asm/mem_encrypt.h>
>> >>  #include <asm/pgtable.h>
>> >>  #include <asm/rsi.h>
>> >> +#include <asm/rhi.h>
>> >>  
>> >>  static struct realm_config config;
>> >> +static unsigned long ipa_change_alignment = PAGE_SIZE;
>> >>  
>> >>  unsigned long prot_ns_shared;
>> >>  EXPORT_SYMBOL(prot_ns_shared);
>> >> @@ -139,6 +141,11 @@ static int realm_ioremap_hook(phys_addr_t phys, size_t size, pgprot_t *prot)
>> >>  	return 0;
>> >>  }
>> >>  
>> >> +unsigned long realm_get_hyp_pagesize(void)
>> >> +{
>> >> +	return ipa_change_alignment;
>> >> +}
>> >
>> > Again, this has nothing to do with the hypervisor, but the host. And
>> > ipa_change_alignment is still a wording I can't wrap my small head
>> > around.
>> >
>> >> +
>> >>  void __init arm64_rsi_init(void)
>> >>  {
>> >>  	if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC)
>> >> @@ -147,6 +154,12 @@ void __init arm64_rsi_init(void)
>> >>  		return;
>> >>  	if (WARN_ON(rsi_get_realm_config(&config)))
>> >>  		return;
>> >> +
>> >> +	ipa_change_alignment = rhi_get_ipa_change_alignment();
>> >> +	/* If we don't get a correct alignment response, don't enable realm */
>> >> +	if (!ipa_change_alignment)
>> >> +		return;
>> >
>> > But at the same time, you override a global value with an error, and
>> > then paper over it in mem_decrypt_granule_size()...
>> >
>> 
>> 
>> I believe I received similar feedback on my previous version as well,
>> which I didn’t quite follow.
>
> And you didn't think of asking? Sometimes I wonder what these patch
> reviews are for... Just to waste some more electrons, I guess :-/.
>
>> 
>> rhi_get_ipa_change_alignment() only returns an error when the host
>> returns a size that is not 4K-aligned. Otherwise, it returns the
>> host-determined size, or defaults to guest PAGE_SIZE if the RHI call
>> itself is not supported.
>
> You encode the error as 0. You override ipa_change_alignment with 0.
>
> Then...
>
>> >> +size_t mem_decrypt_granule_size(void)
>> >> +{
>> >> +	if (is_realm_world())
>> >> +		return max(PAGE_SIZE, realm_get_hyp_pagesize());
>> >
>> > If you didn't mess with ipa_change_alignment above, you shouldn't need
>> > this max().
>> >
>> 
>> size_t mem_decrypt_granule_size(void)
>> {
>> 	if (is_realm_world())
>> 		return max(PAGE_SIZE, realm_get_hyp_pagesize());
>> 	return PAGE_SIZE;
>> }
>> 
>> That needs to use max(), because we should align to the guest PAGE_SIZE
>> if it is larger than the host-specified alignment value.
>
> ... you need to correct that back to PAGE_SIZE because you have stored
> something smaller than PAGE_SIZE.
>
> Isn't the problem really obvious? ipa_change_alignment can *NEVER* go
> down. It should never be allowed to reduce, because that's exactly
> the property you are trying to enforce.
>

Sure, I will update rhi_get_ipa_change_alignment() to always return the
max value.

-aneesh

^ permalink raw reply

* Re: [PATCH RFC v5 24/53] KVM: SEV: Make 'uaddr' parameter optional for KVM_SEV_SNP_LAUNCH_UPDATE
From: Ackerley Tng @ 2026-04-28 23:40 UTC (permalink / raw)
  To: Ackerley Tng via B4 Relay, aik, andrew.jones, binbin.wu, brauner,
	chao.p.peng, david, ira.weiny, jmattson, jthoughton, michael.roth,
	oupton, pankaj.gupta, qperret, rick.p.edgecombe, rientjes,
	shivankg, steven.price, tabba, willy, wyihan, yan.y.zhao,
	forkloop, pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260428-gmem-inplace-conversion-v5-24-d8608ccfca22@google.com>

Ackerley Tng via B4 Relay <devnull+ackerleytng.google.com@kernel.org>
writes:

> From: Michael Roth <michael.roth@amd.com>
>

Thanks Michael!

>
> [...snip...]
>
>
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index c2126b3c30724..bf10d24907a00 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -2343,7 +2343,15 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
>  	int level;
>  	int ret;
>
> -	if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_page))
> +	/*
> +	 * For vm_memory_attributes=1, in-place conversion/population is not
> +	 * supported, so the initial contents necessarily need to come from a
> +	 * separate src address. For vm_memory_attributes=0, this isn't
> +	 * necessarily the case, since the pages may have been populated
> +	 * directly from userspace before calling KVM_SEV_SNP_LAUNCH_UPDATE.
> +	 */

I dropped the #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES from [1] since
vm_memory_attributes is #define-d as false when
CONFIG_KVM_VM_MEMORY_ATTRIBUTES is not defined.

> +	if (vm_memory_attributes &&
> +	    sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_page)
>  		return -EINVAL;
>
>  	ret = snp_lookup_rmpentry((u64)pfn, &assigned, &level);

[1] https://github.com/AMDESE/linux/commit/7e7c29afdf3763822ced0b7007fc0f93b8fb993d

>
> [...snip...]
>

^ permalink raw reply

* [POC PATCH 6/6] KVM: selftests: Test content modes ZERO and PRESERVE for SNP
From: Ackerley Tng @ 2026-04-28 23:33 UTC (permalink / raw)
  To: devnull+ackerleytng.google.com
  Cc: ackerleytng, aik, akpm, andrew.jones, aneesh.kumar, axelrasmussen,
	baohua, bhe, binbin.wu, bp, brauner, chao.p.peng, chrisl, corbet,
	dave.hansen, david, forkloop, hpa, ira.weiny, jgg, jmattson,
	jthoughton, kas, kasong, kvm, linux-coco, linux-doc, linux-kernel,
	linux-kselftest, linux-mm, linux-trace-kernel, mathieu.desnoyers,
	mhiramat, michael.roth, mingo, nphamcs, oupton, pankaj.gupta,
	pbonzini, pratyush, qi.zheng, qperret, rick.p.edgecombe, rientjes,
	rostedt, seanjc, shakeel.butt, shikemeng, shivankg, shuah, skhan,
	steven.price, suzuki.poulose, tabba, tglx, vannapurve, vbabka,
	weixugc, willy, wyihan, x86, yan.y.zhao, youngjun.park, yuanchu
In-Reply-To: <cover.1777418884.git.ackerleytng@google.com>

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 .../selftests/kvm/x86/sev_smoke_test.c        | 47 +++++++++++++++++--
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86/sev_smoke_test.c b/tools/testing/selftests/kvm/x86/sev_smoke_test.c
index 86f17e59e9392..7a91a113c4fb7 100644
--- a/tools/testing/selftests/kvm/x86/sev_smoke_test.c
+++ b/tools/testing/selftests/kvm/x86/sev_smoke_test.c
@@ -365,7 +365,26 @@ static void guest_code_conversion(u8 *test_shared_gva, u8 *test_private_gva, u64
 	vmgexit();
 }
 
-static void test_conversion(u64 policy)
+static void vm_set_memory_attributes_expect_error(struct kvm_vm *vm, u64 gpa,
+						  size_t size, u64 attributes,
+						  u64 flags, int expected_errno)
+{
+	loff_t error_offset = -1;
+	size_t len_ignored;
+	loff_t offset;
+	int gmem_fd;
+	int ret;
+
+	gmem_fd = kvm_gpa_to_guest_memfd(vm, gpa, &offset, &len_ignored);
+	ret = __gmem_set_memory_attributes(gmem_fd, offset, size, attributes,
+					   &error_offset, flags);
+
+	TEST_ASSERT_EQ(ret, -1);
+	TEST_ASSERT_EQ(offset, error_offset);
+	TEST_ASSERT_EQ(errno, expected_errno);
+}
+
+static void test_conversion(u64 policy, u64 content_mode)
 {
 	gva_t test_private_gva;
 	gva_t test_shared_gva;
@@ -409,6 +428,21 @@ static void test_conversion(u64 policy)
 	TEST_ASSERT_EQ(vcpu->run->hypercall.args[1], 1);
 	TEST_ASSERT_EQ(vcpu->run->hypercall.args[2], KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
 
+	/* ZERO is never supported when setting memory attributes to private. */
+	vm_set_memory_attributes_expect_error(vm, test_gpa, PAGE_SIZE,
+					      KVM_MEMORY_ATTRIBUTE_PRIVATE,
+					      KVM_SET_MEMORY_ATTRIBUTES2_ZERO,
+					      EOPNOTSUPP);
+
+	/* PRESERVE is not supported for SNP. */
+	vm_set_memory_attributes_expect_error(vm, test_gpa, PAGE_SIZE, 0,
+					      KVM_SET_MEMORY_ATTRIBUTES2_PRESERVE,
+					      EOPNOTSUPP);
+	vm_set_memory_attributes_expect_error(vm, test_gpa, PAGE_SIZE,
+					      KVM_MEMORY_ATTRIBUTE_PRIVATE,
+					      KVM_SET_MEMORY_ATTRIBUTES2_PRESERVE,
+					      EOPNOTSUPP);
+
 	vm_mem_set_private(vm, test_gpa, PAGE_SIZE, KVM_SET_MEMORY_ATTRIBUTES2_MODE_UNSPECIFIED);
 
 	vcpu_run(vcpu);
@@ -419,7 +453,12 @@ static void test_conversion(u64 policy)
 	TEST_ASSERT_EQ(vcpu->run->hypercall.args[1], 1);
 	TEST_ASSERT_EQ(vcpu->run->hypercall.args[2], KVM_MAP_GPA_RANGE_DECRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
 
-	vm_mem_set_shared(vm, test_gpa, PAGE_SIZE, KVM_SET_MEMORY_ATTRIBUTES2_MODE_UNSPECIFIED);
+	vm_mem_set_shared(vm, test_gpa, PAGE_SIZE, content_mode);
+
+	if (content_mode == KVM_SET_MEMORY_ATTRIBUTES2_ZERO)
+		TEST_ASSERT_EQ(READ_ONCE(*(u8 *)test_hva), 0);
+	else
+		fprintf(stderr, "test_hva contents = %x\n", READ_ONCE(*(u8 *)test_hva));
 
 	vcpu_run(vcpu);
 
@@ -441,7 +480,9 @@ int main(int argc, char *argv[])
 	// 	test_sev_smoke(guest_sev_es_code, KVM_X86_SEV_ES_VM, SEV_POLICY_ES);
 
 	if (kvm_cpu_has(X86_FEATURE_SEV_SNP)) {
-		test_conversion(snp_default_policy());
+		test_conversion(snp_default_policy(), KVM_SET_MEMORY_ATTRIBUTES2_MODE_UNSPECIFIED);
+		test_conversion(snp_default_policy(), KVM_SET_MEMORY_ATTRIBUTES2_ZERO);
+
 		// test_sev_smoke(guest_snp_code, KVM_X86_SNP_VM, snp_default_policy());
 	}
 
-- 
2.54.0.545.g6539524ca2-goog


^ permalink raw reply related

* [POC PATCH 5/6] KVM: selftests: Test conversions for SNP
From: Ackerley Tng @ 2026-04-28 23:33 UTC (permalink / raw)
  To: devnull+ackerleytng.google.com
  Cc: ackerleytng, aik, akpm, andrew.jones, aneesh.kumar, axelrasmussen,
	baohua, bhe, binbin.wu, bp, brauner, chao.p.peng, chrisl, corbet,
	dave.hansen, david, forkloop, hpa, ira.weiny, jgg, jmattson,
	jthoughton, kas, kasong, kvm, linux-coco, linux-doc, linux-kernel,
	linux-kselftest, linux-mm, linux-trace-kernel, mathieu.desnoyers,
	mhiramat, michael.roth, mingo, nphamcs, oupton, pankaj.gupta,
	pbonzini, pratyush, qi.zheng, qperret, rick.p.edgecombe, rientjes,
	rostedt, seanjc, shakeel.butt, shikemeng, shivankg, shuah, skhan,
	steven.price, suzuki.poulose, tabba, tglx, vannapurve, vbabka,
	weixugc, willy, wyihan, x86, yan.y.zhao, youngjun.park, yuanchu
In-Reply-To: <cover.1777418884.git.ackerleytng@google.com>

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 .../selftests/kvm/x86/sev_smoke_test.c        | 190 +++++++++++++++++-
 1 file changed, 185 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86/sev_smoke_test.c b/tools/testing/selftests/kvm/x86/sev_smoke_test.c
index 8b859adf4cf6f..86f17e59e9392 100644
--- a/tools/testing/selftests/kvm/x86/sev_smoke_test.c
+++ b/tools/testing/selftests/kvm/x86/sev_smoke_test.c
@@ -253,17 +253,197 @@ static void test_sev_smoke(void *guest, u32 type, u64 policy)
 	}
 }
 
+#define GHCB_MSR_REG_GPA_REQ		0x012
+#define GHCB_MSR_REG_GPA_REQ_VAL(v)                \
+	/* GHCBData[63:12] */                      \
+	(((u64)((v) & GENMASK_ULL(51, 0)) << 12) | \
+	 /* GHCBData[11:0] */			   \
+	 GHCB_MSR_REG_GPA_REQ)
+
+#define GHCB_MSR_REG_GPA_RESP		0x013
+#define GHCB_MSR_REG_GPA_RESP_VAL(v)			\
+	/* GHCBData[63:12] */				\
+	(((u64)(v) & GENMASK_ULL(63, 12)) >> 12)
+
+#define GHCB_DATA_LOW			12
+#define GHCB_MSR_INFO_MASK		(BIT_ULL(GHCB_DATA_LOW) - 1)
+#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK)
+
+/*
+ * SNP Page State Change Operation
+ *
+ * GHCBData[55:52] - Page operation:
+ *   0x0001	Page assignment, Private
+ *   0x0002	Page assignment, Shared
+ */
+enum psc_op {
+	SNP_PAGE_STATE_PRIVATE = 1,
+	SNP_PAGE_STATE_SHARED,
+};
+
+#define GHCB_MSR_PSC_REQ		0x014
+#define GHCB_MSR_PSC_REQ_GFN(gfn, op)			\
+	/* GHCBData[55:52] */				\
+	(((u64)((op) & 0xf) << 52) |			\
+	/* GHCBData[51:12] */				\
+	((u64)((gfn) & GENMASK_ULL(39, 0)) << 12) |	\
+	/* GHCBData[11:0] */				\
+	GHCB_MSR_PSC_REQ)
+
+#define GHCB_MSR_PSC_RESP		0x015
+#define GHCB_MSR_PSC_RESP_VAL(val)			\
+	/* GHCBData[63:32] */				\
+	(((u64)(val) & GENMASK_ULL(63, 32)) >> 32)
+
+static u64 ghcb_gpa;
+static void snp_register_ghcb(void)
+{
+	u64 ghcb_pfn = ghcb_gpa >> PAGE_SHIFT;
+	u64 val;
+
+	GUEST_ASSERT(ghcb_gpa);
+
+	wrmsr(MSR_AMD64_SEV_ES_GHCB, GHCB_MSR_REG_GPA_REQ_VAL(ghcb_gpa >> PAGE_SHIFT));
+	vmgexit();
+
+	val = rdmsr(MSR_AMD64_SEV_ES_GHCB);
+	GUEST_ASSERT_EQ(GHCB_RESP_CODE(val), GHCB_MSR_REG_GPA_RESP);
+	GUEST_ASSERT_EQ(GHCB_MSR_REG_GPA_RESP_VAL(val), ghcb_pfn);
+}
+
+static void snp_page_state_change(u64 gpa, enum psc_op op)
+{
+	u64 val;
+
+	wrmsr(MSR_AMD64_SEV_ES_GHCB, GHCB_MSR_PSC_REQ_GFN(gpa >> PAGE_SHIFT, op));
+	vmgexit();
+
+	val = rdmsr(MSR_AMD64_SEV_ES_GHCB);
+	GUEST_ASSERT_EQ(GHCB_RESP_CODE(val), GHCB_MSR_PSC_RESP);
+	GUEST_ASSERT_EQ(GHCB_MSR_PSC_RESP_VAL(val), 0);
+}
+
+#define RMP_PG_SIZE_4K			0
+static inline void pvalidate(void *vaddr, bool validate)
+{
+	bool no_rmpupdate;
+	int rc;
+
+	/* "pvalidate" mnemonic support in binutils 2.36 and newer */
+	asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFF\n\t"
+		     : "=@ccc"(no_rmpupdate), "=a"(rc)
+		     : "a"(vaddr), "c"(RMP_PG_SIZE_4K), "d"(validate)
+		     : "memory", "cc");
+
+	GUEST_ASSERT(!no_rmpupdate);
+	GUEST_ASSERT_EQ(rc, 0);
+}
+
+#define CONVERSION_TEST_VALUE_SHARED_1 0xab
+#define CONVERSION_TEST_VALUE_SHARED_2 0xcd
+#define CONVERSION_TEST_VALUE_PRIVATE 0xef
+#define CONVERSION_TEST_VALUE_SHARED_3 0xbc
+static void guest_code_conversion(u8 *test_shared_gva, u8 *test_private_gva, u64 test_gpa)
+{
+	snp_register_ghcb();
+
+	GUEST_ASSERT_EQ(READ_ONCE(*test_shared_gva), CONVERSION_TEST_VALUE_SHARED_1);
+	WRITE_ONCE(*test_shared_gva, CONVERSION_TEST_VALUE_SHARED_2);
+
+	snp_page_state_change(test_gpa, SNP_PAGE_STATE_PRIVATE);
+	pvalidate(test_private_gva, true);
+
+	WRITE_ONCE(*test_private_gva, CONVERSION_TEST_VALUE_PRIVATE);
+	GUEST_ASSERT_EQ(READ_ONCE(*test_private_gva), CONVERSION_TEST_VALUE_PRIVATE);
+
+	pvalidate(test_private_gva, false);
+	snp_page_state_change(test_gpa, SNP_PAGE_STATE_SHARED);
+
+	WRITE_ONCE(*test_shared_gva, CONVERSION_TEST_VALUE_SHARED_3);
+
+	wrmsr(MSR_AMD64_SEV_ES_GHCB, GHCB_MSR_TERM_REQ);
+	vmgexit();
+}
+
+static void test_conversion(u64 policy)
+{
+	gva_t test_private_gva;
+	gva_t test_shared_gva;
+	struct kvm_vcpu *vcpu;
+	gva_t ghcb_gva;
+	gpa_t test_gpa;
+	struct kvm_vm *vm;
+	void *ghcb_hva;
+	void *test_hva;
+
+	vm = vm_sev_create_with_one_vcpu(KVM_X86_SNP_VM, guest_code_conversion, &vcpu);
+
+	ghcb_gva = vm_alloc_shared(vm, PAGE_SIZE, KVM_UTIL_MIN_VADDR,
+				   MEM_REGION_TEST_DATA);
+	ghcb_hva = addr_gva2hva(vm, ghcb_gva);
+	ghcb_gpa = addr_gva2gpa(vm, ghcb_gva);
+	sync_global_to_guest(vm, ghcb_gpa);
+
+	test_shared_gva = vm_alloc_shared(vm, PAGE_SIZE, KVM_UTIL_MIN_VADDR,
+					  MEM_REGION_TEST_DATA);
+	test_hva = addr_gva2hva(vm, test_shared_gva);
+	test_gpa = addr_gva2gpa(vm, test_shared_gva);
+
+	test_private_gva = vm_unused_gva_gap(vm, PAGE_SIZE, KVM_UTIL_MIN_VADDR);
+	___virt_pg_map(vm, &vm->mmu, test_private_gva, test_gpa, PG_SIZE_4K, true);
+
+	vcpu_args_set(vcpu, 3, test_shared_gva, test_private_gva, test_gpa);
+
+	vm_sev_launch(vm, policy, NULL);
+
+	WRITE_ONCE(*(u8 *)test_hva, CONVERSION_TEST_VALUE_SHARED_1);
+
+	fprintf(stderr, "ghcb_hva=%p ghcb_gpa=%lx ghcb_gva=%lx\n", ghcb_hva, ghcb_gpa, ghcb_gva);
+	fprintf(stderr, "test_hva=%p test_gpa=%lx test_private_gva=%lx test_shared_gva=%lx\n", test_hva, test_gpa, test_private_gva, test_shared_gva);
+
+	vcpu_run(vcpu);
+
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_HYPERCALL);
+	TEST_ASSERT_EQ(vcpu->run->hypercall.nr, KVM_HC_MAP_GPA_RANGE);
+	TEST_ASSERT_EQ(vcpu->run->hypercall.args[0], test_gpa);
+	TEST_ASSERT_EQ(vcpu->run->hypercall.args[1], 1);
+	TEST_ASSERT_EQ(vcpu->run->hypercall.args[2], KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
+
+	vm_mem_set_private(vm, test_gpa, PAGE_SIZE, KVM_SET_MEMORY_ATTRIBUTES2_MODE_UNSPECIFIED);
+
+	vcpu_run(vcpu);
+
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_HYPERCALL);
+	TEST_ASSERT_EQ(vcpu->run->hypercall.nr, KVM_HC_MAP_GPA_RANGE);
+	TEST_ASSERT_EQ(vcpu->run->hypercall.args[0], test_gpa);
+	TEST_ASSERT_EQ(vcpu->run->hypercall.args[1], 1);
+	TEST_ASSERT_EQ(vcpu->run->hypercall.args[2], KVM_MAP_GPA_RANGE_DECRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
+
+	vm_mem_set_shared(vm, test_gpa, PAGE_SIZE, KVM_SET_MEMORY_ATTRIBUTES2_MODE_UNSPECIFIED);
+
+	vcpu_run(vcpu);
+
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SYSTEM_EVENT);
+	TEST_ASSERT_EQ(vcpu->run->system_event.type, KVM_SYSTEM_EVENT_SEV_TERM);
+	TEST_ASSERT_EQ(vcpu->run->system_event.ndata, 1);
+	TEST_ASSERT_EQ(vcpu->run->system_event.data[0], GHCB_MSR_TERM_REQ);
+
+	TEST_ASSERT_EQ(*(u8 *)test_hva, CONVERSION_TEST_VALUE_SHARED_3);
+}
+
 int main(int argc, char *argv[])
 {
 	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV));
 
-	test_sev_smoke(guest_sev_code, KVM_X86_SEV_VM, 0);
+	// test_sev_smoke(guest_sev_code, KVM_X86_SEV_VM, 0);
 
-	if (kvm_cpu_has(X86_FEATURE_SEV_ES))
-		test_sev_smoke(guest_sev_es_code, KVM_X86_SEV_ES_VM, SEV_POLICY_ES);
+	// if (kvm_cpu_has(X86_FEATURE_SEV_ES))
+	// 	test_sev_smoke(guest_sev_es_code, KVM_X86_SEV_ES_VM, SEV_POLICY_ES);
 
-	if (kvm_cpu_has(X86_FEATURE_SEV_SNP))
-		test_sev_smoke(guest_snp_code, KVM_X86_SNP_VM, snp_default_policy());
+	if (kvm_cpu_has(X86_FEATURE_SEV_SNP)) {
+		test_conversion(snp_default_policy());
+		// test_sev_smoke(guest_snp_code, KVM_X86_SNP_VM, snp_default_policy());
+	}
 
 	return 0;
 }
-- 
2.54.0.545.g6539524ca2-goog


^ permalink raw reply related

* [POC PATCH 4/6] KVM: selftests: Allow specifying CoCo-privateness while mapping a page
From: Ackerley Tng @ 2026-04-28 23:33 UTC (permalink / raw)
  To: devnull+ackerleytng.google.com
  Cc: ackerleytng, aik, akpm, andrew.jones, aneesh.kumar, axelrasmussen,
	baohua, bhe, binbin.wu, bp, brauner, chao.p.peng, chrisl, corbet,
	dave.hansen, david, forkloop, hpa, ira.weiny, jgg, jmattson,
	jthoughton, kas, kasong, kvm, linux-coco, linux-doc, linux-kernel,
	linux-kselftest, linux-mm, linux-trace-kernel, mathieu.desnoyers,
	mhiramat, michael.roth, mingo, nphamcs, oupton, pankaj.gupta,
	pbonzini, pratyush, qi.zheng, qperret, rick.p.edgecombe, rientjes,
	rostedt, seanjc, shakeel.butt, shikemeng, shivankg, shuah, skhan,
	steven.price, suzuki.poulose, tabba, tglx, vannapurve, vbabka,
	weixugc, willy, wyihan, x86, yan.y.zhao, youngjun.park, yuanchu
In-Reply-To: <cover.1777418884.git.ackerleytng@google.com>

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 tools/testing/selftests/kvm/include/x86/processor.h |  2 ++
 tools/testing/selftests/kvm/lib/x86/processor.c     | 13 ++++++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index 77f576ee7789d..683f21452db58 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -1507,6 +1507,8 @@ enum pg_level {
 void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels,
 		  struct pte_masks *pte_masks);
 
+void ___virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, gva_t gva,
+		    gpa_t gpa,  int level, bool private);
 void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, gva_t gva,
 		   gpa_t gpa,  int level);
 void virt_map_level(struct kvm_vm *vm, gva_t gva, gpa_t gpa,
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index b51467d70f6e7..02781194f51a2 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -256,8 +256,8 @@ static u64 *virt_create_upper_pte(struct kvm_vm *vm,
 	return pte;
 }
 
-void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, gva_t gva,
-		   gpa_t gpa, int level)
+void ___virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, gva_t gva,
+		    gpa_t gpa, int level, bool private)
 {
 	const u64 pg_size = PG_LEVEL_SIZE(level);
 	u64 *pte = &mmu->pgd;
@@ -309,12 +309,19 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, gva_t gva,
 	 * Neither SEV nor TDX supports shared page tables, so only the final
 	 * leaf PTE needs manually set the C/S-bit.
 	 */
-	if (vm_is_gpa_protected(vm, gpa))
+	if (private)
 		*pte |= PTE_C_BIT_MASK(mmu);
 	else
 		*pte |= PTE_S_BIT_MASK(mmu);
 }
 
+void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, gva_t gva,
+		   gpa_t gpa, int level)
+{
+	___virt_pg_map(vm, mmu, gva, gpa, level,
+		       vm_is_gpa_protected(vm, gpa));
+}
+
 void virt_arch_pg_map(struct kvm_vm *vm, gva_t gva, gpa_t gpa)
 {
 	__virt_pg_map(vm, &vm->mmu, gva, gpa, PG_LEVEL_4K);
-- 
2.54.0.545.g6539524ca2-goog


^ permalink raw reply related

* [POC PATCH 3/6] KVM: selftests: Make guest_code_xsave more friendly
From: Ackerley Tng @ 2026-04-28 23:33 UTC (permalink / raw)
  To: devnull+ackerleytng.google.com
  Cc: ackerleytng, aik, akpm, andrew.jones, aneesh.kumar, axelrasmussen,
	baohua, bhe, binbin.wu, bp, brauner, chao.p.peng, chrisl, corbet,
	dave.hansen, david, forkloop, hpa, ira.weiny, jgg, jmattson,
	jthoughton, kas, kasong, kvm, linux-coco, linux-doc, linux-kernel,
	linux-kselftest, linux-mm, linux-trace-kernel, mathieu.desnoyers,
	mhiramat, michael.roth, mingo, nphamcs, oupton, pankaj.gupta,
	pbonzini, pratyush, qi.zheng, qperret, rick.p.edgecombe, rientjes,
	rostedt, seanjc, shakeel.butt, shikemeng, shivankg, shuah, skhan,
	steven.price, suzuki.poulose, tabba, tglx, vannapurve, vbabka,
	weixugc, willy, wyihan, x86, yan.y.zhao, youngjun.park, yuanchu
In-Reply-To: <cover.1777418884.git.ackerleytng@google.com>

The original implementation of guest_code_xsave makes a jmp to
guest_sev_es_code in inline assembly. When code that uses guest_sev_es_code
is removed, guest_sev_es_code will be optimized out, leading to a linking
error since guest_code_xsave still tries to jmp to guest_sev_es_code.

Rewrite guest_code_xsave() to instead make a call, in C, to
guest_sev_es_code(), so that usage of guest_sev_es_code() is made known to
the compiler.

This rewriting also gives a name to the xsave inline assembly, improving
readability.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 .../selftests/kvm/x86/sev_smoke_test.c        | 24 +++++++++++++------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86/sev_smoke_test.c b/tools/testing/selftests/kvm/x86/sev_smoke_test.c
index 1a49ee3915864..8b859adf4cf6f 100644
--- a/tools/testing/selftests/kvm/x86/sev_smoke_test.c
+++ b/tools/testing/selftests/kvm/x86/sev_smoke_test.c
@@ -80,13 +80,23 @@ static void guest_sev_code(void)
 	GUEST_DONE();
 }
 
-/* Stash state passed via VMSA before any compiled code runs.  */
-extern void guest_code_xsave(void);
-asm("guest_code_xsave:\n"
-    "mov $" __stringify(XFEATURE_MASK_X87_AVX) ", %eax\n"
-    "xor %edx, %edx\n"
-    "xsave (%rdi)\n"
-    "jmp guest_sev_es_code");
+static void xsave_all_registers(void *addr)
+{
+	__asm__ __volatile__(
+		"mov $" __stringify(XFEATURE_MASK_X87_AVX) ", %eax\n"
+		"xor %edx, %edx\n"
+		"xsave (%0)"
+		:
+		: "r"(addr)
+		: "eax", "edx", "memory"
+	 );
+}
+
+static void guest_code_xsave(void *vmsa_gva)
+{
+	xsave_all_registers(vmsa_gva);
+	guest_sev_es_code();
+}
 
 static void compare_xsave(u8 *from_host, u8 *from_guest)
 {
-- 
2.54.0.545.g6539524ca2-goog


^ permalink raw reply related

* [POC PATCH 1/6] KVM: selftests: Initialize guest_memfd with INIT_SHARED
From: Ackerley Tng @ 2026-04-28 23:33 UTC (permalink / raw)
  To: devnull+ackerleytng.google.com
  Cc: ackerleytng, aik, akpm, andrew.jones, aneesh.kumar, axelrasmussen,
	baohua, bhe, binbin.wu, bp, brauner, chao.p.peng, chrisl, corbet,
	dave.hansen, david, forkloop, hpa, ira.weiny, jgg, jmattson,
	jthoughton, kas, kasong, kvm, linux-coco, linux-doc, linux-kernel,
	linux-kselftest, linux-mm, linux-trace-kernel, mathieu.desnoyers,
	mhiramat, michael.roth, mingo, nphamcs, oupton, pankaj.gupta,
	pbonzini, pratyush, qi.zheng, qperret, rick.p.edgecombe, rientjes,
	rostedt, seanjc, shakeel.butt, shikemeng, shivankg, shuah, skhan,
	steven.price, suzuki.poulose, tabba, tglx, vannapurve, vbabka,
	weixugc, willy, wyihan, x86, yan.y.zhao, youngjun.park, yuanchu,
	Sagi Shahar
In-Reply-To: <cover.1777418884.git.ackerleytng@google.com>

Initialize guest_memfd with INIT_SHARED for VM types that require
guest_memfd.

Memory in the first memslot is used by the selftest framework to load
code, page tables, interrupt descriptor tables, and basically everything
the selftest needs to run. The selftest framework sets all of these up
assuming that the memory in the memslot can be written to from the
host. Align with that behavior by initializing guest_memfd as shared so
that all the writes from the host are permitted.

guest_memfd memory can later be marked private if necessary by CoCo
platform-specific initialization functions.

Suggested-by: Sagi Shahar <sagis@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 216d6e037153c..3811aef8c98cd 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -483,8 +483,10 @@ struct kvm_vm *__vm_create(struct vm_shape shape, u32 nr_runnable_vcpus,
 {
 	u64 nr_pages = vm_nr_pages_required(shape.mode, nr_runnable_vcpus,
 						 nr_extra_pages);
+	enum vm_mem_backing_src_type src_type;
 	struct userspace_mem_region *slot0;
 	struct kvm_vm *vm;
+	u64 gmem_flags;
 	int i, flags;
 
 	kvm_set_files_rlimit(nr_runnable_vcpus);
@@ -502,7 +504,15 @@ struct kvm_vm *__vm_create(struct vm_shape shape, u32 nr_runnable_vcpus,
 	if (is_guest_memfd_required(shape))
 		flags |= KVM_MEM_GUEST_MEMFD;
 
-	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, flags);
+	gmem_flags = 0;
+	src_type = VM_MEM_SRC_ANONYMOUS;
+	if (is_guest_memfd_required(shape) && kvm_has_gmem_attributes) {
+		src_type = VM_MEM_SRC_SHMEM;
+		gmem_flags = GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED;
+	}
+
+	vm_mem_add(vm, src_type, 0, 0, nr_pages, flags, -1, 0, gmem_flags);
+
 	for (i = 0; i < NR_MEM_REGIONS; i++)
 		vm->memslots[i] = 0;
 
-- 
2.54.0.545.g6539524ca2-goog


^ permalink raw reply related

* [POC PATCH 2/6] KVM: selftests: Use guest_memfd memory contents in-place for SNP launch update
From: Ackerley Tng @ 2026-04-28 23:33 UTC (permalink / raw)
  To: devnull+ackerleytng.google.com
  Cc: ackerleytng, aik, akpm, andrew.jones, aneesh.kumar, axelrasmussen,
	baohua, bhe, binbin.wu, bp, brauner, chao.p.peng, chrisl, corbet,
	dave.hansen, david, forkloop, hpa, ira.weiny, jgg, jmattson,
	jthoughton, kas, kasong, kvm, linux-coco, linux-doc, linux-kernel,
	linux-kselftest, linux-mm, linux-trace-kernel, mathieu.desnoyers,
	mhiramat, michael.roth, mingo, nphamcs, oupton, pankaj.gupta,
	pbonzini, pratyush, qi.zheng, qperret, rick.p.edgecombe, rientjes,
	rostedt, seanjc, shakeel.butt, shikemeng, shivankg, shuah, skhan,
	steven.price, suzuki.poulose, tabba, tglx, vannapurve, vbabka,
	weixugc, willy, wyihan, x86, yan.y.zhao, youngjun.park, yuanchu
In-Reply-To: <cover.1777418884.git.ackerleytng@google.com>

Update the SEV-SNP launch update flow to utilize guest_memfd in-place
conversion.

Include the KVM_SET_MEMORY_ATTRIBUTES2_PRESERVE flag when setting memory
attributes to private. This is permitted before the SNP VM is finalized.

In snp_launch_update_data, pass 0 as the host virtual address. This
instructs the kernel to perform the launch update using the guest_memfd
backing the guest physical address rather than a userspace-provided
buffer.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 tools/testing/selftests/kvm/lib/x86/sev.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/kvm/lib/x86/sev.c b/tools/testing/selftests/kvm/lib/x86/sev.c
index d0205b3299e0b..72b2935871fe4 100644
--- a/tools/testing/selftests/kvm/lib/x86/sev.c
+++ b/tools/testing/selftests/kvm/lib/x86/sev.c
@@ -32,13 +32,14 @@ static void encrypt_region(struct kvm_vm *vm, struct userspace_mem_region *regio
 		const u64 size = (j - i + 1) * vm->page_size;
 		const u64 offset = (i - lowest_page_in_region) * vm->page_size;
 
-		if (private)
-			vm_mem_set_private(vm, gpa_base + offset, size, 0);
+		if (private) {
+			vm_mem_set_private(vm, gpa_base + offset, size,
+					   KVM_SET_MEMORY_ATTRIBUTES2_PRESERVE);
+		}
 
 		if (is_sev_snp_vm(vm))
 			snp_launch_update_data(vm, gpa_base + offset,
-					       (u64)addr_gpa2hva(vm, gpa_base + offset),
-					       size, page_type);
+					       0, size, page_type);
 		else
 			sev_launch_update_data(vm, gpa_base + offset, size);
 
-- 
2.54.0.545.g6539524ca2-goog


^ permalink raw reply related

* [POC PATCH 0/6] guest_memfd in-place conversion selftests for SNP
From: Ackerley Tng @ 2026-04-28 23:33 UTC (permalink / raw)
  To: devnull+ackerleytng.google.com
  Cc: ackerleytng, aik, akpm, andrew.jones, aneesh.kumar, axelrasmussen,
	baohua, bhe, binbin.wu, bp, brauner, chao.p.peng, chrisl, corbet,
	dave.hansen, david, forkloop, hpa, ira.weiny, jgg, jmattson,
	jthoughton, kas, kasong, kvm, linux-coco, linux-doc, linux-kernel,
	linux-kselftest, linux-mm, linux-trace-kernel, mathieu.desnoyers,
	mhiramat, michael.roth, mingo, nphamcs, oupton, pankaj.gupta,
	pbonzini, pratyush, qi.zheng, qperret, rick.p.edgecombe, rientjes,
	rostedt, seanjc, shakeel.butt, shikemeng, shivankg, shuah, skhan,
	steven.price, suzuki.poulose, tabba, tglx, vannapurve, vbabka,
	weixugc, willy, wyihan, x86, yan.y.zhao, youngjun.park, yuanchu
In-Reply-To: <20260428-gmem-inplace-conversion-v5-0-d8608ccfca22@google.com>

With these POC patches, I was able to test the set memory
attributes/conversion ioctls with SNP.

The content policies work, and PRESERVE can be used before the SNP VM
is finalized. SNP_LAUNCH_UPDATE can accept 0 for source address and
the SNP VM runs fine. :)

Ackerley Tng (6):
  KVM: selftests: Initialize guest_memfd with INIT_SHARED
  KVM: selftests: Use guest_memfd memory contents in-place for SNP
    launch update
  KVM: selftests: Make guest_code_xsave more friendly
  KVM: selftests: Allow specifying CoCo-privateness while mapping a page
  KVM: selftests: Test conversions for SNP
  KVM: selftests: Test content modes ZERO and PRESERVE for SNP

 .../selftests/kvm/include/x86/processor.h     |   2 +
 tools/testing/selftests/kvm/lib/kvm_util.c    |  12 +-
 .../testing/selftests/kvm/lib/x86/processor.c |  13 +-
 tools/testing/selftests/kvm/lib/x86/sev.c     |   9 +-
 .../selftests/kvm/x86/sev_smoke_test.c        | 255 +++++++++++++++++-
 5 files changed, 271 insertions(+), 20 deletions(-)

--
2.54.0.545.g6539524ca2-goog

^ permalink raw reply

* [PATCH RFC v5 48/53] KVM: selftests: Check fd/flags provided to mmap() when setting up memslot
From: Ackerley Tng via B4 Relay @ 2026-04-28 23:25 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260428-gmem-inplace-conversion-v5-0-d8608ccfca22@google.com>

From: Sean Christopherson <seanjc@google.com>

Check that a valid fd provided to mmap() is accompanied by MAP_SHARED.

With an invalid fd (usually used for anonymous mappings), there are no
constraints on mmap() flags.

Add this check to make sure that when a guest_memfd is used as region->fd,
the flags provided to mmap() include MAP_SHARED.

Signed-off-by: Sean Christopherson <seanjc@google.com>
[Rephrase assertion message.]
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 12e031a8fc20d..29b3f4e9fb4a7 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1088,6 +1088,9 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
 					     src_type == VM_MEM_SRC_SHARED_HUGETLB);
 	}
 
+	TEST_ASSERT(region->fd == -1 || backing_src_is_shared(src_type),
+		    "A valid fd provided to mmap() must be accompanied by MAP_SHARED.");
+
 	region->mmap_start = __kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE,
 					vm_mem_backing_src_alias(src_type)->flag,
 					region->fd, mmap_offset);

-- 
2.54.0.545.g6539524ca2-goog



^ permalink raw reply related

* [PATCH RFC v5 49/53] KVM: selftests: Make TEST_EXPECT_SIGBUS thread-safe
From: Ackerley Tng via B4 Relay @ 2026-04-28 23:25 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260428-gmem-inplace-conversion-v5-0-d8608ccfca22@google.com>

From: Ackerley Tng <ackerleytng@google.com>

The TEST_EXPECT_SIGBUS macro is not thread-safe as it uses a global
sigjmp_buf and installs a global SIGBUS signal handler. If multiple threads
execute the macro concurrently, they will race on installing the signal
handler and stomp on other threads' jump buffers, leading to incorrect test
behavior.

Make TEST_EXPECT_SIGBUS thread-safe with the following changes:

Share the KVM tests' global signal handler. sigaction() applies to all
threads; without sharing a global signal handler, one thread may have
removed the signal handler that another thread added, hence leading to
unexpected signals.

The alternative of layering signal handlers was considered, but calling
sigaction() within TEST_EXPECT_SIGBUS() necessarily creates a race. To
avoid adding new setup and teardown routines to do sigaction() and keep
usage of TEST_EXPECT_SIGBUS() simple, share the KVM tests' global signal
handler.

Opportunistically rename report_unexpected_signal to
catchall_signal_handler.

To continue to only expect SIGBUS within specific regions of code, use a
thread-specific variable, expecting_sigbus, to replace installing and
removing signal handlers.

Make the execution environment for the thread, sigjmp_buf, a
thread-specific variable.

As part of TEST_EXPECT_SIGBUS(), assert the prerequisite for this setup,
that the current signal handler is the catchall_signal_handler.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 tools/testing/selftests/kvm/include/test_util.h | 32 +++++++++++++------------
 tools/testing/selftests/kvm/lib/kvm_util.c      | 18 ++++++++++----
 tools/testing/selftests/kvm/lib/test_util.c     |  7 ------
 3 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index c280c3233f502..6907b99fe564b 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -82,21 +82,23 @@ do {									\
 	__builtin_unreachable(); \
 } while (0)
 
-extern sigjmp_buf expect_sigbus_jmpbuf;
-void expect_sigbus_handler(int signum);
-
-#define TEST_EXPECT_SIGBUS(action)						\
-do {										\
-	struct sigaction sa_old, sa_new = {					\
-		.sa_handler = expect_sigbus_handler,				\
-	};									\
-										\
-	sigaction(SIGBUS, &sa_new, &sa_old);					\
-	if (sigsetjmp(expect_sigbus_jmpbuf, 1) == 0) {				\
-		action;								\
-		TEST_FAIL("'%s' should have triggered SIGBUS", #action);	\
-	}									\
-	sigaction(SIGBUS, &sa_old, NULL);					\
+extern __thread sigjmp_buf expect_sigbus_jmpbuf;
+extern __thread volatile sig_atomic_t expecting_sigbus;
+extern void catchall_signal_handler(int signum);
+
+#define TEST_EXPECT_SIGBUS(action)					\
+do {									\
+	struct sigaction sa = {};					\
+									\
+	TEST_ASSERT_EQ(sigaction(SIGBUS, NULL, &sa), 0);		\
+	TEST_ASSERT_EQ(sa.sa_handler, &catchall_signal_handler);	\
+									\
+	expecting_sigbus = true;					\
+	if (sigsetjmp(expect_sigbus_jmpbuf, 1) == 0) {			\
+		action;							\
+		TEST_FAIL("'%s' should have triggered SIGBUS", #action);\
+	}								\
+	expecting_sigbus = false;					\
 } while (0)
 
 size_t parse_size(const char *size);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 29b3f4e9fb4a7..216d6e037153c 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -2269,13 +2269,20 @@ __weak void kvm_selftest_arch_init(void)
 {
 }
 
-static void report_unexpected_signal(int signum)
+__thread sigjmp_buf expect_sigbus_jmpbuf;
+__thread volatile sig_atomic_t expecting_sigbus;
+
+void catchall_signal_handler(int signum)
 {
+	switch (signum) {
+	case SIGBUS: {
+		if (expecting_sigbus)
+			siglongjmp(expect_sigbus_jmpbuf, 1);
+
+		TEST_FAIL("Unexpected SIGBUS (%d)\n", signum);
+	}
 #define KVM_CASE_SIGNUM(sig)					\
 	case sig: TEST_FAIL("Unexpected " #sig " (%d)\n", signum)
-
-	switch (signum) {
-	KVM_CASE_SIGNUM(SIGBUS);
 	KVM_CASE_SIGNUM(SIGSEGV);
 	KVM_CASE_SIGNUM(SIGILL);
 	KVM_CASE_SIGNUM(SIGFPE);
@@ -2287,12 +2294,13 @@ static void report_unexpected_signal(int signum)
 void __attribute((constructor)) kvm_selftest_init(void)
 {
 	struct sigaction sig_sa = {
-		.sa_handler = report_unexpected_signal,
+		.sa_handler = catchall_signal_handler,
 	};
 
 	/* Tell stdout not to buffer its content. */
 	setbuf(stdout, NULL);
 
+	expecting_sigbus = false;
 	sigaction(SIGBUS, &sig_sa, NULL);
 	sigaction(SIGSEGV, &sig_sa, NULL);
 	sigaction(SIGILL, &sig_sa, NULL);
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index bab1bd2b775b6..30eb701e4becd 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -18,13 +18,6 @@
 
 #include "test_util.h"
 
-sigjmp_buf expect_sigbus_jmpbuf;
-
-void __attribute__((used)) expect_sigbus_handler(int signum)
-{
-	siglongjmp(expect_sigbus_jmpbuf, 1);
-}
-
 /*
  * Random number generator that is usable from guest code. This is the
  * Park-Miller LCG using standard constants.

-- 
2.54.0.545.g6539524ca2-goog



^ permalink raw reply related

* [PATCH RFC v5 45/53] KVM: selftests: Reset shared memory after hole-punching
From: Ackerley Tng via B4 Relay @ 2026-04-28 23:25 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260428-gmem-inplace-conversion-v5-0-d8608ccfca22@google.com>

From: Ackerley Tng <ackerleytng@google.com>

private_mem_conversions_test used to reset the shared memory that was used
for the test to an initial pattern at the end of each test iteration. Then,
it would punch out the pages, which would zero memory.

Without in-place conversion, the resetting would write shared memory, and
hole-punching would zero private memory, hence resetting the test to the
state at the beginning of the for loop.

With in-place conversion, resetting writes memory as shared, and
hole-punching zeroes the same physical memory, hence undoing the reset
done before the hole punch.

Move the resetting after the hole-punching, and reset the entire
PER_CPU_DATA_SIZE instead of just the tested range.

With in-place conversion, this zeroes and then resets the same physical
memory. Without in-place conversion, the private memory is zeroed, and the
shared memory is reset to init_p.

This is sufficient since at each test stage, the memory is assumed to start
as shared, and private memory is always assumed to start zeroed. Conversion
zeroes memory, so the future test stages will work as expected.

Fixes: 43f623f350ce1 ("KVM: selftests: Add x86-only selftest for private memory conversions")
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 tools/testing/selftests/kvm/x86/private_mem_conversions_test.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
index 861baff201e78..289ad10063fca 100644
--- a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
+++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
@@ -202,15 +202,18 @@ static void guest_test_explicit_conversion(u64 base_gpa, bool do_fallocate)
 		guest_sync_shared(gpa, size, p3, p4);
 		memcmp_g(gpa, p4, size);
 
-		/* Reset the shared memory back to the initial pattern. */
-		memset((void *)gpa, init_p, size);
-
 		/*
 		 * Free (via PUNCH_HOLE) *all* private memory so that the next
 		 * iteration starts from a clean slate, e.g. with respect to
 		 * whether or not there are pages/folios in guest_mem.
 		 */
 		guest_map_shared(base_gpa, PER_CPU_DATA_SIZE, true);
+
+		/*
+		 * Hole-punching above zeroed private memory. Reset shared
+		 * memory in preparation for the next GUEST_STAGE.
+		 */
+		memset((void *)base_gpa, init_p, PER_CPU_DATA_SIZE);
 	}
 }
 

-- 
2.54.0.545.g6539524ca2-goog



^ permalink raw reply related

* [PATCH RFC v5 46/53] KVM: selftests: Provide function to look up guest_memfd details from gpa
From: Ackerley Tng via B4 Relay @ 2026-04-28 23:25 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260428-gmem-inplace-conversion-v5-0-d8608ccfca22@google.com>

From: Ackerley Tng <ackerleytng@google.com>

Introduce a new helper, kvm_gpa_to_guest_memfd(), to find the
guest_memfd-related details of a memory region that contains a given guest
physical address (GPA).

The function returns the file descriptor for the memfd, the offset into
the file that corresponds to the GPA, and the number of bytes remaining
in the region from that GPA.

kvm_gpa_to_guest_memfd() was factored out from vm_guest_mem_fallocate();
refactor vm_guest_mem_fallocate() to use the new helper.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/kvm_util.h |  3 +++
 tools/testing/selftests/kvm/lib/kvm_util.c     | 34 ++++++++++++++++----------
 2 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 62d917a2d2b19..7de88cbdfd2b8 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -428,6 +428,9 @@ static inline void vm_enable_cap(struct kvm_vm *vm, u32 cap, u64 arg0)
 	vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap);
 }
 
+int kvm_gpa_to_guest_memfd(struct kvm_vm *vm, gpa_t gpa, off_t *fd_offset,
+			   size_t *nr_bytes);
+
 /*
  * KVM_SET_MEMORY_ATTRIBUTES{,2} overwrites _all_ attributes.  These
  * flows need significant enhancements to support multiple attributes.
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 5e34593ad79c4..12e031a8fc20d 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1283,27 +1283,19 @@ void vm_guest_mem_fallocate(struct kvm_vm *vm, u64 base, u64 size,
 			    bool punch_hole)
 {
 	const int mode = FALLOC_FL_KEEP_SIZE | (punch_hole ? FALLOC_FL_PUNCH_HOLE : 0);
-	struct userspace_mem_region *region;
 	u64 end = base + size;
 	gpa_t gpa, len;
 	off_t fd_offset;
-	int ret;
+	int fd, ret;
 
 	for (gpa = base; gpa < end; gpa += len) {
-		u64 offset;
-
-		region = userspace_mem_region_find(vm, gpa, gpa);
-		TEST_ASSERT(region && region->region.flags & KVM_MEM_GUEST_MEMFD,
-			    "Private memory region not found for GPA 0x%lx", gpa);
+		fd = kvm_gpa_to_guest_memfd(vm, gpa, &fd_offset, &len);
+		len = min(end - gpa, len);
 
-		offset = gpa - region->region.guest_phys_addr;
-		fd_offset = region->region.guest_memfd_offset + offset;
-		len = min_t(u64, end - gpa, region->region.memory_size - offset);
-
-		ret = fallocate(region->region.guest_memfd, mode, fd_offset, len);
+		ret = fallocate(fd, mode, fd_offset, len);
 		TEST_ASSERT(!ret, "fallocate() failed to %s at %lx (len = %lu), fd = %d, mode = %x, offset = %lx",
 			    punch_hole ? "punch hole" : "allocate", gpa, len,
-			    region->region.guest_memfd, mode, fd_offset);
+			    fd, mode, fd_offset);
 	}
 }
 
@@ -1640,6 +1632,22 @@ void *addr_gpa2alias(struct kvm_vm *vm, gpa_t gpa)
 	return (void *) ((uintptr_t) region->host_alias + offset);
 }
 
+int kvm_gpa_to_guest_memfd(struct kvm_vm *vm, gpa_t gpa, off_t *fd_offset,
+			   size_t *nr_bytes)
+{
+	struct userspace_mem_region *region;
+	gpa_t gpa_offset;
+
+	region = userspace_mem_region_find(vm, gpa, gpa);
+	TEST_ASSERT(region && region->region.flags & KVM_MEM_GUEST_MEMFD,
+		    "guest_memfd memory region not found for GPA 0x%lx", gpa);
+
+	gpa_offset = gpa - region->region.guest_phys_addr;
+	*fd_offset = region->region.guest_memfd_offset + gpa_offset;
+	*nr_bytes = region->region.memory_size - gpa_offset;
+	return region->region.guest_memfd;
+}
+
 /* Create an interrupt controller chip for the specified VM. */
 void vm_create_irqchip(struct kvm_vm *vm)
 {

-- 
2.54.0.545.g6539524ca2-goog



^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox