Linux Confidential Computing Development

Linux Confidential Computing Development
 help / color / mirror / Atom feed

* [PATCH v2 04/17] x86/virt/tdx: Add extra memory to TDX module for the extensions
From: Xu Yilun @ 2026-06-18  8:13 UTC (permalink / raw)
  To: x86, kvm, linux-coco, linux-kernel
  Cc: djbw, kas, rick.p.edgecombe, yilun.xu, yilun.xu, xiaoyao.li,
	sohil.mehta, adrian.hunter, kishen.maloor, tony.lindgren,
	peter.fang, baolu.lu, zhenzhong.duan, dave.hansen, dave.hansen,
	seanjc
In-Reply-To: <20260618081355.3253581-1-yilun.xu@linux.intel.com>

TDX module extensions receive a one-time memory allocation at
initialization time. The extensions use this memory as the baseline for
their internal states and data required by the service APIs they offer.

Add a new memory feeding process backed by a new SEAMCALL
TDH.EXT.MEM.ADD. The process is mostly the same as adding PAMT. The
kernel queries the TDX module for how much memory is needed by reading
the memory_pool_required_pages metadata field, allocates it, hands it
over to the module, and never gets it back.

TDH.EXT.MEM.ADD uses a new parameter type, HPA_LIST_INFO, to provide
this memory. This type represents a list of pages for TDX module to
access. It references an 'hpa_list page' which contains the list of
target HPAs. It collapses the HPA of the hpa_list page and the number
of valid target HPAs into a 64 bit raw value for SEAMCALL parameters.
The hpa_list page is only a transfer medium: the TDX module never keeps
the hpa_list page.

Don't CLFLUSH the pages handed to the TDX module, as is done for some
other SEAMCALLs. The flushing operation is not expected to be needed for
current and known future architectures. With more and more page feeding
interfaces to come, the conservative flushing operation becomes a
maintenance burden.

For now, TDX module extensions consume tens of megabytes of memory that
will never be returned to the host. Use contiguous page allocation to
isolate these large blocks entirely, avoiding permanent memory
fragmentation and the resulting loss of buddy allocator efficiency.
Print the allocation amount on TDX module extensions initialization for
visibility.

Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
---
 arch/x86/include/asm/tdx_global_metadata.h  |   1 +
 arch/x86/virt/vmx/tdx/tdx.h                 |   1 +
 arch/x86/virt/vmx/tdx/tdx.c                 | 107 +++++++++++++++++++-
 arch/x86/virt/vmx/tdx/tdx_global_metadata.c |   6 ++
 4 files changed, 112 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/tdx_global_metadata.h b/arch/x86/include/asm/tdx_global_metadata.h
index 83fc657a438e..b3442b7c88bb 100644
--- a/arch/x86/include/asm/tdx_global_metadata.h
+++ b/arch/x86/include/asm/tdx_global_metadata.h
@@ -53,6 +53,7 @@ struct tdx_sys_info {
 };
 
 struct tdx_sys_info_ext {
+	u32 memory_pool_required_pages;
 	bool ext_required;
 };
 
diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
index a47e872480c7..a100634087e7 100644
--- a/arch/x86/virt/vmx/tdx/tdx.h
+++ b/arch/x86/virt/vmx/tdx/tdx.h
@@ -63,6 +63,7 @@
 #define TDH_SYS_SHUTDOWN		52
 #define TDH_SYS_UPDATE_V0		53
 #define TDH_SYS_UPDATE			SEAMCALL_LEAF_VER(TDH_SYS_UPDATE_V0, 1)
+#define TDH_EXT_MEM_ADD			61
 #define TDH_SYS_DISABLE			69
 
 /* TDX page types */
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 6f3596f11d25..dab17822c1c6 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -31,6 +31,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/idr.h>
 #include <linux/kvm_types.h>
+#include <linux/bitfield.h>
 #include <asm/page.h>
 #include <asm/special_insns.h>
 #include <asm/msr-index.h>
@@ -1166,6 +1167,108 @@ static __init int init_tdmrs(struct tdmr_info_list *tdmr_list)
 	return 0;
 }
 
+#define HPA_LIST_INFO_FIRST_ENTRY	GENMASK_U64(11, 3)
+#define HPA_LIST_INFO_PFN		GENMASK_U64(51, 12)
+#define HPA_LIST_INFO_LAST_ENTRY	GENMASK_U64(63, 55)
+
+static __init u64 to_hpa_list_info(struct page *hpa_list_page,
+				   unsigned int nr_pages)
+{
+	return FIELD_PREP(HPA_LIST_INFO_FIRST_ENTRY, 0) |
+	       FIELD_PREP(HPA_LIST_INFO_PFN, page_to_pfn(hpa_list_page)) |
+	       FIELD_PREP(HPA_LIST_INFO_LAST_ENTRY, nr_pages - 1);
+}
+
+static __init int tdx_ext_mem_add(struct page *hpa_list_page,
+				  unsigned int nr_pages)
+{
+	struct tdx_module_args args = {
+		.rcx = to_hpa_list_info(hpa_list_page, nr_pages),
+	};
+	u64 r;
+
+	do {
+		/*
+		 * TDH_EXT_MEM_ADD is designed to use output parameter RCX to
+		 * override/update input parameter RCX, so the caller doesn't
+		 * have to do manual parameter update on retry call.
+		 */
+		r = seamcall_ret(TDH_EXT_MEM_ADD, &args);
+	} while (r == TDX_INTERRUPTED_RESUMABLE);
+
+	if (r != TDX_SUCCESS)
+		return -EFAULT;
+
+	return 0;
+}
+
+struct tdx_hpa_list {
+	u64 phys[PAGE_SIZE / sizeof(u64)];
+};
+
+static_assert(sizeof(struct tdx_hpa_list) == PAGE_SIZE);
+
+static __init int tdx_ext_mem_setup(unsigned int required_pages)
+{
+	struct tdx_hpa_list *hpa_list;
+	struct page *page;
+	unsigned int i;
+	int ret;
+
+	/*
+	 * memory_pool_required_pages == 0 means no need to add pages,
+	 * skip the memory setup.
+	 */
+	if (!required_pages)
+		return 0;
+
+	hpa_list = kzalloc_obj(*hpa_list);
+	if (!hpa_list)
+		return -ENOMEM;
+
+	page = alloc_contig_pages(required_pages, GFP_KERNEL, numa_mem_id(),
+				  &node_online_map);
+	if (!page) {
+		ret = -ENOMEM;
+		goto out_free_hpa_list;
+	}
+
+	i = 0;
+	while (i < required_pages) {
+		unsigned int nents = min(required_pages - i,
+					 ARRAY_SIZE(hpa_list->phys));
+		unsigned int j;
+
+		for (j = 0; j < nents; j++)
+			hpa_list->phys[j] = page_to_phys(page + i + j);
+
+		ret = tdx_ext_mem_add(virt_to_page(hpa_list), nents);
+		/*
+		 * No SEAMCALLs to reclaim the added pages. For simple error
+		 * handling, leak all pages.
+		 */
+		WARN(ret, "Fatal: TDX module rejected (%d) memory for extensions, stranded all pages\n",
+		     ret);
+		if (ret)
+			break;
+
+		i += nents;
+	}
+
+	/*
+	 * Memory for extensions can't be reclaimed once added, print out the
+	 * amount, stop tracking it and free the hpa_list page, no matter
+	 * success or failure.
+	 */
+	pr_info("%lu KB consumed for TDX module extensions\n",
+		required_pages * PAGE_SIZE / 1024);
+
+out_free_hpa_list:
+	kfree(hpa_list);
+
+	return ret;
+}
+
 static __init int init_tdx_module_extensions(void)
 {
 	struct tdx_sys_info_ext sysinfo_ext;
@@ -1182,9 +1285,7 @@ static __init int init_tdx_module_extensions(void)
 	if (!sysinfo_ext.ext_required)
 		return 0;
 
-	/* TODO: add the extensions enabling steps here */
-
-	return 0;
+	return tdx_ext_mem_setup(sysinfo_ext.memory_pool_required_pages);
 }
 
 static __init int init_tdx_module(void)
diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
index b9e1c011a990..720cdaf76492 100644
--- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
+++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
@@ -137,6 +137,12 @@ static __init int get_tdx_sys_info_ext(struct tdx_sys_info_ext *sysinfo_ext)
 	int ret;
 	u64 val;
 
+	ret = read_sys_metadata_field(0x3100000200000000, &val);
+	if (ret)
+		return ret;
+
+	sysinfo_ext->memory_pool_required_pages = val;
+
 	ret = read_sys_metadata_field(0x3100000000000001, &val);
 	if (ret)
 		return ret;
-- 
2.25.1


^ permalink raw reply related

* [PATCH v2 03/17] x86/virt/tdx: Detect if the extensions initialization is required
From: Xu Yilun @ 2026-06-18  8:13 UTC (permalink / raw)
  To: x86, kvm, linux-coco, linux-kernel
  Cc: djbw, kas, rick.p.edgecombe, yilun.xu, yilun.xu, xiaoyao.li,
	sohil.mehta, adrian.hunter, kishen.maloor, tony.lindgren,
	peter.fang, baolu.lu, zhenzhong.duan, dave.hansen, dave.hansen,
	seanjc
In-Reply-To: <20260618081355.3253581-1-yilun.xu@linux.intel.com>

TDX module extensions support extension SEAMCALLs that are preemptible
and resumable, unlike normal SEAMCALLs that run to completion while
monopolizing the CPU. This allows for higher-level API constructions,
which better support some add-on features that implement higher-order
security protocols.

Add infrastructure to initialize TDX module extensions. Introduce the
initial step of this process by detecting if the extensions are required
by checking:

  1. If the extensions are supported via TDX_FEATURES0_EXT.
  2. If any TDX add-on feature needs the extensions via a boolean
     metadata field ext_required.

Currently all metadata fields are read at the very beginning of basic
TDX initialization and stored in a global var. However, ext_required is
only valid after the add-on feature configuration, making it
incompatible with the existing metadata reading method.

To resolve this lifetime conflict, add a dedicated runtime metadata
reading interface for the extensions, call it when the extensions
initialization starts, and leave the field out of the global var. In
this way, there is no confusion of when the metadata should be read.

Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
---
 arch/x86/include/asm/tdx.h                  |  1 +
 arch/x86/include/asm/tdx_global_metadata.h  |  4 ++++
 arch/x86/virt/vmx/tdx/tdx.c                 | 25 +++++++++++++++++++++
 arch/x86/virt/vmx/tdx/tdx_global_metadata.c | 14 ++++++++++++
 4 files changed, 44 insertions(+)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index e5a9cf656c07..5fbf89d5317c 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -35,6 +35,7 @@
 /* Bit definitions of TDX_FEATURES0 metadata field */
 #define TDX_FEATURES0_TD_PRESERVING	BIT_ULL(1)
 #define TDX_FEATURES0_NO_RBP_MOD	BIT_ULL(18)
+#define TDX_FEATURES0_EXT		BIT_ULL(39)
 
 #ifndef __ASSEMBLER__
 
diff --git a/arch/x86/include/asm/tdx_global_metadata.h b/arch/x86/include/asm/tdx_global_metadata.h
index 41150d546589..83fc657a438e 100644
--- a/arch/x86/include/asm/tdx_global_metadata.h
+++ b/arch/x86/include/asm/tdx_global_metadata.h
@@ -52,4 +52,8 @@ struct tdx_sys_info {
 	struct tdx_sys_info_td_conf td_conf;
 };
 
+struct tdx_sys_info_ext {
+	bool ext_required;
+};
+
 #endif
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 92305b5ea90d..6f3596f11d25 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1166,6 +1166,27 @@ static __init int init_tdmrs(struct tdmr_info_list *tdmr_list)
 	return 0;
 }
 
+static __init int init_tdx_module_extensions(void)
+{
+	struct tdx_sys_info_ext sysinfo_ext;
+	int ret;
+
+	if (!(tdx_sysinfo.features.tdx_features0 & TDX_FEATURES0_EXT))
+		return 0;
+
+	ret = get_tdx_sys_info_ext(&sysinfo_ext);
+	if (ret)
+		return ret;
+
+	/* Skip if no feature requires TDX module extensions. */
+	if (!sysinfo_ext.ext_required)
+		return 0;
+
+	/* TODO: add the extensions enabling steps here */
+
+	return 0;
+}
+
 static __init int init_tdx_module(void)
 {
 	int ret;
@@ -1220,6 +1241,10 @@ static __init int init_tdx_module(void)
 	if (ret)
 		goto err_reset_pamts;
 
+	ret = init_tdx_module_extensions();
+	if (ret)
+		goto err_reset_pamts;
+
 	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
 
 out_put_tdxmem:
diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
index e49c300f23d4..b9e1c011a990 100644
--- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
+++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
@@ -131,3 +131,17 @@ static __init int get_tdx_sys_info(struct tdx_sys_info *sysinfo)
 
 	return ret;
 }
+
+static __init int get_tdx_sys_info_ext(struct tdx_sys_info_ext *sysinfo_ext)
+{
+	int ret;
+	u64 val;
+
+	ret = read_sys_metadata_field(0x3100000000000001, &val);
+	if (ret)
+		return ret;
+
+	sysinfo_ext->ext_required = val;
+
+	return 0;
+}
-- 
2.25.1


^ permalink raw reply related

* [PATCH v2 02/17] x86/virt/tdx: Configure add-on features on TDX module init and update
From: Xu Yilun @ 2026-06-18  8:13 UTC (permalink / raw)
  To: x86, kvm, linux-coco, linux-kernel
  Cc: djbw, kas, rick.p.edgecombe, yilun.xu, yilun.xu, xiaoyao.li,
	sohil.mehta, adrian.hunter, kishen.maloor, tony.lindgren,
	peter.fang, baolu.lu, zhenzhong.duan, dave.hansen, dave.hansen,
	seanjc
In-Reply-To: <20260618081355.3253581-1-yilun.xu@linux.intel.com>

In addition to basic TDX functionalities, TDX module provides add-on
features that can be progressively enabled as the kernel supports them.
The kernel should explicitly configure these features at boot or
post-update initialization time. Configuring an add-on feature, such as
TDX Quoting, that uses extension SEAMCALLs is the prerequisite for
initializing TDX module extensions. TDX Quoting is the target feature to
enable but defer it for now until full kernel support is in place.

TDX module extends TDH.SYS.CONFIG and TDH.SYS.UPDATE with new bitmap
input parameters to specify which add-on features to configure. The
bitmap uses the same definitions as TDX_FEATURES0.

For runtime update, Linux applies a policy that no newer features should
be added after update to avoid disrupting live TDX operations. To adhere
to this, TDH.SYS.UPDATE must configure the same features as the
TDH.SYS.CONFIG. Record the kernel required add-on feature bitmap in a
global var so that both phases can use it.

TDX module advances the version of TDH.SYS.CONFIG and TDH.SYS.UPDATE for
the change, so use the latest version (v1) for add-on feature enabling.
But supporting existing modules which only support v0 is still necessary
until they are deprecated. In fact, it is unlikely that TDH.SYS.CONFIG
ever needs to change again and the code would stay in v1. So there is
little value in worrying about deprecating v0 to save a couple lines of
code in 5-7 years when these original TDX platforms sunset.

Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
---
 arch/x86/virt/vmx/tdx/tdx.h |  6 ++++--
 arch/x86/virt/vmx/tdx/tdx.c | 28 ++++++++++++++++++++++++++--
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
index fbb520704662..a47e872480c7 100644
--- a/arch/x86/virt/vmx/tdx/tdx.h
+++ b/arch/x86/virt/vmx/tdx/tdx.h
@@ -58,9 +58,11 @@
 #define TDH_PHYMEM_CACHE_WB		40
 #define TDH_PHYMEM_PAGE_WBINVD		41
 #define TDH_VP_WR			43
-#define TDH_SYS_CONFIG			45
+#define TDH_SYS_CONFIG_V0		45
+#define TDH_SYS_CONFIG			SEAMCALL_LEAF_VER(TDH_SYS_CONFIG_V0, 1)
 #define TDH_SYS_SHUTDOWN		52
-#define TDH_SYS_UPDATE			53
+#define TDH_SYS_UPDATE_V0		53
+#define TDH_SYS_UPDATE			SEAMCALL_LEAF_VER(TDH_SYS_UPDATE_V0, 1)
 #define TDH_SYS_DISABLE			69

 /* TDX page types */
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 2a03152796e6..92305b5ea90d 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -57,6 +57,7 @@ static struct tdx_module_state tdx_module_state;
 static u32 tdx_global_keyid __ro_after_init;
 static u32 tdx_guest_keyid_start __ro_after_init;
 static u32 tdx_nr_guest_keyids __ro_after_init;
+static u64 tdx_addon_feature0 __ro_after_init;

 static DEFINE_IDA(tdx_guest_keyid_pool);

@@ -1004,9 +1005,18 @@ static __init int construct_tdmrs(struct list_head *tmb_list,
 	return ret;
 }

+static __init void set_tdx_addon_features(void)
+{
+	/*
+	 * To add DICE-based TDX Quoting feature bit in tdx_addon_feature0 when
+	 * kernel is ready.
+	 */
+}
+
 static __init int config_tdx_module(struct tdmr_info_list *tdmr_list,
 				    u64 global_keyid)
 {
+	u64 seamcall_fn = TDH_SYS_CONFIG_V0;
 	struct tdx_module_args args = {};
 	u64 *tdmr_pa_array;
 	size_t array_sz;
@@ -1032,7 +1042,15 @@ static __init int config_tdx_module(struct tdmr_info_list *tdmr_list,
 	args.rcx = __pa(tdmr_pa_array);
 	args.rdx = tdmr_list->nr_consumed_tdmrs;
 	args.r8 = global_keyid;
-	ret = seamcall_prerr(TDH_SYS_CONFIG, &args);
+
+	set_tdx_addon_features();
+
+	if (tdx_addon_feature0) {
+		args.r9 = tdx_addon_feature0;
+		seamcall_fn = TDH_SYS_CONFIG;
+	}
+
+	ret = seamcall_prerr(seamcall_fn, &args);

 	/* Free the array as it is not required anymore. */
 	kfree(tdmr_pa_array);
@@ -1314,10 +1332,16 @@ int tdx_module_shutdown(void)

 int tdx_module_run_update(void)
 {
+	u64 seamcall_fn = TDH_SYS_UPDATE_V0;
 	struct tdx_module_args args = {};
 	int ret;

-	ret = seamcall_prerr(TDH_SYS_UPDATE, &args);
+	if (tdx_addon_feature0) {
+		args.r9 = tdx_addon_feature0;
+		seamcall_fn = TDH_SYS_UPDATE;
+	}
+
+	ret = seamcall_prerr(seamcall_fn, &args);
 	if (ret)
 		return ret;

-- 
2.25.1

^ permalink raw reply related

* [PATCH v2 01/17] x86/virt/tdx: Embed version info in SEAMCALL leaf function definitions
From: Xu Yilun @ 2026-06-18  8:13 UTC (permalink / raw)
  To: x86, kvm, linux-coco, linux-kernel
  Cc: djbw, kas, rick.p.edgecombe, yilun.xu, yilun.xu, xiaoyao.li,
	sohil.mehta, adrian.hunter, kishen.maloor, tony.lindgren,
	peter.fang, baolu.lu, zhenzhong.duan, dave.hansen, dave.hansen,
	seanjc
In-Reply-To: <20260618081355.3253581-1-yilun.xu@linux.intel.com>

Embed version information in SEAMCALL leaf function definitions rather
than let the caller open code them. For now, only TDH.VP.INIT is
involved.

Don't bother the caller to choose the SEAMCALL version if unnecessary.
New version SEAMCALLs are guaranteed to be backward compatible, so
ideally the kernel doesn't need to keep version history and only uses
the latest version SEAMCALLs.

And in the confidential computing world, system security requires us to
stop using an older TDX module when there is a newer one. So don't burden
the kernel with long-term support for an older TDX module that doesn't
understand newer version SEAMCALLs.

The only concern is there may be transitional periods when a new TDX
module is not widely available, meaning the kernel may temporarily need
to support multiple SEAMCALL versions. As time goes by, the old TDX
modules are deprecated and old version SEAMCALL definitions should
disappear.

The old TDX modules that only support TDH.VP.INIT v0 are all deprecated,
so only provide the latest (v1) definition.

Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
---
 arch/x86/virt/vmx/tdx/tdx.h | 23 ++++++++++++++---------
 arch/x86/virt/vmx/tdx/tdx.c |  3 +--
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
index bdfd0e1e337a..fbb520704662 100644
--- a/arch/x86/virt/vmx/tdx/tdx.h
+++ b/arch/x86/virt/vmx/tdx/tdx.h
@@ -2,6 +2,7 @@
 #ifndef _X86_VIRT_TDX_H
 #define _X86_VIRT_TDX_H
 
+#include <linux/bitfield.h>
 #include <linux/bits.h>
 
 /*
@@ -11,6 +12,18 @@
  * architectural definitions come first.
  */
 
+/*
+ * SEAMCALL leaf:
+ *
+ * Bit 15:0	Leaf number
+ * Bit 23:16	Version number
+ */
+#define SEAMCALL_LEAF			GENMASK(15, 0)
+#define SEAMCALL_VER			GENMASK(23, 16)
+
+#define SEAMCALL_LEAF_VER(l, v)		(FIELD_PREP(SEAMCALL_LEAF, l) | \
+					 FIELD_PREP(SEAMCALL_VER, v))
+
 /*
  * TDX module SEAMCALL leaf functions
  */
@@ -31,7 +44,7 @@
 #define TDH_VP_CREATE			10
 #define TDH_MNG_KEY_FREEID		20
 #define TDH_MNG_INIT			21
-#define TDH_VP_INIT			22
+#define TDH_VP_INIT			SEAMCALL_LEAF_VER(22, 1)
 #define TDH_PHYMEM_PAGE_RDMD		24
 #define TDH_VP_RD			26
 #define TDH_PHYMEM_PAGE_RECLAIM		28
@@ -50,14 +63,6 @@
 #define TDH_SYS_UPDATE			53
 #define TDH_SYS_DISABLE			69
 
-/*
- * SEAMCALL leaf:
- *
- * Bit 15:0	Leaf number
- * Bit 23:16	Version number
- */
-#define TDX_VERSION_SHIFT		16
-
 /* TDX page types */
 #define	PT_NDA		0x0
 #define	PT_RSVD		0x1
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index b15269b5941d..2a03152796e6 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1903,8 +1903,7 @@ u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
 		.r8 = x2apicid,
 	};
 
-	/* apicid requires version == 1. */
-	return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args);
+	return seamcall(TDH_VP_INIT, &args);
 }
 EXPORT_SYMBOL_FOR_KVM(tdh_vp_init);
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH v2 00/17] Enable DICE-based TDX Quoting Extension
From: Xu Yilun @ 2026-06-18  8:13 UTC (permalink / raw)
  To: x86, kvm, linux-coco, linux-kernel
  Cc: djbw, kas, rick.p.edgecombe, yilun.xu, yilun.xu, xiaoyao.li,
	sohil.mehta, adrian.hunter, kishen.maloor, tony.lindgren,
	peter.fang, baolu.lu, zhenzhong.duan, dave.hansen, dave.hansen,
	seanjc

This series adds infrastructure to enable TDX module extensions and
then implements DICE-based TDX Quoting extension. This is the 2nd
version and a significant change is that we want the quoting part to
merge along with the basic TDX module extensions part, rather than
serving as an example. So the quoting part drops RFC tags and requires
initial review. The basic extensions part addresses v1 comments and
needs more detailed review.

The quoting part contains some KVM patches, so we sorted the series for
easier review and pick:

  Patches  1-6:  Enable the TDX module extensions support
  Patches  7-14: DICE-based TDX Quoting, x86/tdx part
  Patches  15-N: DICE-based TDX Quoting, KVM part

== Overview ==

To date, SEAMCALLs have been short lived routines that monopolize the
CPU for their duration. This limits their utility for implementing
higher order security protocols, or pushes complexity into Linux - such
as by fragmenting a protocol setup service into several SEAMCALLs. The
Linux appetite for ingesting complexity is low, so TDX now adds a new
class of SEAMCALLs that are preemptible and resumable. This capability
allows for higher-level API constructions - like "create a DICE-based
quote" - which are more aligned to what is a good fit for Linux.

This new "extension SEAMCALL" capability is akin to ARM CCA's "Stateful
RMI Operations (SRO)", and achieves similar externalized complexity
relief as a dedicated hardware co-processor like AMD SEV-SNP. The
mechanism is "give the service environment some memory", "invoke the
service API", and "continue invoking until complete". All protocol state
is internal to the service API.

TDX introduces "TDX module extensions" as the service environment for
some add-on features - such as DICE-based quoting, TDISP, and live
migration - to use "extension SEAMCALLs".

The extension SEAMCALLs are designed to be transparent to the host,
using the same interface as normal SEAMCALLs, but the service
environment should be initialized in several steps. First,
configure/select (via TDH.SYS.CONFIG) add-on features during basic TDX
initialization. Second, check if TDX module extensions are required to
support these add-on features by reading TDX global metadata. Third, add
extra memory to the TDX module via a SEAMCALL (TDH.EXT.MEM.ADD).
Finally, use another SEAMCALL (TDH.EXT.INIT) to initialize the
extensions.

== DICE-based Quoting extension ==

The first feature to use these extensions is the TDX Quoting extension [1],
which converts guest launch attestation reports into a document that can be
verified externally.

Today, the TDX host requires a separate software service to generate Quotes.
The Quoting extension allows the TDX module to generate Quotes directly,
without relying on a discrete Quoting engine. This simplifies the overall
attestation flow: KVM no longer needs to return to userspace for Quote
generation. Instead, Quote generation is handled directly by the TDX module
through an extension SEAMCALL. See [2] for an overview of TDX attestation.

The Device Identifier Composition Engine ("DICE") provides a standardized
framework for layering attestation evidence. It replaces SGX-based attestation
and moves away from Intel-proprietary formats. It also eliminates the SGX
requirement to contact an Intel service to obtain a certificate first.
Instead, all attestation evidence is embedded in the Quote itself.

== The trade-off ==

The extensions create an extension instance for each feature that
requires extension SEAMCALLs. More memory is consumed when more
extension instances are created. There are 3 extensions (quoting, TDISP,
Migration) in the foreseeable future. Turning them all on will require
tens of megabytes of memory. Note that the host can never reclaim the
memory added to the extensions.

According to the TDX module design, basic TDX functionalities can run
without the extensions. So theoretically the extensions don't need to be
enabled at basic TDX initialization time. They could be lazily enabled
right before the first extension SEAMCALL is issued.

However, Linux applies a simple policy for TDX: turn on all the features
that Linux knows about all the time, unless and until any evidence makes
this approach untenable. Enabling the extensions along with the basic
TDX at boot time aligns with the policy, and offers several good
reasons:

  1. Simplify TDX state management, avoid runtime state transitions that
     could introduce race conditions or unexpected failure modes.

  2. The kernel doesn't have to keep track of which SEAMCALLs need the
     extensions, as there is no HW/FW enumeration for this.

  3. When no extension is configured, the extensions initialization is
     virtually skipped. So no impact on existing kernels.

  4. A small trade-off is that eager initialization allocates memory
     (tens of megabytes) at boot time before any feature starts to work.
     However, these features provide critical security capabilities in
     confidential computing. They are expected to be enabled eventually
     when available. So this merely advances the timing of memory
     allocation.

== Restore the extensions after runtime TDX module update ==

Runtime TDX module update introduces a mechanism to update the module
firmware while preserving and restoring TDX operations. As part of the
restoration process, TDX module extensions must also be re-initialized
to re-enable extension SEAMCALLs.

Similar to TDH.SYS.CONFIG, TDX module extends TDH.SYS.UPDATE with more
parameters for the host to re-enable desired add-on features. Then host
must re-execute all extensions initialization steps to restore extension
SEAMCALL functionality.

However, Linux runs the update in stop_machine() context, which prevents
memory allocation. This introduces a hard restriction that the updated
TDX environment must not consume more memory for the extensions.

Fortunately, Linux applies another policy that no newer features should
be added during runtime update to avoid disrupting live TDX operations.
To adhere to this, TDH.SYS.UPDATE must enable the same features as the
TDH.SYS.CONFIG. This policy mitigates the memory allocation problem a
lot by minimizing the chance of increased memory demand. So now the
restriction only affects the compatibility rule for choosing the update
image.

The same memory constraint applies to the Quoting extension. A compatible
runtime update must not increase the size limit of its Quotes, because the
buffer used for Quote generation is allocated during TDX bringup. Otherwise,
attestation could fail after the update if the TDX module requires a larger
buffer for Quotes.

== Some history ==

The TDX module extensions support part was first posted along with TDX
TDISP [3]. But quoting is the simplest consumer and is chosen as the
lead vehicle over TDISP.

== Misc ==

This series is based on tip/x86/tdx [4], because we need the extensions
to play nicely with runtime TDX module update.

Link: https://cdrdv2.intel.com/v1/dl/getContent/874303 # [1]
Link: Documentation/arch/x86/tdx.rst, Section "Attestation" # [2]
Link: https://lore.kernel.org/all/20260327160132.2946114-1-yilun.xu@linux.intel.com/ # [3]
Link: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/log/?h=x86/tdx # [4]

== Changelog ==

v2:
- Support runtime TDX module update
- Refine quoting patches, drop RFC tag
- Change the patch order. (Xiaoyao & Tony)
- Fold metadata readings changes into patches that use them.
- Read the extensions metadata at init_tdx_ext() (Rick & Xiaoyao)
- Don't do get_tdx_sys_info() a 2nd time after TDH.SYS.CONFIG (Rick & Xiaoyao)
- Delete tdx_clflush_hpa_list() (Rick)
- s/TDX Module/TDX module (Sohil)
- s/Extensions/extensions (Dave)
- Change the data type of ext_required to bool (Rick)
- Change the data type of memory_pool_required_pages from u16 to u32,
  the Module team see this problem and promise the change (Sohil)
- s/init_tdx_ext()/init_tdx_module_extensions() to disambiguate from
  tdx_ext_init() (Kishen)
- Cover-letter & change log re-phrase (All reviewers)

v1: https://lore.kernel.org/all/20260522034128.3144354-1-yilun.xu@linux.intel.com/

Peter Fang (11):
  x86/virt/tdx: Initialize Quoting extension
  x86/virt/tdx: Prepare Quote buffer during extension bringup
  x86/virt/tdx: Add interface to check Quoting availability
  x86/virt/tdx: Move tdx_tdr_pa() up in the file
  x86/virt/tdx: Add interface to generate a Quote
  x86/virt/tdx: Reinitialize the Quoting extension after TDX module
    update
  x86/virt/tdx: Enable Quoting extension
  x86/tdx: Move and rename Quote request structure
  KVM: TDX: Factor out userspace return path from tdx_get_quote()
  KVM: TDX: Add in-kernel Quote generation
  KVM: TDX: Support event-notify interrupts only with userspace Quoting

Xu Yilun (6):
  x86/virt/tdx: Embed version info in SEAMCALL leaf function definitions
  x86/virt/tdx: Configure add-on features on TDX module init and update
  x86/virt/tdx: Detect if the extensions initialization is required
  x86/virt/tdx: Add extra memory to TDX module for the extensions
  x86/virt/tdx: Make TDX module initialize the extensions
  x86/virt/tdx: Re-initialize the extensions on runtime TDX module
    update

 Documentation/arch/x86/tdx.rst              |  19 +-
 Documentation/virt/kvm/api.rst              |   3 +
 arch/x86/include/asm/tdx.h                  |  35 ++
 arch/x86/include/asm/tdx_global_metadata.h  |   9 +
 arch/x86/kvm/vmx/tdx.h                      |   6 +
 arch/x86/virt/vmx/tdx/tdx.h                 |  33 +-
 arch/x86/kvm/vmx/tdx.c                      | 176 +++++++-
 arch/x86/virt/vmx/tdx/tdx.c                 | 465 +++++++++++++++++++-
 arch/x86/virt/vmx/tdx/tdx_global_metadata.c |  34 ++
 drivers/virt/coco/tdx-guest/tdx-guest.c     |  47 +-
 virt/kvm/kvm_main.c                         |   1 +
 11 files changed, 755 insertions(+), 73 deletions(-)

base-commit: 2b9ad7a6154e0938b9458691536296dd0224942d
-- 
2.25.1

^ permalink raw reply

* Re: [PATCH v6 00/20] dma-mapping: Use DMA_ATTR_CC_SHARED through direct, pool and swiotlb paths
From: Aneesh Kumar K.V @ 2026-06-18  8:37 UTC (permalink / raw)
  To: Alexey Kardashevskiy, Jason Gunthorpe, Catalin Marinas
  Cc: iommu, linux-arm-kernel, linux-kernel, linux-coco, Robin Murphy,
	Marek Szyprowski, Will Deacon, Marc Zyngier, Steven Price,
	Suzuki K Poulose, Jiri Pirko, Mostafa Saleh, Petr Tesarik,
	Dan Williams, Xu Yilun, linuxppc-dev, linux-s390,
	Madhavan Srinivasan, Michael Ellerman, Nicholas Piggin,
	Christophe Leroy (CS GROUP), Alexander Gordeev, Gerald Schaefer,
	Heiko Carstens, Vasily Gorbik, Christian Borntraeger,
	Sven Schnelle, x86
In-Reply-To: <2ecfa1a8-6202-4319-9692-a6ffeb5a3dbf@amd.com>

Alexey Kardashevskiy <aik@amd.com> writes:

> On 10/6/26 00:47, Jason Gunthorpe wrote:
>> On Tue, Jun 09, 2026 at 02:43:08PM +0100, Catalin Marinas wrote:
>>> On Thu, Jun 04, 2026 at 02:09:39PM +0530, Aneesh Kumar K.V (Arm) wrote:
>>>> This series propagates DMA_ATTR_CC_SHARED through the dma-direct,
>>>> dma-pool, and swiotlb paths so that encrypted and decrypted DMA buffers
>>>> are handled consistently.
>>>>
>>>> Today, the direct DMA path mostly relies on force_dma_unencrypted() for
>>>> shared/decrypted buffer handling. This series consolidates the
>>>> force_dma_unencrypted() checks in the top-level functions and ensures
>>>> that the remaining DMA interfaces use DMA attributes to make the correct
>>>> decisions.
>>>
>>> Please check Sashiko's reports, it has some good points:
>>>
>>> https://sashiko.dev/#/patchset/20260604083959.1265923-1-aneesh.kumar@kernel.org
>>>
>>> I think the main one is the swiotlb_tbl_map_single() changes which break
>>> AMD SME host support. There cc_platform_has(CC_ATTR_MEM_ENCRYPT) is true
>>> but force_dma_unencrypted() is false. Normally you'd not end up on this
>>> path but you can have swiotlb=force.
>> 
>> IMHO that's an AMD issue, not with the design of this series..
>> 
>> The series is right, a device that is !force_dma_decrypted() must be
>> considerd to be a trusted device and we must never place any DMA
>> mappings for a trusted device into shared memory.
>
>
> swiotlb=force forces swiotlb, not decryption.
>
>> That AMD has done somethine insane:
>> 
>> bool force_dma_unencrypted(struct device *dev)
>> {
>> 	/*
>> 	 * For SEV, all DMA must be to unencrypted addresses.
>> 	 */
>> 	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
>> 		return true;
>> 
>> 	/*
>> 	 * For SME, all DMA must be to unencrypted addresses if the
>> 	 * device does not support DMA to addresses that include the
>> 	 * encryption mask.
>> 	 */
>> 	if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
>> 		u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask));
>> 		u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask,
>> 						dev->bus_dma_limit);
>> 
>> 		if (dma_dev_mask <= dma_enc_mask)
>> 			return true;
>> 	}
>
>
> So when I try "mem_encrypt=on iommu=pt swiotlb=force" with this patchset, it fails to boot. But it boots with a hack like this:
>
> ===
> @@ -39,7 +41,7 @@ bool force_dma_unencrypted(struct device *dev)
>                          return true;
>          }
>   
> -       return false;
> +       return swiotlb_force_bounce;
>   }
> ===
>
> Or we say "mem_encrypt=on iommu=pt swiotlb=force" combo is just weird and we won't be supporting which bit in this? Thanks,
>

Something like?

modified   arch/x86/mm/mem_encrypt.c
@@ -34,6 +34,13 @@ bool force_dma_unencrypted(struct device *dev)
 		u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask));
 		u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask,
 						dev->bus_dma_limit);
+		/*
+		 * With memory encryption enabled, SWIOTLB is marked decrypted.
+		 * If SWIOTLB bouncing is forced, treat the device as requiring
+		 * decrypted DMA.
+		 */
+		if (is_swiotlb_force_bounce(dev))
+			return true;
 
 		if (dma_dev_mask <= dma_enc_mask)
 			return true;



-aneesh

^ permalink raw reply

* Re: [PATCH v6 00/20] dma-mapping: Use DMA_ATTR_CC_SHARED through direct, pool and swiotlb paths
From: Alexey Kardashevskiy @ 2026-06-18  4:44 UTC (permalink / raw)
  To: Jason Gunthorpe, Catalin Marinas
  Cc: Aneesh Kumar K.V (Arm), iommu, linux-arm-kernel, linux-kernel,
	linux-coco, Robin Murphy, Marek Szyprowski, Will Deacon,
	Marc Zyngier, Steven Price, Suzuki K Poulose, Jiri Pirko,
	Mostafa Saleh, Petr Tesarik, Dan Williams, Xu Yilun, linuxppc-dev,
	linux-s390, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
	Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Sven Schnelle, x86
In-Reply-To: <20260609144746.GL2764304@ziepe.ca>



On 10/6/26 00:47, Jason Gunthorpe wrote:
> On Tue, Jun 09, 2026 at 02:43:08PM +0100, Catalin Marinas wrote:
>> On Thu, Jun 04, 2026 at 02:09:39PM +0530, Aneesh Kumar K.V (Arm) wrote:
>>> This series propagates DMA_ATTR_CC_SHARED through the dma-direct,
>>> dma-pool, and swiotlb paths so that encrypted and decrypted DMA buffers
>>> are handled consistently.
>>>
>>> Today, the direct DMA path mostly relies on force_dma_unencrypted() for
>>> shared/decrypted buffer handling. This series consolidates the
>>> force_dma_unencrypted() checks in the top-level functions and ensures
>>> that the remaining DMA interfaces use DMA attributes to make the correct
>>> decisions.
>>
>> Please check Sashiko's reports, it has some good points:
>>
>> https://sashiko.dev/#/patchset/20260604083959.1265923-1-aneesh.kumar@kernel.org
>>
>> I think the main one is the swiotlb_tbl_map_single() changes which break
>> AMD SME host support. There cc_platform_has(CC_ATTR_MEM_ENCRYPT) is true
>> but force_dma_unencrypted() is false. Normally you'd not end up on this
>> path but you can have swiotlb=force.
> 
> IMHO that's an AMD issue, not with the design of this series..
> 
> The series is right, a device that is !force_dma_decrypted() must be
> considered to be a trusted device and we must never place any DMA
> mappings for a trusted device into shared memory.


swiotlb=force forces swiotlb, not decryption.

> That AMD has done something insane:
> 
> bool force_dma_unencrypted(struct device *dev)
> {
> 	/*
> 	 * For SEV, all DMA must be to unencrypted addresses.
> 	 */
> 	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
> 		return true;
> 
> 	/*
> 	 * For SME, all DMA must be to unencrypted addresses if the
> 	 * device does not support DMA to addresses that include the
> 	 * encryption mask.
> 	 */
> 	if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
> 		u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask));
> 		u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask,
> 						dev->bus_dma_limit);
> 
> 		if (dma_dev_mask <= dma_enc_mask)
> 			return true;
> 	}


So when I try "mem_encrypt=on iommu=pt swiotlb=force" with this patchset, it fails to boot. But it boots with a hack like this:

===
@@ -39,7 +41,7 @@ bool force_dma_unencrypted(struct device *dev)
                         return true;
         }
  
-       return false;
+       return swiotlb_force_bounce;
  }
===

Or we say "mem_encrypt=on iommu=pt swiotlb=force" combo is just weird and we won't be supporting which bit in this? Thanks,


> 
> Is an AMD issue. We already have an address mask limit system built
> into the DMA API, arch code should not be co-opting the CC mechanism
> to create a special pool for address limited devices.
> 
> The correct thing is to ensure the DMA API is checking any address
> limits on the actual true dma_addr_t, not on an intermediate like a
> phys_addr before it is adjusted with any C bit. Then it is a normal
> low address swiotlb bounce like any other.
> 
> I think we can ignore this Sashiko remark, in real systems the use of
> swiotlb for 64 bit devices is very rare. Though it would be good to
> remove this code from AMD...
> 
> Jason

-- 
Alexey


^ permalink raw reply

* Re: [PATCH v6 03/20] dma-direct: use DMA_ATTR_CC_SHARED in alloc/free paths
From: Alexey Kardashevskiy @ 2026-06-18  2:39 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Aneesh Kumar K.V (Arm), iommu, linux-arm-kernel, linux-kernel,
	linux-coco, Robin Murphy, Marek Szyprowski, Will Deacon,
	Marc Zyngier, Steven Price, Suzuki K Poulose, Catalin Marinas,
	Jiri Pirko, Mostafa Saleh, Petr Tesarik, Dan Williams, Xu Yilun,
	linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
	Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Sven Schnelle, x86, Jiri Pirko,
	Michael Kelley, Cheloha, Scott
In-Reply-To: <20260617154101.GE3577091@ziepe.ca>



On 18/6/26 01:41, Jason Gunthorpe wrote:
> On Wed, Jun 17, 2026 at 10:50:39AM +1000, Alexey Kardashevskiy wrote:
>>> @@ -193,16 +193,31 @@ void *dma_direct_alloc(struct device *dev, size_t size,
>>>    		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
>>>    {
>>>    	bool remap = false, set_uncached = false;
>>> -	bool mark_mem_decrypt = true;
>>> +	bool mark_mem_decrypt = false;
>>>    	struct page *page;
>>>    	void *ret;
>>> +	/*
>>> +	 * DMA_ATTR_CC_SHARED is not a caller-visible dma_alloc_*()
>>> +	 * attribute. The direct allocator uses it internally after it has
>>> +	 * decided that the backing pages must be shared/decrypted, so the
>>> +	 * rest of the allocation path can consistently select DMA addresses,
>>> +	 * choose compatible pools and restore encryption on free.
>>
>> Why this limit?
>>
>> Context: I am looking for a memory pool for a few shared pages (to
>> do some guest<->host communication), SWIOTLB seems like the right
>> fit but swiotlb_alloc() is not exported and
>> dma_direct_alloc(DMA_ATTR_CC_SHARED) is not allowed.  Thanks,
> 
> Then setup your struct device so that the DMA API knows the
> guest<->host channel requires unencrypted and it will work correctly.
> 
> I think this is a reasonable API to use for that, and I was just
> advocating that hyperv should be using it too.
> 
> But it all relies on a properly setup struct device.

Sounds good but how do I do that in practice? DMA_ATTR_CC_SHARED is not externally available so I'll have to trick the DMA layer into using SWIOTLB (which is still all shared, right?) as I specifically want to skip page conversions. Setting low DMA mask won't guarantee that the DMA layer won't allocate a page outside of SWIOTLB and convert it. Manually do

dev->dma_io_tlb_mem->force_bounce = true;
dev->dma_io_tlb_mem->for_alloc = true;

?
Or follow Aneesh's genpool approach? Thanks,


> 
> Jason

-- 
Alexey


^ permalink raw reply

* Re: [PATCH v8 3/7] crypto/ccp: Disable CPU hotplug while SNP is active
From: Kalra, Ashish @ 2026-06-17 22:23 UTC (permalink / raw)
  To: K Prateek Nayak, tglx, mingo, bp, dave.hansen, x86, hpa, seanjc,
	peterz, thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, Tycho.Andersen, Nathan.Fontenot,
	ackerleytng, jackyli, pgonda, rientjes, jacobhxu, xin,
	pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen, darwi,
	linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <763bff29-e737-4033-ab30-cec8fd3e7438@amd.com>

Hello Prateek,

On 6/16/2026 11:33 PM, K Prateek Nayak wrote:
> Hello Ashish,
> 
> On 6/16/2026 1:19 AM, Ashish Kalra wrote:
>> From: Ashish Kalra <ashish.kalra@amd.com>
>>
>> The SEV firmware enumerates the CPUs at SNP initialization and is not
>> aware of the OS bringing CPUs online or offline afterwards, so OS CPU
>> hotplug can diverge from the firmware's expectations and break SNP.
>> Disable CPU hotplug while SNP is active.
> 
> Dumb question: Is this specific to RMPOPT? Otherwise ...

The actual reason is purely about the SEV firmware: it enumerates the BIOS-enabled CPUs at SNP_INIT_EX
and has no knowledge of OS hotplug afterward. That's true whether or not RMPOPT exists. 
RMPOPT only benefits from the side effect, which is a stable rmpopt_cpumask and an uncontended cpus_read_lock()
in the work handler.

So it is specific to SNP, but RMPOPT patches that come later in the series rely on it, therefore it
is a pre-patch here.

> 
>>
>> SNP is fully torn down only on the SNP_SHUTDOWN_EX x86_snp_shutdown
>> path; the legacy path leaves SNP enabled in hardware while clearing
>> snp_initialized, so __sev_snp_init_locked() can run again.  Track the
>> disable with a flag so it is balanced by a matching enable rather than
>> stacked, and re-enable hotplug only on the x86_snp_shutdown path, after
>> snp_shutdown() has cleared the per-core RMPOPT_BASE MSRs with hotplug
>> still disabled.
>>
>> This also keeps the CPU set stable for the asynchronous RMPOPT scan
>> added later in this series, and ensures cpus_read_lock() in the scan
>> is uncontended.
>>
>> Suggested-by: Thomas Lendacky <thomas.lendacky@amd.com>
>> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
>> ---
>>  drivers/crypto/ccp/sev-dev.c | 29 ++++++++++++++++++++++++++++-
>>  1 file changed, 28 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
>> index 217b6b19802e..c8c3c577463c 100644
>> --- a/drivers/crypto/ccp/sev-dev.c
>> +++ b/drivers/crypto/ccp/sev-dev.c
>> @@ -106,6 +106,9 @@ struct snp_hv_fixed_pages_entry {
>>  
>>  static LIST_HEAD(snp_hv_fixed_pages);
>>  
>> +/* Set while SNP has CPU hotplug disabled. */
>> +static bool snp_cpu_hotplug_disabled;
>> +
>>  /* Trusted Memory Region (TMR):
>>   *   The TMR is a 1MB area that must be 1MB aligned.  Use the page allocator
>>   *   to allocate the memory, which will return aligned memory for the specified
>> @@ -1479,6 +1482,17 @@ static int __sev_snp_init_locked(int *error, unsigned int max_snp_asid)
>>  
>>  	snp_hv_fixed_pages_state_update(sev, HV_FIXED);
>>  
>> +	/*
>> +	 * Disable CPU hotplug while SNP is active.  Guard against stacking
>> +	 * the disable count: the legacy SNP_SHUTDOWN_EX path clears
>> +	 * snp_initialized without re-enabling hotplug, so this can run
>> +	 * again while hotplug is already disabled.
>> +	 */
>> +	if (!snp_cpu_hotplug_disabled) {
>> +		cpu_hotplug_disable();
>> +		snp_cpu_hotplug_disabled = true;
>> +	}
>> +
> 
> ... should this be done before __sev_do_cmd_locked(SEV_CMD_SNP_INIT_EX)
> is issued?
> 
> I'm assuming that is when the firmware enumerates the CPUs during SNP
> initialization and any hotplug after that should be disallowed?

Yes, it makes sense to do it before SNP_INIT_EX is issued.

Thanks,
Ashish

> 
>>  	snp_setup_rmpopt();
>>  
>>  	sev->snp_initialized = true;

^ permalink raw reply

* Re: [PATCH v8 4/7] x86/sev: Add support to perform RMP optimizations asynchronously
From: Kalra, Ashish @ 2026-06-17 21:57 UTC (permalink / raw)
  To: K Prateek Nayak, tglx, mingo, bp, dave.hansen, x86, hpa, seanjc,
	peterz, thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, Tycho.Andersen, Nathan.Fontenot,
	ackerleytng, jackyli, pgonda, rientjes, jacobhxu, xin,
	pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen, darwi,
	linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <75cf11f1-51fc-4f1a-a9a7-4b9403d2bb8b@amd.com>

Hello Prateek,

On 6/16/2026 11:20 PM, K Prateek Nayak wrote:
> Hello Ashish,
> 
> On 6/17/2026 1:26 AM, Kalra, Ashish wrote:
>> Hello Prateek,
>>
>> On 6/16/2026 2:27 AM, K Prateek Nayak wrote:
>>> Hello Ashish,
>>>
>>> On 6/16/2026 1:19 AM, Ashish Kalra wrote:
>>>> +	/*
>>>> +	 * RMPOPT scans the RMP table, stores the result of the scan in the
>>>> +	 * reserved processor memory. The RMP scan is the most expensive
>>>> +	 * part. If a second RMPOPT occurs, it can skip the expensive scan
>>>> +	 * if they can see a cached result in the reserved processor memory.
>>>> +	 *
>>>> +	 * Do RMPOPT on one CPU alone. Then, follow that up with RMPOPT
>>>> +	 * on every other primary thread. Followers are "designed to"
>>>> +	 * skip the scan if they see the "cached" scan results.
>>>> +	 */
>>>> +	cpumask_copy(follower_mask, &rmpopt_cpumask);
>>>
>>> rmpopt_cpumask is constructed after hotplug is disabled but ...
>>>
>>>> +
>>>> +	/*
>>>> +	 * Pin the worker to the current CPU for the leader loop so that
>>>> +	 * this_cpu remains valid and the RMPOPT instruction executes on
>>>> +	 * the correct CPU.
>>>> +	 *
>>>> +	 * Use migrate_disable() rather than get_cpu() to prevent
>>>> +	 * migration while still allowing preemption.
>>>> +	 */
>>>> +	migrate_disable();
>>>> +	this_cpu = smp_processor_id();
>>>> +
>>>> +	if (cpumask_test_cpu(this_cpu, follower_mask)) {
>>>> +		/*
>>>> +		 * Current CPU is a primary thread in rmpopt_cpumask.
>>>> +		 * Run leader locally and remove from follower mask.
>>>> +		 */
>>>> +		cpumask_clear_cpu(this_cpu, follower_mask);
>>>> +
>>>> +		for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
>>>> +			rmpopt(pa);
>>>> +			cond_resched();
>>>> +		}
>>>> +	} else if (cpumask_intersects(topology_sibling_cpumask(this_cpu),
>>>> +				      follower_mask)) {
>>>> +		/*
>>>> +		 * Current CPU is a sibling thread whose primary is in
>>>> +		 * rmpopt_cpumask.  RMPOPT_BASE MSR is per-core, so it
>>>> +		 * is safe to run the leader locally.  Remove the sibling's
>>>> +		 * primary from the follower mask as this core is already
>>>> +		 * covered by the leader.
>>>> +		 */
>>>> +		cpumask_andnot(follower_mask, follower_mask,
>>>> +			       topology_sibling_cpumask(this_cpu));
>>>> +
>>>> +		for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
>>>> +			rmpopt(pa);
>>>> +			cond_resched();
>>>> +		}
>>>> +	} else {
>>>> +		/*
>>>> +		 * Current CPU does not have RMPOPT_BASE MSR programmed.
>>>> +		 * Pick an explicit leader from the cpumask to avoid #UD.
>>>> +		 * Use work_on_cpu() to run in process context on the leader,
>>>> +		 * avoiding IPI latency.
>>>> +		 */
>>>
>>> ... this_cpu is neither in the "rmpopt_cpumask", nor is any of its
>>> siblings on "rmpopt_cpumask".
>>>
>>> How does that happen?
>>
>> Actually, this was the implementation before the CPU hotplug disable enforcement code was implemented and added in v8,
>> and i should have fixed this rmpopt_work_handler() accordingly for v8.
>>
>> With the enforced cpu hotplug disable support, case #3 here (above) is now dead code, and removing it lets
>> cases #1 and #2 collapse too.
>>
>> snp_prepare() requires cpu_online_mask == cpu_present_mask before SNP init — so when snp_setup_rmpopt() programs the MSRs, every
>> core's primary is online -> every core is in rmpopt_cpumask.
>>   
>> So now the work handler always runs on a CPU whose core is programmed. topology_sibling_cpumask(this_cpu) therefore always intersects
>> rmpopt_cpumask -> case #1 or #2 always matches.
>>
>> So i should actually drop case #3 here - which is: "this_cpu is neither in the "rmpopt_cpumask", nor is any of its
>> siblings on rmpopt_cpumask"
> 
> Ack.
> 
> Also the fact that cpu_mark_primary_thread() uses LSBs of APICID and if
> you have some insanely weird configuration - like boot with maxcpus=1,
> online all the secondary threads (CPUs 256-511 on a 256C/512T system),
> launch an SNP guest - it can actually leave everything except CORE0 out
> of the "rmpopt_cpumask".
> 
>>
>>
>>>
>>>> +		int leader_cpu = cpumask_first(follower_mask);
>>>> +
>>>> +		if (WARN_ON_ONCE(leader_cpu >= nr_cpu_ids)) {
>>>> +			migrate_enable();
>>>> +			goto out;
>>>> +		}
>>>> +
>>>> +		cpumask_clear_cpu(leader_cpu, follower_mask);
>>>> +
>>>> +		/* Release migration pin before work_on_cpu(). */
>>>> +		migrate_enable();
>>>> +
>>>> +		work_on_cpu(leader_cpu, rmpopt_leader_fn, NULL);
>>>
>>> This creates a delayed work and also waits for it to finish execution
>>> which will add more latency than a simple IPI if the comment about IPI
>>> latency above is accurate.
>>>
>>> I think there is some corner case in construction of the
>>> "rmpopt_cpumask" that requires this not-so-pretty else block. Can you
>>> elaborate why this is required?
>>>
>>> Perhaps the "rmpopt_cpumask" construction needs:
>>>
>>>     for_each_online_cpu(cpu) {
>>>         /* Nominate the first CPU on the sibling mask for RMPOPT */
>>>         if (cpu != cpumask_first(topology_sibling_cpumask(cpu)))
>>>             continue;
>>>         cpumask_set_cpu(cpu, &rmpopt_cpumask);
>>>     }
>>>
>>>
>>> and all you need here is:
>>>
>>>     /* Do RMPOPT for local core */
>>>     for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
>>>         rmpopt(pa);
>>>
>>>     /* Skip this core from concurrent RMPOPT */
>>>     cpumask_andnot(follower_mask, &rmpopt_cpumask, topology_sibling_cpumask(cpu));
>>>
>>> No?
>>>
>>
>> Yes, a simpler implementation will be like this: 
>> ...
>>
>>  	if (!alloc_cpumask_var(&follower_mask, GFP_KERNEL))
>>                 return;
>>
> 
> If you move the migrate_disable() here, you can simply do an andnot
> without needing to copy the rmpopt_cpumask beforehand and save on one
> cpumask iteration.

Yes, that's a nice optimization, we can read directly from rmpopt_cpumask and write follower_mask in one pass.

> 
>>  	cpumask_copy(follower_mask, &rmpopt_cpumask);
>>
>>         /*
>>          * The current CPU's core always has RMPOPT_BASE programmed
>>          * (snp_prepare() required all CPUs online at setup and CPU hotplug
>>          * is disabled while SNP is active), so it can always be the leader.
>>          * RMPOPT_BASE is per-core; exclude this core from the followers.
>>          */
>>         migrate_disable();
>>         cpumask_andnot(follower_mask, follower_mask,
>>                        topology_sibling_cpumask(smp_processor_id()));
>>
>>         for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
>>                 rmpopt(pa);
>>                 cond_resched();
>>         }
>>         migrate_enable();
>>
>>         cpus_read_lock();
> 
> I think you can even skip the cpus_read_lock() since we know for a
> fact that hotplug is disabled when we are here.
> 
> Perhaps we can have a lockdep_assert_cpu_hotplug_disabled() which
> ensures we'll get a splat if that assumption ever changes when
> running with LOCKDEP?

Yes, that is true when we have made sure that hotplug is disabled, but i think it is Ok
to keep cpus_read_lock() here as it keeps Sashiko happy.

> 
> I'll let others comment if that is a good idea or not.
> 
>>         for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
>>                 on_each_cpu_mask(follower_mask, rmpopt_smp, (void *)pa, true);
>>                 cond_resched();
>>         }
>>         cpus_read_unlock();
>>
>>         free_cpumask_var(follower_mask);
>>
>>
>>  Here, the leader exclusion must use the sibling mask, not clear_cpu(this_cpu). That's why my collapsed version uses:
>>
>>         cpumask_andnot(follower_mask, follower_mask,
>>                        topology_sibling_cpumask(smp_processor_id()));
>>
>>   - If this_cpu is a primary: its sibling mask contains itself (the primary) -> andnot removes this core's primary from the followers.
>>   
>>   - If this_cpu is a secondary: it isn't in follower_mask at all, but its sibling mask contains its primary, which is in
>>   follower_mask -> andnot still removes this core's primary. 
>>
>>   So either way the current core is dropped from the followers. (The old code needed two branches because case #1 used
>>   clear_cpu(this_cpu) — only correct when this_cpu is the primary — while case #2 used the sibling andnot. The single andnot works for
>>   both cases).
> 
> Ack! And I think this looks much cleaner (to my eyes at least ;-)
> 

Thanks,
Ashish

^ permalink raw reply

* Re: SVSM Development Call June 17th, 2026
From: Jörg Rödel @ 2026-06-17 20:35 UTC (permalink / raw)
  To: coconut-svsm, linux-coco
In-Reply-To: <ajF1XEHc42IG07Qw@8bytes.org>

Meeting minutes are ready:

	https://github.com/coconut-svsm/governance/pull/113

-Joerg

^ permalink raw reply

* Re: [PATCH v6 03/20] dma-direct: use DMA_ATTR_CC_SHARED in alloc/free paths
From: Jason Gunthorpe @ 2026-06-17 15:41 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Aneesh Kumar K.V (Arm), iommu, linux-arm-kernel, linux-kernel,
	linux-coco, Robin Murphy, Marek Szyprowski, Will Deacon,
	Marc Zyngier, Steven Price, Suzuki K Poulose, Catalin Marinas,
	Jiri Pirko, Mostafa Saleh, Petr Tesarik, Dan Williams, Xu Yilun,
	linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
	Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Sven Schnelle, x86, Jiri Pirko,
	Michael Kelley, Cheloha, Scott
In-Reply-To: <845d0c8a-6d51-47aa-8e0b-8381e733444a@amd.com>

On Wed, Jun 17, 2026 at 10:50:39AM +1000, Alexey Kardashevskiy wrote:
> > @@ -193,16 +193,31 @@ void *dma_direct_alloc(struct device *dev, size_t size,
> >   		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
> >   {
> >   	bool remap = false, set_uncached = false;
> > -	bool mark_mem_decrypt = true;
> > +	bool mark_mem_decrypt = false;
> >   	struct page *page;
> >   	void *ret;
> > +	/*
> > +	 * DMA_ATTR_CC_SHARED is not a caller-visible dma_alloc_*()
> > +	 * attribute. The direct allocator uses it internally after it has
> > +	 * decided that the backing pages must be shared/decrypted, so the
> > +	 * rest of the allocation path can consistently select DMA addresses,
> > +	 * choose compatible pools and restore encryption on free.
> 
> Why this limit?
> 
> Context: I am looking for a memory pool for a few shared pages (to
> do some guest<->host communication), SWIOTLB seems like the right
> fit but swiotlb_alloc() is not exported and
> dma_direct_alloc(DMA_ATTR_CC_SHARED) is not allowed.  Thanks,

Then setup your struct device so that the DMA API knows the
guest<->host channel requires unencrypted and it will work correctly.

I think this is a reasonable API to use for that, and I was just
advocating that hyperv should be using it too.

But it all relies on a properly setup struct device.

Jason

^ permalink raw reply

* Re: [PATCH v6 03/20] dma-direct: use DMA_ATTR_CC_SHARED in alloc/free paths
From: Aneesh Kumar K.V @ 2026-06-17 14:46 UTC (permalink / raw)
  To: Alexey Kardashevskiy, iommu, linux-arm-kernel, linux-kernel,
	linux-coco
  Cc: Robin Murphy, Marek Szyprowski, Will Deacon, Marc Zyngier,
	Steven Price, Suzuki K Poulose, Catalin Marinas, Jiri Pirko,
	Jason Gunthorpe, Mostafa Saleh, Petr Tesarik, Dan Williams,
	Xu Yilun, linuxppc-dev, linux-s390, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
	Alexander Gordeev, Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Sven Schnelle, x86, Jiri Pirko,
	Michael Kelley, Cheloha, Scott
In-Reply-To: <845d0c8a-6d51-47aa-8e0b-8381e733444a@amd.com>

Alexey Kardashevskiy <aik@amd.com> writes:

> On 4/6/26 18:39, Aneesh Kumar K.V (Arm) wrote:
>> Propagate force_dma_unencrypted() into DMA_ATTR_CC_SHARED in the
>> dma-direct allocation path and use the attribute to drive the related
>> decisions.
>> 
>> This updates dma_direct_alloc(), dma_direct_free(), and
>> dma_direct_alloc_pages() to fold the forced unencrypted case into attrs.
>> 
>> Tested-by: Jiri Pirko <jiri@nvidia.com>
>> Tested-by: Michael Kelley <mhklinux@outlook.com>
>> Tested-by: Mostafa Saleh <smostafa@google.com>
>> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
>> ---
>>   kernel/dma/direct.c | 53 +++++++++++++++++++++++++++++++++++++--------
>>   1 file changed, 44 insertions(+), 9 deletions(-)
>> 
>> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
>> index a741c8a2ee66..90dc5057a0c0 100644
>> --- a/kernel/dma/direct.c
>> +++ b/kernel/dma/direct.c
>> @@ -193,16 +193,31 @@ void *dma_direct_alloc(struct device *dev, size_t size,
>>   		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
>>   {
>>   	bool remap = false, set_uncached = false;
>> -	bool mark_mem_decrypt = true;
>> +	bool mark_mem_decrypt = false;
>>   	struct page *page;
>>   	void *ret;
>>   
>> +	/*
>> +	 * DMA_ATTR_CC_SHARED is not a caller-visible dma_alloc_*()
>> +	 * attribute. The direct allocator uses it internally after it has
>> +	 * decided that the backing pages must be shared/decrypted, so the
>> +	 * rest of the allocation path can consistently select DMA addresses,
>> +	 * choose compatible pools and restore encryption on free.
>
> Why this limit?
>
> Context: I am looking for a memory pool for a few shared pages (to do
> some guest<->host communication), SWIOTLB seems like the right fit but
> swiotlb_alloc() is not exported and
> dma_direct_alloc(DMA_ATTR_CC_SHARED) is not allowed. Thanks,
>

swiotlb is not the right pool to use for that, right?
CCA had a similar requirement for ITS pages and ended up creating a genpool:
b08e2f42e86b ("irqchip/gic-v3-its: Share ITS tables with a non-trusted hypervisor")

-aneesh

^ permalink raw reply

* Re: [PATCH RFC 0/3] KVM: guest_memfd: folio migration for non-confidential VMs
From: David Hildenbrand (Arm) @ 2026-06-17 10:34 UTC (permalink / raw)
  To: Ackerley Tng, Sean Christopherson, Alexandru Elisei
  Cc: Shivank Garg, Matthew Wilcox (Oracle), Jan Kara, Andrew Morton,
	Vlastimil Babka, Suren Baghdasaryan, Michal Hocko,
	Brendan Jackman, Johannes Weiner, Zi Yan, Matthew Brost,
	Joshua Hahn, Rakie Kim, Byungchul Park, Gregory Price, Ying Huang,
	Alistair Popple, Paolo Bonzini, Shuah Khan, Chao Peng,
	Nikunj A Dadhania, Ira Weiny, Michael Roth, Pankaj Gupta,
	Fuad Tabba, Vishal Annapurve, Nikita Kalyazin, Patrick Roy,
	Pratik Sampat, Ashish Kalra, linux-fsdevel, linux-coco, linux-mm,
	linux-kernel, kvm, linux-kselftest
In-Reply-To: <CAEvNRgFQLEsKanKrj=ePHoShiY2cgQgxtGs_2CJcZHP=JOjidg@mail.gmail.com>

On 6/16/26 20:09, Ackerley Tng wrote:
> "David Hildenbrand (Arm)" <david@kernel.org> writes:
> 
>> On 6/15/26 19:39, Sean Christopherson wrote:
>>>
>>> +1000.  It's not just "nice to have", it's a core design principle of guest_memfd.
>>
>> Right, and I raised in the guest_memfd call also the rough idea of Alexandru's
>> use case of having non-movable guest_memfd pages such that we can support use
>> cases where we can hopefully guarantee that a stage-2 mapping will not just
>> randomly go away.
>>
>>>
> 
> More concretely, are y'all pointing towards a
> GUEST_MEMFD_FLAG_MIGRATABLE, which will set .migrate =
> kvm_gmem_migrate_folio, and for now, error out for CoCo VMs?
> 
>>>
>>> For the purposes of this discussion, we should separate the physical act of
>>> migrating pages from the features that trigger migration.  As I said in last week's
>>> guest-memfd call, I am a-ok with supporting page migration as a mechanism, but I
>>> am dead set against supporting NUMA balancing, KSM, LRU-based swap/reclaim, and
>>> anything else that goes against the goal of guest-first memory.
>>
>> Right. Page migration for supporting ZONE_MOVABLE/CMA, compaction, memory
>> offlining, virtio-mem and possibly some collapse mechanism if we were to support
>> THP of some sorts in guest_memfd would all be reasonable.
>>
> 
> Background question: how would virtio-mem use migration in the host/guest_memfd?

Good question! As long as there is no nested-virt support (and virtio-mem
support for coco still being in the making) that wouldn't apply, only ordinary
memory hot(un)plug (incl CXL).

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH RFC 0/3] KVM: guest_memfd: folio migration for non-confidential VMs
From: Garg, Shivank @ 2026-06-17 10:17 UTC (permalink / raw)
  To: Sean Christopherson, Alexandru Elisei
  Cc: Matthew Wilcox (Oracle), Jan Kara, Andrew Morton, Vlastimil Babka,
	Suren Baghdasaryan, Michal Hocko, Brendan Jackman,
	Johannes Weiner, Zi Yan, David Hildenbrand, Matthew Brost,
	Joshua Hahn, Rakie Kim, Byungchul Park, Gregory Price, Ying Huang,
	Alistair Popple, Paolo Bonzini, Shuah Khan, Chao Peng,
	Nikunj A Dadhania, Ira Weiny, Michael Roth, Pankaj Gupta,
	Ackerley Tng, Fuad Tabba, Vishal Annapurve, Nikita Kalyazin,
	Patrick Roy, Pratik Sampat, Ashish Kalra, linux-fsdevel,
	linux-coco, linux-mm, linux-kernel, kvm, linux-kselftest
In-Reply-To: <ajA4z_Wkb93cTW4m@google.com>



On 6/15/2026 11:09 PM, Sean Christopherson wrote:
> On Mon, Jun 15, 2026, Alexandru Elisei wrote:
>> Hi,
>>
>> On Mon, Jun 15, 2026 at 11:43:14AM +0100, Alexandru Elisei wrote:
>>> Hi,
>>>
>>> On Thu, Jun 11, 2026 at 01:05:07PM +0000, Shivank Garg wrote:
>>>> guest_memfd folios are currently marked unmovable, so the kernel cannot
>>>> perform NUMA-balancing, memory compaction, etc. This is unavoidable for
>>>> confidential VMs (SEV-SNP, TDX), since memory is encrypted and copying it
>>>> needs firmware assistance. However, for non-confidential VMs (like
>>>> Firecracker), we can migrate the folios.
>>>>
>>>> This series enables folio migration for non-confidential guest_memfd and
>>>> also lays the groundwork for migrating confidential guest_memfd later.
>>>> Once firmware-assisted copying support is available, those VMs can be
>>>> made movable, the confidential folio content can be copied separately,
>>>> and the destination folio marked with FOLIO_CONTENT_COPIED so
>>>> __migrate_folio() skips the host-side folio_mc_copy().
>>>
>>> I always thought that one of the nice things about using guest_memfd as a
>>> memory backend, as opposed to host userspace mappings, is that the host
>>> cannot unmap VM memory because of KSM, automatic NUMA balancing, hugepage
>>> collapse, compaction, etc, acting on the host userspace mapping of the
>>> VM memory, and outside of the VMM's or KVM's control.
> 
> +1000.  It's not just "nice to have", it's a core design principle of guest_memfd.
> 
>>> I think it would be useful to preserve this behaviour, even in the absence
>>> of confidential VMs (i.e, guest_memfd file descriptor created with
>>> GUEST_MEMFD_FLAG_MMAP).
>>
>> Just to be clear, I was thinking that it might be useful for both
>> behaviours to exist (migratable and non-migratable) for non-confidential
>> VMs, and allow KVM or userspace to decide which they prefer for a
>> guest_memfd.
> 
> For the purposes of this discussion, we should separate the physical act of
> migrating pages from the features that trigger migration.  As I said in last week's
> guest-memfd call, I am a-ok with supporting page migration as a mechanism, but I
> am dead set against supporting NUMA balancing, KSM, LRU-based swap/reclaim, and
> anything else that goes against the goal of guest-first memory.
> 
> If userspace wants mm/ functionality, then use anon, memfd, hugetlb, shmem, etc.
> 
> Shivank, what's the immediate motivation for this series?

Hi Sean,
This makes sense!

Tbh, my main motivation was to start a dialogue on this, since the
implementation+testing itself was easy.

Compaction and memory failure handling were the cases I initially
had in mind. And as David noted, ZONE_MOVABLE/CMA, compaction, memory
offlining, virtio-mem cases would be useful too.

I fully agree that features like NUMA balancing, LRU/reclaim, etc.
should stay out, keeping migration as a mechanism only for
guest_memfd.

Thanks,
Shivank

^ permalink raw reply

* Re: [PATCH v13 03/22] KVM: selftests: Initialize the TDX VM
From: Xiaoyao Li @ 2026-06-17  9:50 UTC (permalink / raw)
  To: Lisa Wang, Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao,
	Chenyi Qiang, Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
	Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
	Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
	Sagi Shahar, Sean Christopherson, Shuah Khan, Oliver Upton
  Cc: Jeremiah McReynolds, kvm, linux-coco, linux-kernel, x86
In-Reply-To: <20260521-tdx-selftests-v13-v13-3-6983ae4c3a4d@google.com>

On 5/22/2026 7:16 AM, Lisa Wang wrote:
> diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
> index b68ad1dc7e02..8d06e7186df1 100644
> --- a/tools/testing/selftests/kvm/lib/x86/processor.c
> +++ b/tools/testing/selftests/kvm/lib/x86/processor.c
> @@ -802,6 +802,9 @@ void kvm_arch_vm_post_create(struct kvm_vm *vm, unsigned int nr_vcpus)
>   		vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);
>   	}
>   
> +	if (is_tdx_vm(vm))
> +		tdx_init_vm(vm, 0);
> +

It fails compilation:

kvm/tools/testing/selftests/kvm/lib/x86/processor.c:806:(.text+0x212c): 
undefined reference to `tdx_init_vm'

We need to grab the Makefile.kvm change from Patch 10 into this patch.

diff --git a/tools/testing/selftests/kvm/Makefile.kvm 
b/tools/testing/selftests/kvm/Makefile.kvm
index e5769268936a..0107ba02b01c 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -29,6 +29,7 @@ LIBKVM_x86 += lib/x86/sev.c
  LIBKVM_x86 += lib/x86/svm.c
  LIBKVM_x86 += lib/x86/ucall.c
  LIBKVM_x86 += lib/x86/vmx.c
+LIBKVM_x86 += lib/x86/tdx/tdx_util.c

  LIBKVM_arm64 += lib/arm64/gic.c
  LIBKVM_arm64 += lib/arm64/gic_v3.c

^ permalink raw reply related

* Re: [PATCH v8 3/7] crypto/ccp: Disable CPU hotplug while SNP is active
From: K Prateek Nayak @ 2026-06-17  4:33 UTC (permalink / raw)
  To: Ashish Kalra, tglx, mingo, bp, dave.hansen, x86, hpa, seanjc,
	peterz, thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, Tycho.Andersen, Nathan.Fontenot,
	ackerleytng, jackyli, pgonda, rientjes, jacobhxu, xin,
	pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen, darwi,
	linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <1feccf6e2a56d949b30f403c0ca7949f580e5982.1781419998.git.ashish.kalra@amd.com>

Hello Ashish,

On 6/16/2026 1:19 AM, Ashish Kalra wrote:
> From: Ashish Kalra <ashish.kalra@amd.com>
> 
> The SEV firmware enumerates the CPUs at SNP initialization and is not
> aware of the OS bringing CPUs online or offline afterwards, so OS CPU
> hotplug can diverge from the firmware's expectations and break SNP.
> Disable CPU hotplug while SNP is active.

Dumb question: Is this specific to RMPOPT? Otherwise ...

> 
> SNP is fully torn down only on the SNP_SHUTDOWN_EX x86_snp_shutdown
> path; the legacy path leaves SNP enabled in hardware while clearing
> snp_initialized, so __sev_snp_init_locked() can run again.  Track the
> disable with a flag so it is balanced by a matching enable rather than
> stacked, and re-enable hotplug only on the x86_snp_shutdown path, after
> snp_shutdown() has cleared the per-core RMPOPT_BASE MSRs with hotplug
> still disabled.
> 
> This also keeps the CPU set stable for the asynchronous RMPOPT scan
> added later in this series, and ensures cpus_read_lock() in the scan
> is uncontended.
> 
> Suggested-by: Thomas Lendacky <thomas.lendacky@amd.com>
> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
> ---
>  drivers/crypto/ccp/sev-dev.c | 29 ++++++++++++++++++++++++++++-
>  1 file changed, 28 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
> index 217b6b19802e..c8c3c577463c 100644
> --- a/drivers/crypto/ccp/sev-dev.c
> +++ b/drivers/crypto/ccp/sev-dev.c
> @@ -106,6 +106,9 @@ struct snp_hv_fixed_pages_entry {
>  
>  static LIST_HEAD(snp_hv_fixed_pages);
>  
> +/* Set while SNP has CPU hotplug disabled. */
> +static bool snp_cpu_hotplug_disabled;
> +
>  /* Trusted Memory Region (TMR):
>   *   The TMR is a 1MB area that must be 1MB aligned.  Use the page allocator
>   *   to allocate the memory, which will return aligned memory for the specified
> @@ -1479,6 +1482,17 @@ static int __sev_snp_init_locked(int *error, unsigned int max_snp_asid)
>  
>  	snp_hv_fixed_pages_state_update(sev, HV_FIXED);
>  
> +	/*
> +	 * Disable CPU hotplug while SNP is active.  Guard against stacking
> +	 * the disable count: the legacy SNP_SHUTDOWN_EX path clears
> +	 * snp_initialized without re-enabling hotplug, so this can run
> +	 * again while hotplug is already disabled.
> +	 */
> +	if (!snp_cpu_hotplug_disabled) {
> +		cpu_hotplug_disable();
> +		snp_cpu_hotplug_disabled = true;
> +	}
> +

... should this be done before __sev_do_cmd_locked(SEV_CMD_SNP_INIT_EX)
is issued?

I'm assuming that is when the firmware enumerates the CPUs during SNP
initialization and any hotplug after that should be disallowed?

>  	snp_setup_rmpopt();
>  
>  	sev->snp_initialized = true;
-- 
Thanks and Regards,
Prateek


^ permalink raw reply

* Re: [PATCH v8 4/7] x86/sev: Add support to perform RMP optimizations asynchronously
From: K Prateek Nayak @ 2026-06-17  4:20 UTC (permalink / raw)
  To: Kalra, Ashish, tglx, mingo, bp, dave.hansen, x86, hpa, seanjc,
	peterz, thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, Tycho.Andersen, Nathan.Fontenot,
	ackerleytng, jackyli, pgonda, rientjes, jacobhxu, xin,
	pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen, darwi,
	linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <8c5f4082-e3a5-4f65-b058-33938a7ee324@amd.com>

Hello Ashish,

On 6/17/2026 1:26 AM, Kalra, Ashish wrote:
> Hello Prateek,
> 
> On 6/16/2026 2:27 AM, K Prateek Nayak wrote:
>> Hello Ashish,
>>
>> On 6/16/2026 1:19 AM, Ashish Kalra wrote:
>>> +	/*
>>> +	 * RMPOPT scans the RMP table, stores the result of the scan in the
>>> +	 * reserved processor memory. The RMP scan is the most expensive
>>> +	 * part. If a second RMPOPT occurs, it can skip the expensive scan
>>> +	 * if they can see a cached result in the reserved processor memory.
>>> +	 *
>>> +	 * Do RMPOPT on one CPU alone. Then, follow that up with RMPOPT
>>> +	 * on every other primary thread. Followers are "designed to"
>>> +	 * skip the scan if they see the "cached" scan results.
>>> +	 */
>>> +	cpumask_copy(follower_mask, &rmpopt_cpumask);
>>
>> rmpopt_cpumask is constructed after hotplug is disabled but ...
>>
>>> +
>>> +	/*
>>> +	 * Pin the worker to the current CPU for the leader loop so that
>>> +	 * this_cpu remains valid and the RMPOPT instruction executes on
>>> +	 * the correct CPU.
>>> +	 *
>>> +	 * Use migrate_disable() rather than get_cpu() to prevent
>>> +	 * migration while still allowing preemption.
>>> +	 */
>>> +	migrate_disable();
>>> +	this_cpu = smp_processor_id();
>>> +
>>> +	if (cpumask_test_cpu(this_cpu, follower_mask)) {
>>> +		/*
>>> +		 * Current CPU is a primary thread in rmpopt_cpumask.
>>> +		 * Run leader locally and remove from follower mask.
>>> +		 */
>>> +		cpumask_clear_cpu(this_cpu, follower_mask);
>>> +
>>> +		for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
>>> +			rmpopt(pa);
>>> +			cond_resched();
>>> +		}
>>> +	} else if (cpumask_intersects(topology_sibling_cpumask(this_cpu),
>>> +				      follower_mask)) {
>>> +		/*
>>> +		 * Current CPU is a sibling thread whose primary is in
>>> +		 * rmpopt_cpumask.  RMPOPT_BASE MSR is per-core, so it
>>> +		 * is safe to run the leader locally.  Remove the sibling's
>>> +		 * primary from the follower mask as this core is already
>>> +		 * covered by the leader.
>>> +		 */
>>> +		cpumask_andnot(follower_mask, follower_mask,
>>> +			       topology_sibling_cpumask(this_cpu));
>>> +
>>> +		for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
>>> +			rmpopt(pa);
>>> +			cond_resched();
>>> +		}
>>> +	} else {
>>> +		/*
>>> +		 * Current CPU does not have RMPOPT_BASE MSR programmed.
>>> +		 * Pick an explicit leader from the cpumask to avoid #UD.
>>> +		 * Use work_on_cpu() to run in process context on the leader,
>>> +		 * avoiding IPI latency.
>>> +		 */
>>
>> ... this_cpu is neither in the "rmpopt_cpumask", nor is any of its
>> siblings on "rmpopt_cpumask".
>>
>> How does that happen?
> 
> Actually, this was the implementation before the CPU hotplug disable enforcement code was implemented and added in v8,
> and i should have fixed this rmpopt_work_handler() accordingly for v8.
> 
> With the enforced cpu hotplug disable support, case #3 here (above) is now dead code, and removing it lets
> cases #1 and #2 collapse too.
> 
> snp_prepare() requires cpu_online_mask == cpu_present_mask before SNP init — so when snp_setup_rmpopt() programs the MSRs, every
> core's primary is online -> every core is in rmpopt_cpumask.
>   
> So now the work handler always runs on a CPU whose core is programmed. topology_sibling_cpumask(this_cpu) therefore always intersects
> rmpopt_cpumask -> case #1 or #2 always matches.
> 
> So i should actually drop case #3 here - which is: "this_cpu is neither in the "rmpopt_cpumask", nor is any of its
> siblings on rmpopt_cpumask"

Ack.

Also, note that cpu_mark_primary_thread() uses the LSBs of the APICID,
so if you have some insanely weird configuration - like boot with
maxcpus=1, online all the secondary threads (CPUs 256-511 on a
256C/512T system), launch an SNP guest - it can actually leave
everything except CORE0 out of the "rmpopt_cpumask".

> 
> 
>>
>>> +		int leader_cpu = cpumask_first(follower_mask);
>>> +
>>> +		if (WARN_ON_ONCE(leader_cpu >= nr_cpu_ids)) {
>>> +			migrate_enable();
>>> +			goto out;
>>> +		}
>>> +
>>> +		cpumask_clear_cpu(leader_cpu, follower_mask);
>>> +
>>> +		/* Release migration pin before work_on_cpu(). */
>>> +		migrate_enable();
>>> +
>>> +		work_on_cpu(leader_cpu, rmpopt_leader_fn, NULL);
>>
>> This creates a delayed work and also waits for it to finish execution
>> which will add more latency than a simple IPI if the comment about IPI
>> latency above is accurate.
>>
>> I think there is some corner case in construction of the
>> "rmpopt_cpumask" that requires this not-so-pretty else block. Can you
>> elaborate why this is required?
>>
>> Perhaps the "rmpopt_cpumask" construction needs:
>>
>>     for_each_online_cpu(cpu) {
>>         /* Nominate the first CPU on the sibling mask for RMPOPT */
>>         if (cpu != cpumask_first(topology_sibling_cpumask(cpu)))
>>             continue;
>>         cpumask_set_cpu(cpu, &rmpopt_cpumask);
>>     }
>>
>>
>> and all you need here is:
>>
>>     /* Do RMPOPt for local core */
>>     for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
>>         rmpopt(pa);
>>
>>     /* Skip this core from concurrent RMPOPT */
>>     cpumask_and_not(follower_mask, &rmpopt_cpumask, topology_sibling_cpumask(cpu));
>>
>> No?
>>
> 
> Yes, a simpler implementation will be like this: 
> ...
> 
>  	if (!alloc_cpumask_var(&follower_mask, GFP_KERNEL))
>                 return;
> 

If you move the migrate_disable() here, you can simply do an andnot
without needing to copy the rmpopt_cpumask beforehand and save on one
cpumask iteration.

>  	cpumask_copy(follower_mask, &rmpopt_cpumask);
> 
>         /*
>          * The current CPU's core always has RMPOPT_BASE programmed
>          * (snp_prepare() required all CPUs online at setup and CPU hotplug
>          * is disabled while SNP is active), so it can always be the leader.
>          * RMPOPT_BASE is per-core; exclude this core from the followers.
>          */
>         migrate_disable();
>         cpumask_andnot(follower_mask, follower_mask,
>                        topology_sibling_cpumask(smp_processor_id()));
> 
>         for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
>                 rmpopt(pa);
>                 cond_resched();
>         }
>         migrate_enable();
> 
>         cpus_read_lock();

I think you can even skip the cpus_read_lock() since we know for a
fact that hotplug is disabled when we are here.

Perhaps we can have a lockdep_assert_cpu_hotplug_disabled() which
ensures we'll get a splat if that assumption ever changes when
running with LOCKDEP?

I'll let others comment if that is a good idea or not.

>         for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
>                 on_each_cpu_mask(follower_mask, rmpopt_smp, (void *)pa, true);
>                 cond_resched();
>         }
>         cpus_read_unlock();
> 
>         free_cpumask_var(follower_mask);
> 
> 
>  Here, the leader exclusion must use the sibling mask, not clear_cpu(this_cpu). That's why my collapsed version uses:
> 
>         cpumask_andnot(follower_mask, follower_mask,
>                        topology_sibling_cpumask(smp_processor_id()));
> 
>   - If this_cpu is a primary: its sibling mask contains itself (the primary) -> andnot removes this core's primary from the followers.
>   
>   - If this_cpu is a secondary: it isn't in follower_mask at all, but its sibling mask contains its primary, which is in
>   follower_mask -> andnot still removes this core's primary. 
> 
>   So either way the current core is dropped from the followers. (The old code needed two branches because case #1 used
>   clear_cpu(this_cpu) — only correct when this_cpu is the primary — while case #2 used the sibling andnot. The single andnot works for
>   both cases).

Ack! And I think this looks much cleaner (to my eyes at least ;-)

-- 
Thanks and Regards,
Prateek


^ permalink raw reply

* Re: [PATCH v13 03/22] KVM: selftests: Initialize the TDX VM
From: Xiaoyao Li @ 2026-06-17  3:54 UTC (permalink / raw)
  To: Lisa Wang, Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao,
	Chenyi Qiang, Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
	Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
	Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
	Sagi Shahar, Sean Christopherson, Shuah Khan, Oliver Upton
  Cc: Jeremiah McReynolds, kvm, linux-coco, linux-kernel, x86
In-Reply-To: <20260521-tdx-selftests-v13-v13-3-6983ae4c3a4d@google.com>

On 5/22/2026 7:16 AM, Lisa Wang wrote:
> +/*
> + * Filter CPUID based on TDX supported capabilities
> + *
> + * Input Args:
> + *   vm - Virtual Machine
> + *   cpuid_data - CPUID fields to filter
> + *
> + * Output Args: None
> + *
> + * Return: None
> + *
> + * For each CPUID leaf, filter out non-supported bits based on the capabilities reported
> + * by the TDX module
> + */

s/non-supported/unsupported/

and break the line to <80 chars

^ permalink raw reply

* Re: [PATCH v13 04/22] KVM: selftests: TDX: Use KVM_TDX_CAPABILITIES to validate TDs' attribute configuration
From: Xiaoyao Li @ 2026-06-17  3:51 UTC (permalink / raw)
  To: Lisa Wang, Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao,
	Chenyi Qiang, Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
	Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
	Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
	Sagi Shahar, Sean Christopherson, Shuah Khan, Oliver Upton
  Cc: Jeremiah McReynolds, kvm, linux-coco, linux-kernel, x86
In-Reply-To: <20260521-tdx-selftests-v13-v13-4-6983ae4c3a4d@google.com>

On 5/22/2026 7:16 AM, Lisa Wang wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> Make sure that all the attributes enabled by the test are reported as
> supported by both the TDX module and KVM. KVM filters out the attributes
> not supported by itself.
> 
> This also exercises the KVM_TDX_CAPABILITIES ioctl.
> 
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> Co-developed-by: Sagi Shahar <sagis@google.com>
> Signed-off-by: Sagi Shahar <sagis@google.com>
> Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
> Reviewed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Lisa Wang <wyihan@google.com>

Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>

> ---
>   tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c | 14 ++++++++++++++
>   1 file changed, 14 insertions(+)
> 
> diff --git a/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c b/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
> index 868ff62e22f2..e5c998874a0d 100644
> --- a/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
> +++ b/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
> @@ -110,6 +110,18 @@ static void tdx_filter_cpuid(struct kvm_vm *vm,
>   	free(tdx_cap);
>   }
>   
> +static void tdx_check_attributes(struct kvm_vm *vm, u64 attributes)
> +{
> +	struct kvm_tdx_capabilities *tdx_cap;
> +
> +	tdx_cap = tdx_read_capabilities(vm);

well, this is another caller of tdx_read_capabilities().

As I commented in the previous patch, it's worth caching the result in 
tdx_read_capabilities() like what kvm_get_supported_cpuid() does for 
kvm_supported_cpuid.

And it would help print the debug output only once.

> +	/* Make sure all the attributes are reported as supported */
> +	TEST_ASSERT_EQ(attributes & tdx_cap->supported_attrs, attributes);
> +
> +	free(tdx_cap);
> +}
> +
>   void tdx_init_vm(struct kvm_vm *vm, u64 attributes)
>   {
>   	struct kvm_tdx_init_vm *init_vm;
> @@ -129,6 +141,8 @@ void tdx_init_vm(struct kvm_vm *vm, u64 attributes)
>   	memcpy(&init_vm->cpuid, cpuid, kvm_cpuid2_size(cpuid->nent));
>   	free(cpuid);
>   
> +	tdx_check_attributes(vm, attributes);
> +
>   	init_vm->attributes = attributes;
>   
>   	tdx_vm_ioctl(vm, KVM_TDX_INIT_VM, 0, init_vm);
> 


^ permalink raw reply

* Re: [PATCH v13 03/22] KVM: selftests: Initialize the TDX VM
From: Xiaoyao Li @ 2026-06-17  3:21 UTC (permalink / raw)
  To: Lisa Wang, Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao,
	Chenyi Qiang, Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
	Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
	Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
	Sagi Shahar, Sean Christopherson, Shuah Khan, Oliver Upton
  Cc: Jeremiah McReynolds, kvm, linux-coco, linux-kernel, x86
In-Reply-To: <20260521-tdx-selftests-v13-v13-3-6983ae4c3a4d@google.com>

On 5/22/2026 7:16 AM, Lisa Wang wrote:
> From: Sagi Shahar <sagis@google.com>
> 
> Add tdx_init_vm() to handle the mandatory VM-level initialization
> sequence required for Intel TDX.
> 
> For TDX, the guest's CPUID configuration must be "sealed" during
> KVM_TDX_INIT_VM before any vCPUs are created. This is necessary because
> the TDX hardware directly virtualizes CPUID and includes the
> configuration in the guest's initial security measurement.
> 
> The helper calculates the required CPUID values by filtering the host-
> supported bits (kvm_get_supported_cpuid) against the "directly
> configurable" bits reported by KVM_TDX_CAPABILITIES, ensuring
> compliance with the strict requirements of the TDH.MNG.INIT SEAMCALL.
> 
> Co-developed-by: Isaku Yamahata <isaku.yamahata@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> Co-developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> Signed-off-by: Sagi Shahar <sagis@google.com>
> Reviewed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Lisa Wang <wyihan@google.com>
> ---
>   .../selftests/kvm/include/x86/tdx/tdx_util.h       |  30 +++++
>   tools/testing/selftests/kvm/lib/x86/processor.c    |   3 +
>   tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c | 137 +++++++++++++++++++++
>   3 files changed, 170 insertions(+)
> 
> diff --git a/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h b/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h
> index f647e6ca6b34..48d4bd36c35b 100644
> --- a/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h
> +++ b/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h
> @@ -11,4 +11,34 @@ static inline bool is_tdx_vm(struct kvm_vm *vm)
>   	return vm->type == KVM_X86_TDX_VM;
>   }
>   
> +/*
> + * TDX ioctls
> + * Use underscores to avoid collisions with struct member names.
> + */
> +#define __tdx_vm_ioctl(vm, cmd, _flags, arg)				\
> +({									\
> +	int r;								\
> +									\
> +	union {								\
> +		struct kvm_tdx_cmd c;					\
> +		unsigned long raw;					\
> +	} tdx_cmd = { .c = {						\
> +		.id = (cmd),						\
> +		.flags = (u32)(_flags),				\
> +		.data = (u64)(arg),				\
> +	} };								\
> +									\
> +	r = __vm_ioctl(vm, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd.raw);	\
> +	r ?: tdx_cmd.c.hw_error;					\
> +})

It looks like __tdx_vm_ioctl() can be implemented as a static inline function.

Given that all the existing xxx_ioctl() helpers are implemented as macros, I'm OK with it.

> +
> +#define tdx_vm_ioctl(vm, cmd, flags, arg)				\
> +({									\
> +	int ret = __tdx_vm_ioctl(vm, cmd, flags, arg);			\
> +									\
> +	__TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd,	ret, vm);		\
> +})
> +
> +void tdx_init_vm(struct kvm_vm *vm, u64 attributes);
> +
>   #endif /* SELFTESTS_TDX_TDX_UTIL_H */
> diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
> index b68ad1dc7e02..8d06e7186df1 100644
> --- a/tools/testing/selftests/kvm/lib/x86/processor.c
> +++ b/tools/testing/selftests/kvm/lib/x86/processor.c
> @@ -802,6 +802,9 @@ void kvm_arch_vm_post_create(struct kvm_vm *vm, unsigned int nr_vcpus)
>   		vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);
>   	}
>   
> +	if (is_tdx_vm(vm))
> +		tdx_init_vm(vm, 0);
> +
>   	r = __vm_ioctl(vm, KVM_GET_TSC_KHZ, NULL);
>   	TEST_ASSERT(r > 0, "KVM_GET_TSC_KHZ did not provide a valid TSC frequency.");
>   	guest_tsc_khz = r;
> diff --git a/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c b/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
> new file mode 100644
> index 000000000000..868ff62e22f2
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
> @@ -0,0 +1,137 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +
> +#include "kvm_util.h"
> +#include "processor.h"
> +#include "tdx/tdx_util.h"
> +
> +static struct kvm_tdx_capabilities *tdx_read_capabilities(struct kvm_vm *vm)
> +{
> +	struct kvm_tdx_capabilities *tdx_cap = NULL;
> +	int nr_cpuid_configs = 4;
> +	int rc = -1;
> +	int i;
> +
> +	do {
> +		nr_cpuid_configs *= 2;
> +
> +		tdx_cap = realloc(tdx_cap, sizeof(*tdx_cap) +
> +					   sizeof(tdx_cap->cpuid) +

No need to add sizeof(tdx_cap->cpuid). It's included by sizeof(*tdx_cap)

> +					   (sizeof(struct kvm_cpuid_entry2) * nr_cpuid_configs));
> +		TEST_ASSERT(tdx_cap,
> +			    "Could not allocate memory for tdx capability nr_cpuid_configs %d\n",
> +			    nr_cpuid_configs);
> +
> +		tdx_cap->cpuid.nent = nr_cpuid_configs;
> +		rc = __tdx_vm_ioctl(vm, KVM_TDX_CAPABILITIES, 0, tdx_cap);
> +	} while (rc < 0 && errno == E2BIG);
> +
> +	TEST_ASSERT(rc == 0, "KVM_TDX_CAPABILITIES failed: %d %d",
> +		    rc, errno);
> +
> +	pr_debug("tdx_cap: supported_attrs: 0x%016llx\n"
> +		 "tdx_cap: supported_xfam 0x%016llx\n",
> +		 tdx_cap->supported_attrs, tdx_cap->supported_xfam);
> +
> +	for (i = 0; i < tdx_cap->cpuid.nent; i++) {
> +		const struct kvm_cpuid_entry2 *config = &tdx_cap->cpuid.entries[i];
> +
> +		pr_debug("cpuid config[%d]: leaf 0x%x sub_leaf 0x%x eax 0x%08x ebx 0x%08x ecx 0x%08x edx 0x%08x\n",
> +			 i, config->function, config->index,
> +			 config->eax, config->ebx, config->ecx, config->edx);
> +	}

The debug info will be printed every time the function is called, which
is unnecessary.

Ideally, the kvm_tdx_capabilities can be cached like what is done for 
kvm_supported_cpuid.

> +	return tdx_cap;
> +}
> +
> +static struct kvm_cpuid_entry2 *tdx_find_cpuid_config(struct kvm_tdx_capabilities *cap,
> +						      u32 leaf, u32 sub_leaf)
> +{
> +	struct kvm_cpuid_entry2 *config;
> +	u32 i;
> +
> +	for (i = 0; i < cap->cpuid.nent; i++) {
> +		config = &cap->cpuid.entries[i];
> +
> +		if (config->function == leaf && config->index == sub_leaf)
> +			return config;
> +	}
> +
> +	return NULL;
> +}

No need to introduce a new function. We can use get_cpuid_entry().


^ permalink raw reply

* Re: [PATCH v13 01/22] KVM: selftests: Add macros to simplify creating VM shapes for non-default types
From: Xiaoyao Li @ 2026-06-17  3:04 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Lisa Wang, Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao,
	Chenyi Qiang, Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
	Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
	Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
	Sagi Shahar, Shuah Khan, Oliver Upton, Jeremiah McReynolds, kvm,
	linux-coco, linux-kernel, x86
In-Reply-To: <ajF-9isiWxPyzxci@google.com>

On 6/17/2026 12:51 AM, Sean Christopherson wrote:
> From: Sean Christopherson<seanjc@google.com>
> Date: Tue, 28 Oct 2025 21:20:27 +0000
> Subject: [PATCH] KVM: selftests: Add macros to simplify creating VM shapes for
>   non-default types
> 
> Add VM_TYPE() and __VM_SHAPE() macros to create a vm_shape structure given
> a type (and mode), and use the macros to define VM_SHAPE_{SEV,SEV_ES,SNP}
> shapes for x86's SEV family of VM shapes.  Providing common infrastructure
> will avoid having to copy+paste vm_sev_create_with_one_vcpu() for TDX.
> 
> Use the new SEV+ shapes and drop vm_sev_create_with_one_vcpu().
> 
> Opportunistically move the existing VM_SHAPE() (now __VM_SHAPE()) macro
> below the definitions of VM_MODE_DEFAULT so that all of the SHAPE/TYPE
> macros are bundled together.
> 
> No functional change intended.
> 
> Reviewed-by: Binbin Wu<binbin.wu@linux.intel.com>
> Reviewed-by: Ira Weiny<ira.weiny@intel.com>
> Signed-off-by: Sean Christopherson<seanjc@google.com>

Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>

some nits below

> ---
>   .../testing/selftests/kvm/include/kvm_util.h  | 28 +++++++------
>   .../selftests/kvm/include/x86/processor.h     |  4 ++
>   tools/testing/selftests/kvm/include/x86/sev.h |  2 -
>   tools/testing/selftests/kvm/lib/x86/sev.c     | 16 --------
>   .../selftests/kvm/x86/sev_smoke_test.c        | 40 +++++++++----------
>   5 files changed, 40 insertions(+), 50 deletions(-)
> 
> diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
> index dc70c6da63fa..46bae183d7fc 100644
> --- a/tools/testing/selftests/kvm/include/kvm_util.h
> +++ b/tools/testing/selftests/kvm/include/kvm_util.h
> @@ -221,18 +221,6 @@ struct vm_shape {
>   
>   kvm_static_assert(sizeof(struct vm_shape) == sizeof(u64));
>   
> -#define VM_TYPE_DEFAULT			0
> -
> -#define VM_SHAPE(__mode)			\
> -({						\
> -	struct vm_shape shape = {		\
> -		.mode = (__mode),		\
> -		.type = VM_TYPE_DEFAULT		\
> -	};					\
> -						\
> -	shape;					\
> -})
> -
>   extern enum vm_guest_mode vm_mode_default;
>   
>   #if defined(__aarch64__)
> @@ -270,8 +258,24 @@ extern enum vm_guest_mode vm_mode_default;
>   
>   #endif
>   
> +#define VM_TYPE_DEFAULT			0
> +
> +#define __VM_SHAPE(__mode, __type)			\

inconsistent indentation with below lines.

> +({						\
> +	struct vm_shape shape = {		\
> +		.mode = (__mode),		\
> +		.type = (__type),		\
> +	};					\
> +						\
> +	shape;					\
> +})
> +
> +

one extra new line.

> +#define VM_SHAPE(__mode)	__VM_SHAPE(__mode, VM_TYPE_DEFAULT)
>   #define VM_SHAPE_DEFAULT	VM_SHAPE(VM_MODE_DEFAULT)
>   
> +#define VM_TYPE(__type)		__VM_SHAPE(VM_MODE_DEFAULT, __type)
> +
>   #define MIN_PAGE_SIZE		(1U << MIN_PAGE_SHIFT)
>   #define PTES_PER_MIN_PAGE	ptes_per_page(MIN_PAGE_SIZE)


^ permalink raw reply

* Re: [PATCH v13 02/22] KVM: selftests: Update kvm_init_vm_address_properties() for TDX
From: Xiaoyao Li @ 2026-06-17  2:37 UTC (permalink / raw)
  To: Lisa Wang, Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao,
	Chenyi Qiang, Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
	Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
	Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
	Sagi Shahar, Sean Christopherson, Shuah Khan, Oliver Upton
  Cc: Jeremiah McReynolds, kvm, linux-coco, linux-kernel, x86,
	Adrian Hunter
In-Reply-To: <20260521-tdx-selftests-v13-v13-2-6983ae4c3a4d@google.com>

On 5/22/2026 7:16 AM, Lisa Wang wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> Initialize the TDX S-bit and the GPA tag mask in
> kvm_init_vm_address_properties() for TDX VMs, similar to how the C-bit
> is initialized for SEV VMs.
> 
> The TDX S-bit is used to distinguish between shared and private guest
> physical addresses. Its position is determined by the guest physical
> address width, which is either 48 or 52 bits for current TDX
> implementations.
> 
> Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
> Co-developed-by: Adrian Hunter <adrian.hunter@intel.com>
> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> Co-developed-by: Sagi Shahar <sagis@google.com>
> Signed-off-by: Sagi Shahar <sagis@google.com>
> Reviewed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Lisa Wang <wyihan@google.com>

Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>

> ---
>   tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h | 14 ++++++++++++++
>   tools/testing/selftests/kvm/lib/x86/processor.c        | 12 ++++++++++--
>   2 files changed, 24 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h b/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h
> new file mode 100644
> index 000000000000..f647e6ca6b34
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h
> @@ -0,0 +1,14 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef SELFTESTS_TDX_TDX_UTIL_H
> +#define SELFTESTS_TDX_TDX_UTIL_H
> +
> +#include <stdbool.h>
> +
> +#include "kvm_util.h"
> +
> +static inline bool is_tdx_vm(struct kvm_vm *vm)
> +{
> +	return vm->type == KVM_X86_TDX_VM;
> +}
> +
> +#endif /* SELFTESTS_TDX_TDX_UTIL_H */
> diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
> index b51467d70f6e..b68ad1dc7e02 100644
> --- a/tools/testing/selftests/kvm/lib/x86/processor.c
> +++ b/tools/testing/selftests/kvm/lib/x86/processor.c
> @@ -11,6 +11,7 @@
>   #include "smm.h"
>   #include "svm_util.h"
>   #include "sev.h"
> +#include "tdx/tdx_util.h"
>   #include "vmx.h"
>   
>   #ifndef NUM_INTERRUPTS
> @@ -1311,12 +1312,19 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
>   
>   void kvm_init_vm_address_properties(struct kvm_vm *vm)
>   {
> +	u32 gpa_bits = kvm_cpu_property(X86_PROPERTY_GUEST_MAX_PHY_ADDR);
> +
> +	vm->arch.sev_fd = -1;
> +
>   	if (is_sev_vm(vm)) {
>   		vm->arch.sev_fd = open_sev_dev_path_or_exit();
>   		vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT));
>   		vm->gpa_tag_mask = vm->arch.c_bit;
> -	} else {
> -		vm->arch.sev_fd = -1;
> +	} else if (is_tdx_vm(vm)) {
> +		TEST_ASSERT(gpa_bits == 48 || gpa_bits == 52,
> +			    "TDX: bad X86_PROPERTY_GUEST_MAX_PHY_ADDR value: %u", gpa_bits);
> +		vm->arch.s_bit = BIT_ULL(gpa_bits - 1);
> +		vm->gpa_tag_mask = vm->arch.s_bit;
>   	}
>   }
>   
> 


^ permalink raw reply

* Re: [PATCH v6 03/20] dma-direct: use DMA_ATTR_CC_SHARED in alloc/free paths
From: Alexey Kardashevskiy @ 2026-06-17  0:50 UTC (permalink / raw)
  To: Aneesh Kumar K.V (Arm), iommu, linux-arm-kernel, linux-kernel,
	linux-coco
  Cc: Robin Murphy, Marek Szyprowski, Will Deacon, Marc Zyngier,
	Steven Price, Suzuki K Poulose, Catalin Marinas, Jiri Pirko,
	Jason Gunthorpe, Mostafa Saleh, Petr Tesarik, Dan Williams,
	Xu Yilun, linuxppc-dev, linux-s390, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
	Alexander Gordeev, Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Sven Schnelle, x86, Jiri Pirko,
	Michael Kelley, Cheloha, Scott
In-Reply-To: <20260604083959.1265923-4-aneesh.kumar@kernel.org>



On 4/6/26 18:39, Aneesh Kumar K.V (Arm) wrote:
> Propagate force_dma_unencrypted() into DMA_ATTR_CC_SHARED in the
> dma-direct allocation path and use the attribute to drive the related
> decisions.
> 
> This updates dma_direct_alloc(), dma_direct_free(), and
> dma_direct_alloc_pages() to fold the forced unencrypted case into attrs.
> 
> Tested-by: Jiri Pirko <jiri@nvidia.com>
> Tested-by: Michael Kelley <mhklinux@outlook.com>
> Tested-by: Mostafa Saleh <smostafa@google.com>
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> ---
>   kernel/dma/direct.c | 53 +++++++++++++++++++++++++++++++++++++--------
>   1 file changed, 44 insertions(+), 9 deletions(-)
> 
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index a741c8a2ee66..90dc5057a0c0 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -193,16 +193,31 @@ void *dma_direct_alloc(struct device *dev, size_t size,
>   		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
>   {
>   	bool remap = false, set_uncached = false;
> -	bool mark_mem_decrypt = true;
> +	bool mark_mem_decrypt = false;
>   	struct page *page;
>   	void *ret;
>   
> +	/*
> +	 * DMA_ATTR_CC_SHARED is not a caller-visible dma_alloc_*()
> +	 * attribute. The direct allocator uses it internally after it has
> +	 * decided that the backing pages must be shared/decrypted, so the
> +	 * rest of the allocation path can consistently select DMA addresses,
> +	 * choose compatible pools and restore encryption on free.

Why this limit?

Context: I am looking for a memory pool for a few shared pages (to do some guest<->host communication), SWIOTLB seems like the right fit but swiotlb_alloc() is not exported and dma_direct_alloc(DMA_ATTR_CC_SHARED) is not allowed.  Thanks,


> +	 */
> +	if (attrs & DMA_ATTR_CC_SHARED)
> +		return NULL;
> +
> +	if (force_dma_unencrypted(dev)) {
> +		attrs |= DMA_ATTR_CC_SHARED;
> +		mark_mem_decrypt = true;
> +	}
> +
>   	size = PAGE_ALIGN(size);
>   	if (attrs & DMA_ATTR_NO_WARN)
>   		gfp |= __GFP_NOWARN;
>   
> -	if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
> -	    !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev))
> +	if (((attrs & (DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_CC_SHARED)) ==
> +	     DMA_ATTR_NO_KERNEL_MAPPING) && !is_swiotlb_for_alloc(dev))
>   		return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp);
>   
>   	if (!dev_is_dma_coherent(dev)) {
> @@ -236,7 +251,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
>   	 * Remapping or decrypting memory may block, allocate the memory from
>   	 * the atomic pools instead if we aren't allowed block.
>   	 */
> -	if ((remap || force_dma_unencrypted(dev)) &&
> +	if ((remap || (attrs & DMA_ATTR_CC_SHARED)) &&
>   	    dma_direct_use_pool(dev, gfp))
>   		return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
>   
> @@ -312,12 +327,24 @@ void dma_direct_free(struct device *dev, size_t size,
>   		void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
>   {
>   	phys_addr_t phys;
> -	bool mark_mem_encrypted = true;
> +	bool mark_mem_encrypted = false;
>   	struct io_tlb_pool *swiotlb_pool;
>   	unsigned int page_order = get_order(size);
>   
> -	if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
> -	    !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) {
> +	/* see dma_direct_alloc() for details */
> +	WARN_ON(attrs & DMA_ATTR_CC_SHARED);
> +
> +	/*
> +	 * if the device had requested for an unencrypted buffer,
> +	 * convert it to encrypted on free
> +	 */
> +	if (force_dma_unencrypted(dev)) {
> +		attrs |= DMA_ATTR_CC_SHARED;
> +		mark_mem_encrypted = true;
> +	}
> +
> +	if (((attrs & (DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_CC_SHARED)) ==
> +	     DMA_ATTR_NO_KERNEL_MAPPING) && !is_swiotlb_for_alloc(dev)) {
>   		/* cpu_addr is a struct page cookie, not a kernel address */
>   		dma_free_contiguous(dev, cpu_addr, size);
>   		return;
> @@ -366,10 +393,14 @@ void dma_direct_free(struct device *dev, size_t size,
>   struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
>   		dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
>   {
> +	unsigned long attrs = 0;
>   	struct page *page;
>   	void *ret;
>   
> -	if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
> +	if (force_dma_unencrypted(dev))
> +		attrs |= DMA_ATTR_CC_SHARED;
> +
> +	if ((attrs & DMA_ATTR_CC_SHARED) && dma_direct_use_pool(dev, gfp))
>   		return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
>   
>   	if (is_swiotlb_for_alloc(dev)) {
> @@ -403,7 +434,11 @@ void dma_direct_free_pages(struct device *dev, size_t size,
>   	phys_addr_t phys;
>   	void *vaddr = page_address(page);
>   	struct io_tlb_pool *swiotlb_pool;
> -	bool mark_mem_encrypted = true;
> +	/*
> +	 * if the device had requested for an unencrypted buffer,
> +	 * convert it to encrypted on free
> +	 */
> +	bool mark_mem_encrypted = force_dma_unencrypted(dev);
>   
>   	/* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
>   	if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&


-- 
Alexey


^ permalink raw reply

* Re: [PATCH v8 4/7] x86/sev: Add support to perform RMP optimizations asynchronously
From: Kalra, Ashish @ 2026-06-16 19:56 UTC (permalink / raw)
  To: K Prateek Nayak, tglx, mingo, bp, dave.hansen, x86, hpa, seanjc,
	peterz, thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, Tycho.Andersen, Nathan.Fontenot,
	ackerleytng, jackyli, pgonda, rientjes, jacobhxu, xin,
	pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen, darwi,
	linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <0fa0bc95-ff31-40c5-b083-3c885d09d0ab@amd.com>

Hello Prateek,

On 6/16/2026 2:27 AM, K Prateek Nayak wrote:
> Hello Ashish,
> 
> On 6/16/2026 1:19 AM, Ashish Kalra wrote:
>> +	/*
>> +	 * RMPOPT scans the RMP table, stores the result of the scan in the
>> +	 * reserved processor memory. The RMP scan is the most expensive
>> +	 * part. If a second RMPOPT occurs, it can skip the expensive scan
>> +	 * if they can see a cached result in the reserved processor memory.
>> +	 *
>> +	 * Do RMPOPT on one CPU alone. Then, follow that up with RMPOPT
>> +	 * on every other primary thread. Followers are "designed to"
>> +	 * skip the scan if they see the "cached" scan results.
>> +	 */
>> +	cpumask_copy(follower_mask, &rmpopt_cpumask);
> 
> rmpopt_cpumask is constructed after hotplug is disabled but ...
> 
>> +
>> +	/*
>> +	 * Pin the worker to the current CPU for the leader loop so that
>> +	 * this_cpu remains valid and the RMPOPT instruction executes on
>> +	 * the correct CPU.
>> +	 *
>> +	 * Use migrate_disable() rather than get_cpu() to prevent
>> +	 * migration while still allowing preemption.
>> +	 */
>> +	migrate_disable();
>> +	this_cpu = smp_processor_id();
>> +
>> +	if (cpumask_test_cpu(this_cpu, follower_mask)) {
>> +		/*
>> +		 * Current CPU is a primary thread in rmpopt_cpumask.
>> +		 * Run leader locally and remove from follower mask.
>> +		 */
>> +		cpumask_clear_cpu(this_cpu, follower_mask);
>> +
>> +		for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
>> +			rmpopt(pa);
>> +			cond_resched();
>> +		}
>> +	} else if (cpumask_intersects(topology_sibling_cpumask(this_cpu),
>> +				      follower_mask)) {
>> +		/*
>> +		 * Current CPU is a sibling thread whose primary is in
>> +		 * rmpopt_cpumask.  RMPOPT_BASE MSR is per-core, so it
>> +		 * is safe to run the leader locally.  Remove the sibling's
>> +		 * primary from the follower mask as this core is already
>> +		 * covered by the leader.
>> +		 */
>> +		cpumask_andnot(follower_mask, follower_mask,
>> +			       topology_sibling_cpumask(this_cpu));
>> +
>> +		for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
>> +			rmpopt(pa);
>> +			cond_resched();
>> +		}
>> +	} else {
>> +		/*
>> +		 * Current CPU does not have RMPOPT_BASE MSR programmed.
>> +		 * Pick an explicit leader from the cpumask to avoid #UD.
>> +		 * Use work_on_cpu() to run in process context on the leader,
>> +		 * avoiding IPI latency.
>> +		 */
> 
> ... this_cpu is neither in the "rmpopt_cpumask", nor is any of its
> siblings on "rmpopt_cpumask".
> 
> How does that happen?

Actually, this was the implementation before the CPU hotplug disable enforcement code was added in v8,
and I should have updated rmpopt_work_handler() accordingly for v8.

With the enforced cpu hotplug disable support, case #3 here (above) is now dead code, and removing it lets
cases #1 and #2 collapse too.

snp_prepare() requires cpu_online_mask == cpu_present_mask before SNP init — so when snp_setup_rmpopt() programs the MSRs, every
core's primary is online -> every core's primary thread is in rmpopt_cpumask.
  
So now the work handler always runs on a CPU whose core is programmed. topology_sibling_cpumask(this_cpu) therefore always intersects
rmpopt_cpumask -> case #1 or #2 always matches.

So I should actually drop case #3 here, which is: "this_cpu is neither in the "rmpopt_cpumask", nor is any of its
siblings on rmpopt_cpumask".


> 
>> +		int leader_cpu = cpumask_first(follower_mask);
>> +
>> +		if (WARN_ON_ONCE(leader_cpu >= nr_cpu_ids)) {
>> +			migrate_enable();
>> +			goto out;
>> +		}
>> +
>> +		cpumask_clear_cpu(leader_cpu, follower_mask);
>> +
>> +		/* Release migration pin before work_on_cpu(). */
>> +		migrate_enable();
>> +
>> +		work_on_cpu(leader_cpu, rmpopt_leader_fn, NULL);
> 
> This creates a delayed work and also waits for it to finish execution
> which will add more latency than a simple IPI if the comment about IPI
> latency above is accurate.
> 
> I think there is some corner case in construction of the
> "rmpopt_cpumask" that requires this not-so-pretty else block. Can you
> elaborate why this is required?
> 
> Perhaps the "rmpopt_cpumask" construction needs:
> 
>     for_each_online_cpu(cpu) {
>         /* Nominate the first CPU on the sibling mask for RMPOPT */
>         if (cpu != cpumask_first(topology_sibling_cpumask(cpu)))
>             continue;
>         cpumask_set_cpu(cpu, &rmpopt_cpumask);
>     }
> 
> 
> and all you need here is:
> 
>     /* Do RMPOPt for local core */
>     for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
>         rmpopt(pa);
> 
>     /* Skip this core from concurrent RMPOPT */
>     cpumask_and_not(follower_mask, &rmpopt_cpumask, topology_sibling_cpumask(cpu));
> 
> No?
> 

Yes, a simpler implementation would look like this:
...

 	if (!alloc_cpumask_var(&follower_mask, GFP_KERNEL))
                return;

 	cpumask_copy(follower_mask, &rmpopt_cpumask);

        /*
         * The current CPU's core always has RMPOPT_BASE programmed
         * (snp_prepare() required all CPUs online at setup and CPU hotplug
         * is disabled while SNP is active), so it can always be the leader.
         * RMPOPT_BASE is per-core; exclude this core from the followers.
         */
        migrate_disable();
        cpumask_andnot(follower_mask, follower_mask,
                       topology_sibling_cpumask(smp_processor_id()));

        for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
                rmpopt(pa);
                cond_resched();
        }
        migrate_enable();

        cpus_read_lock();
        for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
                on_each_cpu_mask(follower_mask, rmpopt_smp, (void *)pa, true);
                cond_resched();
        }
        cpus_read_unlock();

        free_cpumask_var(follower_mask);


 Here, the leader exclusion must use the sibling mask, not clear_cpu(this_cpu). That's why my collapsed version uses:

        cpumask_andnot(follower_mask, follower_mask,
                       topology_sibling_cpumask(smp_processor_id()));

  - If this_cpu is a primary: its sibling mask contains itself (the primary) -> andnot removes this core's primary from the followers.
  
  - If this_cpu is a secondary: it isn't in follower_mask at all, but its sibling mask contains its primary, which is in
  follower_mask -> andnot still removes this core's primary. 

  So either way the current core is dropped from the followers. (The old code needed two branches because case #1 used
  clear_cpu(this_cpu) — only correct when this_cpu is the primary — while case #2 used the sibling andnot. The single andnot works for
  both cases).

Thanks,
Ashish

>> +		goto followers;
>> +	}
>> +
>> +	migrate_enable();
>> +

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox