Linux Confidential Computing Development

Linux Confidential Computing Development
 help / color / mirror / Atom feed

* [PATCH v8 2/7] x86/sev: Initialize RMPOPT configuration MSRs
From: Ashish Kalra @ 2026-06-15 19:48 UTC (permalink / raw)
  To: tglx, mingo, bp, dave.hansen, x86, hpa, seanjc, peterz,
	thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
	Nathan.Fontenot, ackerleytng, jackyli, pgonda, rientjes, jacobhxu,
	xin, pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen,
	darwi, linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <cover.1781419998.git.ashish.kalra@amd.com>

From: Ashish Kalra <ashish.kalra@amd.com>

The new RMPOPT instruction helps manage per-CPU RMP optimization
structures inside the CPU. It takes a 1GB-aligned physical address
and either returns the status of the optimizations or tries to enable
the optimizations.

Per-CPU RMPOPT tables support at most 2 TB of addressable memory for
RMP optimizations.

Initialize the per-CPU RMPOPT table base to the starting physical
address. This enables RMP optimization for up to 2 TB of system RAM on
all CPUs.

Additionally, add support to set up and enable RMPOPT once SNP is
enabled and initialized.

Suggested-by: Thomas Lendacky <thomas.lendacky@amd.com>
Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
---
 arch/x86/coco/core.c             |  2 +
 arch/x86/include/asm/msr-index.h |  3 ++
 arch/x86/include/asm/sev.h       |  4 ++
 arch/x86/virt/svm/sev.c          | 70 ++++++++++++++++++++++++++++++++
 drivers/crypto/ccp/sev-dev.c     |  3 ++
 5 files changed, 82 insertions(+)

diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
index 989ca9f72ba3..8c1393ddc5df 100644
--- a/arch/x86/coco/core.c
+++ b/arch/x86/coco/core.c
@@ -16,6 +16,7 @@
 #include <asm/archrandom.h>
 #include <asm/coco.h>
 #include <asm/processor.h>
+#include <asm/sev.h>
 
 enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE;
 SYM_PIC_ALIAS(cc_vendor);
@@ -172,6 +173,7 @@ static void amd_cc_platform_clear(enum cc_attr attr)
 	switch (attr) {
 	case CC_ATTR_HOST_SEV_SNP:
 		cc_flags.host_sev_snp = 0;
+		snp_clear_rmpopt_configured();
 		break;
 	default:
 		break;
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 86554de9a3f5..28540744f1eb 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -761,6 +761,9 @@
 #define MSR_AMD64_SEG_RMP_ENABLED_BIT	0
 #define MSR_AMD64_SEG_RMP_ENABLED	BIT_ULL(MSR_AMD64_SEG_RMP_ENABLED_BIT)
 #define MSR_AMD64_RMP_SEGMENT_SHIFT(x)	(((x) & GENMASK_ULL(13, 8)) >> 8)
+#define MSR_AMD64_RMPOPT_BASE		0xc0010139
+#define MSR_AMD64_RMPOPT_ENABLE_BIT	0
+#define MSR_AMD64_RMPOPT_ENABLE		BIT_ULL(MSR_AMD64_RMPOPT_ENABLE_BIT)
 
 #define MSR_SVSM_CAA			0xc001f000
 
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 594cfa19cbd4..0d662221615a 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -662,6 +662,8 @@ static inline void snp_leak_pages(u64 pfn, unsigned int pages)
 	__snp_leak_pages(pfn, pages, true);
 }
 int snp_prepare(void);
+void snp_setup_rmpopt(void);
+void snp_clear_rmpopt_configured(void);
 void snp_shutdown(void);
 #else
 static inline bool snp_probe_rmptable_info(void) { return false; }
@@ -680,6 +682,8 @@ static inline void snp_leak_pages(u64 pfn, unsigned int npages) {}
 static inline void kdump_sev_callback(void) { }
 static inline void snp_fixup_e820_tables(void) {}
 static inline int snp_prepare(void) { return -ENODEV; }
+static inline void snp_setup_rmpopt(void) {}
+static inline void snp_clear_rmpopt_configured(void) {}
 static inline void snp_shutdown(void) {}
 #endif
 
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index 8bcdce98f6dc..1b5c18408f0b 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -124,6 +124,10 @@ static void *rmp_bookkeeping __ro_after_init;
 
 static u64 probed_rmp_base, probed_rmp_size;
 
+static cpumask_t rmpopt_cpumask;
+static phys_addr_t rmpopt_pa_start;
+static bool rmpopt_configured;
+
 static LIST_HEAD(snp_leaked_pages_list);
 static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
 
@@ -490,7 +494,12 @@ static bool __init setup_rmptable(void)
 	if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) {
 		if (!setup_segmented_rmptable())
 			return false;
+		rmpopt_configured = true;
 	} else {
+		/*
+		 * RMPOPT requires a segmented RMP table, so leave
+		 * rmpopt_configured clear on contiguous RMP systems.
+		 */
 		if (!setup_contiguous_rmptable())
 			return false;
 	}
@@ -555,6 +564,21 @@ int snp_prepare(void)
 }
 EXPORT_SYMBOL_FOR_MODULES(snp_prepare, "ccp");
 
+static void rmpopt_cleanup(void)
+{
+	int cpu;
+
+	cpus_read_lock();
+
+	for_each_cpu(cpu, &rmpopt_cpumask)
+		WARN_ON_ONCE(wrmsrq_on_cpu(cpu, MSR_AMD64_RMPOPT_BASE, 0));
+
+	cpus_read_unlock();
+
+	cpumask_clear(&rmpopt_cpumask);
+	rmpopt_pa_start = 0;
+}
+
 void snp_shutdown(void)
 {
 	u64 syscfg;
@@ -563,11 +587,57 @@ void snp_shutdown(void)
 	if (syscfg & MSR_AMD64_SYSCFG_SNP_EN)
 		return;
 
+	rmpopt_cleanup();
+
 	clear_rmp();
 	on_each_cpu(mfd_reconfigure, NULL, 1);
 }
 EXPORT_SYMBOL_FOR_MODULES(snp_shutdown, "ccp");
 
+void snp_clear_rmpopt_configured(void)
+{
+	rmpopt_configured = false;
+}
+
+void snp_setup_rmpopt(void)
+{
+	u64 rmpopt_base;
+	int cpu;
+
+	if (!cpu_feature_enabled(X86_FEATURE_RMPOPT) || !rmpopt_configured)
+		return;
+
+	cpus_read_lock();
+
+	/*
+	 * The RMPOPT_BASE MSR is per-core, so only one thread per core needs
+	 * to set up the RMPOPT_BASE MSR.
+	 *
+	 * Note: only online primary threads are included.  If a core's
+	 * primary thread is offline, that core is not covered.  CPU hotplug
+	 * is not currently supported with SNP enabled.
+	 */
+
+	for_each_online_cpu(cpu)
+		if (topology_is_primary_thread(cpu))
+			cpumask_set_cpu(cpu, &rmpopt_cpumask);
+
+	rmpopt_pa_start = ALIGN_DOWN(PFN_PHYS(min_low_pfn), SZ_1G);
+	rmpopt_base = rmpopt_pa_start | MSR_AMD64_RMPOPT_ENABLE;
+
+	/*
+	 * Per-CPU RMPOPT tables support at most 2 TB of addressable memory
+	 * for RMP optimizations. Initialize the per-CPU RMPOPT table base
+	 * to the starting physical address to enable RMP optimizations for
+	 * up to 2 TB of system RAM on all CPUs.
+	 */
+	for_each_cpu(cpu, &rmpopt_cpumask)
+		WARN_ON_ONCE(wrmsrq_on_cpu(cpu, MSR_AMD64_RMPOPT_BASE, rmpopt_base));
+
+	cpus_read_unlock();
+}
+EXPORT_SYMBOL_FOR_MODULES(snp_setup_rmpopt, "ccp");
+
 /*
  * Do the necessary preparations which are verified by the firmware as
  * described in the SNP_INIT_EX firmware command description in the SNP
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index 78f98aee7a66..217b6b19802e 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -1478,6 +1478,9 @@ static int __sev_snp_init_locked(int *error, unsigned int max_snp_asid)
 	}
 
 	snp_hv_fixed_pages_state_update(sev, HV_FIXED);
+
+	snp_setup_rmpopt();
+
 	sev->snp_initialized = true;
 	dev_dbg(sev->dev, "SEV-SNP firmware initialized, SEV-TIO is %s\n",
 		data.tio_en ? "enabled" : "disabled");
-- 
2.43.0


^ permalink raw reply related

* [PATCH v8 1/7] x86/cpufeatures: Add X86_FEATURE_RMPOPT feature flag
From: Ashish Kalra @ 2026-06-15 19:48 UTC (permalink / raw)
  To: tglx, mingo, bp, dave.hansen, x86, hpa, seanjc, peterz,
	thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
	Nathan.Fontenot, ackerleytng, jackyli, pgonda, rientjes, jacobhxu,
	xin, pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen,
	darwi, linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <cover.1781419998.git.ashish.kalra@amd.com>

From: Ashish Kalra <ashish.kalra@amd.com>

Add a flag indicating whether RMPOPT instruction is supported.

RMPOPT is a new instruction that reduces the performance overhead of
RMP checks for the hypervisor and non-SNP guests by allowing those
checks to be skipped when 1-GB memory regions are known to contain no
SEV-SNP guest memory.

For more information on the RMPOPT instruction, see the AMD64 RMPOPT
technical documentation.

Suggested-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
---
 arch/x86/include/asm/cpufeatures.h       | 2 +-
 arch/x86/kernel/cpu/scattered.c          | 1 +
 tools/arch/x86/include/asm/cpufeatures.h | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 1d506e5d6f46..794cc96b8493 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -76,7 +76,7 @@
 #define X86_FEATURE_K8			( 3*32+ 4) /* Opteron, Athlon64 */
 #define X86_FEATURE_ZEN5		( 3*32+ 5) /* CPU based on Zen5 microarchitecture */
 #define X86_FEATURE_ZEN6		( 3*32+ 6) /* CPU based on Zen6 microarchitecture */
-/* Free                                 ( 3*32+ 7) */
+#define X86_FEATURE_RMPOPT		( 3*32+ 7) /* Support for AMD RMPOPT instruction */
 #define X86_FEATURE_CONSTANT_TSC	( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */
 #define X86_FEATURE_UP			( 3*32+ 9) /* "up" SMP kernel running on UP */
 #define X86_FEATURE_ART			( 3*32+10) /* "art" Always running timer (ART) */
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 937129ce6a96..021c0bf22de2 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -67,6 +67,7 @@ static const struct cpuid_bit cpuid_bits[] = {
 	{ X86_FEATURE_PERFMON_V2,		CPUID_EAX,  0, 0x80000022, 0 },
 	{ X86_FEATURE_AMD_LBR_V2,		CPUID_EAX,  1, 0x80000022, 0 },
 	{ X86_FEATURE_AMD_LBR_PMC_FREEZE,	CPUID_EAX,  2, 0x80000022, 0 },
+	{ X86_FEATURE_RMPOPT,			CPUID_EDX,  0, 0x80000025, 0 },
 	{ X86_FEATURE_AMD_HTR_CORES,		CPUID_EAX, 30, 0x80000026, 0 },
 	{ 0, 0, 0, 0, 0 }
 };
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 86d17b195e79..7ce681af1dd7 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -76,7 +76,7 @@
 #define X86_FEATURE_K8			( 3*32+ 4) /* Opteron, Athlon64 */
 #define X86_FEATURE_ZEN5		( 3*32+ 5) /* CPU based on Zen5 microarchitecture */
 #define X86_FEATURE_ZEN6		( 3*32+ 6) /* CPU based on Zen6 microarchitecture */
-/* Free                                 ( 3*32+ 7) */
+#define X86_FEATURE_RMPOPT		( 3*32+ 7) /* Support for AMD RMPOPT instruction */
 #define X86_FEATURE_CONSTANT_TSC	( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */
 #define X86_FEATURE_UP			( 3*32+ 9) /* "up" SMP kernel running on UP */
 #define X86_FEATURE_ART			( 3*32+10) /* "art" Always running timer (ART) */
-- 
2.43.0


^ permalink raw reply related

* [PATCH v8 0/7] Add RMPOPT support.
From: Ashish Kalra @ 2026-06-15 19:47 UTC (permalink / raw)
  To: tglx, mingo, bp, dave.hansen, x86, hpa, seanjc, peterz,
	thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
	Nathan.Fontenot, ackerleytng, jackyli, pgonda, rientjes, jacobhxu,
	xin, pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen,
	darwi, linux-kernel, linux-crypto, kvm, linux-coco

From: Ashish Kalra <ashish.kalra@amd.com>

In the SEV-SNP architecture, hypervisor and non-SNP guests are subject
to RMP checks on writes to provide integrity of SEV-SNP guest memory.

The RMPOPT architecture enables optimizations whereby the RMP checks
can be skipped if 1GB regions of memory are known to not contain any
SNP guest memory.

RMPOPT is a new instruction designed to minimize the performance
overhead of RMP checks for the hypervisor and non-SNP guests.

The RMPOPT instruction currently supports two functions. In the case of
the verify and report status function, the CPU reads the RMP contents and
verifies that the entire 1GB region starting at the provided SPA is
HV-owned, i.e., that no RMP entry in this region is in the assigned
state. It then updates the RMPOPT table accordingly to indicate whether
the optimization has been enabled, and indicates to software whether the
optimization was successful.

In the case of the report status function, the CPU returns the
optimization status for the 1GB region.

The RMPOPT table is managed by a combination of software and hardware.
Software uses the RMPOPT instruction to set bits in the table,
indicating that regions of memory are entirely HV-owned.  Hardware
automatically clears bits in the RMPOPT table when RMP contents are
changed during RMPUPDATE instruction.

For more information on the RMPOPT instruction, see the AMD64 RMPOPT
technical documentation.

As SNP is enabled by default the hypervisor and non-SNP guests are
subject to RMP write checks to provide integrity of SNP guest memory.

This patch-series adds support to enable RMP optimizations for up to
2TB of system RAM across the system and allow RMPUPDATE to disable
those optimizations as SNP guests are launched.

Support for RAM larger than 2 TB will be added in follow-on series.

This series also adds support to disable CPU hotplug while SNP is
active, as the SEV firmware enumerates CPUs at SNP initialization and is
not aware of the OS bringing CPUs online or offline afterwards.  This
also keeps the set of CPUs stable for the asynchronous RMPOPT scan, so
the per-core RMPOPT_BASE MSRs programmed during setup remain valid.

This series also introduces support to re-enable RMP optimizations
during SNP guest termination, after guest pages have been converted
back to shared.

RMP optimizations are performed asynchronously by queuing work on a
dedicated workqueue after a 10 second delay.

Delaying work allows batching of multiple SNP guest terminations.

Once 1GB hugetlb guest_memfd support is merged, support for
re-enabling RMPOPT optimizations during 1GB page cleanup will be added
in follow-on series.

Additionally, add a debugfs interface to report per-CPU RMPOPT status
across all system RAM.

v8:
- Add a new patch to disable CPU hotplug while SNP is active, keeping
  the CPU set stable for the RMPOPT work handler.
- Drop the setup_clear_cpu_cap(X86_FEATURE_RMPOPT) calls; the
  rmpopt_configured bool is the runtime guard.
- WARN_ON_ONCE() on the RMPOPT_BASE MSR writes that previously ignored
  their return value.
- Run the RMPOPT leader scan via work_on_cpu() instead of
  smp_call_function_single() so it executes in process context.  This
  fixes the AB-BA deadlock between migrate_disable() and cpus_read_lock()
  and avoids running the long RMP scan in IPI context with interrupts
  disabled.
- Use mod_delayed_work() in snp_rmpopt_all_physmem() so the batching
  delay tracks the last SNP guest termination.

  Sashiko AI code review identified several of the above issues.

v7:
- Sync tools/arch/x86/include/asm/cpufeatures.h to mirror the kernel
  header for X86_FEATURE_RMPOPT.
- Fix commit title to use X86_FEATURE_RMPOPT to match the code
  (was X86_FEATURE_AMD_RMPOPT).
- Add static bool rmpopt_configured, set only when segmented RMP setup
  succeeds in setup_rmptable().  Check rmpopt_configured alongside
  cpu_feature_enabled(X86_FEATURE_RMPOPT) in snp_setup_rmpopt() and
  snp_rmpopt_all_physmem(), because setup_clear_cpu_cap() is unreliable
  after alternatives are patched.  Add snp_clear_rmpopt_configured()
  called from amd_cc_platform_clear() when CC_ATTR_HOST_SEV_SNP is
  cleared.  Do not use __ro_after_init on rmpopt_configured since the
  writer snp_clear_rmpopt_configured() is not __init.
- Add cond_resched() to all three leader loops in rmpopt_work_handler()
  to prevent soft lockups on systems with up to 2TB of RAM.
- Add comment above __rmpopt() documenting the RMPOPT instruction
  encoding (F2 0F 01 FC) and register interface (RAX = system physical
  address input, RCX = operation type input, RFLAGS.CF = output).
  Note: RMPOPT does not modify RAX unlike PVALIDATE/RMPUPDATE, so
  the existing "a" (input-only) constraint is correct.

  Sashiko AI code review identified several of the above issues.

v6:
- Drop wrmsrq_on_cpus() helper; use for_each_cpu() with wrmsrq_on_cpu()
  instead, as RMPOPT_BASE MSR programming is not performance-critical.
- Rewrite rmpopt_work_handler() leader selection to use a local
  follower_mask copy instead of modifying the global rmpopt_cpumask.
  This eliminates the current_cpu_cleared tracking and the restore at
  the end, and removes the need for synchronization comments about
  transient cpumask inconsistency.
- Add three-way leader selection in rmpopt_work_handler():
  1. Current CPU is a primary thread in cpumask: run leader locally.
  2. Current CPU is a sibling thread whose primary is in cpumask:
     run leader locally (RMPOPT_BASE MSR is per-core), remove the
     primary from followers via cpumask_andnot(topology_sibling_cpumask).
  3. Current CPU's core has no RMPOPT_BASE MSR programmed: pick an
     explicit leader via cpumask_first() + smp_call_function_single()
     to avoid #UD, with cpus_read_lock() around the IPI loop.
- Add WARN_ON_ONCE guard for empty cpumask in the explicit leader
  fallback path, with migrate_enable() before goto out.
- Add .llseek = seq_lseek to rmpopt_table_fops for consistency with
  other seq_file-based debugfs files and to support tools like "less".
- Change debugfs file permissions from 0444 to 0400 to restrict access
  to root only.
- Add comment in rmpopt_table_seq_show() explaining why cpu_online_mask
  is safe: RMPOPT_BASE MSR is per-core and snp_prepare() ensures all
  CPUs are online when the MSR is programmed.

  Sashiko AI code review identified several of the above issues.

v5:
- Introduce rmpopt_cleanup() to tear down workqueue, debugfs, cpumask,
  and MSR state, called from snp_shutdown().
- Introduce rmpopt_wq_mutex to serialize snp_setup_rmpopt(),
  snp_rmpopt_all_physmem(), and rmpopt_cleanup().
- Introduce rmpopt_show_mutex to serialize debugfs reporting of
  rmpopt_report_cpumask.
- Move snp_rmpopt_all_physmem() call after SNP DECOMMISSION during
  guest shutdown.
- Use migrate_disable()/migrate_enable() for CPU pinning in the
  rmpopt_work_handler() leader loop to maintain CPU affinity without
  disabling preemption for the entire RMPOPT scan.
- Add cpus_read_lock()/cpus_read_unlock() around the follower
  on_each_cpu_mask() loop in rmpopt_work_handler().
- Guard snp_setup_rmpopt() against re-initialization when
  SNP_SHUTDOWN_EX with x86_snp_shutdown=0 skips rmpopt_cleanup()
  but clears snp_initialized, preventing workqueue and resource
  leaks on repeated init/shutdown cycles.
- Replace setup_clear_cpu_cap() with pr_err() on alloc_workqueue()
  failure in snp_setup_rmpopt(), as setup_clear_cpu_cap() cannot be
  used after alternatives are patched; callers check rmpopt_wq != NULL
  as the runtime guard instead.
- Add pr_info() when RMPOPT coverage is capped at 2TB.
- Add comments noting CPU hotplug is not supported with SNP enabled
  and only online primary threads are covered by rmpopt_cpumask.
- Add comment in setup_rmptable() noting Segmented RMP must be
  enabled to enable RMPOPT.
- Simplify cpumask setup loop to set if primary thread rather than
  skip if not primary.
- Improve grammar and clarity in snp_setup_rmpopt() comments.
- Added Reviewed-by's.

  Sashiko AI code review identified several of the above issues.

v4:
- Add new wrmsrq_on_cpus() helper to write same u64 value to a
  per-CPU MSR across a cpumask without per-cpu struct allocation
  overhead. 
- Rename configure_and_enable_rmpopt() to snp_setup_rmpopt().
- Use wrmsrq_on_cpus() instead of wrmsrq_on_cpu() loop for
  programming RMPOPT_BASE MSRs.
- Add setup_clear_cpu_cap(X86_FEATURE_RMPOPT) if segmented RMP
  setup fails or workqueue allocation fails.
- Add X86_FEATURE_RMPOPT feature clear logic in amd_cc_platform_clear()
  for CC_ATTR_HOST_SEV_SNP.
- All of the above allow checking for only X86_FEATURE_RMPOPT for both
  RMPOPT setup/enable and RMP re-optimizations.
- Rename snp_perform_rmp_optimization() to snp_rmpopt_all_physmem().
- Split rmpopt() into rmpopt() and rmpopt_smp() for SMP callback use.
- Introduce separate rmpopt_report_cpumask for debugfs reporting,
  distinct from rmpopt_cpumask used for primary thread tracking.
- Remove snp_perform_rmp_optimization() call from __sev_snp_init_locked() 
  and instead setup and enable RMPOPT after SNP is enabled and 
  initialized.

v3:
- Drop all RMPOPT kthread support and introduce adding custom and
  dedicated workqueue to schedule delayed and asynchronous RMPOPT work.
- Drop the guest_memfd inode cleanup interface and add support to
  re-enable RMP optimizations during guest shutdown using the
  asynchronous and delayed workqueue interface.
- Introduce new __rmpopt() helper and rmpopt() and
  rmpopt_report_status() wrappers on top which use rax and rcx
  parameters to closely match RMPOPT specs.
- Use new optimized RMPOPT loop to issue RMPOPT instructions on all
  system RAM up to 2TB and all CPUs, by optimizing each range on one CPU
  first, then let other CPUs execute RMPOPT in parallel so they can skip
  most work as the range has already been optimized.
- Also add support for running the optimized RMPOPT loop only on
  one thread per core.
- Replace all PUD_SIZE references with SZ_1G to conform to 1GB regions
  as specified by RMPOPT specifications and not be dependent on PUD_SIZE
  which makes the RMPOPT patch-set independent of x86 page table sizes.
- Use wrmsrq_on_cpu() to program the RMPOPT_BASE MSR registers on
  all CPUs that removes all ugly casting to use on_each_cpu_mask().
- Fix inline commits and patch commit messages


v2:
- Drop all NUMA and Socket configuration and enablement support and
  enable RMPOPT support for up to 2TB of system RAM.
- Drop get_cpumask_of_primary_threads() and enable per-core RMPOPT
  base MSRs and issue RMPOPT instruction on all CPUs.
- Drop the configfs interface to manually re-enable RMP optimizations.
- Add new guest_memfd cleanup interface to automatically re-enable
  RMP optimizations during guest shutdown.
- Include references to the public RMPOPT documentation.
- Move debugfs directory for RMPOPT under architecture specific
  parent directory.

Ashish Kalra (7):
  x86/cpufeatures: Add X86_FEATURE_RMPOPT feature flag
  x86/sev: Initialize RMPOPT configuration MSRs
  crypto/ccp: Disable CPU hotplug while SNP is active
  x86/sev: Add support to perform RMP optimizations asynchronously
  x86/sev: Add interface to re-enable RMP optimizations.
  KVM: SEV: Perform RMP optimizations on SNP guest shutdown
  x86/sev: Add debugfs support for RMPOPT

 arch/x86/coco/core.c                     |   2 +
 arch/x86/include/asm/cpufeatures.h       |   2 +-
 arch/x86/include/asm/msr-index.h         |   3 +
 arch/x86/include/asm/sev.h               |   6 +
 arch/x86/kernel/cpu/scattered.c          |   1 +
 arch/x86/kvm/svm/sev.c                   |   2 +
 arch/x86/virt/svm/sev.c                  | 437 +++++++++++++++++++++++
 drivers/crypto/ccp/sev-dev.c             |  32 +-
 tools/arch/x86/include/asm/cpufeatures.h |   2 +-
 9 files changed, 484 insertions(+), 3 deletions(-)

-- 
2.43.0


^ permalink raw reply

* Re: [PATCH RFC 2/3] KVM: guest_memfd: support folio migration for non-confidential VMs
From: David Hildenbrand (Arm) @ 2026-06-15 18:35 UTC (permalink / raw)
  To: Shivank Garg, Matthew Wilcox (Oracle), Jan Kara, Andrew Morton,
	Vlastimil Babka, Suren Baghdasaryan, Michal Hocko,
	Brendan Jackman, Johannes Weiner, Zi Yan, Matthew Brost,
	Joshua Hahn, Rakie Kim, Byungchul Park, Gregory Price, Ying Huang,
	Alistair Popple, Paolo Bonzini, Shuah Khan, Chao Peng,
	Nikunj A Dadhania, Ira Weiny, Michael Roth, Pankaj Gupta,
	Ackerley Tng, Fuad Tabba, Sean Christopherson, Vishal Annapurve,
	Nikita Kalyazin, Patrick Roy, Pratik Sampat, Ashish Kalra
  Cc: linux-fsdevel, linux-coco, linux-mm, linux-kernel, kvm,
	linux-kselftest
In-Reply-To: <20260611-shivank-gmem-migrate-v1-2-2d266bfc6f95@amd.com>

On 6/11/26 15:05, Shivank Garg wrote:
> guest_memfd folios are currently marked unmovable, so the kernel
> cannot perform NUMA-balancing, memory compaction, etc.
> This is unavoidable for confidential VMs (SEV-SNP, TDX),
> since memory is encrypted and copying it needs firmware assistance.
> However, for non-confidential VMs (like firecracker), we can migrate
> the folios.
> 
> Mark non-confidential VMs as movable and implement
> kvm_gmem_migrate_folio() using filemap_migrate_folio().
> 
> This lays the groundwork for migrating confidential guest_memfd
> later. Once the firmware-assisted copying support is available,
> those VMs can be made movable. The confidential folio content can
> be copied separately, and the destination folio can be marked with
> FOLIO_CONTENT_COPIED so __migrate_folio() skips the host-side
> folio_mc_copy().
> 
> Signed-off-by: Shivank Garg <shivankg@amd.com>
> ---
>  virt/kvm/guest_memfd.c | 50 +++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 45 insertions(+), 5 deletions(-)
> 
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 806a42f0e031a1c7729f53c786316d2502532553..e4470106fc7792f328bce5275419683328c8b4ab 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -487,13 +487,45 @@ static struct file_operations kvm_gmem_fops = {
>  	.fallocate	= kvm_gmem_fallocate,
>  };
>  
> +#ifdef CONFIG_MIGRATION
>  static int kvm_gmem_migrate_folio(struct address_space *mapping,
>  				  struct folio *dst, struct folio *src,
>  				  enum migrate_mode mode)
>  {
> -	WARN_ON_ONCE(1);
> -	return -EINVAL;
> +	struct inode *inode = mapping->host;
> +	pgoff_t start, end;
> +	int ret;
> +
> +	if (!filemap_invalidate_trylock_shared(mapping))
> +		return -EAGAIN;
> +
> +	start = src->index;
> +	end = start + folio_nr_pages(src);
> +
> +	kvm_gmem_invalidate_begin(inode, start, end);
> +
> +	/*
> +	 * For non-confidential guest_memfd the folio is host-readable,
> +	 * so filemap_migrate_folio() can copy the contents itself via
> +	 * folio_mc_copy().
> +	 *
> +	 * This is also the hook point for confidential VMs (SEV-SNP, TDX) once
> +	 * they are made movable: the host cannot copy encrypted/private memory,
> +	 * so a firmware-assisted copy would run here.
> +	 * Idea: https://lore.kernel.org/r/20260428155043.39251-8-shivankg@amd.com
> +	 * Mark the @dst->migrate_info field with FOLIO_CONTENT_COPIED, so
> +	 * __migrate_folio() skip folio_mc_copy() for confidential VMs.
> +	 */
> +	ret = filemap_migrate_folio(mapping, dst, src, mode);
> +
> +	kvm_gmem_invalidate_end(inode, start, end);
> +
> +	filemap_invalidate_unlock_shared(mapping);
> +	return ret;
>  }
> +#else
> +#define kvm_gmem_migrate_folio NULL
> +#endif
>  
>  static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
>  {
> @@ -592,9 +624,17 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>  	inode->i_size = size;
>  	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
>  	mapping_set_inaccessible(inode->i_mapping);
> -	mapping_set_unmovable(inode->i_mapping);
> -	/* Unmovable mappings are supposed to be marked unevictable as well. */
> -	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
> +
> +	/*
> +	 * Confidential VMs (SEV-SNP, TDX) bind encryption to the physical
> +	 * address and require firmware assisted copy, so their folios cannot
> +	 * be migrated yet.
> +	 */
> +	if (kvm_arch_has_private_mem(kvm)) {
> +		mapping_set_unmovable(inode->i_mapping);
> +		/* Unmovable mappings are supposed to be marked unevictable as well. */
> +		WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

We would still want our movable mappings to be flagged unevictable.

> +	}
>

As discussed, for guest_memfd instances that support page migration, we would
also want to allocate the pages for guest_memfd as GFP_HIGHUSER_MOVABLE.

That is, handle the mapping_set_gfp_mask() call as well.

It will unlock access to areas reserved for movable allocations (CMA/
ZONE_MOVABLE) and properly let the page allocator group pages by mobility
(MOVABLE vs. UNMOVABLE vs. RECLAIMABLE).

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH RFC 0/3] KVM: guest_memfd: folio migration for non-confidential VMs
From: David Hildenbrand (Arm) @ 2026-06-15 18:30 UTC (permalink / raw)
  To: Alexandru Elisei, Shivank Garg
  Cc: Matthew Wilcox (Oracle), Jan Kara, Andrew Morton, Vlastimil Babka,
	Suren Baghdasaryan, Michal Hocko, Brendan Jackman,
	Johannes Weiner, Zi Yan, Matthew Brost, Joshua Hahn, Rakie Kim,
	Byungchul Park, Gregory Price, Ying Huang, Alistair Popple,
	Paolo Bonzini, Shuah Khan, Chao Peng, Nikunj A Dadhania,
	Ira Weiny, Michael Roth, Pankaj Gupta, Ackerley Tng, Fuad Tabba,
	Sean Christopherson, Vishal Annapurve, Nikita Kalyazin,
	Patrick Roy, Pratik Sampat, Ashish Kalra, linux-fsdevel,
	linux-coco, linux-mm, linux-kernel, kvm, linux-kselftest
In-Reply-To: <ai_XK__RTXMCEcCG@raptor>

On 6/15/26 12:43, Alexandru Elisei wrote:
> Hi,
> 
> On Thu, Jun 11, 2026 at 01:05:07PM +0000, Shivank Garg wrote:
>> guest_memfd folios are currently marked unmovable, so the kernel cannot
>> perform NUMA-balancing, memory compaction, etc. This is unavoidable for
>> confidential VMs (SEV-SNP, TDX), since memory is encrypted and copying it
>> needs firmware assistance. However, for non-confidential VMs (like
>> Firecracker), we can migrate the folios.
>>
>> This series enables folio migration for non-confidential guest_memfd and
>> also lays the groundwork for migrating confidential guest_memfd later.
>> Once firmware-assisted copying support is available, those VMs can be
>> made movable, the confidential folio content can be copied separately,
>> and the destination folio marked with FOLIO_CONTENT_COPIED so
>> __migrate_folio() skips the host-side folio_mc_copy().
> 
> I always thought that one of the nice things about using guest_memfd as a
> memory backend, as opposed to host userspace mappings, is that the host
> cannot unmap VM memory because of KSM, automatic NUMA balancing, hugepage
> collapse, compaction, etc, acting on the host userspace mapping of the
> VM memory, and outside of the VMM's or KVM's control.

Yeah, but it doesn't play nice with THPs / large folios. So if you want to run
something else on a hypervisor than just confidential VMs, you definitely want
guest_memfd to be as nice to the system.

That is, support page migration if nothing speaks against it.

Now, if something speaks against it, for sure we can just leave the pages
unmovable.

Fortunately, the patch is rather trivial.

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH RFC 0/3] KVM: guest_memfd: folio migration for non-confidential VMs
From: David Hildenbrand (Arm) @ 2026-06-15 18:24 UTC (permalink / raw)
  To: Sean Christopherson, Alexandru Elisei
  Cc: Shivank Garg, Matthew Wilcox (Oracle), Jan Kara, Andrew Morton,
	Vlastimil Babka, Suren Baghdasaryan, Michal Hocko,
	Brendan Jackman, Johannes Weiner, Zi Yan, Matthew Brost,
	Joshua Hahn, Rakie Kim, Byungchul Park, Gregory Price, Ying Huang,
	Alistair Popple, Paolo Bonzini, Shuah Khan, Chao Peng,
	Nikunj A Dadhania, Ira Weiny, Michael Roth, Pankaj Gupta,
	Ackerley Tng, Fuad Tabba, Vishal Annapurve, Nikita Kalyazin,
	Patrick Roy, Pratik Sampat, Ashish Kalra, linux-fsdevel,
	linux-coco, linux-mm, linux-kernel, kvm, linux-kselftest
In-Reply-To: <ajA4z_Wkb93cTW4m@google.com>

On 6/15/26 19:39, Sean Christopherson wrote:
> On Mon, Jun 15, 2026, Alexandru Elisei wrote:
>> Hi,
>>
>> On Mon, Jun 15, 2026 at 11:43:14AM +0100, Alexandru Elisei wrote:
>>> Hi,
>>>
>>>
>>> I always thought that one of the nice things about using guest_memfd as a
>>> memory backend, as opposed to host userspace mappings, is that the host
>>> cannot unmap VM memory because of KSM, automatic NUMA balancing, hugepage
>>> collapse, compaction, etc, acting on the host userspace mapping of the
>>> VM memory, and outside of the VMM's or KVM's control.
> 
> +1000.  It's not just "nice to have", it's a core design principle of guest_memfd.

Right, and I raised in the guest_memfd call also the rough idea of Alexandru's
use case of having non-movable guest_memfd pages such that we can support use
cases where we can hopefully guarantee that a stage-2 mapping will not just
randomly go away.

> 
>>> I think it would be useful to preserve this behaviour, even in the absence
>>> of confidential VMs (i.e, guest_memfd file descriptor created with
>>> GUEST_MEMFD_FLAG_MMAP).
>>
>> Just to be clear, I was thinking that it might be useful for both
>> behaviours to exist (migratable and non-migratable) for non-confidential
>> VMs, and allow KVM or userspace to decide which they prefer for a
>> guest_memfd.
> 
> For the purposes of this discussion, we should separate the physical act of
> migrating pages from the features that trigger migration.  As I said in last week's
> guest-memfd call, I am a-ok with supporting page migration as a mechanism, but I
> am dead set against supporting NUMA balancing, KSM, LRU-based swap/reclaim, and
> anything else that goes against the goal of guest-first memory.

Right. Page migration for supporting ZONE_MOVABLE/CMA, compaction, memory
offlining, virtio-mem and possibly some collapse mechanism if we were to support
THP of some sorts in guest_memfd would all be reasonable.

As soon as we mix in access/lru semantics, we're going into the wrong direction.

Fortunately KSM is anon-only and not even worth a rant here :)



-- 
Cheers,

David

^ permalink raw reply

* Re: [RFC PATCH 13/15] KVM: TDX: Support event-notify interrupts only with userspace quoting
From: Peter Fang @ 2026-06-15 18:14 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: Xu Yilun, kas, djbw, rick.p.edgecombe, x86, linux-coco,
	linux-kernel, kvm, sohil.mehta, yilun.xu, baolu.lu,
	zhenzhong.duan, xiaoyao.li
In-Reply-To: <2ae7d9a9-11da-40dd-a11d-b9e1bf111e1b@intel.com>

On Mon, Jun 15, 2026 at 07:39:01AM +0300, Adrian Hunter wrote:
> >>> @@ -7335,6 +7335,9 @@ inputs and outputs of the TDVMCALL.  Currently the following values of
> >>>     queued successfully, the TDX guest can poll the status field in the
> >>>     shared-memory area to check whether the Quote generation is completed or
> >>>     not. When completed, the generated Quote is returned via the same buffer.
> >>> +   If the host kernel generates Quotes through the TDX Quoting service provided
> >>> +   by the TDX module, KVM processes the GetQuote request and it will not appear
> >>> +   in userspace.
> >>
> >> There is an Attestation section in Documentation/virt/kvm/x86/intel-tdx.rst
> >> that could be updated too.
> > 
> > Can you please point me to it? I couldn't find that section in that
> > file.
> 
> Sorry, got the file name wrong: Documentation/arch/x86/tdx.rst

Thanks a lot for the pointers! It definitely needs to be updated.

> 

^ permalink raw reply

* Re: [PATCH RFC 0/3] KVM: guest_memfd: folio migration for non-confidential VMs
From: Sean Christopherson @ 2026-06-15 17:39 UTC (permalink / raw)
  To: Alexandru Elisei
  Cc: Shivank Garg, Matthew Wilcox (Oracle), Jan Kara, Andrew Morton,
	Vlastimil Babka, Suren Baghdasaryan, Michal Hocko,
	Brendan Jackman, Johannes Weiner, Zi Yan, David Hildenbrand,
	Matthew Brost, Joshua Hahn, Rakie Kim, Byungchul Park,
	Gregory Price, Ying Huang, Alistair Popple, Paolo Bonzini,
	Shuah Khan, Chao Peng, Nikunj A Dadhania, Ira Weiny, Michael Roth,
	Pankaj Gupta, Ackerley Tng, Fuad Tabba, Vishal Annapurve,
	Nikita Kalyazin, Patrick Roy, Pratik Sampat, Ashish Kalra,
	linux-fsdevel, linux-coco, linux-mm, linux-kernel, kvm,
	linux-kselftest
In-Reply-To: <ai_aczmeH2IA6JaB@raptor>

On Mon, Jun 15, 2026, Alexandru Elisei wrote:
> Hi,
> 
> On Mon, Jun 15, 2026 at 11:43:14AM +0100, Alexandru Elisei wrote:
> > Hi,
> > 
> > On Thu, Jun 11, 2026 at 01:05:07PM +0000, Shivank Garg wrote:
> > > guest_memfd folios are currently marked unmovable, so the kernel cannot
> > > perform NUMA-balancing, memory compaction, etc. This is unavoidable for
> > > confidential VMs (SEV-SNP, TDX), since memory is encrypted and copying it
> > > needs firmware assistance. However, for non-confidential VMs (like
> > > Firecracker), we can migrate the folios.
> > > 
> > > This series enables folio migration for non-confidential guest_memfd and
> > > also lays the groundwork for migrating confidential guest_memfd later.
> > > Once firmware-assisted copying support is available, those VMs can be
> > > made movable, the confidential folio content can be copied separately,
> > > and the destination folio marked with FOLIO_CONTENT_COPIED so
> > > __migrate_folio() skips the host-side folio_mc_copy().
> > 
> > I always thought that one of the nice things about using guest_memfd as a
> > memory backend, as opposed to host userspace mappings, is that the host
> > cannot unmap VM memory because of KSM, automatic NUMA balancing, hugepage
> > collapse, compaction, etc, acting on the host userspace mapping of the
> > VM memory, and outside of the VMM's or KVM's control.

+1000.  It's not just "nice to have", it's a core design principle of guest_memfd.

> > I think it would be useful to preserve this behaviour, even in the absence
> > of confidential VMs (i.e, guest_memfd file descriptor created with
> > GUEST_MEMFD_FLAG_MMAP).
> 
> Just to be clear, I was thinking that it might be useful for both
> behaviours to exist (migratable and non-migratable) for non-confidential
> VMs, and allow KVM or userspace to decide which they prefer for a
> guest_memfd.

For the purposes of this discussion, we should separate the physical act of
migrating pages from the features that trigger migration.  As I said in last week's
guest-memfd call, I am a-ok with supporting page migration as a mechanism, but I
am dead set against supporting NUMA balancing, KSM, LRU-based swap/reclaim, and
anything else that goes against the goal of guest-first memory.

If userspace wants mm/ functionality, then use anon, memfd, hugetlb, shmem, etc.

Shivank, what's the immediate motivation for this series?

^ permalink raw reply

* Re: [PATCH 04/15] x86/virt/tdx: Enable the Extensions right after basic TDX Module init
From: Xu Yilun @ 2026-06-15 15:58 UTC (permalink / raw)
  To: Dan Williams (nvidia)
  Cc: kas, rick.p.edgecombe, x86, peter.fang, linux-coco, linux-kernel,
	kvm, sohil.mehta, yilun.xu, baolu.lu, zhenzhong.duan, xiaoyao.li
In-Reply-To: <6a2c9f90d348_9b8551003e@djbw-dev.notmuch>

On Fri, Jun 12, 2026 at 05:08:48PM -0700, Dan Williams (nvidia) wrote:
> Xu Yilun wrote:
> > The detailed initialization flow for TDX Module Extensions has been
> > fully implemented. Enable the flow after basic TDX Module
> > initialization.
> > 
> > Theoretically, the Extensions doesn't need to be enabled right after
> > basic TDX initialization. It could be enabled right before the first
> > Extension SEAMCALL is issued. That would save or postpone memory usage.
> > But it isn't worth the complexity, the needs for the Extensions are vast
> > but the savings are little for a typical TDX capable system (about
> > 0.001% of memory). So the Linux decision is to just enable it along with
> > the basic TDX.
> 
> No real point in rehashing the rationale for the "any available, all the
> time" policy yet again especially when this directly conflicts with the
> "relatively large amount" comment in the original cover letter.

Agree. Will remove the section which is copied from cover letter.

> 
> Otherwise I agree with the proposed reordering of this initial series.
> 
> In general though, no big showstoppers for me in this first 4.

Thanks for the review!

^ permalink raw reply

* Re: [PATCH 02/15] x86/virt/tdx: Add extra memory to TDX Module for Extensions
From: Xu Yilun @ 2026-06-15 15:55 UTC (permalink / raw)
  To: Dan Williams (nvidia)
  Cc: kas, rick.p.edgecombe, x86, peter.fang, linux-coco, linux-kernel,
	kvm, sohil.mehta, yilun.xu, baolu.lu, zhenzhong.duan, xiaoyao.li
In-Reply-To: <6a2c9b10574ce_9b8551005d@djbw-dev.notmuch>

On Fri, Jun 12, 2026 at 04:49:36PM -0700, Dan Williams (nvidia) wrote:
> Xu Yilun wrote:
> > TDX Module introduces a new concept called "TDX Module Extensions" to
> > support long running / hard-irq preemptible flows inside. This makes TDX
> > Module capable of handling complex tasks through "Extension SEAMCALLs".
> > Adding more memory to TDX Module is the first step to enable Extensions.
> 
> Like I said on the cover, I think "long running hard-irq preemptible"
> invites more questions than it answers. The service calls are not "long
> running" on their own. I think it is sufficient to say they are
> resumable unlike typical calls that run to completion while monopolizing
> the CPU.

Yes, I'll drop long running, keep preemptible and resumable.

> 
> > Currently, TDX Module memory use is relatively static. But, the
> > Extensions need to use memory more dynamically. While 'static' here
> > means the kernel provides necessary amount of memory to TDX Module for
> > its basic functionalities, 'dynamic' means extra memory is needed only
> > if new add-on features are to be enabled. So add a new memory feeding
> > process backed by a new SEAMCALL TDH.EXT.MEM.ADD.
> 
> Rick commented on this as well, but a simpler way to say it is
> extensions receive a one time memory pool allocation at init time.  The
> extension uses that pool as its baseline for its own internal state and
> data for the service APIs it offers.

Good to me.

> > For now, TDX Module Extensions consumes relatively large amount of
> > memory (~50MB). Use contiguous page allocation to avoid permanently
> > fragment too much memory. Print the allocation amount on TDX Module
> > Extensions initialization for visibility.
> 
> To be clear I believe there is a low chance of fragmentation given this
> allocation happening early. However, at 10s of MB, given the benefit of
> isolating blocks of PFNs that will never be returned, it makes sense to not
> use the buddy allocator for that.

Agree. I'll change it as:

For now, TDX module extensions consume tens of megabytes of memory that
will never be returned to the host. Use contiguous page allocation to
isolate these large blocks entirely, avoiding permanent memory
fragmentation and reduced buddy allocator efficiency. Print ...


> > +	u64 *root;

...

> > +	root = kzalloc(PAGE_SIZE, GFP_KERNEL);
> > +	if (!root)
> > +		return -ENOMEM;
> 
> I think this "root" term is a holdover from the complicated TDX Connect

Agree. I really don't have to introduce a new "root" page term. The SPEC
says "The HPA_LIST is a 4KB page which contains a list of HPAs", so
hpa_list page is a good name.

> case where it might sometimes be this odd "singleton" object? You could
> just make it this for actual type safety.
> 
> struct tdx_hpa_list {
> 	u64 phys[PAGE_SIZE/sizeof(u64)];
> }
> 
> > +
> > +	page = alloc_contig_pages(nr_pages, GFP_KERNEL, numa_mem_id(),
> > +				  &node_online_map);
> > +	if (!page) {
> > +		ret = -ENOMEM;
> > +		goto out_free_root;
> > +	}
> > +
> > +	for (i = 0; i < nr_pages;) {
> > +		unsigned int nents = min(nr_pages - i,
> > +					 PAGE_SIZE / sizeof(*root));
> 
> This looks wrong, sizeof(struct page)?, or size of physical address?
> 
> Becomes less error prone if you do:
> 
> min(nr_pages - i, ARRAY_SIZE(hpa_list->phys))

OK, let me try.

> > +		ret = tdx_ext_mem_add(virt_to_page(root), nents);
> > +		/*
> > +		 * No SEAMCALLs to reclaim the added pages. For simple error
> > +		 * handling, leak all pages.
> > +		 */
> > +		WARN_ON_ONCE(ret);
> 
> Perhaps to be friendlier to folks without the source code in front of
> them drop the comment and do:
> 
> WARN(ret, "Fatal: TDX Module failed (%d) to accept memory, stranded %ld pages\n", ret, nr_pages)
> 
> ...the once flavor not needed, right? It's toast at this point.

Yes, no need for the 'once'.

Since I'll print all memory for the extensions anyway below, I'll use:

	WARN(ret, "Fatal: TDX Module rejected (%d) memory for extensions, stranded all pages\n",
	     ret);

Thanks,
Yilun

^ permalink raw reply

* Re: [PATCH 01/15] x86/virt/tdx: Read global metadata for TDX Module Extensions
From: Dave Hansen @ 2026-06-15 16:05 UTC (permalink / raw)
  To: Dan Williams (nvidia), Xu Yilun, kas, rick.p.edgecombe, x86,
	peter.fang
  Cc: linux-coco, linux-kernel, kvm, sohil.mehta, yilun.xu, baolu.lu,
	zhenzhong.duan, xiaoyao.li
In-Reply-To: <6a2c863a681d6_9b85510064@djbw-dev.notmuch>

On 6/12/26 15:20, Dan Williams (nvidia) wrote:
>> Check TDX_FEATURES0 before reading these metadata. If a feature is
>> advertised, a failure in reading associated metadata causes the entire
>> TDX initialization to fail, otherwise skip.
> Others already commented on the patch ordering, so I will just comment
> on the changelog to recommend referring back to the "any available
> extension, all the time" implementation policy rather than saying "Linux
> requires" which is ambiguous.

One other note on this: the current Linux policy of "any available
extension, all the time" is the simplest possible functional policy. If
Linux has one policy, I think that's the one it should have.

That said, I'm open to the idea that users might desire other policies.
We should absolutely explore them another day in another series.

^ permalink raw reply

* Re: [PATCH 00/15] Enable TDX Module Extensions and DICE-based TDX Quoting
From: Dave Hansen @ 2026-06-15 15:57 UTC (permalink / raw)
  To: Xu Yilun, Dan Williams (nvidia)
  Cc: kas, rick.p.edgecombe, x86, peter.fang, linux-coco, linux-kernel,
	kvm, sohil.mehta, yilun.xu, baolu.lu, zhenzhong.duan, xiaoyao.li
In-Reply-To: <ajAYxo83xVV1Sb+Y@yilunxu-OptiPlex-7050>

On 6/15/26 08:22, Xu Yilun wrote:
>> The TDX "Extension SEAMCALL" capability is akin to ARM CCA's "Stateful
>> RMI Operations (SRO)", and achieves similar externalized complexity
>> relief as a dedicated hardware coprocessor like AMD SEV-SNP. The
> I may not include the ARM/AMD examples, not sure I can explain them
> well.

I actually think they're pretty important proof points. One of the big
challenges as a maintainer evaluating these things is judging the
solution itself.

Is this architecture a good one? Is it overly complex? Are there avenues
for simplification?

If five vendors pop up all with similar problems and solutions, then
it's a pretty good bet that they're all on the right track. But, if
there are four going one direction and one going off by itself, it's a
sign that the errant one might need a course correction.

It would honestly be worth your time to go *talk* to the AMD and ARM
folks and ensure that you are all on the same page. Last I checked, they
seemed to be at least halfway reasonable human beings and don't bite.
Let me know if I can help with some introductions.

^ permalink raw reply

* Re: [PATCH 01/15] x86/virt/tdx: Read global metadata for TDX Module Extensions
From: Xu Yilun @ 2026-06-15 15:24 UTC (permalink / raw)
  To: Dan Williams (nvidia)
  Cc: kas, rick.p.edgecombe, x86, peter.fang, linux-coco, linux-kernel,
	kvm, sohil.mehta, yilun.xu, baolu.lu, zhenzhong.duan, xiaoyao.li
In-Reply-To: <6a2c863a681d6_9b85510064@djbw-dev.notmuch>

> > Check TDX_FEATURES0 before reading these metadata. If a feature is
> > advertised, a failure in reading associated metadata causes the entire
> > TDX initialization to fail, otherwise skip.
> 
> Others already commented on the patch ordering, so I will just comment
> on the changelog to recommend referring back to the "any available
> extension, all the time" implementation policy rather than saying "Linux
> requires" which is ambiguous.

Agree.

^ permalink raw reply

* Re: [PATCH 00/15] Enable TDX Module Extensions and DICE-based TDX Quoting
From: Xu Yilun @ 2026-06-15 15:22 UTC (permalink / raw)
  To: Dan Williams (nvidia)
  Cc: kas, rick.p.edgecombe, x86, peter.fang, linux-coco, linux-kernel,
	kvm, sohil.mehta, yilun.xu, baolu.lu, zhenzhong.duan, xiaoyao.li
In-Reply-To: <6a2c821a99e3_9b8551002a@djbw-dev.notmuch>

> The internal implementation details of extension seamcalls bury the
> lead on why this mechanism is important, why Linux should care, and why
> this brings TDX in line with the other major CC architectures. Something
> like:
> 
> ===
> To date, SEAMCALLs have been short lived routines that monopolize the
> CPU for their duration. This limits their utility for implementing
> higher order security protocols or pushes complexity into Linux. The
> Linux appetite for ingesting complexity is low, so TDX now adds a new
> class of SEAMCALLs that are preemptible and resumable. This capability
> enables higher order service APIs to carry out a security protocol like
> "establish an SPDM session".
> 
> The TDX "Extension SEAMCALL" capability is akin to ARM CCA's "Stateful
> RMI Operations (SRO)", and achieves similar externalized complexity
> relief as a dedicated hardware coprocessor like AMD SEV-SNP. The

I may not include the ARM/AMD examples, not sure I can explain them
well.

> mechanism is "give the service environment some memory", "invoke the
> service API", and "continue invoking until complete". All protocol state
> is internal to the service API.
> 
> The simplest class of extension SEAMCALLs to support are in support of
> "DICE-based TDX Quoting", a service to turn guest launch attestation
> reports into a document that can be externally verified.
> ===

[...]

> > The Extensions consumes relatively large amount of memory (~50MB). So it
> > is designed to be off by default.
> 
> This confuses the TDX design with the Linux design, and sets up "50MB" as
> something to be quibbled with. The Linux design is turn on all the
> features that Linux knows about all the time. Unless and until the "any
> available, all the time" becomes untenable it just simplifies the init
> flow to not play piecemeal games. Await evidence to change the simple
> policy. Suffice to say the cost of this policy will burn 10s of
> megabytes.

[...]

> 
> > == Some history ==
> > 
> > The TDX Module Extensions part was first posted along with TDX
> > Connect [2]. Now this part is remarkably smaller because we've removed
> > the generic tdx_page_array abstraction for HPA_LIST_INFO. TDX Module
> > Extensions is the first user of HPA_LIST_INFO, and doesn't use it in a
> > typical way (HPA_LIST_INFO can only hold at most 2MB memory). There
> > isn't enough justification to make the abstraction in this series. A
> > possible plan is to rebuild tdx_page_array iteratively when more use
> > cases arise.
> 
> No need to talk about details not in this series. I would maybe just
> note that quoting is the simplest first consumer and was chosen as the
> lead vehicle over TDX Connect previously posted in case anyone asks.

Good to me, will include most of them, thanks.

^ permalink raw reply

* Re: [PATCH] PCI/TSM: Resume device to D0 for CMA-SPDM operation
From: Lukas Wunner @ 2026-06-15 15:37 UTC (permalink / raw)
  To: sashiko-reviews
  Cc: linux-pci, Dan Williams, Ashish Kalra, Tom Lendacky,
	Alexey Kardashevskiy, linux-coco, Jonathan Cameron
In-Reply-To: <20260615134252.B34A21F000E9@smtp.kernel.org>

On Mon, Jun 15, 2026 at 01:42:52PM +0000, sashiko-bot@kernel.org wrote:
> > +++ b/drivers/crypto/ccp/sev-dev-tsm.c
> > @@ -38,6 +40,10 @@ static int sev_tio_spdm_cmd(struct tio_dsm *dsm, int ret)
> >  	if (ret <= 0)
> >  		return ret;
> >  
> > +	PM_RUNTIME_ACQUIRE(&dsm->tsm.base_tsm.pdev->dev, pm);
> > +	if ((pm_ret = PM_RUNTIME_ACQUIRE_ERR(&pm)))
> > +		return pm_ret;
> > +
> 
> [Severity: Medium]
> The commit message mentions that "resuming to D0 needs to (only) be done in
> code paths which use DOE specifically for CMA-SPDM", but does this code
> unconditionally resume the device to D0 regardless of the operation?
> 
> Looking at sev_tio_spdm_cmd(), the resume happens before the loop checks if
> the operation is actually PCI_DOE_FEATURE_CMA or PCI_DOE_FEATURE_SSESSION.
> Could this cause unnecessary power transitions for generic operations where
> accessing the config space in D3hot would be sufficient?

Moving PM_RUNTIME_ACQUIRE() inside the while loop may lead to repeated
D0 -> D3hot -> D0 -> D3hot ... transitions (depending on autosuspend
settings of the device, which are user-configurable through sysfs).
It would also lead to overhead induced by runtime PM code (repeated
spinlock acquisition etc).

So I believe keeping PM_RUNTIME_ACQUIRE() outside the while loop is
the right thing to do, but I'll leave this to AMD engineers to decide.

> [Severity: High]
> Does using PM_RUNTIME_ACQUIRE() here risk leaving the hardware permanently
> stuck in D0 during the device removal path?
> 
> The PM_RUNTIME_ACQUIRE() macro queues an asynchronous pm_runtime_put() when
> dropping the reference. If sev_tio_spdm_cmd() is called during teardown
> sequences, the pending asynchronous suspend can be cancelled by a subsequent
> device_del() call, which disables runtime PM. 
> 
> Should a synchronous runtime put be used here to ensure the device actually
> suspends before the removal completes?

If the device is deleted anyway, we don't care about leaked references.
And we absolutely do not want to synchronously runtime suspend here.

> > +++ b/drivers/pci/tsm.c
> > @@ -894,6 +896,10 @@ int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, const void *req,
> >  	if (!tsm->doe_mb)
> >  		return -ENXIO;
> >  
> > +	PM_RUNTIME_ACQUIRE(&pdev->dev, pm);
> > +	if ((rc = PM_RUNTIME_ACQUIRE_ERR(&pm)))
> > +		return rc;
> > +
> 
> [Severity: Medium]
> Similarly to sev_tio_spdm_cmd(), does this code unconditionally resume the
> device to D0, ignoring the type parameter?
> 
> If a generic DOE transfer is requested that does not strictly require D0, it
> appears this will still force a transition to D0, contradicting the commit
> message's stated power management requirements.

Hallucination, this code does not perform "generic DOE" exchanges, only
CMA-SPDM ones.

> [Severity: High]
> Is this exported API also susceptible to the same asynchronous put regression
> mentioned above?
> 
> If pci_tsm_doe_transfer() is invoked during a teardown sequence, the async
> idle and suspend queued by the macro might get canceled by device_del(),
> leaving the device in D0.

We have to leave de-enumerated devices in D0 to ensure that a subsequent
rescan successfully re-enumerates them.  E.g. leaving a Downstream Port
in D3hot upon de-enumeration would leave any children inaccessible.

We also leave unbound devices in D0 for similar reasons.

Thanks,

Lukas

^ permalink raw reply

* Re: [PATCH v7 2/6] firmware: hwrng: arm_smccc_trng: Register as an SMCCC device
From: Andre Przywara @ 2026-06-15 15:15 UTC (permalink / raw)
  To: Aneesh Kumar K.V (Arm), linux-coco, linux-arm-kernel,
	linux-kernel
  Cc: Catalin Marinas, Greg KH, Jeremy Linton, Jonathan Cameron,
	Lorenzo Pieralisi, Mark Rutland, Sudeep Holla, Will Deacon,
	Steven Price, Suzuki K Poulose
In-Reply-To: <20260611130429.295516-3-aneesh.kumar@kernel.org>

Hi Aneesh,

thanks for doing this, we have thought about this for quite a while, but 
no one dared to just bite the bullet...

On 6/11/26 15:04, Aneesh Kumar K.V (Arm) wrote:
> The SMCCC TRNG interface is a firmware-provided SMCCC service rather than a
> standalone platform device. Now that the SMCCC core has an SMCCC bus,
> create an arm-smccc-trng device for the discovered TRNG service and convert
> the hwrng driver to an SMCCC driver.
> 
> The SMCCC id table preserves module autoloading for systems where the TRNG
> driver is built as a module.
> 
> The sysfs device path changes from the old smccc_trng platform-device path
> to an arm-smccc device path. No known userspace dependency on the old path
> was found; a Debian Code Search lookup for the existing platform-device
> name/path did not find any users.
> 
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> ---
>   arch/arm64/include/asm/archrandom.h     |  2 +-
>   drivers/char/hw_random/arm_smccc_trng.c | 32 +++++++++-----
>   drivers/firmware/smccc/smccc.c          | 58 +++++++++++++++++++++----
>   3 files changed, 71 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/archrandom.h b/arch/arm64/include/asm/archrandom.h
> index 8babfbe31f95..7605dd81bd1e 100644
> --- a/arch/arm64/include/asm/archrandom.h
> +++ b/arch/arm64/include/asm/archrandom.h
> @@ -12,7 +12,7 @@
>   
>   extern bool smccc_trng_available;
>   
> -static inline bool __init smccc_probe_trng(void)
> +static inline bool smccc_probe_trng(void)
>   {
>   	struct arm_smccc_res res;
>   
> diff --git a/drivers/char/hw_random/arm_smccc_trng.c b/drivers/char/hw_random/arm_smccc_trng.c
> index dcb8e7f37f25..8f7f9d830cf2 100644
> --- a/drivers/char/hw_random/arm_smccc_trng.c
> +++ b/drivers/char/hw_random/arm_smccc_trng.c
> @@ -16,8 +16,10 @@
>   #include <linux/device.h>
>   #include <linux/hw_random.h>
>   #include <linux/module.h>
> -#include <linux/platform_device.h>
>   #include <linux/arm-smccc.h>
> +#include <linux/arm-smccc-bus.h>
> +
> +#include <asm/archrandom.h>
>   
>   #ifdef CONFIG_ARM64
>   #define ARM_SMCCC_TRNG_RND	ARM_SMCCC_TRNG_RND64
> @@ -94,29 +96,37 @@ static int smccc_trng_read(struct hwrng *rng, void *data, size_t max, bool wait)
>   	return copied;
>   }
>   
> -static int smccc_trng_probe(struct platform_device *pdev)
> +static int smccc_trng_probe(struct arm_smccc_device *sdev)
>   {
>   	struct hwrng *trng;
>   
> -	trng = devm_kzalloc(&pdev->dev, sizeof(*trng), GFP_KERNEL);
> +	/* validate the minimum version requirement */
> +	if (!smccc_probe_trng())
> +		return -ENODEV;
> +
> +	trng = devm_kzalloc(&sdev->dev, sizeof(*trng), GFP_KERNEL);
>   	if (!trng)
>   		return -ENOMEM;
>   
>   	trng->name = "smccc_trng";
>   	trng->read = smccc_trng_read;
>   
> -	return devm_hwrng_register(&pdev->dev, trng);
> +	return devm_hwrng_register(&sdev->dev, trng);
>   }
>   
> -static struct platform_driver smccc_trng_driver = {
> -	.driver = {
> -		.name		= "smccc_trng",
> -	},
> -	.probe		= smccc_trng_probe,
> +static const struct arm_smccc_device_id smccc_trng_id_table[] = {
> +	{ .name = "arm-smccc-trng" },
> +	{}
> +};
> +MODULE_DEVICE_TABLE(arm_smccc, smccc_trng_id_table);
> +
> +static struct arm_smccc_driver smccc_trng_driver = {
> +	.name	  = KBUILD_MODNAME,
> +	.probe	  = smccc_trng_probe,
> +	.id_table = smccc_trng_id_table,
>   };
> -module_platform_driver(smccc_trng_driver);
> +module_arm_smccc_driver(smccc_trng_driver);
>   
> -MODULE_ALIAS("platform:smccc_trng");
>   MODULE_AUTHOR("Andre Przywara");
>   MODULE_DESCRIPTION("Arm SMCCC TRNG firmware interface support");
>   MODULE_LICENSE("GPL");
> diff --git a/drivers/firmware/smccc/smccc.c b/drivers/firmware/smccc/smccc.c
> index bdee057db2fd..a47696f3a5de 100644
> --- a/drivers/firmware/smccc/smccc.c
> +++ b/drivers/firmware/smccc/smccc.c
> @@ -9,7 +9,8 @@
>   #include <linux/init.h>
>   #include <linux/arm-smccc.h>
>   #include <linux/kernel.h>
> -#include <linux/platform_device.h>
> +#include <linux/arm-smccc-bus.h>
> +
>   #include <asm/archrandom.h>
>   
>   static u32 smccc_version = ARM_SMCCC_VERSION_1_0;
> @@ -81,16 +82,55 @@ bool arm_smccc_hypervisor_has_uuid(const uuid_t *hyp_uuid)
>   }
>   EXPORT_SYMBOL_GPL(arm_smccc_hypervisor_has_uuid);
>   
> +struct smccc_device_info {
> +	u32 func_id;
> +	bool requires_smc;
> +	const char *device_name;
> +};
> +
> +static const struct smccc_device_info smccc_devices[] __initconst = {
> +	{
> +		.func_id        = ARM_SMCCC_TRNG_VERSION,
> +		.requires_smc   = false,
> +		.device_name    = "arm-smccc-trng",
> +	},
> +};
> +
> +static bool __init smccc_probe_smccc_device(const struct smccc_device_info *smccc_dev)
> +{
> +	unsigned long ret;
> +	struct arm_smccc_res res;
> +
> +	if (smccc_conduit == SMCCC_CONDUIT_NONE)
> +		return false;
> +
> +	if (smccc_dev->requires_smc && smccc_conduit != SMCCC_CONDUIT_SMC)
> +		return false;
> +
> +	arm_smccc_1_1_invoke(smccc_dev->func_id, &res);
> +	ret = res.a0;

Mostly a nit:
Why the assignment to a variable of the same type here? Wouldn't it be 
cleaner to let "ret" be an "int"? Then you can save the cast below.
Or drop the assignment, and just cast res.a0 below directly.

In any case, I tested this in a KVM guest, and it worked flawlessly: the 
device is created, works, and sysfs looks good, both with this file 
compiled in (=y), and also as a module. Module autoloading also seems to 
work.
So that's:

Tested-by: Andre Przywara <andre.przywara@arm.com>

Cheers,
Andre.


> +
> +	if ((s32)ret == SMCCC_RET_NOT_SUPPORTED)
> +		return false;
> +
> +	return true;
> +}
> +
>   static int __init smccc_devices_init(void)
>   {
> -	struct platform_device *pdev;
> -
> -	if (smccc_trng_available) {
> -		pdev = platform_device_register_simple("smccc_trng", -1,
> -						       NULL, 0);
> -		if (IS_ERR(pdev))
> -			pr_err("smccc_trng: could not register device: %ld\n",
> -			       PTR_ERR(pdev));
> +	struct arm_smccc_device *sdev;
> +	const struct smccc_device_info *smccc_dev;
> +
> +	for (int i = 0; i < ARRAY_SIZE(smccc_devices); i++) {
> +		smccc_dev = &smccc_devices[i];
> +
> +		if (!smccc_probe_smccc_device(smccc_dev))
> +			continue;
> +
> +		sdev = arm_smccc_device_register(smccc_dev->device_name);
> +		if (IS_ERR(sdev))
> +			pr_err("%s: could not register device: %ld\n",
> +			       smccc_dev->device_name, PTR_ERR(sdev));
>   	}
>   
>   	return 0;


^ permalink raw reply

* [PATCH] PCI/TSM: Resume device to D0 for CMA-SPDM operation
From: Lukas Wunner @ 2026-06-15 13:19 UTC (permalink / raw)
  To: Dan Williams, Ashish Kalra, Tom Lendacky
  Cc: Vivaik Balasubrawmanian, John Allen, Bjorn Helgaas, linux-coco,
	linux-pci, Jonathan Cameron, Aneesh Kumar K.V, Yilun Xu,
	Zhenzhong Duan, Alexey Kardashevskiy

Per PCIe r7.0 sec 6.31.3, CMA-SPDM operation in non-D0 states is optional.
The spec does not define a way to determine if it's supported, so resume
to D0 unconditionally for the duration of a CMA-SPDM exchange.  Vivaik has
talked to Windows engineers and they said that Windows does the same.

Note that for plain DOE operation, it is sufficient for the device to be
in D3hot and its parents in D0 because config space remains accessible in
D3hot.  So CMA-SPDM goes beyond the requirements of plain DOE and hence
resuming to D0 needs to (only) be done in code paths which use DOE
specifically for CMA-SPDM.

The pattern used herein for runtime resume is the best practice introduced
by commit ef8057b07c72 ("PM: runtime: Wrapper macros for ACQUIRE()/
ACQUIRE_ERR()").

Fixes: 3225f52cde56 ("PCI/TSM: Establish Secure Sessions and Link Encryption")
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Cc: stable@vger.kernel.org # v6.19+
Cc: Vivaik Balasubrawmanian <vivaik.balasubrawmanian@intel.com>
---
We're in the merge window for v7.2 and this isn't super urgent,
so it's targeting v7.3 via tsm.git/next.

Technically I'd have permission to apply myself,
but I wouldn't want to without acks from Dan and AMD!
Thanks for taking a look!

 drivers/crypto/ccp/sev-dev-tsm.c | 6 ++++++
 drivers/pci/tsm.c                | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/drivers/crypto/ccp/sev-dev-tsm.c b/drivers/crypto/ccp/sev-dev-tsm.c
index b07ae52..108204f7 100644
--- a/drivers/crypto/ccp/sev-dev-tsm.c
+++ b/drivers/crypto/ccp/sev-dev-tsm.c
@@ -7,6 +7,7 @@
 #include <linux/tsm.h>
 #include <linux/iommu.h>
 #include <linux/pci-doe.h>
+#include <linux/pm_runtime.h>
 #include <linux/bitfield.h>
 #include <linux/module.h>
 
@@ -30,6 +31,7 @@ static int sev_tio_spdm_cmd(struct tio_dsm *dsm, int ret)
 {
 	struct tsm_dsm_tio *dev_data = &dsm->data;
 	struct tsm_spdm *spdm = &dev_data->spdm;
+	int pm_ret;
 
 	/* Check the main command handler response before entering the loop */
 	if (ret == 0 && dev_data->psp_ret != SEV_RET_SUCCESS)
@@ -38,6 +40,10 @@ static int sev_tio_spdm_cmd(struct tio_dsm *dsm, int ret)
 	if (ret <= 0)
 		return ret;
 
+	PM_RUNTIME_ACQUIRE(&dsm->tsm.base_tsm.pdev->dev, pm);
+	if ((pm_ret = PM_RUNTIME_ACQUIRE_ERR(&pm)))
+		return pm_ret;
+
 	/* ret > 0 means "SPDM requested" */
 	while (ret == PCI_DOE_FEATURE_CMA || ret == PCI_DOE_FEATURE_SSESSION) {
 		ret = pci_doe(dsm->tsm.doe_mb, PCI_VENDOR_ID_PCI_SIG, ret,
diff --git a/drivers/pci/tsm.c b/drivers/pci/tsm.c
index 5fdcd7f..af1817e 100644
--- a/drivers/pci/tsm.c
+++ b/drivers/pci/tsm.c
@@ -12,6 +12,7 @@
 #include <linux/pci.h>
 #include <linux/pci-doe.h>
 #include <linux/pci-tsm.h>
+#include <linux/pm_runtime.h>
 #include <linux/sysfs.h>
 #include <linux/tsm.h>
 #include <linux/xarray.h>
@@ -886,6 +887,7 @@ int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, const void *req,
 			 size_t req_sz, void *resp, size_t resp_sz)
 {
 	struct pci_tsm_pf0 *tsm;
+	int rc;
 
 	if (!pdev->tsm || !is_pci_tsm_pf0(pdev))
 		return -ENXIO;
@@ -894,6 +896,10 @@ int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, const void *req,
 	if (!tsm->doe_mb)
 		return -ENXIO;
 
+	PM_RUNTIME_ACQUIRE(&pdev->dev, pm);
+	if ((rc = PM_RUNTIME_ACQUIRE_ERR(&pm)))
+		return rc;
+
 	return pci_doe(tsm->doe_mb, PCI_VENDOR_ID_PCI_SIG, type, req, req_sz,
 		       resp, resp_sz);
 }
-- 
2.53.0


^ permalink raw reply related

* Re: [RFC PATCH] mm/vmalloc: add vmalloc_decrypted() and vzalloc_decrypted()
From: Jason Gunthorpe @ 2026-06-15 12:09 UTC (permalink / raw)
  To: Michael Kelley
  Cc: Catalin Marinas, Christoph Hellwig, Kameron Carr,
	akpm@linux-foundation.org, urezki@gmail.com, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org, rppt@kernel.org,
	linux-coco@lists.linux.dev, Suzuki K Poulose
In-Reply-To: <SN6PR02MB4157EC032AD55D182FBC1318D4182@SN6PR02MB4157.namprd02.prod.outlook.com>

On Fri, Jun 12, 2026 at 07:06:00PM +0000, Michael Kelley wrote:

> > I thought arches are either preserving the memory content or zeroing
> > it, you are saying some arch leaves it as garbage? I'd argue that's an
> > arch bug and they should clear it in their path.
> 
> AMD SEV-SNP leaves the memory contents as garbage after an encryption
> or decryption state change. On the flip side, my understanding has been
> that TDX zeroes the memory (or at least has an option to do so) after
> such a state change, though a couple of AI chats say TDX also leaves
> garbage. To be sure, I'd have to run an experiment to check in a TDX
> guest on Hyper-V.

So there are many bugs then if the pre-zero is lost and you have to
zero it again. Even swiotlb doesn't reliably zero its pools in the
right order under these rules, though alloc coherent does get it
right at least.

IMHO this is too sketchy to be usable and optimizing for AMD is not
the right call.

> > Otherwise this sharp edge is not documented and we have many other
> > places getting it wrong, eg system_heap_allocate() doesn't re-zero the
> > memory after decrypting it.
> 
> In the Hyper-V code that uses set_memory_decrypted()/encrypted(),
> there's always an explicit call to set the memory to zero afterwards.

Good for it, maybe next time improve the APIs :(

Even more compelling that hyper-v should be using the dma api..

Jason

^ permalink raw reply

* Re: [PATCH v14 10/44] arm64: RMI: Add support for SRO
From: Steven Price @ 2026-06-15 11:45 UTC (permalink / raw)
  To: Dan Williams (nvidia), Gavin Shan, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <6a2c91398fad5_a003b10027@djbw-dev.notmuch>

Hi Dan,

On 13/06/2026 00:07, Dan Williams (nvidia) wrote:
> Steven Price wrote:
> [..]
>>> alloc_pages_exact() will fail if the requested size exceeds the maximal
>>> allowed
>>> size (1 << MAX_PAGE_ORDER). The maximal size is usually smaller than
>>> PUD_SIZE
>>> but PUD_SIZE is allowed by the RMM.
>>
>> This is an area where to be honest I'm really not sure what to do.
>> Technically the RMM is allowed to ask for a contiguous range of 512GB
>> pages (on a 4K system - larger with larger page sizes) - but clearly no
>> real OS is going to be able to provide anything like that.
>>
>> In practice we don't expect the RMM to do anything so crazy. It's not
>> really clear to me whether even 2MB (PMD_SIZE) is needed. But the spec
>> is written to be generic.
>>
>> So my current approach is to calculate the required size and pass it
>> into alloc_pages_exact(). For "stupidly large" values this will fail and
>> Linux just doesn't support an RMM which attempts this. If there is ever
>> a usecase which needs this then we'd need to find a different method of
>> providing the memory (most likely some form of carveout to avoid
>> fragmentation). But my view is we should wait for that usecase to be
>> identified first.
> 
> Just some comparison comments as I am also going through the TDX patches
> which enable "Extension SEAMCALLs". These new SEAMCALLs are similar to
> the SRO mechanism [1].

Looks like at least at the moment it's much more one-way than the SRO
mechanism - there's no reclaim mechanism (yet).

> TDX asks for an upfront delegation of memory at init time using
> alloc_contig_pages() that is never returned until entire module is
> shutdown. alloc_contig_pages() is not subject to the MAX_ORDER limit,
> but not sure that alloc_contig_pages() is suitable for small+dynamic
> runtime memory add / release that SRO potentially wants to do?

Yeah I'm not sure quite what is best. I expect the RMM to only request
contiguous memory for very small allocations to use as hardware page
tables. It's an issue I'm trying to work through that the specification
doesn't provide any guidance for what sort of allocations the host
should expect to provide.

> Does SRO always balance the size of RMI_OP_MEM_REQ_DONATE with
> RMI_OP_MEM_REQ_RECLAIM, or might some donate requests be a one way
> donation like TDX? Just poking to see if there is a path to preallocate
> a pool vs the fine grained per-operation alloc/free.

The spec is unfortunately not prescriptive on this point. For an
operation which eventually fails, the expectation is that the RMM will
return all the memory that was provided (and exactly that memory). But
the specification doesn't actually require that.

The problem is that there are situations where a racing operation on
another CPU could trigger this to not happen. For example, a new page
table needs to be allocated to complete a map operation, but then a
racing operation on another CPU makes use of this page table (e.g. due to
a map at a different address), the memory for the page table cannot be
returned even if the operation doesn't complete because it's in use from
the racing operation.

I don't believe the current RMM design will actually do this - but it's
not something we actually want to prevent in the spec.

Equally the expectation is that all the donated memory for a guest will
be returned when the guest is destroyed. But we don't have anything in
the spec to enforce this.

I don't particularly expect a pool to be that useful for the expected
memory allocation patterns as I expect SRO donations to be long lived.
We don't (yet at least) have a concept of donating memory just for
"scratch" memory during an operation. Although the SRO mechanism doesn't
rule that out.

Thanks,
Steve

^ permalink raw reply

* Re: [PATCH RFC 0/3] KVM: guest_memfd: folio migration for non-confidential VMs
From: Alexandru Elisei @ 2026-06-15 11:04 UTC (permalink / raw)
  To: Shivank Garg
  Cc: Matthew Wilcox (Oracle), Jan Kara, Andrew Morton, Vlastimil Babka,
	Suren Baghdasaryan, Michal Hocko, Brendan Jackman,
	Johannes Weiner, Zi Yan, David Hildenbrand, Matthew Brost,
	Joshua Hahn, Rakie Kim, Byungchul Park, Gregory Price, Ying Huang,
	Alistair Popple, Paolo Bonzini, Shuah Khan, Chao Peng,
	Nikunj A Dadhania, Ira Weiny, Michael Roth, Pankaj Gupta,
	Ackerley Tng, Fuad Tabba, Sean Christopherson, Vishal Annapurve,
	Nikita Kalyazin, Patrick Roy, Pratik Sampat, Ashish Kalra,
	linux-fsdevel, linux-coco, linux-mm, linux-kernel, kvm,
	linux-kselftest
In-Reply-To: <ai_XK__RTXMCEcCG@raptor>

Hi,

On Mon, Jun 15, 2026 at 11:43:14AM +0100, Alexandru Elisei wrote:
> Hi,
> 
> On Thu, Jun 11, 2026 at 01:05:07PM +0000, Shivank Garg wrote:
> > guest_memfd folios are currently marked unmovable, so the kernel cannot
> > perform NUMA-balancing, memory compaction, etc. This is unavoidable for
> > confidential VMs (SEV-SNP, TDX), since memory is encrypted and copying it
> > needs firmware assistance. However, for non-confidential VMs (like
> > Firecracker), we can migrate the folios.
> > 
> > This series enables folio migration for non-confidential guest_memfd and
> > also lays the groundwork for migrating confidential guest_memfd later.
> > Once firmware-assisted copying support is available, those VMs can be
> > made movable, the confidential folio content can be copied separately,
> > and the destination folio marked with FOLIO_CONTENT_COPIED so
> > __migrate_folio() skips the host-side folio_mc_copy().
> 
> I always thought that one of the nice things about using guest_memfd as a
> memory backend, as opposed to host userspace mappings, is that the host
> cannot unmap VM memory because of KSM, automatic NUMA balancing, hugepage
> collapse, compaction, etc, acting on the host userspace mapping of the
> VM memory, and outside of the VMM's or KVM's control.
> 
> I think it would be useful to preserve this behaviour, even in the absence
> of confidential VMs (i.e., guest_memfd file descriptor created with
> GUEST_MEMFD_FLAG_MMAP).

Just to be clear, I was thinking that it might be useful for both
behaviours to exist (migratable and non-migratable) for non-confidential
VMs, and allow KVM or userspace to decide which they prefer for a
guest_memfd.

Thanks,
Alex

^ permalink raw reply

* Re: [PATCH v13 09/22] KVM: selftests: Expose functions to get default sregs values
From: Chenyi Qiang @ 2026-06-15 10:54 UTC (permalink / raw)
  To: Binbin Wu, Lisa Wang
  Cc: Andrew Jones, Ackerley Tng, Chao Gao, Dave Hansen, Erdem Aktas,
	Ira Weiny, Isaku Yamahata, Kiryl Shutsemau, linux-kselftest,
	Paolo Bonzini, Pratik R. Sampat, Reinette Chatre, Rick Edgecombe,
	Roger Wang, Ryan Afranji, Sagi Shahar, Sean Christopherson,
	Shuah Khan, Oliver Upton, Jeremiah McReynolds, kvm, linux-coco,
	linux-kernel, x86
In-Reply-To: <434e7f9a-5f64-4488-bf9d-5be8c3f9eefe@linux.intel.com>



On 6/8/2026 2:39 PM, Binbin Wu wrote:
> On 5/22/2026 7:16 AM, Lisa Wang wrote:
> 
> [...]
> 
>> +
>> +static inline u64 kvm_get_default_cr4(void)
>> +{
>> +	u64 cr4 = X86_CR4_PAE | X86_CR4_OSFXSR;
>> +
>> +	if (kvm_cpu_has(X86_FEATURE_XSAVE))
>> +		cr4 |= X86_CR4_OSXSAVE;
>> +	return cr4;
>> +}
>> +
> 
> [...]
> 
>> @@ -647,16 +643,12 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
>>  	vcpu_sregs_get(vcpu, &sregs);
>>  
>>  	sregs.idt.base = vm->arch.idt;
>> -	sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1;
>> +	sregs.idt.limit = kvm_get_default_idt_limit();
>>  	sregs.gdt.base = vm->arch.gdt;
>> -	sregs.gdt.limit = getpagesize() - 1;
>> -
>> -	sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
>> -	sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;
>> -	if (kvm_cpu_has(X86_FEATURE_XSAVE))
>> -		sregs.cr4 |= X86_CR4_OSXSAVE;
>> -	if (vm->mmu.pgtable_levels == 5)
>> -		sregs.cr4 |= X86_CR4_LA57;
> 
> I guess the 5-level paging thing is dropped unexpectedly during rebase?
> 
> 
>> +	sregs.gdt.limit = kvm_get_default_gdt_limit();
>>
>> +	sregs.cr0 = kvm_get_default_cr0();
>> +	sregs.cr4 |= kvm_get_default_cr4();
>>  	sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);

Also, sregs.efer |= kvm_get_default_efer() is dropped unexpectedly during rebase.

>>  
>>  	kvm_seg_set_unusable(&sregs.ldt);
>>
> 


^ permalink raw reply

* Re: [PATCH RFC 0/3] KVM: guest_memfd: folio migration for non-confidential VMs
From: Alexandru Elisei @ 2026-06-15 10:43 UTC (permalink / raw)
  To: Shivank Garg
  Cc: Matthew Wilcox (Oracle), Jan Kara, Andrew Morton, Vlastimil Babka,
	Suren Baghdasaryan, Michal Hocko, Brendan Jackman,
	Johannes Weiner, Zi Yan, David Hildenbrand, Matthew Brost,
	Joshua Hahn, Rakie Kim, Byungchul Park, Gregory Price, Ying Huang,
	Alistair Popple, Paolo Bonzini, Shuah Khan, Chao Peng,
	Nikunj A Dadhania, Ira Weiny, Michael Roth, Pankaj Gupta,
	Ackerley Tng, Fuad Tabba, Sean Christopherson, Vishal Annapurve,
	Nikita Kalyazin, Patrick Roy, Pratik Sampat, Ashish Kalra,
	linux-fsdevel, linux-coco, linux-mm, linux-kernel, kvm,
	linux-kselftest
In-Reply-To: <20260611-shivank-gmem-migrate-v1-0-2d266bfc6f95@amd.com>

Hi,

On Thu, Jun 11, 2026 at 01:05:07PM +0000, Shivank Garg wrote:
> guest_memfd folios are currently marked unmovable, so the kernel cannot
> perform NUMA-balancing, memory compaction, etc. This is unavoidable for
> confidential VMs (SEV-SNP, TDX), since memory is encrypted and copying it
> needs firmware assistance. However, for non-confidential VMs (like
> Firecracker), we can migrate the folios.
> 
> This series enables folio migration for non-confidential guest_memfd and
> also lays the groundwork for migrating confidential guest_memfd later.
> Once firmware-assisted copying support is available, those VMs can be
> made movable, the confidential folio content can be copied separately,
> and the destination folio marked with FOLIO_CONTENT_COPIED so
> __migrate_folio() skips the host-side folio_mc_copy().

I always thought that one of the nice things about using guest_memfd as a
memory backend, as opposed to host userspace mappings, is that the host
cannot unmap VM memory because of KSM, automatic NUMA balancing, hugepage
collapse, compaction, etc, acting on the host userspace mapping of the
VM memory, and outside of the VMM's or KVM's control.

I think it would be useful to preserve this behaviour, even in the absence
of confidential VMs (i.e., guest_memfd file descriptor created with
GUEST_MEMFD_FLAG_MMAP).

Thanks,
Alex

> 
> Testing
> -------
> Host: 7.1-rc7 + this, 2 NUMA nodes
> 
> - KVM selftest: allocate folios on node 0, migrate them to node 1 and
>   back and verify resulting NUMA node and the folio contents at each
>   step.
> 
> - Firecracker [1]: booted a microVM backed by guest_memfd. While the
>   guest was running, forced host-side migration of its folios via
>   migratepages(8) and explicit move_pages(2) of guest_memfd
>   pages. Verify with /proc/firecracker_pid/numa_maps.
> 
> [1] https://github.com/firecracker-microvm/firecracker/tree/feature/secret-hiding
>     and change builder.rs to remove GUEST_MEMFD_FLAG_NO_DIRECT_MAP from
>     vm.create_guest_memfd()
> 
> Best regards,
> Shivank
> 
> Signed-off-by: Shivank Garg <shivankg@amd.com>
> ---
> Shivank Garg (3):
>       mm: split AS_UNMOVABLE back out of AS_INACCESSIBLE
>       KVM: guest_memfd: support folio migration for non-confidential VMs
>       KVM: selftests: exercise guest_memfd folio migration
> 
>  include/linux/pagemap.h                        | 24 ++++++--
>  mm/compaction.c                                | 12 ++--
>  mm/migrate.c                                   |  2 +-
>  tools/testing/selftests/kvm/guest_memfd_test.c | 77 ++++++++++++++++++++++++++
>  virt/kvm/guest_memfd.c                         | 49 ++++++++++++++--
>  5 files changed, 149 insertions(+), 15 deletions(-)
> ---
> base-commit: 4549871118cf616eecdd2d939f78e3b9e1dddc48
> change-id: 20260611-shivank-gmem-migrate-8c1c519b30a6
> 
> Best regards,
> -- 
> Shivank Garg <shivankg@amd.com>
> 
> 

^ permalink raw reply

* RE: [RFC PATCH 0/6] Support virtio-mem memory hotplug in TDX guests
From: Duan, Zhenzhong @ 2026-06-15  7:54 UTC (permalink / raw)
  To: Kiryl Shutsemau
  Cc: marcandre.lureau@redhat.com, david@kernel.org, Edgecombe, Rick P,
	prsampat@amd.com, pbonzini@redhat.com, mst@redhat.com,
	peterx@redhat.com, Qiang, Chenyi, Reshetova, Elena,
	michaeluth@amd.com, ackerleytng@google.com,
	linux-kernel@vger.kernel.org, linux-coco@lists.linux.dev,
	virtualization@lists.linux.dev, x86@kernel.org, Xu, Yilun,
	Li, Xiaoyao, Peng, Chao P
In-Reply-To: <aiv0y-Op9bfP-CVO@thinkstation>

>-----Original Message-----
>From: Kiryl Shutsemau <kas@kernel.org>
>Subject: Re: [RFC PATCH 0/6] Support virtio-mem memory hotplug in TDX guests
>
>On Thu, Jun 04, 2026 at 05:35:45AM -0400, Zhenzhong Duan wrote:
>> 2. Re-accepting already-accepted memory returns errors. Ignoring these errors
>> can mislead the guest into believing re-accepted memory is zeroed when it
>> contains stale data.
>
>Re-accepting concern is valid, but often overblown.

> Reaccepting memory that never got allocated is fine.

I don't quite understand. "Reaccepting" implies accepting memory that was
already accepted earlier. For that to happen, the memory must have already
been allocated on the VMM side, correct?

>
>> == About this series ==
>>
>> This series takes a different direction, supporting start-private memory
>> and addressing the limitations of previous series [1] by implementing a
>> callback-based infrastructure that integrates TDX memory acceptance and
>> release operations with proper subblock granularity.
>
>You are presenting these callbacks as generic memory hotplug thingy, but
>it is only plugged into virtio mem. ACPI hotplug won't accept/release
>memory unless I miss something. Are you expecting them to cover non
>virtio cases too?

You are right, I didn't add ACPI hotplug in this series. I'm working on RFCv2
supporting both virtio-mem and ACPI hotplug in eager/lazy accept mode.

>
>And these callbacks feels like very ad-hoc solution.

OK, will drop the callbacks in RFCv2.

>
>> See Rick and Paolo's
>> discussion about using TDG.MEM.PAGE.RELEASE in [1].
>
>Having RELEASE in hotplug path without addressing private->shared
>conversion first is odd. That's the most obvious path that has to be
>covered first.
>
>Hm?

This patch series assumes that memory is plugged in as private memory
and must remain private prior to being unplugged. During the unplugging
process, memory is allocated from the buddy system and marked as
FAKE_OFFLINE. Because all free memory within the buddy system is
strictly private, shared memory can never be unplugged.

Shared memory is originally converted from private memory allocated by
the buddy system. Consequently, the driver must convert any shared
memory back to private and return it to the buddy system before it can
be unplugged.

>
>> == Future work ==
>> support lazy accept
>
>It would be nice to have some outline on how we will get there to
>understand if this patchset is stepping stone or dead end that has to be
>thrown away later on.

I realized the callbacks are specifically used for eager accept; they are not
useful for lazy accept. So, I will drop them in RFCv2.

>
>Hot[un]plug is often used to manage an overcommitted host. Eager accept
>might be counter-productive.

Agree, I should have taken lazy accept into consideration from start.

Thanks
Zhenzhong

^ permalink raw reply

* Re: [RFC PATCH 13/15] KVM: TDX: Support event-notify interrupts only with userspace quoting
From: Adrian Hunter @ 2026-06-15  4:39 UTC (permalink / raw)
  To: Peter Fang
  Cc: Xu Yilun, kas, djbw, rick.p.edgecombe, x86, linux-coco,
	linux-kernel, kvm, sohil.mehta, yilun.xu, baolu.lu,
	zhenzhong.duan, xiaoyao.li
In-Reply-To: <20260614125750.GB3425618@pedri>

>>> @@ -7335,6 +7335,9 @@ inputs and outputs of the TDVMCALL.  Currently the following values of
>>>     queued successfully, the TDX guest can poll the status field in the
>>>     shared-memory area to check whether the Quote generation is completed or
>>>     not. When completed, the generated Quote is returned via the same buffer.
>>> +   If the host kernel generates Quotes through the TDX Quoting service provided
>>> +   by the TDX module, KVM processes the GetQuote request and it will not appear
>>> +   in userspace.
>>
>> There is an Attestation section in Documentation/virt/kvm/x86/intel-tdx.rst
>> that could be updated too.
> 
> Can you please point me to it? I couldn't find that section in that
> file.

Sorry, got the file name wrong: Documentation/arch/x86/tdx.rst


^ permalink raw reply

* Re: [RFC PATCH 13/15] KVM: TDX: Support event-notify interrupts only with userspace quoting
From: Peter Fang @ 2026-06-14 12:57 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: Xu Yilun, kas, djbw, rick.p.edgecombe, x86, linux-coco,
	linux-kernel, kvm, sohil.mehta, yilun.xu, baolu.lu,
	zhenzhong.duan, xiaoyao.li
In-Reply-To: <7090f4af-3a6d-40fd-82ab-0ba6272534dd@intel.com>

On Thu, Jun 11, 2026 at 10:36:52PM +0300, Adrian Hunter wrote:
> On 22/05/2026 06:41, Xu Yilun wrote:
> > From: Peter Fang <peter.fang@intel.com>
> > 
> > Tie userspace SetupEventNotifyInterrupt support to userspace Quote
> > generation. Delivering event-notify interrupts via userspace breaks if
> > KVM never exits to userspace in the first place.
> 
> Breaks how exactly?
> 
> Seems like a TDX guest has no way to know whether the VMM will use
> the Event Notify Interrupt anyway, so it cannot rely upon it, so
> it should already handle the case when the interrupt does not fire.

Hm that's an interesting point. But isn't the whole point of
SetupEventNotifyInterrupt to set up a contract with the host VMM? The
GHCI spec is quite loose about this.

If we say "the host VMM is not required to honor this contract", then
maybe this doesn't truly break anything. But then this stance kind of
makes this whole feature moot, or at least not very useful?

Not adding this patch feels like making this problem worse, right?
Because now we will have platforms that won't ever fire these
interrupts, and the host still tells the guest SetupEventNotifyInterrupt
is supported.

> 
> > 
> > No known guest currently requires event-notify interrupt support, so
> > defer adding in-kernel support for now. Linux TDX guests use polling
> > only.
> 
> If no guest is using it, then why does it need special treatment?

Just to maintain status quo basically. Seems like previously there was
some interest in adding this support to the guest at some point. This
patch simply turns off this feature when quoting is not done in
userspace. But platforms that do quoting in userspace (e.g. don't
support DICE extension) can observe the same behavior as today, if/when
such a guest comes into existence.

> 
> > 
> > @@ -7335,6 +7335,9 @@ inputs and outputs of the TDVMCALL.  Currently the following values of
> >     queued successfully, the TDX guest can poll the status field in the
> >     shared-memory area to check whether the Quote generation is completed or
> >     not. When completed, the generated Quote is returned via the same buffer.
> > +   If the host kernel generates Quotes through the TDX Quoting service provided
> > +   by the TDX module, KVM processes the GetQuote request and it will not appear
> > +   in userspace.
> 
> There is an Attestation section in Documentation/virt/kvm/x86/intel-tdx.rst
> that could be updated too.

Can you please point me to it? I couldn't find that section in that
file.

> 
> > +                  KVM only supports version 1 of the GetQuote request.
> 
> Is that relevant here?

Documenting this came up during some internal discussions. But yeah it
looks a bit out of place. I can remove it.

> 
> >  
> >   * ``TDVMCALL_GET_TD_VM_CALL_INFO``: the guest has requested the support
> >     status of TDVMCALLs.  The output values for the given leaf should be
> > @@ -7342,7 +7345,10 @@ inputs and outputs of the TDVMCALL.  Currently the following values of
> >     field of the union.
> >  
> >   * ``TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT``: the guest has requested to
> > -   set up a notification interrupt for vector ``vector``.
> > +   set up a notification interrupt for vector ``vector``.  Since this TDVMCALL
> > +   is used to optimize ``TDVMCALL_GET_QUOTE``, KVM disables this support in
> > +   userspace VMM if ``TDVMCALL_GET_QUOTE`` is completely handled in the kernel.
> > +   KVM may add kernel support for this in the future.
> 
> Is that really necessary?

I think this is related to the discussion above about how hard host VMM
should try to honor the SetupEventNotifyInterrupt contract.

> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox