($INBOX_DIR/description missing)
 help / color / mirror / Atom feed
* [PATCH v7 2/6] x86/sev: Initialize RMPOPT configuration MSRs
From: Ashish Kalra @ 2026-06-08 18:56 UTC (permalink / raw)
  To: tglx, mingo, bp, dave.hansen, x86, hpa, seanjc, peterz,
	thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
	Nathan.Fontenot, ackerleytng, jackyli, pgonda, rientjes, jacobhxu,
	xin, pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen,
	darwi, linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <cover.1780903370.git.ashish.kalra@amd.com>

From: Ashish Kalra <ashish.kalra@amd.com>

The new RMPOPT instruction helps manage per-CPU RMP optimization
structures inside the CPU. It takes a 1GB-aligned physical address
and either returns the status of the optimizations or tries to enable
the optimizations.

Per-CPU RMPOPT tables support at most 2 TB of addressable memory for
RMP optimizations.

Initialize the per-CPU RMPOPT table base to the starting physical
address. This enables RMP optimization for up to 2 TB of system RAM on
all CPUs.

Additionally, add support to setup and enable RMPOPT once SNP is
enabled and initialized.

Suggested-by: Thomas Lendacky <thomas.lendacky@amd.com>
Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
---
 arch/x86/coco/core.c             |  3 ++
 arch/x86/include/asm/msr-index.h |  3 ++
 arch/x86/include/asm/sev.h       |  4 ++
 arch/x86/virt/svm/sev.c          | 72 +++++++++++++++++++++++++++++++-
 drivers/crypto/ccp/sev-dev.c     |  3 ++
 5 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
index 989ca9f72ba3..a8fc2ae50298 100644
--- a/arch/x86/coco/core.c
+++ b/arch/x86/coco/core.c
@@ -16,6 +16,7 @@
 #include <asm/archrandom.h>
 #include <asm/coco.h>
 #include <asm/processor.h>
+#include <asm/sev.h>
 
 enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE;
 SYM_PIC_ALIAS(cc_vendor);
@@ -172,6 +173,8 @@ static void amd_cc_platform_clear(enum cc_attr attr)
 	switch (attr) {
 	case CC_ATTR_HOST_SEV_SNP:
 		cc_flags.host_sev_snp = 0;
+		snp_clear_rmpopt_configured();
+		setup_clear_cpu_cap(X86_FEATURE_RMPOPT);
 		break;
 	default:
 		break;
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 86554de9a3f5..28540744f1eb 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -761,6 +761,9 @@
 #define MSR_AMD64_SEG_RMP_ENABLED_BIT	0
 #define MSR_AMD64_SEG_RMP_ENABLED	BIT_ULL(MSR_AMD64_SEG_RMP_ENABLED_BIT)
 #define MSR_AMD64_RMP_SEGMENT_SHIFT(x)	(((x) & GENMASK_ULL(13, 8)) >> 8)
+#define MSR_AMD64_RMPOPT_BASE		0xc0010139
+#define MSR_AMD64_RMPOPT_ENABLE_BIT	0
+#define MSR_AMD64_RMPOPT_ENABLE		BIT_ULL(MSR_AMD64_RMPOPT_ENABLE_BIT)
 
 #define MSR_SVSM_CAA			0xc001f000
 
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 594cfa19cbd4..0d662221615a 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -662,6 +662,8 @@ static inline void snp_leak_pages(u64 pfn, unsigned int pages)
 	__snp_leak_pages(pfn, pages, true);
 }
 int snp_prepare(void);
+void snp_setup_rmpopt(void);
+void snp_clear_rmpopt_configured(void);
 void snp_shutdown(void);
 #else
 static inline bool snp_probe_rmptable_info(void) { return false; }
@@ -680,6 +682,8 @@ static inline void snp_leak_pages(u64 pfn, unsigned int npages) {}
 static inline void kdump_sev_callback(void) { }
 static inline void snp_fixup_e820_tables(void) {}
 static inline int snp_prepare(void) { return -ENODEV; }
+static inline void snp_setup_rmpopt(void) {}
+static inline void snp_clear_rmpopt_configured(void) {}
 static inline void snp_shutdown(void) {}
 #endif
 
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index 8bcdce98f6dc..482008bb07e4 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -124,6 +124,10 @@ static void *rmp_bookkeeping __ro_after_init;
 
 static u64 probed_rmp_base, probed_rmp_size;
 
+static cpumask_t rmpopt_cpumask;
+static phys_addr_t rmpopt_pa_start;
+static bool rmpopt_configured;
+
 static LIST_HEAD(snp_leaked_pages_list);
 static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
 
@@ -488,9 +492,14 @@ static bool __init setup_segmented_rmptable(void)
 static bool __init setup_rmptable(void)
 {
 	if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) {
-		if (!setup_segmented_rmptable())
+		if (!setup_segmented_rmptable()) {
+			setup_clear_cpu_cap(X86_FEATURE_RMPOPT);
 			return false;
+		}
+		rmpopt_configured = true;
 	} else {
+		/* Note that Segmented RMP must be enabled to enable RMPOPT. */
+		setup_clear_cpu_cap(X86_FEATURE_RMPOPT);
 		if (!setup_contiguous_rmptable())
 			return false;
 	}
@@ -555,6 +564,21 @@ int snp_prepare(void)
 }
 EXPORT_SYMBOL_FOR_MODULES(snp_prepare, "ccp");
 
+static void rmpopt_cleanup(void)
+{
+	int cpu;
+
+	cpus_read_lock();
+
+	for_each_cpu(cpu, &rmpopt_cpumask)
+		wrmsrq_on_cpu(cpu, MSR_AMD64_RMPOPT_BASE, 0);
+
+	cpus_read_unlock();
+
+	cpumask_clear(&rmpopt_cpumask);
+	rmpopt_pa_start = 0;
+}
+
 void snp_shutdown(void)
 {
 	u64 syscfg;
@@ -563,11 +587,57 @@ void snp_shutdown(void)
 	if (syscfg & MSR_AMD64_SYSCFG_SNP_EN)
 		return;
 
+	rmpopt_cleanup();
+
 	clear_rmp();
 	on_each_cpu(mfd_reconfigure, NULL, 1);
 }
 EXPORT_SYMBOL_FOR_MODULES(snp_shutdown, "ccp");
 
+void snp_clear_rmpopt_configured(void)
+{
+	rmpopt_configured = false;
+}
+
+void snp_setup_rmpopt(void)
+{
+	u64 rmpopt_base;
+	int cpu;
+
+	if (!cpu_feature_enabled(X86_FEATURE_RMPOPT) || !rmpopt_configured)
+		return;
+
+	cpus_read_lock();
+
+	/*
+	 * The RMPOPT_BASE MSR is per-core, so only one thread per core needs
+	 * to set up the RMPOPT_BASE MSR.
+	 *
+	 * Note: only online primary threads are included.  If a core's
+	 * primary thread is offline, that core is not covered.  CPU hotplug
+	 * is not currently supported with SNP enabled.
+	 */
+
+	for_each_online_cpu(cpu)
+		if (topology_is_primary_thread(cpu))
+			cpumask_set_cpu(cpu, &rmpopt_cpumask);
+
+	rmpopt_pa_start = ALIGN_DOWN(PFN_PHYS(min_low_pfn), SZ_1G);
+	rmpopt_base = rmpopt_pa_start | MSR_AMD64_RMPOPT_ENABLE;
+
+	/*
+	 * Per-CPU RMPOPT tables support at most 2 TB of addressable memory
+	 * for RMP optimizations. Initialize the per-CPU RMPOPT table base
+	 * to the starting physical address to enable RMP optimizations for
+	 * up to 2 TB of system RAM on all CPUs.
+	 */
+	for_each_cpu(cpu, &rmpopt_cpumask)
+		wrmsrq_on_cpu(cpu, MSR_AMD64_RMPOPT_BASE, rmpopt_base);
+
+	cpus_read_unlock();
+}
+EXPORT_SYMBOL_FOR_MODULES(snp_setup_rmpopt, "ccp");
+
 /*
  * Do the necessary preparations which are verified by the firmware as
  * described in the SNP_INIT_EX firmware command description in the SNP
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index 78f98aee7a66..217b6b19802e 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -1478,6 +1478,9 @@ static int __sev_snp_init_locked(int *error, unsigned int max_snp_asid)
 	}
 
 	snp_hv_fixed_pages_state_update(sev, HV_FIXED);
+
+	snp_setup_rmpopt();
+
 	sev->snp_initialized = true;
 	dev_dbg(sev->dev, "SEV-SNP firmware initialized, SEV-TIO is %s\n",
 		data.tio_en ? "enabled" : "disabled");
-- 
2.43.0


^ permalink raw reply related

* [PATCH v7 3/6] x86/sev: Add support to perform RMP optimizations asynchronously
From: Ashish Kalra @ 2026-06-08 18:56 UTC (permalink / raw)
  To: tglx, mingo, bp, dave.hansen, x86, hpa, seanjc, peterz,
	thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
	Nathan.Fontenot, ackerleytng, jackyli, pgonda, rientjes, jacobhxu,
	xin, pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen,
	darwi, linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <cover.1780903370.git.ashish.kalra@amd.com>

From: Ashish Kalra <ashish.kalra@amd.com>

When SEV-SNP is enabled, all writes to memory are checked to ensure
integrity of SNP guest memory. This imposes performance overhead on the
whole system.

RMPOPT is a new instruction that minimizes the performance overhead of
RMP checks on the hypervisor and on non-SNP guests by allowing RMP
checks to be skipped for 1GB regions of memory that are known not to
contain any SEV-SNP guest memory.

Add support for performing RMP optimizations asynchronously using a
dedicated workqueue.

Enable RMPOPT optimizations for up to 2TB of system RAM starting from
the lowest physical memory address aligned down to a 1GB boundary at
RMP initialization time. RMP checks can initially be skipped for 1GB
memory ranges that do not contain SEV-SNP guest memory (excluding
preassigned pages such as the RMP table and firmware pages). As SNP
guests are launched, RMPUPDATE will disable the corresponding RMPOPT
optimizations.

Suggested-by: Thomas Lendacky <thomas.lendacky@amd.com>
Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
---
 arch/x86/virt/svm/sev.c | 208 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 205 insertions(+), 3 deletions(-)

diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index 482008bb07e4..b42788a66d40 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -19,6 +19,7 @@
 #include <linux/iommu.h>
 #include <linux/amd-iommu.h>
 #include <linux/nospec.h>
+#include <linux/workqueue.h>
 
 #include <asm/sev.h>
 #include <asm/processor.h>
@@ -125,9 +126,20 @@ static void *rmp_bookkeeping __ro_after_init;
 static u64 probed_rmp_base, probed_rmp_size;
 
 static cpumask_t rmpopt_cpumask;
-static phys_addr_t rmpopt_pa_start;
+static phys_addr_t rmpopt_pa_start, rmpopt_pa_end;
 static bool rmpopt_configured;
 
+enum rmpopt_function {
+	RMPOPT_FUNC_VERIFY_AND_REPORT_STATUS,
+	RMPOPT_FUNC_REPORT_STATUS
+};
+
+#define RMPOPT_WORK_TIMEOUT	10000
+
+static struct workqueue_struct *rmpopt_wq;
+static struct delayed_work rmpopt_delayed_work;
+static DEFINE_MUTEX(rmpopt_wq_mutex);
+
 static LIST_HEAD(snp_leaked_pages_list);
 static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
 
@@ -568,6 +580,14 @@ static void rmpopt_cleanup(void)
 {
 	int cpu;
 
+	guard(mutex)(&rmpopt_wq_mutex);
+
+	if (!rmpopt_wq)
+		return;
+
+	cancel_delayed_work_sync(&rmpopt_delayed_work);
+	destroy_workqueue(rmpopt_wq);
+
 	cpus_read_lock();
 
 	for_each_cpu(cpu, &rmpopt_cpumask)
@@ -576,7 +596,8 @@ static void rmpopt_cleanup(void)
 	cpus_read_unlock();
 
 	cpumask_clear(&rmpopt_cpumask);
-	rmpopt_pa_start = 0;
+	rmpopt_pa_start = rmpopt_pa_end = 0;
+	rmpopt_wq = NULL;
 }
 
 void snp_shutdown(void)
@@ -599,6 +620,146 @@ void snp_clear_rmpopt_configured(void)
 	rmpopt_configured = false;
 }
 
+/*
+ * RMPOPT: F2 0F 01 FC
+ *   Input:  RAX = system physical address (1GB aligned)
+ *           RCX = operation type
+ *   Output: CF set if the range was optimized
+ */
+static inline bool __rmpopt(u64 pa_start, u64 op_type)
+{
+	bool optimized;
+
+	asm volatile(".byte 0xf2, 0x0f, 0x01, 0xfc"
+		     : "=@ccc" (optimized)
+		     : "a" (pa_start), "c" (op_type)
+		     : "memory", "cc");
+
+	return optimized;
+}
+
+static void rmpopt(u64 pa)
+{
+	u64 pa_start = ALIGN_DOWN(pa, SZ_1G);
+	u64 op_type = RMPOPT_FUNC_VERIFY_AND_REPORT_STATUS;
+
+	__rmpopt(pa_start, op_type);
+}
+
+/*
+ * 'val' is a system physical address.
+ */
+static void rmpopt_smp(void *val)
+{
+	rmpopt((u64)val);
+}
+
+/*
+ * RMPOPT optimizations skip RMP checks at 1GB granularity if this
+ * range of memory does not contain any SNP guest memory.
+ */
+static void rmpopt_work_handler(struct work_struct *work)
+{
+	cpumask_var_t follower_mask;
+	phys_addr_t pa;
+	int this_cpu;
+
+	pr_info("Attempt RMP optimizations on physical address range @1GB alignment [0x%016llx - 0x%016llx]\n",
+		rmpopt_pa_start, rmpopt_pa_end);
+
+	if (!alloc_cpumask_var(&follower_mask, GFP_KERNEL))
+		return;
+
+	/*
+	 * RMPOPT scans the RMP table, stores the result of the scan in the
+	 * reserved processor memory. The RMP scan is the most expensive
+	 * part. If a second RMPOPT occurs, it can skip the expensive scan
+	 * if they can see a cached result in the reserved processor memory.
+	 *
+	 * Do RMPOPT on one CPU alone. Then, follow that up with RMPOPT
+	 * on every other primary thread. Followers are "designed to"
+	 * skip the scan if they see the "cached" scan results.
+	 */
+	cpumask_copy(follower_mask, &rmpopt_cpumask);
+
+	/*
+	 * Pin the worker to the current CPU for the leader loop so that
+	 * this_cpu remains valid and the RMPOPT instruction executes on
+	 * the correct CPU.
+	 *
+	 * Use migrate_disable() rather than get_cpu() to prevent
+	 * migration while still allowing preemption.
+	 */
+	migrate_disable();
+	this_cpu = smp_processor_id();
+
+	if (cpumask_test_cpu(this_cpu, follower_mask)) {
+		/*
+		 * Current CPU is a primary thread in rmpopt_cpumask.
+		 * Run leader locally and remove from follower mask.
+		 */
+		cpumask_clear_cpu(this_cpu, follower_mask);
+
+		for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
+			rmpopt(pa);
+			cond_resched();
+		}
+	} else if (cpumask_intersects(topology_sibling_cpumask(this_cpu),
+				      follower_mask)) {
+		/*
+		 * Current CPU is a sibling thread whose primary is in
+		 * rmpopt_cpumask.  RMPOPT_BASE MSR is per-core, so it
+		 * is safe to run the leader locally.  Remove the sibling's
+		 * primary from the follower mask as this core is already
+		 * covered by the leader.
+		 */
+		cpumask_andnot(follower_mask, follower_mask,
+			       topology_sibling_cpumask(this_cpu));
+
+		for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
+			rmpopt(pa);
+			cond_resched();
+		}
+	} else {
+		/*
+		 * Current CPU does not have RMPOPT_BASE MSR programmed.
+		 * Pick an explicit leader from the cpumask to avoid #UD.
+		 */
+		int leader_cpu = cpumask_first(follower_mask);
+
+		if (WARN_ON_ONCE(leader_cpu >= nr_cpu_ids)) {
+			migrate_enable();
+			goto out;
+		}
+
+		cpumask_clear_cpu(leader_cpu, follower_mask);
+
+		cpus_read_lock();
+		for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
+			smp_call_function_single(leader_cpu, rmpopt_smp,
+						 (void *)pa, true);
+			cond_resched();
+		}
+		cpus_read_unlock();
+	}
+
+	migrate_enable();
+
+	/* Followers: run RMPOPT on remaining cores */
+	cpus_read_lock();
+	for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
+		on_each_cpu_mask(follower_mask, rmpopt_smp,
+				 (void *)pa, true);
+
+		 /* Give a chance for other threads to run */
+		cond_resched();
+	}
+	cpus_read_unlock();
+
+out:
+	free_cpumask_var(follower_mask);
+}
+
 void snp_setup_rmpopt(void)
 {
 	u64 rmpopt_base;
@@ -607,11 +768,35 @@ void snp_setup_rmpopt(void)
 	if (!cpu_feature_enabled(X86_FEATURE_RMPOPT) || !rmpopt_configured)
 		return;
 
+	guard(mutex)(&rmpopt_wq_mutex);
+
+	/*
+	 * Guard against re-initialization.  When SNP_SHUTDOWN_EX is issued
+	 * with x86_snp_shutdown=0, snp_shutdown() is not called and
+	 * rmpopt_cleanup() is skipped, but snp_initialized is still cleared.
+	 * A subsequent __sev_snp_init_locked() would call snp_setup_rmpopt()
+	 * again, leaking the existing workqueue, delayed work, debugfs
+	 * entries, and cpumask state.
+	 */
+	if (rmpopt_wq)
+		return;
+
+	/*
+	 * Create an RMPOPT-specific workqueue to avoid scheduling
+	 * RMPOPT workitem on the global system workqueue.
+	 */
+	rmpopt_wq = alloc_workqueue("rmpopt_wq", WQ_UNBOUND, 1);
+	if (!rmpopt_wq) {
+		pr_err("Failed to allocate RMPOPT workqueue\n");
+		return;
+	}
+
 	cpus_read_lock();
 
 	/*
 	 * The RMPOPT_BASE MSR is per-core, so only one thread per core needs
-	 * to set up the RMPOPT_BASE MSR.
+	 * to set up the RMPOPT_BASE MSR. Likewise, only one thread per core
+	 * needs to issue the RMPOPT instruction.
 	 *
 	 * Note: only online primary threads are included.  If a core's
 	 * primary thread is offline, that core is not covered.  CPU hotplug
@@ -635,6 +820,23 @@ void snp_setup_rmpopt(void)
 		wrmsrq_on_cpu(cpu, MSR_AMD64_RMPOPT_BASE, rmpopt_base);
 
 	cpus_read_unlock();
+
+	INIT_DELAYED_WORK(&rmpopt_delayed_work, rmpopt_work_handler);
+
+	rmpopt_pa_end = ALIGN(PFN_PHYS(max_pfn), SZ_1G);
+
+	/* Limit memory scanning to 2TB of RAM */
+	if ((rmpopt_pa_end - rmpopt_pa_start) > SZ_2T) {
+		pr_info("RMPOPT coverage limited to 2TB; memory above 0x%llx not optimized\n",
+			rmpopt_pa_start + SZ_2T);
+		rmpopt_pa_end = rmpopt_pa_start + SZ_2T;
+	}
+
+	/*
+	 * Once all per-CPU RMPOPT tables have been configured, enable RMPOPT
+	 * optimizations on all physical memory.
+	 */
+	queue_delayed_work(rmpopt_wq, &rmpopt_delayed_work, 0);
 }
 EXPORT_SYMBOL_FOR_MODULES(snp_setup_rmpopt, "ccp");
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH v7 4/6] x86/sev: Add interface to re-enable RMP optimizations.
From: Ashish Kalra @ 2026-06-08 18:57 UTC (permalink / raw)
  To: tglx, mingo, bp, dave.hansen, x86, hpa, seanjc, peterz,
	thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
	Nathan.Fontenot, ackerleytng, jackyli, pgonda, rientjes, jacobhxu,
	xin, pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen,
	darwi, linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <cover.1780903370.git.ashish.kalra@amd.com>

From: Ashish Kalra <ashish.kalra@amd.com>

RMPOPT table is a per-CPU table which indicates if 1GB regions of
physical memory are entirely hypervisor-owned or not.

When performing host memory accesses in hypervisor mode as well as
non-SNP guest mode, the processor may consult the RMPOPT table to
potentially skip an RMP access and improve performance.

Events such as RMPUPDATE can clear RMP optimizations. Add an interface
to re-enable those optimizations.

Reviewed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
---
 arch/x86/include/asm/sev.h |  2 ++
 arch/x86/virt/svm/sev.c    | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 0d662221615a..a11306f25336 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -662,6 +662,7 @@ static inline void snp_leak_pages(u64 pfn, unsigned int pages)
 	__snp_leak_pages(pfn, pages, true);
 }
 int snp_prepare(void);
+void snp_rmpopt_all_physmem(void);
 void snp_setup_rmpopt(void);
 void snp_clear_rmpopt_configured(void);
 void snp_shutdown(void);
@@ -682,6 +683,7 @@ static inline void snp_leak_pages(u64 pfn, unsigned int npages) {}
 static inline void kdump_sev_callback(void) { }
 static inline void snp_fixup_e820_tables(void) {}
 static inline int snp_prepare(void) { return -ENODEV; }
+static inline void snp_rmpopt_all_physmem(void) {}
 static inline void snp_setup_rmpopt(void) {}
 static inline void snp_clear_rmpopt_configured(void) {}
 static inline void snp_shutdown(void) {}
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index b42788a66d40..db2d4c1f5dd7 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -760,6 +760,21 @@ static void rmpopt_work_handler(struct work_struct *work)
 	free_cpumask_var(follower_mask);
 }
 
+void snp_rmpopt_all_physmem(void)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_RMPOPT) || !rmpopt_configured)
+		return;
+
+	guard(mutex)(&rmpopt_wq_mutex);
+
+	if (!rmpopt_wq)
+		return;
+
+	queue_delayed_work(rmpopt_wq, &rmpopt_delayed_work,
+			   msecs_to_jiffies(RMPOPT_WORK_TIMEOUT));
+}
+EXPORT_SYMBOL_GPL(snp_rmpopt_all_physmem);
+
 void snp_setup_rmpopt(void)
 {
 	u64 rmpopt_base;
-- 
2.43.0


^ permalink raw reply related

* [PATCH v7 5/6] KVM: SEV: Perform RMP optimizations on SNP guest shutdown
From: Ashish Kalra @ 2026-06-08 18:57 UTC (permalink / raw)
  To: tglx, mingo, bp, dave.hansen, x86, hpa, seanjc, peterz,
	thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
	Nathan.Fontenot, ackerleytng, jackyli, pgonda, rientjes, jacobhxu,
	xin, pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen,
	darwi, linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <cover.1780903370.git.ashish.kalra@amd.com>

From: Ashish Kalra <ashish.kalra@amd.com>

Pages are converted from shared to private as SNP guests are launched.
This destroys exisiting RMPOPT optimizations in the regions where
pages are converted.

Conversely, guest pages are converted back to shared during SNP guest
termination and their region may become eligible for RMPOPT
optimization.

To take advantage of this, perform RMPOPT after guest termination.
Do it after a delay so that a single RMPOPT pass can be done if
multiple guests terminate in a short period of time.

Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
---
 arch/x86/kvm/svm/sev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index e107f368ed2d..29af6f6e603c 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -3005,6 +3005,8 @@ void sev_vm_destroy(struct kvm *kvm)
 		 */
 		if (snp_decommission_context(kvm))
 			return;
+
+		snp_rmpopt_all_physmem();
 	} else {
 		sev_unbind_asid(kvm, sev->handle);
 	}
-- 
2.43.0


^ permalink raw reply related

* [PATCH v7 6/6] x86/sev: Add debugfs support for RMPOPT
From: Ashish Kalra @ 2026-06-08 18:57 UTC (permalink / raw)
  To: tglx, mingo, bp, dave.hansen, x86, hpa, seanjc, peterz,
	thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
	Nathan.Fontenot, ackerleytng, jackyli, pgonda, rientjes, jacobhxu,
	xin, pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen,
	darwi, linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <cover.1780903370.git.ashish.kalra@amd.com>

From: Ashish Kalra <ashish.kalra@amd.com>

Add a debugfs interface to report per-CPU RMPOPT status across all
system RAM.

To dump the per-CPU RMPOPT status for all system RAM:

/sys/kernel/debug/rmpopt# cat rmpopt-table

Memory @  0GB: CPU(s): none
Memory @  1GB: CPU(s): none
Memory @  2GB: CPU(s): 0-1023
Memory @  3GB: CPU(s): 0-1023
Memory @  4GB: CPU(s): none
Memory @  5GB: CPU(s): 0-1023
Memory @  6GB: CPU(s): 0-1023
Memory @  7GB: CPU(s): 0-1023
...
Memory @1025GB: CPU(s): 0-1023
Memory @1026GB: CPU(s): 0-1023
Memory @1027GB: CPU(s): 0-1023
Memory @1028GB: CPU(s): 0-1023
Memory @1029GB: CPU(s): 0-1023
Memory @1030GB: CPU(s): 0-1023
Memory @1031GB: CPU(s): 0-1023
Memory @1032GB: CPU(s): 0-1023
Memory @1033GB: CPU(s): 0-1023
Memory @1034GB: CPU(s): 0-1023
Memory @1035GB: CPU(s): 0-1023
Memory @1036GB: CPU(s): 0-1023
Memory @1037GB: CPU(s): 0-1023
Memory @1038GB: CPU(s): none

Suggested-by: Thomas Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
---
 arch/x86/virt/svm/sev.c | 128 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)

diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index db2d4c1f5dd7..fe45a333df92 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -20,6 +20,8 @@
 #include <linux/amd-iommu.h>
 #include <linux/nospec.h>
 #include <linux/workqueue.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
 
 #include <asm/sev.h>
 #include <asm/processor.h>
@@ -145,6 +147,15 @@ static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
 
 static unsigned long snp_nr_leaked_pages;
 
+/* All users of rmpopt_report_cpumask must hold rmpopt_show_mutex. */
+static cpumask_t rmpopt_report_cpumask;
+static struct dentry *rmpopt_debugfs;
+static DEFINE_MUTEX(rmpopt_show_mutex);
+
+struct seq_paddr {
+	phys_addr_t next_seq_paddr;
+};
+
 #undef pr_fmt
 #define pr_fmt(fmt)	"SEV-SNP: " fmt
 
@@ -587,6 +598,8 @@ static void rmpopt_cleanup(void)
 
 	cancel_delayed_work_sync(&rmpopt_delayed_work);
 	destroy_workqueue(rmpopt_wq);
+	debugfs_remove_recursive(rmpopt_debugfs);
+	rmpopt_debugfs = NULL;
 
 	cpus_read_lock();
 
@@ -635,6 +648,10 @@ static inline bool __rmpopt(u64 pa_start, u64 op_type)
 		     : "a" (pa_start), "c" (op_type)
 		     : "memory", "cc");
 
+	if (op_type == RMPOPT_FUNC_REPORT_STATUS)
+		assign_cpu(smp_processor_id(), &rmpopt_report_cpumask,
+			   optimized);
+
 	return optimized;
 }
 
@@ -654,6 +671,115 @@ static void rmpopt_smp(void *val)
 	rmpopt((u64)val);
 }
 
+/*
+ * 'val' is a system physical address.
+ */
+static void rmpopt_report_status(void *val)
+{
+	u64 pa_start = ALIGN_DOWN((u64)val, SZ_1G);
+	u64 op_type = RMPOPT_FUNC_REPORT_STATUS;
+
+	__rmpopt(pa_start, op_type);
+}
+
+/*
+ * start() can be called multiple times if allocated buffer has overflowed
+ * and bigger buffer is allocated.
+ */
+static void *rmpopt_table_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	phys_addr_t end_paddr = rmpopt_pa_end;
+	struct seq_paddr *p = seq->private;
+
+	if (*pos == 0) {
+		p->next_seq_paddr = rmpopt_pa_start;
+		if (p->next_seq_paddr >= end_paddr)
+			return NULL;
+		return &p->next_seq_paddr;
+	}
+
+	if (p->next_seq_paddr >= end_paddr)
+		return NULL;
+
+	return &p->next_seq_paddr;
+}
+
+static void *rmpopt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	phys_addr_t end_paddr = rmpopt_pa_end;
+	phys_addr_t *curr_paddr = v;
+
+	(*pos)++;
+	*curr_paddr += SZ_1G;
+	if (*curr_paddr >= end_paddr)
+		return NULL;
+
+	return curr_paddr;
+}
+
+static void rmpopt_table_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int rmpopt_table_seq_show(struct seq_file *seq, void *v)
+{
+	phys_addr_t *curr_paddr = v;
+
+	guard(mutex)(&rmpopt_show_mutex);
+
+	seq_printf(seq, "Memory @%3lluGB: ",
+		   *curr_paddr >> (get_order(SZ_1G) + PAGE_SHIFT));
+
+	/*
+	 * Query all online CPUs rather than just rmpopt_cpumask (primary
+	 * threads only). The RMPOPT instruction only needs to run on one
+	 * thread per core for the optimization to take effect, but debugfs
+	 * reporting requires the RMPOPT status across all CPUs.
+	 * Performance is not a concern for this diagnostic interface.
+	 *
+	 * This is safe because RMPOPT_BASE MSR is per-core and
+	 * snp_prepare() ensures all CPUs are online when the MSR is
+	 * programmed during snp_setup_rmpopt().
+	 */
+	cpumask_clear(&rmpopt_report_cpumask);
+	on_each_cpu_mask(cpu_online_mask, rmpopt_report_status,
+			 (void *)*curr_paddr, true);
+
+	if (cpumask_empty(&rmpopt_report_cpumask))
+		seq_puts(seq, "CPU(s): none\n");
+	else
+		seq_printf(seq, "CPU(s): %*pbl\n", cpumask_pr_args(&rmpopt_report_cpumask));
+
+	return 0;
+}
+
+static const struct seq_operations rmpopt_table_seq_ops = {
+	.start = rmpopt_table_seq_start,
+	.next = rmpopt_table_seq_next,
+	.stop = rmpopt_table_seq_stop,
+	.show = rmpopt_table_seq_show
+};
+
+static int rmpopt_table_open(struct inode *inode, struct file *file)
+{
+	return seq_open_private(file, &rmpopt_table_seq_ops, sizeof(struct seq_paddr));
+}
+
+static const struct file_operations rmpopt_table_fops = {
+	.open = rmpopt_table_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release_private,
+};
+
+static void rmpopt_debugfs_setup(void)
+{
+	rmpopt_debugfs = debugfs_create_dir("rmpopt", arch_debugfs_dir);
+
+	debugfs_create_file("rmpopt-table", 0400, rmpopt_debugfs,
+			    NULL, &rmpopt_table_fops);
+}
+
 /*
  * RMPOPT optimizations skip RMP checks at 1GB granularity if this
  * range of memory does not contain any SNP guest memory.
@@ -852,6 +978,8 @@ void snp_setup_rmpopt(void)
 	 * optimizations on all physical memory.
 	 */
 	queue_delayed_work(rmpopt_wq, &rmpopt_delayed_work, 0);
+
+	rmpopt_debugfs_setup();
 }
 EXPORT_SYMBOL_FOR_MODULES(snp_setup_rmpopt, "ccp");
 
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH v5 5/5] iommufd/vdevice: add TSM request ioctl
From: Dan Williams (nvidia) @ 2026-06-08 20:58 UTC (permalink / raw)
  To: Aneesh Kumar K.V, Dan Williams (nvidia), Dan Williams (nvidia),
	Alexey Kardashevskiy, linux-coco, iommu, linux-kernel, kvm
  Cc: Bjorn Helgaas, Dan Williams, Jason Gunthorpe, Joerg Roedel,
	Jonathan Cameron, Kevin Tian, Nicolin Chen, Samuel Ortiz,
	Steven Price, Suzuki K Poulose, Will Deacon, Xu Yilun,
	Shameer Kolothum, Paolo Bonzini, Tony Krowiak, Halil Pasic,
	Jason Herne, Harald Freudenberger, Holger Dengler, Heiko Carstens,
	Vasily Gorbik, Alexander Gordeev, Christian Borntraeger,
	Sven Schnelle, Alex Williamson, Matthew Rosato, Farhan Ali,
	Eric Farman, linux-s390
In-Reply-To: <yq5aik81sf22.fsf@kernel.org>

Aneesh Kumar K.V wrote:
[..]
> > I think we can wait to move it to its own IOMMU operation unless/until
> > there is a need to set RUN outside of an explicit guest request, right?
> 
> Something like the below? (the diff against this series)
> 
> I have not yet integrated this into the full CCA patchset for testing,
> but I wanted to make sure we are aligned on the UAPI.
[..]
> -static bool iommufd_vdevice_tsm_req_scope_valid(u32 scope)
> +static bool iommufd_vdevice_tsm_req_arch_valid(u32 tvm_arch)
>  {
> -	if (scope > IOMMU_VDEVICE_TSM_REQ_SCOPE_PCI_LAST)
> +	switch (tvm_arch) {
> +	case IOMMU_VDEVICE_TSM_TVM_ARCH_CCA:
> +	case IOMMU_VDEVICE_TSM_TVM_ARCH_SEV:
> +	case IOMMU_VDEVICE_TSM_TVM_ARCH_TDX:

Makes sense for any command that needs tunneling. However, see below, what is
that set, and do we need a IOMMU_VDEVICE_TSM_COMMON when architecture
differentiation is not required?

> +		return true;
> +	default:
>  		return false;
> +	}
> +}
>  
> -	switch (scope) {
> -	case IOMMU_VDEVICE_TSM_REQ_PCI_INFO:
> -	case IOMMU_VDEVICE_TSM_REQ_PCI_STATE_CHANGE:
> -	case IOMMU_VDEVICE_TSM_REQ_PCI_DEBUG_READ:
> -	case IOMMU_VDEVICE_TSM_REQ_PCI_DEBUG_WRITE:
> +static bool iommufd_vdevice_tsm_req_op_valid(u32 op, u32 tvm_arch)
> +{
> +	switch (op) {
> +	case TSM_REQ_READ_OBJECT:
> +	case TSM_REQ_REGEN_OBJECT:
> +	case TSM_REQ_OBJECT_INFO:

The design goal of the netlink device-evidence interface is to be able
to respond to all shapes of requests for evidence. So netlink caches
objects that the hypercall handler can fill responses from.

It eliminates a class of commands that need tunneling.

> +	case TSM_REQ_VALIDATE_MMIO:
> +	case TSM_REQ_SET_TDI_STATE:

Are these potentially candidates for a IOMMU_VDEVICE_TSM_COMMON? The
handler knows how to do the arch-specific response from the common
iommufd result, or is there TSM-specific payload beyond @tsm_code for
these.

Make it the case that guest_req only needs non-common arch for
operations that are implementation unique, or where the response payload
exceeds what can be conveyed via @tsm_code.

>  		return true;
> +	case TSM_REQ_SEV_ENABLE_DMA:
> +	case TSM_REQ_SEV_DISABLE_DMA:
> +		return tvm_arch == IOMMU_VDEVICE_TSM_TVM_ARCH_SEV;

Right, this appears to be the only case where the command is
implementation unique. The handler can only ask iommufd to take
arch-specific action.

^ permalink raw reply

* Re: [PATCH v4 10/47] x86/tsc: Consolidate forcing of X86_FEATURE_TSC_KNOWN_FREQ for PV code
From: Sean Christopherson @ 2026-06-08 22:38 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Thomas Gleixner, Paolo Bonzini, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, K. Y. Srinivasan,
	Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
	Alexey Makhalov, Jan Kiszka, Andy Lutomirski, Peter Zijlstra,
	Juergen Gross, Daniel Lezcano, John Stultz, H. Peter Anvin,
	Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, Tom Lendacky, Nikunj A Dadhania,
	Michael Kelley
In-Reply-To: <eef867eae15e30d08482ba16a1a32159745b64a7.camel@infradead.org>

On Sat, Jun 06, 2026, David Woodhouse wrote:
> On Sat, 2026-06-06 at 12:34 +0200, Thomas Gleixner wrote:
> > On Fri, May 29 2026 at 07:43, Sean Christopherson wrote:
> > 
> > > Now that all paravirt code that explicitly specifies the TSC frequency
> > > also sets X86_FEATURE_TSC_KNOWN_FREQ, replace all of the one-off code
> > > and simply set X86_FEATURE_TSC_KNOWN_FREQ if the TSC frequency is known.
> > > 
> > > Do NOT force set TSC_KNOWN_FREQ if the "known" TSC frequency was provided
> > > by the user.  Per commit bd35c77e32e4 ("x86/tsc: Add tsc_early_khz command
> > > line parameter"), one of the goals of the param is to allow the refined
> > > calibration work "to do meaningful error checking".
> > > 
> > > Note, preferring the user-provided TSC frequency over the frequency from
> > > the hypervisor or trusted firmware, while simultaneously not treating the
> > > user-provided frequency as gospel, is obviously incongruous.  Sweep the
> > > problem under the rug for now to avoid opening a big can of worms that
> > > likely doesn't have a great answer.
> > 
> > There is a good answer I think.
> > 
> > early_tsc_khz exists to cater for the overclocking crowd. On their
> > modded systems the firmware supplied TSC frequency (CPUID/MSR) is not
> > matching reality anymore. So they work around that by supplying a close
> > enough tsc_early_khz and then they let the refined calibration work
> > figure it out.
> > 
> > Arguably that's only relevant for bare metal systems and what's worse is
> > that in virtual environments the refined calibration work can fail,
> > which renders the TSC unstable.
> > 
> > So I'd rather say we change this logic to:
> > 
> >    if (!hypervisor_is_type(X86_HYPER_NATIVE)) {
> >       tsc_khz = x86_init.....();
> >       force(X86_FEATURE_TSC_KNOWN_FREQ);
> >    } else if (tsc_khz_early) {
> >       ....
> >    } else {
> >       ...
> >    }
> > 
> > Along with:
> > 
> >    if (!hypervisor_is_type(X86_HYPER_NATIVE)) {
> >       if (tsc_khz_early)
> >          pr_warn("Ignoring non-sensical tsc_early_khz command line argument\n");
> > 
> > or something daft like that.

Ya, I ended up in the same place once Sashiko pointed out that skipping the SNP/TDX
setup was hazardous[*], and also once I realized that tsc_khz_early *complemented*
the refinement instead of replacing it.

This is what I have locally:

        if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
                known_tsc_khz = snp_secure_tsc_init();
        else if (boot_cpu_has(X86_FEATURE_TDX_GUEST))
                known_tsc_khz = tdx_tsc_init();

        /*
         * If the TSC frequency wasn't provided by trusted firmware, try to get
         * it from the hypervisor (which is untrusted when running as a CoCo guest).
         */
        if (!known_tsc_khz && x86_init.hyper.get_tsc_khz)
                known_tsc_khz = x86_init.hyper.get_tsc_khz();

        /*
         * Mark the TSC frequency as known if it was obtained from a hypervisor
         * or trusted firmware.  Don't mark the frequency as known if the user
         * specified the frequency, as the user-provided frequency is intended
         * as a "starting point", not a known, guaranteed frequency.
         */
        if (known_tsc_khz && !tsc_early_khz)
                setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);

        /*
         * Ignore the user-provided TSC frequency if the exact frequency was
         * obtained from trusted firmware or the hypervisor, as the user-
         * provided frequency is intended as a "starting point", not a known,
         * guaranteed frequency.
         */
        if (!known_tsc_khz)
                known_tsc_khz = tsc_early_khz;
        else if (tsc_early_khz)
                pr_err("Ignoring 'tsc_early_khz' in favor of firmware/hypervisor.\n");

[*] https://lore.kernel.org/all/ahnF-FehodVd474X@google.com

> > The kernel has for various reasons always tried to cater for the needs
> > of users who are plagued by bonkers firmware, but we have to stop to
> > prioritize or treating equal ancient and modded out of spec hardware.
> > 
> > TBH, I consider that whole KVM clock nonsense to fall into the modded
> > out of spec hardware realm. Do a reality check:
> > 
> >    How many production systems are out there still which run VMs on CPUs
> >    with a broken TSC and the lack of VM TSC scaling?
> > 
> > I'm not saying that we should not support the few remaining systems
> > anymore, but our tendency to pretend that we can keep all of this
> > nonsense working and at the same time making progress is just a fallacy.

FWIW, I have the exact same sentiments about kvmclock, but I'm also trying my
best not to break folks that are happily running on what is effectively flawed,
ancient "hardward". 

> I don't know that we can take the KVM (and Xen) clock away from guests,
> but all of the *horrid* part about it is the way it attempts to cope
> with the possibility that the *host* timekeeping might flip away from
> TSC-based mode at any point in time. By the end of my outstanding
> cleanup series, that is the *only* thing the gtod_notifier remains for.
> 
> If we can trust the hardware *and* the host kernel, then KVM could
> theoretically hardwire the kvmclock into 'master clock mode' where it
> basically just advertises the TSC→kvmclock relationship *once* to all
> CPUs and it never changes.
> 
> All the nonsense about updating it every time we enter a CPU could just
> go away completely.

But to Thomas' point, why bother?  For actual old hardware, kvmclock is what it
is.  For modern hardware, it's completely antiquated.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox