Linux Confidential Computing Development
 help / color / mirror / Atom feed
* [PATCH v8 08/46] KVM: Provide generic interface for checking memory private/shared status
From: Ackerley Tng via B4 Relay @ 2026-06-19  0:31 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Sean Christopherson,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
	Baoquan He, Jason Gunthorpe, Vlastimil Babka, Baoquan He
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260618-gmem-inplace-conversion-v8-0-9d2959357853@google.com>

From: Sean Christopherson <seanjc@google.com>

Introduce a generic kvm_mem_is_private() interface using a static call to
determine if a GFN is private. This allows the implementation for checking
a GFN's private/shared status to be set at runtime.

In preparation for choosing implementations between a guest_memfd lookup
and the existing VM attribute lookup, rename the existing
VM-attribute-based check to kvm_vm_mem_is_private to emphasize that it
looks up VM attributes.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/kvm_host.h | 12 +++++++++++-
 virt/kvm/kvm_main.c      | 15 +++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index eb26d4ea8945a..3915da2a61778 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2546,7 +2546,7 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
 bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
 					 struct kvm_gfn_range *range);
 
-static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
+static inline bool kvm_vm_mem_is_private(struct kvm *kvm, gfn_t gfn)
 {
 	return kvm_get_vm_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
 }
@@ -2557,6 +2557,16 @@ static inline bool kvm_mem_range_is_private(struct kvm *kvm, gfn_t start,
 						  KVM_MEMORY_ATTRIBUTE_PRIVATE,
 						  KVM_MEMORY_ATTRIBUTE_PRIVATE);
 }
+#endif  /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
+
+#ifdef kvm_arch_has_private_mem
+typedef bool (kvm_mem_is_private_t)(struct kvm *kvm, gfn_t gfn);
+DECLARE_STATIC_CALL(__kvm_mem_is_private, kvm_mem_is_private_t);
+
+static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
+{
+	return static_call(__kvm_mem_is_private)(kvm, gfn);
+}
 #else
 static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6669f1477013c..8b238e461b854 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2627,6 +2627,20 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
 }
 #endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
 
+#ifdef kvm_arch_has_private_mem
+DEFINE_STATIC_CALL_RET0(__kvm_mem_is_private, kvm_mem_is_private_t);
+EXPORT_STATIC_CALL_GPL(__kvm_mem_is_private);
+
+static void kvm_init_memory_attributes(void)
+{
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+	static_call_update(__kvm_mem_is_private, kvm_vm_mem_is_private);
+#endif
+}
+#else
+static void kvm_init_memory_attributes(void) { }
+#endif
+
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
 	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@ -6528,6 +6542,7 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 	kvm_preempt_ops.sched_in = kvm_sched_in;
 	kvm_preempt_ops.sched_out = kvm_sched_out;
 
+	kvm_init_memory_attributes();
 	kvm_init_debug();
 
 	r = kvm_vfio_ops_init();

-- 
2.55.0.rc0.738.g0c8ab3ebcc-goog



^ permalink raw reply related

* [PATCH v8 07/46] KVM: Rename memory attribute APIs to prepare for in-place gmem conversion
From: Ackerley Tng via B4 Relay @ 2026-06-19  0:31 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Sean Christopherson,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
	Baoquan He, Jason Gunthorpe, Vlastimil Babka, Baoquan He
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260618-gmem-inplace-conversion-v8-0-9d2959357853@google.com>

From: Sean Christopherson <seanjc@google.com>

Rename memory attribute APIs to add a "vm_" in the name in anticipation of
moving PRIVATE tracking into guest_memfd, to allow in-place conversion
between SHARED and PRIVATE.  At that point, there will effectively be two
(potential) sources of memory attributes: the VM and guest_memfd.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c   |  6 +++---
 include/linux/kvm_host.h | 15 +++++++++++----
 virt/kvm/guest_memfd.c   |  6 +++---
 virt/kvm/kvm_main.c      | 16 ++++++++--------
 4 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e0005a21b6e22..cbc50aef801fb 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -8087,11 +8087,11 @@ static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
 	const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
 
 	if (level == PG_LEVEL_2M)
-		return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs);
+		return kvm_range_has_vm_memory_attributes(kvm, start, end, ~0, attrs);
 
 	for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
 		if (hugepage_test_mixed(slot, gfn, level - 1) ||
-		    attrs != kvm_get_memory_attributes(kvm, gfn))
+		    attrs != kvm_get_vm_memory_attributes(kvm, gfn))
 			return false;
 	}
 	return true;
@@ -8191,7 +8191,7 @@ void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
 		 * be manually checked as the attributes may already be mixed.
 		 */
 		for (gfn = start; gfn < end; gfn += nr_pages) {
-			unsigned long attrs = kvm_get_memory_attributes(kvm, gfn);
+			unsigned long attrs = kvm_get_vm_memory_attributes(kvm, gfn);
 
 			if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
 				hugepage_clear_mixed(slot, gfn, level);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d370e834d619e..eb26d4ea8945a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2534,13 +2534,13 @@ static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
 }
 
 #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
-static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
+static inline unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn)
 {
 	return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
 }
 
-bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
-				     unsigned long mask, unsigned long attrs);
+bool kvm_range_has_vm_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+					unsigned long mask, unsigned long attrs);
 bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
 					struct kvm_gfn_range *range);
 bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
@@ -2548,7 +2548,14 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
 
 static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
 {
-	return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+	return kvm_get_vm_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+}
+static inline bool kvm_mem_range_is_private(struct kvm *kvm, gfn_t start,
+					    gfn_t end)
+{
+	return kvm_range_has_vm_memory_attributes(kvm, start, end,
+						  KVM_MEMORY_ATTRIBUTE_PRIVATE,
+						  KVM_MEMORY_ATTRIBUTE_PRIVATE);
 }
 #else
 static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index b4c24fdf159f6..8101f64e0366f 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -915,9 +915,9 @@ static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
 
 	folio_unlock(folio);
 
-	if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1,
-					     KVM_MEMORY_ATTRIBUTE_PRIVATE,
-					     KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
+	if (!kvm_range_has_vm_memory_attributes(kvm, gfn, gfn + 1,
+						KVM_MEMORY_ATTRIBUTE_PRIVATE,
+						KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
 		ret = -EINVAL;
 		goto out_put_folio;
 	}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7b989b659cf82..6669f1477013c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2419,7 +2419,7 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
 #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 
 #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
-static u64 kvm_supported_mem_attributes(struct kvm *kvm)
+static u64 kvm_supported_vm_mem_attributes(struct kvm *kvm)
 {
 #ifdef kvm_arch_has_private_mem
 	if (!kvm || kvm_arch_has_private_mem(kvm))
@@ -2433,19 +2433,19 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
  * Returns true if _all_ gfns in the range [@start, @end) have attributes
  * such that the bits in @mask match @attrs.
  */
-bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
-				     unsigned long mask, unsigned long attrs)
+bool kvm_range_has_vm_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+					unsigned long mask, unsigned long attrs)
 {
 	XA_STATE(xas, &kvm->mem_attr_array, start);
 	unsigned long index;
 	void *entry;
 
-	mask &= kvm_supported_mem_attributes(kvm);
+	mask &= kvm_supported_vm_mem_attributes(kvm);
 	if (attrs & ~mask)
 		return false;
 
 	if (end == start + 1)
-		return (kvm_get_memory_attributes(kvm, start) & mask) == attrs;
+		return (kvm_get_vm_memory_attributes(kvm, start) & mask) == attrs;
 
 	guard(rcu)();
 	if (!attrs)
@@ -2567,7 +2567,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
 	mutex_lock(&kvm->slots_lock);
 
 	/* Nothing to do if the entire range has the desired attributes. */
-	if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes))
+	if (kvm_range_has_vm_memory_attributes(kvm, start, end, ~0, attributes))
 		goto out_unlock;
 
 	/*
@@ -2606,7 +2606,7 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
 	/* flags is currently not used. */
 	if (attrs->flags)
 		return -EINVAL;
-	if (attrs->attributes & ~kvm_supported_mem_attributes(kvm))
+	if (attrs->attributes & ~kvm_supported_vm_mem_attributes(kvm))
 		return -EINVAL;
 	if (attrs->size == 0 || attrs->address + attrs->size < attrs->address)
 		return -EINVAL;
@@ -4926,7 +4926,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 		return 1;
 #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 	case KVM_CAP_MEMORY_ATTRIBUTES:
-		return kvm_supported_mem_attributes(kvm);
+		return kvm_supported_vm_mem_attributes(kvm);
 #endif
 #ifdef CONFIG_KVM_GUEST_MEMFD
 	case KVM_CAP_GUEST_MEMFD:

-- 
2.55.0.rc0.738.g0c8ab3ebcc-goog



^ permalink raw reply related

* [PATCH v8 06/46] KVM: Enumerate support for PRIVATE memory iff kvm_arch_has_private_mem is defined
From: Ackerley Tng via B4 Relay @ 2026-06-19  0:31 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Sean Christopherson,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
	Baoquan He, Jason Gunthorpe, Vlastimil Babka, Baoquan He
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260618-gmem-inplace-conversion-v8-0-9d2959357853@google.com>

From: Ackerley Tng <ackerleytng@google.com>

Explicitly guard reporting support for KVM_MEMORY_ATTRIBUTE_PRIVATE based
on kvm_arch_has_private_mem being #defined in anticipation of decoupling
kvm_supported_mem_attributes() from CONFIG_KVM_VM_MEMORY_ATTRIBUTES.
guest_memfd support for memory attributes will be unconditional to avoid
yet more macros (all architectures that support guest_memfd are expected to
use per-gmem attributes at some point), at which point enumerating support
KVM_MEMORY_ATTRIBUTE_PRIVATE based solely on memory attributes being
supported _somewhere_ would result in KVM over-reporting support on arm64.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 virt/kvm/kvm_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1ccc4895a4c26..7b989b659cf82 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2421,8 +2421,10 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
 #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 static u64 kvm_supported_mem_attributes(struct kvm *kvm)
 {
+#ifdef kvm_arch_has_private_mem
 	if (!kvm || kvm_arch_has_private_mem(kvm))
 		return KVM_MEMORY_ATTRIBUTE_PRIVATE;
+#endif
 
 	return 0;
 }

-- 
2.55.0.rc0.738.g0c8ab3ebcc-goog



^ permalink raw reply related

* [PATCH v8 05/46] KVM: Make CONFIG_KVM_VM_MEMORY_ATTRIBUTES selectable
From: Ackerley Tng via B4 Relay @ 2026-06-19  0:31 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Sean Christopherson,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
	Baoquan He, Jason Gunthorpe, Vlastimil Babka, Baoquan He
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260618-gmem-inplace-conversion-v8-0-9d2959357853@google.com>

From: Ackerley Tng <ackerleytng@google.com>

Make CONFIG_KVM_VM_MEMORY_ATTRIBUTES selectable, only for (CoCo) VM types
that might use vm_memory_attributes.

Also document CONFIG_KVM_VM_MEMORY_ATTRIBUTES to specifically be about the
private/shared attribute.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/Kconfig | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 24f96396cfa1c..c28393dc664eb 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -81,13 +81,16 @@ config KVM_WERROR
 	  If in doubt, say "N".
 
 config KVM_VM_MEMORY_ATTRIBUTES
-	bool
+	depends on KVM_SW_PROTECTED_VM || KVM_INTEL_TDX || KVM_AMD_SEV
+	bool "Enable per-VM PRIVATE vs. SHARED attributes (for CoCo VMs)"
+	help
+	  Enable support for tracking PRIVATE vs. SHARED memory using per-VM
+	  memory attributes.
 
 config KVM_SW_PROTECTED_VM
 	bool "Enable support for KVM software-protected VMs"
 	depends on EXPERT
 	depends on KVM_X86 && X86_64
-	select KVM_VM_MEMORY_ATTRIBUTES
 	help
 	  Enable support for KVM software-protected VMs.  Currently, software-
 	  protected VMs are purely a development and testing vehicle for
@@ -138,7 +141,6 @@ config KVM_INTEL_TDX
 	bool "Intel Trust Domain Extensions (TDX) support"
 	default y
 	depends on INTEL_TDX_HOST
-	select KVM_VM_MEMORY_ATTRIBUTES
 	select HAVE_KVM_ARCH_GMEM_POPULATE
 	help
 	  Provides support for launching Intel Trust Domain Extensions (TDX)
@@ -162,7 +164,6 @@ config KVM_AMD_SEV
 	depends on KVM_AMD && X86_64
 	depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
 	select ARCH_HAS_CC_PLATFORM
-	select KVM_VM_MEMORY_ATTRIBUTES
 	select HAVE_KVM_ARCH_GMEM_PREPARE
 	select HAVE_KVM_ARCH_GMEM_INVALIDATE
 	select HAVE_KVM_ARCH_GMEM_POPULATE

-- 
2.55.0.rc0.738.g0c8ab3ebcc-goog



^ permalink raw reply related

* [PATCH v8 04/46] KVM: Decouple kvm_has_arch_private_mem from CONFIG_KVM_VM_MEMORY_ATTRIBUTES
From: Ackerley Tng via B4 Relay @ 2026-06-19  0:31 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Sean Christopherson,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
	Baoquan He, Jason Gunthorpe, Vlastimil Babka, Baoquan He
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260618-gmem-inplace-conversion-v8-0-9d2959357853@google.com>

From: Sean Christopherson <seanjc@google.com>

When memory attributes become trackable in guest_memfd, the concept of
having private memory is no longer dependent on
CONFIG_KVM_VM_MEMORY_ATTRIBUTES.

With this, on x86, kvm_arch_has_private_mem() is defined if some CoCo
platform support (or the testing CONFIG_KVM_SW_PROTECTED_VM) is compiled
in.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 arch/x86/include/asm/kvm_host.h | 4 +++-
 include/linux/kvm_host.h        | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8e8eb8a5e8a6b..1bde67cf6eb0e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2394,7 +2394,9 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
 		       int tdp_max_root_level, int tdp_huge_page_level);
 
 
-#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+#if defined(CONFIG_KVM_SW_PROTECTED_VM) ||	\
+	defined(CONFIG_KVM_INTEL_TDX) ||	\
+	defined(CONFIG_KVM_AMD_SEV)
 #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
 #endif
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 201d0f2143976..d370e834d619e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -722,7 +722,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
 }
 #endif
 
-#ifndef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+#ifndef kvm_arch_has_private_mem
 static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
 {
 	return false;

-- 
2.55.0.rc0.738.g0c8ab3ebcc-goog



^ permalink raw reply related

* [PATCH v8 01/46] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
From: Ackerley Tng via B4 Relay @ 2026-06-19  0:31 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Sean Christopherson,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
	Baoquan He, Jason Gunthorpe, Vlastimil Babka, Baoquan He
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260618-gmem-inplace-conversion-v8-0-9d2959357853@google.com>

From: Sean Christopherson <seanjc@google.com>

Start plumbing in guest_memfd support for in-place private<=>shared
conversions by tracking attributes via a maple tree.  KVM currently tracks
private vs. shared attributes on a per-VM basis, which made sense when a
guest_memfd _only_ supported private memory, but tracking per-VM simply
can't work for in-place conversions as the shared/private status of a given
page needs to be per-gmem_inode, not per-VM.

Use the filemap invalidation lock to protect the maple tree, as taking the
lock for read when faulting in memory (for userspace or the guest) isn't
expected to result in meaningful contention, and using a separate lock
would add significant complexity (avoiding deadlock is quite difficult).

Co-developed-by: Vishal Annapurve <vannapurve@google.com>
Signed-off-by: Vishal Annapurve <vannapurve@google.com>
Co-developed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Fuad Tabba <tabba@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 virt/kvm/guest_memfd.c | 133 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 117 insertions(+), 16 deletions(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 86690683b2fe3..b4c24fdf159f6 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
 #include <linux/falloc.h>
 #include <linux/fs.h>
 #include <linux/kvm_host.h>
+#include <linux/maple_tree.h>
 #include <linux/mempolicy.h>
 #include <linux/pseudo_fs.h>
 #include <linux/pagemap.h>
@@ -33,6 +34,13 @@ struct gmem_inode {
 	struct list_head gmem_file_list;
 
 	u64 flags;
+	/*
+	 * Every index in this inode, whether memory is populated or
+	 * not, is tracked in attributes. The entire range of indices,
+	 * corresponding to the size of this inode, is represented in
+	 * this maple tree.
+	 */
+	struct maple_tree attributes;
 };
 
 static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
@@ -60,6 +68,24 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
 	return gfn - slot->base_gfn + slot->gmem.pgoff;
 }
 
+static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
+{
+	struct maple_tree *mt = &GMEM_I(inode)->attributes;
+	void *entry = mtree_load(mt, index);
+
+	return WARN_ON_ONCE(!entry) ? 0 : xa_to_value(entry);
+}
+
+static bool kvm_gmem_is_private_mem(struct inode *inode, pgoff_t index)
+{
+	return kvm_gmem_get_attributes(inode, index) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+}
+
+static bool kvm_gmem_is_shared_mem(struct inode *inode, pgoff_t index)
+{
+	return !kvm_gmem_is_private_mem(inode, index);
+}
+
 static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
 				    pgoff_t index, struct folio *folio)
 {
@@ -397,10 +423,13 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
 	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
-		return VM_FAULT_SIGBUS;
+	filemap_invalidate_lock_shared(inode->i_mapping);
+	if (kvm_gmem_is_shared_mem(inode, vmf->pgoff))
+		folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+	else
+		folio = ERR_PTR(-EACCES);
+	filemap_invalidate_unlock_shared(inode->i_mapping);
 
-	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
 	if (IS_ERR(folio)) {
 		if (PTR_ERR(folio) == -EAGAIN)
 			return VM_FAULT_RETRY;
@@ -557,6 +586,51 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
 	return true;
 }
 
+static int kvm_gmem_init_inode(struct inode *inode, loff_t size, u64 flags)
+{
+	struct gmem_inode *gi = GMEM_I(inode);
+	MA_STATE(mas, &gi->attributes, 0, (size >> PAGE_SHIFT) - 1);
+	u64 attrs;
+	int r;
+
+	inode->i_op = &kvm_gmem_iops;
+	inode->i_mapping->a_ops = &kvm_gmem_aops;
+	inode->i_mode |= S_IFREG;
+	inode->i_size = size;
+	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+
+	/*
+	 * guest_memfd memory is neither migratable nor swappable: set
+	 * inaccessible to gate off both.
+	 */
+	mapping_set_inaccessible(inode->i_mapping);
+	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+
+	gi->flags = flags;
+
+	mt_set_external_lock(&gi->attributes,
+			     &inode->i_mapping->invalidate_lock);
+
+	/*
+	 * Store default attributes for the entire gmem instance. Ensuring every
+	 * index is represented in the maple tree at all times simplifies the
+	 * conversion and merging logic.
+	 */
+	attrs = gi->flags & GUEST_MEMFD_FLAG_INIT_SHARED ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+	/*
+	 * Acquire the invalidation lock purely to make lockdep happy.  The
+	 * maple tree library expects all stores to be protected via the lock,
+	 * and the library can't know when the tree is reachable only by the
+	 * caller, as is the case here.
+	 */
+	filemap_invalidate_lock(inode->i_mapping);
+	r = mas_store_gfp(&mas, xa_mk_value(attrs), GFP_KERNEL);
+	filemap_invalidate_unlock(inode->i_mapping);
+
+	return r;
+}
+
 static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 {
 	static const char *name = "[kvm-gmem]";
@@ -587,16 +661,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 		goto err_fops;
 	}
 
-	inode->i_op = &kvm_gmem_iops;
-	inode->i_mapping->a_ops = &kvm_gmem_aops;
-	inode->i_mode |= S_IFREG;
-	inode->i_size = size;
-	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
-	mapping_set_inaccessible(inode->i_mapping);
-	/* Unmovable mappings are supposed to be marked unevictable as well. */
-	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
-
-	GMEM_I(inode)->flags = flags;
+	err = kvm_gmem_init_inode(inode, size, flags);
+	if (err)
+		goto err_inode;
 
 	file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
 	if (IS_ERR(file)) {
@@ -799,9 +866,13 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 	if (!file)
 		return -EFAULT;
 
+	filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
+
 	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
-	if (IS_ERR(folio))
-		return PTR_ERR(folio);
+	if (IS_ERR(folio)) {
+		r = PTR_ERR(folio);
+		goto out;
+	}
 
 	if (!folio_test_uptodate(folio)) {
 		clear_highpage(folio_page(folio, 0));
@@ -817,6 +888,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 	else
 		folio_put(folio);
 
+out:
+	filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
 	return r;
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
@@ -948,6 +1021,15 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
 
 	mpol_shared_policy_init(&gi->policy, NULL);
 
+	/*
+	 * Memory attributes are protected by the filemap invalidation lock, but
+	 * the lock structure isn't available at this time.  Immediately mark
+	 * maple tree as using external locking so that accessing the tree
+	 * before it's fully initialized results in NULL pointer dereferences
+	 * and not more subtle bugs.
+	 */
+	mt_init_flags(&gi->attributes, MT_FLAGS_LOCK_EXTERN | MT_FLAGS_USE_RCU);
+
 	gi->flags = 0;
 	INIT_LIST_HEAD(&gi->gmem_file_list);
 	return &gi->vfs_inode;
@@ -955,7 +1037,26 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
 
 static void kvm_gmem_destroy_inode(struct inode *inode)
 {
-	mpol_free_shared_policy(&GMEM_I(inode)->policy);
+	struct gmem_inode *gi = GMEM_I(inode);
+
+	mpol_free_shared_policy(&gi->policy);
+
+	/*
+	 * Note!  Checking for an empty tree is functionally necessary
+	 * to avoid explosions if the tree hasn't been fully
+	 * initialized, i.e. if the inode is being destroyed before
+	 * guest_memfd can set the external lock, lockdep would find
+	 * that the tree's internal ma_lock was not held.
+	 */
+	if (!mtree_empty(&gi->attributes)) {
+		/*
+		 * Acquire the invalidation lock purely to make lockdep happy,
+		 * the inode is unreachable at this point.
+		 */
+		filemap_invalidate_lock(inode->i_mapping);
+		__mt_destroy(&gi->attributes);
+		filemap_invalidate_unlock(inode->i_mapping);
+	}
 }
 
 static void kvm_gmem_free_inode(struct inode *inode)

-- 
2.55.0.rc0.738.g0c8ab3ebcc-goog



^ permalink raw reply related

* [PATCH v8 03/46] KVM: Move KVM_VM_MEMORY_ATTRIBUTES config definition to x86
From: Ackerley Tng via B4 Relay @ 2026-06-19  0:31 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Sean Christopherson,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
	Baoquan He, Jason Gunthorpe, Vlastimil Babka, Baoquan He
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260618-gmem-inplace-conversion-v8-0-9d2959357853@google.com>

From: Sean Christopherson <seanjc@google.com>

Bury KVM_VM_MEMORY_ATTRIBUTES in x86 to discourage other architectures
from adding support for per-VM memory attributes, because tracking private
vs. shared memory on a per-VM basis is now deprecated in favor of tracking
on a per-guest_memfd basis, and while RWX memory attributes are on the
horizon, they too are expected to be x86-only.

This will also allow modifying KVM_VM_MEMORY_ATTRIBUTES to be
user-selectable (in x86) without creating weirdness in KVM's Kconfigs.
Now that guest_memfd supports in-place conversions, it's entirely possible
to run x86 CoCo VMs without support for KVM_VM_MEMORY_ATTRIBUTES.

Leave the code itself in common KVM so that it's trivial to undo this
change if new per-VM attributes do come along.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 arch/x86/kvm/Kconfig | 3 +++
 virt/kvm/Kconfig     | 3 ---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 26f6afd51bbdc..24f96396cfa1c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -80,6 +80,9 @@ config KVM_WERROR
 
 	  If in doubt, say "N".
 
+config KVM_VM_MEMORY_ATTRIBUTES
+	bool
+
 config KVM_SW_PROTECTED_VM
 	bool "Enable support for KVM software-protected VMs"
 	depends on EXPERT
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 5119cb37145fc..297e4399fbd49 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -100,9 +100,6 @@ config KVM_ELIDE_TLB_FLUSH_IF_YOUNG
 config KVM_MMU_LOCKLESS_AGING
        bool
 
-config KVM_VM_MEMORY_ATTRIBUTES
-       bool
-
 config KVM_GUEST_MEMFD
        select XARRAY_MULTI
        bool

-- 
2.55.0.rc0.738.g0c8ab3ebcc-goog



^ permalink raw reply related

* [PATCH v8 02/46] KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES
From: Ackerley Tng via B4 Relay @ 2026-06-19  0:31 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Sean Christopherson,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
	Baoquan He, Jason Gunthorpe, Vlastimil Babka, Baoquan He
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260618-gmem-inplace-conversion-v8-0-9d2959357853@google.com>

From: Sean Christopherson <seanjc@google.com>

Rename the per-VM memory attributes Kconfig to make it explicitly about
per-VM attributes in anticipation of adding memory attributes support to
guest_memfd, at which point it will be possible (and desirable) to have
memory attributes without the per-VM support, even in x86.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/Kconfig            |  6 +++---
 arch/x86/kvm/mmu/mmu.c          |  2 +-
 arch/x86/kvm/x86.c              |  2 +-
 include/linux/kvm_host.h        |  8 ++++----
 include/trace/events/kvm.h      |  4 ++--
 virt/kvm/Kconfig                |  2 +-
 virt/kvm/kvm_main.c             | 14 +++++++-------
 8 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index eee473717c0e5..8e8eb8a5e8a6b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2394,7 +2394,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
 		       int tdp_max_root_level, int tdp_huge_page_level);
 
 
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
 #endif
 
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 801bf9e520db3..26f6afd51bbdc 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -84,7 +84,7 @@ config KVM_SW_PROTECTED_VM
 	bool "Enable support for KVM software-protected VMs"
 	depends on EXPERT
 	depends on KVM_X86 && X86_64
-	select KVM_GENERIC_MEMORY_ATTRIBUTES
+	select KVM_VM_MEMORY_ATTRIBUTES
 	help
 	  Enable support for KVM software-protected VMs.  Currently, software-
 	  protected VMs are purely a development and testing vehicle for
@@ -135,7 +135,7 @@ config KVM_INTEL_TDX
 	bool "Intel Trust Domain Extensions (TDX) support"
 	default y
 	depends on INTEL_TDX_HOST
-	select KVM_GENERIC_MEMORY_ATTRIBUTES
+	select KVM_VM_MEMORY_ATTRIBUTES
 	select HAVE_KVM_ARCH_GMEM_POPULATE
 	help
 	  Provides support for launching Intel Trust Domain Extensions (TDX)
@@ -159,7 +159,7 @@ config KVM_AMD_SEV
 	depends on KVM_AMD && X86_64
 	depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
 	select ARCH_HAS_CC_PLATFORM
-	select KVM_GENERIC_MEMORY_ATTRIBUTES
+	select KVM_VM_MEMORY_ATTRIBUTES
 	select HAVE_KVM_ARCH_GMEM_PREPARE
 	select HAVE_KVM_ARCH_GMEM_INVALIDATE
 	select HAVE_KVM_ARCH_GMEM_POPULATE
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 26ed97efda919..e0005a21b6e22 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7998,7 +7998,7 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
 		vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
 }
 
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
 				int level)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d9d51803b7b20..2fde594e86d72 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13569,7 +13569,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
 		}
 	}
 
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 	kvm_mmu_init_memslot_memory_attributes(kvm, slot);
 #endif
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ab8cfaec82d31..201d0f2143976 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -722,7 +722,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
 }
 #endif
 
-#ifndef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifndef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
 {
 	return false;
@@ -871,7 +871,7 @@ struct kvm {
 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 	struct notifier_block pm_notifier;
 #endif
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 	/* Protected by slots_lock (for writes) and RCU (for reads) */
 	struct xarray mem_attr_array;
 #endif
@@ -2533,7 +2533,7 @@ static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
 	return slot->flags & KVM_MEMSLOT_GMEM_ONLY;
 }
 
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
 {
 	return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
@@ -2555,7 +2555,7 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
 {
 	return false;
 }
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
 
 #ifdef CONFIG_KVM_GUEST_MEMFD
 int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index b282e3a867696..1ba72bd73ea2f 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -358,7 +358,7 @@ TRACE_EVENT(kvm_dirty_ring_exit,
 	TP_printk("vcpu %d", __entry->vcpu_id)
 );
 
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 /*
  * @start:	Starting address of guest memory range
  * @end:	End address of guest memory range
@@ -383,7 +383,7 @@ TRACE_EVENT(kvm_vm_set_mem_attributes,
 	TP_printk("%#016llx -- %#016llx [0x%lx]",
 		  __entry->start, __entry->end, __entry->attr)
 );
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
 
 TRACE_EVENT(kvm_unmap_hva_range,
 	TP_PROTO(unsigned long start, unsigned long end),
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 794976b88c6f9..5119cb37145fc 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -100,7 +100,7 @@ config KVM_ELIDE_TLB_FLUSH_IF_YOUNG
 config KVM_MMU_LOCKLESS_AGING
        bool
 
-config KVM_GENERIC_MEMORY_ATTRIBUTES
+config KVM_VM_MEMORY_ATTRIBUTES
        bool
 
 config KVM_GUEST_MEMFD
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e44c20c049610..1ccc4895a4c26 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1115,7 +1115,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 	spin_lock_init(&kvm->mn_invalidate_lock);
 	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
 	xa_init(&kvm->vcpu_array);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 	xa_init(&kvm->mem_attr_array);
 #endif
 
@@ -1300,7 +1300,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	cleanup_srcu_struct(&kvm->irq_srcu);
 	srcu_barrier(&kvm->srcu);
 	cleanup_srcu_struct(&kvm->srcu);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 	xa_destroy(&kvm->mem_attr_array);
 #endif
 	kvm_arch_free_vm(kvm);
@@ -2418,7 +2418,7 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
 }
 #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 static u64 kvm_supported_mem_attributes(struct kvm *kvm)
 {
 	if (!kvm || kvm_arch_has_private_mem(kvm))
@@ -2623,7 +2623,7 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
 
 	return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
 }
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
 
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
@@ -4922,7 +4922,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_SYSTEM_EVENT_DATA:
 	case KVM_CAP_DEVICE_CTRL:
 		return 1;
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 	case KVM_CAP_MEMORY_ATTRIBUTES:
 		return kvm_supported_mem_attributes(kvm);
 #endif
@@ -5326,7 +5326,7 @@ static long kvm_vm_ioctl(struct file *filp,
 		break;
 	}
 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 	case KVM_SET_MEMORY_ATTRIBUTES: {
 		struct kvm_memory_attributes attrs;
 
@@ -5337,7 +5337,7 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
 		break;
 	}
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
 	case KVM_CREATE_DEVICE: {
 		struct kvm_create_device cd;
 

-- 
2.55.0.rc0.738.g0c8ab3ebcc-goog



^ permalink raw reply related

* [PATCH v8 00/46] guest_memfd: In-place conversion support
From: Ackerley Tng via B4 Relay @ 2026-06-19  0:31 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Sean Christopherson,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
	Baoquan He, Jason Gunthorpe, Vlastimil Babka, Baoquan He
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng

This is v8 of guest_memfd in-place conversion support.

Up till now, guest_memfd supports the entire inode worth of memory being
used as all-shared, or all-private. CoCo VMs may request guest memory to be
converted between private and shared states, and the only way to support
that currently would be to have the userspace VMM provide two sources of
backing memory from completely different areas of physical memory.

pKVM has a use case for in-place sharing: the guest and host may be
cooperating on given data, and pKVM doesn't protect data through
encryption, so copying that given data between different areas of physical
memory as part of conversions would be unnecessary work.

This series also serves as a foundation for guest_memfd huge page
support. Now, guest_memfd only supports PAGE_SIZE pages, so if two sources
of backing memory are used, the userspace VMM could maintain a steady total
memory utilized by punching out the pages that are not used. When huge
pages are available in guest_memfd, even if the backing memory source
supports hole punching within a huge page, punching out pages to maintain
the total memory utilized by a VM would be introducing lots of
fragmentation.

In-place conversion avoids fragmentation by allowing the same physical
memory to be used for both shared and private memory, with guest_memfd
tracks the shared/private status of all the pages at a per-page
granularity.

The central principle, which guest_memfd continues to uphold, is that any
guest-private page will not be mappable to host userspace. All pages will
be mmap()-able in host userspace, but accesses to guest-private pages (as
tracked by guest_memfd) will result in a SIGBUS.

This series introduces a guest_memfd ioctl (not kvm, vm or vcpu, but
guest_memfd ioctl) that allows userspace to set memory
attributes (shared/private) directly through the guest_memfd. This is the
appropriate interface because shared/private-ness is a property of memory
and hence the request should be sent directly to the memory provider -
guest_memfd.

Tested with both CONFIG_KVM_VM_MEMORY_ATTRIBUTES enabled and disabled:

+ tools/testing/selftests/kvm/guest_memfd_test.c
+ tools/testing/selftests/kvm/pre_fault_memory_test.c
+ tools/testing/selftests/kvm/x86/guest_memfd_conversions_test.c
+ tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
+ tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c

Updates for this revision:

+ Updated the series to _not_ deprecate all of VM memory attributes, but
  only deprecate tracking of the PRIVATE attributes in VM memory
  attributes. This takes into account upcoming RWX attributes support,
  which will be tracked at the VM level.
+ Reshuffled the earlier commits that deal with preparing KVM to stop
  seeing VM memory attributes as the only source of attributes.
+ Addressed comments from v7

TODOs

+ Retest with TDX selftests. v7 was tested with TDX [12], but the setup there was
  wrong. Conversions were successful (no errors), but the shared memory being
  tested is actually in a completely different host physical page.
+ Retest with SNP selftests. v6 was tested with SNP, I ported that to v7
  and those ran fine too. Just need to double-check for v8.

This series is based on kvm-x86/next, and here's the tree for your convenience:

https://github.com/googleprodkernel/linux-cc/commits/guest_memfd-inplace-conversion-v8

Older series:

+ RFCv7 is at [11]
+ RFCv6 is at [10]
+ RFCv5 is at [8]
+ RFCv4 is at [7]
+ RFCv3 is at [6]
+ RFCv2 is at [5]
+ RFCv1 is at [4]
+ Previous versions of this feature, part of other series, are available at
  [1][2][3].

[1] https://lore.kernel.org/all/bd163de3118b626d1005aa88e71ef2fb72f0be0f.1726009989.git.ackerleytng@google.com/
[2] https://lore.kernel.org/all/20250117163001.2326672-6-tabba@google.com/
[3] https://lore.kernel.org/all/b784326e9ccae6a08388f1bf39db70a2204bdc51.1747264138.git.ackerleytng@google.com/
[4] https://lore.kernel.org/all/cover.1760731772.git.ackerleytng@google.com/T/
[5] https://lore.kernel.org/all/cover.1770071243.git.ackerleytng@google.com/T/
[6] https://lore.kernel.org/r/20260313-gmem-inplace-conversion-v3-0-5fc12a70ec89@google.com/T/
[7] https://lore.kernel.org/all/20260326-gmem-inplace-conversion-v4-0-e202fe950ffd@google.com/T/
[8] https://lore.kernel.org/r/20260428-gmem-inplace-conversion-v5-0-d8608ccfca22@google.com
[9] https://lore.kernel.org/all/20260414-selftest-global-metadata-v1-0-fd223922bc57@google.com/T/
[10] https://lore.kernel.org/r/20260507-gmem-inplace-conversion-v6-0-91ab5a8b19a4@google.com
[11] https://lore.kernel.org/r/20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com
[12] https://lore.kernel.org/all/20260605134153.204152-1-ackerleytng@google.com/

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
Ackerley Tng (27):
      KVM: Make CONFIG_KVM_VM_MEMORY_ATTRIBUTES selectable
      KVM: Enumerate support for PRIVATE memory iff kvm_arch_has_private_mem is defined
      KVM: guest_memfd: Introduce function to check GFN private/shared status
      KVM: guest_memfd: Only prepare folios for private pages
      KVM: guest_memfd: Add base support for KVM_SET_MEMORY_ATTRIBUTES2
      KVM: guest_memfd: Ensure pages are not in use before conversion
      KVM: guest_memfd: Call arch invalidate hooks on conversion
      KVM: guest_memfd: Return early if range already has requested attributes
      KVM: guest_memfd: Advertise KVM_SET_MEMORY_ATTRIBUTES2 ioctl
      KVM: guest_memfd: Handle lru_add fbatch refcounts during conversion safety check
      KVM: guest_memfd: Use actual size for invalidation in kvm_gmem_release()
      KVM: guest_memfd: Determine invalidation filter from memory attributes
      KVM: guest_memfd: Zero page while getting pfn
      KVM: TDX: Make source page optional for KVM_TDX_INIT_MEM_REGION
      KVM: guest_memfd: Make in-place conversion the default
      KVM: selftests: Test basic single-page conversion flow
      KVM: selftests: Test conversion flow when INIT_SHARED
      KVM: selftests: Test conversion precision in guest_memfd
      KVM: selftests: Test conversion before allocation
      KVM: selftests: Convert with allocated folios in different layouts
      KVM: selftests: Test that truncation does not change shared/private status
      KVM: selftests: Add helpers to pin pages with CONFIG_GUP_TEST
      KVM: selftests: Test conversion with elevated page refcount
      KVM: selftests: Reset shared memory after hole-punching
      KVM: selftests: Provide function to look up guest_memfd details from gpa
      KVM: selftests: Make TEST_EXPECT_SIGBUS thread-safe
      KVM: selftests: Update private_mem_conversions_test to mmap() guest_memfd

Michael Roth (1):
      KVM: SEV: Make 'uaddr' parameter optional for KVM_SEV_SNP_LAUNCH_UPDATE

Sean Christopherson (18):
      KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
      KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES
      KVM: Move KVM_VM_MEMORY_ATTRIBUTES config definition to x86
      KVM: Decouple kvm_has_arch_private_mem from CONFIG_KVM_VM_MEMORY_ATTRIBUTES
      KVM: Rename memory attribute APIs to prepare for in-place gmem conversion
      KVM: Provide generic interface for checking memory private/shared status
      KVM: guest_memfd: Wire up core private/shared attribute interfaces
      KVM: Consolidate private memory and guest_memfd ifdeffery in kvm_host.h
      KVM: guest_memfd: Enable INIT_SHARED on guest_memfd for x86 Coco VMs
      KVM: selftests: Create gmem fd before "regular" fd when adding memslot
      KVM: selftests: Rename guest_memfd{,_offset} to gmem_{fd,offset}
      KVM: selftests: Add support for mmap() on guest_memfd in core library
      KVM: selftests: Add selftests global for guest memory attributes capability
      KVM: selftests: Add helpers for calling ioctls on guest_memfd
      KVM: selftests: Test that shared/private status is consistent across processes
      KVM: selftests: Provide common function to set memory attributes
      KVM: selftests: Check fd/flags provided to mmap() when setting up memslot
      KVM: selftests: Update private memory exits test to work with per-gmem attributes

 Documentation/virt/kvm/api.rst                     |  78 +++-
 .../virt/kvm/x86/amd-memory-encryption.rst         |  13 +-
 Documentation/virt/kvm/x86/intel-tdx.rst           |   4 +
 arch/x86/include/asm/kvm_host.h                    |   4 +-
 arch/x86/kvm/Kconfig                               |  15 +-
 arch/x86/kvm/mmu/mmu.c                             |   8 +-
 arch/x86/kvm/svm/sev.c                             |  16 +-
 arch/x86/kvm/vmx/tdx.c                             |  11 +-
 arch/x86/kvm/x86.c                                 |  15 +-
 include/linux/kvm_host.h                           |  74 +--
 include/trace/events/kvm.h                         |   4 +-
 include/uapi/linux/kvm.h                           |  16 +
 mm/swap.c                                          |   2 +
 tools/testing/selftests/kvm/Makefile.kvm           |   1 +
 tools/testing/selftests/kvm/include/kvm_util.h     | 139 +++++-
 tools/testing/selftests/kvm/include/test_util.h    |  34 +-
 tools/testing/selftests/kvm/lib/kvm_util.c         | 164 ++++---
 tools/testing/selftests/kvm/lib/test_util.c        |   7 -
 .../kvm/x86/guest_memfd_conversions_test.c         | 509 +++++++++++++++++++++
 .../kvm/x86/private_mem_conversions_test.c         |  53 ++-
 .../selftests/kvm/x86/private_mem_kvm_exits_test.c |  36 +-
 virt/kvm/Kconfig                                   |   4 +-
 virt/kvm/guest_memfd.c                             | 474 +++++++++++++++++--
 virt/kvm/kvm_main.c                                |  86 +++-
 24 files changed, 1547 insertions(+), 220 deletions(-)
---
base-commit: b7fbe9a1bf9ee6c967ef77d366ca58c35fcf1887
change-id: 20260225-gmem-inplace-conversion-bd0dbd39753a

Best regards,
--
Ackerley Tng <ackerleytng@google.com>



^ permalink raw reply

* Re: [PATCH v7 10/42] KVM: guest_memfd: Ensure pages are not in use before conversion
From: Ackerley Tng @ 2026-06-19  0:17 UTC (permalink / raw)
  To: Vlastimil Babka (SUSE), aik, andrew.jones, binbin.wu, brauner,
	chao.p.peng, david, ira.weiny, jmattson, jthoughton, michael.roth,
	oupton, pankaj.gupta, qperret, rick.p.edgecombe, rientjes,
	shivankg, steven.price, tabba, willy, wyihan, yan.y.zhao,
	forkloop, pratyush, suzuki.poulose, aneesh.kumar, liam,
	Paolo Bonzini, Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <509f9a66-5ae9-4c05-bef1-ced89fd29bf0@kernel.org>

"Vlastimil Babka (SUSE)" <vbabka@kernel.org> writes:

> On 5/23/26 02:17, Ackerley Tng via B4 Relay wrote:
>> From: Ackerley Tng <ackerleytng@google.com>
>>
>> When converting memory to private in guest_memfd, it is necessary to ensure
>> that the pages are not currently being accessed by any other part of the
>> kernel or userspace to avoid any current user writing to guest private
>> memory.
>>
>> guest_memfd checks for unexpected refcounts to determine whether a page is
>> still in use. The only expected refcounts after unmapping the range
>> requested for conversion are those that are held by guest_memfd itself.
>
> Is it sufficient to only check, and not also freeze the refcount? (i.e.
> using folio_ref_freeze()), because without freezing, anything (e.g.
> compaction's pfn-based scanner) could do a speculative folio_try_get() and
> the checked refcount becomes stale.
>

I believe there's no issue here, since the main thing here is to check
for long-term pins on the folio. Perhaps David can help me verify. :)

> Might be ok if we know that no such speculative increment can result in
> actually touching the page contents, and the extra refcount and something
> inspecting the struct folio won't interfere with anything else. Then it
> could be just a comment mentioning why it's safe.
>

In this series guest_memfd doesn't change anything in folio metadata,
guest_memfd only updates the attributes tracked in the guest_memfd
inode, and updates the RMP table for SNP.

With the upcoming huge page support, guest_memfd needs to split/merge
the folio, which means updates to folio metadata. That will need a
closer look.

I haven't added the comment, mostly because it's a long weekend here and
I'd like to get Sashiko to run on it over the weekend. We should
definitely continue this discussion on v8!

> IIRC the compaction's scanning can result in a migration here so it's
> probably ok?
>

Migration isn't supported for guest_memfd yet, so I think that's ok.

>> Update the kvm_memory_attributes2 structure to include an error_offset
>> field. This allows KVM to report the exact offset where a conversion
>> failed to userspace. If the safety check fails, return -EAGAIN and copy
>> the error_offset back to userspace so that it can potentially retry the
>> operation or handle the failure gracefully.
>>
>> Suggested-by: David Hildenbrand <david@kernel.org>
>> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
>> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
>> Reviewed-by: Fuad Tabba <tabba@google.com>
>> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
>>
>> [...snip...]
>>

^ permalink raw reply

* Re: [PATCH v8 3/7] crypto/ccp: Disable CPU hotplug while SNP is active
From: Tom Lendacky @ 2026-06-18 21:49 UTC (permalink / raw)
  To: K Prateek Nayak, Ashish Kalra, tglx, mingo, bp, dave.hansen, x86,
	hpa, seanjc, peterz, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, Tycho.Andersen, Nathan.Fontenot,
	ackerleytng, jackyli, pgonda, rientjes, jacobhxu, xin,
	pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen, darwi,
	linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <763bff29-e737-4033-ab30-cec8fd3e7438@amd.com>

On 6/16/26 23:33, K Prateek Nayak wrote:
> Hello Ashish,
> 
> On 6/16/2026 1:19 AM, Ashish Kalra wrote:
>> From: Ashish Kalra <ashish.kalra@amd.com>
>>
>> The SEV firmware enumerates the CPUs at SNP initialization and is not
>> aware of the OS bringing CPUs online or offline afterwards, so OS CPU
>> hotplug can diverge from the firmware's expectations and break SNP.
>> Disable CPU hotplug while SNP is active.
> 
> Dumb question: Is this specific to RMPOPT? Otherwise ...
> 
>>
>> SNP is fully torn down only on the SNP_SHUTDOWN_EX x86_snp_shutdown
>> path; the legacy path leaves SNP enabled in hardware while clearing
>> snp_initialized, so __sev_snp_init_locked() can run again.  Track the
>> disable with a flag so it is balanced by a matching enable rather than
>> stacked, and re-enable hotplug only on the x86_snp_shutdown path, after
>> snp_shutdown() has cleared the per-core RMPOPT_BASE MSRs with hotplug
>> still disabled.
>>
>> This also keeps the CPU set stable for the asynchronous RMPOPT scan
>> added later in this series, and ensures cpus_read_lock() in the scan
>> is uncontended.
>>
>> Suggested-by: Thomas Lendacky <thomas.lendacky@amd.com>
>> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
>> ---
>>  drivers/crypto/ccp/sev-dev.c | 29 ++++++++++++++++++++++++++++-
>>  1 file changed, 28 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
>> index 217b6b19802e..c8c3c577463c 100644
>> --- a/drivers/crypto/ccp/sev-dev.c
>> +++ b/drivers/crypto/ccp/sev-dev.c
>> @@ -106,6 +106,9 @@ struct snp_hv_fixed_pages_entry {
>>  
>>  static LIST_HEAD(snp_hv_fixed_pages);
>>  
>> +/* Set while SNP has CPU hotplug disabled. */
>> +static bool snp_cpu_hotplug_disabled;
>> +
>>  /* Trusted Memory Region (TMR):
>>   *   The TMR is a 1MB area that must be 1MB aligned.  Use the page allocator
>>   *   to allocate the memory, which will return aligned memory for the specified
>> @@ -1479,6 +1482,17 @@ static int __sev_snp_init_locked(int *error, unsigned int max_snp_asid)
>>  
>>  	snp_hv_fixed_pages_state_update(sev, HV_FIXED);
>>  
>> +	/*
>> +	 * Disable CPU hotplug while SNP is active.  Guard against stacking
>> +	 * the disable count: the legacy SNP_SHUTDOWN_EX path clears
>> +	 * snp_initialized without re-enabling hotplug, so this can run
>> +	 * again while hotplug is already disabled.
>> +	 */
>> +	if (!snp_cpu_hotplug_disabled) {
>> +		cpu_hotplug_disable();
>> +		snp_cpu_hotplug_disabled = true;
>> +	}
>> +
> 
> ... should this be done before __sev_do_cmd_locked(SEV_CMD_SNP_INIT_EX)
> is issued?
> 
> I'm assuming that is when the firmware enumerates the CPUs during SNP
> initialization and any hotplug after that should be disallowed?

Any hotplug before would be bad, too. SEV firmware understands what CPUs
are physically available based on the installed processor and BIOS/UEFI
settings (e.g. disabling SMT from the BIOS), not what Linux has online
at the time of SNP_INIT_EX.

So maybe the commit message needs updating about that.

Thanks,
Tom

> 
>>  	snp_setup_rmpopt();
>>  
>>  	sev->snp_initialized = true;


^ permalink raw reply

* Re: [PATCH v8 7/7] x86/sev: Add debugfs support for RMPOPT
From: Dave Hansen @ 2026-06-18 21:42 UTC (permalink / raw)
  To: Kalra, Ashish, Borislav Petkov
  Cc: tglx, mingo, dave.hansen, x86, hpa, seanjc, peterz,
	thomas.lendacky, herbert, davem, ardb, pbonzini, aik,
	Michael.Roth, KPrateek.Nayak, Tycho.Andersen, Nathan.Fontenot,
	ackerleytng, jackyli, pgonda, rientjes, jacobhxu, xin,
	pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen, darwi,
	linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <5849645c-f701-4768-8cdf-1f9032e3226f@amd.com>

On 6/18/26 12:57, Kalra, Ashish wrote:
> Maybe i can add a line to this patch's commit message stating it's a
> debug-only interface with no stability guarantee.

I'd highly suggest reading the lwn article Boris referenced.

In the meantime, drop this patch from the series. Please. Let's revisit
debugging interfaces *after* this gets merged. That way, you can
concentrate on functionality and not debug interfaces that aren't
critically needed.

^ permalink raw reply

* Re: [PATCH v8 6/7] KVM: SEV: Perform RMP optimizations on SNP guest shutdown
From: Dave Hansen @ 2026-06-18 21:42 UTC (permalink / raw)
  To: Ashish Kalra, tglx, mingo, bp, dave.hansen, x86, hpa, seanjc,
	peterz, thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
	Nathan.Fontenot, ackerleytng, jackyli, pgonda, rientjes, jacobhxu,
	xin, pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen,
	darwi, linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <25c3693a59c8f00796e84f1ffa668df6e3b734b5.1781419998.git.ashish.kalra@amd.com>

On 6/15/26 12:50, Ashish Kalra wrote:
> From: Ashish Kalra <ashish.kalra@amd.com>
> 
> Pages are converted from shared to private as SNP guests are launched.
> This destroys exisiting RMPOPT optimizations in the regions where
> pages are converted.
> 
> Conversely, guest pages are converted back to shared during SNP guest
> termination and their region may become eligible for RMPOPT
> optimization.

Oh, actually that would be good text for the *previous* patch too. You
might want to move some of it there.

^ permalink raw reply

* Re: [PATCH v8 5/7] x86/sev: Add interface to re-enable RMP optimizations.
From: Dave Hansen @ 2026-06-18 21:41 UTC (permalink / raw)
  To: Ashish Kalra, tglx, mingo, bp, dave.hansen, x86, hpa, seanjc,
	peterz, thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
	Nathan.Fontenot, ackerleytng, jackyli, pgonda, rientjes, jacobhxu,
	xin, pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen,
	darwi, linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <cdb8098074de8e150dcf534ab806e38744325a57.1781419998.git.ashish.kalra@amd.com>

On 6/15/26 12:49, Ashish Kalra wrote:
> From: Ashish Kalra <ashish.kalra@amd.com>
> 
> RMPOPT table is a per-CPU table which indicates if 1GB regions of
> physical memory are entirely hypervisor-owned or not.
> 
> When performing host memory accesses in hypervisor mode as well as
> non-SNP guest mode, the processor may consult the RMPOPT table to
> potentially skip an RMP access and improve performance.
> 
> Events such as RMPUPDATE can clear RMP optimizations. Add an interface
> to re-enable those optimizations.

This doesn't really help me understand when or how this function might
be called.

	Normal guest evens like splitting and collapsing large pages can
	clear RMP optimizations. Without some intervention, all RMP
	optimizations would eventually be lost. Periodically re-optimize
	the system.

> The interface uses mod_delayed_work() instead of queue_delayed_work()
> so that the delay timer is reset on each call. This provides proper
> batching semantics: re-optimization runs 10 seconds after the *last*
> VM termination rather than after the first. mod_delayed_work() also
> re-queues work that is already in-flight, so a re-scan request
> during an active scan is not silently dropped.

This seems sane.

> +void snp_rmpopt_all_physmem(void)
> +{
> +	if (!cpu_feature_enabled(X86_FEATURE_RMPOPT) || !rmpopt_configured)
> +		return;
> +
> +	guard(mutex)(&rmpopt_wq_mutex);
> +
> +	if (!rmpopt_wq)
> +		return;
> +
> +	mod_delayed_work(rmpopt_wq, &rmpopt_delayed_work,
> +			 msecs_to_jiffies(RMPOPT_WORK_TIMEOUT));
> +}
> +EXPORT_SYMBOL_GPL(snp_rmpopt_all_physmem);

Does this need to be globally exported? Or can it be exported to a
single module namespace?

I'm close to being able to ack this, but it's still got a few too many
nits to ack.

^ permalink raw reply

* Re: [PATCH v8 3/7] crypto/ccp: Disable CPU hotplug while SNP is active
From: Dave Hansen @ 2026-06-18 21:35 UTC (permalink / raw)
  To: Ashish Kalra, tglx, mingo, bp, dave.hansen, x86, hpa, seanjc,
	peterz, thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
	Nathan.Fontenot, ackerleytng, jackyli, pgonda, rientjes, jacobhxu,
	xin, pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen,
	darwi, linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <1feccf6e2a56d949b30f403c0ca7949f580e5982.1781419998.git.ashish.kalra@amd.com>

On 6/15/26 12:49, Ashish Kalra wrote:
> +	/*
> +	 * Disable CPU hotplug while SNP is active.  Guard against stacking
> +	 * the disable count: the legacy SNP_SHUTDOWN_EX path clears
> +	 * snp_initialized without re-enabling hotplug, so this can run
> +	 * again while hotplug is already disabled.
> +	 */
> +	if (!snp_cpu_hotplug_disabled) {
> +		cpu_hotplug_disable();
> +		snp_cpu_hotplug_disabled = true;
> +	}

This seems like a hack, guys.

cpu_hotplug_disable() seems like more of a temporary lock than enforcing
basically permanent system state.

This seems like it would be better implemented by registering a CPU
hotplug callback and then refusing to offline if sev->snp_initialized is
set.

snp_setup_rmpopt() can be run any time, right? It doesn't need to be
after sev->snp_initialized=1.

^ permalink raw reply

* Re: [PATCH v8 2/7] x86/sev: Initialize RMPOPT configuration MSRs
From: Tom Lendacky @ 2026-06-18 21:08 UTC (permalink / raw)
  To: Ashish Kalra, tglx, mingo, bp, dave.hansen, x86, hpa, seanjc,
	peterz, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
	Nathan.Fontenot, ackerleytng, jackyli, pgonda, rientjes, jacobhxu,
	xin, pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen,
	darwi, linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <6a4d0ec9e037d91c0fdba9c525942ca288e1b1b2.1781419998.git.ashish.kalra@amd.com>

On 6/15/26 14:48, Ashish Kalra wrote:
> From: Ashish Kalra <ashish.kalra@amd.com>
> 
> The new RMPOPT instruction helps manage per-CPU RMP optimization
> structures inside the CPU. It takes a 1GB-aligned physical address
> and either returns the status of the optimizations or tries to enable
> the optimizations.
> 
> Per-CPU RMPOPT tables support at most 2 TB of addressable memory for
> RMP optimizations.
> 
> Initialize the per-CPU RMPOPT table base to the starting physical
> address. This enables RMP optimization for up to 2 TB of system RAM on
> all CPUs.
> 
> Additionally, add support to setup and enable RMPOPT once SNP is
> enabled and initialized.
> 
> Suggested-by: Thomas Lendacky <thomas.lendacky@amd.com>
> Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
> Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
> ---
>  arch/x86/coco/core.c             |  2 +
>  arch/x86/include/asm/msr-index.h |  3 ++
>  arch/x86/include/asm/sev.h       |  4 ++
>  arch/x86/virt/svm/sev.c          | 70 ++++++++++++++++++++++++++++++++
>  drivers/crypto/ccp/sev-dev.c     |  3 ++
>  5 files changed, 82 insertions(+)
> 
> diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
> index 989ca9f72ba3..8c1393ddc5df 100644
> --- a/arch/x86/coco/core.c
> +++ b/arch/x86/coco/core.c
> @@ -16,6 +16,7 @@
>  #include <asm/archrandom.h>
>  #include <asm/coco.h>
>  #include <asm/processor.h>
> +#include <asm/sev.h>
>  
>  enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE;
>  SYM_PIC_ALIAS(cc_vendor);
> @@ -172,6 +173,7 @@ static void amd_cc_platform_clear(enum cc_attr attr)
>  	switch (attr) {
>  	case CC_ATTR_HOST_SEV_SNP:
>  		cc_flags.host_sev_snp = 0;
> +		snp_clear_rmpopt_configured();
>  		break;
>  	default:
>  		break;
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index 86554de9a3f5..28540744f1eb 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -761,6 +761,9 @@
>  #define MSR_AMD64_SEG_RMP_ENABLED_BIT	0
>  #define MSR_AMD64_SEG_RMP_ENABLED	BIT_ULL(MSR_AMD64_SEG_RMP_ENABLED_BIT)
>  #define MSR_AMD64_RMP_SEGMENT_SHIFT(x)	(((x) & GENMASK_ULL(13, 8)) >> 8)
> +#define MSR_AMD64_RMPOPT_BASE		0xc0010139
> +#define MSR_AMD64_RMPOPT_ENABLE_BIT	0
> +#define MSR_AMD64_RMPOPT_ENABLE		BIT_ULL(MSR_AMD64_RMPOPT_ENABLE_BIT)
>  
>  #define MSR_SVSM_CAA			0xc001f000
>  
> diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
> index 594cfa19cbd4..0d662221615a 100644
> --- a/arch/x86/include/asm/sev.h
> +++ b/arch/x86/include/asm/sev.h
> @@ -662,6 +662,8 @@ static inline void snp_leak_pages(u64 pfn, unsigned int pages)
>  	__snp_leak_pages(pfn, pages, true);
>  }
>  int snp_prepare(void);
> +void snp_setup_rmpopt(void);
> +void snp_clear_rmpopt_configured(void);
>  void snp_shutdown(void);
>  #else
>  static inline bool snp_probe_rmptable_info(void) { return false; }
> @@ -680,6 +682,8 @@ static inline void snp_leak_pages(u64 pfn, unsigned int npages) {}
>  static inline void kdump_sev_callback(void) { }
>  static inline void snp_fixup_e820_tables(void) {}
>  static inline int snp_prepare(void) { return -ENODEV; }
> +static inline void snp_setup_rmpopt(void) {}
> +static inline void snp_clear_rmpopt_configured(void) {}
>  static inline void snp_shutdown(void) {}
>  #endif
>  
> diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
> index 8bcdce98f6dc..1b5c18408f0b 100644
> --- a/arch/x86/virt/svm/sev.c
> +++ b/arch/x86/virt/svm/sev.c
> @@ -124,6 +124,10 @@ static void *rmp_bookkeeping __ro_after_init;
>  
>  static u64 probed_rmp_base, probed_rmp_size;
>  
> +static cpumask_t rmpopt_cpumask;
> +static phys_addr_t rmpopt_pa_start;
> +static bool rmpopt_configured;

The usage of this isn't doesn't imply what the name says. How about
changing it to rmpopt_capable ?

> +
>  static LIST_HEAD(snp_leaked_pages_list);
>  static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
>  
> @@ -490,7 +494,12 @@ static bool __init setup_rmptable(void)
>  	if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) {
>  		if (!setup_segmented_rmptable())
>  			return false;
> +		rmpopt_configured = true;
>  	} else {
> +		/*
> +		 * RMPOPT requires a segmented RMP table, so leave
> +		 * rmpopt_configured clear on contiguous RMP systems.
> +		 */

I think this comment should be above where rmpopt_configured is set,
slightly changed to

	RMPOPT requires a segmented RMP, so indicate that the system
	is capable of configuring and running RMPOPT.

Thanks,
Tom
>  		if (!setup_contiguous_rmptable())
>  			return false;
>  	}
> @@ -555,6 +564,21 @@ int snp_prepare(void)
>  }
>  EXPORT_SYMBOL_FOR_MODULES(snp_prepare, "ccp");
>  
> +static void rmpopt_cleanup(void)
> +{
> +	int cpu;
> +
> +	cpus_read_lock();
> +
> +	for_each_cpu(cpu, &rmpopt_cpumask)
> +		WARN_ON_ONCE(wrmsrq_on_cpu(cpu, MSR_AMD64_RMPOPT_BASE, 0));
> +
> +	cpus_read_unlock();
> +
> +	cpumask_clear(&rmpopt_cpumask);
> +	rmpopt_pa_start = 0;
> +}
> +
>  void snp_shutdown(void)
>  {
>  	u64 syscfg;
> @@ -563,11 +587,57 @@ void snp_shutdown(void)
>  	if (syscfg & MSR_AMD64_SYSCFG_SNP_EN)
>  		return;
>  
> +	rmpopt_cleanup();
> +
>  	clear_rmp();
>  	on_each_cpu(mfd_reconfigure, NULL, 1);
>  }
>  EXPORT_SYMBOL_FOR_MODULES(snp_shutdown, "ccp");
>  
> +void snp_clear_rmpopt_configured(void)
> +{
> +	rmpopt_configured = false;
> +}
> +
> +void snp_setup_rmpopt(void)
> +{
> +	u64 rmpopt_base;
> +	int cpu;
> +
> +	if (!cpu_feature_enabled(X86_FEATURE_RMPOPT) || !rmpopt_configured)
> +		return;
> +
> +	cpus_read_lock();
> +
> +	/*
> +	 * The RMPOPT_BASE MSR is per-core, so only one thread per core needs
> +	 * to set up the RMPOPT_BASE MSR.
> +	 *
> +	 * Note: only online primary threads are included.  If a core's
> +	 * primary thread is offline, that core is not covered.  CPU hotplug
> +	 * is not currently supported with SNP enabled.
> +	 */
> +
> +	for_each_online_cpu(cpu)
> +		if (topology_is_primary_thread(cpu))
> +			cpumask_set_cpu(cpu, &rmpopt_cpumask);
> +
> +	rmpopt_pa_start = ALIGN_DOWN(PFN_PHYS(min_low_pfn), SZ_1G);
> +	rmpopt_base = rmpopt_pa_start | MSR_AMD64_RMPOPT_ENABLE;
> +
> +	/*
> +	 * Per-CPU RMPOPT tables support at most 2 TB of addressable memory
> +	 * for RMP optimizations. Initialize the per-CPU RMPOPT table base
> +	 * to the starting physical address to enable RMP optimizations for
> +	 * up to 2 TB of system RAM on all CPUs.
> +	 */
> +	for_each_cpu(cpu, &rmpopt_cpumask)
> +		WARN_ON_ONCE(wrmsrq_on_cpu(cpu, MSR_AMD64_RMPOPT_BASE, rmpopt_base));
> +
> +	cpus_read_unlock();
> +}
> +EXPORT_SYMBOL_FOR_MODULES(snp_setup_rmpopt, "ccp");
> +
>  /*
>   * Do the necessary preparations which are verified by the firmware as
>   * described in the SNP_INIT_EX firmware command description in the SNP
> diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
> index 78f98aee7a66..217b6b19802e 100644
> --- a/drivers/crypto/ccp/sev-dev.c
> +++ b/drivers/crypto/ccp/sev-dev.c
> @@ -1478,6 +1478,9 @@ static int __sev_snp_init_locked(int *error, unsigned int max_snp_asid)
>  	}
>  
>  	snp_hv_fixed_pages_state_update(sev, HV_FIXED);
> +
> +	snp_setup_rmpopt();
> +
>  	sev->snp_initialized = true;
>  	dev_dbg(sev->dev, "SEV-SNP firmware initialized, SEV-TIO is %s\n",
>  		data.tio_en ? "enabled" : "disabled");


^ permalink raw reply

* Re: [PATCH v8 7/7] x86/sev: Add debugfs support for RMPOPT
From: Borislav Petkov @ 2026-06-18 20:10 UTC (permalink / raw)
  To: Kalra, Ashish
  Cc: tglx, mingo, dave.hansen, x86, hpa, seanjc, peterz,
	thomas.lendacky, herbert, davem, ardb, pbonzini, aik,
	Michael.Roth, KPrateek.Nayak, Tycho.Andersen, Nathan.Fontenot,
	ackerleytng, jackyli, pgonda, rientjes, jacobhxu, xin,
	pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen, darwi,
	linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <5849645c-f701-4768-8cdf-1f9032e3226f@amd.com>

On Thu, Jun 18, 2026 at 02:57:45PM -0500, Kalra, Ashish wrote:
> Maybe i can add a line to this patch's commit message stating it's a debug-only interface
> with no stability guarantee.

Sounds to me like you didn't really read that article.

> We have to provide some method/interface for users to verify if RMP optimizations
> are enabled for a GB range of memory.

Sounds to me like this wants to be a facility which is present in the kernel
and it is going to be an ABI.

I am unclear on the real use case but I'm open to being persuaded otherwise.

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply

* Re: [PATCH v8 7/7] x86/sev: Add debugfs support for RMPOPT
From: Kalra, Ashish @ 2026-06-18 19:57 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: tglx, mingo, dave.hansen, x86, hpa, seanjc, peterz,
	thomas.lendacky, herbert, davem, ardb, pbonzini, aik,
	Michael.Roth, KPrateek.Nayak, Tycho.Andersen, Nathan.Fontenot,
	ackerleytng, jackyli, pgonda, rientjes, jacobhxu, xin,
	pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen, darwi,
	linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <20260618180814.GCajQ0Dv0CoRMJxbP0@fat_crate.local>


On 6/18/2026 1:08 PM, Borislav Petkov wrote:
> On Mon, Jun 15, 2026 at 07:50:56PM +0000, Ashish Kalra wrote:
>> From: Ashish Kalra <ashish.kalra@amd.com>
>>
>> Add a debugfs interface to report per-CPU RMPOPT status across all
>> system RAM.
>>
>> To dump the per-CPU RMPOPT status for all system RAM:
>>
>> /sys/kernel/debug/rmpopt# cat rmpopt-table
>>
>> Memory @  0GB: CPU(s): none
>> Memory @  1GB: CPU(s): none
>> Memory @  2GB: CPU(s): 0-1023
>> Memory @  3GB: CPU(s): 0-1023
>> Memory @  4GB: CPU(s): none
>> Memory @  5GB: CPU(s): 0-1023
>> Memory @  6GB: CPU(s): 0-1023
>> Memory @  7GB: CPU(s): 0-1023
>> ...
>> Memory @1025GB: CPU(s): 0-1023
>> Memory @1026GB: CPU(s): 0-1023
>> Memory @1027GB: CPU(s): 0-1023
>> Memory @1028GB: CPU(s): 0-1023
>> Memory @1029GB: CPU(s): 0-1023
>> Memory @1030GB: CPU(s): 0-1023
>> Memory @1031GB: CPU(s): 0-1023
>> Memory @1032GB: CPU(s): 0-1023
>> Memory @1033GB: CPU(s): 0-1023
>> Memory @1034GB: CPU(s): 0-1023
>> Memory @1035GB: CPU(s): 0-1023
>> Memory @1036GB: CPU(s): 0-1023
>> Memory @1037GB: CPU(s): 0-1023
>> Memory @1038GB: CPU(s): none
>>
>> Suggested-by: Thomas Lendacky <thomas.lendacky@amd.com>
>> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
>> ---
>>  arch/x86/virt/svm/sev.c | 128 ++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 128 insertions(+)
> 
> https://lwn.net/Articles/309298/
> 

Since the RMPOPT file is a diagnostic (verify the optimization took effect), debugfs is
arguably the right home for it and we are not claiming it to be an API (there is no
Documentation/ABI entry for it) and we are not presenting it as something tools should
depend on, it is a self-contained diagnostic/debug interface.

Maybe i can add a line to this patch's commit message stating it's a debug-only interface
with no stability guarantee.

We have to provide some method/interface for users to verify if RMP optimizations
are enabled for a GB range of memory.

Thanks,
Ashish

^ permalink raw reply

* Re: [PATCH v8 2/7] x86/sev: Initialize RMPOPT configuration MSRs
From: Kalra, Ashish @ 2026-06-18 18:23 UTC (permalink / raw)
  To: K Prateek Nayak, tglx, mingo, bp, dave.hansen, x86, hpa, seanjc,
	peterz, thomas.lendacky, herbert, davem, ardb
  Cc: pbonzini, aik, Michael.Roth, Tycho.Andersen, Nathan.Fontenot,
	ackerleytng, jackyli, pgonda, rientjes, jacobhxu, xin,
	pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen, darwi,
	linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <fb2f1105-3bef-4197-bccd-865c013ce712@amd.com>

Hello Prateek,

On 6/16/2026 1:03 AM, K Prateek Nayak wrote:
> Hello Ashish,
> 
> On 6/16/2026 1:18 AM, Ashish Kalra wrote:
>> diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
>> index 8bcdce98f6dc..1b5c18408f0b 100644
>> --- a/arch/x86/virt/svm/sev.c
>> +++ b/arch/x86/virt/svm/sev.c
>> @@ -124,6 +124,10 @@ static void *rmp_bookkeeping __ro_after_init;
>>  
>>  static u64 probed_rmp_base, probed_rmp_size;
>>  
>> +static cpumask_t rmpopt_cpumask;
> 
> nit.
> 
> I believe you can use cpumask_var_t here and do a zalloc_cpumask_var()
> during snp_setup_rmpopt(). That way !X86_FEATURE_RMPOPT configs don't
> have to needlessly waste space to keep a redundant cpumask around.
> 
> Same comment for rmpopt_report_cpumask in Patch 7 which can be
> allocated dynamically during rmpopt_debugfs_setup().
> 

Yes.

>> +static phys_addr_t rmpopt_pa_start;
>> +static bool rmpopt_configured;
>> +
>>  static LIST_HEAD(snp_leaked_pages_list);
>>  static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
>>  
>> @@ -490,7 +494,12 @@ static bool __init setup_rmptable(void)
>>  	if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) {
>>  		if (!setup_segmented_rmptable())
>>  			return false;
>> +		rmpopt_configured = true;
>>  	} else {
>> +		/*
>> +		 * RMPOPT requires a segmented RMP table, so leave
>> +		 * rmpopt_configured clear on contiguous RMP systems.
>> +		 */
>>  		if (!setup_contiguous_rmptable())
>>  			return false;
>>  	}
>> @@ -555,6 +564,21 @@ int snp_prepare(void)
>>  }
>>  EXPORT_SYMBOL_FOR_MODULES(snp_prepare, "ccp");
>>  
>> +static void rmpopt_cleanup(void)
>> +{
>> +	int cpu;
>> +
>> +	cpus_read_lock();
> 
> nit.
> 
> You can use guard(cpus_read_lock)() unless there is a complicated
> locking pattern where you need to drop and re-acquire the read lock.

But if i use guard(cpus_read_lock)(), cpus_read_lock stays held across as it is
function-scope, so it will be still held for code following the wrmsrq_on_cpu(),
which is harmless but still changes code behavior.

Probably, the other option is to use scoped_guard form ? 

Thanks,
Ashish

> 
>> +
>> +	for_each_cpu(cpu, &rmpopt_cpumask)
>> +		WARN_ON_ONCE(wrmsrq_on_cpu(cpu, MSR_AMD64_RMPOPT_BASE, 0));
>> +
>> +	cpus_read_unlock();
>> +
>> +	cpumask_clear(&rmpopt_cpumask);
>> +	rmpopt_pa_start = 0;
>> +}
>> +
>>  void snp_shutdown(void)
>>  {
>>  	u64 syscfg;
> 

^ permalink raw reply

* Re: [PATCH v8 7/7] x86/sev: Add debugfs support for RMPOPT
From: Borislav Petkov @ 2026-06-18 18:08 UTC (permalink / raw)
  To: Ashish Kalra
  Cc: tglx, mingo, dave.hansen, x86, hpa, seanjc, peterz,
	thomas.lendacky, herbert, davem, ardb, pbonzini, aik,
	Michael.Roth, KPrateek.Nayak, Tycho.Andersen, Nathan.Fontenot,
	ackerleytng, jackyli, pgonda, rientjes, jacobhxu, xin,
	pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen, darwi,
	linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <cc9aa9b6cfa2ce826f2ad53f8a13d3b7bf0790b6.1781419998.git.ashish.kalra@amd.com>

On Mon, Jun 15, 2026 at 07:50:56PM +0000, Ashish Kalra wrote:
> From: Ashish Kalra <ashish.kalra@amd.com>
> 
> Add a debugfs interface to report per-CPU RMPOPT status across all
> system RAM.
> 
> To dump the per-CPU RMPOPT status for all system RAM:
> 
> /sys/kernel/debug/rmpopt# cat rmpopt-table
> 
> Memory @  0GB: CPU(s): none
> Memory @  1GB: CPU(s): none
> Memory @  2GB: CPU(s): 0-1023
> Memory @  3GB: CPU(s): 0-1023
> Memory @  4GB: CPU(s): none
> Memory @  5GB: CPU(s): 0-1023
> Memory @  6GB: CPU(s): 0-1023
> Memory @  7GB: CPU(s): 0-1023
> ...
> Memory @1025GB: CPU(s): 0-1023
> Memory @1026GB: CPU(s): 0-1023
> Memory @1027GB: CPU(s): 0-1023
> Memory @1028GB: CPU(s): 0-1023
> Memory @1029GB: CPU(s): 0-1023
> Memory @1030GB: CPU(s): 0-1023
> Memory @1031GB: CPU(s): 0-1023
> Memory @1032GB: CPU(s): 0-1023
> Memory @1033GB: CPU(s): 0-1023
> Memory @1034GB: CPU(s): 0-1023
> Memory @1035GB: CPU(s): 0-1023
> Memory @1036GB: CPU(s): 0-1023
> Memory @1037GB: CPU(s): 0-1023
> Memory @1038GB: CPU(s): none
> 
> Suggested-by: Thomas Lendacky <thomas.lendacky@amd.com>
> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
> ---
>  arch/x86/virt/svm/sev.c | 128 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 128 insertions(+)

https://lwn.net/Articles/309298/

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply

* Re: [PATCH v6 00/20] dma-mapping: Use DMA_ATTR_CC_SHARED through direct, pool and swiotlb paths
From: Jason Gunthorpe @ 2026-06-18 15:37 UTC (permalink / raw)
  To: Aneesh Kumar K.V
  Cc: Alexey Kardashevskiy, Catalin Marinas, iommu, linux-arm-kernel,
	linux-kernel, linux-coco, Robin Murphy, Marek Szyprowski,
	Will Deacon, Marc Zyngier, Steven Price, Suzuki K Poulose,
	Jiri Pirko, Mostafa Saleh, Petr Tesarik, Dan Williams, Xu Yilun,
	linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
	Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Sven Schnelle, x86
In-Reply-To: <yq5aqzm4dz25.fsf@kernel.org>

On Thu, Jun 18, 2026 at 09:37:22AM +0100, Aneesh Kumar K.V wrote:
> Alexey Kardashevskiy <aik@amd.com> writes:
> 
> > On 10/6/26 00:47, Jason Gunthorpe wrote:
> >> On Tue, Jun 09, 2026 at 02:43:08PM +0100, Catalin Marinas wrote:
> >>> On Thu, Jun 04, 2026 at 02:09:39PM +0530, Aneesh Kumar K.V (Arm) wrote:
> >>>> This series propagates DMA_ATTR_CC_SHARED through the dma-direct,
> >>>> dma-pool, and swiotlb paths so that encrypted and decrypted DMA buffers
> >>>> are handled consistently.
> >>>>
> >>>> Today, the direct DMA path mostly relies on force_dma_unencrypted() for
> >>>> shared/decrypted buffer handling. This series consolidates the
> >>>> force_dma_unencrypted() checks in the top-level functions and ensures
> >>>> that the remaining DMA interfaces use DMA attributes to make the correct
> >>>> decisions.
> >>>
> >>> Please check Sashiko's reports, it has some good points:
> >>>
> >>> https://sashiko.dev/#/patchset/20260604083959.1265923-1-aneesh.kumar@kernel.org
> >>>
> >>> I think the main one is the swiotlb_tbl_map_single() changes which break
> >>> AMD SME host support. There cc_platform_has(CC_ATTR_MEM_ENCRYPT) is true
> >>> but force_dma_unencrypted() is false. Normally you'd not end up on this
> >>> path but you can have swiotlb=force.
> >> 
> >> IMHO that's an AMD issue, not with the design of this series..
> >> 
> >> The series is right, a device that is !force_dma_decrypted() must be
> >> considerd to be a trusted device and we must never place any DMA
> >> mappings for a trusted device into shared memory.
> >
> > swiotlb=force forces swiotlb, not decryption.

If force_dma_decrypted() == true then swiotlb must allocate from a
decrypted memory pool. It is right there in the name!

The hypervisor environment should *never* set force_dma_decrypted()
because all devices can access all hypervisor memory, up to their IOVA
limits.

> > So when I try "mem_encrypt=on iommu=pt swiotlb=force" with this
> > patchset, it fails to boot. But it boots with a hack like this:

On the host side I expect this to cause swiotlb to allocate encrypted
memory and bounce to it.

>  		u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask));
>  		u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask,
>  						dev->bus_dma_limit);
> +		/*
> +		 * With memory encryption enabled, SWIOTLB is marked decrypted.
> +		 * If SWIOTLB bouncing is forced, treat the device as requiring
> +		 * decrypted DMA.
> +		 */

And this is more insane logic. The right fix is to allocate the
swiotlb bounce from the *encrypted* pools when running on the
hypervisor which requires undoing this abuse of force_dma_decrypted().

Jason

^ permalink raw reply

* Re: [PATCH v2 02/17] x86/virt/tdx: Configure add-on features on TDX module init and update
From: Dave Hansen @ 2026-06-18 15:04 UTC (permalink / raw)
  To: Xu Yilun, x86, kvm, linux-coco, linux-kernel
  Cc: djbw, kas, rick.p.edgecombe, yilun.xu, xiaoyao.li, sohil.mehta,
	adrian.hunter, kishen.maloor, tony.lindgren, peter.fang, baolu.lu,
	zhenzhong.duan, dave.hansen, seanjc
In-Reply-To: <20260618081355.3253581-3-yilun.xu@linux.intel.com>

On 6/18/26 01:13, Xu Yilun wrote:
>  int tdx_module_run_update(void)
>  {
> +	u64 seamcall_fn = TDH_SYS_UPDATE_V0;
>  	struct tdx_module_args args = {};
>  	int ret;
>  
> -	ret = seamcall_prerr(TDH_SYS_UPDATE, &args);
> +	if (tdx_addon_feature0) {
> +		args.r9 = tdx_addon_feature0;
> +		seamcall_fn = TDH_SYS_UPDATE;
> +	}

Heh, and it falls apart into craziness immediately. See how it
immediately loses the logical information that there's a version 1 and a
version 0? The "1" isn't even visible. It's hidden in "TDH_SYS_UPDATE".

Isn't this a million times more sane?

	struct tdx_module_args args = {};
	u64 version;
  	int ret;

	if (tdx_addon_feature0) {
		args.r9 = tdx_addon_feature0;
		version = 1;
	} else {
		version = 0;
	}

	ret = seamcall_prerr(TDH_SYS_UPDATE, version, &args);


There's also zero stopping us from putting version in args:

	struct tdx_module_args args = {};
  	int ret;

	if (tdx_addon_feature0) {
		args.r9 = tdx_addon_feature0;
		args.version = 1;
	}

	ret = seamcall_prerr(TDH_SYS_UPDATE, &args);

Eh?

That gives args.version==0 in all the normal cases which just happens to
be the exact behavior we want. It also avoids having to plumb version
through all the seamcall*() wrappers.

But this is *exactly* the kind of thing that shouldn't be a part of an
attestation patch series. This could very much have been a separate
discussion and happened a month or a year ago. But now it is blocking
this DICE thing from getting done <grumble>.

^ permalink raw reply

* Re: [PATCH v2 01/17] x86/virt/tdx: Embed version info in SEAMCALL leaf function definitions
From: Dave Hansen @ 2026-06-18 14:45 UTC (permalink / raw)
  To: Xu Yilun, x86, kvm, linux-coco, linux-kernel
  Cc: djbw, kas, rick.p.edgecombe, yilun.xu, xiaoyao.li, sohil.mehta,
	adrian.hunter, kishen.maloor, tony.lindgren, peter.fang, baolu.lu,
	zhenzhong.duan, dave.hansen, seanjc
In-Reply-To: <20260618081355.3253581-2-yilun.xu@linux.intel.com>

On 6/18/26 01:13, Xu Yilun wrote:
> Embed version information in SEAMCALL leaf function definitions rather
> than let the caller open code them. For now, only TDH.VP.INIT is
> involved.

This is jumping the gun a bit in the changelog.

What is a SEAMCALL leaf function?

How does the version fit in?

> Don't bother the caller to choose the SEAMCALL version if unnecessary.

I think I see what you are trying to say here but it's more than that.

The question is whether there should be a base seamcall() API that takes
an explicit version or whether the version should be passed in by callers.

One wrinkle is that the naming of all of these things is around
"function", "func" and "fn":

u64 __seamcall(u64 fn, struct tdx_module_args *args);

A "function" is TDH.SYS.INIT or TDH.SYS.INFO, not 'TDH.SYS.INFO v123'.

But the low-level calls could be:

	u64 __seamcall(u64 fn, u64 version, ...);
	
or

	u64 __seamcall(u64 fn, ...);

Where 'fn' encodes the function *and* version.

> The old TDX modules that only support TDH.VP.INIT v0 are all deprecated,
> so only provide the latest (v1) definition.

No, this isn't how this is going to work.

What do we *NEED* from v1? Why churn the code if we don't *NEED*
something from v1 and can live with v0? It has *ZERO* to do with the TDX
module being deprecated or whatever.

Linux stays on the old interface until we need a new interface. We are
*not* going to bump version numbers just because.


>  /*
>   * TDX module SEAMCALL leaf functions
>   */
> @@ -31,7 +44,7 @@
>  #define TDH_VP_CREATE			10
>  #define TDH_MNG_KEY_FREEID		20
>  #define TDH_MNG_INIT			21
> -#define TDH_VP_INIT			22
> +#define TDH_VP_INIT			SEAMCALL_LEAF_VER(22, 1)
>  #define TDH_PHYMEM_PAGE_RDMD		24
>  #define TDH_VP_RD			26
>  #define TDH_PHYMEM_PAGE_RECLAIM		28
> @@ -50,14 +63,6 @@
>  #define TDH_SYS_UPDATE			53
>  #define TDH_SYS_DISABLE			69

That is unreadable and patterns can't be seen. This is better:

#define TDH_MNG_INIT			SEAMCALL_LEAF_VER(21, 0)
#define TDH_VP_INIT			SEAMCALL_LEAF_VER(22, 1)
#define TDH_PHYMEM_PAGE_RDMD		SEAMCALL_LEAF_VER(24, 0)

> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -1903,8 +1903,7 @@ u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
>  		.r8 = x2apicid,
>  	};
>  
> -	/* apicid requires version == 1. */
> -	return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args);
> +	return seamcall(TDH_VP_INIT, &args);
>  }
>  EXPORT_SYMBOL_FOR_KVM(tdh_vp_init);

But that whole scheme falls apart the first time the kernel needs
functionality from v2. You'll need:

#define TDH_VP_INIT_V0			SEAMCALL_LEAF_VER(22, 0)
#define TDH_VP_INIT_V1			SEAMCALL_LEAF_VER(22, 1)

and then the calls will do:

	if (foo)
		return seamcall(TDH_VP_INIT_V0, &args);
	else
		return seamcall(TDH_VP_INIT_V1, &args);

So this 100% goes down the road of needing #defines for *EACH* version.
That's the real implication here and the real choice.

That said, I don't particularly like:

	if (foo)
		return seamcall(TDH_VP_INIT, 0, &args);
	else
		return seamcall(TDH_VP_INIT, 1, &args);

all that much either because of the seemingly magic numbers.

The whole seamcall RAX thing is one step too clever. I think Linux did
the right thing:

5	common	open				sys_open
288	common	openat				sys_openat
437	common	openat2				sys_openat2

New ABI gets a new base number. No need for the other end of the ABI to
know that 288 is arguably a later version of 5.

Ugh. But why is this patch even in here in the first place? Why is this
even *ASSOCIATED* with DICE-based attestation? Isn't this completely
orthogonal?

^ permalink raw reply

* [PATCH v2 17/17] KVM: TDX: Support event-notify interrupts only with userspace Quoting
From: Xu Yilun @ 2026-06-18  8:13 UTC (permalink / raw)
  To: x86, kvm, linux-coco, linux-kernel
  Cc: djbw, kas, rick.p.edgecombe, yilun.xu, yilun.xu, xiaoyao.li,
	sohil.mehta, adrian.hunter, kishen.maloor, tony.lindgren,
	peter.fang, baolu.lu, zhenzhong.duan, dave.hansen, dave.hansen,
	seanjc
In-Reply-To: <20260618081355.3253581-1-yilun.xu@linux.intel.com>

From: Peter Fang <peter.fang@intel.com>

Tie userspace SetupEventNotifyInterrupt support to userspace Quote
generation. Delivering event-notify interrupts via userspace breaks if
KVM never exits to userspace in the first place.

This is an optional capability to notify the guest when Quoting has
completed. No known guest currently uses it, so defer adding in-kernel
support for now. The Linux TDX guest relies on polling only.

Signed-off-by: Peter Fang <peter.fang@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
---
 arch/x86/kvm/vmx/tdx.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 20558b0185b6..25146da3933f 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -185,7 +185,7 @@ static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char i
 	tdx_clear_unsupported_cpuid(entry);
 }
 
-#define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT	BIT(1)
+#define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT	BIT_ULL(1)
 
 static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
 			     struct kvm_tdx_capabilities *caps)
@@ -202,8 +202,15 @@ static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
 
 	caps->cpuid.nent = td_conf->num_cpuid_config;
 
-	caps->user_tdvmcallinfo_1_r11 =
-		TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT;
+	/*
+	 * Don't advertise userspace event-notify interrupt support if TDX
+	 * quoting service is enabled, as quote generation will be handled
+	 * entirely in the kernel. Support in the kernel can be added later.
+	 */
+	if (!tdx_quote_enabled()) {
+		caps->user_tdvmcallinfo_1_r11 |=
+			TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT;
+	}
 
 	for (i = 0; i < td_conf->num_cpuid_config; i++)
 		td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
@@ -1684,9 +1691,16 @@ static int tdx_get_quote(struct kvm_vcpu *vcpu)
 
 static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
 {
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
 	struct vcpu_tdx *tdx = to_tdx(vcpu);
 	u64 vector = tdx->vp_enter_args.r12;
 
+	/* See comment in init_kvm_tdx_caps() */
+	if (kvm_tdx->get_quote_in_kernel) {
+		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
+		return 1;
+	}
+
 	if (vector < 32 || vector > 255) {
 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
 		return 1;
-- 
2.25.1


^ permalink raw reply related

* [PATCH v2 16/17] KVM: TDX: Add in-kernel Quote generation
From: Xu Yilun @ 2026-06-18  8:13 UTC (permalink / raw)
  To: x86, kvm, linux-coco, linux-kernel
  Cc: djbw, kas, rick.p.edgecombe, yilun.xu, yilun.xu, xiaoyao.li,
	sohil.mehta, adrian.hunter, kishen.maloor, tony.lindgren,
	peter.fang, baolu.lu, zhenzhong.duan, dave.hansen, dave.hansen,
	seanjc
In-Reply-To: <20260618081355.3253581-1-yilun.xu@linux.intel.com>

From: Peter Fang <peter.fang@intel.com>

Provide an in-kernel path for Quote generation when handling
TDG.VP.VMCALL<GetQuote>, without requiring an exit to userspace.

Use the core TDX API for Quote generation when the Quoting extension is
available. For simplicity, KVM checks its availability once per guest
during initialization. KVM does not handle Quoting service disruptions
or switch between the in-kernel and userspace paths.

Update the KVM API and TDX documentation to describe this new Quoting
capability.

Signed-off-by: Peter Fang <peter.fang@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
---
 Documentation/arch/x86/tdx.rst |  19 ++---
 Documentation/virt/kvm/api.rst |   3 +
 arch/x86/include/asm/tdx.h     |   9 +++
 arch/x86/kvm/vmx/tdx.h         |   6 ++
 arch/x86/kvm/vmx/tdx.c         | 135 ++++++++++++++++++++++++++++++++-
 virt/kvm/kvm_main.c            |   1 +
 6 files changed, 163 insertions(+), 10 deletions(-)

diff --git a/Documentation/arch/x86/tdx.rst b/Documentation/arch/x86/tdx.rst
index 3303499ad4c6..f02bb6919d91 100644
--- a/Documentation/arch/x86/tdx.rst
+++ b/Documentation/arch/x86/tdx.rst
@@ -522,15 +522,16 @@ provided by attestation service so the TDREPORT can be verified uniquely.
 More details about the TDREPORT can be found in Intel TDX Module
 specification, section titled "TDG.MR.REPORT Leaf".
 
-After getting the TDREPORT, the second step of the attestation process
-is to send it to the Quoting Enclave (QE) to generate the Quote. TDREPORT
-by design can only be verified on the local platform as the MAC key is
-bound to the platform. To support remote verification of the TDREPORT,
-TDX leverages Intel SGX Quoting Enclave to verify the TDREPORT locally
-and convert it to a remotely verifiable Quote. Method of sending TDREPORT
-to QE is implementation specific. Attestation software can choose
-whatever communication channel available (i.e. vsock or TCP/IP) to
-send the TDREPORT to QE and receive the Quote.
+After getting the TDREPORT, the second step of the attestation process is to
+convert it to a Quote. A TDREPORT by design can only be verified on the local
+platform, as the MAC key is bound to the platform. A Quote makes the TDREPORT
+remotely verifiable. It can be generated either through a Quoting Enclave
+(QE) in userspace or through the Quoting service in kernel space. In
+userspace, the Intel SGX Quoting Enclave verifies the TDREPORT locally and
+converts it to a Quote. The method of sending the TDREPORT to the QE and
+receiving the Quote is implementation-specific. If the TDX module supports the
+Quoting service, the kernel can convert a TDREPORT to a Quote directly through
+a SEAMCALL. In this case, the Quote is generated entirely by the TDX module.
 
 References
 ==========
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 52bbbb553ce1..4a3b69b2e602 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -7335,6 +7335,9 @@ inputs and outputs of the TDVMCALL.  Currently the following values of
    queued successfully, the TDX guest can poll the status field in the
    shared-memory area to check whether the Quote generation is completed or
    not. When completed, the generated Quote is returned via the same buffer.
+   If the host kernel generates Quotes through the Quoting service provided by
+   the TDX module, KVM processes the GetQuote request and it will not appear in
+   userspace.
 
  * ``TDVMCALL_GET_TD_VM_CALL_INFO``: the guest has requested the support
    status of TDVMCALLs.  The output values for the given leaf should be
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 24bce7512de3..b9a24104415c 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -86,6 +86,15 @@ struct tdx_quote_req {
 	u8 data[];
 };
 
+#define TDX_QUOTE_REQ_HDR_SIZE		(offsetof(struct tdx_quote_req, data))
+
+/*
+ * TDG.VP.VMCALL<GetQuote> Status Codes
+ */
+#define TDX_QUOTE_STATUS_SUCCESS	0x0000000000000000ULL
+#define TDX_QUOTE_STATUS_ERROR		0x8000000000000000ULL
+#define TDX_QUOTE_STATUS_UNAVAILABLE	0x8000000000000001ULL
+
 #ifdef CONFIG_INTEL_TDX_GUEST
 
 void __init tdx_early_init(void);
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index ac8323a68b16..5e4b3aee0577 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -47,6 +47,12 @@ struct kvm_tdx {
 	 * Set/unset is protected with kvm->mmu_lock.
 	 */
 	bool wait_for_sept_zap;
+
+	/*
+	 * Whether to get the quote directly in kernel, without exiting to
+	 * userspace.
+	 */
+	bool get_quote_in_kernel;
 };
 
 /* TDX module vCPU states */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 9f7c39e0d4b5..20558b0185b6 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1538,11 +1538,133 @@ static int tdx_get_quote_user(struct kvm_vcpu *vcpu, u64 gpa, u64 size)
 	return 0;
 }
 
+static bool write_quote_status_to_guest(struct kvm_vcpu *vcpu, u64 status,
+					gpa_t gpa)
+{
+	if (kvm_vcpu_write_guest(vcpu,
+				 gpa + offsetof(struct tdx_quote_req, status),
+				 &status, sizeof(status)))
+		return false;
+
+	return true;
+}
+
+static bool write_quote_to_guest(struct kvm_vcpu *vcpu, void *quote_data,
+				 u32 quote_len, gpa_t gpa)
+{
+	if (kvm_vcpu_write_guest(vcpu,
+				 gpa + TDX_QUOTE_REQ_HDR_SIZE,
+				 quote_data, quote_len))
+		return false;
+
+	if (kvm_vcpu_write_guest(vcpu,
+				 gpa + offsetof(struct tdx_quote_req, out_len),
+				 &quote_len, sizeof(quote_len)))
+		return false;
+
+	return true;
+}
+
+static u64 get_quote_kernel(struct kvm_vcpu *vcpu, struct tdx_quote_req *req,
+			    gpa_t req_gpa, size_t total_len)
+{
+	struct tdx_td *td = &to_kvm_tdx(vcpu->kvm)->td;
+
+	/* Only support version 1 as defined in the GHCI spec */
+	if (req->version != 1)
+		return TDX_QUOTE_STATUS_ERROR;
+
+	/* Header + input data must fit in the page read from guest memory */
+	if ((size_t)req->in_len + TDX_QUOTE_REQ_HDR_SIZE > PAGE_SIZE)
+		return TDX_QUOTE_STATUS_ERROR;
+
+	/* Caller owns the requested quote */
+	void *quote_data __free(kvfree) =
+		tdx_quote_generate(td, req->data, req->in_len, &req->out_len);
+
+	if (!quote_data)
+		return TDX_QUOTE_STATUS_UNAVAILABLE;
+
+	if ((size_t)req->out_len + TDX_QUOTE_REQ_HDR_SIZE > total_len)
+		return TDX_QUOTE_STATUS_ERROR;
+
+	if (!write_quote_to_guest(vcpu, quote_data, req->out_len, req_gpa))
+		return TDX_QUOTE_STATUS_ERROR;
+
+	return TDX_QUOTE_STATUS_SUCCESS;
+}
+
+static u64 tdx_get_quote_check_args(struct kvm_vcpu *vcpu, u64 gpa, u64 size)
+{
+	gfn_t gfn_start, gfn_end;
+	u64 end;
+
+	if (!size)
+		return TDVMCALL_STATUS_INVALID_OPERAND;
+
+	if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size))
+		return TDVMCALL_STATUS_ALIGN_ERROR;
+
+	if (check_add_overflow(gpa, size, &end))
+		return TDVMCALL_STATUS_INVALID_OPERAND;
+
+	gfn_start = gpa_to_gfn(gpa);
+	gfn_end = gpa_to_gfn(end);
+
+	/*
+	 * Reject if the guest didn't explicitly convert its quote pages to
+	 * shared.
+	 */
+	if (!kvm_range_has_memory_attributes(vcpu->kvm, gfn_start, gfn_end,
+					     KVM_MEMORY_ATTRIBUTE_PRIVATE, 0))
+		return TDVMCALL_STATUS_INVALID_OPERAND;
+
+	return TDVMCALL_STATUS_SUCCESS;
+}
+
+static int tdx_get_quote_kernel(struct kvm_vcpu *vcpu, u64 gpa, u64 size)
+{
+	void *first_page = NULL;
+	u64 err, qerr;
+
+	err = tdx_get_quote_check_args(vcpu, gpa, size);
+	if (err != TDVMCALL_STATUS_SUCCESS)
+		goto out;
+
+	err = TDVMCALL_STATUS_INVALID_OPERAND;
+
+	first_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!first_page)
+		goto out;
+
+	/*
+	 * Read the first GetQuote page for its header + input data. The check
+	 * above ensures that this GetQuote message is at least one page in
+	 * size. in_data spanning more than a page is not supported.
+	 */
+	if (kvm_vcpu_read_guest(vcpu, gpa, first_page, PAGE_SIZE))
+		goto out;
+
+	qerr = get_quote_kernel(vcpu, first_page, (gpa_t)gpa, size);
+
+	if (write_quote_status_to_guest(vcpu, qerr, (gpa_t)gpa) &&
+	    qerr == TDX_QUOTE_STATUS_SUCCESS)
+		err = TDVMCALL_STATUS_SUCCESS;
+
+out:
+	kfree(first_page);
+	tdvmcall_set_return_code(vcpu, err);
+
+	return 1;
+}
+
 static int tdx_get_quote(struct kvm_vcpu *vcpu)
 {
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
 	struct vcpu_tdx *tdx = to_tdx(vcpu);
 	u64 gpa = tdx->vp_enter_args.r12;
 	u64 size = tdx->vp_enter_args.r13;
+	int ret;
 
 	/* The gpa of buffer must have shared bit set. */
 	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
@@ -1552,7 +1674,12 @@ static int tdx_get_quote(struct kvm_vcpu *vcpu)
 
 	gpa &= ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
 
-	return tdx_get_quote_user(vcpu, gpa, size);
+	if (kvm_tdx->get_quote_in_kernel)
+		ret = tdx_get_quote_kernel(vcpu, gpa, size);
+	else
+		ret = tdx_get_quote_user(vcpu, gpa, size);
+
+	return ret;
 }
 
 static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
@@ -2751,6 +2878,12 @@ static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
 	else
 		kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;
 
+	/*
+	 * Check only once at TD creation. Switching between userspace and
+	 * in-kernel quoting is not supported.
+	 */
+	kvm_tdx->get_quote_in_kernel = tdx_quote_enabled();
+
 	kvm_tdx->state = TD_STATE_INITIALIZED;
 out:
 	/* kfree() accepts NULL. */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 89489996fbc1..599f88a13071 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2461,6 +2461,7 @@ bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
 
 	return true;
 }
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_range_has_memory_attributes);
 
 static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
 						 struct kvm_mmu_notifier_range *range)
-- 
2.25.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox