* [PATCH v6 02/43] KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES
From: Ackerley Tng via B4 Relay @ 2026-05-07 20:22 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260507-gmem-inplace-conversion-v6-0-91ab5a8b19a4@google.com>
From: Sean Christopherson <seanjc@google.com>
Rename the per-VM memory attributes Kconfig to make it explicitly about
per-VM attributes in anticipation of adding memory attributes support to
guest_memfd, at which point it will be possible (and desirable) to have
memory attributes without the per-VM support, even in x86.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/Kconfig | 6 +++---
arch/x86/kvm/mmu/mmu.c | 2 +-
arch/x86/kvm/x86.c | 2 +-
include/linux/kvm_host.h | 8 ++++----
include/trace/events/kvm.h | 4 ++--
virt/kvm/Kconfig | 2 +-
virt/kvm/kvm_main.c | 14 +++++++-------
8 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c470e40a00aa4..60b997764beef 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2369,7 +2369,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
int tdp_max_root_level, int tdp_huge_page_level);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
#endif
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 801bf9e520db3..26f6afd51bbdc 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -84,7 +84,7 @@ config KVM_SW_PROTECTED_VM
bool "Enable support for KVM software-protected VMs"
depends on EXPERT
depends on KVM_X86 && X86_64
- select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select KVM_VM_MEMORY_ATTRIBUTES
help
Enable support for KVM software-protected VMs. Currently, software-
protected VMs are purely a development and testing vehicle for
@@ -135,7 +135,7 @@ config KVM_INTEL_TDX
bool "Intel Trust Domain Extensions (TDX) support"
default y
depends on INTEL_TDX_HOST
- select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select KVM_VM_MEMORY_ATTRIBUTES
select HAVE_KVM_ARCH_GMEM_POPULATE
help
Provides support for launching Intel Trust Domain Extensions (TDX)
@@ -159,7 +159,7 @@ config KVM_AMD_SEV
depends on KVM_AMD && X86_64
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
select ARCH_HAS_CC_PLATFORM
- select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select KVM_VM_MEMORY_ATTRIBUTES
select HAVE_KVM_ARCH_GMEM_PREPARE
select HAVE_KVM_ARCH_GMEM_INVALIDATE
select HAVE_KVM_ARCH_GMEM_POPULATE
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 892246204435c..a80a876ab4ad6 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7899,7 +7899,7 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
}
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
int level)
{
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0a1b63c63d1a9..1560de1e95be0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13625,7 +13625,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
}
}
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
kvm_mmu_init_memslot_memory_attributes(kvm, slot);
#endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4c14aee1fb063..7b9faa3545300 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -722,7 +722,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
}
#endif
-#ifndef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifndef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
{
return false;
@@ -871,7 +871,7 @@ struct kvm {
#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
struct notifier_block pm_notifier;
#endif
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
/* Protected by slots_lock (for writes) and RCU (for reads) */
struct xarray mem_attr_array;
#endif
@@ -2528,7 +2528,7 @@ static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
return slot->flags & KVM_MEMSLOT_GMEM_ONLY;
}
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
{
return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
@@ -2550,7 +2550,7 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
{
return false;
}
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
#ifdef CONFIG_KVM_GUEST_MEMFD
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index b282e3a867696..1ba72bd73ea2f 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -358,7 +358,7 @@ TRACE_EVENT(kvm_dirty_ring_exit,
TP_printk("vcpu %d", __entry->vcpu_id)
);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
/*
* @start: Starting address of guest memory range
* @end: End address of guest memory range
@@ -383,7 +383,7 @@ TRACE_EVENT(kvm_vm_set_mem_attributes,
TP_printk("%#016llx -- %#016llx [0x%lx]",
__entry->start, __entry->end, __entry->attr)
);
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
TRACE_EVENT(kvm_unmap_hva_range,
TP_PROTO(unsigned long start, unsigned long end),
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 794976b88c6f9..5119cb37145fc 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -100,7 +100,7 @@ config KVM_ELIDE_TLB_FLUSH_IF_YOUNG
config KVM_MMU_LOCKLESS_AGING
bool
-config KVM_GENERIC_MEMORY_ATTRIBUTES
+config KVM_VM_MEMORY_ATTRIBUTES
bool
config KVM_GUEST_MEMFD
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 89489996fbc1e..306153abbafa5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1115,7 +1115,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
spin_lock_init(&kvm->mn_invalidate_lock);
rcuwait_init(&kvm->mn_memslots_update_rcuwait);
xa_init(&kvm->vcpu_array);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
xa_init(&kvm->mem_attr_array);
#endif
@@ -1300,7 +1300,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
cleanup_srcu_struct(&kvm->irq_srcu);
srcu_barrier(&kvm->srcu);
cleanup_srcu_struct(&kvm->srcu);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
xa_destroy(&kvm->mem_attr_array);
#endif
kvm_arch_free_vm(kvm);
@@ -2418,7 +2418,7 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
}
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static u64 kvm_supported_mem_attributes(struct kvm *kvm)
{
if (!kvm || kvm_arch_has_private_mem(kvm))
@@ -2623,7 +2623,7 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
}
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
@@ -4921,7 +4921,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_SYSTEM_EVENT_DATA:
case KVM_CAP_DEVICE_CTRL:
return 1;
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
case KVM_CAP_MEMORY_ATTRIBUTES:
return kvm_supported_mem_attributes(kvm);
#endif
@@ -5325,7 +5325,7 @@ static long kvm_vm_ioctl(struct file *filp,
break;
}
#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
case KVM_SET_MEMORY_ATTRIBUTES: {
struct kvm_memory_attributes attrs;
@@ -5336,7 +5336,7 @@ static long kvm_vm_ioctl(struct file *filp,
r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
break;
}
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
case KVM_CREATE_DEVICE: {
struct kvm_create_device cd;
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v6 01/43] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
From: Ackerley Tng via B4 Relay @ 2026-05-07 20:22 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260507-gmem-inplace-conversion-v6-0-91ab5a8b19a4@google.com>
From: Sean Christopherson <seanjc@google.com>
Start plumbing in guest_memfd support for in-place private<=>shared
conversions by tracking attributes via a maple tree. KVM currently tracks
private vs. shared attributes on a per-VM basis, which made sense when a
guest_memfd _only_ supported private memory, but tracking per-VM simply
can't work for in-place conversions as the shareability of a given page
needs to be per-gmem_inode, not per-VM.
Use the filemap invalidation lock to protect the maple tree, as taking the
lock for read when faulting in memory (for userspace or the guest) isn't
expected to result in meaningful contention, and using a separate lock
would add significant complexity (avoiding deadlock is quite difficult).
Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Vishal Annapurve <vannapurve@google.com>
Signed-off-by: Vishal Annapurve <vannapurve@google.com>
Co-developed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Fuad Tabba <tabba@google.com>
---
virt/kvm/guest_memfd.c | 133 +++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 117 insertions(+), 16 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 69c9d6d546b28..5011d38820d0d 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
+#include <linux/maple_tree.h>
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>
@@ -33,6 +34,13 @@ struct gmem_inode {
struct list_head gmem_file_list;
u64 flags;
+ /*
+ * Every index in this inode, whether memory is populated or
+ * not, is tracked in attributes. The entire range of indices,
+ * corresponding to the size of this inode, is represented in
+ * this maple tree.
+ */
+ struct maple_tree attributes;
};
static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
@@ -60,6 +68,24 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
return gfn - slot->base_gfn + slot->gmem.pgoff;
}
+static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
+{
+ struct maple_tree *mt = &GMEM_I(inode)->attributes;
+ void *entry = mtree_load(mt, index);
+
+ return WARN_ON_ONCE(!entry) ? 0 : xa_to_value(entry);
+}
+
+static bool kvm_gmem_is_private_mem(struct inode *inode, pgoff_t index)
+{
+ return kvm_gmem_get_attributes(inode, index) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+}
+
+static bool kvm_gmem_is_shared_mem(struct inode *inode, pgoff_t index)
+{
+ return !kvm_gmem_is_private_mem(inode, index);
+}
+
static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
pgoff_t index, struct folio *folio)
{
@@ -397,10 +423,13 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
return VM_FAULT_SIGBUS;
- if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
- return VM_FAULT_SIGBUS;
+ filemap_invalidate_lock_shared(inode->i_mapping);
+ if (kvm_gmem_is_shared_mem(inode, vmf->pgoff))
+ folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+ else
+ folio = ERR_PTR(-EACCES);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
- folio = kvm_gmem_get_folio(inode, vmf->pgoff);
if (IS_ERR(folio)) {
if (PTR_ERR(folio) == -EAGAIN)
return VM_FAULT_RETRY;
@@ -556,6 +585,51 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
return true;
}
+static int kvm_gmem_init_inode(struct inode *inode, loff_t size, u64 flags)
+{
+ struct gmem_inode *gi = GMEM_I(inode);
+ MA_STATE(mas, &gi->attributes, 0, (size >> PAGE_SHIFT) - 1);
+ u64 attrs;
+ int r;
+
+ inode->i_op = &kvm_gmem_iops;
+ inode->i_mapping->a_ops = &kvm_gmem_aops;
+ inode->i_mode |= S_IFREG;
+ inode->i_size = size;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+
+ /*
+ * guest_memfd memory is neither migratable nor swappable: set
+ * inaccessible to gate off both.
+ */
+ mapping_set_inaccessible(inode->i_mapping);
+ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+
+ gi->flags = flags;
+
+ mt_set_external_lock(&gi->attributes,
+ &inode->i_mapping->invalidate_lock);
+
+ /*
+ * Store default attributes for the entire gmem instance. Ensuring every
+ * index is represented in the maple tree at all times simplifies the
+ * conversion and merging logic.
+ */
+ attrs = gi->flags & GUEST_MEMFD_FLAG_INIT_SHARED ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+ /*
+ * Acquire the invalidation lock purely to make lockdep happy. The
+ * maple tree library expects all stores to be protected via the lock,
+ * and the library can't know when the tree is reachable only by the
+ * caller, as is the case here.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+ r = mas_store_gfp(&mas, xa_mk_value(attrs), GFP_KERNEL);
+ filemap_invalidate_unlock(inode->i_mapping);
+
+ return r;
+}
+
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
static const char *name = "[kvm-gmem]";
@@ -586,16 +660,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
goto err_fops;
}
- inode->i_op = &kvm_gmem_iops;
- inode->i_mapping->a_ops = &kvm_gmem_aops;
- inode->i_mode |= S_IFREG;
- inode->i_size = size;
- mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
- mapping_set_inaccessible(inode->i_mapping);
- /* Unmovable mappings are supposed to be marked unevictable as well. */
- WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
-
- GMEM_I(inode)->flags = flags;
+ err = kvm_gmem_init_inode(inode, size, flags);
+ if (err)
+ goto err_inode;
file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
if (IS_ERR(file)) {
@@ -797,9 +864,13 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
if (!file)
return -EFAULT;
+ filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
+
folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
- if (IS_ERR(folio))
- return PTR_ERR(folio);
+ if (IS_ERR(folio)) {
+ r = PTR_ERR(folio);
+ goto out;
+ }
if (!folio_test_uptodate(folio)) {
clear_highpage(folio_page(folio, 0));
@@ -815,6 +886,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
else
folio_put(folio);
+out:
+ filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
@@ -944,6 +1017,15 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
mpol_shared_policy_init(&gi->policy, NULL);
+ /*
+ * Memory attributes are protected by the filemap invalidation lock, but
+ * the lock structure isn't available at this time. Immediately mark
+ * maple tree as using external locking so that accessing the tree
+ * before it's fully initialized results in NULL pointer dereferences
+ * and not more subtle bugs.
+ */
+ mt_init_flags(&gi->attributes, MT_FLAGS_LOCK_EXTERN | MT_FLAGS_USE_RCU);
+
gi->flags = 0;
INIT_LIST_HEAD(&gi->gmem_file_list);
return &gi->vfs_inode;
@@ -951,7 +1033,26 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
static void kvm_gmem_destroy_inode(struct inode *inode)
{
- mpol_free_shared_policy(&GMEM_I(inode)->policy);
+ struct gmem_inode *gi = GMEM_I(inode);
+
+ mpol_free_shared_policy(&gi->policy);
+
+ /*
+ * Note! Checking for an empty tree is functionally necessary
+ * to avoid explosions if the tree hasn't been fully
+ * initialized, i.e. if the inode is being destroyed before
+ * guest_memfd can set the external lock, lockdep would find
+ * that the tree's internal ma_lock was not held.
+ */
+ if (!mtree_empty(&gi->attributes)) {
+ /*
+ * Acquire the invalidation lock purely to make lockdep happy,
+ * the inode is unreachable at this point.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+ __mt_destroy(&gi->attributes);
+ filemap_invalidate_unlock(inode->i_mapping);
+ }
}
static void kvm_gmem_free_inode(struct inode *inode)
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v6 00/43] guest_memfd: In-place conversion support
From: Ackerley Tng via B4 Relay @ 2026-05-07 20:22 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
This is v6 of guest_memfd in-place conversion support, now out of RFC.
Until now, guest_memfd has only supported an entire inode's worth of memory
being used as all-shared or all-private. CoCo VMs may request guest memory to be
converted between private and shared states, and the only way to support
that currently would be to have the userspace VMM provide two sources of
backing memory from completely different areas of physical memory.
pKVM has a use case for in-place sharing: the guest and host may be
cooperating on given data, and pKVM doesn't protect data through
encryption, so copying that given data between different areas of physical
memory as part of conversions would be unnecessary work.
This series also serves as a foundation for guest_memfd huge page
support. Now, guest_memfd only supports PAGE_SIZE pages, so if two sources
of backing memory are used, the userspace VMM could maintain a steady total
memory utilized by punching out the pages that are not used. When huge
pages are available in guest_memfd, even if the backing memory source
supports hole punching within a huge page, punching out pages to maintain
the total memory utilized by a VM would be introducing lots of
fragmentation.
In-place conversion avoids fragmentation by allowing the same physical
memory to be used for both shared and private memory, with guest_memfd
tracking the shared/private status of all the pages at a per-page
granularity.
The central principle, which guest_memfd continues to uphold, is that any
guest-private page will not be mappable to host userspace. All pages will
be mmap()-able in host userspace, but accesses to guest-private pages (as
tracked by guest_memfd) will result in a SIGBUS.
This series introduces a guest_memfd ioctl (not kvm, vm or vcpu, but
guest_memfd ioctl) that allows userspace to set memory
attributes (shared/private) directly through the guest_memfd. This is the
appropriate interface because shared/private-ness is a property of memory
and hence the request should be sent directly to the memory provider -
guest_memfd.
Tested with both CONFIG_KVM_VM_MEMORY_ATTRIBUTES enabled and disabled:
+ tools/testing/selftests/kvm/guest_memfd_test.c
+ tools/testing/selftests/kvm/pre_fault_memory_test.c
+ tools/testing/selftests/kvm/x86/guest_memfd_conversions_test.c
+ tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
+ tools/testing/selftests/kvm/x86/private_mem_conversions_test.sh
+ tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c
Updates for this revision:
+ Dropped everything to do with content modes (ZERO/PRESERVE) (thanks
Michael and Sean!)
+ Discussed this at PUCK on 2026-05-06
+ guest_memfd was taking on too much complexity to try to paper
over differences in how trusted firmware handles conversions
+ Addressed Liam's comments about usage of the maple_tree
TODOs
+ Resolve issue where guest_memfd_conversions_test, which uses the
kselftest framework, doesn't perform teardown on assertion
failure. Please see proposal at [9]
+ Test with TDX selftests. We're in the process of rebasing TDX selftests
on this series and will post updates when that's tested.
I would like feedback on:
+ The use of private_mem_conversions_test.sh to run different options in
private_mem_conversions_test. If this makes sense, I'll adjust the
Makefile to have private_mem_conversions_test tested only via the script.
This series is based on kvm/next, and here's the tree for your convenience:
https://github.com/googleprodkernel/linux-cc/commits/guest_memfd-inplace-conversion-v6
Older series:
+ RFCv5 is at [8]
+ RFCv4 is at [7]
+ RFCv3 is at [6]
+ RFCv2 is at [5]
+ RFCv1 is at [4]
+ Previous versions of this feature, part of other series, are available at
[1][2][3].
[1] https://lore.kernel.org/all/bd163de3118b626d1005aa88e71ef2fb72f0be0f.1726009989.git.ackerleytng@google.com/
[2] https://lore.kernel.org/all/20250117163001.2326672-6-tabba@google.com/
[3] https://lore.kernel.org/all/b784326e9ccae6a08388f1bf39db70a2204bdc51.1747264138.git.ackerleytng@google.com/
[4] https://lore.kernel.org/all/cover.1760731772.git.ackerleytng@google.com/T/
[5] https://lore.kernel.org/all/cover.1770071243.git.ackerleytng@google.com/T/
[6] https://lore.kernel.org/r/20260313-gmem-inplace-conversion-v3-0-5fc12a70ec89@google.com/T/
[7] https://lore.kernel.org/all/20260326-gmem-inplace-conversion-v4-0-e202fe950ffd@google.com/T/
[8] https://lore.kernel.org/r/20260428-gmem-inplace-conversion-v5-0-d8608ccfca22@google.com
[9] https://lore.kernel.org/all/20260414-selftest-global-metadata-v1-0-fd223922bc57@google.com/T/
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
Ackerley Tng (25):
KVM: x86/mmu: Bug the VM if gmem attributes are queried to determine max mapping level
KVM: guest_memfd: Update kvm_gmem_populate() to use gmem attributes
KVM: guest_memfd: Only prepare folios for private pages
KVM: Move kvm_supported_mem_attributes() to kvm_host.h
KVM: guest_memfd: Add base support for KVM_SET_MEMORY_ATTRIBUTES2
KVM: guest_memfd: Ensure pages are not in use before conversion
KVM: guest_memfd: Call arch invalidate hooks on conversion
KVM: guest_memfd: Return early if range already has requested attributes
KVM: guest_memfd: Advertise KVM_SET_MEMORY_ATTRIBUTES2 ioctl
KVM: guest_memfd: Handle lru_add fbatch refcounts during conversion safety check
KVM: guest_memfd: Use actual size for invalidation in kvm_gmem_release()
KVM: guest_memfd: Determine invalidation filter from memory attributes
KVM: TDX: Make source page optional for KVM_TDX_INIT_MEM_REGION
KVM: selftests: Test basic single-page conversion flow
KVM: selftests: Test conversion flow when INIT_SHARED
KVM: selftests: Test conversion precision in guest_memfd
KVM: selftests: Test conversion before allocation
KVM: selftests: Convert with allocated folios in different layouts
KVM: selftests: Test that truncation does not change shared/private status
KVM: selftests: Test conversion with elevated page refcount
KVM: selftests: Reset shared memory after hole-punching
KVM: selftests: Provide function to look up guest_memfd details from gpa
KVM: selftests: Make TEST_EXPECT_SIGBUS thread-safe
KVM: selftests: Update private_mem_conversions_test to mmap() guest_memfd
KVM: selftests: Add script to exercise private_mem_conversions_test
Michael Roth (1):
KVM: SEV: Make 'uaddr' parameter optional for KVM_SEV_SNP_LAUNCH_UPDATE
Sean Christopherson (17):
KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES
KVM: Enumerate support for PRIVATE memory iff kvm_arch_has_private_mem is defined
KVM: Stub in ability to disable per-VM memory attribute tracking
KVM: guest_memfd: Wire up kvm_get_memory_attributes() to per-gmem attributes
KVM: Move KVM_VM_MEMORY_ATTRIBUTES config definition to x86
KVM: Let userspace disable per-VM mem attributes, enable per-gmem attributes
KVM: guest_memfd: Enable INIT_SHARED on guest_memfd for x86 Coco VMs
KVM: selftests: Create gmem fd before "regular" fd when adding memslot
KVM: selftests: Rename guest_memfd{,_offset} to gmem_{fd,offset}
KVM: selftests: Add support for mmap() on guest_memfd in core library
KVM: selftests: Add selftests global for guest memory attributes capability
KVM: selftests: Add helpers for calling ioctls on guest_memfd
KVM: selftests: Test that shared/private status is consistent across processes
KVM: selftests: Provide common function to set memory attributes
KVM: selftests: Check fd/flags provided to mmap() when setting up memslot
KVM: selftests: Update private memory exits test to work with per-gmem attributes
Documentation/virt/kvm/api.rst | 78 +++-
.../virt/kvm/x86/amd-memory-encryption.rst | 15 +-
Documentation/virt/kvm/x86/intel-tdx.rst | 4 +
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/Kconfig | 15 +-
arch/x86/kvm/mmu/mmu.c | 13 +-
arch/x86/kvm/svm/sev.c | 18 +-
arch/x86/kvm/vmx/tdx.c | 11 +-
arch/x86/kvm/x86.c | 13 +-
include/linux/kvm_host.h | 53 ++-
include/trace/events/kvm.h | 4 +-
include/uapi/linux/kvm.h | 16 +
mm/swap.c | 2 +
tools/testing/selftests/kvm/Makefile.kvm | 5 +
tools/testing/selftests/kvm/include/kvm_util.h | 138 +++++-
tools/testing/selftests/kvm/include/test_util.h | 34 +-
.../selftests/kvm/kvm_has_gmem_attributes.c | 17 +
tools/testing/selftests/kvm/lib/kvm_util.c | 133 +++---
tools/testing/selftests/kvm/lib/test_util.c | 7 -
.../kvm/x86/guest_memfd_conversions_test.c | 487 +++++++++++++++++++++
.../kvm/x86/private_mem_conversions_test.c | 53 ++-
.../kvm/x86/private_mem_conversions_test.sh | 128 ++++++
.../selftests/kvm/x86/private_mem_kvm_exits_test.c | 36 +-
virt/kvm/Kconfig | 3 +-
virt/kvm/guest_memfd.c | 457 +++++++++++++++++--
virt/kvm/kvm_main.c | 82 +++-
26 files changed, 1636 insertions(+), 188 deletions(-)
---
base-commit: 6d35786de28116ecf78797a62b84e6bf3c45aa5a
change-id: 20260225-gmem-inplace-conversion-bd0dbd39753a
Best regards,
--
Ackerley Tng <ackerleytng@google.com>
^ permalink raw reply
* Re: [PATCH v2 1/4] x86/tdx: Use PFN directly for mapping guest private memory
From: Sean Christopherson @ 2026-05-07 18:46 UTC (permalink / raw)
To: Yan Zhao
Cc: Xiaoyao Li, dave.hansen, pbonzini, tglx, mingo, bp, kas, x86,
linux-kernel, kvm, linux-coco, kai.huang, rick.p.edgecombe,
yilun.xu, vannapurve, ackerleytng, sagis, binbin.wu,
isaku.yamahata
In-Reply-To: <afxIc/0qNvwcr4VQ@yzhao56-desk.sh.intel.com>
On Thu, May 07, 2026, Yan Zhao wrote:
> On Thu, May 07, 2026 at 03:49:09PM +0800, Xiaoyao Li wrote:
> > On 4/30/2026 9:49 AM, Yan Zhao wrote:
> > There is on-going attempt to remove the direct map for guest_memfd. The good
> > news is TDX is excluded. [1]
> We can see if any code refinement is necessary if TDX is included in the future.
Yeah, I wouldn't worry too much about that effort. The onus will firmly be on
that series to do the right thing for TDX (and any other unique code).
^ permalink raw reply
* Re: [PATCH RFC v5 01/53] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
From: Ackerley Tng @ 2026-05-07 18:31 UTC (permalink / raw)
To: Liam R. Howlett
Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <byywwfin2aenobdwuesqihm6nzxyx6ecedwgbt7f5tvaaul6fi@u7bmexpavwdn>
"Liam R. Howlett" <liam@infradead.org> writes:
>
> [...snip...]
>
>> +static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
>> +{
>> + struct maple_tree *mt = &GMEM_I(inode)->attributes;
>> + void *entry = mtree_load(mt, index);
>> +
>> + /*
>> + * The lock _must_ be held for lookups, as some maple tree operations,
>> + * e.g. append, are unsafe (return inaccurate information) with respect
>> + * to concurrent RCU-protected lookups.
>> + */
>
> Can you please elaborate how you see inaccurate information and which
> information is inaccurate?
>
> Your comment is incorrect and misleading as append will not be used in
> rcu mode. Note that you have not set this tree up in rcu mode.
>
My bad. Thanks for clarifying about usage of rcu mode.
>> + lockdep_assert(mt_lock_is_held(mt));
>> +
In the next revision I'll remove this lockdep and use RCU mode, and
kvm_gmem_get_memory_attributes() should get a stable result.
The other lookups using mt_for_each() in kvm_gmem_range_has_attributes()
and kvm_gmem_get_invalidate_filter() retain the lockdep since those
operate over multiple ranges. Those are called from paths that require
holding the lock to exclude other operations anyway, so the lockdep
requirement does not cost anything more.
>> + return WARN_ON_ONCE(!entry) ? 0 : xa_to_value(entry);
>> +}
>> +
>>
>> [...snip...]
>>
^ permalink raw reply
* Re: [PATCH RFC v5 10/53] KVM: guest_memfd: Add basic support for KVM_SET_MEMORY_ATTRIBUTES2
From: Ackerley Tng @ 2026-05-07 16:56 UTC (permalink / raw)
To: Liam R. Howlett
Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <n5ce32wumzeiqqyqutom4apy2kqfetbvusc6j4k2xarsska5mw@klp5bmy7qhfm>
"Liam R. Howlett" <liam@infradead.org> writes:
> On 26/04/28 04:25PM, Ackerley Tng via B4 Relay wrote:
>>
>> [...snip...]
>>
>> +/*
>> + * Preallocate memory for attributes to be stored on a maple tree, pointed to
>> + * by mas. Adjacent ranges with attributes identical to the new attributes
>> + * will be merged. Also sets mas's bounds up for storing attributes.
>> + *
>> + * This maintains the invariant that ranges with the same attributes will
>> + * always be merged.
>> + */
>> +static int kvm_gmem_mas_preallocate(struct ma_state *mas, u64 attributes,
>> + pgoff_t start, size_t nr_pages)
>> +{
>> + pgoff_t end = start + nr_pages;
>> + pgoff_t last = end - 1;
>> + void *entry;
>> +
>> + /* Try extending range. entry is NULL on overflow/wrap-around. */
>> + mas_set_range(mas, end, end);
>> + entry = mas_find(mas, end);
Thank you for your reviews!
>
> Please read the documentation as I believe you have a bug here. What
> happens if there is another range stored higher than end + 1?
>
The invariant in this maple tree is that contiguous ranges with the same
attribute are stored as a single range.
The goal of this first part is to get the entry at the index just after
the requested range, and see what the attribute there is. If that
attribute is what we're about to set, extend the requested range for
storing to the end of that range.
If there is another range higher than end + 1, with the invariant
maintained, that attribute has to be different than the attribute stored
at end. Hence, we only want to extend this requested range up till end.
> Do you have testing of these functions somewhere?
>
GMEM_CONVERSION_MULTIPAGE_TEST_INIT_SHARED(indexing, 4) tests setting
attributes in ranges. If test_page is 2,
1. [0, 4) starts off shared (4 is the number of pages in the guest_memfd)
2. [2, 3) is converted to private
=> so the ranges should now be [0, 2), [2, 3), [3, 4)
3. [2, 3) is converted back to shared
=> so the ranges should now be [0, 4)
I verified this by inserting some trace_printk()s and inspecting manually.
>> + if (entry && xa_to_value(entry) == attributes)
>> + last = mas->last;
>> +
>> + if (start > 0) {
>> + mas_set_range(mas, start - 1, start - 1);
>> + entry = mas_find(mas, start - 1);
>> + if (entry && xa_to_value(entry) == attributes)
>> + start = mas->index;
>> + }
>> +
>> + mas_set_range(mas, start, last);
>> + return mas_preallocate(mas, xa_mk_value(attributes), GFP_KERNEL);
>> +}
>> +
>>
>> [...snip...]
>>
^ permalink raw reply
* Re: [PATCH v8 08/21] x86/virt/seamldr: Allocate and populate a module update request
From: Chao Gao @ 2026-05-07 13:19 UTC (permalink / raw)
To: Dave Hansen
Cc: kvm, linux-coco, linux-kernel, x86, binbin.wu, dave.hansen, djbw,
ira.weiny, kai.huang, kas, nik.borisov, paulmck, pbonzini,
reinette.chatre, rick.p.edgecombe, sagis, seanjc, tony.lindgren,
vannapurve, vishal.l.verma, yilun.xu, xiaoyao.li, yan.y.zhao,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin
In-Reply-To: <a52c4701-c99d-48d5-9b63-8eb1c0e589f0@intel.com>
>> header is consumed solely by the kernel to extract the sigstruct and
>> module, so validate it before processing to protect the kernel ABI. The
>> sigstruct and module are passed to and validated by P-SEAMLDR, so don't
>> duplicate any validation in the kernel.
>>
>> Note: the sigstruct_pa field in SEAMLDR_PARAMS has been extended to
>> a 4-element array. The updated "SEAM Loader (SEAMLDR) Interface
>> Specification" will be published separately.
>
>These changelogs have all the right info, but I find them really hard to
>parse. For instance, if you're going to have a 'struct seamldr_params',
>then just stick with that name. Don't use the "SEAMLDR_PARAMS" name too.
>
>Start with the data structures:
>
>There are two important ABIs here:
>
>'struct tdx_blob' - the on-disk and in-memory format for a TDX
> module update image.
>'struct seamldr_params' - The in-memory ABI passed to the TDX module
> loader. Points to a single 'struct tdx_blob'
Thanks for the thorough review.
Your comments all make sense to me. I just want to confirm two points
below.
>> + /*
>> + * Don't care about user passing the wrong file, but protect
>> + * kernel ABI by preventing accepting garbage.
>> + */
>> + if (memcmp(blob->signature, "TDX-BLOB", 8))
>> + return ERR_PTR(-EINVAL);
>
>Is there really no helper in the kernel anywhere that can safely do the
>8-byte compare against two known-to-the-compiler 8-byte-wide fields
>without hard-coding the 8?
I couldn't find a helper that automatically derives the comparison
length from the operands. 'strcmp()' is not suitable here because
'blob->signature' is not NUL-terminated.
Do you mean just avoiding the hard-coded 8, e.g.
if (memcmp(blob->signature, "TDX-BLOB", sizeof(blob->signature)))
return ERR_PTR(-EINVAL);
or define the 'u8 signature[8]' as a u64 and compare it with a constant, like
/* Little-endian encoding of "TDX-BLOB" string */
#define TDX_IMAGE_SIGNATURE 0x424f4c422d584454ULL
if (blob->signature != TDX_IMAGE_SIGNATURE)
return ERR_PTR(-EINVAL);
>> + struct seamldr_params *params;
>> + int module_pg_cnt, sig_pg_cnt;
>> + const u8 *sig, *module;
>> + int i;
>> +
>> + params = (struct seamldr_params *)get_zeroed_page(GFP_KERNEL);
>> + if (!params)
>> + return ERR_PTR(-ENOMEM);
>
>kzalloc(PAGE_SIZE, GFP_KERNEL) will save you a cast.
I noticed that 'kzalloc_obj()' can be used here, which avoids spelling out
the size and GFP flags explicitly. So I ended up with:
params = kzalloc_obj(*params);
If you would prefer 'kzalloc(PAGE_SIZE, GFP_KERNEL)', I can switch to that.
^ permalink raw reply
* Re: [PATCH v2 1/4] x86/tdx: Use PFN directly for mapping guest private memory
From: Yan Zhao @ 2026-05-07 8:08 UTC (permalink / raw)
To: Xiaoyao Li
Cc: dave.hansen, pbonzini, seanjc, tglx, mingo, bp, kas, x86,
linux-kernel, kvm, linux-coco, kai.huang, rick.p.edgecombe,
yilun.xu, vannapurve, ackerleytng, sagis, binbin.wu,
isaku.yamahata
In-Reply-To: <28c8d380-d2e5-4f70-96b0-00225e0ea86d@intel.com>
On Thu, May 07, 2026 at 03:49:09PM +0800, Xiaoyao Li wrote:
> On 4/30/2026 9:49 AM, Yan Zhao wrote:
> > From: Sean Christopherson <seanjc@google.com>
> >
> > Remove struct page assumptions/constraints in the SEAMCALL wrapper APIs for
> > mapping guest private memory and have them take PFN directly.
> >
> > Having core TDX make assumptions that guest private memory must be backed
> > by struct page (and/or folio) will create subtle dependencies on how
> > KVM/guest_memfd allocates/manages memory (e.g., whether it uses memory
> > allocated from core MM, if the memory is refcounted, or if the folio is
> > split) that are easily avoided. [1].
> >
> > KVM's MMUs work with PFNs. This is very much an intentional design choice.
> > It ensures that the KVM MMUs remain flexible and are not too tied to the
> > regular CPU MMUs and the kernel code around them. Using 'struct page' for
> > TDX guest memory is not a good fit anywhere near the KVM MMU code [2].
> >
> > Use "kvm_pfn_t pfn" for type safety. Using this KVM type is appropriate
> > since APIs tdh_mem_page_add() and tdh_mem_page_aug() are exported to KVM
> > only.
> >
> > [ Yan: Replace "u64 pfn" with "kvm_pfn_t pfn" ]
> >
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> > Link: https://lore.kernel.org/all/aWgyhmTJphGQqO0Y@google.com [1]
> > Link: https://lore.kernel.org/all/ac7V0g2q2hN3dU5u@google.com [2]
>
> Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Thanks!
> > +static void tdx_clflush_pfn(kvm_pfn_t pfn)
> > +{
> > + clflush_cache_range(__va(PFN_PHYS(pfn)), PAGE_SIZE);
>
> If the pfn is not in the kernel direct map, we will get #PF, right?
Right.
There's no simple interface like pfn_range_is_mapped() that tells whether a PFN
is in the direct map once direct map removal is supported.
So, as PFNs not in the kernel direct map are unexpected for TDX, this series
leaves the #PF as is, which is obvious enough for debugging.
> There is an ongoing attempt to remove the direct map for guest_memfd. The good
> news is TDX is excluded. [1]
We can see if any code refinement is necessary if TDX is included in the future.
> [1] https://lore.kernel.org/all/20260410151746.61150-9-kalyazin@amazon.com/
^ permalink raw reply
* Re: [PATCH v2 4/4] x86/virt/tdx: Move mk_keyed_paddr() to tdx.c due to no external users
From: Xiaoyao Li @ 2026-05-07 8:07 UTC (permalink / raw)
To: Yan Zhao, dave.hansen, pbonzini, seanjc
Cc: tglx, mingo, bp, kas, x86, linux-kernel, kvm, linux-coco,
kai.huang, rick.p.edgecombe, yilun.xu, vannapurve, ackerleytng,
sagis, binbin.wu, isaku.yamahata
In-Reply-To: <20260430015014.24261-1-yan.y.zhao@intel.com>
On 4/30/2026 9:50 AM, Yan Zhao wrote:
> Move mk_keyed_paddr() from tdx.h to tdx.c to avoid unnecessary header
> inclusion and improve encapsulation since there are no users outside of
> tdx.c.
>
> No functional change intended.
Missing a new blank line.
> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
> ---
> arch/x86/include/asm/tdx.h | 6 ------
> arch/x86/virt/vmx/tdx/tdx.c | 6 ++++++
> 2 files changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> index 9c63deaa0e8f..503f9a3f46d6 100644
> --- a/arch/x86/include/asm/tdx.h
> +++ b/arch/x86/include/asm/tdx.h
> @@ -177,12 +177,6 @@ struct tdx_vp {
> struct page **tdcx_pages;
> };
>
> -static inline u64 mk_keyed_paddr(u16 hkid, kvm_pfn_t pfn)
> -{
> - /* KeyID bits are just above the physical address bits. */
> - return PFN_PHYS(pfn) | ((u64)hkid << boot_cpu_data.x86_phys_bits);
> -}
> -
> u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args);
> u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page);
> u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, kvm_pfn_t pfn, struct page *source,
> diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> index deb67e68f85f..967482ae3c80 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -1911,6 +1911,12 @@ u64 tdh_phymem_cache_wb(bool resume)
> }
> EXPORT_SYMBOL_FOR_KVM(tdh_phymem_cache_wb);
>
> +static inline u64 mk_keyed_paddr(u16 hkid, kvm_pfn_t pfn)
> +{
> + /* KeyID bits are just above the physical address bits. */
> + return PFN_PHYS(pfn) | ((u64)hkid << boot_cpu_data.x86_phys_bits);
> +}
> +
> u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td)
> {
> struct tdx_module_args args = {};
^ permalink raw reply
* Re: [PATCH v2 3/4] x86/tdx: Drop exported function tdx_quirk_reset_page()
From: Xiaoyao Li @ 2026-05-07 8:02 UTC (permalink / raw)
To: Yan Zhao, dave.hansen, pbonzini, seanjc
Cc: tglx, mingo, bp, kas, x86, linux-kernel, kvm, linux-coco,
kai.huang, rick.p.edgecombe, yilun.xu, vannapurve, ackerleytng,
sagis, binbin.wu, isaku.yamahata
In-Reply-To: <20260430015001.24242-1-yan.y.zhao@intel.com>
On 4/30/2026 9:50 AM, Yan Zhao wrote:
> KVM invokes tdx_quirk_reset_page() to reset TDX control pages (including
> S-EPT pages, TDR page, etc.), as all those pages are allocated by KVM TDX
> and thus always have struct page.
>
> However, it's also reasonable for KVM to reset those TDX control pages via
> tdx_quirk_reset_paddr() directly, eliminating the need to export two
> parallel APIs. Keeping tdx_quirk_reset_page() as a one-line helper in the
> header file is also unnecessary.
>
> No functional change intended.
>
> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> Suggested-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
> ---
> arch/x86/include/asm/tdx.h | 1 -
> arch/x86/kvm/vmx/tdx.c | 4 ++--
> arch/x86/virt/vmx/tdx/tdx.c | 6 ------
> 3 files changed, 2 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> index 65f7d874fb5a..9c63deaa0e8f 100644
> --- a/arch/x86/include/asm/tdx.h
> +++ b/arch/x86/include/asm/tdx.h
> @@ -153,7 +153,6 @@ int tdx_guest_keyid_alloc(void);
> u32 tdx_get_nr_guest_keyids(void);
> void tdx_guest_keyid_free(unsigned int keyid);
>
> -void tdx_quirk_reset_page(struct page *page);
> void tdx_quirk_reset_paddr(unsigned long base, unsigned long size);
>
> struct tdx_td {
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index a2aadc6d0174..9bd4fd748e2a 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -343,7 +343,7 @@ static int tdx_reclaim_page(struct page *page)
>
> r = __tdx_reclaim_page(page);
> if (!r)
> - tdx_quirk_reset_page(page);
> + tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE);
> return r;
> }
>
> @@ -597,7 +597,7 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm)
> if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
> return;
>
> - tdx_quirk_reset_page(kvm_tdx->td.tdr_page);
> + tdx_quirk_reset_paddr(page_to_phys(kvm_tdx->td.tdr_page), PAGE_SIZE);
>
> __free_page(kvm_tdx->td.tdr_page);
> kvm_tdx->td.tdr_page = NULL;
> diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> index e5a37ea2d4a0..deb67e68f85f 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -731,12 +731,6 @@ void tdx_quirk_reset_paddr(unsigned long base, unsigned long size)
> }
> EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_paddr);
>
> -void tdx_quirk_reset_page(struct page *page)
> -{
> - tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE);
> -}
> -EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_page);
> -
> static __init void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr)
>
> {
^ permalink raw reply
* Re: [PATCH v2 2/4] x86/tdx: Use PFN directly for unmapping guest private memory
From: Xiaoyao Li @ 2026-05-07 7:54 UTC (permalink / raw)
To: Yan Zhao, dave.hansen, pbonzini, seanjc
Cc: tglx, mingo, bp, kas, x86, linux-kernel, kvm, linux-coco,
kai.huang, rick.p.edgecombe, yilun.xu, vannapurve, ackerleytng,
sagis, binbin.wu, isaku.yamahata
In-Reply-To: <20260430014948.24226-1-yan.y.zhao@intel.com>
On 4/30/2026 9:49 AM, Yan Zhao wrote:
> From: Sean Christopherson<seanjc@google.com>
>
> Remove struct page assumptions/constraints in APIs for unmapping guest
> private memory and have them take physical address directly.
>
> Having core TDX make assumptions that guest private memory must be backed
> by struct page (and/or folio) will create subtle dependencies on how
> KVM/guest_memfd allocates/manages memory (e.g., whether it uses memory
> allocated from core MM, if the memory is refcounted, or if the folio is
> split) that are easily avoided. [1].
>
> KVM's MMUs work with PFNs. This is very much an intentional design choice.
> It ensures that the KVM MMUs remain flexible and are not too tightly tied
> to the regular CPU MMUs and the kernel code around them. Using
> "struct page" for TDX guest memory is not a good fit anywhere near the KVM
> MMU code [2].
>
> Therefore, for unmapping guest private memory: export
> tdx_quirk_reset_paddr() for direct KVM invocation, and convert the SEAMCALL
> wrapper API tdh_phymem_page_wbinvd_hkid() to take PFN as input (thus
> updating mk_keyed_paddr() and tdh_phymem_page_wbinvd_tdr()).
>
> Intentionally have KVM pass PAGE_SIZE (rather than KVM_HPAGE_SIZE(level))
> to tdx_quirk_reset_paddr() in tdx_sept_remove_private_spte() to avoid
> mixing in huge page changes. The KVM_BUG_ON() check for !PG_LEVEL_4K in
> tdx_sept_remove_private_spte() justifies using PAGE_SIZE.
>
> Do not convert tdx_reclaim_page() to use PFN as input since it currently
> does not remove guest private memory.
>
> Use "kvm_pfn_t pfn" for type safety. Using this KVM type is appropriate
> since APIs tdh_phymem_page_wbinvd_hkid() and tdx_quirk_reset_paddr() are
> exported to KVM only.
>
> [Yan: Use kvm_pfn_t, exclude tdx_reclaim_page(), use tdx_quirk_reset_paddr()]
>
> Signed-off-by: Sean Christopherson<seanjc@google.com>
> Signed-off-by: Yan Zhao<yan.y.zhao@intel.com>
> Link:https://lore.kernel.org/all/aWgyhmTJphGQqO0Y@google.com [1]
> Link:https://lore.kernel.org/all/ac7V0g2q2hN3dU5u@google.com [2]
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
^ permalink raw reply
* Re: [PATCH v2 1/4] x86/tdx: Use PFN directly for mapping guest private memory
From: Xiaoyao Li @ 2026-05-07 7:49 UTC (permalink / raw)
To: Yan Zhao, dave.hansen, pbonzini, seanjc
Cc: tglx, mingo, bp, kas, x86, linux-kernel, kvm, linux-coco,
kai.huang, rick.p.edgecombe, yilun.xu, vannapurve, ackerleytng,
sagis, binbin.wu, isaku.yamahata
In-Reply-To: <20260430014929.24210-1-yan.y.zhao@intel.com>
On 4/30/2026 9:49 AM, Yan Zhao wrote:
> From: Sean Christopherson <seanjc@google.com>
>
> Remove struct page assumptions/constraints in the SEAMCALL wrapper APIs for
> mapping guest private memory and have them take PFN directly.
>
> Having core TDX make assumptions that guest private memory must be backed
> by struct page (and/or folio) will create subtle dependencies on how
> KVM/guest_memfd allocates/manages memory (e.g., whether it uses memory
> allocated from core MM, if the memory is refcounted, or if the folio is
> split) that are easily avoided. [1].
>
> KVM's MMUs work with PFNs. This is very much an intentional design choice.
> It ensures that the KVM MMUs remain flexible and are not too tied to the
> regular CPU MMUs and the kernel code around them. Using 'struct page' for
> TDX guest memory is not a good fit anywhere near the KVM MMU code [2].
>
> Use "kvm_pfn_t pfn" for type safety. Using this KVM type is appropriate
> since APIs tdh_mem_page_add() and tdh_mem_page_aug() are exported to KVM
> only.
>
> [ Yan: Replace "u64 pfn" with "kvm_pfn_t pfn" ]
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> Link: https://lore.kernel.org/all/aWgyhmTJphGQqO0Y@google.com [1]
> Link: https://lore.kernel.org/all/ac7V0g2q2hN3dU5u@google.com [2]
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
...
> +static void tdx_clflush_pfn(kvm_pfn_t pfn)
> +{
> + clflush_cache_range(__va(PFN_PHYS(pfn)), PAGE_SIZE);
If the pfn is not in the kernel direct map, we will get #PF, right?
There is an ongoing attempt to remove the direct map for guest_memfd. The
good news is TDX is excluded. [1]
[1] https://lore.kernel.org/all/20260410151746.61150-9-kalyazin@amazon.com/
^ permalink raw reply
* Re: [RFC PATCH 04/12] vfio/pci: Allow MMIO regions to be exported through dma-buf
From: Alexey Kardashevskiy @ 2026-05-07 7:16 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Xu Yilun, kvm, dri-devel, linux-media, linaro-mm-sig,
sumit.semwal, christian.koenig, pbonzini, seanjc, alex.williamson,
vivek.kasireddy, dan.j.williams, yilun.xu, linux-coco,
linux-kernel, lukas, yan.y.zhao, daniel.vetter, leon, baolu.lu,
zhenzhong.duan, tao1.su
In-Reply-To: <afs/Jamxnj6GGFfM@nvidia.com>
On 6/5/26 23:16, Jason Gunthorpe wrote:
> On Wed, May 06, 2026 at 12:35:42PM +1000, Alexey Kardashevskiy wrote:
>> Hi!
>>
>> Let's reignite this topic.
>>
>> I've been using these patches + QEMU side hacks for 6+ months. And it's been fine until I got a device where the MSIX BAR is in the middle of another BAR marked as TEE in the TDISP interface report. And no trusted MSIX yet.
>>
>> Every time QEMU mmaps a BAR - I request a dmabuf fd from VFIO in QEMU. Since mapping of an entire MSIX BAR is allowed by default, VFIORegion::nr_mmaps==1 and it is an entire BAR.
>>
>> Problem: KVM memslot mismatches the dmabuf fd size
>
> Huh? kvm does not care about dmabuf at all? Are you running other
> patches to hook kvm and dmabuf?
yup, 06/12 of this patchset.
> Putting a slice in a dmabuf is a well understood need for MSI, so I
> expect whatever kvm dmabuf interface that gets merged to accommodate
> this?
good to know.
>> Solution2: modify logic in VFIO dmabuf to allow multiple KVM memory
>> slots per dmabuf. Now it is kvm_memory_slot::dmabuf_attach with no
>> offset into the dmabuf and one kvm_vfio_dmabuf per dma_buf.
>
> Yes, when kvm learns to take in a dmabuf it needs to take in a slice,
> not the whole buf. Or you need to create multiple dmabufs with the
> necessary slices from the VFIO. The upstream vfio dmabuf creation
> allows creating it with a slice.
True, but either way dmabuf slicing will be directed by QEMU's msix-table emulation MR, and this slicing needs to match the TDISP report, so I'll have to teach QEMU about these reports, right? I am worried that I am missing something obvious, again. Thanks,
ps. I like nntp.lore.kernel.org very much for ability to dig out old stuff and then just reply to it :)
>
> Jason
--
Alexey
^ permalink raw reply
* Re: [PATCH RFC v5 10/53] KVM: guest_memfd: Add basic support for KVM_SET_MEMORY_ATTRIBUTES2
From: Liam R. Howlett @ 2026-05-07 3:46 UTC (permalink / raw)
To: ackerleytng
Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <20260428-gmem-inplace-conversion-v5-10-d8608ccfca22@google.com>
On 26/04/28 04:25PM, Ackerley Tng via B4 Relay wrote:
> From: Ackerley Tng <ackerleytng@google.com>
>
> Introduce basic support for KVM_SET_MEMORY_ATTRIBUTES2 in guest_memfd,
> which just updates attributes tracked by guest_memfd.
>
> Validate input fields in general. Guard usage of KVM_SET_MEMORY_ATTRIBUTES2
> by making sure requested attributes are supported for this instance of kvm.
>
> A new KVM_SET_MEMORY_ATTRIBUTES2 is defined to support writes (unlike
> KVM_SET_MEMORY_ATTRIBUTES) in addition to reads so it can provide error
> details to userspace. This will be used in a later patch.
>
> The two ioctls use their corresponding structs with no overlap, but
> backward compatibility is baked in for future support of
> KVM_SET_MEMORY_ATTRIBUTES2 and struct kvm_memory_attributes2 in the VM
> ioctl.
>
> The process of setting memory attributes is set up such that the later half
> will not fail due to allocation. Any necessary checks are performed before
> the point of no return.
>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> Co-developed-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> include/uapi/linux/kvm.h | 13 ++++++
> virt/kvm/Kconfig | 1 +
> virt/kvm/guest_memfd.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 12 +++++
> 4 files changed, 140 insertions(+)
>
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 6c8afa2047bf3..e6bbf68a83813 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1648,6 +1648,19 @@ struct kvm_memory_attributes {
> __u64 flags;
> };
>
> +#define KVM_SET_MEMORY_ATTRIBUTES2 _IOWR(KVMIO, 0xd2, struct kvm_memory_attributes2)
> +
> +struct kvm_memory_attributes2 {
> + union {
> + __u64 address;
> + __u64 offset;
> + };
> + __u64 size;
> + __u64 attributes;
> + __u64 flags;
> + __u64 reserved[12];
> +};
> +
> #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3)
>
> #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd)
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 3fea89c45cfb4..e371e079e2c50 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -109,6 +109,7 @@ config KVM_VM_MEMORY_ATTRIBUTES
>
> config KVM_GUEST_MEMFD
> select XARRAY_MULTI
> + select KVM_MEMORY_ATTRIBUTES
> bool
>
> config HAVE_KVM_ARCH_GMEM_PREPARE
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 506219e2359eb..9a26eca717047 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -552,11 +552,125 @@ unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
> }
> EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_memory_attributes);
>
> +/*
> + * Preallocate memory for attributes to be stored on a maple tree, pointed to
> + * by mas. Adjacent ranges with attributes identical to the new attributes
> + * will be merged. Also sets mas's bounds up for storing attributes.
> + *
> + * This maintains the invariant that ranges with the same attributes will
> + * always be merged.
> + */
> +static int kvm_gmem_mas_preallocate(struct ma_state *mas, u64 attributes,
> + pgoff_t start, size_t nr_pages)
> +{
> + pgoff_t end = start + nr_pages;
> + pgoff_t last = end - 1;
> + void *entry;
> +
> + /* Try extending range. entry is NULL on overflow/wrap-around. */
> + mas_set_range(mas, end, end);
> + entry = mas_find(mas, end);
Please read the documentation as I believe you have a bug here. What
happens if there is another range stored higher than end + 1?
Do you have testing of these functions somewhere?
> + if (entry && xa_to_value(entry) == attributes)
> + last = mas->last;
> +
> + if (start > 0) {
> + mas_set_range(mas, start - 1, start - 1);
> + entry = mas_find(mas, start - 1);
> + if (entry && xa_to_value(entry) == attributes)
> + start = mas->index;
> + }
> +
> + mas_set_range(mas, start, last);
> + return mas_preallocate(mas, xa_mk_value(attributes), GFP_KERNEL);
> +}
> +
> +static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
> + size_t nr_pages, uint64_t attrs)
> +{
> + struct address_space *mapping = inode->i_mapping;
> + struct gmem_inode *gi = GMEM_I(inode);
> + pgoff_t end = start + nr_pages;
> + struct maple_tree *mt;
> + struct ma_state mas;
> + int r;
> +
> + mt = &gi->attributes;
> +
> + filemap_invalidate_lock(mapping);
> +
> + mas_init(&mas, mt, start);
> + r = kvm_gmem_mas_preallocate(&mas, attrs, start, nr_pages);
> + if (r)
> + goto out;
> +
> + /*
> + * From this point on guest_memfd has performed necessary
> + * checks and can proceed to do guest-breaking changes.
> + */
> +
> + kvm_gmem_invalidate_begin(inode, start, end);
> + mas_store_prealloc(&mas, xa_mk_value(attrs));
> + kvm_gmem_invalidate_end(inode, start, end);
> +out:
> + filemap_invalidate_unlock(mapping);
> + return r;
> +}
> +
> +static long kvm_gmem_set_attributes(struct file *file, void __user *argp)
> +{
> + struct gmem_file *f = file->private_data;
> + struct inode *inode = file_inode(file);
> + struct kvm_memory_attributes2 attrs;
> + size_t nr_pages;
> + pgoff_t index;
> + int i;
> +
> + if (copy_from_user(&attrs, argp, sizeof(attrs)))
> + return -EFAULT;
> +
> + if (attrs.flags)
> + return -EINVAL;
> + for (i = 0; i < ARRAY_SIZE(attrs.reserved); i++) {
> + if (attrs.reserved[i])
> + return -EINVAL;
> + }
> + if (attrs.attributes & ~kvm_supported_mem_attributes(f->kvm))
> + return -EINVAL;
> + if (attrs.size == 0 || attrs.offset + attrs.size < attrs.offset)
> + return -EINVAL;
> + if (!PAGE_ALIGNED(attrs.offset) || !PAGE_ALIGNED(attrs.size))
> + return -EINVAL;
> +
> + if (attrs.offset >= i_size_read(inode) ||
> + attrs.offset + attrs.size > i_size_read(inode))
> + return -EINVAL;
> +
> + nr_pages = attrs.size >> PAGE_SHIFT;
> + index = attrs.offset >> PAGE_SHIFT;
> + return __kvm_gmem_set_attributes(inode, index, nr_pages,
> + attrs.attributes);
> +}
> +
> +static long kvm_gmem_ioctl(struct file *file, unsigned int ioctl,
> + unsigned long arg)
> +{
> + switch (ioctl) {
> + case KVM_SET_MEMORY_ATTRIBUTES2:
> + if (vm_memory_attributes)
> + return -ENOTTY;
> +
> + return kvm_gmem_set_attributes(file, (void __user *)arg);
> + default:
> + return -ENOTTY;
> + }
> +}
> +
> static struct file_operations kvm_gmem_fops = {
> .mmap = kvm_gmem_mmap,
> .open = generic_file_open,
> .release = kvm_gmem_release,
> .fallocate = kvm_gmem_fallocate,
> + .unlocked_ioctl = kvm_gmem_ioctl,
> };
>
> static int kvm_gmem_migrate_folio(struct address_space *mapping,
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index ff20e63143642..4d7bf52b7b717 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -110,6 +110,18 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_KEY(__kvm_get_memory_attributes));
> EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_TRAMP(__kvm_get_memory_attributes));
> #endif
>
> +#define MEMORY_ATTRIBUTES_MATCH(one, two) \
> + static_assert(offsetof(struct kvm_memory_attributes, one) == \
> + offsetof(struct kvm_memory_attributes2, two)); \
> + static_assert(sizeof_field(struct kvm_memory_attributes, one) ==\
> + sizeof_field(struct kvm_memory_attributes2, two))
> +
> +/* Ensure the common parts of the two structs are identical. */
> +MEMORY_ATTRIBUTES_MATCH(address, address);
> +MEMORY_ATTRIBUTES_MATCH(size, size);
> +MEMORY_ATTRIBUTES_MATCH(attributes, attributes);
> +MEMORY_ATTRIBUTES_MATCH(flags, flags);
> +
> /*
> * Ordering of locks:
> *
>
> --
> 2.54.0.545.g6539524ca2-goog
>
>
>
^ permalink raw reply
* Re: [PATCH RFC v5 01/53] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
From: Liam R. Howlett @ 2026-05-07 3:34 UTC (permalink / raw)
To: ackerleytng
Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <20260428-gmem-inplace-conversion-v5-1-d8608ccfca22@google.com>
On 26/04/28 04:24PM, Ackerley Tng via B4 Relay wrote:
> From: Sean Christopherson <seanjc@google.com>
>
> Start plumbing in guest_memfd support for in-place private<=>shared
> conversions by tracking attributes via a maple tree. KVM currently tracks
> private vs. shared attributes on a per-VM basis, which made sense when a
> guest_memfd _only_ supported private memory, but tracking per-VM simply
> can't work for in-place conversions as the shareability of a given page
> needs to be per-gmem_inode, not per-VM.
>
> Use the filemap invalidation lock to protect the maple tree, as taking the
> lock for read when faulting in memory (for userspace or the guest) isn't
> expected to result in meaningful contention, and using a separate lock
> would add significant complexity (avoiding deadlock is quite difficult).
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Co-developed-by: Ackerley Tng <ackerleytng@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> Co-developed-by: Fuad Tabba <tabba@google.com>
> Signed-off-by: Fuad Tabba <tabba@google.com>
> ---
> virt/kvm/guest_memfd.c | 139 +++++++++++++++++++++++++++++++++++++++++++------
> 1 file changed, 123 insertions(+), 16 deletions(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 69c9d6d546b28..17e5a23fec0a1 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -4,6 +4,7 @@
> #include <linux/falloc.h>
> #include <linux/fs.h>
> #include <linux/kvm_host.h>
> +#include <linux/maple_tree.h>
> #include <linux/mempolicy.h>
> #include <linux/pseudo_fs.h>
> #include <linux/pagemap.h>
> @@ -33,6 +34,12 @@ struct gmem_inode {
> struct list_head gmem_file_list;
>
> u64 flags;
> + /*
> + * Every index in this inode, whether memory is populated or
> + * not, is tracked in attributes. There are no gaps in this
> + * maple tree.
> + */
> + struct maple_tree attributes;
> };
>
> static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
> @@ -60,6 +67,31 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
> return gfn - slot->base_gfn + slot->gmem.pgoff;
> }
>
> +static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
> +{
> + struct maple_tree *mt = &GMEM_I(inode)->attributes;
> + void *entry = mtree_load(mt, index);
> +
> + /*
> + * The lock _must_ be held for lookups, as some maple tree operations,
> + * e.g. append, are unsafe (return inaccurate information) with respect
> + * to concurrent RCU-protected lookups.
> + */
Can you please elaborate how you see inaccurate information and which
information is inaccurate?
Your comment is incorrect and misleading as append will not be used in
rcu mode. Note that you have not set this tree up in rcu mode.
> + lockdep_assert(mt_lock_is_held(mt));
> +
> + return WARN_ON_ONCE(!entry) ? 0 : xa_to_value(entry);
> +}
> +
> +static bool kvm_gmem_is_private_mem(struct inode *inode, pgoff_t index)
> +{
> + return kvm_gmem_get_attributes(inode, index) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> +}
> +
> +static bool kvm_gmem_is_shared_mem(struct inode *inode, pgoff_t index)
> +{
> + return !kvm_gmem_is_private_mem(inode, index);
> +}
> +
> static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> pgoff_t index, struct folio *folio)
> {
> @@ -397,10 +429,13 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
> if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
> return VM_FAULT_SIGBUS;
>
> - if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
> - return VM_FAULT_SIGBUS;
> + filemap_invalidate_lock_shared(inode->i_mapping);
> + if (kvm_gmem_is_shared_mem(inode, vmf->pgoff))
> + folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> + else
> + folio = ERR_PTR(-EACCES);
> + filemap_invalidate_unlock_shared(inode->i_mapping);
>
> - folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> if (IS_ERR(folio)) {
> if (PTR_ERR(folio) == -EAGAIN)
> return VM_FAULT_RETRY;
> @@ -556,6 +591,51 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
> return true;
> }
>
> +static int kvm_gmem_init_inode(struct inode *inode, loff_t size, u64 flags)
> +{
> + struct gmem_inode *gi = GMEM_I(inode);
> + MA_STATE(mas, &gi->attributes, 0, (size >> PAGE_SHIFT) - 1);
> + u64 attrs;
> + int r;
> +
> + inode->i_op = &kvm_gmem_iops;
> + inode->i_mapping->a_ops = &kvm_gmem_aops;
> + inode->i_mode |= S_IFREG;
> + inode->i_size = size;
> + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> +
> + /*
> + * guest_memfd memory is neither migratable nor swappable: set
> + * inaccessible to gate off both.
> + */
> + mapping_set_inaccessible(inode->i_mapping);
> + WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
> +
> + gi->flags = flags;
> +
> + mt_set_external_lock(&gi->attributes,
> + &inode->i_mapping->invalidate_lock);
> +
> + /*
> + * Store default attributes for the entire gmem instance. Ensuring every
> + * index is represented in the maple tree at all times simplifies the
> + * conversion and merging logic.
> + */
> + attrs = gi->flags & GUEST_MEMFD_FLAG_INIT_SHARED ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE;
> +
> + /*
> + * Acquire the invalidation lock purely to make lockdep happy. The
> + * maple tree library expects all stores to be protected via the lock,
> + * and the library can't know when the tree is reachable only by the
> + * caller, as is the case here.
> + */
> + filemap_invalidate_lock(inode->i_mapping);
> + r = mas_store_gfp(&mas, xa_mk_value(attrs), GFP_KERNEL);
> + filemap_invalidate_unlock(inode->i_mapping);
> +
> + return r;
> +}
> +
> static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> {
> static const char *name = "[kvm-gmem]";
> @@ -586,16 +666,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> goto err_fops;
> }
>
> - inode->i_op = &kvm_gmem_iops;
> - inode->i_mapping->a_ops = &kvm_gmem_aops;
> - inode->i_mode |= S_IFREG;
> - inode->i_size = size;
> - mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> - mapping_set_inaccessible(inode->i_mapping);
> - /* Unmovable mappings are supposed to be marked unevictable as well. */
> - WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
> -
> - GMEM_I(inode)->flags = flags;
> + err = kvm_gmem_init_inode(inode, size, flags);
> + if (err)
> + goto err_inode;
>
> file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
> if (IS_ERR(file)) {
> @@ -797,9 +870,13 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> if (!file)
> return -EFAULT;
>
> + filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
> +
> folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
> - if (IS_ERR(folio))
> - return PTR_ERR(folio);
> + if (IS_ERR(folio)) {
> + r = PTR_ERR(folio);
> + goto out;
> + }
>
> if (!folio_test_uptodate(folio)) {
> clear_highpage(folio_page(folio, 0));
> @@ -815,6 +892,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> else
> folio_put(folio);
>
> +out:
> + filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
> return r;
> }
> EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
> @@ -944,6 +1023,15 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
>
> mpol_shared_policy_init(&gi->policy, NULL);
>
> + /*
> + * Memory attributes are protected by the filemap invalidation lock, but
> + * the lock structure isn't available at this time. Immediately mark
> + * maple tree as using external locking so that accessing the tree
> + * before it's fully initialized results in NULL pointer dereferences
> + * and not more subtle bugs.
> + */
> + mt_init_flags(&gi->attributes, MT_FLAGS_LOCK_EXTERN);
> +
> gi->flags = 0;
> INIT_LIST_HEAD(&gi->gmem_file_list);
> return &gi->vfs_inode;
> @@ -951,7 +1039,26 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
>
> static void kvm_gmem_destroy_inode(struct inode *inode)
> {
> - mpol_free_shared_policy(&GMEM_I(inode)->policy);
> + struct gmem_inode *gi = GMEM_I(inode);
> +
> + mpol_free_shared_policy(&gi->policy);
> +
> + /*
> + * Note! Checking for an empty tree is functionally necessary
> + * to avoid explosions if the tree hasn't been fully
> + * initialized, i.e. if the inode is being destroyed before
> + * guest_memfd can set the external lock, lockdep would find
> + * that the tree's internal ma_lock was not held.
> + */
> + if (!mtree_empty(&gi->attributes)) {
> + /*
> + * Acquire the invalidation lock purely to make lockdep happy,
> + * the inode is unreachable at this point.
> + */
> + filemap_invalidate_lock(inode->i_mapping);
> + __mt_destroy(&gi->attributes);
> + filemap_invalidate_unlock(inode->i_mapping);
> + }
> }
>
> static void kvm_gmem_free_inode(struct inode *inode)
>
> --
> 2.54.0.545.g6539524ca2-goog
>
>
>
^ permalink raw reply
* Re: [PATCH v8 10/21] x86/virt/seamldr: Shut down the current TDX module
From: Dave Hansen @ 2026-05-06 20:49 UTC (permalink / raw)
To: Chao Gao
Cc: kvm, linux-coco, linux-kernel, x86, binbin.wu, dave.hansen, djbw,
ira.weiny, kai.huang, kas, nik.borisov, paulmck, pbonzini,
reinette.chatre, rick.p.edgecombe, sagis, seanjc, tony.lindgren,
vannapurve, vishal.l.verma, yilun.xu, xiaoyao.li, yan.y.zhao,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin
In-Reply-To: <afqtylwGsoLsp/0f@intel.com>
On 5/5/26 19:56, Chao Gao wrote:
> On Thu, Apr 30, 2026 at 11:52:50AM -0700, Dave Hansen wrote:
>> On 4/27/26 08:28, Chao Gao wrote:
>>> static int do_seamldr_install_module(void *seamldr_params)
>>> {
>>> enum module_update_state newstate, curstate = MODULE_UPDATE_START;
>>> + int cpu = smp_processor_id();
>>> + bool primary;
>>> int ret = 0;
>>>
>>> + primary = cpumask_first(cpu_online_mask) == cpu;
>> Isn't cpumask_first(cpu_online_mask)==0, always? I thought CPU 0 could
>> never be offlined.
> Not always. On x86, CPU 0 can be offlined at runtime if the kernel is booted
> with the cpu0_hotplug command-line option. See cpu_hotplug.rst.
See e59e74dc48a309cb848ffc3d76a0d61aa6803c05.
Yes, the docs are stale.
^ permalink raw reply
* Re: [PATCH v8 09/21] x86/virt/seamldr: Introduce skeleton for TDX module updates
From: Dave Hansen @ 2026-05-06 20:43 UTC (permalink / raw)
To: Chao Gao
Cc: kvm, linux-coco, linux-kernel, x86, binbin.wu, dave.hansen, djbw,
ira.weiny, kai.huang, kas, nik.borisov, paulmck, pbonzini,
reinette.chatre, rick.p.edgecombe, sagis, seanjc, tony.lindgren,
vannapurve, vishal.l.verma, yilun.xu, xiaoyao.li, yan.y.zhao,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin
In-Reply-To: <afs7UATlAuDehCCB@intel.com>
On 5/6/26 06:00, Chao Gao wrote:
> On Thu, Apr 30, 2026 at 01:03:53PM -0700, Dave Hansen wrote:
>> On 4/27/26 08:28, Chao Gao wrote:
>>> +static struct {
>>> + enum module_update_state state;
>>> + int thread_ack;
>>
>> multi_stop_data has an atomic_t for this.
>>
>> You have an int.
>>
>> Which one is right?
>
> You pointed out that using atomic_t and memory barriers for synchronization was
> overly complicated. So, I switched to use a spinlock, and thread_ack can now be
> a plain int.
Good point.
Could you make this a bit more obvious, please?
I honestly don't like guard(). Maybe I'm old-fashioned, but I really,
really like critical sections to be, well, explicit *sections* of code.
Second, you have two functions defined next to each other with similar
names:
static void ack_state(void)
static void set_target_state(enum module_update_state state)
Both of which manipulate the same data. One takes the lock. One doesn't.
That could be fixed with comments.
^ permalink raw reply
* Re: SVSM Development Call May 6th, 2026
From: Jörg Rödel @ 2026-05-06 18:10 UTC (permalink / raw)
To: coconut-svsm, linux-coco
In-Reply-To: <45x7yirnjkd374loz5ymizxdyccuytg7rr46jvgh33gl7c2xwo@vps5xitk2umx>
Added meeting minutes to the pending PR from last week:
https://github.com/coconut-svsm/governance/pull/106
-Joerg
^ permalink raw reply
* Re: [PATCH v4 3/3] coco: guest: arm64: Query host IPA-change alignment via RHI
From: Aneesh Kumar K.V @ 2026-05-06 14:23 UTC (permalink / raw)
To: Marc Zyngier
Cc: linux-kernel, iommu, linux-coco, linux-arm-kernel, kvmarm,
Catalin Marinas, Jason Gunthorpe, Marek Szyprowski, Robin Murphy,
Steven Price, Suzuki K Poulose, Thomas Gleixner, Will Deacon
In-Reply-To: <86tssvyz2v.wl-maz@kernel.org>
Marc Zyngier <maz@kernel.org> writes:
> On Tue, 28 Apr 2026 13:49:46 +0100,
> Aneesh Kumar K.V <aneesh.kumar@kernel.org> wrote:
>>
>> Marc Zyngier <maz@kernel.org> writes:
>>
>> > On Mon, 27 Apr 2026 07:31:08 +0100,
>> > "Aneesh Kumar K.V (Arm)" <aneesh.kumar@kernel.org> wrote:
>> >>
>> >> Add the Realm Host Interface support needed to query host configuration
>> >> from a Realm guest. Define the RHI hostconf SMCs, add rsi_host_call(), and
>> >> use them during Realm initialization to retrieve the host IPA-change
>> >> alignment size.
>> >
>> > I don't understand what "IPA-change" means. What you are after is the
>> > host's sharing granule size.
>> >
>>
>> This is part of the RHI specification, and the call is named
>> RHI_HOSTCONF_GET_IPA_CHANGE_ALIGNMENT. The intent is to determine the
>> alignment requirements for changing IPA attributes (protected vs.
>> unprotected IPA).
>
> This really is a terrible name. Why the 'change' part? It doesn't
> change, it is a constant.
>
How about rhi_get_host_sharing_alignment()? or can you suggest a better
name I can switch to?
> Oh well...
>
> [...]
>
>> >> +static inline unsigned long rsi_host_call(struct rsi_host_call *rhi_call)
>> >> +{
>> >> + phys_addr_t addr = virt_to_phys(rhi_call);
>> >> + struct arm_smccc_res res;
>> >> +
>> >> + arm_smccc_1_1_invoke(SMC_RSI_HOST_CALL, addr, &res);
>> >
>> > Errr... What guarantees that *rhi_call is *IPA contiguous*? This is
>> > incredibly fragile. You should at the very least check that this isn't
>> > vmalloc'd.
>> >
>>
>>
>> I didn’t quite follow that. We have other RSI calls (even RMI calls)
>> that do similar things, and the caller understands that the address
>> should be IPA-contiguous.
>
> Does it? Where is it documented? All you get is a pointer, so all
> bets are off.
>
We have multiple RMI and RSI calls that take IPA values; see asm/rmi_cmds.h
and asm/rsi_cmds.h. Some of them take phys_addr_t while others take
unsigned long. The spec describes these as 64-bit values. Maybe we
should switch them all to u64. x86 is also having a similar discussion:
https://lore.kernel.org/all/afOrd7JYkUfe7wcZ@google.com
>
>> Are you suggesting that all RSI calls should
>> add checks for this?. or are you suggesting to update the API to
>>
>> unsigned long rsi_host_call(unsigned long rhi_call_phys) ?
>
> I'm suggesting that this API is subtly broken because it makes random
> assumptions about the physical contiguity of the VA space. It does so
> without any check, without any documentation.
>
> Simply changing the parameter to phys_addr_t could at the very least
> capture some of the requirements, but I'd like something in big bold
> letters.
>
virt_to_phys() emits a WARN if the address is not part of the linear
map. Are you suggesting that we should add additional checks to the call
sites that pass such addresses?
Sorry, it’s still not clear to me how you want these calls to be
updated.
The pattern I’ve been following is:
Lower-level calls that use arm_smccc_1_1_invoke() take parameters as
unsigned long. I initially wanted to switch this to u64, but since
kvm/rmi.c uses unsigned long, it was decided to keep it consistent.
This approach is used in cases where the same argument is passed across
multiple calls, for example:
phys_addr_t rd_phys = virt_to_phys(realm->rd);
rmi_vdev_create(rd_phys, ...);
rmi_vdev_lock(rd_phys, ...);
For calls like rsi_host_call(), I chose to pass a struct pointer to
maintain better type safety:
static inline unsigned long rsi_host_call(struct rsi_host_call *rhi_call)
{
phys_addr_t addr = virt_to_phys(rhi_call);
arm_smccc_1_1_invoke(SMC_RSI_HOST_CALL, addr, &res);
}
Note that virt_to_phys() will WARN if the address is not part of the
linear map.
Could you clarify what changes you would like to see in these
interfaces?
-aneesh
^ permalink raw reply
* Re: [RFC PATCH 04/12] vfio/pci: Allow MMIO regions to be exported through dma-buf
From: Jason Gunthorpe @ 2026-05-06 13:16 UTC (permalink / raw)
To: Alexey Kardashevskiy
Cc: Xu Yilun, kvm, dri-devel, linux-media, linaro-mm-sig,
sumit.semwal, christian.koenig, pbonzini, seanjc, alex.williamson,
vivek.kasireddy, dan.j.williams, yilun.xu, linux-coco,
linux-kernel, lukas, yan.y.zhao, daniel.vetter, leon, baolu.lu,
zhenzhong.duan, tao1.su
In-Reply-To: <c0b160f8-2930-4158-9e50-b4cc4209e2ca@amd.com>
On Wed, May 06, 2026 at 12:35:42PM +1000, Alexey Kardashevskiy wrote:
> Hi!
>
> Let's reignite this topic.
>
> I've been using these patches + QEMU side hacks for 6+ months. And it's been fine until I got a device where MSIX BAR is in the middle of another BAR marked as TEE in the TDISP interface report. And no trusted MSIX yet.
>
> Every time QEMU mmaps a BAR - I request a dmabuf fd from VFIO in QEMU. Since mapping of an entire MSIX BAR is allowed by default, VFIORegion::nr_mmaps==1 and it is an entire BAR.
>
> Problem: KVM memslot mismatches the dmabuf fd size
Huh? kvm does not care about dmabuf at all? Are you running other
patches to hook kvm and dmabuf?
Putting a slice in a dmabuf is a well understood need for MSI, so I
expect whatever kvm dmabuf interface that gets merged to accommodate
this?
> Solution2: modify logic in VFIO dmabuf to allow multiple KVM memory
> slots per dmabuf. Now it is kvm_memory_slot::dmabuf_attach with no
> offset into the dmabuf and one kvm_vfio_dmabuf per dma_buf.
Yes, when kvm learns to take in a dmabuf it needs to take in a slice,
not the whole buf. Or you need to create multiple dmabufs with the
necessary slices from the VFIO. The upstream vfio dmabuf creation
allows creating it with a slice.
Jason
^ permalink raw reply
* Re: [PATCH v8 09/21] x86/virt/seamldr: Introduce skeleton for TDX module updates
From: Chao Gao @ 2026-05-06 13:00 UTC (permalink / raw)
To: Dave Hansen
Cc: kvm, linux-coco, linux-kernel, x86, binbin.wu, dave.hansen, djbw,
ira.weiny, kai.huang, kas, nik.borisov, paulmck, pbonzini,
reinette.chatre, rick.p.edgecombe, sagis, seanjc, tony.lindgren,
vannapurve, vishal.l.verma, yilun.xu, xiaoyao.li, yan.y.zhao,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin
In-Reply-To: <5dc70847-332d-496f-b0ab-03323eff7118@intel.com>
On Thu, Apr 30, 2026 at 01:03:53PM -0700, Dave Hansen wrote:
>On 4/27/26 08:28, Chao Gao wrote:
>> +static struct {
>> + enum module_update_state state;
>> + int thread_ack;
>
>multi_stop_data has an atomic_t for this.
>
>You have an int.
>
>Which one is right?
You pointed out that using atomic_t and memory barriers for synchronization was
overly complicated. So, I switched to use a spinlock, and thread_ack can now be
a plain int.
See https://lore.kernel.org/kvm/31936a20-929f-489a-9dc6-0f8fcb9307f1@intel.com/
^ permalink raw reply
* Re: [PATCH v8 15/21] x86/virt/tdx: Refresh TDX module version after update
From: Chao Gao @ 2026-05-06 12:51 UTC (permalink / raw)
To: Dave Hansen
Cc: kvm, linux-coco, linux-kernel, x86, binbin.wu, dave.hansen, djbw,
ira.weiny, kai.huang, kas, nik.borisov, paulmck, pbonzini,
reinette.chatre, rick.p.edgecombe, sagis, seanjc, tony.lindgren,
vannapurve, vishal.l.verma, yilun.xu, xiaoyao.li, yan.y.zhao,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin
In-Reply-To: <28be5180-ff74-4e4d-b392-7ba7a9b4c1c0@intel.com>
On Thu, Apr 30, 2026 at 12:14:32PM -0700, Dave Hansen wrote:
>On 4/27/26 08:28, Chao Gao wrote:
>> The kernel exposes the TDX module version through sysfs so userspace can
>> check update compatibility. That information needs to remain accurate
>> across runtime updates.
>>
>> A runtime update may change the module's update_version, so refresh the
>> cached version after a successful update and emit a log message to show
>> the version change.
>>
>> Drop __ro_after_init from tdx_sysinfo because it is now updated at runtime.
>>
>> Perform the refresh outside of stop_machine() since printk() within
>> stop_machine() would add significant latency.
>>
>> Do not refresh the rest of tdx_sysinfo. Refreshing them at runtime could
>> disrupt running software that relies on the previously reported values.
>
>This needs more explanation. I think the reasoning is quite nuanced.
>
>tdx_sysinfo is just a cache of what the TDX module is and can do. If
>that changes, it means the TDX module changed. So you somehow need to
>argue why it's OK to hide those changes from the tdx_sysinfo users.
>
>Why would they be confused by tdx_sysinfo changes but not by the TDX
>module *itself* changing?
Good point. The key assumption here is that module updates are fully backward
compatible, so running software can continue to work without seeing the new
tdx_sysinfo. I will revise the changelog. See below.
>
>> Note that major and minor versions are not refreshed because runtime updates
>> are supported only between releases with identical major and minor versions.
>
>I'd rather have this in code than a changelog comment.
>
>If they can't change then warn if they do.
Maybe I can just drop the note as I don't want to add code to preemptively
catch theoretical module bugs.
I added it because Sashiko pointed out that assigning the whole version struct
outside stop_machine() could allow sysfs readers to observe a partially updated
version. As we don't need to print new module version, I will move that
assignment into stop_machine(), which addresses that issue. After that, there
is no need to mention that major/minor versions are identical across updates.
>
>> diff --git a/arch/x86/virt/vmx/tdx/seamldr.c b/arch/x86/virt/vmx/tdx/seamldr.c
>> index 98a8d9d3ae25..c81b26c4bac1 100644
>> --- a/arch/x86/virt/vmx/tdx/seamldr.c
>> +++ b/arch/x86/virt/vmx/tdx/seamldr.c
>> @@ -306,6 +306,8 @@ DEFINE_FREE(free_seamldr_params, struct seamldr_params *,
>> */
>> int seamldr_install_module(const u8 *data, u32 size)
>> {
>> + int ret;
>> +
>> struct seamldr_params *params __free(free_seamldr_params) =
>> init_seamldr_params(data, size);
>> if (IS_ERR(params))
>> @@ -314,6 +316,10 @@ int seamldr_install_module(const u8 *data, u32 size)
>> /* Ensure a stable set of online CPUs for the update process. */
>> guard(cpus_read_lock)();
>> set_target_state(MODULE_UPDATE_START + 1);
>> - return stop_machine_cpuslocked(do_seamldr_install_module, params, cpu_online_mask);
>> + ret = stop_machine_cpuslocked(do_seamldr_install_module, params, cpu_online_mask);
>> + if (ret)
>> + return ret;
>> +
>> + return tdx_module_refresh_version();
>> }
>
>This is one reason I rather dislike guard().
>
>Does tdx_module_refresh_version() need to be guarded by 'cpus_read_lock'?
No. It can be moved out of 'cpus_read_lock'.
>
>?
>
>
>> + /* Major/minor versions should not change across updates. */
>> + tdx_sysinfo.version.update_version = new.update_version;
>
> ^ very odd tab
>
>Also, how much of this do you *NEED*? You don't need to print the old
>version. You don't really need to _print_ the new version either.
>
>Isn't this arguably all fluff?
I initially kept tdx module update similar to the microcode update. But yes,
printing the new version is not strictly needed. Once the unnecessary
complexity is dropped, the patch becomes quite small:
commit 90e5a66b3f54af96d5895f6707ecdeef4bc4a3ed
Author: Chao Gao <chao.gao@intel.com>
Date: Tue Mar 31 05:41:29 2026 -0700
x86/virt/tdx: Refresh TDX module version after update
The kernel exposes the TDX module version through sysfs so userspace can
check update compatibility. That information needs to remain accurate
across runtime updates.
A runtime update may change the module's update_version, so refresh the
cached version after a successful update.
Drop __ro_after_init from tdx_sysinfo because it is now updated at runtime.
Do not refresh the rest of tdx_sysinfo even if those values change. Current
tdx_sysinfo users (e.g., KVM) can continue to work across module updates
without seeing the new values because module updates are required to be
backward compatible. And those users are not written to re-validate
tdx_sysinfo values after an update, so refreshing them could be risky.
For example, a user may decide both setup and teardown behavior purely
based on the reported capabilities. If refreshed tdx_sysinfo starts
reporting a newly gained capability, later code may assume the
corresponding setup exists and try to use or tear it down, even though no
such setup was done before the update.
Signed-off-by: Chao Gao <chao.gao@intel.com>
---
v9:
- don't print old and new version [Dave]
- explain why it's OK to hide changes from the tdx_sysinfo users [Dave]
- update versions in stop_machine context
- don't mention major/minor versions are identical across updates. That fact is
not relevant here.
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index deb1470185ce..ab350b705910 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -66,7 +66,7 @@ static struct tdmr_info_list tdx_tdmr_list;
/* All TDX-usable memory regions. Protected by mem_hotplug_lock. */
static LIST_HEAD(tdx_memlist);
-static struct tdx_sys_info tdx_sysinfo __ro_after_init;
+static struct tdx_sys_info tdx_sysinfo;
/*
* Do the module global initialization once and return its result.
@@ -1305,6 +1305,10 @@ int tdx_module_run_update(void)
if (ret)
return ret;
+ /* Shouldn't fail as the update has succeeded. */
+ ret = get_tdx_sys_info_version(&tdx_sysinfo.version);
+ WARN_ON_ONCE(ret);
+
tdx_module_state.initialized = true;
return 0;
}
diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
index e793dec688ab..e49c300f23d4 100644
--- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
+++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
@@ -7,7 +7,7 @@
* Include this file to other C file instead.
*/
-static __init int get_tdx_sys_info_version(struct tdx_sys_info_version *sysinfo_version)
+static int get_tdx_sys_info_version(struct tdx_sys_info_version *sysinfo_version)
{
int ret = 0;
u64 val;
^ permalink raw reply related
* Re: SVSM Development Call April 29, 2026
From: Jörg Rödel @ 2026-05-06 10:35 UTC (permalink / raw)
To: coconut-svsm, linux-coco
In-Reply-To: <j3qgor73sx536uzabgyrfadoj7xznc4ir2pj6tbyimn7aqpiki@m5hzafmcng7m>
Meeting minutes are now available:
https://github.com/coconut-svsm/governance/pull/106
-Joerg
^ permalink raw reply
* Re: [PATCH v8 11/21] x86/virt/tdx: Reset software states during TDX module shutdown
From: Chao Gao @ 2026-05-06 6:21 UTC (permalink / raw)
To: Dave Hansen
Cc: kvm, linux-coco, linux-kernel, x86, binbin.wu, dave.hansen, djbw,
ira.weiny, kai.huang, kas, nik.borisov, paulmck, pbonzini,
reinette.chatre, rick.p.edgecombe, sagis, seanjc, tony.lindgren,
vannapurve, vishal.l.verma, yilun.xu, xiaoyao.li, yan.y.zhao,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin
In-Reply-To: <20f2d821-bfa6-4db2-a968-b5455c7b5007@intel.com>
On Thu, Apr 30, 2026 at 11:58:12AM -0700, Dave Hansen wrote:
>On 4/27/26 08:28, Chao Gao wrote:
>> + /*
>> + * Clear global and per-CPU initialization flags so the new module
>> + * can be fully re-initialized after a successful update.
>> + *
>> + * No locks needed as no concurrent accesses can occur here.
>> + */
>> + tdx_module_initialized = false;
>> + sysinit_done = false;
>> + sysinit_ret = 0;
>> + for_each_possible_cpu(cpu)
>> + per_cpu(tdx_lp_initialized, cpu) = false;
>
>This speaks to needing refactoring.
>
>If there's global TDX state, it needs to be in a global TDX state
>structure, not scattered across random global variables.
>
>Imagine the structure is:
>
>struct tdx_module_config foo;
>
>That's 0's at boot. You have to init the TDX module to bring it out of
>0's to valid state. It actually means something if you do:
>
> memset(&foo, 0, sizeof(foo));
>
>Because it takes it right back to its bss state. That ^ is also handy
>because it naturally just works if new state gets added.
>
>Guess what will happen the next time someone adds:
>
> int sysinit_new_fancy_thing;
>
>Someone will forget to add it to this code. Then the module gets updated
>and breaks things in fun ways.
Makes sense. I will slot in a patch like this:
From 1578bd211c732f7773475819cbf145fe9fedfcb5 Mon Sep 17 00:00:00 2001
From: Chao Gao <chao.gao@intel.com>
Date: Tue, 5 May 2026 22:46:39 -0700
Subject: [PATCH] x86/virt/tdx: Consolidate TDX global initialization state
The kernel uses several global flags to guard one-time TDX initialization
flows and prevent them from being repeated.
When the TDX module is updated, all of this state must be reset so that
the module can be initialized again. Today those flags are kept as separate
global variables, which makes the reset path awkward and easy to miss when
new state is added.
Group the flags into a single structure so they can be reset together, for
example with memset(), and so future state that needs the same handling is
easier to manage.
Drop the __ro_after_init annotation from tdx_module_initialized to make it
consistent with the other two flags.
Signed-off-by: Chao Gao <chao.gao@intel.com>
---
arch/x86/virt/vmx/tdx/tdx.c | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index c0c6281b08a5..0172b432f229 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -44,6 +44,13 @@
#include <asm/virt.h>
#include "tdx.h"
+struct tdx_module_state {
+ bool initialized;
+ bool sysinit_done;
+ int sysinit_ret;
+};
+
+static struct tdx_module_state tdx_module_state;
static u32 tdx_global_keyid __ro_after_init;
static u32 tdx_guest_keyid_start __ro_after_init;
static u32 tdx_nr_guest_keyids __ro_after_init;
@@ -58,7 +65,6 @@ static struct tdmr_info_list tdx_tdmr_list;
static LIST_HEAD(tdx_memlist);
static struct tdx_sys_info tdx_sysinfo __ro_after_init;
-static bool tdx_module_initialized __ro_after_init;
typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
@@ -113,30 +119,28 @@ static int try_init_module_global(void)
{
struct tdx_module_args args = {};
static DEFINE_RAW_SPINLOCK(sysinit_lock);
- static bool sysinit_done;
- static int sysinit_ret;
raw_spin_lock(&sysinit_lock);
- if (sysinit_done)
+ if (tdx_module_state.sysinit_done)
goto out;
/* RCX is module attributes and all bits are reserved */
args.rcx = 0;
- sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
+ tdx_module_state.sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
/*
* The first SEAMCALL also detects the TDX module, thus
* it can fail due to the TDX module is not loaded.
* Dump message to let the user know.
*/
- if (sysinit_ret == -ENODEV)
+ if (tdx_module_state.sysinit_ret == -ENODEV)
pr_err("module not loaded\n");
- sysinit_done = true;
+ tdx_module_state.sysinit_done = true;
out:
raw_spin_unlock(&sysinit_lock);
- return sysinit_ret;
+ return tdx_module_state.sysinit_ret;
}
/**
@@ -1299,7 +1303,7 @@ static __init int tdx_enable(void)
register_syscore(&tdx_syscore);
- tdx_module_initialized = true;
+ tdx_module_state.initialized = true;
pr_info("TDX-Module initialized\n");
return 0;
}
@@ -1554,7 +1558,7 @@ void __init tdx_init(void)
const struct tdx_sys_info *tdx_get_sysinfo(void)
{
- if (!tdx_module_initialized)
+ if (!tdx_module_state.initialized)
return NULL;
return (const struct tdx_sys_info *)&tdx_sysinfo;
--
2.52.0
^ permalink raw reply related
* Re: [PATCH v8 10/21] x86/virt/seamldr: Shut down the current TDX module
From: Chao Gao @ 2026-05-06 2:56 UTC (permalink / raw)
To: Dave Hansen
Cc: kvm, linux-coco, linux-kernel, x86, binbin.wu, dave.hansen, djbw,
ira.weiny, kai.huang, kas, nik.borisov, paulmck, pbonzini,
reinette.chatre, rick.p.edgecombe, sagis, seanjc, tony.lindgren,
vannapurve, vishal.l.verma, yilun.xu, xiaoyao.li, yan.y.zhao,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin
In-Reply-To: <0523b07b-2df2-4cf4-bf98-6efe01780698@intel.com>
On Thu, Apr 30, 2026 at 11:52:50AM -0700, Dave Hansen wrote:
>On 4/27/26 08:28, Chao Gao wrote:
>> static int do_seamldr_install_module(void *seamldr_params)
>> {
>> enum module_update_state newstate, curstate = MODULE_UPDATE_START;
>> + int cpu = smp_processor_id();
>> + bool primary;
>> int ret = 0;
>>
>> + primary = cpumask_first(cpu_online_mask) == cpu;
>
>Isn't cpumask_first(cpu_online_mask)==0, always? I thought CPU 0 could
>never be offlined.
Not always. On x86, CPU 0 can be offlined at runtime if the kernel is booted
with the cpu0_hotplug command-line option. See cpu_hotplug.rst.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox