* [PATCH v7 06/42] KVM: guest_memfd: Update kvm_gmem_populate() to use gmem attributes
From: Ackerley Tng via B4 Relay @ 2026-05-23 0:17 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>
From: Ackerley Tng <ackerleytng@google.com>
Update the guest_memfd populate() flow to pull memory attributes from the
gmem instance instead of the VM when KVM is not configured to track
shared/private status in the VM.
Rename the per-VM API to make it clear that it retrieves per-VM
attributes, i.e. is not suitable for use outside of flows that are
specific to generic per-VM attributes.
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
arch/x86/kvm/mmu/mmu.c | 2 +-
include/linux/kvm_host.h | 14 +++++++++++++-
virt/kvm/guest_memfd.c | 24 +++++++++++++++++++++---
virt/kvm/kvm_main.c | 8 +++-----
4 files changed, 38 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b53a0c4b4dfca..3f70859232b07 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -8060,7 +8060,7 @@ static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
if (level == PG_LEVEL_2M)
- return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs);
+ return kvm_range_has_vm_memory_attributes(kvm, start, end, ~0, attrs);
for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
if (hugepage_test_mixed(slot, gfn, level - 1) ||
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7de85474c75bd..3039b291e4b09 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2549,12 +2549,24 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
#endif
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
-bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+extern bool vm_memory_attributes;
+bool kvm_range_has_vm_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
unsigned long mask, unsigned long attrs);
bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
struct kvm_gfn_range *range);
bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
struct kvm_gfn_range *range);
+#else
+#define vm_memory_attributes false
+static inline bool kvm_range_has_vm_memory_attributes(struct kvm *kvm,
+ gfn_t start, gfn_t end,
+ unsigned long mask,
+ unsigned long attrs)
+{
+ WARN_ONCE(1, "Unexpected call to kvm_range_has_vm_memory_attributes()");
+
+ return false;
+}
#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn);
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index c55879e033d96..78e5435967341 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -930,12 +930,31 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
+static bool kvm_gmem_range_is_private(struct gmem_inode *gi, pgoff_t index,
+ size_t nr_pages, struct kvm *kvm, gfn_t gfn)
+{
+ pgoff_t end = index + nr_pages - 1;
+ void *entry;
+
+ if (vm_memory_attributes)
+ return kvm_range_has_vm_memory_attributes(kvm, gfn, gfn + nr_pages,
+ KVM_MEMORY_ATTRIBUTE_PRIVATE,
+ KVM_MEMORY_ATTRIBUTE_PRIVATE);
+
+ mt_for_each(&gi->attributes, entry, index, end) {
+ if (xa_to_value(entry) != KVM_MEMORY_ATTRIBUTE_PRIVATE)
+ return false;
+ }
+
+ return true;
+}
static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
struct file *file, gfn_t gfn, struct page *src_page,
kvm_gmem_populate_cb post_populate, void *opaque)
{
pgoff_t index = kvm_gmem_get_index(slot, gfn);
+ struct gmem_inode *gi;
struct folio *folio;
kvm_pfn_t pfn;
int ret;
@@ -950,9 +969,8 @@ static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
folio_unlock(folio);
- if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1,
- KVM_MEMORY_ATTRIBUTE_PRIVATE,
- KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
+ gi = GMEM_I(file_inode(file));
+ if (!kvm_gmem_range_is_private(gi, index, 1, kvm, gfn)) {
ret = -EINVAL;
goto out_put_folio;
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4139e903f756a..0a4024948711a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -103,9 +103,7 @@ module_param(allow_unsafe_mappings, bool, 0444);
#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
-static bool vm_memory_attributes = true;
-#else
-#define vm_memory_attributes false
+bool vm_memory_attributes = true;
#endif
DEFINE_STATIC_CALL_RET0(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_KEY(__kvm_get_memory_attributes));
@@ -2450,7 +2448,7 @@ static unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn)
* Returns true if _all_ gfns in the range [@start, @end) have attributes
* such that the bits in @mask match @attrs.
*/
-bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+bool kvm_range_has_vm_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
unsigned long mask, unsigned long attrs)
{
XA_STATE(xas, &kvm->mem_attr_array, start);
@@ -2584,7 +2582,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
mutex_lock(&kvm->slots_lock);
/* Nothing to do if the entire range has the desired attributes. */
- if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes))
+ if (kvm_range_has_vm_memory_attributes(kvm, start, end, ~0, attributes))
goto out_unlock;
/*
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* [PATCH v7 05/42] KVM: guest_memfd: Wire up kvm_get_memory_attributes() to per-gmem attributes
From: Ackerley Tng via B4 Relay @ 2026-05-23 0:17 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>
From: Sean Christopherson <seanjc@google.com>
Implement kvm_gmem_get_memory_attributes() for guest_memfd to allow the KVM
core and architecture code to query per-GFN memory attributes.
kvm_gmem_get_memory_attributes() finds the memory slot for a given GFN and
queries the guest_memfd file's attributes to determine if the page is marked
as private.
If vm_memory_attributes is not enabled, there is no shared/private tracking
at the VM level. Install the guest_memfd implementation as long as
guest_memfd is enabled to give guest_memfd a chance to respond on
attributes.
guest_memfd should look up attributes regardless of whether this memslot is
gmem-only since attributes are now tracked by gmem regardless of whether
mmap() is enabled.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
include/linux/kvm_host.h | 2 ++
virt/kvm/guest_memfd.c | 31 +++++++++++++++++++++++++++++++
virt/kvm/kvm_main.c | 3 +++
3 files changed, 36 insertions(+)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 29694b348df40..7de85474c75bd 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2557,6 +2557,8 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
struct kvm_gfn_range *range);
#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
+unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn);
+
#ifdef CONFIG_KVM_GUEST_MEMFD
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 117b726f670e8..c55879e033d96 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -509,6 +509,37 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
+unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ struct inode *inode;
+
+ /*
+ * If this gfn has no associated memslot, there's no chance of the gfn
+ * being backed by private memory, since guest_memfd must be used for
+ * private memory, and guest_memfd must be associated with some memslot.
+ */
+ if (!slot)
+ return 0;
+
+ CLASS(gmem_get_file, file)(slot);
+ if (!file)
+ return 0;
+
+ inode = file_inode(file);
+
+ /*
+ * Rely on the maple tree's internal RCU lock to ensure a
+ * stable result. This result can become stale as soon as the
+ * lock is dropped, so the caller _must_ still protect
+ * consumption of private vs. shared by checking
+ * mmu_invalidate_retry_gfn() under mmu_lock to serialize
+ * against ongoing attribute updates.
+ */
+ return kvm_gmem_get_attributes(inode, kvm_gmem_get_index(slot, gfn));
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_memory_attributes);
+
static struct file_operations kvm_gmem_fops = {
.mmap = kvm_gmem_mmap,
.open = generic_file_open,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ee26f1d9b5fda..4139e903f756a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2653,6 +2653,9 @@ static void kvm_init_memory_attributes(void)
if (vm_memory_attributes)
static_call_update(__kvm_get_memory_attributes,
kvm_get_vm_memory_attributes);
+ else if (IS_ENABLED(CONFIG_KVM_GUEST_MEMFD))
+ static_call_update(__kvm_get_memory_attributes,
+ kvm_gmem_get_memory_attributes);
else
static_call_update(__kvm_get_memory_attributes,
(void *)__static_call_return0);
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* [PATCH v7 03/42] KVM: Enumerate support for PRIVATE memory iff kvm_arch_has_private_mem is defined
From: Ackerley Tng via B4 Relay @ 2026-05-23 0:17 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>
From: Sean Christopherson <seanjc@google.com>
Explicitly guard reporting support for KVM_MEMORY_ATTRIBUTE_PRIVATE based
on kvm_arch_has_private_mem being #defined in anticipation of decoupling
kvm_supported_mem_attributes() from CONFIG_KVM_VM_MEMORY_ATTRIBUTES.
guest_memfd support for memory attributes will be unconditional to avoid
yet more macros (all architectures that support guest_memfd are expected to
use per-gmem attributes at some point), at which point enumerating support
for KVM_MEMORY_ATTRIBUTE_PRIVATE based solely on memory attributes being
supported _somewhere_ would result in KVM over-reporting support on arm64.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
include/linux/kvm_host.h | 2 +-
virt/kvm/kvm_main.c | 2 ++
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 091f201251159..68142bc962953 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -722,7 +722,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
}
#endif
-#ifndef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+#ifndef kvm_arch_has_private_mem
static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
{
return false;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 306153abbafa5..abb9cfa3eb04d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2421,8 +2421,10 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static u64 kvm_supported_mem_attributes(struct kvm *kvm)
{
+#ifdef kvm_arch_has_private_mem
if (!kvm || kvm_arch_has_private_mem(kvm))
return KVM_MEMORY_ATTRIBUTE_PRIVATE;
+#endif
return 0;
}
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* [PATCH v7 04/42] KVM: Stub in ability to disable per-VM memory attribute tracking
From: Ackerley Tng via B4 Relay @ 2026-05-23 0:17 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>
From: Sean Christopherson <seanjc@google.com>
Introduce the basic infrastructure to allow per-VM memory attribute
tracking to be disabled. This will be built upon in a later patch, where a
module param can disable per-VM memory attribute tracking.
Split the Kconfig option into a base KVM_MEMORY_ATTRIBUTES and the
existing KVM_VM_MEMORY_ATTRIBUTES. The base option provides the core
plumbing, while the latter enables the full per-VM tracking via an xarray
and the associated ioctls.
kvm_get_memory_attributes() now performs a static call that either looks up
kvm->mem_attr_array when CONFIG_KVM_VM_MEMORY_ATTRIBUTES is enabled, or
just returns 0 otherwise. The static call can be patched depending on
whether per-VM tracking is enabled by the CONFIG.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
arch/x86/include/asm/kvm_host.h | 2 +-
include/linux/kvm_host.h | 23 ++++++++++++---------
virt/kvm/Kconfig | 4 ++++
virt/kvm/kvm_main.c | 44 ++++++++++++++++++++++++++++++++++++++++-
4 files changed, 62 insertions(+), 11 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8bb7c25240e33..01125be81a131 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2393,7 +2393,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
int tdp_max_root_level, int tdp_huge_page_level);
-#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
#endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 68142bc962953..29694b348df40 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2528,19 +2528,15 @@ static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
return slot->flags & KVM_MEMSLOT_GMEM_ONLY;
}
-#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
+typedef unsigned long (kvm_get_memory_attributes_t)(struct kvm *kvm, gfn_t gfn);
+DECLARE_STATIC_CALL(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
+
static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
{
- return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
+ return static_call(__kvm_get_memory_attributes)(kvm, gfn);
}
-bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
- unsigned long mask, unsigned long attrs);
-bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
- struct kvm_gfn_range *range);
-bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
- struct kvm_gfn_range *range);
-
static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
{
return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
@@ -2550,6 +2546,15 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
{
return false;
}
+#endif
+
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+ unsigned long mask, unsigned long attrs);
+bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range);
+bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range);
#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
#ifdef CONFIG_KVM_GUEST_MEMFD
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 5119cb37145fc..3fea89c45cfb4 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -100,7 +100,11 @@ config KVM_ELIDE_TLB_FLUSH_IF_YOUNG
config KVM_MMU_LOCKLESS_AGING
bool
+config KVM_MEMORY_ATTRIBUTES
+ bool
+
config KVM_VM_MEMORY_ATTRIBUTES
+ select KVM_MEMORY_ATTRIBUTES
bool
config KVM_GUEST_MEMFD
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index abb9cfa3eb04d..ee26f1d9b5fda 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -101,6 +101,17 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_shrink);
static bool __ro_after_init allow_unsafe_mappings;
module_param(allow_unsafe_mappings, bool, 0444);
+#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+static bool vm_memory_attributes = true;
+#else
+#define vm_memory_attributes false
+#endif
+DEFINE_STATIC_CALL_RET0(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_KEY(__kvm_get_memory_attributes));
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_TRAMP(__kvm_get_memory_attributes));
+#endif
+
/*
* Ordering of locks:
*
@@ -2418,7 +2429,7 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
}
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
-#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
static u64 kvm_supported_mem_attributes(struct kvm *kvm)
{
#ifdef kvm_arch_has_private_mem
@@ -2429,6 +2440,12 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
return 0;
}
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+static unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn)
+{
+ return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
+}
+
/*
* Returns true if _all_ gfns in the range [@start, @end) have attributes
* such that the bits in @mask match @attrs.
@@ -2625,7 +2642,24 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
}
+#else /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
+static unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn)
+{
+ BUILD_BUG_ON(1);
+}
#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
+static void kvm_init_memory_attributes(void)
+{
+ if (vm_memory_attributes)
+ static_call_update(__kvm_get_memory_attributes,
+ kvm_get_vm_memory_attributes);
+ else
+ static_call_update(__kvm_get_memory_attributes,
+ (void *)__static_call_return0);
+}
+#else /* CONFIG_KVM_MEMORY_ATTRIBUTES */
+static void kvm_init_memory_attributes(void) { }
+#endif /* CONFIG_KVM_MEMORY_ATTRIBUTES */
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
@@ -4925,6 +4959,9 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
return 1;
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
case KVM_CAP_MEMORY_ATTRIBUTES:
+ if (!vm_memory_attributes)
+ return 0;
+
return kvm_supported_mem_attributes(kvm);
#endif
#ifdef CONFIG_KVM_GUEST_MEMFD
@@ -5331,6 +5368,10 @@ static long kvm_vm_ioctl(struct file *filp,
case KVM_SET_MEMORY_ATTRIBUTES: {
struct kvm_memory_attributes attrs;
+ r = -ENOTTY;
+ if (!vm_memory_attributes)
+ goto out;
+
r = -EFAULT;
if (copy_from_user(&attrs, argp, sizeof(attrs)))
goto out;
@@ -6527,6 +6568,7 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
kvm_preempt_ops.sched_in = kvm_sched_in;
kvm_preempt_ops.sched_out = kvm_sched_out;
+ kvm_init_memory_attributes();
kvm_init_debug();
r = kvm_vfio_ops_init();
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* [PATCH v7 01/42] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
From: Ackerley Tng via B4 Relay @ 2026-05-23 0:17 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>
From: Sean Christopherson <seanjc@google.com>
Start plumbing in guest_memfd support for in-place private<=>shared
conversions by tracking attributes via a maple tree. KVM currently tracks
private vs. shared attributes on a per-VM basis, which made sense when a
guest_memfd _only_ supported private memory, but tracking per-VM simply
can't work for in-place conversions as the shareability of a given page
needs to be per-gmem_inode, not per-VM.
Use the filemap invalidation lock to protect the maple tree, as taking the
lock for read when faulting in memory (for userspace or the guest) isn't
expected to result in meaningful contention, and using a separate lock
would add significant complexity (avoiding deadlock is quite difficult).
Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Vishal Annapurve <vannapurve@google.com>
Signed-off-by: Vishal Annapurve <vannapurve@google.com>
Co-developed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Fuad Tabba <tabba@google.com>
---
virt/kvm/guest_memfd.c | 133 +++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 117 insertions(+), 16 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 5b4911ffa208a..117b726f670e8 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
+#include <linux/maple_tree.h>
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>
@@ -33,6 +34,13 @@ struct gmem_inode {
struct list_head gmem_file_list;
u64 flags;
+ /*
+ * Every index in this inode, whether memory is populated or
+ * not, is tracked in attributes. The entire range of indices,
+ * corresponding to the size of this inode, is represented in
+ * this maple tree.
+ */
+ struct maple_tree attributes;
};
static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
@@ -60,6 +68,24 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
return gfn - slot->base_gfn + slot->gmem.pgoff;
}
+static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
+{
+ struct maple_tree *mt = &GMEM_I(inode)->attributes;
+ void *entry = mtree_load(mt, index);
+
+ return WARN_ON_ONCE(!entry) ? 0 : xa_to_value(entry);
+}
+
+static bool kvm_gmem_is_private_mem(struct inode *inode, pgoff_t index)
+{
+ return kvm_gmem_get_attributes(inode, index) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+}
+
+static bool kvm_gmem_is_shared_mem(struct inode *inode, pgoff_t index)
+{
+ return !kvm_gmem_is_private_mem(inode, index);
+}
+
static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
pgoff_t index, struct folio *folio)
{
@@ -397,10 +423,13 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
return VM_FAULT_SIGBUS;
- if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
- return VM_FAULT_SIGBUS;
+ filemap_invalidate_lock_shared(inode->i_mapping);
+ if (kvm_gmem_is_shared_mem(inode, vmf->pgoff))
+ folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+ else
+ folio = ERR_PTR(-EACCES);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
- folio = kvm_gmem_get_folio(inode, vmf->pgoff);
if (IS_ERR(folio)) {
if (PTR_ERR(folio) == -EAGAIN)
return VM_FAULT_RETRY;
@@ -556,6 +585,51 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
return true;
}
+static int kvm_gmem_init_inode(struct inode *inode, loff_t size, u64 flags)
+{
+ struct gmem_inode *gi = GMEM_I(inode);
+ MA_STATE(mas, &gi->attributes, 0, (size >> PAGE_SHIFT) - 1);
+ u64 attrs;
+ int r;
+
+ inode->i_op = &kvm_gmem_iops;
+ inode->i_mapping->a_ops = &kvm_gmem_aops;
+ inode->i_mode |= S_IFREG;
+ inode->i_size = size;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+
+ /*
+ * guest_memfd memory is neither migratable nor swappable: set
+ * inaccessible to gate off both.
+ */
+ mapping_set_inaccessible(inode->i_mapping);
+ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+
+ gi->flags = flags;
+
+ mt_set_external_lock(&gi->attributes,
+ &inode->i_mapping->invalidate_lock);
+
+ /*
+ * Store default attributes for the entire gmem instance. Ensuring every
+ * index is represented in the maple tree at all times simplifies the
+ * conversion and merging logic.
+ */
+ attrs = gi->flags & GUEST_MEMFD_FLAG_INIT_SHARED ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+ /*
+ * Acquire the invalidation lock purely to make lockdep happy. The
+ * maple tree library expects all stores to be protected via the lock,
+ * and the library can't know when the tree is reachable only by the
+ * caller, as is the case here.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+ r = mas_store_gfp(&mas, xa_mk_value(attrs), GFP_KERNEL);
+ filemap_invalidate_unlock(inode->i_mapping);
+
+ return r;
+}
+
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
static const char *name = "[kvm-gmem]";
@@ -586,16 +660,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
goto err_fops;
}
- inode->i_op = &kvm_gmem_iops;
- inode->i_mapping->a_ops = &kvm_gmem_aops;
- inode->i_mode |= S_IFREG;
- inode->i_size = size;
- mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
- mapping_set_inaccessible(inode->i_mapping);
- /* Unmovable mappings are supposed to be marked unevictable as well. */
- WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
-
- GMEM_I(inode)->flags = flags;
+ err = kvm_gmem_init_inode(inode, size, flags);
+ if (err)
+ goto err_inode;
file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
if (IS_ERR(file)) {
@@ -803,9 +870,13 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
if (!file)
return -EFAULT;
+ filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
+
folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
- if (IS_ERR(folio))
- return PTR_ERR(folio);
+ if (IS_ERR(folio)) {
+ r = PTR_ERR(folio);
+ goto out;
+ }
if (!folio_test_uptodate(folio)) {
clear_highpage(folio_page(folio, 0));
@@ -821,6 +892,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
else
folio_put(folio);
+out:
+ filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
@@ -952,6 +1025,15 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
mpol_shared_policy_init(&gi->policy, NULL);
+ /*
+ * Memory attributes are protected by the filemap invalidation lock, but
+ * the lock structure isn't available at this time. Immediately mark
+ * maple tree as using external locking so that accessing the tree
+ * before it's fully initialized results in NULL pointer dereferences
+ * and not more subtle bugs.
+ */
+ mt_init_flags(&gi->attributes, MT_FLAGS_LOCK_EXTERN | MT_FLAGS_USE_RCU);
+
gi->flags = 0;
INIT_LIST_HEAD(&gi->gmem_file_list);
return &gi->vfs_inode;
@@ -959,7 +1041,26 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
static void kvm_gmem_destroy_inode(struct inode *inode)
{
- mpol_free_shared_policy(&GMEM_I(inode)->policy);
+ struct gmem_inode *gi = GMEM_I(inode);
+
+ mpol_free_shared_policy(&gi->policy);
+
+ /*
+ * Note! Checking for an empty tree is functionally necessary
+ * to avoid explosions if the tree hasn't been fully
+ * initialized, i.e. if the inode is being destroyed before
+ * guest_memfd can set the external lock, lockdep would find
+ * that the tree's internal ma_lock was not held.
+ */
+ if (!mtree_empty(&gi->attributes)) {
+ /*
+ * Acquire the invalidation lock purely to make lockdep happy,
+ * the inode is unreachable at this point.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+ __mt_destroy(&gi->attributes);
+ filemap_invalidate_unlock(inode->i_mapping);
+ }
}
static void kvm_gmem_free_inode(struct inode *inode)
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* [PATCH v7 00/42] guest_memfd: In-place conversion support
From: Ackerley Tng via B4 Relay @ 2026-05-23 0:17 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
This is v7 of guest_memfd in-place conversion support.
Up till now, guest_memfd supports the entire inode worth of memory being
used as all-shared, or all-private. CoCo VMs may request guest memory to be
converted between private and shared states, and the only way to support
that currently would be to have the userspace VMM provide two sources of
backing memory from completely different areas of physical memory.
pKVM has a use case for in-place sharing: the guest and host may be
cooperating on given data, and pKVM doesn't protect data through
encryption, so copying that given data between different areas of physical
memory as part of conversions would be unnecessary work.
This series also serves as a foundation for guest_memfd huge page
support. Now, guest_memfd only supports PAGE_SIZE pages, so if two sources
of backing memory are used, the userspace VMM could maintain a steady total
memory utilized by punching out the pages that are not used. When huge
pages are available in guest_memfd, even if the backing memory source
supports hole punching within a huge page, punching out pages to maintain
the total memory utilized by a VM would be introducing lots of
fragmentation.
In-place conversion avoids fragmentation by allowing the same physical
memory to be used for both shared and private memory, with guest_memfd
tracks the shared/private status of all the pages at a per-page
granularity.
The central principle, which guest_memfd continues to uphold, is that any
guest-private page will not be mappable to host userspace. All pages will
be mmap()-able in host userspace, but accesses to guest-private pages (as
tracked by guest_memfd) will result in a SIGBUS.
This series introduces a guest_memfd ioctl (not kvm, vm or vcpu, but
guest_memfd ioctl) that allows userspace to set memory
attributes (shared/private) directly through the guest_memfd. This is the
appropriate interface because shared/private-ness is a property of memory
and hence the request should be sent directly to the memory provider -
guest_memfd.
Tested with both CONFIG_KVM_VM_MEMORY_ATTRIBUTES enabled and disabled:
+ tools/testing/selftests/kvm/guest_memfd_test.c
+ tools/testing/selftests/kvm/pre_fault_memory_test.c
+ tools/testing/selftests/kvm/x86/guest_memfd_conversions_test.c
+ tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
+ tools/testing/selftests/kvm/x86/private_mem_conversions_test.sh
+ tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c
Updates for this revision:
+ Picked up Reviewed-bys from Fuad
+ Addressed Fuad, Sean and Sashiko's comments
Regarding the issue where guest_memfd_conversions_test, which uses the
kselftest framework, doesn't perform teardown on assertion failure: I think
we can have that fixed separately from this series? Please see proposal [9].
TODOs
+ Test with TDX selftests. We're in the process of rebasing TDX selftests
on this series and will post updates when that's tested.
This series is based on kvm/next, and here's the tree for your convenience:
https://github.com/googleprodkernel/linux-cc/commits/guest_memfd-inplace-conversion-v7
Older series:
+ RFCv6 is at [10]
+ RFCv5 is at [8]
+ RFCv4 is at [7]
+ RFCv3 is at [6]
+ RFCv2 is at [5]
+ RFCv1 is at [4]
+ Previous versions of this feature, part of other series, are available at
[1][2][3].
[1] https://lore.kernel.org/all/bd163de3118b626d1005aa88e71ef2fb72f0be0f.1726009989.git.ackerleytng@google.com/
[2]
[3] https://lore.kernel.org/all/b784326e9ccae6a08388f1bf39db70a2204bdc51.1747264138.git.ackerleytng@google.com/
[4] https://lore.kernel.org/all/cover.1760731772.git.ackerleytng@google.com/T/
[5] https://lore.kernel.org/all/cover.1770071243.git.ackerleytng@google.com/T/
[6] https://lore.kernel.org/r/20260313-gmem-inplace-conversion-v3-0-5fc12a70ec89@google.com/T/
[7] https://lore.kernel.org/all/20260326-gmem-inplace-conversion-v4-0-e202fe950ffd@google.com/T/
[8] https://lore.kernel.org/r/20260428-gmem-inplace-conversion-v5-0-d8608ccfca22@google.com
[9] https://lore.kernel.org/all/20260414-selftest-global-metadata-v1-0-fd223922bc57@google.com/T/
[10] https://lore.kernel.org/r/20260507-gmem-inplace-conversion-v6-0-91ab5a8b19a4@google.com
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
Ackerley Tng (24):
KVM: guest_memfd: Update kvm_gmem_populate() to use gmem attributes
KVM: guest_memfd: Only prepare folios for private pages
KVM: Move kvm_supported_mem_attributes() to kvm_host.h
KVM: guest_memfd: Add base support for KVM_SET_MEMORY_ATTRIBUTES2
KVM: guest_memfd: Ensure pages are not in use before conversion
KVM: guest_memfd: Call arch invalidate hooks on conversion
KVM: guest_memfd: Return early if range already has requested attributes
KVM: guest_memfd: Advertise KVM_SET_MEMORY_ATTRIBUTES2 ioctl
KVM: guest_memfd: Handle lru_add fbatch refcounts during conversion safety check
KVM: guest_memfd: Use actual size for invalidation in kvm_gmem_release()
KVM: guest_memfd: Determine invalidation filter from memory attributes
KVM: TDX: Make source page optional for KVM_TDX_INIT_MEM_REGION
KVM: selftests: Test basic single-page conversion flow
KVM: selftests: Test conversion flow when INIT_SHARED
KVM: selftests: Test conversion precision in guest_memfd
KVM: selftests: Test conversion before allocation
KVM: selftests: Convert with allocated folios in different layouts
KVM: selftests: Test that truncation does not change shared/private status
KVM: selftests: Test conversion with elevated page refcount
KVM: selftests: Reset shared memory after hole-punching
KVM: selftests: Provide function to look up guest_memfd details from gpa
KVM: selftests: Make TEST_EXPECT_SIGBUS thread-safe
KVM: selftests: Update private_mem_conversions_test to mmap() guest_memfd
KVM: selftests: Add script to exercise private_mem_conversions_test
Michael Roth (1):
KVM: SEV: Make 'uaddr' parameter optional for KVM_SEV_SNP_LAUNCH_UPDATE
Sean Christopherson (17):
KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES
KVM: Enumerate support for PRIVATE memory iff kvm_arch_has_private_mem is defined
KVM: Stub in ability to disable per-VM memory attribute tracking
KVM: guest_memfd: Wire up kvm_get_memory_attributes() to per-gmem attributes
KVM: Move KVM_VM_MEMORY_ATTRIBUTES config definition to x86
KVM: Let userspace disable per-VM mem attributes, enable per-gmem attributes
KVM: guest_memfd: Enable INIT_SHARED on guest_memfd for x86 Coco VMs
KVM: selftests: Create gmem fd before "regular" fd when adding memslot
KVM: selftests: Rename guest_memfd{,_offset} to gmem_{fd,offset}
KVM: selftests: Add support for mmap() on guest_memfd in core library
KVM: selftests: Add selftests global for guest memory attributes capability
KVM: selftests: Add helpers for calling ioctls on guest_memfd
KVM: selftests: Test that shared/private status is consistent across processes
KVM: selftests: Provide common function to set memory attributes
KVM: selftests: Check fd/flags provided to mmap() when setting up memslot
KVM: selftests: Update private memory exits test to work with per-gmem attributes
Documentation/virt/kvm/api.rst | 78 +++-
.../virt/kvm/x86/amd-memory-encryption.rst | 15 +-
Documentation/virt/kvm/x86/intel-tdx.rst | 4 +
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/Kconfig | 15 +-
arch/x86/kvm/mmu/mmu.c | 4 +-
arch/x86/kvm/svm/sev.c | 18 +-
arch/x86/kvm/vmx/tdx.c | 11 +-
arch/x86/kvm/x86.c | 13 +-
include/linux/kvm_host.h | 53 ++-
include/trace/events/kvm.h | 4 +-
include/uapi/linux/kvm.h | 16 +
mm/swap.c | 2 +
tools/testing/selftests/kvm/Makefile.kvm | 5 +
tools/testing/selftests/kvm/include/kvm_util.h | 136 +++++-
tools/testing/selftests/kvm/include/test_util.h | 34 +-
.../selftests/kvm/kvm_has_gmem_attributes.c | 17 +
tools/testing/selftests/kvm/lib/kvm_util.c | 141 +++---
tools/testing/selftests/kvm/lib/test_util.c | 7 -
.../kvm/x86/guest_memfd_conversions_test.c | 488 +++++++++++++++++++++
.../kvm/x86/private_mem_conversions_test.c | 53 ++-
.../kvm/x86/private_mem_conversions_test.sh | 128 ++++++
.../selftests/kvm/x86/private_mem_kvm_exits_test.c | 36 +-
virt/kvm/Kconfig | 3 +-
virt/kvm/guest_memfd.c | 460 +++++++++++++++++--
virt/kvm/kvm_main.c | 82 +++-
26 files changed, 1633 insertions(+), 192 deletions(-)
---
base-commit: b7fbe9a1bf9ee6c967ef77d366ca58c35fcf1887
change-id: 20260225-gmem-inplace-conversion-bd0dbd39753a
prerequisite-change-id: 20260522-fix-sev-gmem-post-populate-a36bef7f0698:v2
prerequisite-patch-id: 0d1feef8af7aa3471735869080aefa58b254ed0d
prerequisite-patch-id: f64ff55d6fe8d399e720a570fd83cc47bf12ac15
prerequisite-patch-id: 8c52920dd7f65859cbe804c787a9293b33266a3a
prerequisite-patch-id: 95018daf73833296a045c91cfb55cd9f53886dec
prerequisite-patch-id: bcfd440d79bb9f59f41e3244c4392da4c95cd932
Best regards,
--
Ackerley Tng <ackerleytng@google.com>
^ permalink raw reply
* [PATCH v7 02/42] KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES
From: Ackerley Tng via B4 Relay @ 2026-05-23 0:17 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>
From: Sean Christopherson <seanjc@google.com>
Rename the per-VM memory attributes Kconfig to make it explicitly about
per-VM attributes in anticipation of adding memory attributes support to
guest_memfd, at which point it will be possible (and desirable) to have
memory attributes without the per-VM support, even in x86.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/Kconfig | 6 +++---
arch/x86/kvm/mmu/mmu.c | 2 +-
arch/x86/kvm/x86.c | 2 +-
include/linux/kvm_host.h | 8 ++++----
include/trace/events/kvm.h | 4 ++--
virt/kvm/Kconfig | 2 +-
virt/kvm/kvm_main.c | 14 +++++++-------
8 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8a53ca6195701..8bb7c25240e33 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2393,7 +2393,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
int tdp_max_root_level, int tdp_huge_page_level);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
#endif
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 801bf9e520db3..26f6afd51bbdc 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -84,7 +84,7 @@ config KVM_SW_PROTECTED_VM
bool "Enable support for KVM software-protected VMs"
depends on EXPERT
depends on KVM_X86 && X86_64
- select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select KVM_VM_MEMORY_ATTRIBUTES
help
Enable support for KVM software-protected VMs. Currently, software-
protected VMs are purely a development and testing vehicle for
@@ -135,7 +135,7 @@ config KVM_INTEL_TDX
bool "Intel Trust Domain Extensions (TDX) support"
default y
depends on INTEL_TDX_HOST
- select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select KVM_VM_MEMORY_ATTRIBUTES
select HAVE_KVM_ARCH_GMEM_POPULATE
help
Provides support for launching Intel Trust Domain Extensions (TDX)
@@ -159,7 +159,7 @@ config KVM_AMD_SEV
depends on KVM_AMD && X86_64
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
select ARCH_HAS_CC_PLATFORM
- select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select KVM_VM_MEMORY_ATTRIBUTES
select HAVE_KVM_ARCH_GMEM_PREPARE
select HAVE_KVM_ARCH_GMEM_INVALIDATE
select HAVE_KVM_ARCH_GMEM_POPULATE
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f8aa7eda661ee..b53a0c4b4dfca 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7971,7 +7971,7 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
}
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
int level)
{
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 48f259015ce44..cb4f7432a073d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13611,7 +13611,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
}
}
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
kvm_mmu_init_memslot_memory_attributes(kvm, slot);
#endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2c5ad9a6d5ce8..091f201251159 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -722,7 +722,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
}
#endif
-#ifndef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifndef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
{
return false;
@@ -871,7 +871,7 @@ struct kvm {
#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
struct notifier_block pm_notifier;
#endif
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
/* Protected by slots_lock (for writes) and RCU (for reads) */
struct xarray mem_attr_array;
#endif
@@ -2528,7 +2528,7 @@ static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
return slot->flags & KVM_MEMSLOT_GMEM_ONLY;
}
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
{
return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
@@ -2550,7 +2550,7 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
{
return false;
}
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
#ifdef CONFIG_KVM_GUEST_MEMFD
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index b282e3a867696..1ba72bd73ea2f 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -358,7 +358,7 @@ TRACE_EVENT(kvm_dirty_ring_exit,
TP_printk("vcpu %d", __entry->vcpu_id)
);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
/*
* @start: Starting address of guest memory range
* @end: End address of guest memory range
@@ -383,7 +383,7 @@ TRACE_EVENT(kvm_vm_set_mem_attributes,
TP_printk("%#016llx -- %#016llx [0x%lx]",
__entry->start, __entry->end, __entry->attr)
);
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
TRACE_EVENT(kvm_unmap_hva_range,
TP_PROTO(unsigned long start, unsigned long end),
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 794976b88c6f9..5119cb37145fc 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -100,7 +100,7 @@ config KVM_ELIDE_TLB_FLUSH_IF_YOUNG
config KVM_MMU_LOCKLESS_AGING
bool
-config KVM_GENERIC_MEMORY_ATTRIBUTES
+config KVM_VM_MEMORY_ATTRIBUTES
bool
config KVM_GUEST_MEMFD
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 89489996fbc1e..306153abbafa5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1115,7 +1115,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
spin_lock_init(&kvm->mn_invalidate_lock);
rcuwait_init(&kvm->mn_memslots_update_rcuwait);
xa_init(&kvm->vcpu_array);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
xa_init(&kvm->mem_attr_array);
#endif
@@ -1300,7 +1300,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
cleanup_srcu_struct(&kvm->irq_srcu);
srcu_barrier(&kvm->srcu);
cleanup_srcu_struct(&kvm->srcu);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
xa_destroy(&kvm->mem_attr_array);
#endif
kvm_arch_free_vm(kvm);
@@ -2418,7 +2418,7 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
}
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static u64 kvm_supported_mem_attributes(struct kvm *kvm)
{
if (!kvm || kvm_arch_has_private_mem(kvm))
@@ -2623,7 +2623,7 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
}
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
@@ -4921,7 +4921,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_SYSTEM_EVENT_DATA:
case KVM_CAP_DEVICE_CTRL:
return 1;
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
case KVM_CAP_MEMORY_ATTRIBUTES:
return kvm_supported_mem_attributes(kvm);
#endif
@@ -5325,7 +5325,7 @@ static long kvm_vm_ioctl(struct file *filp,
break;
}
#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
case KVM_SET_MEMORY_ATTRIBUTES: {
struct kvm_memory_attributes attrs;
@@ -5336,7 +5336,7 @@ static long kvm_vm_ioctl(struct file *filp,
r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
break;
}
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
case KVM_CREATE_DEVICE: {
struct kvm_create_device cd;
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* Re: [PATCH v13 07/22] KVM: selftests: Introduce structures for TDX guest boot parameters
From: Yosry Ahmed @ 2026-05-22 23:50 UTC (permalink / raw)
To: Sean Christopherson
Cc: Lisa Wang, Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao,
Chenyi Qiang, Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
Sagi Shahar, Shuah Khan, Oliver Upton, Jeremiah McReynolds, kvm,
linux-coco, linux-kernel, x86
In-Reply-To: <ahDhQJI7xBOQpnjg@google.com>
> > Sean, is this the preferred way to expose offsets to asm files (or asm
> > code blocks) -- as opposed to say using .equ [*]?
>
> For actual .S assembly, yes. For inline asm, maybe? If it looks prettier, go
> for it.
>
> > If yes, I can rework my nVMX GPR fixes to use the same approach for
> > register offsets. I wonder if the non-TDX part of this patch (i.e.
> > Makefile stuff) can be split, then patch 6 and the Makefile stuff can
> > land independently and allow development on top.
> >
> > I can also split them out and include them in the next version of my
> > series, then whichever series lands first will land the offsets
> > support.
> >
> > WDYT?
>
> Hmm, I'd say keep your series as-is for now. The OFFSET() infrastructure really
> shines for proper assembly. For what you're doing, AFAICT it's only marginally
> better. So I don't think it's worth juggling dependencies to use it right away,
> we can always convert if/when the TDX series lands the fancy stuff.
Ack. We can do the switch later like you say.
^ permalink raw reply
* Re: [PATCH v13 07/22] KVM: selftests: Introduce structures for TDX guest boot parameters
From: Sean Christopherson @ 2026-05-22 23:05 UTC (permalink / raw)
To: Yosry Ahmed
Cc: Lisa Wang, Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao,
Chenyi Qiang, Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
Sagi Shahar, Shuah Khan, Oliver Upton, Jeremiah McReynolds, kvm,
linux-coco, linux-kernel, x86
In-Reply-To: <CAO9r8zMY6i+TWoZPWzj8zB+Xw3rONbK-Nee1kSFwDkT3hy66zg@mail.gmail.com>
On Fri, May 22, 2026, Yosry Ahmed wrote:
> > +static void __attribute__((used)) common(void)
> > +{
> > + OFFSET(TD_BOOT_PARAMETERS_CR0, td_boot_parameters, cr0);
> > + OFFSET(TD_BOOT_PARAMETERS_CR3, td_boot_parameters, cr3);
> > + OFFSET(TD_BOOT_PARAMETERS_CR4, td_boot_parameters, cr4);
> > + OFFSET(TD_BOOT_PARAMETERS_GDT, td_boot_parameters, gdtr);
> > + OFFSET(TD_BOOT_PARAMETERS_IDT, td_boot_parameters, idtr);
> > + OFFSET(TD_BOOT_PARAMETERS_PER_VCPU, td_boot_parameters, per_vcpu);
> > + OFFSET(TD_PER_VCPU_PARAMETERS_ESP_GVA, td_per_vcpu_parameters, esp_gva);
> > + OFFSET(TD_PER_VCPU_PARAMETERS_GUEST_CODE, td_per_vcpu_parameters,
> > + guest_code);
> > + DEFINE(SIZEOF_TD_PER_VCPU_PARAMETERS,
> > + sizeof(struct td_per_vcpu_parameters));
> > +}
>
> This is neat.
>
> Sean, is this the preferred way to expose offsets to asm files (or asm
> code blocks) -- as opposed to say using .equ [*]?
For actual .S assembly, yes. For inline asm, maybe? If it looks prettier, go
for it.
> If yes, I can rework my nVMX GPR fixes to use the same approach for
> register offsets. I wonder if the non-TDX part of this patch (i.e.
> Makefile stuff) can be split, then patch 6 and the Makefile stuff can
> land independently and allow development on top.
>
> I can also split them out and include them in the next version of my
> series, then whichever series lands first will land the offsets
> support.
>
> WDYT?
Hmm, I'd say keep your series as-is for now. The OFFSET() infrastructure really
shines for proper assembly. For what you're doing, AFAICT it's only marginally
better. So I don't think it's worth juggling dependencies to use it right away,
we can always convert if/when the TDX series lands the fancy stuff.
^ permalink raw reply
* Re: [PATCH v6 25/43] KVM: selftests: Add support for mmap() on guest_memfd in core library
From: Ackerley Tng @ 2026-05-22 23:02 UTC (permalink / raw)
To: Ackerley Tng via B4 Relay, aik, andrew.jones, binbin.wu, brauner,
chao.p.peng, david, ira.weiny, jmattson, jthoughton, michael.roth,
oupton, pankaj.gupta, qperret, rick.p.edgecombe, rientjes,
shivankg, steven.price, tabba, willy, wyihan, yan.y.zhao,
forkloop, pratyush, suzuki.poulose, aneesh.kumar, liam,
Paolo Bonzini, Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-25-91ab5a8b19a4@google.com>
Ackerley Tng via B4 Relay <devnull+ackerleytng.google.com@kernel.org>
writes:
>
> [...snip...]
>
> @@ -1078,13 +1077,17 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
> }
>
> region->fd = -1;
> - if (backing_src_is_shared(src_type))
> + if (flags & KVM_MEM_GUEST_MEMFD && gmem_flags & GUEST_MEMFD_FLAG_MMAP) {
> + region->fd = kvm_dup(gmem_fd);
> + mmap_offset = gmem_offset;
> + } else if (backing_src_is_shared(src_type)) {
> region->fd = kvm_memfd_alloc(region->mmap_size,
> src_type == VM_MEM_SRC_SHARED_HUGETLB);
> + }
>
> - region->mmap_start = kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE,
> - vm_mem_backing_src_alias(src_type)->flag,
> - region->fd);
> + region->mmap_start = __kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE,
> + vm_mem_backing_src_alias(src_type)->flag,
> + region->fd, mmap_offset);
Sashiko pointed out these:
1. When mmap() is done for region->mmap_alias, it doesn't use
mmap_offset. I'll fix that in the next revision.
2. mmap() may map past the end of the guest_memfd if, due to alignment,
the mmap_size is increased. That is true, but I feel that that fix
should go with a bigger clean up for vm_mem_add().
3. vm_mem_backing_src_alias(src_type)->flag may contain incompatible
mmap flags. This is true. For now, when guest_memfd is used with
vm_mem_add, the src_type passed has to be VM_MEM_SRC_SHMEM. I think
this also falls in the category of doing a bigger clean up for
vm_mem_add().
>
> [...snip...]
>
^ permalink raw reply
* [PATCH v2 5/5] KVM: SNP: Mark source page dirty in sev_gmem_post_populate
From: Ackerley Tng via B4 Relay @ 2026-05-22 22:46 UTC (permalink / raw)
To: Sean Christopherson, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
Kiryl Shutsemau, Rick Edgecombe, Vishal Annapurve, Yan Zhao,
Michael Roth, Isaku Yamahata, Chao Peng, Xiaoyao Li, Zongyao Chen
Cc: kvm, linux-kernel, linux-coco, Yu Zhang, Fuad Tabba, Ackerley Tng
In-Reply-To: <20260522-fix-sev-gmem-post-populate-v2-0-3f196bfad5a1@google.com>
From: Ackerley Tng <ackerleytng@google.com>
Mark the folio as dirty after copying data into the source page in
sev_gmem_post_populate. After the memcpy, failing to mark the page dirty
can lead to the memory management subsystem discarding the changes if the
page is reclaimed or otherwise processed by the swap subsystem.
Fixes: 2a62345b3052 ("KVM: guest_memfd: GUP source pages prior to populating guest memory")
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
arch/x86/kvm/svm/sev.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index dbf75326a40f4..1a361f08c7a3d 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2395,6 +2395,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
void *dst_vaddr = kmap_local_pfn(pfn);
memcpy(src_vaddr, dst_vaddr, PAGE_SIZE);
+ folio_mark_dirty(page_folio(src_page));
kunmap_local(dst_vaddr);
kunmap_local(src_vaddr);
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* [PATCH v2 4/5] KVM: SNP: Fix kunmap_local() unmapping order
From: Ackerley Tng via B4 Relay @ 2026-05-22 22:46 UTC (permalink / raw)
To: Sean Christopherson, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
Kiryl Shutsemau, Rick Edgecombe, Vishal Annapurve, Yan Zhao,
Michael Roth, Isaku Yamahata, Chao Peng, Xiaoyao Li, Zongyao Chen
Cc: kvm, linux-kernel, linux-coco, Yu Zhang, Fuad Tabba, Ackerley Tng
In-Reply-To: <20260522-fix-sev-gmem-post-populate-v2-0-3f196bfad5a1@google.com>
From: Ackerley Tng <ackerleytng@google.com>
Mappings created with kmap_local_page() or kmap_local_pfn() must be
unmapped in the reverse order they were acquired, following a LIFO
(last-in, first-out) stack-based approach.
In sev_gmem_post_populate(), src_vaddr is mapped first and dst_vaddr is
mapped second. The current code incorrectly calls kunmap_local() for
src_vaddr before dst_vaddr.
Swap the kunmap_local() calls to ensure the mappings are released in the
correct order.
Fixes: 2a62345b3052 ("KVM: guest_memfd: GUP source pages prior to populating guest memory")
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
arch/x86/kvm/svm/sev.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 2f254c447923e..dbf75326a40f4 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2360,8 +2360,8 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
memcpy(dst_vaddr, src_vaddr, PAGE_SIZE);
- kunmap_local(src_vaddr);
kunmap_local(dst_vaddr);
+ kunmap_local(src_vaddr);
}
ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, PG_LEVEL_4K,
@@ -2396,8 +2396,8 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
memcpy(src_vaddr, dst_vaddr, PAGE_SIZE);
- kunmap_local(src_vaddr);
kunmap_local(dst_vaddr);
+ kunmap_local(src_vaddr);
}
out:
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* [PATCH v2 1/5] KVM: guest_memfd: Use write permissions when GUP-ing source pages
From: Ackerley Tng via B4 Relay @ 2026-05-22 22:46 UTC (permalink / raw)
To: Sean Christopherson, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
Kiryl Shutsemau, Rick Edgecombe, Vishal Annapurve, Yan Zhao,
Michael Roth, Isaku Yamahata, Chao Peng, Xiaoyao Li, Zongyao Chen
Cc: kvm, linux-kernel, linux-coco, Yu Zhang, Fuad Tabba, Ackerley Tng
In-Reply-To: <20260522-fix-sev-gmem-post-populate-v2-0-3f196bfad5a1@google.com>
From: Sean Christopherson <seanjc@google.com>
sev_gmem_post_populate() may write to the source page if there was an error
while performing SNP_LAUNCH_UPDATE.
Since GUP requested only reads, there is a chance sev_gmem_post_populate()
could be writing to some read-only page.
sev_gmem_post_populate() will only ever write the source page if the type
of page being LAUNCH_UPDATEd is a CPUID page. Hence, request a writable
page only when loading the CPUID page.
Since TDX never writes to the source page, always pass false to
kvm_gmem_populate().
With this, even if a read-only mapping or the global zero page was provided
as the source page, GUP will do a copy-on-write, making it writable before
the write happens in sev_gmem_post_populate().
Fixes: 2a62345b30529 ("KVM: guest_memfd: GUP source pages prior to populating guest memory")
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
arch/x86/kvm/svm/sev.c | 1 +
arch/x86/kvm/vmx/tdx.c | 2 +-
include/linux/kvm_host.h | 3 ++-
virt/kvm/guest_memfd.c | 6 ++++--
4 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 940b97d4a8523..2f254c447923e 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2469,6 +2469,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
sev_populate_args.type = params.type;
count = kvm_gmem_populate(kvm, params.gfn_start, src, npages,
+ params.type == KVM_SEV_SNP_PAGE_TYPE_CPUID,
sev_gmem_post_populate, &sev_populate_args);
if (count < 0) {
argp->error = sev_populate_args.fw_error;
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index b8c3d3d8bbfe5..00dcfcbc47f68 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3185,7 +3185,7 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
};
gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
u64_to_user_ptr(region.source_addr),
- 1, tdx_gmem_post_populate, &arg);
+ 1, false, tdx_gmem_post_populate, &arg);
if (gmem_ret < 0) {
ret = gmem_ret;
break;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4c14aee1fb063..2c5ad9a6d5ce8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2596,7 +2596,8 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord
typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
struct page *page, void *opaque);
-long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
+long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src,
+ long npages, bool may_writeback_src,
kvm_gmem_populate_cb post_populate, void *opaque);
#endif
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 69c9d6d546b28..07d8db344872b 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -858,7 +858,8 @@ static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
return ret;
}
-long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
+long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src,
+ long npages, bool may_writeback_src,
kvm_gmem_populate_cb post_populate, void *opaque)
{
struct kvm_memory_slot *slot;
@@ -892,8 +893,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
if (src) {
unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE;
+ unsigned int flags = may_writeback_src ? FOLL_WRITE : 0;
- ret = get_user_pages_fast(uaddr, 1, 0, &src_page);
+ ret = get_user_pages_fast(uaddr, 1, flags, &src_page);
if (ret < 0)
break;
if (ret != 1) {
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* [PATCH v2 2/5] KVM: guest_memfd: Fix possible signed integer overflow
From: Ackerley Tng via B4 Relay @ 2026-05-22 22:46 UTC (permalink / raw)
To: Sean Christopherson, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
Kiryl Shutsemau, Rick Edgecombe, Vishal Annapurve, Yan Zhao,
Michael Roth, Isaku Yamahata, Chao Peng, Xiaoyao Li, Zongyao Chen
Cc: kvm, linux-kernel, linux-coco, Yu Zhang, Fuad Tabba, Ackerley Tng
In-Reply-To: <20260522-fix-sev-gmem-post-populate-v2-0-3f196bfad5a1@google.com>
From: Sean Christopherson <seanjc@google.com>
The caller, kvm_set_memory_region(), checks for overflow using the unsigned
u64 guest_memfd_offset. When guest_memfd_offset is passed to
kvm_gmem_bind(), it is cast to a signed 64-bit integer (loff_t).
Hence, a large 64-bit offset could result in a negative loff_t, which could
result in the overflow checks failing.
Make kvm_gmem_bind() take u64 instead of loff_t to consistently deal with
unsigned values to avoid this issue.
Fixes: a7800aa80ea4d ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory")
Signed-off-by: Sean Christopherson <seanjc@google.com>
[Use size_t for size instead of u64]
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
virt/kvm/guest_memfd.c | 7 +++----
virt/kvm/kvm_mm.h | 4 ++--
2 files changed, 5 insertions(+), 6 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 07d8db344872b..d203135969d13 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -640,9 +640,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
}
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
- unsigned int fd, loff_t offset)
+ unsigned int fd, u64 offset)
{
- loff_t size = slot->npages << PAGE_SHIFT;
+ size_t size = slot->npages << PAGE_SHIFT;
unsigned long start, end;
struct gmem_file *f;
struct inode *inode;
@@ -664,8 +664,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
inode = file_inode(file);
- if (offset < 0 || !PAGE_ALIGNED(offset) ||
- offset + size > i_size_read(inode))
+ if (!PAGE_ALIGNED(offset) || offset + size > i_size_read(inode))
goto err;
filemap_invalidate_lock(inode->i_mapping);
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 9fcc5d5b7f8d0..8c2bbfba63424 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -72,7 +72,7 @@ int kvm_gmem_init(struct module *module);
void kvm_gmem_exit(void);
int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
- unsigned int fd, loff_t offset);
+ unsigned int fd, u64 offset);
void kvm_gmem_unbind(struct kvm_memory_slot *slot);
#else
static inline int kvm_gmem_init(struct module *module)
@@ -82,7 +82,7 @@ static inline int kvm_gmem_init(struct module *module)
static inline void kvm_gmem_exit(void) {};
static inline int kvm_gmem_bind(struct kvm *kvm,
struct kvm_memory_slot *slot,
- unsigned int fd, loff_t offset)
+ unsigned int fd, u64 offset)
{
WARN_ON_ONCE(1);
return -EIO;
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* [PATCH v2 3/5] KVM: guest_memfd: Handle errors from xa_store_range() when binding
From: Ackerley Tng via B4 Relay @ 2026-05-22 22:46 UTC (permalink / raw)
To: Sean Christopherson, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
Kiryl Shutsemau, Rick Edgecombe, Vishal Annapurve, Yan Zhao,
Michael Roth, Isaku Yamahata, Chao Peng, Xiaoyao Li, Zongyao Chen
Cc: kvm, linux-kernel, linux-coco, Yu Zhang, Fuad Tabba, Ackerley Tng
In-Reply-To: <20260522-fix-sev-gmem-post-populate-v2-0-3f196bfad5a1@google.com>
From: Ackerley Tng <ackerleytng@google.com>
Unhandled errors from xa_store_range() mean kvm_gmem_bind() might falsely
report success, leading to false assumptions in guest_memfd's lifecycle
later.
On error, restore the unbound state and return the error to userspace.
Fixes: a7800aa80ea4d ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory")
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
virt/kvm/guest_memfd.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index d203135969d13..5b4911ffa208a 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -648,6 +648,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
struct inode *inode;
struct file *file;
int r = -EINVAL;
+ void *result;
BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
@@ -688,7 +689,14 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
if (kvm_gmem_supports_mmap(inode))
slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
- xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
+ result = xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
+ if (xa_is_err(result)) {
+ r = xa_err(result);
+ xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);
+ } else {
+ r = 0;
+ }
+
filemap_invalidate_unlock(inode->i_mapping);
/*
@@ -696,7 +704,6 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
* not the other way 'round. Active bindings are invalidated if the
* file is closed before memslots are destroyed.
*/
- r = 0;
err:
fput(file);
return r;
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* [PATCH v2 0/5] guest_memfd fixes for bind and populate
From: Ackerley Tng via B4 Relay @ 2026-05-22 22:46 UTC (permalink / raw)
To: Sean Christopherson, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
Kiryl Shutsemau, Rick Edgecombe, Vishal Annapurve, Yan Zhao,
Michael Roth, Isaku Yamahata, Chao Peng, Xiaoyao Li, Zongyao Chen
Cc: kvm, linux-kernel, linux-coco, Yu Zhang, Fuad Tabba, Ackerley Tng
This series is a group of fixes for the bind and populate flows for
guest_memfd, and fixes some issues reported by Sashiko after reviewing the
guest_memfd in-place conversions series [1] and another fixup series Sean
posted [3].
Changes in v2:
+ Add patches 4 and 5 to fix more issues, see below
+ Also update stub for kvm_gmem_bind()
Sashiko pointed out
+ Possible write to read-only page [1]
=> Fixed in patch 1
+ Signed integer overflow in kvm_gmem_bind() twice: [2][3]
=> Fixed in patch 2
+ Unchecked xa_store_range() [3]
=> Fixed in patch 3
+ Ordering issue with kmap_* and kunmap_* in sev_gmem_post_populate() [4]
=> Fixed in patch 4
+ Source page not marked dirty in sev_gmem_post_populate() [5]
=> Fixed in patch 5
[1] https://lore.kernel.org/all/CA+EHjTwrygfMrZZSw4y7-ry8fidW2x0C7iuF2Q=dnPNHUmNtUg@mail.gmail.com/
[2] https://lore.kernel.org/all/CA+EHjTxcadguOfOo7RpJVtAzcY5JAFZTbrAT_wcN6akMi8gCUg@mail.gmail.com/
[3] https://lore.kernel.org/all/20260522180530.EE9101F00A3E@smtp.kernel.org/
[4] https://sashiko.dev/#/patchset/20260507-gmem-inplace-conversion-v6-0-91ab5a8b19a4%40google.com?part=21
[5] https://sashiko.dev/#/patchset/20260522-fix-sev-gmem-post-populate-v1-0-9fc8d6437b65%40google.com?part=1
v1: https://lore.kernel.org/r/20260522-fix-sev-gmem-post-populate-v1-0-9fc8d6437b65@google.com
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
Ackerley Tng (3):
KVM: guest_memfd: Handle errors from xa_store_range() when binding
KVM: SNP: Fix kunmap_local() unmapping order
KVM: SNP: Mark source page dirty in sev_gmem_post_populate
Sean Christopherson (2):
KVM: guest_memfd: Use write permissions when GUP-ing source pages
KVM: guest_memfd: Fix possible signed integer overflow
arch/x86/kvm/svm/sev.c | 6 ++++--
arch/x86/kvm/vmx/tdx.c | 2 +-
include/linux/kvm_host.h | 3 ++-
virt/kvm/guest_memfd.c | 24 ++++++++++++++++--------
virt/kvm/kvm_mm.h | 4 ++--
5 files changed, 25 insertions(+), 14 deletions(-)
---
base-commit: b7fbe9a1bf9ee6c967ef77d366ca58c35fcf1887
change-id: 20260522-fix-sev-gmem-post-populate-a36bef7f0698
Best regards,
--
Ackerley Tng <ackerleytng@google.com>
^ permalink raw reply
* Re: [PATCH v6 01/43] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
From: Ackerley Tng @ 2026-05-22 21:45 UTC (permalink / raw)
To: Ackerley Tng via B4 Relay, aik, andrew.jones, binbin.wu, brauner,
chao.p.peng, david, ira.weiny, jmattson, jthoughton, michael.roth,
oupton, pankaj.gupta, qperret, rick.p.edgecombe, rientjes,
shivankg, steven.price, tabba, willy, wyihan, yan.y.zhao,
forkloop, pratyush, suzuki.poulose, aneesh.kumar, liam,
Paolo Bonzini, Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-1-91ab5a8b19a4@google.com>
Ackerley Tng via B4 Relay <devnull+ackerleytng.google.com@kernel.org>
writes:
>
> [...snip...]
>
> +static int kvm_gmem_init_inode(struct inode *inode, loff_t size, u64 flags)
> +{
>
> [...snip...]
>
> + filemap_invalidate_lock(inode->i_mapping);
> + r = mas_store_gfp(&mas, xa_mk_value(attrs), GFP_KERNEL);
Sashiko says that using GFP_KERNEL for this attributes maple tree could
allow a process that creates a very fragmented maple tree to consume lots
of memory not charged to any memcg, and proposes using GFP_KERNEL_ACCOUNT.
The problem with using GFP_KERNEL_ACCOUNT is that the maple tree nodes
are allocated from a shared kmem_cache maple_node_cache. Allocating the
maple tree nodes using GFP_KERNEL_ACCOUNT would mean that the node could
be reused by other maple trees unrelated to this process, and so the
nodes might long outlive the process using this guest_memfd, keeping the
memcg alive far longer than the VM.
For now I think it's okay to stick with GFP_KERNEL? Does anyone else
have suggestions on how to solve this?
> + filemap_invalidate_unlock(inode->i_mapping);
> +
> + return r;
> +}
>
> [...snip...]
>
^ permalink raw reply
* [PATCH 3/3] KVM: guest_memfd: Handle errors from xa_store_range() when binding
From: Ackerley Tng via B4 Relay @ 2026-05-22 20:45 UTC (permalink / raw)
To: Sean Christopherson, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
Kiryl Shutsemau, Rick Edgecombe, Vishal Annapurve, Yan Zhao,
Michael Roth, Isaku Yamahata, Chao Peng, Xiaoyao Li, Zongyao Chen
Cc: kvm, linux-kernel, linux-coco, Yu Zhang, Fuad Tabba, Ackerley Tng
In-Reply-To: <20260522-fix-sev-gmem-post-populate-v1-0-9fc8d6437b65@google.com>
From: Ackerley Tng <ackerleytng@google.com>
Unhandled errors from xa_store_range() mean kvm_gmem_bind() might falsely
report success, leading to false assumptions in guest_memfd's lifecycle
later.
Handle these errors by checking and returning the error to userspace.
Fixes: a7800aa80ea4d ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory")
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
virt/kvm/guest_memfd.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index d203135969d13..104f0f3d6a0b3 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -648,6 +648,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
struct inode *inode;
struct file *file;
int r = -EINVAL;
+ void *result;
BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
@@ -688,7 +689,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
if (kvm_gmem_supports_mmap(inode))
slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
- xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
+ result = xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
filemap_invalidate_unlock(inode->i_mapping);
/*
@@ -696,7 +697,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
* not the other way 'round. Active bindings are invalidated if the
* file is closed before memslots are destroyed.
*/
- r = 0;
+ r = xa_is_err(result) ? xa_err(result) : 0;
err:
fput(file);
return r;
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* [PATCH 1/3] KVM: guest_memfd: Use write permissions when GUP-ing source pages
From: Ackerley Tng via B4 Relay @ 2026-05-22 20:45 UTC (permalink / raw)
To: Sean Christopherson, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
Kiryl Shutsemau, Rick Edgecombe, Vishal Annapurve, Yan Zhao,
Michael Roth, Isaku Yamahata, Chao Peng, Xiaoyao Li, Zongyao Chen
Cc: kvm, linux-kernel, linux-coco, Yu Zhang, Fuad Tabba, Ackerley Tng
In-Reply-To: <20260522-fix-sev-gmem-post-populate-v1-0-9fc8d6437b65@google.com>
From: Sean Christopherson <seanjc@google.com>
sev_gmem_post_populate() may write to the source page if there was an error
while performing SNP_LAUNCH_UPDATE.
Since GUP requested only reads, there is a chance sev_gmem_post_populate()
could be writing to some read-only page.
sev_gmem_post_populate() will only ever write the source page if the type
of page being LAUNCH_UPDATEd is a CPUID page. Hence, request a writable
page only when loading the CPUID page.
Since TDX never writes to the source page, always pass false to
kvm_gmem_populate().
With this, even if a read-only mapping or the global zero page was provided
as the source page, GUP will do a copy-on-write, making it writable before
the write happens in sev_gmem_post_populate().
Fixes: 2a62345b30529 ("KVM: guest_memfd: GUP source pages prior to populating guest memory")
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
arch/x86/kvm/svm/sev.c | 1 +
arch/x86/kvm/vmx/tdx.c | 2 +-
include/linux/kvm_host.h | 3 ++-
virt/kvm/guest_memfd.c | 6 ++++--
4 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 940b97d4a8523..2f254c447923e 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2469,6 +2469,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
sev_populate_args.type = params.type;
count = kvm_gmem_populate(kvm, params.gfn_start, src, npages,
+ params.type == KVM_SEV_SNP_PAGE_TYPE_CPUID,
sev_gmem_post_populate, &sev_populate_args);
if (count < 0) {
argp->error = sev_populate_args.fw_error;
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index b8c3d3d8bbfe5..00dcfcbc47f68 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3185,7 +3185,7 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
};
gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
u64_to_user_ptr(region.source_addr),
- 1, tdx_gmem_post_populate, &arg);
+ 1, false, tdx_gmem_post_populate, &arg);
if (gmem_ret < 0) {
ret = gmem_ret;
break;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4c14aee1fb063..2c5ad9a6d5ce8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2596,7 +2596,8 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord
typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
struct page *page, void *opaque);
-long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
+long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src,
+ long npages, bool may_writeback_src,
kvm_gmem_populate_cb post_populate, void *opaque);
#endif
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 69c9d6d546b28..07d8db344872b 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -858,7 +858,8 @@ static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
return ret;
}
-long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
+long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src,
+ long npages, bool may_writeback_src,
kvm_gmem_populate_cb post_populate, void *opaque)
{
struct kvm_memory_slot *slot;
@@ -892,8 +893,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
if (src) {
unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE;
+ unsigned int flags = may_writeback_src ? FOLL_WRITE : 0;
- ret = get_user_pages_fast(uaddr, 1, 0, &src_page);
+ ret = get_user_pages_fast(uaddr, 1, flags, &src_page);
if (ret < 0)
break;
if (ret != 1) {
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* [PATCH 0/3] guest_memfd fixes for bind and populate
From: Ackerley Tng via B4 Relay @ 2026-05-22 20:45 UTC (permalink / raw)
To: Sean Christopherson, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
Kiryl Shutsemau, Rick Edgecombe, Vishal Annapurve, Yan Zhao,
Michael Roth, Isaku Yamahata, Chao Peng, Xiaoyao Li, Zongyao Chen
Cc: kvm, linux-kernel, linux-coco, Yu Zhang, Fuad Tabba, Ackerley Tng
This series is a group of fixes for the bind and populate flows for
guest_memfd, and fixes some issues reported by Sashiko after reviewing the
guest_memfd in-place conversions series [1] and another fixup series Sean
posted [3].
Sashiko pointed out
+ Possible write to read-only page [1]
=> Fixed in patch 1
+ Signed integer overflow in kvm_gmem_bind() twice: [2][3]
=> Fixed in patch 2
+ Unchecked xa_store_range() [3]
=> Fixed in patch 3
[1] https://lore.kernel.org/all/CA+EHjTwrygfMrZZSw4y7-ry8fidW2x0C7iuF2Q=dnPNHUmNtUg@mail.gmail.com/
[2] https://lore.kernel.org/all/CA+EHjTxcadguOfOo7RpJVtAzcY5JAFZTbrAT_wcN6akMi8gCUg@mail.gmail.com/
[3] https://lore.kernel.org/all/20260522180530.EE9101F00A3E@smtp.kernel.org/
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
Ackerley Tng (1):
KVM: guest_memfd: Handle errors from xa_store_range() when binding
Sean Christopherson (2):
KVM: guest_memfd: Use write permissions when GUP-ing source pages
KVM: guest_memfd: Fix possible signed integer overflow
arch/x86/kvm/svm/sev.c | 1 +
arch/x86/kvm/vmx/tdx.c | 2 +-
include/linux/kvm_host.h | 3 ++-
virt/kvm/guest_memfd.c | 18 ++++++++++--------
virt/kvm/kvm_mm.h | 2 +-
5 files changed, 15 insertions(+), 11 deletions(-)
---
base-commit: b7fbe9a1bf9ee6c967ef77d366ca58c35fcf1887
change-id: 20260522-fix-sev-gmem-post-populate-a36bef7f0698
Best regards,
--
Ackerley Tng <ackerleytng@google.com>
^ permalink raw reply
* [PATCH 2/3] KVM: guest_memfd: Fix possible signed integer overflow
From: Ackerley Tng via B4 Relay @ 2026-05-22 20:45 UTC (permalink / raw)
To: Sean Christopherson, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
Kiryl Shutsemau, Rick Edgecombe, Vishal Annapurve, Yan Zhao,
Michael Roth, Isaku Yamahata, Chao Peng, Xiaoyao Li, Zongyao Chen
Cc: kvm, linux-kernel, linux-coco, Yu Zhang, Fuad Tabba, Ackerley Tng
In-Reply-To: <20260522-fix-sev-gmem-post-populate-v1-0-9fc8d6437b65@google.com>
From: Sean Christopherson <seanjc@google.com>
The caller, kvm_set_memory_region(), checks for overflow using the unsigned
u64 guest_memfd_offset. When guest_memfd_offset is passed to
kvm_gmem_bind(), it is cast to a signed 64-bit integer (loff_t).
Hence, a large 64-bit offset could result in a negative loff_t, which could
result in the overflow checks failing.
Make kvm_gmem_bind() take u64 instead of loff_t to consistently deal with
unsigned values to avoid this issue.
Fixes: a7800aa80ea4d ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory")
Signed-off-by: Sean Christopherson <seanjc@google.com>
[Use size_t for size instead of u64]
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
virt/kvm/guest_memfd.c | 7 +++----
virt/kvm/kvm_mm.h | 2 +-
2 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 07d8db344872b..d203135969d13 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -640,9 +640,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
}
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
- unsigned int fd, loff_t offset)
+ unsigned int fd, u64 offset)
{
- loff_t size = slot->npages << PAGE_SHIFT;
+ size_t size = slot->npages << PAGE_SHIFT;
unsigned long start, end;
struct gmem_file *f;
struct inode *inode;
@@ -664,8 +664,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
inode = file_inode(file);
- if (offset < 0 || !PAGE_ALIGNED(offset) ||
- offset + size > i_size_read(inode))
+ if (!PAGE_ALIGNED(offset) || offset + size > i_size_read(inode))
goto err;
filemap_invalidate_lock(inode->i_mapping);
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 9fcc5d5b7f8d0..23813d74ce709 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -72,7 +72,7 @@ int kvm_gmem_init(struct module *module);
void kvm_gmem_exit(void);
int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
- unsigned int fd, loff_t offset);
+ unsigned int fd, u64 offset);
void kvm_gmem_unbind(struct kvm_memory_slot *slot);
#else
static inline int kvm_gmem_init(struct module *module)
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* Re: [PATCH v13 07/22] KVM: selftests: Introduce structures for TDX guest boot parameters
From: Yosry Ahmed @ 2026-05-22 17:43 UTC (permalink / raw)
To: Lisa Wang
Cc: Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao, Chenyi Qiang,
Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
Sagi Shahar, Sean Christopherson, Shuah Khan, Oliver Upton,
Jeremiah McReynolds, kvm, linux-coco, linux-kernel, x86
In-Reply-To: <20260521-tdx-selftests-v13-v13-7-6983ae4c3a4d@google.com>
> +static void __attribute__((used)) common(void)
> +{
> + OFFSET(TD_BOOT_PARAMETERS_CR0, td_boot_parameters, cr0);
> + OFFSET(TD_BOOT_PARAMETERS_CR3, td_boot_parameters, cr3);
> + OFFSET(TD_BOOT_PARAMETERS_CR4, td_boot_parameters, cr4);
> + OFFSET(TD_BOOT_PARAMETERS_GDT, td_boot_parameters, gdtr);
> + OFFSET(TD_BOOT_PARAMETERS_IDT, td_boot_parameters, idtr);
> + OFFSET(TD_BOOT_PARAMETERS_PER_VCPU, td_boot_parameters, per_vcpu);
> + OFFSET(TD_PER_VCPU_PARAMETERS_ESP_GVA, td_per_vcpu_parameters, esp_gva);
> + OFFSET(TD_PER_VCPU_PARAMETERS_GUEST_CODE, td_per_vcpu_parameters,
> + guest_code);
> + DEFINE(SIZEOF_TD_PER_VCPU_PARAMETERS,
> + sizeof(struct td_per_vcpu_parameters));
> +}
This is neat.
Sean, is this the preferred way to expose offsets to asm files (or asm
code blocks) -- as opposed to say using .equ [*]?
If yes, I can rework my nVMX GPR fixes to use the same approach for
register offsets. I wonder if the non-TDX part of this patch (i.e.
Makefile stuff) can be split, then patch 6 and the Makefile stuff can
land independently and allow development on top.
I can also split them out and include them in the next version of my
series, then whichever series lands first will land the offsets
support.
WDYT?
[*]https://lore.kernel.org/kvm/20260518202514.2037078-2-yosry@kernel.org/
^ permalink raw reply
* Re: [PATCH] x86/tdx: Fix zero-extension for CPUID emulation
From: Kiryl Shutsemau @ 2026-05-22 16:54 UTC (permalink / raw)
To: Dave Hansen
Cc: Edgecombe, Rick P, linux-coco@lists.linux.dev, clopez@suse.de,
x86@kernel.org, ak@linux.intel.com, bp@alien8.de,
dave.hansen@linux.intel.com, hpa@zytor.com, mingo@redhat.com,
linux-kernel@vger.kernel.org, Luck, Tony, tglx@kernel.org,
stable@vger.kernel.org, kvm@vger.kernel.org
In-Reply-To: <7f7b8bfd-f39e-417c-991f-d224d58cb52a@intel.com>
On Tue, May 12, 2026 at 03:14:54PM -0700, Dave Hansen wrote:
> On 5/12/26 14:48, Edgecombe, Rick P wrote:
> >> - regs->ax = args.r12;
> >> - regs->bx = args.r13;
> >> - regs->cx = args.r14;
> >> - regs->dx = args.r15;
> >> + regs->ax = lower_32_bits(args.r12);
> >> + regs->bx = lower_32_bits(args.r13);
> >> + regs->cx = lower_32_bits(args.r14);
> >> + regs->dx = lower_32_bits(args.r15);
> >>
> > Can you explain the impact here? Why should the guest fixup what the VMM
> > emulates?
>
> Oh boy.
>
> args.r12-15 come from the VMM, right? So the VMM can put whatever it
> wants in there.
>
> CPUID (the instruction) is defined to fill in eax/ebx/ecx/edx. Those are
> 32-bit registers so the normal register rules apply: "32-bit operands
> generate a 32-bit result, zero-extended to a 64-bit result in the
> destination general-purpose register."
>
> So a properly-behaving CPUID implementation will always end up with the
> top 32 bits empty on the four CPUID registers after a CPUID is executed.
>
> The VMM here obviously might be naughty and might put gunk in
> args.r12/r13/r14/r15 that gets copied to ptregs->ax/bx/cx/dx which are
> 'unsigned long' on 64-bit.
>
> The end result is that a TDX guest can use CPUID and end up having bits
> set in rax/rbx/rcx/rdx that are architecturally impossible. This patch
> is effectively fixing up the VMM naughtiness before the guest CPUID
> instance can see it.
>
> Does anybody disagree with any of that?
Not really.
But note that the exposure is minimal as we do not issue hypercalls to
VMM for anything outside of hypervisor range. I am not sure stable@ is
justified, but worth fixing.
--
Kiryl Shutsemau / Kirill A. Shutemov
^ permalink raw reply
* Re: [PATCH v2 4/4] x86/virt/tdx: Move mk_keyed_paddr() to tdx.c due to no external users
From: Kiryl Shutsemau @ 2026-05-22 16:41 UTC (permalink / raw)
To: Yan Zhao
Cc: dave.hansen, pbonzini, seanjc, tglx, mingo, bp, x86, linux-kernel,
kvm, linux-coco, kai.huang, rick.p.edgecombe, yilun.xu,
vannapurve, ackerleytng, sagis, binbin.wu, xiaoyao.li,
isaku.yamahata
In-Reply-To: <20260430015014.24261-1-yan.y.zhao@intel.com>
On Thu, Apr 30, 2026 at 09:50:14AM +0800, Yan Zhao wrote:
> Move mk_keyed_paddr() from tdx.h to tdx.c to avoid unnecessary header
> inclusion and improve encapsulation since there are no users outside of
> tdx.c.
>
> No functional change intended.
Add a new line before SoB.
> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
Otherwise:
Acked-by: Kiryl Shutsemau <kas@kernel.org>
--
Kiryl Shutsemau / Kirill A. Shutemov
^ permalink raw reply
* Re: [PATCH v2 3/4] x86/tdx: Drop exported function tdx_quirk_reset_page()
From: Kiryl Shutsemau @ 2026-05-22 16:39 UTC (permalink / raw)
To: Yan Zhao
Cc: dave.hansen, pbonzini, seanjc, tglx, mingo, bp, x86, linux-kernel,
kvm, linux-coco, kai.huang, rick.p.edgecombe, yilun.xu,
vannapurve, ackerleytng, sagis, binbin.wu, xiaoyao.li,
isaku.yamahata
In-Reply-To: <20260430015001.24242-1-yan.y.zhao@intel.com>
On Thu, Apr 30, 2026 at 09:50:01AM +0800, Yan Zhao wrote:
> KVM invokes tdx_quirk_reset_page() to reset TDX control pages (including
> S-EPT pages, TDR page, etc.), as all those pages are allocated by KVM TDX
> and thus always have struct page.
>
> However, it's also reasonable for KVM to reset those TDX control pages via
> tdx_quirk_reset_paddr() directly, eliminating the need to export two
> parallel APIs. Keeping tdx_quirk_reset_page() as a one-line helper in the
> header file is also unnecessary.
>
> No functional change intended.
>
> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> Suggested-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
Acked-by: Kiryl Shutsemau <kas@kernel.org>
--
Kiryl Shutsemau / Kirill A. Shutemov
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox