* [RFC PATCH v1 01/37] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-27 13:27 ` Vlastimil Babka
2025-10-17 20:11 ` [RFC PATCH v1 02/37] KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES Ackerley Tng
` (35 subsequent siblings)
36 siblings, 1 reply; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
Start plumbing in guest_memfd support for in-place private<=>shared
conversions by tracking attributes via a maple tree. KVM currently tracks
private vs. shared attributes on a per-VM basis, which made sense when a
guest_memfd _only_ supported private memory, but tracking per-VM simply
can't work for in-place conversions as the shareability of a given page
needs to be per-gmem_inode, not per-VM.
Use the filemap invalidation lock to protect the maple tree, as taking the
lock for read when faulting in memory (for userspace or the guest) isn't
expected to result in meaningful contention, and using a separate lock
would add significant complexity (avoiding deadlock is quite difficult).
Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Vishal Annapurve <vannapurve@google.com>
Signed-off-by: Vishal Annapurve <vannapurve@google.com>
Co-developed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Fuad Tabba <tabba@google.com>
---
virt/kvm/guest_memfd.c | 119 +++++++++++++++++++++++++++++++++++------
1 file changed, 103 insertions(+), 16 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index b22caa8b530ab..26cec833766c3 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
+#include <linux/maple_tree.h>
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>
@@ -32,6 +33,7 @@ struct gmem_inode {
struct inode vfs_inode;
u64 flags;
+ struct maple_tree attributes;
};
static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
@@ -54,6 +56,23 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
}
+static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
+{
+ void *entry = mtree_load(&GMEM_I(inode)->attributes, index);
+
+ return WARN_ON_ONCE(!entry) ? 0 : xa_to_value(entry);
+}
+
+static bool kvm_gmem_is_private_mem(struct inode *inode, pgoff_t index)
+{
+ return kvm_gmem_get_attributes(inode, index) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+}
+
+static bool kvm_gmem_is_shared_mem(struct inode *inode, pgoff_t index)
+{
+ return !kvm_gmem_is_private_mem(inode, index);
+}
+
static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
pgoff_t index, struct folio *folio)
{
@@ -415,10 +434,13 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
return VM_FAULT_SIGBUS;
- if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
- return VM_FAULT_SIGBUS;
+ filemap_invalidate_lock_shared(inode->i_mapping);
+ if (kvm_gmem_is_shared_mem(inode, vmf->pgoff))
+ folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+ else
+ folio = ERR_PTR(-EACCES);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
- folio = kvm_gmem_get_folio(inode, vmf->pgoff);
if (IS_ERR(folio)) {
if (PTR_ERR(folio) == -EAGAIN)
return VM_FAULT_RETRY;
@@ -572,6 +594,46 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
return true;
}
+static int kvm_gmem_init_inode(struct inode *inode, loff_t size, u64 flags)
+{
+ struct gmem_inode *gi = GMEM_I(inode);
+ MA_STATE(mas, &gi->attributes, 0, (size >> PAGE_SHIFT) - 1);
+ u64 attrs;
+ int r;
+
+ inode->i_op = &kvm_gmem_iops;
+ inode->i_mapping->a_ops = &kvm_gmem_aops;
+ inode->i_mode |= S_IFREG;
+ inode->i_size = size;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+ mapping_set_inaccessible(inode->i_mapping);
+ /* Unmovable mappings are supposed to be marked unevictable as well. */
+ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+
+ gi->flags = flags;
+
+ mt_set_external_lock(&gi->attributes,
+ &inode->i_mapping->invalidate_lock);
+
+ /*
+ * Store default attributes for the entire gmem instance. Ensuring every
+ * index is represented in the maple tree at all times simplifies the
+ * conversion and merging logic.
+ */
+ attrs = gi->flags & GUEST_MEMFD_FLAG_INIT_SHARED ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+ /*
+ * Acquire the invalidation lock purely to make lockdep happy. There
+ * should be no races at this time since the inode hasn't yet been fully
+ * created.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+ r = mas_store_gfp(&mas, xa_mk_value(attrs), GFP_KERNEL);
+ filemap_invalidate_unlock(inode->i_mapping);
+
+ return r;
+}
+
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
static const char *name = "[kvm-gmem]";
@@ -602,16 +664,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
goto err_fops;
}
- inode->i_op = &kvm_gmem_iops;
- inode->i_mapping->a_ops = &kvm_gmem_aops;
- inode->i_mode |= S_IFREG;
- inode->i_size = size;
- mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
- mapping_set_inaccessible(inode->i_mapping);
- /* Unmovable mappings are supposed to be marked unevictable as well. */
- WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
-
- GMEM_I(inode)->flags = flags;
+ err = kvm_gmem_init_inode(inode, size, flags);
+ if (err)
+ goto err_inode;
file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
if (IS_ERR(file)) {
@@ -798,9 +853,13 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
if (!file)
return -EFAULT;
+ filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
+
folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
- if (IS_ERR(folio))
- return PTR_ERR(folio);
+ if (IS_ERR(folio)) {
+ r = PTR_ERR(folio);
+ goto out;
+ }
if (!is_prepared)
r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
@@ -812,6 +871,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
else
folio_put(folio);
+out:
+ filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
@@ -925,13 +986,39 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
mpol_shared_policy_init(&gi->policy, NULL);
+ /*
+ * Memory attributes are protected by the filemap invalidation lock, but
+ * the lock structure isn't available at this time. Immediately mark
+ * the maple tree as using external locking so that accessing the tree
+ * before it's fully initialized results in NULL pointer dereferences
+ * and not more subtle bugs.
+ */
+ mt_init_flags(&gi->attributes, MT_FLAGS_LOCK_EXTERN);
+
gi->flags = 0;
return &gi->vfs_inode;
}
static void kvm_gmem_destroy_inode(struct inode *inode)
{
- mpol_free_shared_policy(&GMEM_I(inode)->policy);
+ struct gmem_inode *gi = GMEM_I(inode);
+
+ mpol_free_shared_policy(&gi->policy);
+
+ /*
+ * Note! Checking for an empty tree is functionally necessary to avoid
+ * explosions if the tree hasn't been initialized, i.e. if the inode is
+ * being destroyed before guest_memfd can set the external lock.
+ */
+ if (!mtree_empty(&gi->attributes)) {
+ /*
+ * Acquire the invalidation lock purely to make lockdep happy,
+ * the inode is unreachable at this point.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+ __mt_destroy(&gi->attributes);
+ filemap_invalidate_unlock(inode->i_mapping);
+ }
}
static void kvm_gmem_free_inode(struct inode *inode)
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread* Re: [RFC PATCH v1 01/37] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
2025-10-17 20:11 ` [RFC PATCH v1 01/37] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings Ackerley Tng
@ 2025-10-27 13:27 ` Vlastimil Babka
0 siblings, 0 replies; 56+ messages in thread
From: Vlastimil Babka @ 2025-10-27 13:27 UTC (permalink / raw)
To: Ackerley Tng, cgroups, kvm, linux-doc, linux-fsdevel,
linux-kernel, linux-kselftest, linux-mm, linux-trace-kernel, x86,
Liam R. Howlett, maple-tree@lists.infradead.org
Cc: akpm, binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jroedel, jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, seanjc, shakeel.butt, shuah,
steven.price, steven.sistare, suzuki.poulose, tabba, tglx,
thomas.lendacky, vannapurve, viro, vkuznets, wei.w.wang, will,
willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu, yuzenghui,
zhiquan1.li
On 10/17/25 22:11, Ackerley Tng wrote:
> From: Sean Christopherson <seanjc@google.com>
>
> Start plumbing in guest_memfd support for in-place private<=>shared
> conversions by tracking attributes via a maple tree. KVM currently tracks
> private vs. shared attributes on a per-VM basis, which made sense when a
> guest_memfd _only_ supported private memory, but tracking per-VM simply
> can't work for in-place conversions as the shareability of a given page
> needs to be per-gmem_inode, not per-VM.
>
> Use the filemap invalidation lock to protect the maple tree, as taking the
> lock for read when faulting in memory (for userspace or the guest) isn't
> expected to result in meaningful contention, and using a separate lock
> would add significant complexity (avoid deadlock is quite difficult).
+Cc Liam and maple-tree list, especially for this part.
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Co-developed-by: Ackerley Tng <ackerleytng@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> Co-developed-by: Fuad Tabba <tabba@google.com>
> Signed-off-by: Fuad Tabba <tabba@google.com>
> ---
> virt/kvm/guest_memfd.c | 119 +++++++++++++++++++++++++++++++++++------
> 1 file changed, 103 insertions(+), 16 deletions(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index b22caa8b530ab..26cec833766c3 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -4,6 +4,7 @@
> #include <linux/falloc.h>
> #include <linux/fs.h>
> #include <linux/kvm_host.h>
> +#include <linux/maple_tree.h>
> #include <linux/mempolicy.h>
> #include <linux/pseudo_fs.h>
> #include <linux/pagemap.h>
> @@ -32,6 +33,7 @@ struct gmem_inode {
> struct inode vfs_inode;
>
> u64 flags;
> + struct maple_tree attributes;
> };
>
> static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
> @@ -54,6 +56,23 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
> return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
> }
>
> +static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
> +{
> + void *entry = mtree_load(&GMEM_I(inode)->attributes, index);
> +
> + return WARN_ON_ONCE(!entry) ? 0 : xa_to_value(entry);
> +}
> +
> +static bool kvm_gmem_is_private_mem(struct inode *inode, pgoff_t index)
> +{
> + return kvm_gmem_get_attributes(inode, index) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> +}
> +
> +static bool kvm_gmem_is_shared_mem(struct inode *inode, pgoff_t index)
> +{
> + return !kvm_gmem_is_private_mem(inode, index);
> +}
> +
> static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> pgoff_t index, struct folio *folio)
> {
> @@ -415,10 +434,13 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
> if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
> return VM_FAULT_SIGBUS;
>
> - if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
> - return VM_FAULT_SIGBUS;
> + filemap_invalidate_lock_shared(inode->i_mapping);
> + if (kvm_gmem_is_shared_mem(inode, vmf->pgoff))
> + folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> + else
> + folio = ERR_PTR(-EACCES);
> + filemap_invalidate_unlock_shared(inode->i_mapping);
>
> - folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> if (IS_ERR(folio)) {
> if (PTR_ERR(folio) == -EAGAIN)
> return VM_FAULT_RETRY;
> @@ -572,6 +594,46 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
> return true;
> }
>
> +static int kvm_gmem_init_inode(struct inode *inode, loff_t size, u64 flags)
> +{
> + struct gmem_inode *gi = GMEM_I(inode);
> + MA_STATE(mas, &gi->attributes, 0, (size >> PAGE_SHIFT) - 1);
> + u64 attrs;
> + int r;
> +
> + inode->i_op = &kvm_gmem_iops;
> + inode->i_mapping->a_ops = &kvm_gmem_aops;
> + inode->i_mode |= S_IFREG;
> + inode->i_size = size;
> + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> + mapping_set_inaccessible(inode->i_mapping);
> + /* Unmovable mappings are supposed to be marked unevictable as well. */
> + WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
> +
> + gi->flags = flags;
> +
> + mt_set_external_lock(&gi->attributes,
> + &inode->i_mapping->invalidate_lock);
> +
> + /*
> + * Store default attributes for the entire gmem instance. Ensuring every
> + * index is represented in the maple tree at all times simplifies the
> + * conversion and merging logic.
> + */
> + attrs = gi->flags & GUEST_MEMFD_FLAG_INIT_SHARED ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE;
> +
> + /*
> + * Acquire the invalidation lock purely to make lockdep happy. There
> + * should be no races at this time since the inode hasn't yet been fully
> + * created.
> + */
> + filemap_invalidate_lock(inode->i_mapping);
> + r = mas_store_gfp(&mas, xa_mk_value(attrs), GFP_KERNEL);
> + filemap_invalidate_unlock(inode->i_mapping);
> +
> + return r;
> +}
> +
> static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> {
> static const char *name = "[kvm-gmem]";
> @@ -602,16 +664,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> goto err_fops;
> }
>
> - inode->i_op = &kvm_gmem_iops;
> - inode->i_mapping->a_ops = &kvm_gmem_aops;
> - inode->i_mode |= S_IFREG;
> - inode->i_size = size;
> - mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> - mapping_set_inaccessible(inode->i_mapping);
> - /* Unmovable mappings are supposed to be marked unevictable as well. */
> - WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
> -
> - GMEM_I(inode)->flags = flags;
> + err = kvm_gmem_init_inode(inode, size, flags);
> + if (err)
> + goto err_inode;
>
> file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
> if (IS_ERR(file)) {
> @@ -798,9 +853,13 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> if (!file)
> return -EFAULT;
>
> + filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
> +
> folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
> - if (IS_ERR(folio))
> - return PTR_ERR(folio);
> + if (IS_ERR(folio)) {
> + r = PTR_ERR(folio);
> + goto out;
> + }
>
> if (!is_prepared)
> r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> @@ -812,6 +871,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> else
> folio_put(folio);
>
> +out:
> + filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
> return r;
> }
> EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
> @@ -925,13 +986,39 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
>
> mpol_shared_policy_init(&gi->policy, NULL);
>
> + /*
> + * Memory attributes are protected the filemap invalidation lock, but
> + * the lock structure isn't available at this time. Immediately mark
> + * maple tree as using external locking so that accessing the tree
> + * before its fully initialized results in NULL pointer dereferences
> + * and not more subtle bugs.
> + */
> + mt_init_flags(&gi->attributes, MT_FLAGS_LOCK_EXTERN);
> +
> gi->flags = 0;
> return &gi->vfs_inode;
> }
>
> static void kvm_gmem_destroy_inode(struct inode *inode)
> {
> - mpol_free_shared_policy(&GMEM_I(inode)->policy);
> + struct gmem_inode *gi = GMEM_I(inode);
> +
> + mpol_free_shared_policy(&gi->policy);
> +
> + /*
> + * Note! Checking for an empty tree is functionally necessary to avoid
> + * explosions if the tree hasn't been initialized, i.e. if the inode is
> + * being destroyed before guest_memfd can set the external lock.
> + */
> + if (!mtree_empty(&gi->attributes)) {
> + /*
> + * Acquire the invalidation lock purely to make lockdep happy,
> + * the inode is unreachable at this point.
> + */
> + filemap_invalidate_lock(inode->i_mapping);
> + __mt_destroy(&gi->attributes);
> + filemap_invalidate_unlock(inode->i_mapping);
> + }
> }
>
> static void kvm_gmem_free_inode(struct inode *inode)
> --
> 2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply [flat|nested] 56+ messages in thread
* [RFC PATCH v1 02/37] KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
2025-10-17 20:11 ` [RFC PATCH v1 01/37] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-17 20:11 ` [RFC PATCH v1 03/37] KVM: Enumerate support for PRIVATE memory iff kvm_arch_has_private_mem is defined Ackerley Tng
` (34 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
Rename the per-VM memory attributes Kconfig to make it explicitly about
per-VM attributes in anticipation of adding memory attributes support to
guest_memfd, at which point it will be possible (and desirable) to have
memory attributes without the per-VM support, even on x86.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/Kconfig | 6 +++---
arch/x86/kvm/mmu/mmu.c | 2 +-
arch/x86/kvm/x86.c | 2 +-
include/linux/kvm_host.h | 8 ++++----
include/trace/events/kvm.h | 4 ++--
virt/kvm/Kconfig | 2 +-
virt/kvm/kvm_main.c | 14 +++++++-------
8 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 48598d017d6f3..efb0b2e1808d5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2301,7 +2301,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
int tdp_max_root_level, int tdp_huge_page_level);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
#endif
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 4e43923656d0e..acb03b45ba050 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -84,7 +84,7 @@ config KVM_SW_PROTECTED_VM
bool "Enable support for KVM software-protected VMs"
depends on EXPERT
depends on KVM_X86 && X86_64
- select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select KVM_VM_MEMORY_ATTRIBUTES
help
Enable support for KVM software-protected VMs. Currently, software-
protected VMs are purely a development and testing vehicle for
@@ -134,7 +134,7 @@ config KVM_INTEL_TDX
bool "Intel Trust Domain Extensions (TDX) support"
default y
depends on INTEL_TDX_HOST
- select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select KVM_VM_MEMORY_ATTRIBUTES
select HAVE_KVM_ARCH_GMEM_POPULATE
help
Provides support for launching Intel Trust Domain Extensions (TDX)
@@ -158,7 +158,7 @@ config KVM_AMD_SEV
depends on KVM_AMD && X86_64
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
select ARCH_HAS_CC_PLATFORM
- select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select KVM_VM_MEMORY_ATTRIBUTES
select HAVE_KVM_ARCH_GMEM_PREPARE
select HAVE_KVM_ARCH_GMEM_INVALIDATE
select HAVE_KVM_ARCH_GMEM_POPULATE
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 667d66cf76d5e..e4542b37b0db6 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7808,7 +7808,7 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
}
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
int level)
{
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fe3dc3eb43312..5e38c4c9cf63c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13398,7 +13398,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
}
}
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
kvm_mmu_init_memslot_memory_attributes(kvm, slot);
#endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 680ca838f0181..fddb373fcbaaf 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -721,7 +721,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
}
#endif
-#ifndef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifndef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
{
return false;
@@ -871,7 +871,7 @@ struct kvm {
#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
struct notifier_block pm_notifier;
#endif
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
/* Protected by slots_lock (for writes) and RCU (for reads) */
struct xarray mem_attr_array;
#endif
@@ -2514,7 +2514,7 @@ static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
return slot->flags & KVM_MEMSLOT_GMEM_ONLY;
}
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
{
return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
@@ -2536,7 +2536,7 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
{
return false;
}
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
#ifdef CONFIG_KVM_GUEST_MEMFD
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index b282e3a867696..1ba72bd73ea2f 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -358,7 +358,7 @@ TRACE_EVENT(kvm_dirty_ring_exit,
TP_printk("vcpu %d", __entry->vcpu_id)
);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
/*
* @start: Starting address of guest memory range
* @end: End address of guest memory range
@@ -383,7 +383,7 @@ TRACE_EVENT(kvm_vm_set_mem_attributes,
TP_printk("%#016llx -- %#016llx [0x%lx]",
__entry->start, __entry->end, __entry->attr)
);
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
TRACE_EVENT(kvm_unmap_hva_range,
TP_PROTO(unsigned long start, unsigned long end),
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index a01cc5743137c..9dd7873114b59 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -108,7 +108,7 @@ config KVM_MMU_LOCKLESS_AGING
depends on KVM_GENERIC_MMU_NOTIFIER
bool
-config KVM_GENERIC_MEMORY_ATTRIBUTES
+config KVM_VM_MEMORY_ATTRIBUTES
depends on KVM_GENERIC_MMU_NOTIFIER
bool
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4845e5739436a..f73047ea4333e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1131,7 +1131,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
spin_lock_init(&kvm->mn_invalidate_lock);
rcuwait_init(&kvm->mn_memslots_update_rcuwait);
xa_init(&kvm->vcpu_array);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
xa_init(&kvm->mem_attr_array);
#endif
@@ -1322,7 +1322,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
cleanup_srcu_struct(&kvm->irq_srcu);
srcu_barrier(&kvm->srcu);
cleanup_srcu_struct(&kvm->srcu);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
xa_destroy(&kvm->mem_attr_array);
#endif
kvm_arch_free_vm(kvm);
@@ -2425,7 +2425,7 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
}
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static u64 kvm_supported_mem_attributes(struct kvm *kvm)
{
if (!kvm || kvm_arch_has_private_mem(kvm))
@@ -2630,7 +2630,7 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
}
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
@@ -4921,7 +4921,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_SYSTEM_EVENT_DATA:
case KVM_CAP_DEVICE_CTRL:
return 1;
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
case KVM_CAP_MEMORY_ATTRIBUTES:
return kvm_supported_mem_attributes(kvm);
#endif
@@ -5325,7 +5325,7 @@ static long kvm_vm_ioctl(struct file *filp,
break;
}
#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
case KVM_SET_MEMORY_ATTRIBUTES: {
struct kvm_memory_attributes attrs;
@@ -5336,7 +5336,7 @@ static long kvm_vm_ioctl(struct file *filp,
r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
break;
}
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
case KVM_CREATE_DEVICE: {
struct kvm_create_device cd;
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread* [RFC PATCH v1 03/37] KVM: Enumerate support for PRIVATE memory iff kvm_arch_has_private_mem is defined
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
2025-10-17 20:11 ` [RFC PATCH v1 01/37] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings Ackerley Tng
2025-10-17 20:11 ` [RFC PATCH v1 02/37] KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-17 20:11 ` [RFC PATCH v1 04/37] KVM: Stub in ability to disable per-VM memory attribute tracking Ackerley Tng
` (33 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
Explicitly guard reporting support for KVM_MEMORY_ATTRIBUTE_PRIVATE based
on kvm_arch_has_private_mem being #defined in anticipation of decoupling
kvm_supported_mem_attributes() from CONFIG_KVM_VM_MEMORY_ATTRIBUTES.
guest_memfd support for memory attributes will be unconditional to avoid
yet more macros (all architectures that support guest_memfd are expected to
use per-gmem attributes at some point), at which point enumerating support
for KVM_MEMORY_ATTRIBUTE_PRIVATE based solely on memory attributes being
supported _somewhere_ would result in KVM over-reporting support on arm64.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
include/linux/kvm_host.h | 2 +-
virt/kvm/kvm_main.c | 2 ++
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fddb373fcbaaf..21bf30e8d3cc1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -721,7 +721,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
}
#endif
-#ifndef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+#ifndef kvm_arch_has_private_mem
static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
{
return false;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f73047ea4333e..591795a3fa124 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2428,8 +2428,10 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static u64 kvm_supported_mem_attributes(struct kvm *kvm)
{
+#ifdef kvm_arch_has_private_mem
if (!kvm || kvm_arch_has_private_mem(kvm))
return KVM_MEMORY_ATTRIBUTE_PRIVATE;
+#endif
return 0;
}
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread* [RFC PATCH v1 04/37] KVM: Stub in ability to disable per-VM memory attribute tracking
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (2 preceding siblings ...)
2025-10-17 20:11 ` [RFC PATCH v1 03/37] KVM: Enumerate support for PRIVATE memory iff kvm_arch_has_private_mem is defined Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-17 20:11 ` [RFC PATCH v1 05/37] KVM: guest_memfd: Wire up kvm_get_memory_attributes() to per-gmem attributes Ackerley Tng
` (32 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
Introduce the basic infrastructure to allow per-VM memory attribute
tracking to be disabled. This will be built upon in a later patch, where a
module param can disable per-VM memory attribute tracking.
Split the Kconfig option into a base KVM_MEMORY_ATTRIBUTES and the
existing KVM_VM_MEMORY_ATTRIBUTES. The base option provides the core
plumbing, while the latter enables the full per-VM tracking via an xarray
and the associated ioctls.
kvm_get_memory_attributes() now performs a static call that either looks up
kvm->mem_attr_array when CONFIG_KVM_VM_MEMORY_ATTRIBUTES is enabled, or
just returns 0 otherwise. The static call can be patched depending on
whether per-VM tracking is enabled by the CONFIG.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/kvm_host.h | 2 +-
include/linux/kvm_host.h | 23 ++++++++++-------
virt/kvm/Kconfig | 6 ++++-
virt/kvm/kvm_main.c | 44 ++++++++++++++++++++++++++++++++-
4 files changed, 63 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index efb0b2e1808d5..197b28ae0e28c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2301,7 +2301,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
int tdp_max_root_level, int tdp_huge_page_level);
-#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
#endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 21bf30e8d3cc1..512febf47c265 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2514,19 +2514,15 @@ static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
return slot->flags & KVM_MEMSLOT_GMEM_ONLY;
}
-#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
+typedef unsigned long (kvm_get_memory_attributes_t)(struct kvm *kvm, gfn_t gfn);
+DECLARE_STATIC_CALL(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
+
static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
{
- return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
+ return static_call(__kvm_get_memory_attributes)(kvm, gfn);
}
-bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
- unsigned long mask, unsigned long attrs);
-bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
- struct kvm_gfn_range *range);
-bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
- struct kvm_gfn_range *range);
-
static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
{
return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
@@ -2536,6 +2532,15 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
{
return false;
}
+#endif
+
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+ unsigned long mask, unsigned long attrs);
+bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range);
+bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range);
#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
#ifdef CONFIG_KVM_GUEST_MEMFD
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 9dd7873114b59..395996977fe5a 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -108,10 +108,14 @@ config KVM_MMU_LOCKLESS_AGING
depends on KVM_GENERIC_MMU_NOTIFIER
bool
-config KVM_VM_MEMORY_ATTRIBUTES
+config KVM_MEMORY_ATTRIBUTES
depends on KVM_GENERIC_MMU_NOTIFIER
bool
+config KVM_VM_MEMORY_ATTRIBUTES
+ select KVM_MEMORY_ATTRIBUTES
+ bool
+
config KVM_GUEST_MEMFD
depends on KVM_GENERIC_MMU_NOTIFIER
select XARRAY_MULTI
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 591795a3fa124..6c29770dfa7c8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -101,6 +101,17 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_shrink);
static bool allow_unsafe_mappings;
module_param(allow_unsafe_mappings, bool, 0444);
+#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+static bool vm_memory_attributes = true;
+#else
+#define vm_memory_attributes false
+#endif
+DEFINE_STATIC_CALL_RET0(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_KEY(__kvm_get_memory_attributes));
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_TRAMP(__kvm_get_memory_attributes));
+#endif
+
/*
* Ordering of locks:
*
@@ -2425,7 +2436,7 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
}
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
-#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
static u64 kvm_supported_mem_attributes(struct kvm *kvm)
{
#ifdef kvm_arch_has_private_mem
@@ -2436,6 +2447,12 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
return 0;
}
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+static unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn)
+{
+ return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
+}
+
/*
* Returns true if _all_ gfns in the range [@start, @end) have attributes
* such that the bits in @mask match @attrs.
@@ -2632,7 +2649,24 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
}
+#else /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
+static unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn)
+{
+ BUILD_BUG_ON(1);
+}
#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
+static void kvm_init_memory_attributes(void)
+{
+ if (vm_memory_attributes)
+ static_call_update(__kvm_get_memory_attributes,
+ kvm_get_vm_memory_attributes);
+ else
+ static_call_update(__kvm_get_memory_attributes,
+ (void *)__static_call_return0);
+}
+#else /* CONFIG_KVM_MEMORY_ATTRIBUTES */
+static void kvm_init_memory_attributes(void) { }
+#endif /* CONFIG_KVM_MEMORY_ATTRIBUTES */
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
@@ -4925,6 +4959,9 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
return 1;
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
case KVM_CAP_MEMORY_ATTRIBUTES:
+ if (!vm_memory_attributes)
+ return 0;
+
return kvm_supported_mem_attributes(kvm);
#endif
#ifdef CONFIG_KVM_GUEST_MEMFD
@@ -5331,6 +5368,10 @@ static long kvm_vm_ioctl(struct file *filp,
case KVM_SET_MEMORY_ATTRIBUTES: {
struct kvm_memory_attributes attrs;
+ r = -ENOTTY;
+ if (!vm_memory_attributes)
+ goto out;
+
r = -EFAULT;
if (copy_from_user(&attrs, argp, sizeof(attrs)))
goto out;
@@ -6513,6 +6554,7 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
kvm_preempt_ops.sched_in = kvm_sched_in;
kvm_preempt_ops.sched_out = kvm_sched_out;
+ kvm_init_memory_attributes();
kvm_init_debug();
r = kvm_vfio_ops_init();
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread* [RFC PATCH v1 05/37] KVM: guest_memfd: Wire up kvm_get_memory_attributes() to per-gmem attributes
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (3 preceding siblings ...)
2025-10-17 20:11 ` [RFC PATCH v1 04/37] KVM: Stub in ability to disable per-VM memory attribute tracking Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-17 20:11 ` [RFC PATCH v1 06/37] KVM: guest_memfd: Update kvm_gmem_populate() to use gmem attributes Ackerley Tng
` (31 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
Implement kvm_gmem_get_memory_attributes() for guest_memfd to allow the KVM
core and architecture code to query per-GFN memory attributes.
kvm_gmem_get_memory_attributes() finds the memory slot for a given GFN and
queries the guest_memfd file's attributes to determine if the page is marked as
private.
If vm_memory_attributes is not enabled, there is no shared/private tracking
at the VM level. Install the guest_memfd implementation as long as
guest_memfd is enabled to give guest_memfd a chance to respond on
attributes.
guest_memfd should look up attributes regardless of whether this memslot is
gmem-only since attributes are now tracked by gmem regardless of whether
mmap() is enabled.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
include/linux/kvm_host.h | 2 ++
virt/kvm/guest_memfd.c | 29 +++++++++++++++++++++++++++++
virt/kvm/kvm_main.c | 3 +++
3 files changed, 34 insertions(+)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 512febf47c265..b8418cc5851f1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2543,6 +2543,8 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
struct kvm_gfn_range *range);
#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
+unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn);
+
#ifdef CONFIG_KVM_GUEST_MEMFD
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 26cec833766c3..f62facc3ab776 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -518,6 +518,35 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
+unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+
+ /*
+ * If this gfn has no associated memslot, there's no chance of the gfn
+ * being backed by private memory, since guest_memfd must be used for
+ * private memory, and guest_memfd must be associated with some memslot.
+ */
+ if (!slot)
+ return 0;
+
+ CLASS(gmem_get_file, file)(slot);
+ if (!file)
+ return false;
+
+ /*
+ * Don't take the filemap invalidation lock, as temporarily acquiring
+ * that lock wouldn't provide any meaningful protection. The caller
+ * _must_ protect consumption of private vs. shared by checking
+ * mmu_invalidate_retry_gfn() under mmu_lock.
+ */
+ guard(rcu)();
+
+ return kvm_gmem_get_attributes(file_inode(file),
+ kvm_gmem_get_index(slot, gfn));
+}
+EXPORT_SYMBOL_GPL(kvm_gmem_get_memory_attributes);
+
static struct file_operations kvm_gmem_fops = {
.mmap = kvm_gmem_mmap,
.open = generic_file_open,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6c29770dfa7c8..c73ebdb73070e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2660,6 +2660,9 @@ static void kvm_init_memory_attributes(void)
if (vm_memory_attributes)
static_call_update(__kvm_get_memory_attributes,
kvm_get_vm_memory_attributes);
+ else if (IS_ENABLED(CONFIG_KVM_GUEST_MEMFD))
+ static_call_update(__kvm_get_memory_attributes,
+ kvm_gmem_get_memory_attributes);
else
static_call_update(__kvm_get_memory_attributes,
(void *)__static_call_return0);
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread* [RFC PATCH v1 06/37] KVM: guest_memfd: Update kvm_gmem_populate() to use gmem attributes
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (4 preceding siblings ...)
2025-10-17 20:11 ` [RFC PATCH v1 05/37] KVM: guest_memfd: Wire up kvm_get_memory_attributes() to per-gmem attributes Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-17 20:11 ` [RFC PATCH v1 07/37] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2 Ackerley Tng
` (30 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
Update the guest_memfd populate() flow to pull memory attributes from the
gmem instance instead of the VM when KVM is not configured to track
shared/private status in the VM.
Rename the per-VM API to make it clear that it retrieves per-VM
attributes, i.e. is not suitable for use outside of flows that are
specific to generic per-VM attributes.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kvm/mmu/mmu.c | 2 +-
include/linux/kvm_host.h | 5 ++++-
virt/kvm/guest_memfd.c | 26 +++++++++++++++++++++++---
virt/kvm/kvm_main.c | 8 +++-----
4 files changed, 31 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e4542b37b0db6..52189853cf4ab 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7897,7 +7897,7 @@ static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
if (level == PG_LEVEL_2M)
- return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs);
+ return kvm_range_has_vm_memory_attributes(kvm, start, end, ~0, attrs);
for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
if (hugepage_test_mixed(slot, gfn, level - 1) ||
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b8418cc5851f1..b48632ee242b3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2535,12 +2535,15 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
#endif
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
-bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+extern bool vm_memory_attributes;
+bool kvm_range_has_vm_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
unsigned long mask, unsigned long attrs);
bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
struct kvm_gfn_range *range);
bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
struct kvm_gfn_range *range);
+#else
+#define vm_memory_attributes false
#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn);
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index f62facc3ab776..855e682041311 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -907,10 +907,30 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
+static bool kvm_gmem_range_is_private(struct gmem_inode *gi, pgoff_t index,
+ size_t nr_pages, struct kvm *kvm, gfn_t gfn)
+{
+ pgoff_t end = index + nr_pages - 1;
+ void *entry;
+
+ if (vm_memory_attributes)
+ return kvm_range_has_vm_memory_attributes(kvm, gfn, gfn + nr_pages,
+ KVM_MEMORY_ATTRIBUTE_PRIVATE,
+ KVM_MEMORY_ATTRIBUTE_PRIVATE);
+
+ mt_for_each(&gi->attributes, entry, index, end) {
+ if (xa_to_value(entry) != attributes)
+ return false;
+ }
+
+ return true;
+}
+
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
kvm_gmem_populate_cb post_populate, void *opaque)
{
struct kvm_memory_slot *slot;
+ struct gmem_inode *gi;
void __user *p;
int ret = 0, max_order;
@@ -929,6 +949,8 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
if (!file)
return -EFAULT;
+ gi = GMEM_I(file_inode(file));
+
filemap_invalidate_lock(file->f_mapping);
npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
@@ -962,9 +984,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
(npages - i) < (1 << max_order));
ret = -EINVAL;
- while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
- KVM_MEMORY_ATTRIBUTE_PRIVATE,
- KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
+ while (!kvm_gmem_range_is_private(gi, index, 1 << max_order, kvm, gfn)) {
if (!max_order)
goto put_folio_and_exit;
max_order--;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c73ebdb73070e..35166754a22b4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -103,9 +103,7 @@ module_param(allow_unsafe_mappings, bool, 0444);
#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
-static bool vm_memory_attributes = true;
-#else
-#define vm_memory_attributes false
+bool vm_memory_attributes = true;
#endif
DEFINE_STATIC_CALL_RET0(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_KEY(__kvm_get_memory_attributes));
@@ -2457,7 +2455,7 @@ static unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn)
* Returns true if _all_ gfns in the range [@start, @end) have attributes
* such that the bits in @mask match @attrs.
*/
-bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+bool kvm_range_has_vm_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
unsigned long mask, unsigned long attrs)
{
XA_STATE(xas, &kvm->mem_attr_array, start);
@@ -2591,7 +2589,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
mutex_lock(&kvm->slots_lock);
/* Nothing to do if the entire range has the desired attributes. */
- if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes))
+ if (kvm_range_has_vm_memory_attributes(kvm, start, end, ~0, attributes))
goto out_unlock;
/*
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread* [RFC PATCH v1 07/37] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (5 preceding siblings ...)
2025-10-17 20:11 ` [RFC PATCH v1 06/37] KVM: guest_memfd: Update kvm_gmem_populate() to use gmem attributes Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-22 15:21 ` Steven Price
2025-10-17 20:11 ` [RFC PATCH v1 08/37] KVM: guest_memfd: Don't set FGP_ACCESSED when getting folios Ackerley Tng
` (29 subsequent siblings)
36 siblings, 1 reply; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
Introduce a "version 2" of KVM_SET_MEMORY_ATTRIBUTES to support returning
information back to userspace.
This new ioctl and structure will, in a later patch, be shared as a
guest_memfd ioctl, where the padding in the new kvm_memory_attributes2
structure will be for writing the response from the guest_memfd ioctl to
userspace.
A new ioctl is necessary for these reasons:
1. KVM_SET_MEMORY_ATTRIBUTES is currently a write-only ioctl and does not
allow userspace to read fields. There's nothing in code (yet?) that
validates this, but using _IOWR for consistency would be prudent.
2. KVM_SET_MEMORY_ATTRIBUTES, when used as a guest_memfd ioctl, will need
an additional field to provide userspace with more error details.
Alternatively, a completely new ioctl could be defined, unrelated to
KVM_SET_MEMORY_ATTRIBUTES, but using the same ioctl number and struct for
the vm and guest_memfd ioctls streamlines the interface for userspace. In
addition, any memory attributes, implemented on the vm or guest_memfd
ioctl, can be easily shared with the other.
Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
Documentation/virt/kvm/api.rst | 32 +++++++++++++++++++++++++++++++
include/uapi/linux/kvm.h | 12 ++++++++++++
virt/kvm/kvm_main.c | 35 +++++++++++++++++++++++++++++++---
3 files changed, 76 insertions(+), 3 deletions(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 754b662a453c3..a812769d79bf6 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6355,6 +6355,8 @@ S390:
Returns -EINVAL if the VM has the KVM_VM_S390_UCONTROL flag set.
Returns -EINVAL if called on a protected VM.
+.. _KVM_SET_MEMORY_ATTRIBUTES:
+
4.141 KVM_SET_MEMORY_ATTRIBUTES
-------------------------------
@@ -6512,6 +6514,36 @@ the capability to be present.
`flags` must currently be zero.
+4.144 KVM_SET_MEMORY_ATTRIBUTES2
+---------------------------------
+
+:Capability: KVM_CAP_MEMORY_ATTRIBUTES2
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_memory_attributes2 (in/out)
+:Returns: 0 on success, <0 on error
+
+KVM_SET_MEMORY_ATTRIBUTES2 is an extension to
+KVM_SET_MEMORY_ATTRIBUTES that supports returning (writing) values to
+userspace. The original (pre-extension) fields are shared with
+KVM_SET_MEMORY_ATTRIBUTES identically.
+
+Attribute values are shared with KVM_SET_MEMORY_ATTRIBUTES.
+
+::
+
+ struct kvm_memory_attributes2 {
+ __u64 address;
+ __u64 size;
+ __u64 attributes;
+ __u64 flags;
+ __u64 reserved[4];
+ };
+
+ #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3)
+
+See also: :ref:`KVM_SET_MEMORY_ATTRIBUTES`.
+
.. _kvm_run:
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 52f6000ab0208..c300e38c7c9cd 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -963,6 +963,7 @@ struct kvm_enable_cap {
#define KVM_CAP_RISCV_MP_STATE_RESET 242
#define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243
#define KVM_CAP_GUEST_MEMFD_FLAGS 244
+#define KVM_CAP_MEMORY_ATTRIBUTES2 245
struct kvm_irq_routing_irqchip {
__u32 irqchip;
@@ -1617,4 +1618,15 @@ struct kvm_pre_fault_memory {
__u64 padding[5];
};
+/* Available with KVM_CAP_MEMORY_ATTRIBUTES2 */
+#define KVM_SET_MEMORY_ATTRIBUTES2 _IOWR(KVMIO, 0xd6, struct kvm_memory_attributes2)
+
+struct kvm_memory_attributes2 {
+ __u64 address;
+ __u64 size;
+ __u64 attributes;
+ __u64 flags;
+ __u64 reserved[4];
+};
+
#endif /* __LINUX_KVM_H */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 35166754a22b4..dd84b377e46db 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2621,7 +2621,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
return r;
}
static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
- struct kvm_memory_attributes *attrs)
+ struct kvm_memory_attributes2 *attrs)
{
gfn_t start, end;
@@ -4959,6 +4959,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_DEVICE_CTRL:
return 1;
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+ case KVM_CAP_MEMORY_ATTRIBUTES2:
case KVM_CAP_MEMORY_ATTRIBUTES:
if (!vm_memory_attributes)
return 0;
@@ -5184,6 +5185,14 @@ do { \
sizeof_field(struct kvm_userspace_memory_region2, field)); \
} while (0)
+#define SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(field) \
+do { \
+ BUILD_BUG_ON(offsetof(struct kvm_set_memory_attributes, field) != \
+ offsetof(struct kvm_set_memory_attributes2, field)); \
+ BUILD_BUG_ON(sizeof_field(struct kvm_set_memory_attributes, field) != \
+ sizeof_field(struct kvm_set_memory_attributes2, field)); \
+} while (0)
+
static long kvm_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -5366,15 +5375,35 @@ static long kvm_vm_ioctl(struct file *filp,
}
#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+ case KVM_SET_MEMORY_ATTRIBUTES2:
case KVM_SET_MEMORY_ATTRIBUTES: {
- struct kvm_memory_attributes attrs;
+ struct kvm_memory_attributes2 attrs;
+ unsigned long size;
+
+ if (ioctl == KVM_SET_MEMORY_ATTRIBUTES) {
+ /*
+ * Fields beyond struct kvm_userspace_memory_region shouldn't be
+ * accessed, but avoid leaking kernel memory in case of a bug.
+ */
+ memset(&mem, 0, sizeof(mem));
+ size = sizeof(struct kvm_set_memory_attributes);
+ } else {
+ size = sizeof(struct kvm_set_memory_attributes2);
+ }
+
+ /* Ensure the common parts of the two structs are identical. */
+ SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(slot);
+ SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(flags);
+ SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(guest_phys_addr);
+ SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(memory_size);
+ SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(userspace_addr);
r = -ENOTTY;
if (!vm_memory_attributes)
goto out;
r = -EFAULT;
- if (copy_from_user(&attrs, argp, sizeof(attrs)))
+ if (copy_from_user(&attrs, argp, size))
goto out;
r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread* Re: [RFC PATCH v1 07/37] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2
2025-10-17 20:11 ` [RFC PATCH v1 07/37] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2 Ackerley Tng
@ 2025-10-22 15:21 ` Steven Price
2025-10-22 16:51 ` Ackerley Tng
0 siblings, 1 reply; 56+ messages in thread
From: Steven Price @ 2025-10-22 15:21 UTC (permalink / raw)
To: Ackerley Tng, cgroups, kvm, linux-doc, linux-fsdevel,
linux-kernel, linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: akpm, binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jroedel, jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, seanjc, shakeel.butt, shuah,
steven.sistare, suzuki.poulose, tabba, tglx, thomas.lendacky,
vannapurve, vbabka, viro, vkuznets, wei.w.wang, will, willy,
wyihan, xiaoyao.li, yan.y.zhao, yilun.xu, yuzenghui, zhiquan1.li
On 17/10/2025 21:11, Ackerley Tng wrote:
> Introduce a "version 2" of KVM_SET_MEMORY_ATTRIBUTES to support returning
> information back to userspace.
>
> This new ioctl and structure will, in a later patch, be shared as a
> guest_memfd ioctl, where the padding in the new kvm_memory_attributes2
> structure will be for writing the response from the guest_memfd ioctl to
> userspace.
>
> A new ioctl is necessary for these reasons:
>
> 1. KVM_SET_MEMORY_ATTRIBUTES is currently a write-only ioctl and does not
> allow userspace to read fields. There's nothing in code (yet?) that
> validates this, but using _IOWR for consistency would be prudent.
>
> 2. KVM_SET_MEMORY_ATTRIBUTES, when used as a guest_memfd ioctl, will need
> an additional field to provide userspace with more error details.
>
> Alternatively, a completely new ioctl could be defined, unrelated to
> KVM_SET_MEMORY_ATTRIBUTES, but using the same ioctl number and struct for
> the vm and guest_memfd ioctls streamlines the interface for userspace. In
> addition, any memory attributes, implemented on the vm or guest_memfd
> ioctl, can be easily shared with the other.
>
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> ---
> Documentation/virt/kvm/api.rst | 32 +++++++++++++++++++++++++++++++
> include/uapi/linux/kvm.h | 12 ++++++++++++
> virt/kvm/kvm_main.c | 35 +++++++++++++++++++++++++++++++---
> 3 files changed, 76 insertions(+), 3 deletions(-)
>
[...]
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 52f6000ab0208..c300e38c7c9cd 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
[...]
> @@ -5366,15 +5375,35 @@ static long kvm_vm_ioctl(struct file *filp,
> }
> #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
> #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> + case KVM_SET_MEMORY_ATTRIBUTES2:
> case KVM_SET_MEMORY_ATTRIBUTES: {
> - struct kvm_memory_attributes attrs;
> + struct kvm_memory_attributes2 attrs;
> + unsigned long size;
> +
> + if (ioctl == KVM_SET_MEMORY_ATTRIBUTES) {
> + /*
> + * Fields beyond struct kvm_userspace_memory_region shouldn't be
> + * accessed, but avoid leaking kernel memory in case of a bug.
> + */
> + memset(&mem, 0, sizeof(mem));
s/mem/attrs/g
> + size = sizeof(struct kvm_set_memory_attributes);
> + } else {
> + size = sizeof(struct kvm_set_memory_attributes2);
s/kvm_set_memory_attributes/kvm_memory_attributes/ (on both sizeof lines
above and in the SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD macro).
> + }
> +
> + /* Ensure the common parts of the two structs are identical. */
> + SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(slot);
> + SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(flags);
> + SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(guest_phys_addr);
> + SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(memory_size);
> + SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(userspace_addr);
The fields are:
* address
* size
* attributes
* flags
The list you've got appears to match struct kvm_userspace_memory_region
- copy/paste error?
Thanks,
Steve
>
> r = -ENOTTY;
> if (!vm_memory_attributes)
> goto out;
>
> r = -EFAULT;
> - if (copy_from_user(&attrs, argp, sizeof(attrs)))
> + if (copy_from_user(&attrs, argp, size))
> goto out;
>
> r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
^ permalink raw reply [flat|nested] 56+ messages in thread* Re: [RFC PATCH v1 07/37] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2
2025-10-22 15:21 ` Steven Price
@ 2025-10-22 16:51 ` Ackerley Tng
2025-10-22 22:45 ` Ackerley Tng
0 siblings, 1 reply; 56+ messages in thread
From: Ackerley Tng @ 2025-10-22 16:51 UTC (permalink / raw)
To: Steven Price, cgroups, kvm, linux-doc, linux-fsdevel,
linux-kernel, linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: akpm, binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jroedel, jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, seanjc, shakeel.butt, shuah,
steven.sistare, suzuki.poulose, tabba, tglx, thomas.lendacky,
vannapurve, vbabka, viro, vkuznets, wei.w.wang, will, willy,
wyihan, xiaoyao.li, yan.y.zhao, yilun.xu, yuzenghui, zhiquan1.li
Steven Price <steven.price@arm.com> writes:
> On 17/10/2025 21:11, Ackerley Tng wrote:
>>
>> [...snip...]
>>
>> @@ -5366,15 +5375,35 @@ static long kvm_vm_ioctl(struct file *filp,
>> }
>> #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
>> #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>> + case KVM_SET_MEMORY_ATTRIBUTES2:
>> case KVM_SET_MEMORY_ATTRIBUTES: {
>> - struct kvm_memory_attributes attrs;
>> + struct kvm_memory_attributes2 attrs;
>> + unsigned long size;
>> +
>> + if (ioctl == KVM_SET_MEMORY_ATTRIBUTES) {
>> + /*
>> + * Fields beyond struct kvm_userspace_memory_region shouldn't be
>> + * accessed, but avoid leaking kernel memory in case of a bug.
>> + */
>> + memset(&mem, 0, sizeof(mem));
>
> s/mem/attrs/g
>
>> + size = sizeof(struct kvm_set_memory_attributes);
>> + } else {
>> + size = sizeof(struct kvm_set_memory_attributes2);
>
> s/kvm_set_memory_attributes/kvm_memory_attributes/ (on both sizeof lines
> above and in the SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD macro).
>
>> + }
>> +
>> + /* Ensure the common parts of the two structs are identical. */
>> + SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(slot);
>> + SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(flags);
>> + SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(guest_phys_addr);
>> + SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(memory_size);
>> + SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(userspace_addr);
>
> The fields are:
> * address
> * size
> * attributes
> * flags
>
> The list you've got appears to match struct kvm_userspace_memory_region
> - copy/paste error?
>
Yes I did copy/paste this from KVM_SET_USER_MEMORY_REGION2.
Thanks for catching this! I missed out build-testing this with
CONFIG_KVM_VM_MEMORY_ATTRIBUTES.
I've done that and here's a replacement patch.
> Thanks,
> Steve
>
>>
>> [...snip...]
>>
From 31283972574bde2ffa1960d30c80286f8467c594 Mon Sep 17 00:00:00 2001
From: Ackerley Tng <ackerleytng@google.com>
Date: Thu, 16 Oct 2025 11:48:01 -0700
Subject: [PATCH] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2
Introduce a "version 2" of KVM_SET_MEMORY_ATTRIBUTES to support returning
information back to userspace.
This new ioctl and structure will, in a later patch, be shared as a
guest_memfd ioctl, where the padding in the new kvm_memory_attributes2
structure will be for writing the response from the guest_memfd ioctl to
userspace.
A new ioctl is necessary for these reasons:
1. KVM_SET_MEMORY_ATTRIBUTES is currently a write-only ioctl and does not
allow userspace to read fields. There's nothing in code (yet?) that
validates this, but using _IOWR for consistency would be prudent.
2. KVM_SET_MEMORY_ATTRIBUTES, when used as a guest_memfd ioctl, will need
an additional field to provide userspace with more error details.
Alternatively, a completely new ioctl could be defined, unrelated to
KVM_SET_MEMORY_ATTRIBUTES, but using the same ioctl number and struct for
the vm and guest_memfd ioctls streamlines the interface for userspace. In
addition, any memory attributes, implemented on the vm or guest_memfd
ioctl, can be easily shared with the other.
Suggested-by: Sean Christopherson <seanjc@google.com>
Change-Id: I50cd506d9a28bf68a90e659015603de579569bc1
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
Documentation/virt/kvm/api.rst | 32 ++++++++++++++++++++++++++++++++
include/uapi/linux/kvm.h | 12 ++++++++++++
virt/kvm/kvm_main.c | 34 +++++++++++++++++++++++++++++++---
3 files changed, 75 insertions(+), 3 deletions(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 754b662a453c3..a812769d79bf6 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6355,6 +6355,8 @@ S390:
Returns -EINVAL if the VM has the KVM_VM_S390_UCONTROL flag set.
Returns -EINVAL if called on a protected VM.
+.. _KVM_SET_MEMORY_ATTRIBUTES:
+
4.141 KVM_SET_MEMORY_ATTRIBUTES
-------------------------------
@@ -6512,6 +6514,36 @@ the capability to be present.
`flags` must currently be zero.
+4.144 KVM_SET_MEMORY_ATTRIBUTES2
+---------------------------------
+
+:Capability: KVM_CAP_MEMORY_ATTRIBUTES2
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_memory_attributes2 (in/out)
+:Returns: 0 on success, <0 on error
+
+KVM_SET_MEMORY_ATTRIBUTES2 is an extension to
+KVM_SET_MEMORY_ATTRIBUTES that supports returning (writing) values to
+userspace. The original (pre-extension) fields are shared with
+KVM_SET_MEMORY_ATTRIBUTES identically.
+
+Attribute values are shared with KVM_SET_MEMORY_ATTRIBUTES.
+
+::
+
+ struct kvm_memory_attributes2 {
+ __u64 address;
+ __u64 size;
+ __u64 attributes;
+ __u64 flags;
+ __u64 reserved[4];
+ };
+
+ #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3)
+
+See also: :ref:`KVM_SET_MEMORY_ATTRIBUTES`.
+
.. _kvm_run:
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 52f6000ab0208..c300e38c7c9cd 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -963,6 +963,7 @@ struct kvm_enable_cap {
#define KVM_CAP_RISCV_MP_STATE_RESET 242
#define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243
#define KVM_CAP_GUEST_MEMFD_FLAGS 244
+#define KVM_CAP_MEMORY_ATTRIBUTES2 245
struct kvm_irq_routing_irqchip {
__u32 irqchip;
@@ -1617,4 +1618,15 @@ struct kvm_pre_fault_memory {
__u64 padding[5];
};
+/* Available with KVM_CAP_MEMORY_ATTRIBUTES2 */
+#define KVM_SET_MEMORY_ATTRIBUTES2 _IOWR(KVMIO, 0xd6, struct kvm_memory_attributes2)
+
+struct kvm_memory_attributes2 {
+ __u64 address;
+ __u64 size;
+ __u64 attributes;
+ __u64 flags;
+ __u64 reserved[4];
+};
+
#endif /* __LINUX_KVM_H */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 35166754a22b4..95aa51b334a70 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2621,7 +2621,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
return r;
}
static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
- struct kvm_memory_attributes *attrs)
+ struct kvm_memory_attributes2 *attrs)
{
gfn_t start, end;
@@ -4959,6 +4959,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_DEVICE_CTRL:
return 1;
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+ case KVM_CAP_MEMORY_ATTRIBUTES2:
case KVM_CAP_MEMORY_ATTRIBUTES:
if (!vm_memory_attributes)
return 0;
@@ -5184,6 +5185,14 @@ do { \
sizeof_field(struct kvm_userspace_memory_region2, field)); \
} while (0)
+#define SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(field) \
+do { \
+ BUILD_BUG_ON(offsetof(struct kvm_memory_attributes, field) != \
+ offsetof(struct kvm_memory_attributes2, field)); \
+ BUILD_BUG_ON(sizeof_field(struct kvm_memory_attributes, field) != \
+ sizeof_field(struct kvm_memory_attributes2, field)); \
+} while (0)
+
static long kvm_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -5366,15 +5375,34 @@ static long kvm_vm_ioctl(struct file *filp,
}
#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+ case KVM_SET_MEMORY_ATTRIBUTES2:
case KVM_SET_MEMORY_ATTRIBUTES: {
- struct kvm_memory_attributes attrs;
+ struct kvm_memory_attributes2 attrs;
+ unsigned long size;
+
+ if (ioctl == KVM_SET_MEMORY_ATTRIBUTES) {
+ /*
+ * Fields beyond struct kvm_memory_attributes shouldn't be
+ * accessed, but avoid leaking kernel memory in case of a bug.
+ */
+ memset(&attrs, 0, sizeof(attrs));
+ size = sizeof(struct kvm_memory_attributes);
+ } else {
+ size = sizeof(struct kvm_memory_attributes2);
+ }
+
+ /* Ensure the common parts of the two structs are identical. */
+ SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(address);
+ SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(size);
+ SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(attributes);
+ SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(flags);
r = -ENOTTY;
if (!vm_memory_attributes)
goto out;
r = -EFAULT;
- if (copy_from_user(&attrs, argp, sizeof(attrs)))
+ if (copy_from_user(&attrs, argp, size))
goto out;
r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
--
2.51.0.915.g61a8936c21-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* Re: [RFC PATCH v1 07/37] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2
2025-10-22 16:51 ` Ackerley Tng
@ 2025-10-22 22:45 ` Ackerley Tng
2025-10-22 23:30 ` Sean Christopherson
0 siblings, 1 reply; 56+ messages in thread
From: Ackerley Tng @ 2025-10-22 22:45 UTC (permalink / raw)
To: Steven Price, cgroups, kvm, linux-doc, linux-fsdevel,
linux-kernel, linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: akpm, binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, seanjc, shakeel.butt, shuah,
suzuki.poulose, tabba, tglx, thomas.lendacky, vannapurve, vbabka,
viro, vkuznets, will, willy, wyihan, xiaoyao.li, yan.y.zhao,
yilun.xu, yuzenghui
Ackerley Tng <ackerleytng@google.com> writes:
Found another issue with KVM_CAP_MEMORY_ATTRIBUTES2.
KVM_CAP_MEMORY_ATTRIBUTES2 was defined to do the same thing as
KVM_CAP_MEMORY_ATTRIBUTES, but that's wrong since
KVM_CAP_MEMORY_ATTRIBUTES2 should indicate the presence of
KVM_SET_MEMORY_ATTRIBUTES2 and struct kvm_memory_attributes2.
Usage is kind of weird and I hope to get feedback on this as
well.
This describes the difference between the previous version of this patch
and the one attached below.
I also added this to the changelog
Add KVM_CAP_MEMORY_ATTRIBUTES2 to indicate that struct
kvm_memory_attributes2 exists and can be used with
KVM_SET_MEMORY_ATTRIBUTES2 via either the vm or guest_memfd ioctl.
Since KVM_SET_MEMORY_ATTRIBUTES2 is not limited to be used only with the vm
ioctl, return 1 for KVM_CAP_MEMORY_ATTRIBUTES2 as long as struct
kvm_memory_attributes2 and KVM_SET_MEMORY_ATTRIBUTES2 can be
used. KVM_CAP_MEMORY_ATTRIBUTES must still be used to actually get valid
attributes.
Handle KVM_CAP_MEMORY_ATTRIBUTES2 and return 1 regardless of
CONFIG_KVM_VM_MEMORY_ATTRIBUTES, since KVM_SET_MEMORY_ATTRIBUTES2 is not
limited to a vm ioctl and can also be used with the guest_memfd ioctl.
Here's the entire patch so hopefully it's easy to swap out this entire
patch over the original one.
From 8887ba58f6fd97c529c8152d6f18e5e26651dbec Mon Sep 17 00:00:00 2001
From: Ackerley Tng <ackerleytng@google.com>
Date: Thu, 16 Oct 2025 11:48:01 -0700
Subject: [PATCH] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2
Introduce a "version 2" of KVM_SET_MEMORY_ATTRIBUTES to support returning
information back to userspace.
This new ioctl and structure will, in a later patch, be shared as a
guest_memfd ioctl, where the padding in the new kvm_memory_attributes2
structure will be for writing the response from the guest_memfd ioctl to
userspace.
A new ioctl is necessary for these reasons:
1. KVM_SET_MEMORY_ATTRIBUTES is currently a write-only ioctl and does not
allow userspace to read fields. There's nothing in code (yet?) that
validates this, but using _IOWR for consistency would be prudent.
2. KVM_SET_MEMORY_ATTRIBUTES, when used as a guest_memfd ioctl, will need
an additional field to provide userspace with more error details.
Alternatively, a completely new ioctl could be defined, unrelated to
KVM_SET_MEMORY_ATTRIBUTES, but using the same ioctl number and struct for
the vm and guest_memfd ioctls streamlines the interface for userspace. In
addition, any memory attributes, implemented on the vm or guest_memfd
ioctl, can be easily shared with the other.
Add KVM_CAP_MEMORY_ATTRIBUTES2 to indicate that struct
kvm_memory_attributes2 exists and can be used with
KVM_SET_MEMORY_ATTRIBUTES2 via either the vm or guest_memfd ioctl.
Since KVM_SET_MEMORY_ATTRIBUTES2 is not limited to be used only with the vm
ioctl, return 1 for KVM_CAP_MEMORY_ATTRIBUTES2 as long as struct
kvm_memory_attributes2 and KVM_SET_MEMORY_ATTRIBUTES2 can be
used. KVM_CAP_MEMORY_ATTRIBUTES must still be used to actually get valid
attributes.
Handle KVM_CAP_MEMORY_ATTRIBUTES2 and return 1 regardless of
CONFIG_KVM_VM_MEMORY_ATTRIBUTES, since KVM_SET_MEMORY_ATTRIBUTES2 is not
limited to a vm ioctl and can also be used with the guest_memfd ioctl.
Suggested-by: Sean Christopherson <seanjc@google.com>
Change-Id: I50cd506d9a28bf68a90e659015603de579569bc1
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
Documentation/virt/kvm/api.rst | 32 ++++++++++++++++++++++++++++++++
include/uapi/linux/kvm.h | 12 ++++++++++++
virt/kvm/kvm_main.c | 34 +++++++++++++++++++++++++++++++---
3 files changed, 75 insertions(+), 3 deletions(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 754b662a453c3..a812769d79bf6 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6355,6 +6355,8 @@ S390:
Returns -EINVAL if the VM has the KVM_VM_S390_UCONTROL flag set.
Returns -EINVAL if called on a protected VM.
+.. _KVM_SET_MEMORY_ATTRIBUTES:
+
4.141 KVM_SET_MEMORY_ATTRIBUTES
-------------------------------
@@ -6512,6 +6514,36 @@ the capability to be present.
`flags` must currently be zero.
+4.144 KVM_SET_MEMORY_ATTRIBUTES2
+---------------------------------
+
+:Capability: KVM_CAP_MEMORY_ATTRIBUTES2
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_memory_attributes2 (in/out)
+:Returns: 0 on success, <0 on error
+
+KVM_SET_MEMORY_ATTRIBUTES2 is an extension to
+KVM_SET_MEMORY_ATTRIBUTES that supports returning (writing) values to
+userspace. The original (pre-extension) fields are shared with
+KVM_SET_MEMORY_ATTRIBUTES identically.
+
+Attribute values are shared with KVM_SET_MEMORY_ATTRIBUTES.
+
+::
+
+ struct kvm_memory_attributes2 {
+ __u64 address;
+ __u64 size;
+ __u64 attributes;
+ __u64 flags;
+ __u64 reserved[4];
+ };
+
+ #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3)
+
+See also: :ref:`KVM_SET_MEMORY_ATTRIBUTES`.
+
.. _kvm_run:
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 52f6000ab0208..c300e38c7c9cd 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -963,6 +963,7 @@ struct kvm_enable_cap {
#define KVM_CAP_RISCV_MP_STATE_RESET 242
#define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243
#define KVM_CAP_GUEST_MEMFD_FLAGS 244
+#define KVM_CAP_MEMORY_ATTRIBUTES2 245
struct kvm_irq_routing_irqchip {
__u32 irqchip;
@@ -1617,4 +1618,15 @@ struct kvm_pre_fault_memory {
__u64 padding[5];
};
+/* Available with KVM_CAP_MEMORY_ATTRIBUTES2 */
+#define KVM_SET_MEMORY_ATTRIBUTES2 _IOWR(KVMIO, 0xd6, struct kvm_memory_attributes2)
+
+struct kvm_memory_attributes2 {
+ __u64 address;
+ __u64 size;
+ __u64 attributes;
+ __u64 flags;
+ __u64 reserved[4];
+};
+
#endif /* __LINUX_KVM_H */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 35166754a22b4..d083011744eba 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2621,7 +2621,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
return r;
}
static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
- struct kvm_memory_attributes *attrs)
+ struct kvm_memory_attributes2 *attrs)
{
gfn_t start, end;
@@ -4957,6 +4957,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_BINARY_STATS_FD:
case KVM_CAP_SYSTEM_EVENT_DATA:
case KVM_CAP_DEVICE_CTRL:
+ case KVM_CAP_MEMORY_ATTRIBUTES2:
return 1;
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
case KVM_CAP_MEMORY_ATTRIBUTES:
@@ -5184,6 +5185,14 @@ do { \
sizeof_field(struct kvm_userspace_memory_region2, field)); \
} while (0)
+#define SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(field) \
+do { \
+ BUILD_BUG_ON(offsetof(struct kvm_memory_attributes, field) != \
+ offsetof(struct kvm_memory_attributes2, field)); \
+ BUILD_BUG_ON(sizeof_field(struct kvm_memory_attributes, field) != \
+ sizeof_field(struct kvm_memory_attributes2, field)); \
+} while (0)
+
static long kvm_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -5366,15 +5375,34 @@ static long kvm_vm_ioctl(struct file *filp,
}
#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+ case KVM_SET_MEMORY_ATTRIBUTES2:
case KVM_SET_MEMORY_ATTRIBUTES: {
- struct kvm_memory_attributes attrs;
+ struct kvm_memory_attributes2 attrs;
+ unsigned long size;
+
+ if (ioctl == KVM_SET_MEMORY_ATTRIBUTES) {
+ /*
+ * Fields beyond struct kvm_memory_attributes shouldn't be
+ * accessed, but avoid leaking kernel memory in case of a bug.
+ */
+ memset(&attrs, 0, sizeof(attrs));
+ size = sizeof(struct kvm_memory_attributes);
+ } else {
+ size = sizeof(struct kvm_memory_attributes2);
+ }
+
+ /* Ensure the common parts of the two structs are identical. */
+ SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(address);
+ SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(size);
+ SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(attributes);
+ SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(flags);
r = -ENOTTY;
if (!vm_memory_attributes)
goto out;
r = -EFAULT;
- if (copy_from_user(&attrs, argp, sizeof(attrs)))
+ if (copy_from_user(&attrs, argp, size))
goto out;
r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
--
2.51.1.838.g19442a804e-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* Re: [RFC PATCH v1 07/37] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2
2025-10-22 22:45 ` Ackerley Tng
@ 2025-10-22 23:30 ` Sean Christopherson
2025-10-23 14:01 ` Ackerley Tng
0 siblings, 1 reply; 56+ messages in thread
From: Sean Christopherson @ 2025-10-22 23:30 UTC (permalink / raw)
To: Ackerley Tng
Cc: Steven Price, cgroups, kvm, linux-doc, linux-fsdevel,
linux-kernel, linux-kselftest, linux-mm, linux-trace-kernel, x86,
akpm, binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, shakeel.butt, shuah,
suzuki.poulose, tabba, tglx, thomas.lendacky, vannapurve, vbabka,
viro, vkuznets, will, willy, wyihan, xiaoyao.li, yan.y.zhao,
yilun.xu, yuzenghui
On Wed, Oct 22, 2025, Ackerley Tng wrote:
> Ackerley Tng <ackerleytng@google.com> writes:
>
> Found another issue with KVM_CAP_MEMORY_ATTRIBUTES2.
>
> KVM_CAP_MEMORY_ATTRIBUTES2 was defined to do the same thing as
> KVM_CAP_MEMORY_ATTRIBUTES, but that's wrong since
> KVM_CAP_MEMORY_ATTRIBUTES2 should indicate the presence of
> KVM_SET_MEMORY_ATTRIBUTES2 and struct kvm_memory_attributes2.
No? If no attributes are supported, whether or not KVM_SET_MEMORY_ATTRIBUTES2
exists is largely irrelevant. We can even provide the same -ENOTTY errno by
checking that _any_ attributes are supported, i.e. so that doing
KVM_SET_MEMORY_ATTRIBUTES2 on KVM without any support whatsoever fails in the
same way that KVM with code support but no attributes fails.
In other words, I don't see why it can't do both. Even if we can't massage the
right errno, I would much rather KVM_SET_MEMORY_ATTRIBUTES2 enumerate the set of
supported attributes than simply '1'. E.g. we have no plans to support
KVM_SET_MEMORY_ATTRIBUTES on guest_memfd, and so returning simply '1' creates an
unwanted and unnecessary dependency.
> @@ -1617,4 +1618,15 @@ struct kvm_pre_fault_memory {
> __u64 padding[5];
> };
>
> +/* Available with KVM_CAP_MEMORY_ATTRIBUTES2 */
> +#define KVM_SET_MEMORY_ATTRIBUTES2 _IOWR(KVMIO, 0xd6, struct kvm_memory_attributes2)
Please use the same literal number, 0xd2, as
#define KVM_SET_MEMORY_ATTRIBUTES _IOW(KVMIO, 0xd2, struct kvm_memory_attributes)
The "final" ioctl number that userspace sees incorporates the directionality and
the size of the struct, i.e. KVM_SET_MEMORY_ATTRIBUTES and KVM_SET_MEMORY_ATTRIBUTES2
are guaranteed to be distinct even if they both use 0xd2 as the "minor" number.
> +
> +struct kvm_memory_attributes2 {
> + __u64 address;
> + __u64 size;
> + __u64 attributes;
> + __u64 flags;
> + __u64 reserved[4];
Maybe be paranoid and reserve 12 u64s?
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC PATCH v1 07/37] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2
2025-10-22 23:30 ` Sean Christopherson
@ 2025-10-23 14:01 ` Ackerley Tng
2025-10-23 15:05 ` Sean Christopherson
0 siblings, 1 reply; 56+ messages in thread
From: Ackerley Tng @ 2025-10-23 14:01 UTC (permalink / raw)
To: Sean Christopherson
Cc: Steven Price, cgroups, kvm, linux-doc, linux-fsdevel,
linux-kernel, linux-kselftest, linux-mm, linux-trace-kernel, x86,
akpm, binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, shakeel.butt, shuah,
suzuki.poulose, tabba, tglx, thomas.lendacky, vannapurve, vbabka,
viro, vkuznets, will, willy, wyihan, xiaoyao.li, yan.y.zhao,
yilun.xu, yuzenghui
Sean Christopherson <seanjc@google.com> writes:
> On Wed, Oct 22, 2025, Ackerley Tng wrote:
>> Ackerley Tng <ackerleytng@google.com> writes:
>>
>> Found another issue with KVM_CAP_MEMORY_ATTRIBUTES2.
>>
>> KVM_CAP_MEMORY_ATTRIBUTES2 was defined to do the same thing as
>> KVM_CAP_MEMORY_ATTRIBUTES, but that's wrong since
>> KVM_CAP_MEMORY_ATTRIBUTES2 should indicate the presence of
>> KVM_SET_MEMORY_ATTRIBUTES2 and struct kvm_memory_attributes2.
>
> No? If no attributes are supported, whether or not KVM_SET_MEMORY_ATTRIBUTES2
> exists is largely irrelevant.
That's true.
> We can even provide the same -ENOTTY errno by
> checking that _any_ attributes are supported, i.e. so that doing
> KVM_SET_MEMORY_ATTRIBUTES2 on KVM without any support whatsoever fails in the
> same way that KVM with code support but no attributes fails.
>
IIUC KVM_SET_MEMORY_ATTRIBUTES doesn't fail with -ENOTTY now when there
are no valid attributes.
Even if there's no valid attributes (as in
kvm_supported_mem_attributes() returns 0), it's possible to call
KVM_SET_MEMORY_ATTRIBUTES with .attributes set to 0, which will be a
no-op, but will return 0.
I think this is kind of correct behavior since .attributes = 0 is
actually a valid expression for "I want this range to be shared", and
for a VM that doesn't support private memory, it's a valid expression.
The other way that there are "no attributes" would be if there are no
/VM/ attributes, in which case KVM_SET_MEMORY_ATTRIBUTES, sent as a
vm ioctl, will return -ENOTTY.
> In other words, I don't see why it can't do both. Even if we can't massage the
> right errno, I would much rather KVM_SET_MEMORY_ATTRIBUTES2 enumerate the set of
Did you mean KVM_CAP_MEMORY_ATTRIBUTES2 in the line above?
> supported attributes than simply '1'. E.g. we have no plans to support
> KVM_SET_MEMORY_ATTRIBUTES on guest_memfd, and so returning simply '1' creates an
> unwanted and unnecessary dependency.
>
Okay I'll switch this back to what it was.
>> @@ -1617,4 +1618,15 @@ struct kvm_pre_fault_memory {
>> __u64 padding[5];
>> };
>>
>> +/* Available with KVM_CAP_MEMORY_ATTRIBUTES2 */
>> +#define KVM_SET_MEMORY_ATTRIBUTES2 _IOWR(KVMIO, 0xd6, struct kvm_memory_attributes2)
>
> Please use the same literal number, 0xd2, as
>
> #define KVM_SET_MEMORY_ATTRIBUTES _IOW(KVMIO, 0xd2, struct kvm_memory_attributes)
>
> The "final" ioctl number that userspace sees incorporates the directionality and
> the size of the struct, i.e. KVM_SET_MEMORY_ATTRIBUTES and KVM_SET_MEMORY_ATTRIBUTES2
> are guaranteed to be distinct even if they both use 0xd2 as the "minor" number.
>
Will do.
>> +
>> +struct kvm_memory_attributes2 {
>> + __u64 address;
>> + __u64 size;
>> + __u64 attributes;
>> + __u64 flags;
>> + __u64 reserved[4];
>
> Maybe be paranoid and reserve 12 u64s?
Will do.
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC PATCH v1 07/37] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2
2025-10-23 14:01 ` Ackerley Tng
@ 2025-10-23 15:05 ` Sean Christopherson
2025-10-24 14:36 ` Ackerley Tng
0 siblings, 1 reply; 56+ messages in thread
From: Sean Christopherson @ 2025-10-23 15:05 UTC (permalink / raw)
To: Ackerley Tng
Cc: Steven Price, cgroups, kvm, linux-doc, linux-fsdevel,
linux-kernel, linux-kselftest, linux-mm, linux-trace-kernel, x86,
akpm, binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, shakeel.butt, shuah,
suzuki.poulose, tabba, tglx, thomas.lendacky, vannapurve, vbabka,
viro, vkuznets, will, willy, wyihan, xiaoyao.li, yan.y.zhao,
yilun.xu, yuzenghui
On Thu, Oct 23, 2025, Ackerley Tng wrote:
> Sean Christopherson <seanjc@google.com> writes:
>
> > On Wed, Oct 22, 2025, Ackerley Tng wrote:
> >> Ackerley Tng <ackerleytng@google.com> writes:
> >>
> >> Found another issue with KVM_CAP_MEMORY_ATTRIBUTES2.
> >>
> >> KVM_CAP_MEMORY_ATTRIBUTES2 was defined to do the same thing as
> >> KVM_CAP_MEMORY_ATTRIBUTES, but that's wrong since
> >> KVM_CAP_MEMORY_ATTRIBUTES2 should indicate the presence of
> >> KVM_SET_MEMORY_ATTRIBUTES2 and struct kvm_memory_attributes2.
> >
> > No? If no attributes are supported, whether or not KVM_SET_MEMORY_ATTRIBUTES2
> > exists is largely irrelevant.
>
> That's true.
>
> > We can even provide the same -ENOTTY errno by
> > checking that _any_ attributes are supported, i.e. so that doing
> > KVM_SET_MEMORY_ATTRIBUTES2 on KVM without any support whatsoever fails in the
> > same way that KVM with code support but no attributes fails.
>
> IIUC KVM_SET_MEMORY_ATTRIBUTES doesn't fail with -ENOTTY now when there
> are no valid attributes.
>
> Even if there's no valid attributes (as in
> kvm_supported_mem_attributes() returns 0), it's possible to call
> KVM_SET_MEMORY_ATTRIBUTES with .attributes set to 0, which will be a
> no-op, but will return 0.
>
> I think this is kind of correct behavior since .attributes = 0 is
> actually a valid expression for "I want this range to be shared", and
> for a VM that doesn't support private memory, it's a valid expression.
>
>
> The other way that there are "no attributes" would be if there are no
> /VM/ attributes, in which case KVM_SET_MEMORY_ATTRIBUTES, sent to as a
> vm ioctl, will return -ENOTTY.
Ya, this is what I was trying to say with "_any_ attributes are supported". I.e.
by "any" I meant "any attributes in KVM for VMs vs. gmems", not "any attributes
for this specific VM/gmem instance".
> > In other words, I don't see why it can't do both. Even if we can't massage the
> > right errno, I would much rather KVM_SET_MEMORY_ATTRIBUTES2 enumerate the set of
>
> Did you mean KVM_CAP_MEMORY_ATTRIBUTES2 in the line above?
Doh, yes.
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC PATCH v1 07/37] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2
2025-10-23 15:05 ` Sean Christopherson
@ 2025-10-24 14:36 ` Ackerley Tng
2025-10-24 15:11 ` Sean Christopherson
0 siblings, 1 reply; 56+ messages in thread
From: Ackerley Tng @ 2025-10-24 14:36 UTC (permalink / raw)
To: Sean Christopherson
Cc: Steven Price, cgroups, kvm, linux-doc, linux-fsdevel,
linux-kernel, linux-kselftest, linux-mm, linux-trace-kernel, x86,
akpm, binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, shakeel.butt, shuah,
suzuki.poulose, tabba, tglx, thomas.lendacky, vannapurve, vbabka,
viro, vkuznets, will, willy, wyihan, xiaoyao.li, yan.y.zhao,
yilun.xu, yuzenghui
Sean Christopherson <seanjc@google.com> writes:
> On Thu, Oct 23, 2025, Ackerley Tng wrote:
>> Sean Christopherson <seanjc@google.com> writes:
>>
>> > On Wed, Oct 22, 2025, Ackerley Tng wrote:
>> >> Ackerley Tng <ackerleytng@google.com> writes:
>> >>
>> >> Found another issue with KVM_CAP_MEMORY_ATTRIBUTES2.
>> >>
>> >> KVM_CAP_MEMORY_ATTRIBUTES2 was defined to do the same thing as
>> >> KVM_CAP_MEMORY_ATTRIBUTES, but that's wrong since
>> >> KVM_CAP_MEMORY_ATTRIBUTES2 should indicate the presence of
>> >> KVM_SET_MEMORY_ATTRIBUTES2 and struct kvm_memory_attributes2.
>> >
>> > No? If no attributes are supported, whether or not KVM_SET_MEMORY_ATTRIBUTES2
>> > exists is largely irrelevant.
>>
>> That's true.
>>
>> > We can even provide the same -ENOTTY errno by
>> > checking that _any_ attributes are supported, i.e. so that doing
>> > KVM_SET_MEMORY_ATTRIBUTES2 on KVM without any support whatsoever fails in the
>> > same way that KVM with code support but no attributes fails.
>>
>> IIUC KVM_SET_MEMORY_ATTRIBUTES doesn't fail with -ENOTTY now when there
>> are no valid attributes.
>>
>> Even if there's no valid attributes (as in
>> kvm_supported_mem_attributes() returns 0), it's possible to call
>> KVM_SET_MEMORY_ATTRIBUTES with .attributes set to 0, which will be a
>> no-op, but will return 0.
>>
>> I think this is kind of correct behavior since .attributes = 0 is
>> actually a valid expression for "I want this range to be shared", and
>> for a VM that doesn't support private memory, it's a valid expression.
>>
>>
>> The other way that there are "no attributes" would be if there are no
>> /VM/ attributes, in which case KVM_SET_MEMORY_ATTRIBUTES, sent to as a
>> vm ioctl, will return -ENOTTY.
>
> Ya, this is what I was trying to say with "_any_ attributes are supported". I.e.
> by "any" I meant "any attributes in KVM for VMs vs. gmems", not "any attributes
> for this specific VM/gmem instance".
>
>>
>> [...snip...]
>>
I've been thinking more about this:
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
case KVM_CAP_MEMORY_ATTRIBUTES2:
case KVM_CAP_MEMORY_ATTRIBUTES:
if (!vm_memory_attributes)
return 0;
return kvm_supported_mem_attributes(kvm);
#endif
And the purpose of adding KVM_CAP_MEMORY_ATTRIBUTES2 is that
KVM_CAP_MEMORY_ATTRIBUTES2 tells userspace that
KVM_SET_MEMORY_ATTRIBUTES2 is available iff there are valid
attributes.
(So there's still a purpose)
Without valid attributes, userspace can't tell if it should use
KVM_SET_MEMORY_ATTRIBUTES or the 2 version.
I also added KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES, which tells
userspace the valid attributes when calling KVM_SET_MEMORY_ATTRIBUTES2
on a guest_memfd:
#ifdef CONFIG_KVM_GUEST_MEMFD
case KVM_CAP_GUEST_MEMFD:
return 1;
case KVM_CAP_GUEST_MEMFD_FLAGS:
return kvm_gmem_get_supported_flags(kvm);
case KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES:
if (vm_memory_attributes)
return 0;
return kvm_supported_mem_attributes(kvm);
#endif
So to set memory attributes, userspace should
if (kvm_check_cap(KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES) > 0)
use KVM_SET_MEMORY_ATTRIBUTES2 with guest_memfd
else if (kvm_check_cap(KVM_CAP_MEMORY_ATTRIBUTES2) > 0)
use KVM_SET_MEMORY_ATTRIBUTES2 with VM fd
else if (kvm_check_cap(KVM_CAP_MEMORY_ATTRIBUTES) > 0)
use KVM_SET_MEMORY_ATTRIBUTES with VM fd
else
can't set memory attributes
Something like that?
In selftests there's this, when KVM_SET_USER_MEMORY_REGION2 was
introduced:
#define TEST_REQUIRE_SET_USER_MEMORY_REGION2() \
__TEST_REQUIRE(kvm_has_cap(KVM_CAP_USER_MEMORY2), \
"KVM selftests now require KVM_SET_USER_MEMORY_REGION2 (introduced in v6.8)")
But looks like there's no direct equivalent for the introduction of
KVM_SET_MEMORY_ATTRIBUTES2?
The closest would be to add a TEST_REQUIRE_VALID_ATTRIBUTES() which
checks KVM_CAP_MEMORY_ATTRIBUTES2 or
KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES before making the vm or
guest_memfd ioctl respectively.
^ permalink raw reply [flat|nested] 56+ messages in thread* Re: [RFC PATCH v1 07/37] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2
2025-10-24 14:36 ` Ackerley Tng
@ 2025-10-24 15:11 ` Sean Christopherson
2025-10-24 16:41 ` Ackerley Tng
0 siblings, 1 reply; 56+ messages in thread
From: Sean Christopherson @ 2025-10-24 15:11 UTC (permalink / raw)
To: Ackerley Tng
Cc: Steven Price, cgroups, kvm, linux-doc, linux-fsdevel,
linux-kernel, linux-kselftest, linux-mm, linux-trace-kernel, x86,
akpm, binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, shakeel.butt, shuah,
suzuki.poulose, tabba, tglx, thomas.lendacky, vannapurve, vbabka,
viro, vkuznets, will, willy, wyihan, xiaoyao.li, yan.y.zhao,
yilun.xu, yuzenghui
On Fri, Oct 24, 2025, Ackerley Tng wrote:
> Sean Christopherson <seanjc@google.com> writes:
> >>
> >> [...snip...]
> >>
>
> I've been thinking more about this:
>
> #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> case KVM_CAP_MEMORY_ATTRIBUTES2:
> case KVM_CAP_MEMORY_ATTRIBUTES:
> if (!vm_memory_attributes)
> return 0;
>
> return kvm_supported_mem_attributes(kvm);
> #endif
>
> And the purpose of adding KVM_CAP_MEMORY_ATTRIBUTES2 is that
> KVM_CAP_MEMORY_ATTRIBUTES2 tells userspace that
> KVM_SET_MEMORY_ATTRIBUTES2 is available iff there are valid
> attributes.
>
> (So there's still a purpose)
>
> Without valid attributes, userspace can't tell if it should use
> KVM_SET_MEMORY_ATTRIBUTES or the 2 version.
To do what? If there are no attributes, userspace can't do anything useful anyways.
> I also added KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES, which tells
> userspace the valid attributes when calling KVM_SET_MEMORY_ATTRIBUTES2
> on a guest_memfd:
Ya, and that KVM_SET_MEMORY_ATTRIBUTES2 is supported on guest_memfd.
> #ifdef CONFIG_KVM_GUEST_MEMFD
> case KVM_CAP_GUEST_MEMFD:
> return 1;
> case KVM_CAP_GUEST_MEMFD_FLAGS:
> return kvm_gmem_get_supported_flags(kvm);
> case KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES:
> if (vm_memory_attributes)
> return 0;
>
> return kvm_supported_mem_attributes(kvm);
> #endif
>
> So to set memory attributes, userspace should
Userspace *can*. Userspace could also decide it only wants to support guest_memfd
attributes, e.g. because the platform admin controls the entire stack and built
their entire operation around in-place conversion.
> if (kvm_check_cap(KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES) > 0)
> use KVM_SET_MEMORY_ATTRIBUTES2 with guest_memfd
> else if (kvm_check_cap(KVM_CAP_MEMORY_ATTRIBUTES2) > 0)
> use KVM_SET_MEMORY_ATTRIBUTES2 with VM fd
> else if (kvm_check_cap(KVM_CAP_MEMORY_ATTRIBUTES) > 0)
> use KVM_SET_MEMORY_ATTRIBUTES with VM fd
> else
> can't set memory attributes
>
> Something like that?
More or less, ya.
> In selftests there's this, when KVM_SET_USER_MEMORY_REGION2 was
> introduced:
>
> #define TEST_REQUIRE_SET_USER_MEMORY_REGION2() \
> __TEST_REQUIRE(kvm_has_cap(KVM_CAP_USER_MEMORY2), \
> "KVM selftests now require KVM_SET_USER_MEMORY_REGION2 (introduced in v6.8)")
>
> But looks like there's no direct equivalent for the introduction of
> KVM_SET_MEMORY_ATTRIBUTES2?
KVM_CAP_USER_MEMORY2 is the equivalent.
There was no need to enumerate anything beyond yes/no, because
SET_USER_MEMORY_REGION2 didn't introduce new flags, it expanded the size of the
structure passed in from userspace so that KVM_CAP_GUEST_MEMFD could be introduced
without breaking backwards compatibility.
> The closest would be to add a TEST_REQUIRE_VALID_ATTRIBUTES() which
> checks KVM_CAP_MEMORY_ATTRIBUTES2 or
> KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES before making the vm or
> guest_memfd ioctl respsectively.
Yes. This is what I did in my (never posted, but functional) version:
@@ -486,6 +488,7 @@ struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus,
}
guest_rng = new_guest_random_state(guest_random_seed);
sync_global_to_guest(vm, guest_rng);
+ sync_global_to_guest(vm, kvm_has_gmem_attributes);
kvm_arch_vm_post_create(vm, nr_runnable_vcpus);
@@ -2319,6 +2333,8 @@ void __attribute((constructor)) kvm_selftest_init(void)
guest_random_seed = last_guest_seed = random();
pr_info("Random seed: 0x%x\n", guest_random_seed);
+ kvm_has_gmem_attributes = kvm_has_cap(KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES);
+
kvm_selftest_arch_init();
}
That way the core library code can pivot on gmem vs. VM attributes without having
to rely on tests to define anything. E.g.
static inline void vm_mem_set_memory_attributes(struct kvm_vm *vm, uint64_t gpa,
uint64_t size, uint64_t attrs)
{
if (kvm_has_gmem_attributes) {
off_t fd_offset;
uint64_t len;
int fd;
fd = kvm_gpa_to_guest_memfd(vm, gpa, &fd_offset, &len);
TEST_ASSERT(len >= size, "Setting attributes beyond the length of a guest_memfd");
gmem_set_memory_attributes(fd, fd_offset, size, attrs);
} else {
vm_set_memory_attributes(vm, gpa, size, attrs);
}
}
^ permalink raw reply [flat|nested] 56+ messages in thread* Re: [RFC PATCH v1 07/37] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2
2025-10-24 15:11 ` Sean Christopherson
@ 2025-10-24 16:41 ` Ackerley Tng
2025-10-24 17:45 ` Sean Christopherson
0 siblings, 1 reply; 56+ messages in thread
From: Ackerley Tng @ 2025-10-24 16:41 UTC (permalink / raw)
To: Sean Christopherson
Cc: Steven Price, cgroups, kvm, linux-doc, linux-fsdevel,
linux-kernel, linux-kselftest, linux-mm, linux-trace-kernel, x86,
akpm, binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, shakeel.butt, shuah,
suzuki.poulose, tabba, tglx, thomas.lendacky, vannapurve, vbabka,
viro, vkuznets, will, willy, wyihan, xiaoyao.li, yan.y.zhao,
yilun.xu, yuzenghui
Sean Christopherson <seanjc@google.com> writes:
> On Fri, Oct 24, 2025, Ackerley Tng wrote:
>> Sean Christopherson <seanjc@google.com> writes:
>> >>
>> >> [...snip...]
>> >>
>>
>> I've been thinking more about this:
>>
>> #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>> case KVM_CAP_MEMORY_ATTRIBUTES2:
>> case KVM_CAP_MEMORY_ATTRIBUTES:
>> if (!vm_memory_attributes)
>> return 0;
>>
>> return kvm_supported_mem_attributes(kvm);
>> #endif
>>
>> And the purpose of adding KVM_CAP_MEMORY_ATTRIBUTES2 is that
>> KVM_CAP_MEMORY_ATTRIBUTES2 tells userspace that
>> KVM_SET_MEMORY_ATTRIBUTES2 is available iff there are valid
>> attributes.
>>
>> (So there's still a purpose)
>>
>> Without valid attributes, userspace can't tell if it should use
>> KVM_SET_MEMORY_ATTRIBUTES or the 2 version.
>
> To do what? If there are no attributes, userspace can't do anything useful anyways.
>
>> I also added KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES, which tells
>> userspace the valid attributes when calling KVM_SET_MEMORY_ATTRIBUTES2
>> on a guest_memfd:
>
> Ya, and that KVM_SET_MEMORY_ATTRIBUTES2 is supported on guest_memfd.
>
>> #ifdef CONFIG_KVM_GUEST_MEMFD
>> case KVM_CAP_GUEST_MEMFD:
>> return 1;
>> case KVM_CAP_GUEST_MEMFD_FLAGS:
>> return kvm_gmem_get_supported_flags(kvm);
>> case KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES:
>> if (vm_memory_attributes)
>> return 0;
>>
>> return kvm_supported_mem_attributes(kvm);
>> #endif
>>
>> So to set memory attributes, userspace should
>
> Userspace *can*. User could also decide it only wants to support guest_memfd
> attributes, e.g. because the platform admins controls the entire stack and built
> their entire operation around in-place conversion.
>
>> if (kvm_check_cap(KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES) > 0)
>> use KVM_SET_MEMORY_ATTRIBUTES2 with guest_memfd
>> else if (kvm_check_cap(KVM_CAP_MEMORY_ATTRIBUTES2) > 0)
>> use KVM_SET_MEMORY_ATTRIBUTES2 with VM fd
>> else if (kvm_check_cap(KVM_CAP_MEMORY_ATTRIBUTES) > 0)
>> use KVM_SET_MEMORY_ATTRIBUTES with VM fd
>> else
>> can't set memory attributes
>>
>> Something like that?
>
> More or else, ya.
>
>> In selftests there's this, when KVM_SET_USER_MEMORY_REGION2 was
>> introduced:
>>
>> #define TEST_REQUIRE_SET_USER_MEMORY_REGION2() \
>> __TEST_REQUIRE(kvm_has_cap(KVM_CAP_USER_MEMORY2), \
>> "KVM selftests now require KVM_SET_USER_MEMORY_REGION2 (introduced in v6.8)")
>>
>> But looks like there's no direct equivalent for the introduction of
>> KVM_SET_MEMORY_ATTRIBUTES2?
>
> KVM_CAP_USER_MEMORY2 is the equivalent.
>
> There's was no need to enumerate anything beyond yes/no, because
> SET_USER_MEMORY_REGION2 didn't introduce new flags, it expanded the size of the
> structure passed in from userspace so that KVM_CAP_GUEST_MEMFD could be introduced
> without breaking backwards compatibility.
>
>> The closest would be to add a TEST_REQUIRE_VALID_ATTRIBUTES() which
>> checks KVM_CAP_MEMORY_ATTRIBUTES2 or
>> KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES before making the vm or
>> guest_memfd ioctl respsectively.
>
> Yes. This is what I did in my (never posted, but functional) version:
>
> @@ -486,6 +488,7 @@ struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus,
> }
> guest_rng = new_guest_random_state(guest_random_seed);
> sync_global_to_guest(vm, guest_rng);
> + sync_global_to_guest(vm, kvm_has_gmem_attributes);
I ported this [1] except for syncing this value to the guest, because I
think the guest shouldn't need to know this information, the host should
decide what to do. I think, if the guests really need to know this, the
test itself can do the syncing.
[1] https://lore.kernel.org/all/5656d432df1217c08da0cc2694fd79948bfd686f.1760731772.git.ackerleytng@google.com/
>
> kvm_arch_vm_post_create(vm, nr_runnable_vcpus);
>
> @@ -2319,6 +2333,8 @@ void __attribute((constructor)) kvm_selftest_init(void)
> guest_random_seed = last_guest_seed = random();
> pr_info("Random seed: 0x%x\n", guest_random_seed);
>
> + kvm_has_gmem_attributes = kvm_has_cap(KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES);
> +
> kvm_selftest_arch_init();
> }
>
> That way the core library code can pivot on gmem vs. VM attributes without having
> to rely on tests to define anything. E.g.
>
> static inline void vm_mem_set_memory_attributes(struct kvm_vm *vm, uint64_t gpa,
> uint64_t size, uint64_t attrs)
> {
> if (kvm_has_gmem_attributes) {
> off_t fd_offset;
> uint64_t len;
> int fd;
>
> fd = kvm_gpa_to_guest_memfd(vm, gpa, &fd_offset, &len);
> TEST_ASSERT(len >= size, "Setting attributes beyond the length of a guest_memfd");
> gmem_set_memory_attributes(fd, fd_offset, size, attrs);
> } else {
> vm_set_memory_attributes(vm, gpa, size, attrs);
> }
> }
^ permalink raw reply [flat|nested] 56+ messages in thread* Re: [RFC PATCH v1 07/37] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2
2025-10-24 16:41 ` Ackerley Tng
@ 2025-10-24 17:45 ` Sean Christopherson
2025-10-27 12:48 ` Ackerley Tng
0 siblings, 1 reply; 56+ messages in thread
From: Sean Christopherson @ 2025-10-24 17:45 UTC (permalink / raw)
To: Ackerley Tng
Cc: Steven Price, cgroups, kvm, linux-doc, linux-fsdevel,
linux-kernel, linux-kselftest, linux-mm, linux-trace-kernel, x86,
akpm, binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, shakeel.butt, shuah,
suzuki.poulose, tabba, tglx, thomas.lendacky, vannapurve, vbabka,
viro, vkuznets, will, willy, wyihan, xiaoyao.li, yan.y.zhao,
yilun.xu, yuzenghui
On Fri, Oct 24, 2025, Ackerley Tng wrote:
> Sean Christopherson <seanjc@google.com> writes:
> > @@ -486,6 +488,7 @@ struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus,
> > }
> > guest_rng = new_guest_random_state(guest_random_seed);
> > sync_global_to_guest(vm, guest_rng);
> > + sync_global_to_guest(vm, kvm_has_gmem_attributes);
>
> I ported this [1] except for syncing this value to the guest, because I
> think the guest shouldn't need to know this information,
KVM selftests are about practicality and testing; what information should or
shouldn't be available to a test from e.g. a safety perspective is completely
irrelevant. In fact, one of the biggest advantages of selftests over KUT is
that the guest side can know _exactly_ what's going on in the host.
See the usage in 1850e3da4b03 ("KVM: selftests: Update private_mem_conversions_test
to mmap() guest_memfd") from:
https://github.com/sean-jc/linux.git x86/gmem_inplace
> the host should decide what to do. I think, if the guests really need to know
> this, the test itself can do the syncing.
Why force tests to do extra work, and potentially introduce subtle bugs due to
state being stale?
^ permalink raw reply [flat|nested] 56+ messages in thread* Re: [RFC PATCH v1 07/37] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2
2025-10-24 17:45 ` Sean Christopherson
@ 2025-10-27 12:48 ` Ackerley Tng
0 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-27 12:48 UTC (permalink / raw)
To: Sean Christopherson
Cc: Steven Price, cgroups, kvm, linux-doc, linux-fsdevel,
linux-kernel, linux-kselftest, linux-mm, linux-trace-kernel, x86,
akpm, binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, shakeel.butt, shuah,
suzuki.poulose, tabba, tglx, thomas.lendacky, vannapurve, vbabka,
viro, vkuznets, will, willy, wyihan, xiaoyao.li, yan.y.zhao,
yilun.xu, yuzenghui
Sean Christopherson <seanjc@google.com> writes:
> On Fri, Oct 24, 2025, Ackerley Tng wrote:
>> Sean Christopherson <seanjc@google.com> writes:
>> > @@ -486,6 +488,7 @@ struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus,
>> > }
>> > guest_rng = new_guest_random_state(guest_random_seed);
>> > sync_global_to_guest(vm, guest_rng);
>> > + sync_global_to_guest(vm, kvm_has_gmem_attributes);
>>
>> I ported this [1] except for syncing this value to the guest, because I
>> think the guest shouldn't need to know this information,
>
> KVM selftests are about practically and testing, what information should or
> shouldn't be available to a test from e.g. a safety perspective is completely
> irrelevant. In fact, one of the biggest advantages of selftests over KUT is
> that the guest side can know _exactly_ what's going on in the host.
>
> See the usage in 1850e3da4b03 ("KVM: selftests: Update private_mem_conversions_test
> to mmap() guest_memfd") from:
>
> https://github.com/sean-jc/linux.git x86/gmem_inplace
>
>> the host should decide what to do. I think, if the guests really need to know
>> this, the test itself can do the syncing.
>
> Why force tests to do extra work, and potentially introduce subtle bugs due to
> state being stale?
Adding it back. Thanks!
This variable should be sync-able for TDX selftests as well since the
value should be synced before the TD image is loaded.
^ permalink raw reply [flat|nested] 56+ messages in thread
* [RFC PATCH v1 08/37] KVM: guest_memfd: Don't set FGP_ACCESSED when getting folios
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (6 preceding siblings ...)
2025-10-17 20:11 ` [RFC PATCH v1 07/37] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2 Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-27 13:39 ` Vlastimil Babka
2025-10-17 20:11 ` [RFC PATCH v1 09/37] KVM: guest_memfd: Skip LRU for guest_memfd folios Ackerley Tng
` (28 subsequent siblings)
36 siblings, 1 reply; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
guest_memfd folios don't care about accessed flags since the memory is
unevictable and there is no storage to write back to; hence, clean up the
allocation path by not setting FGP_ACCESSED.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
[sean: split to separate patch, write changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
virt/kvm/guest_memfd.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 855e682041311..2a9e9220a48aa 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -167,14 +167,13 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
* Fast-path: See if folio is already present in mapping to avoid
* policy_lookup.
*/
- folio = __filemap_get_folio(inode->i_mapping, index,
- FGP_LOCK | FGP_ACCESSED, 0);
+ folio = filemap_lock_folio(inode->i_mapping, index);
if (!IS_ERR(folio))
return folio;
policy = kvm_gmem_get_folio_policy(GMEM_I(inode), index);
folio = __filemap_get_folio_mpol(inode->i_mapping, index,
- FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ FGP_LOCK | FGP_CREAT,
mapping_gfp_mask(inode->i_mapping), policy);
mpol_cond_put(policy);
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread* Re: [RFC PATCH v1 08/37] KVM: guest_memfd: Don't set FGP_ACCESSED when getting folios
2025-10-17 20:11 ` [RFC PATCH v1 08/37] KVM: guest_memfd: Don't set FGP_ACCESSED when getting folios Ackerley Tng
@ 2025-10-27 13:39 ` Vlastimil Babka
0 siblings, 0 replies; 56+ messages in thread
From: Vlastimil Babka @ 2025-10-27 13:39 UTC (permalink / raw)
To: Ackerley Tng, cgroups, kvm, linux-doc, linux-fsdevel,
linux-kernel, linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: akpm, binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jroedel, jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, seanjc, shakeel.butt, shuah,
steven.price, steven.sistare, suzuki.poulose, tabba, tglx,
thomas.lendacky, vannapurve, viro, vkuznets, wei.w.wang, will,
willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu, yuzenghui,
zhiquan1.li
On 10/17/25 22:11, Ackerley Tng wrote:
> guest_memfd folios don't care about accessed flags since the memory is
> unevictable and there is no storage to write back to, hence, cleanup the
> allocation path by not setting FGP_ACCESSED.
>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> [sean: split to separate patch, write changelog]
> Signed-off-by: Sean Christopherson <seanjc@google.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
> ---
> virt/kvm/guest_memfd.c | 5 ++---
> 1 file changed, 2 insertions(+), 3 deletions(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 855e682041311..2a9e9220a48aa 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -167,14 +167,13 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
> * Fast-path: See if folio is already present in mapping to avoid
> * policy_lookup.
> */
> - folio = __filemap_get_folio(inode->i_mapping, index,
> - FGP_LOCK | FGP_ACCESSED, 0);
> + folio = filemap_lock_folio(inode->i_mapping, index);
> if (!IS_ERR(folio))
> return folio;
>
> policy = kvm_gmem_get_folio_policy(GMEM_I(inode), index);
> folio = __filemap_get_folio_mpol(inode->i_mapping, index,
> - FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
> + FGP_LOCK | FGP_CREAT,
> mapping_gfp_mask(inode->i_mapping), policy);
> mpol_cond_put(policy);
>
^ permalink raw reply [flat|nested] 56+ messages in thread
* [RFC PATCH v1 09/37] KVM: guest_memfd: Skip LRU for guest_memfd folios
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (7 preceding siblings ...)
2025-10-17 20:11 ` [RFC PATCH v1 08/37] KVM: guest_memfd: Don't set FGP_ACCESSED when getting folios Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-27 13:56 ` Vlastimil Babka
2025-10-17 20:11 ` [RFC PATCH v1 10/37] KVM: guest_memfd: Enable INIT_SHARED on guest_memfd for x86 Coco VMs Ackerley Tng
` (27 subsequent siblings)
36 siblings, 1 reply; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
filemap_add_folio(), called from filemap_grab_folio(), adds folios to
an LRU list. This is unnecessary for guest_memfd, which does not
participate in swapping.
In addition, the LRU list takes a reference count on the folio. With
shared-to-private memory conversions for KVM guests dependent on folio
refcounts, this extra reference can cause conversions to fail due to
unexpected refcounts.
Rework kvm_gmem_get_folio() to manually allocate and insert the folio
into the page cache without placing it on the LRU. This is done by
calling __filemap_add_folio() directly.
The folio is then marked unevictable to avoid participation in
swapping. The ->free_folio() handler is modified to unset the
unevictable flag when the folio is released from guest_memfd.
This change ensures that LRU lists no longer take refcounts on
guest_memfd folios, significantly reducing the chance of elevated
refcounts during conversion.
To facilitate this, __filemap_add_folio(), __mem_cgroup_charge() and
__mem_cgroup_uncharge() are exported for KVM's use.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
mm/filemap.c | 1 +
mm/memcontrol.c | 2 ++
virt/kvm/guest_memfd.c | 60 +++++++++++++++++++++++++++++++++---------
3 files changed, 50 insertions(+), 13 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 03f223be575ca..60c7c95bbd7e6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -954,6 +954,7 @@ noinline int __filemap_add_folio(struct address_space *mapping,
return xas_error(&xas);
}
ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
+EXPORT_SYMBOL_FOR_MODULES(__filemap_add_folio, "kvm");
int filemap_add_folio(struct address_space *mapping, struct folio *folio,
pgoff_t index, gfp_t gfp)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8dd7fbed5a942..fe8629414d0a9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4721,6 +4721,7 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
return ret;
}
+EXPORT_SYMBOL_FOR_MODULES(__mem_cgroup_charge, "kvm");
/**
* mem_cgroup_charge_hugetlb - charge the memcg for a hugetlb folio
@@ -4893,6 +4894,7 @@ void __mem_cgroup_uncharge(struct folio *folio)
uncharge_folio(folio, &ug);
uncharge_batch(&ug);
}
+EXPORT_SYMBOL_FOR_MODULES(__mem_cgroup_uncharge, "kvm");
void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
{
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 2a9e9220a48aa..dab2b3ce78bc8 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -148,6 +148,41 @@ static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
#endif
}
+static struct folio *__kvm_gmem_get_folio(struct address_space *mapping,
+ pgoff_t index,
+ struct mempolicy *policy)
+{
+ const gfp_t gfp = mapping_gfp_mask(mapping);
+ struct folio *folio;
+ int err;
+
+ folio = filemap_lock_folio(mapping, index);
+ if (!IS_ERR(folio))
+ return folio;
+
+ folio = filemap_alloc_folio(gfp, 0, policy);
+ if (!folio)
+ return ERR_PTR(-ENOMEM);
+
+ err = mem_cgroup_charge(folio, NULL, gfp);
+ if (err)
+ goto err_put;
+
+ __folio_set_locked(folio);
+
+ err = __filemap_add_folio(mapping, folio, index, gfp, NULL);
+ if (err) {
+ __folio_clear_locked(folio);
+ goto err_put;
+ }
+
+ return folio;
+
+err_put:
+ folio_put(folio);
+ return ERR_PTR(err);
+}
+
/*
* Returns a locked folio on success. The caller is responsible for
* setting the up-to-date flag before the memory is mapped into the guest.
@@ -160,6 +195,7 @@ static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
/* TODO: Support huge pages. */
+ struct address_space *mapping = inode->i_mapping;
struct mempolicy *policy;
struct folio *folio;
@@ -167,16 +203,17 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
* Fast-path: See if folio is already present in mapping to avoid
* policy_lookup.
*/
- folio = filemap_lock_folio(inode->i_mapping, index);
+ folio = filemap_lock_folio(mapping, index);
if (!IS_ERR(folio))
return folio;
policy = kvm_gmem_get_folio_policy(GMEM_I(inode), index);
- folio = __filemap_get_folio_mpol(inode->i_mapping, index,
- FGP_LOCK | FGP_CREAT,
- mapping_gfp_mask(inode->i_mapping), policy);
- mpol_cond_put(policy);
+ do {
+ folio = __kvm_gmem_get_folio(mapping, index, policy);
+ } while (IS_ERR(folio) && PTR_ERR(folio) == -EEXIST);
+
+ mpol_cond_put(policy);
return folio;
}
@@ -588,24 +625,21 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
return MF_DELAYED;
}
-#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
- struct page *page = folio_page(folio, 0);
- kvm_pfn_t pfn = page_to_pfn(page);
- int order = folio_order(folio);
+ folio_clear_unevictable(folio);
- kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
-}
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
+ kvm_arch_gmem_invalidate(folio_pfn(folio),
+ folio_pfn(folio) + folio_nr_pages(folio));
#endif
+}
static const struct address_space_operations kvm_gmem_aops = {
.dirty_folio = noop_dirty_folio,
.migrate_folio = kvm_gmem_migrate_folio,
.error_remove_folio = kvm_gmem_error_folio,
-#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
.free_folio = kvm_gmem_free_folio,
-#endif
};
static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread* Re: [RFC PATCH v1 09/37] KVM: guest_memfd: Skip LRU for guest_memfd folios
2025-10-17 20:11 ` [RFC PATCH v1 09/37] KVM: guest_memfd: Skip LRU for guest_memfd folios Ackerley Tng
@ 2025-10-27 13:56 ` Vlastimil Babka
0 siblings, 0 replies; 56+ messages in thread
From: Vlastimil Babka @ 2025-10-27 13:56 UTC (permalink / raw)
To: Ackerley Tng, cgroups, kvm, linux-doc, linux-fsdevel,
linux-kernel, linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: akpm, binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jroedel, jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, seanjc, shakeel.butt, shuah,
steven.price, steven.sistare, suzuki.poulose, tabba, tglx,
thomas.lendacky, vannapurve, viro, vkuznets, wei.w.wang, will,
willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu, yuzenghui,
zhiquan1.li
On 10/17/25 22:11, Ackerley Tng wrote:
> filemap_add_folio(), called from filemap_grab_folio(), adds folios to
> an LRU list. This is unnecessary for guest_memfd, which does not
> participate in swapping.
IIRC guest_memfd mappings are unevictable. That should mean they are not
ultimately added to a list (see lruvec_add_folio()).
> In addition, the LRU list takes a reference count on the folio. With
IIUC the refcount is temporary while being on the percpu
&cpu_fbatches.lru_add, added by __folio_batch_add_and_move(). When flushed
via folio_batch_move_lru(), the refcount is removed and there's only the LRU
folio flag that remains. The fbatch flushing can be triggered if you see an
unexpected refcount increase. So it might be feasible to do without this
patch (maybe it was already tried and there were substantial issues, in
which case that should be mentioned).
> shared-to-private memory conversions for KVM guests dependent on folio
> refcounts, this extra reference can cause conversions to fail due to
> unexpected refcounts.
>
> Rework kvm_gmem_get_folio() to manually allocate and insert the folio
> into the page cache without placing it on the LRU. This is done by
> calling __filemap_add_folio() directly.
>
> The folio is then marked unevictable to avoid participation in
> swapping. The ->free_folio() handler is modified to unset the
> unevictable flag when the folio is released from guest_memfd.
>
> This change ensures that LRU lists no longer take refcounts on
> guest_memfd folios, significantly reducing the chance of elevated
> refcounts during conversion.
>
> To facilitate this, __filemap_add_folio is exported for KVM's use.
>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> mm/filemap.c | 1 +
> mm/memcontrol.c | 2 ++
> virt/kvm/guest_memfd.c | 60 +++++++++++++++++++++++++++++++++---------
> 3 files changed, 50 insertions(+), 13 deletions(-)
>
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 03f223be575ca..60c7c95bbd7e6 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -954,6 +954,7 @@ noinline int __filemap_add_folio(struct address_space *mapping,
> return xas_error(&xas);
> }
> ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
> +EXPORT_SYMBOL_FOR_MODULES(__filemap_add_folio, "kvm");
>
> int filemap_add_folio(struct address_space *mapping, struct folio *folio,
> pgoff_t index, gfp_t gfp)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 8dd7fbed5a942..fe8629414d0a9 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4721,6 +4721,7 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
>
> return ret;
> }
> +EXPORT_SYMBOL_FOR_MODULES(__mem_cgroup_charge, "kvm");
>
> /**
> * mem_cgroup_charge_hugetlb - charge the memcg for a hugetlb folio
> @@ -4893,6 +4894,7 @@ void __mem_cgroup_uncharge(struct folio *folio)
> uncharge_folio(folio, &ug);
> uncharge_batch(&ug);
> }
> +EXPORT_SYMBOL_FOR_MODULES(__mem_cgroup_uncharge, "kvm");
>
> void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
> {
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 2a9e9220a48aa..dab2b3ce78bc8 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -148,6 +148,41 @@ static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
> #endif
> }
>
> +static struct folio *__kvm_gmem_get_folio(struct address_space *mapping,
> + pgoff_t index,
> + struct mempolicy *policy)
> +{
> + const gfp_t gfp = mapping_gfp_mask(mapping);
> + struct folio *folio;
> + int err;
> +
> + folio = filemap_lock_folio(mapping, index);
> + if (!IS_ERR(folio))
> + return folio;
> +
> + folio = filemap_alloc_folio(gfp, 0, policy);
> + if (!folio)
> + return ERR_PTR(-ENOMEM);
> +
> + err = mem_cgroup_charge(folio, NULL, gfp);
> + if (err)
> + goto err_put;
> +
> + __folio_set_locked(folio);
> +
> + err = __filemap_add_folio(mapping, folio, index, gfp, NULL);
> + if (err) {
> + __folio_clear_locked(folio);
> + goto err_put;
> + }
> +
> + return folio;
> +
> +err_put:
> + folio_put(folio);
> + return ERR_PTR(err);
> +}
> +
> /*
> * Returns a locked folio on success. The caller is responsible for
> * setting the up-to-date flag before the memory is mapped into the guest.
> @@ -160,6 +195,7 @@ static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
> static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
> {
> /* TODO: Support huge pages. */
> + struct address_space *mapping = inode->i_mapping;
> struct mempolicy *policy;
> struct folio *folio;
>
> @@ -167,16 +203,17 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
> * Fast-path: See if folio is already present in mapping to avoid
> * policy_lookup.
> */
> - folio = filemap_lock_folio(inode->i_mapping, index);
> + folio = filemap_lock_folio(mapping, index);
> if (!IS_ERR(folio))
> return folio;
>
> policy = kvm_gmem_get_folio_policy(GMEM_I(inode), index);
> - folio = __filemap_get_folio_mpol(inode->i_mapping, index,
> - FGP_LOCK | FGP_CREAT,
> - mapping_gfp_mask(inode->i_mapping), policy);
> - mpol_cond_put(policy);
>
> + do {
> + folio = __kvm_gmem_get_folio(mapping, index, policy);
> + } while (IS_ERR(folio) && PTR_ERR(folio) == -EEXIST);
> +
> + mpol_cond_put(policy);
> return folio;
> }
>
> @@ -588,24 +625,21 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
> return MF_DELAYED;
> }
>
> -#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
> static void kvm_gmem_free_folio(struct folio *folio)
> {
> - struct page *page = folio_page(folio, 0);
> - kvm_pfn_t pfn = page_to_pfn(page);
> - int order = folio_order(folio);
> + folio_clear_unevictable(folio);
>
> - kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
> -}
> +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
> + kvm_arch_gmem_invalidate(folio_pfn(folio),
> + folio_pfn(folio) + folio_nr_pages(folio));
> #endif
> +}
>
> static const struct address_space_operations kvm_gmem_aops = {
> .dirty_folio = noop_dirty_folio,
> .migrate_folio = kvm_gmem_migrate_folio,
> .error_remove_folio = kvm_gmem_error_folio,
> -#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
> .free_folio = kvm_gmem_free_folio,
> -#endif
> };
>
> static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
^ permalink raw reply [flat|nested] 56+ messages in thread
* [RFC PATCH v1 10/37] KVM: guest_memfd: Enable INIT_SHARED on guest_memfd for x86 Coco VMs
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (8 preceding siblings ...)
2025-10-17 20:11 ` [RFC PATCH v1 09/37] KVM: guest_memfd: Skip LRU for guest_memfd folios Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-17 20:11 ` [RFC PATCH v1 11/37] KVM: guest_memfd: Add support for KVM_SET_MEMORY_ATTRIBUTES Ackerley Tng
` (26 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
Now that guest_memfd supports tracking private vs. shared within gmem
itself, allow userspace to specify INIT_SHARED on a guest_memfd instance
for x86 Confidential Computing (CoCo) VMs, so long as per-VM attributes
are disabled, i.e. when it's actually possible for a guest_memfd instance
to contain shared memory.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kvm/x86.c | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5e38c4c9cf63c..4ad451982380e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13941,14 +13941,13 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
}
#ifdef CONFIG_KVM_GUEST_MEMFD
-/*
- * KVM doesn't yet support initializing guest_memfd memory as shared for VMs
- * with private memory (the private vs. shared tracking needs to be moved into
- * guest_memfd).
- */
bool kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
{
- return !kvm_arch_has_private_mem(kvm);
+ /*
+ * INIT_SHARED isn't supported if the memory attributes are per-VM,
+ * in which case guest_memfd can _only_ be used for private memory.
+ */
+ return !vm_memory_attributes || !kvm_arch_has_private_mem(kvm);
}
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 11/37] KVM: guest_memfd: Add support for KVM_SET_MEMORY_ATTRIBUTES
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (9 preceding siblings ...)
2025-10-17 20:11 ` [RFC PATCH v1 10/37] KVM: guest_memfd: Enable INIT_SHARED on guest_memfd for x86 Coco VMs Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-17 20:11 ` [RFC PATCH v1 12/37] KVM: Move KVM_VM_MEMORY_ATTRIBUTES config definition to x86 Ackerley Tng
` (25 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
For shared to private conversions, if refcounts on any of the folios
within the range are elevated, fail the conversion with -EAGAIN.
At the point of shared to private conversion, all folios in range are
also unmapped. The filemap_invalidate_lock() is held, so no faulting
can occur. Hence, from that point on, only transient refcounts can be
taken on the folios associated with that guest_memfd.
Hence, it is safe to do the conversion from shared to private.
After conversion is complete, refcounts may become elevated, but that
is fine since users of transient refcounts don't actually access
memory.
For private to shared conversions, there are no refcount checks. Any
transient refcounts are expected to drop their refcounts soon. The
conversion process will spin waiting for these transient refcounts to
go away.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
Documentation/virt/kvm/api.rst | 48 +++++++-
include/linux/kvm_host.h | 10 ++
include/uapi/linux/kvm.h | 9 +-
virt/kvm/Kconfig | 2 +-
virt/kvm/guest_memfd.c | 197 ++++++++++++++++++++++++++++++---
virt/kvm/kvm_main.c | 15 +--
6 files changed, 250 insertions(+), 31 deletions(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index a812769d79bf6..156ee69fa9067 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -117,7 +117,7 @@ description:
x86 includes both i386 and x86_64.
Type:
- system, vm, or vcpu.
+ system, vm, vcpu or guest_memfd.
Parameters:
what parameters are accepted by the ioctl.
@@ -6518,11 +6518,22 @@ the capability to be present.
---------------------------------
:Capability: KVM_CAP_MEMORY_ATTRIBUTES2
-:Architectures: x86
-:Type: vm ioctl
+:Architectures: all
+:Type: vm, guest_memfd ioctl
:Parameters: struct kvm_memory_attributes2 (in/out)
:Returns: 0 on success, <0 on error
+Errors:
+
+ ========== ===============================================================
+ EINVAL The specified `offset` or `size` were invalid (e.g. not
+ page aligned, causes an overflow, or size is zero).
+ EFAULT The parameter address was invalid.
+ EAGAIN Some page within requested range had unexpected refcounts. The
+ offset of the page will be returned in `error_offset`.
+ ENOMEM Ran out of memory trying to track private/shared state
+ ========== ===============================================================
+
KVM_SET_MEMORY_ATTRIBUTES2 is an extension to
KVM_SET_MEMORY_ATTRIBUTES that supports returning (writing) values to
userspace. The original (pre-extension) fields are shared with
@@ -6533,15 +6544,42 @@ Attribute values are shared with KVM_SET_MEMORY_ATTRIBUTES.
::
struct kvm_memory_attributes2 {
- __u64 address;
+ /* in */
+ union {
+ __u64 address;
+ __u64 offset;
+ };
__u64 size;
__u64 attributes;
__u64 flags;
- __u64 reserved[4];
+ /* out */
+ __u64 error_offset;
+ __u64 reserved[3];
};
#define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3)
+Set attributes for a range of offsets within a guest_memfd to
+KVM_MEMORY_ATTRIBUTE_PRIVATE to limit the specified guest_memfd backed
+memory range for guest use. Even if KVM_CAP_GUEST_MEMFD_MMAP is
+supported, after a successful call to set
+KVM_MEMORY_ATTRIBUTE_PRIVATE, the requested range will not be mappable
+into host userspace and will only be mappable by the guest.
+
+To allow the range to be mappable into host userspace again, call
+KVM_SET_MEMORY_ATTRIBUTES2 on the guest_memfd again with
+KVM_MEMORY_ATTRIBUTE_PRIVATE unset.
+
+If this ioctl returns -EAGAIN, the offset of the page with unexpected
+refcounts will be returned in `error_offset`. This can occur if there
+are transient refcounts on the pages, taken by other parts of the
+kernel.
+
+Userspace is expected to figure out how to remove all known refcounts
+on the shared pages, such as refcounts taken by get_user_pages(), and
+try the ioctl again. A possible source of these long term refcounts is
+if the guest_memfd memory was pinned in IOMMU page tables.
+
See also: :ref: `KVM_SET_MEMORY_ATTRIBUTES`.
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b48632ee242b3..962055309084c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2515,6 +2515,16 @@ static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
}
#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
+static inline u64 kvm_supported_mem_attributes(struct kvm *kvm)
+{
+#ifdef kvm_arch_has_private_mem
+ if (!kvm || kvm_arch_has_private_mem(kvm))
+ return KVM_MEMORY_ATTRIBUTE_PRIVATE;
+#endif
+
+ return 0;
+}
+
typedef unsigned long (kvm_get_memory_attributes_t)(struct kvm *kvm, gfn_t gfn);
DECLARE_STATIC_CALL(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index c300e38c7c9cd..cdb00866efe49 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -964,6 +964,7 @@ struct kvm_enable_cap {
#define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243
#define KVM_CAP_GUEST_MEMFD_FLAGS 244
#define KVM_CAP_MEMORY_ATTRIBUTES2 245
+#define KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES 246
struct kvm_irq_routing_irqchip {
__u32 irqchip;
@@ -1622,11 +1623,15 @@ struct kvm_pre_fault_memory {
#define KVM_SET_MEMORY_ATTRIBUTES2 _IOWR(KVMIO, 0xd6, struct kvm_memory_attributes2)
struct kvm_memory_attributes2 {
- __u64 address;
+ union {
+ __u64 address;
+ __u64 offset;
+ };
__u64 size;
__u64 attributes;
__u64 flags;
- __u64 reserved[4];
+ __u64 error_offset;
+ __u64 reserved[3];
};
#endif /* __LINUX_KVM_H */
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 395996977fe5a..b3473aec4d24d 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -117,7 +117,7 @@ config KVM_VM_MEMORY_ATTRIBUTES
bool
config KVM_GUEST_MEMFD
- depends on KVM_GENERIC_MMU_NOTIFIER
+ select KVM_MEMORY_ATTRIBUTES
select XARRAY_MULTI
bool
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index dab2b3ce78bc8..5ec38e9395d22 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -73,6 +73,21 @@ static bool kvm_gmem_is_shared_mem(struct inode *inode, pgoff_t index)
return !kvm_gmem_is_private_mem(inode, index);
}
+static bool kvm_gmem_range_has_attributes(struct maple_tree *mt,
+ pgoff_t index, size_t nr_pages,
+ u64 attributes)
+{
+ pgoff_t end = index + nr_pages - 1;
+ void *entry;
+
+ mt_for_each(mt, entry, index, end) {
+ if (xa_to_value(entry) != attributes)
+ return false;
+ }
+
+ return true;
+}
+
static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
pgoff_t index, struct folio *folio)
{
@@ -219,10 +234,12 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
{
- if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
- return KVM_FILTER_SHARED;
-
- return KVM_FILTER_PRIVATE;
+ /*
+ * TODO: Limit invalidations based on the to-be-invalidated range, i.e.
+ * invalidate shared/private if and only if there can possibly be
+ * such mappings.
+ */
+ return KVM_FILTER_SHARED | KVM_FILTER_PRIVATE;
}
static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start,
@@ -583,11 +600,172 @@ unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_memory_attributes);
+static bool kvm_gmem_is_safe_for_conversion(struct inode *inode, pgoff_t start,
+ size_t nr_pages, pgoff_t *err_index)
+{
+ struct address_space *mapping = inode->i_mapping;
+ const int filemap_get_folios_refcount = 1;
+ pgoff_t last = start + nr_pages - 1;
+ struct folio_batch fbatch;
+ bool safe = true;
+ int i;
+
+ folio_batch_init(&fbatch);
+ while (safe && filemap_get_folios(mapping, &start, last, &fbatch)) {
+
+ for (i = 0; i < folio_batch_count(&fbatch); ++i) {
+ struct folio *folio = fbatch.folios[i];
+
+ if (folio_ref_count(folio) !=
+ folio_nr_pages(folio) + filemap_get_folios_refcount) {
+ safe = false;
+ *err_index = folio->index;
+ break;
+ }
+ }
+
+ folio_batch_release(&fbatch);
+ }
+
+ return safe;
+}
+
+/*
+ * Preallocate memory for attributes to be stored on a maple tree, pointed to
+ * by mas. Adjacent ranges with attributes identical to the new attributes
+ * will be merged. Also sets mas's bounds up for storing attributes.
+ *
+ * This maintains the invariant that ranges with the same attributes will
+ * always be merged.
+ */
+static int kvm_gmem_mas_preallocate(struct ma_state *mas, u64 attributes,
+ pgoff_t start, size_t nr_pages)
+{
+ pgoff_t end = start + nr_pages;
+ pgoff_t last = end - 1;
+ void *entry;
+
+ /* Try extending range. entry is NULL on overflow/wrap-around. */
+ mas_set_range(mas, end, end);
+ entry = mas_find(mas, end);
+ if (entry && xa_to_value(entry) == attributes)
+ last = mas->last;
+
+ mas_set_range(mas, start - 1, start - 1);
+ entry = mas_find(mas, start - 1);
+ if (entry && xa_to_value(entry) == attributes)
+ start = mas->index;
+
+ mas_set_range(mas, start, last);
+ return mas_preallocate(mas, xa_mk_value(attributes), GFP_KERNEL);
+}
+
+static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
+ size_t nr_pages, uint64_t attrs,
+ pgoff_t *err_index)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct gmem_inode *gi = GMEM_I(inode);
+ pgoff_t end = start + nr_pages;
+ struct maple_tree *mt;
+ struct ma_state mas;
+ int r;
+
+ mt = &gi->attributes;
+
+ filemap_invalidate_lock(mapping);
+
+ mas_init(&mas, mt, start);
+
+ if (kvm_gmem_range_has_attributes(mt, start, nr_pages, attrs))
+ goto done;
+
+ r = kvm_gmem_mas_preallocate(&mas, attrs, start, nr_pages);
+ if (r) {
+ *err_index = start;
+ goto out;
+ }
+
+ unmap_mapping_pages(mapping, start, nr_pages, false);
+
+ if (!kvm_gmem_is_safe_for_conversion(inode, start, nr_pages, err_index)) {
+ mas_destroy(&mas);
+ r = -EAGAIN;
+ goto out;
+ }
+
+ kvm_gmem_invalidate_begin(inode, start, end);
+
+ mas_store_prealloc(&mas, xa_mk_value(attrs));
+
+ kvm_gmem_invalidate_end(inode, start, end);
+done:
+ r = 0;
+out:
+ filemap_invalidate_unlock(mapping);
+ return r;
+}
+
+static long kvm_gmem_set_attributes(struct file *file, void __user *argp)
+{
+ struct gmem_file *f = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct kvm_memory_attributes2 attrs;
+ pgoff_t err_index;
+ size_t nr_pages;
+ pgoff_t index;
+ int r;
+
+ if (copy_from_user(&attrs, argp, sizeof(attrs)))
+ return -EFAULT;
+
+ if (attrs.flags)
+ return -EINVAL;
+ if (attrs.attributes & ~kvm_supported_mem_attributes(f->kvm))
+ return -EINVAL;
+ if (attrs.size == 0 || attrs.offset + attrs.size < attrs.offset)
+ return -EINVAL;
+ if (!PAGE_ALIGNED(attrs.offset) || !PAGE_ALIGNED(attrs.offset))
+ return -EINVAL;
+
+ if (attrs.offset > inode->i_size ||
+ attrs.offset + attrs.size > inode->i_size)
+ return -EINVAL;
+
+ nr_pages = attrs.size >> PAGE_SHIFT;
+ index = attrs.offset >> PAGE_SHIFT;
+ r = __kvm_gmem_set_attributes(inode, index, nr_pages, attrs.attributes,
+ &err_index);
+ if (r) {
+ attrs.error_offset = err_index << PAGE_SHIFT;
+
+ if (copy_to_user(argp, &attrs, sizeof(attrs)))
+ return -EFAULT;
+ }
+
+ return r;
+}
+
+static long kvm_gmem_ioctl(struct file *file, unsigned int ioctl,
+ unsigned long arg)
+{
+ switch (ioctl) {
+ case KVM_SET_MEMORY_ATTRIBUTES2:
+ if (vm_memory_attributes)
+ return -ENOTTY;
+
+ return kvm_gmem_set_attributes(file, (void __user *)arg);
+ default:
+ return -ENOTTY;
+ }
+}
+
static struct file_operations kvm_gmem_fops = {
.mmap = kvm_gmem_mmap,
.open = generic_file_open,
.release = kvm_gmem_release,
.fallocate = kvm_gmem_fallocate,
+ .unlocked_ioctl = kvm_gmem_ioctl,
};
static int kvm_gmem_migrate_folio(struct address_space *mapping,
@@ -943,20 +1121,13 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
static bool kvm_gmem_range_is_private(struct gmem_inode *gi, pgoff_t index,
size_t nr_pages, struct kvm *kvm, gfn_t gfn)
{
- pgoff_t end = index + nr_pages - 1;
- void *entry;
-
if (vm_memory_attributes)
return kvm_range_has_vm_memory_attributes(kvm, gfn, gfn + nr_pages,
KVM_MEMORY_ATTRIBUTE_PRIVATE,
KVM_MEMORY_ATTRIBUTE_PRIVATE);
- mt_for_each(&gi->attributes, entry, index, end) {
- if (xa_to_value(entry) != attributes)
- return false;
- }
-
- return true;
+ return kvm_gmem_range_has_attributes(&gi->attributes, index, nr_pages,
+ KVM_MEMORY_ATTRIBUTE_PRIVATE);
}
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index dd84b377e46db..3506a2f2be041 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2435,16 +2435,6 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
-static u64 kvm_supported_mem_attributes(struct kvm *kvm)
-{
-#ifdef kvm_arch_has_private_mem
- if (!kvm || kvm_arch_has_private_mem(kvm))
- return KVM_MEMORY_ATTRIBUTE_PRIVATE;
-#endif
-
- return 0;
-}
-
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn)
{
@@ -4971,6 +4961,11 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
return 1;
case KVM_CAP_GUEST_MEMFD_FLAGS:
return kvm_gmem_get_supported_flags(kvm);
+ case KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES:
+ if (vm_memory_attributes)
+ return 0;
+
+ return kvm_supported_mem_attributes(kvm);
#endif
default:
break;
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 12/37] KVM: Move KVM_VM_MEMORY_ATTRIBUTES config definition to x86
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (10 preceding siblings ...)
2025-10-17 20:11 ` [RFC PATCH v1 11/37] KVM: guest_memfd: Add support for KVM_SET_MEMORY_ATTRIBUTES Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-17 20:11 ` [RFC PATCH v1 13/37] KVM: Let userspace disable per-VM mem attributes, enable per-gmem attributes Ackerley Tng
` (24 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
Bury KVM_VM_MEMORY_ATTRIBUTES in x86 to discourage other architectures
from adding support for per-VM memory attributes, because tracking private
vs. shared memory on a per-VM basis is now deprecated in favor of tracking
on a per-guest_memfd basis, and no other memory attributes are on the
horizon.
This will also allow modifying KVM_VM_MEMORY_ATTRIBUTES to be
user-selectable (in x86) without creating weirdness in KVM's Kconfigs.
Now that guest_memfd supports memory attributes, it's entirely possible to
run x86 CoCo VMs without support for KVM_VM_MEMORY_ATTRIBUTES.
Leave the code itself in common KVM so that it's trivial to undo this
change if new per-VM attributes do come along.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kvm/Kconfig | 4 ++++
virt/kvm/Kconfig | 4 ----
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index acb03b45ba050..49c7709e3d895 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -80,6 +80,10 @@ config KVM_WERROR
If in doubt, say "N".
+config KVM_VM_MEMORY_ATTRIBUTES
+ select KVM_MEMORY_ATTRIBUTES
+ bool
+
config KVM_SW_PROTECTED_VM
bool "Enable support for KVM software-protected VMs"
depends on EXPERT
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index b3473aec4d24d..72b19813e5412 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -112,10 +112,6 @@ config KVM_MEMORY_ATTRIBUTES
depends on KVM_GENERIC_MMU_NOTIFIER
bool
-config KVM_VM_MEMORY_ATTRIBUTES
- select KVM_MEMORY_ATTRIBUTES
- bool
-
config KVM_GUEST_MEMFD
select KVM_MEMORY_ATTRIBUTES
select XARRAY_MULTI
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 13/37] KVM: Let userspace disable per-VM mem attributes, enable per-gmem attributes
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (11 preceding siblings ...)
2025-10-17 20:11 ` [RFC PATCH v1 12/37] KVM: Move KVM_VM_MEMORY_ATTRIBUTES config definition to x86 Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-17 20:11 ` [RFC PATCH v1 14/37] KVM: selftests: Create gmem fd before "regular" fd when adding memslot Ackerley Tng
` (23 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
Make vm_memory_attributes a module parameter so that userspace can disable
the use of memory attributes on the VM level.
To avoid inconsistencies in the way memory attributes are tracked in KVM
and guest_memfd, the vm_memory_attributes module_param is made
read-only (0444).
Make CONFIG_KVM_VM_MEMORY_ATTRIBUTES selectable, only for (CoCo) VM types
that might use vm_memory_attributes.
Signed-off-by: Sean Christopherson <seanjc@google.com>
[Drop compile-time check for CONFIG_KVM_VM_MEMORY_ATTRIBUTES in
kvm_gmem_range_is_private() since vm_memory_attributes, if defined false
with a macro, should elide generation of the if block anyway]
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
arch/x86/kvm/Kconfig | 13 +++++++++----
virt/kvm/kvm_main.c | 1 +
2 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 49c7709e3d895..5cd5046c542ba 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -82,13 +82,20 @@ config KVM_WERROR
config KVM_VM_MEMORY_ATTRIBUTES
select KVM_MEMORY_ATTRIBUTES
- bool
+ depends on KVM_SW_PROTECTED_VM || KVM_INTEL_TDX || KVM_AMD_SEV
+ bool "Enable per-VM memory attributes (for CoCo VMs)"
+ help
+ Enable support for per-VM memory attributes, which are deprecated in
+ favor of tracking memory attributes in guest_memfd. Select this if
+ you need to run CoCo VMs using a VMM that doesn't support guest_memfd
+ memory attributes.
+
+ If unsure, say N.
config KVM_SW_PROTECTED_VM
bool "Enable support for KVM software-protected VMs"
depends on EXPERT
depends on KVM_X86 && X86_64
- select KVM_VM_MEMORY_ATTRIBUTES
help
Enable support for KVM software-protected VMs. Currently, software-
protected VMs are purely a development and testing vehicle for
@@ -138,7 +145,6 @@ config KVM_INTEL_TDX
bool "Intel Trust Domain Extensions (TDX) support"
default y
depends on INTEL_TDX_HOST
- select KVM_VM_MEMORY_ATTRIBUTES
select HAVE_KVM_ARCH_GMEM_POPULATE
help
Provides support for launching Intel Trust Domain Extensions (TDX)
@@ -162,7 +168,6 @@ config KVM_AMD_SEV
depends on KVM_AMD && X86_64
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
select ARCH_HAS_CC_PLATFORM
- select KVM_VM_MEMORY_ATTRIBUTES
select HAVE_KVM_ARCH_GMEM_PREPARE
select HAVE_KVM_ARCH_GMEM_INVALIDATE
select HAVE_KVM_ARCH_GMEM_POPULATE
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3506a2f2be041..7680c868fd6e1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -104,6 +104,7 @@ module_param(allow_unsafe_mappings, bool, 0444);
#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
bool vm_memory_attributes = true;
+module_param(vm_memory_attributes, bool, 0444);
#endif
DEFINE_STATIC_CALL_RET0(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_KEY(__kvm_get_memory_attributes));
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 14/37] KVM: selftests: Create gmem fd before "regular" fd when adding memslot
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (12 preceding siblings ...)
2025-10-17 20:11 ` [RFC PATCH v1 13/37] KVM: Let userspace disable per-VM mem attributes, enable per-gmem attributes Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-17 20:11 ` [RFC PATCH v1 15/37] KVM: selftests: Rename guest_memfd{,_offset} to gmem_{fd,offset} Ackerley Tng
` (22 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
When adding a memslot associated with a guest_memfd instance, create/dup the
guest_memfd before creating the "normal" backing file. This will allow
dup'ing the gmem fd as the normal fd when guest_memfd supports mmap(),
i.e. to make guest_memfd the _only_ backing source for the memslot.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
tools/testing/selftests/kvm/lib/kvm_util.c | 45 +++++++++++-----------
1 file changed, 23 insertions(+), 22 deletions(-)
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index eef6de9a7f4b1..e35c65a173606 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1026,6 +1026,29 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
if (alignment > 1)
region->mmap_size += alignment;
+ if (flags & KVM_MEM_GUEST_MEMFD) {
+ if (guest_memfd < 0) {
+ uint32_t guest_memfd_flags = 0;
+
+ TEST_ASSERT(!guest_memfd_offset,
+ "Offset must be zero when creating new guest_memfd");
+ guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags);
+ } else {
+ /*
+ * Install a unique fd for each memslot so that the fd
+ * can be closed when the region is deleted without
+ * needing to track if the fd is owned by the framework
+ * or by the caller.
+ */
+ guest_memfd = kvm_dup(guest_memfd);
+ }
+
+ region->region.guest_memfd = guest_memfd;
+ region->region.guest_memfd_offset = guest_memfd_offset;
+ } else {
+ region->region.guest_memfd = -1;
+ }
+
region->fd = -1;
if (backing_src_is_shared(src_type))
region->fd = kvm_memfd_alloc(region->mmap_size,
@@ -1055,28 +1078,6 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
region->backing_src_type = src_type;
- if (flags & KVM_MEM_GUEST_MEMFD) {
- if (guest_memfd < 0) {
- uint32_t guest_memfd_flags = 0;
- TEST_ASSERT(!guest_memfd_offset,
- "Offset must be zero when creating new guest_memfd");
- guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags);
- } else {
- /*
- * Install a unique fd for each memslot so that the fd
- * can be closed when the region is deleted without
- * needing to track if the fd is owned by the framework
- * or by the caller.
- */
- guest_memfd = kvm_dup(guest_memfd);
- }
-
- region->region.guest_memfd = guest_memfd;
- region->region.guest_memfd_offset = guest_memfd_offset;
- } else {
- region->region.guest_memfd = -1;
- }
-
region->unused_phy_pages = sparsebit_alloc();
if (vm_arch_has_protected_memory(vm))
region->protected_phy_pages = sparsebit_alloc();
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 15/37] KVM: selftests: Rename guest_memfd{,_offset} to gmem_{fd,offset}
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (13 preceding siblings ...)
2025-10-17 20:11 ` [RFC PATCH v1 14/37] KVM: selftests: Create gmem fd before "regular" fd when adding memslot Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-17 20:11 ` [RFC PATCH v1 16/37] KVM: selftests: Add support for mmap() on guest_memfd in core library Ackerley Tng
` (21 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
Rename local variables and function parameters for the guest memory file
descriptor and its offset to use a "gmem_" prefix instead of
"guest_memfd_".
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
tools/testing/selftests/kvm/lib/kvm_util.c | 26 +++++++++++-----------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index e35c65a173606..8b714270cf381 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -912,7 +912,7 @@ void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
uint64_t gpa, uint64_t size, void *hva,
- uint32_t guest_memfd, uint64_t guest_memfd_offset)
+ uint32_t gmem_fd, uint64_t gmem_offset)
{
struct kvm_userspace_memory_region2 region = {
.slot = slot,
@@ -920,8 +920,8 @@ int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flag
.guest_phys_addr = gpa,
.memory_size = size,
.userspace_addr = (uintptr_t)hva,
- .guest_memfd = guest_memfd,
- .guest_memfd_offset = guest_memfd_offset,
+ .guest_memfd = gmem_fd,
+ .guest_memfd_offset = gmem_offset,
};
TEST_REQUIRE_SET_USER_MEMORY_REGION2();
@@ -931,10 +931,10 @@ int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flag
void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
uint64_t gpa, uint64_t size, void *hva,
- uint32_t guest_memfd, uint64_t guest_memfd_offset)
+ uint32_t gmem_fd, uint64_t gmem_offset)
{
int ret = __vm_set_user_memory_region2(vm, slot, flags, gpa, size, hva,
- guest_memfd, guest_memfd_offset);
+ gmem_fd, gmem_offset);
TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed, errno = %d (%s)",
errno, strerror(errno));
@@ -944,7 +944,7 @@ void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags
/* FIXME: This thing needs to be ripped apart and rewritten. */
void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
uint64_t gpa, uint32_t slot, uint64_t npages, uint32_t flags,
- int guest_memfd, uint64_t guest_memfd_offset)
+ int gmem_fd, uint64_t gmem_offset)
{
int ret;
struct userspace_mem_region *region;
@@ -1027,12 +1027,12 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
region->mmap_size += alignment;
if (flags & KVM_MEM_GUEST_MEMFD) {
- if (guest_memfd < 0) {
- uint32_t guest_memfd_flags = 0;
+ if (gmem_fd < 0) {
+ uint32_t gmem_flags = 0;
- TEST_ASSERT(!guest_memfd_offset,
+ TEST_ASSERT(!gmem_offset,
"Offset must be zero when creating new guest_memfd");
- guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags);
+ gmem_fd = vm_create_guest_memfd(vm, mem_size, gmem_flags);
} else {
/*
* Install a unique fd for each memslot so that the fd
@@ -1040,11 +1040,11 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
* needing to track if the fd is owned by the framework
* or by the caller.
*/
- guest_memfd = kvm_dup(guest_memfd);
+ gmem_fd = kvm_dup(gmem_fd);
}
- region->region.guest_memfd = guest_memfd;
- region->region.guest_memfd_offset = guest_memfd_offset;
+ region->region.guest_memfd = gmem_fd;
+ region->region.guest_memfd_offset = gmem_offset;
} else {
region->region.guest_memfd = -1;
}
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 16/37] KVM: selftests: Add support for mmap() on guest_memfd in core library
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (14 preceding siblings ...)
2025-10-17 20:11 ` [RFC PATCH v1 15/37] KVM: selftests: Rename guest_memfd{,_offset} to gmem_{fd,offset} Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-24 16:48 ` Ackerley Tng
2025-10-17 20:11 ` [RFC PATCH v1 17/37] KVM: selftests: Update framework to use KVM_SET_MEMORY_ATTRIBUTES2 Ackerley Tng
` (20 subsequent siblings)
36 siblings, 1 reply; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
Accept gmem_flags in vm_mem_add() to be able to create a guest_memfd within
vm_mem_add().
When vm_mem_add() is used to set up a guest_memfd for a memslot, set up the
provided (or created) gmem_fd as the fd for the user memory region. This
makes it available to be mmap()-ed, just like fds from other memory
sources. mmap() from guest_memfd using the provided gmem_flags and
gmem_offset.
Add a kvm_slot_to_fd() helper to provide convenient access to the file
descriptor of a memslot.
Update existing callers of vm_mem_add() to pass 0 for gmem_flags to
preserve existing behavior.
Signed-off-by: Sean Christopherson <seanjc@google.com>
[For guest_memfds, mmap() using gmem_offset instead of 0 all the time.]
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
tools/testing/selftests/kvm/include/kvm_util.h | 7 ++++++-
tools/testing/selftests/kvm/lib/kvm_util.c | 18 ++++++++++--------
.../kvm/x86/private_mem_conversions_test.c | 2 +-
3 files changed, 17 insertions(+), 10 deletions(-)
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 45159638d5dde..de8ae9be19067 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -678,7 +678,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
uint32_t flags);
void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
uint64_t gpa, uint32_t slot, uint64_t npages, uint32_t flags,
- int guest_memfd_fd, uint64_t guest_memfd_offset);
+ int gmem_fd, uint64_t gmem_offset, uint64_t gmem_flags);
#ifndef vm_arch_has_protected_memory
static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm)
@@ -711,6 +711,11 @@ void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva);
vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva);
void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa);
+static inline int kvm_slot_to_fd(struct kvm_vm *vm, uint32_t slot)
+{
+ return memslot2region(vm, slot)->fd;
+}
+
#ifndef vcpu_arch_put_guest
#define vcpu_arch_put_guest(mem, val) do { (mem) = (val); } while (0)
#endif
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 8b714270cf381..19c0445c0b296 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -944,12 +944,13 @@ void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags
/* FIXME: This thing needs to be ripped apart and rewritten. */
void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
uint64_t gpa, uint32_t slot, uint64_t npages, uint32_t flags,
- int gmem_fd, uint64_t gmem_offset)
+ int gmem_fd, uint64_t gmem_offset, uint64_t gmem_flags)
{
int ret;
struct userspace_mem_region *region;
size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
size_t mem_size = npages * vm->page_size;
+ off_t mmap_offset;
size_t alignment;
TEST_REQUIRE_SET_USER_MEMORY_REGION2();
@@ -1028,8 +1029,6 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
if (flags & KVM_MEM_GUEST_MEMFD) {
if (gmem_fd < 0) {
- uint32_t gmem_flags = 0;
-
TEST_ASSERT(!gmem_offset,
"Offset must be zero when creating new guest_memfd");
gmem_fd = vm_create_guest_memfd(vm, mem_size, gmem_flags);
@@ -1050,13 +1049,16 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
}
region->fd = -1;
- if (backing_src_is_shared(src_type))
+ if (flags & KVM_MEM_GUEST_MEMFD && gmem_flags & GUEST_MEMFD_FLAG_MMAP)
+ region->fd = kvm_dup(gmem_fd);
+ else if (backing_src_is_shared(src_type))
region->fd = kvm_memfd_alloc(region->mmap_size,
src_type == VM_MEM_SRC_SHARED_HUGETLB);
- region->mmap_start = kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE,
- vm_mem_backing_src_alias(src_type)->flag,
- region->fd);
+ mmap_offset = flags & KVM_MEM_GUEST_MEMFD ? gmem_offset : 0;
+ region->mmap_start = __kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE,
+ vm_mem_backing_src_alias(src_type)->flag,
+ region->fd, mmap_offset);
TEST_ASSERT(!is_backing_src_hugetlb(src_type) ||
region->mmap_start == align_ptr_up(region->mmap_start, backing_src_pagesz),
@@ -1117,7 +1119,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
uint64_t gpa, uint32_t slot, uint64_t npages,
uint32_t flags)
{
- vm_mem_add(vm, src_type, gpa, slot, npages, flags, -1, 0);
+ vm_mem_add(vm, src_type, gpa, slot, npages, flags, -1, 0, 0);
}
/*
diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
index 1969f4ab9b280..41f6b38f04071 100644
--- a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
+++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
@@ -399,7 +399,7 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t
for (i = 0; i < nr_memslots; i++)
vm_mem_add(vm, src_type, BASE_DATA_GPA + slot_size * i,
BASE_DATA_SLOT + i, slot_size / vm->page_size,
- KVM_MEM_GUEST_MEMFD, memfd, slot_size * i);
+ KVM_MEM_GUEST_MEMFD, memfd, slot_size * i, 0);
for (i = 0; i < nr_vcpus; i++) {
uint64_t gpa = BASE_DATA_GPA + i * per_cpu_size;
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* Re: [RFC PATCH v1 16/37] KVM: selftests: Add support for mmap() on guest_memfd in core library
2025-10-17 20:11 ` [RFC PATCH v1 16/37] KVM: selftests: Add support for mmap() on guest_memfd in core library Ackerley Tng
@ 2025-10-24 16:48 ` Ackerley Tng
2025-10-24 18:18 ` Sean Christopherson
0 siblings, 1 reply; 56+ messages in thread
From: Ackerley Tng @ 2025-10-24 16:48 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: akpm, binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jroedel, jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, seanjc, shakeel.butt, shuah,
steven.price, steven.sistare, suzuki.poulose, tabba, tglx,
thomas.lendacky, vannapurve, vbabka, viro, vkuznets, wei.w.wang,
will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu, yuzenghui,
zhiquan1.li
Ackerley Tng <ackerleytng@google.com> writes:
> From: Sean Christopherson <seanjc@google.com>
>
> Accept gmem_flags in vm_mem_add() to be able to create a guest_memfd within
> vm_mem_add().
>
> When vm_mem_add() is used to set up a guest_memfd for a memslot, set up the
> provided (or created) gmem_fd as the fd for the user memory region. This
> makes it available to be mmap()-ed from just like fds from other memory
> sources. mmap() from guest_memfd using the provided gmem_flags and
> gmem_offset.
>
> Add a kvm_slot_to_fd() helper to provide convenient access to the file
> descriptor of a memslot.
>
> Update existing callers of vm_mem_add() to pass 0 for gmem_flags to
> preserve existing behavior.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> [For guest_memfds, mmap() using gmem_offset instead of 0 all the time.]
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> ---
> tools/testing/selftests/kvm/include/kvm_util.h | 7 ++++++-
> tools/testing/selftests/kvm/lib/kvm_util.c | 18 ++++++++++--------
> .../kvm/x86/private_mem_conversions_test.c | 2 +-
> 3 files changed, 17 insertions(+), 10 deletions(-)
>
>
> [...snip...]
>
> @@ -1050,13 +1049,16 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
> }
>
> region->fd = -1;
> - if (backing_src_is_shared(src_type))
> + if (flags & KVM_MEM_GUEST_MEMFD && gmem_flags & GUEST_MEMFD_FLAG_MMAP)
> + region->fd = kvm_dup(gmem_fd);
> + else if (backing_src_is_shared(src_type))
> region->fd = kvm_memfd_alloc(region->mmap_size,
> src_type == VM_MEM_SRC_SHARED_HUGETLB);
>
Doing this makes it hard to test the legacy dual-backing case.
It actually broke x86/private_mem_conversions_test for the legacy
dual-backing case because there's no way to mmap or provide a
userspace_address from the memory provider that is not guest_memfd, as
determined by src_type.
I didn't test the legacy dual-backing case before posting this RFC and
probably should have.
> - region->mmap_start = kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE,
> - vm_mem_backing_src_alias(src_type)->flag,
> - region->fd);
> + mmap_offset = flags & KVM_MEM_GUEST_MEMFD ? gmem_offset : 0;
> + region->mmap_start = __kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE,
> + vm_mem_backing_src_alias(src_type)->flag,
> + region->fd, mmap_offset);
>
> TEST_ASSERT(!is_backing_src_hugetlb(src_type) ||
> region->mmap_start == align_ptr_up(region->mmap_start, backing_src_pagesz),
> @@ -1117,7 +1119,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
> uint64_t gpa, uint32_t slot, uint64_t npages,
> uint32_t flags)
> {
> - vm_mem_add(vm, src_type, gpa, slot, npages, flags, -1, 0);
> + vm_mem_add(vm, src_type, gpa, slot, npages, flags, -1, 0, 0);
> }
>
> /*
> diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
> index 1969f4ab9b280..41f6b38f04071 100644
> --- a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
> +++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
> @@ -399,7 +399,7 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t
> for (i = 0; i < nr_memslots; i++)
> vm_mem_add(vm, src_type, BASE_DATA_GPA + slot_size * i,
> BASE_DATA_SLOT + i, slot_size / vm->page_size,
> - KVM_MEM_GUEST_MEMFD, memfd, slot_size * i);
> + KVM_MEM_GUEST_MEMFD, memfd, slot_size * i, 0);
>
> for (i = 0; i < nr_vcpus; i++) {
> uint64_t gpa = BASE_DATA_GPA + i * per_cpu_size;
> --
> 2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC PATCH v1 16/37] KVM: selftests: Add support for mmap() on guest_memfd in core library
2025-10-24 16:48 ` Ackerley Tng
@ 2025-10-24 18:18 ` Sean Christopherson
2025-10-27 12:51 ` Ackerley Tng
0 siblings, 1 reply; 56+ messages in thread
From: Sean Christopherson @ 2025-10-24 18:18 UTC (permalink / raw)
To: Ackerley Tng
Cc: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86, akpm,
binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jroedel, jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, shakeel.butt, shuah,
steven.price, steven.sistare, suzuki.poulose, tabba, tglx,
thomas.lendacky, vannapurve, vbabka, viro, vkuznets, wei.w.wang,
will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu, yuzenghui,
zhiquan1.li
On Fri, Oct 24, 2025, Ackerley Tng wrote:
> Ackerley Tng <ackerleytng@google.com> writes:
>
> > From: Sean Christopherson <seanjc@google.com>
> >
> > Accept gmem_flags in vm_mem_add() to be able to create a guest_memfd within
> > vm_mem_add().
> >
> > When vm_mem_add() is used to set up a guest_memfd for a memslot, set up the
> > provided (or created) gmem_fd as the fd for the user memory region. This
> > makes it available to be mmap()-ed from just like fds from other memory
> > sources. mmap() from guest_memfd using the provided gmem_flags and
> > gmem_offset.
> >
> > Add a kvm_slot_to_fd() helper to provide convenient access to the file
> > descriptor of a memslot.
> >
> > Update existing callers of vm_mem_add() to pass 0 for gmem_flags to
> > preserve existing behavior.
> >
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > [For guest_memfds, mmap() using gmem_offset instead of 0 all the time.]
> > Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> > ---
> > tools/testing/selftests/kvm/include/kvm_util.h | 7 ++++++-
> > tools/testing/selftests/kvm/lib/kvm_util.c | 18 ++++++++++--------
> > .../kvm/x86/private_mem_conversions_test.c | 2 +-
> > 3 files changed, 17 insertions(+), 10 deletions(-)
> >
> >
> > [...snip...]
> >
> > @@ -1050,13 +1049,16 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
> > }
> >
> > region->fd = -1;
> > - if (backing_src_is_shared(src_type))
> > + if (flags & KVM_MEM_GUEST_MEMFD && gmem_flags & GUEST_MEMFD_FLAG_MMAP)
> > + region->fd = kvm_dup(gmem_fd);
> > + else if (backing_src_is_shared(src_type))
> > region->fd = kvm_memfd_alloc(region->mmap_size,
> > src_type == VM_MEM_SRC_SHARED_HUGETLB);
> >
>
> Doing this makes it hard to test the legacy dual-backing case.
>
> It actually broke x86/private_mem_conversions_test for the legacy
> dual-backing case because there's no way to mmap or provide a
> userspace_address from the memory provider that is not guest_memfd, as
> determined by src_type.
Yes there is. This patch is a giant nop. The only thing that the core library
doesn't support is mmap() on guest_memfd *and* the other src_type, and IMO that
is a big "don't care", because KVM doesn't even support that combination:
if (kvm_gmem_supports_mmap(inode))
slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
I mean, we _could_ test that KVM ignores the hva for mapping, but that's a
different and unique test entirely.
I did break x86/private_mem_conversions_test (I could have sworn I tested, *sigh*),
but the bug is in:
KVM: selftests: Provide function to look up guest_memfd details from gpa
not here. And it's a trivial /facepalm-style fix:
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index ee5b63f7cb50..23a8676fee6d 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1680,7 +1680,7 @@ int kvm_gpa_to_guest_memfd(struct kvm_vm *vm, vm_paddr_t gpa, off_t *fd_offset,
gpa_offset = gpa - region->region.guest_phys_addr;
*fd_offset = region->region.guest_memfd_offset + gpa_offset;
*nr_bytes = region->region.memory_size - gpa_offset;
- return region->fd;
+ return region->region.guest_memfd;
}
/* Create an interrupt controller chip for the specified VM. */
^ permalink raw reply related [flat|nested] 56+ messages in thread
* Re: [RFC PATCH v1 16/37] KVM: selftests: Add support for mmap() on guest_memfd in core library
2025-10-24 18:18 ` Sean Christopherson
@ 2025-10-27 12:51 ` Ackerley Tng
0 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-27 12:51 UTC (permalink / raw)
To: Sean Christopherson
Cc: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86, akpm,
binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jroedel, jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, shakeel.butt, shuah,
steven.price, steven.sistare, suzuki.poulose, tabba, tglx,
thomas.lendacky, vannapurve, vbabka, viro, vkuznets, wei.w.wang,
will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu, yuzenghui,
zhiquan1.li
Sean Christopherson <seanjc@google.com> writes:
> On Fri, Oct 24, 2025, Ackerley Tng wrote:
>> Ackerley Tng <ackerleytng@google.com> writes:
>>
>> > From: Sean Christopherson <seanjc@google.com>
>> >
>> > Accept gmem_flags in vm_mem_add() to be able to create a guest_memfd within
>> > vm_mem_add().
>> >
>> > When vm_mem_add() is used to set up a guest_memfd for a memslot, set up the
>> > provided (or created) gmem_fd as the fd for the user memory region. This
>> > makes it available to be mmap()-ed from just like fds from other memory
>> > sources. mmap() from guest_memfd using the provided gmem_flags and
>> > gmem_offset.
>> >
>> > Add a kvm_slot_to_fd() helper to provide convenient access to the file
>> > descriptor of a memslot.
>> >
>> > Update existing callers of vm_mem_add() to pass 0 for gmem_flags to
>> > preserve existing behavior.
>> >
>> > Signed-off-by: Sean Christopherson <seanjc@google.com>
>> > [For guest_memfds, mmap() using gmem_offset instead of 0 all the time.]
>> > Signed-off-by: Ackerley Tng <ackerleytng@google.com>
>> > ---
>> > tools/testing/selftests/kvm/include/kvm_util.h | 7 ++++++-
>> > tools/testing/selftests/kvm/lib/kvm_util.c | 18 ++++++++++--------
>> > .../kvm/x86/private_mem_conversions_test.c | 2 +-
>> > 3 files changed, 17 insertions(+), 10 deletions(-)
>> >
>> >
>> > [...snip...]
>> >
>> > @@ -1050,13 +1049,16 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
>> > }
>> >
>> > region->fd = -1;
>> > - if (backing_src_is_shared(src_type))
>> > + if (flags & KVM_MEM_GUEST_MEMFD && gmem_flags & GUEST_MEMFD_FLAG_MMAP)
>> > + region->fd = kvm_dup(gmem_fd);
>> > + else if (backing_src_is_shared(src_type))
>> > region->fd = kvm_memfd_alloc(region->mmap_size,
>> > src_type == VM_MEM_SRC_SHARED_HUGETLB);
>> >
>>
>> Doing this makes it hard to test the legacy dual-backing case.
>>
>> It actually broke x86/private_mem_conversions_test for the legacy
>> dual-backing case because there's no way to mmap or provide a
>> userspace_address from the memory provider that is not guest_memfd, as
>> determined by src_type.
>
> Yes there is. This patch is a giant nop. The only thing that the core library
> doesn't support is mmap() on guest_memfd *and* the other src_type, and IMO that
> is big "don't care", because KVM doesn't even support that combination:
>
> if (kvm_gmem_supports_mmap(inode))
> slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
>
Makes sense.
> I mean, we _could_ test that KVM ignores the hva for mapping, but that's a
> different and unique test entirely.
>
> I did break x86/private_mem_conversions_test (I could have sworn I tested, *sigh*),
> but the bug is in:
>
> KVM: selftests: Provide function to look up guest_memfd details from gpa
>
> not here. And it's a trivial /facepalm-style fix:
>
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
> index ee5b63f7cb50..23a8676fee6d 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -1680,7 +1680,7 @@ int kvm_gpa_to_guest_memfd(struct kvm_vm *vm, vm_paddr_t gpa, off_t *fd_offset,
> gpa_offset = gpa - region->region.guest_phys_addr;
> *fd_offset = region->region.guest_memfd_offset + gpa_offset;
> *nr_bytes = region->region.memory_size - gpa_offset;
> - return region->fd;
> + return region->region.guest_memfd;
> }
>
> /* Create an interrupt controller chip for the specified VM. */
This works. Thanks!
^ permalink raw reply [flat|nested] 56+ messages in thread
* [RFC PATCH v1 17/37] KVM: selftests: Update framework to use KVM_SET_MEMORY_ATTRIBUTES2
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (15 preceding siblings ...)
2025-10-17 20:11 ` [RFC PATCH v1 16/37] KVM: selftests: Add support for mmap() on guest_memfd in core library Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-17 20:11 ` [RFC PATCH v1 18/37] KVM: selftests: Add helpers for calling ioctls on guest_memfd Ackerley Tng
` (19 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
Update KVM selftest framework to use KVM_SET_MEMORY_ATTRIBUTES2 and the
accompanying struct kvm_memory_attributes2.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
tools/testing/selftests/kvm/include/kvm_util.h | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index de8ae9be19067..019ffcec4510f 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -394,24 +394,30 @@ static inline void vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0)
vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap);
}
+#define TEST_REQUIRE_SET_MEMORY_ATTRIBUTES2() \
+ __TEST_REQUIRE(kvm_has_cap(KVM_CAP_MEMORY_ATTRIBUTES2), \
+ "KVM selftests now require KVM_SET_MEMORY_ATTRIBUTES2")
+
static inline void vm_set_memory_attributes(struct kvm_vm *vm, uint64_t gpa,
uint64_t size, uint64_t attributes)
{
- struct kvm_memory_attributes attr = {
+ struct kvm_memory_attributes2 attr = {
.attributes = attributes,
.address = gpa,
.size = size,
.flags = 0,
};
+ TEST_REQUIRE_SET_MEMORY_ATTRIBUTES2();
+
/*
- * KVM_SET_MEMORY_ATTRIBUTES overwrites _all_ attributes. These flows
+ * KVM_SET_MEMORY_ATTRIBUTES2 overwrites _all_ attributes. These flows
* need significant enhancements to support multiple attributes.
*/
TEST_ASSERT(!attributes || attributes == KVM_MEMORY_ATTRIBUTE_PRIVATE,
"Update me to support multiple attributes!");
- vm_ioctl(vm, KVM_SET_MEMORY_ATTRIBUTES, &attr);
+ vm_ioctl(vm, KVM_SET_MEMORY_ATTRIBUTES2, &attr);
}
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 18/37] KVM: selftests: Add helpers for calling ioctls on guest_memfd
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (16 preceding siblings ...)
2025-10-17 20:11 ` [RFC PATCH v1 17/37] KVM: selftests: Update framework to use KVM_SET_MEMORY_ATTRIBUTES2 Ackerley Tng
@ 2025-10-17 20:11 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 19/37] KVM: selftests: guest_memfd: Test basic single-page conversion flow Ackerley Tng
` (18 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:11 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
Add helper functions to kvm_util.h to support calling ioctls, specifically
KVM_SET_MEMORY_ATTRIBUTES2, on a guest_memfd file descriptor.
Introduce gmem_ioctl() and __gmem_ioctl() macros, modeled after the
existing vm_ioctl() helpers, to provide a standard way to call ioctls
on a guest_memfd.
Add gmem_set_memory_attributes() and its derivatives (gmem_set_private(),
gmem_set_shared()) to set memory attributes on a guest_memfd region.
Also provide "__" variants that return the ioctl error code instead of
aborting the test. These helpers will be used by upcoming guest_memfd
tests.
To avoid code duplication, factor out the check for supported memory
attributes into a new macro, TEST_ASSERT_SUPPORTED_ATTRIBUTES, and use
it in both the existing vm_set_memory_attributes() and the new
gmem_set_memory_attributes() helpers.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
.../testing/selftests/kvm/include/kvm_util.h | 87 +++++++++++++++++--
1 file changed, 79 insertions(+), 8 deletions(-)
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 019ffcec4510f..dd26a41106fae 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -310,6 +310,16 @@ static inline bool kvm_has_cap(long cap)
TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(#cmd, ret)); \
})
+#define __gmem_ioctl(gmem_fd, cmd, arg) \
+ kvm_do_ioctl(gmem_fd, cmd, arg)
+
+#define gmem_ioctl(gmem_fd, cmd, arg) \
+({ \
+ int ret = __gmem_ioctl(gmem_fd, cmd, arg); \
+ \
+ TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(#cmd, ret)); \
+})
+
static __always_inline void static_assert_is_vm(struct kvm_vm *vm) { }
#define __vm_ioctl(vm, cmd, arg) \
@@ -398,6 +408,14 @@ static inline void vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0)
__TEST_REQUIRE(kvm_has_cap(KVM_CAP_MEMORY_ATTRIBUTES2), \
"KVM selftests now require KVM_SET_MEMORY_ATTRIBUTES2")
+/*
+ * KVM_SET_MEMORY_ATTRIBUTES2 overwrites _all_ attributes. These flows need
+ * significant enhancements to support multiple attributes.
+ */
+#define TEST_ASSERT_SUPPORTED_ATTRIBUTES(attributes) \
+ TEST_ASSERT(!attributes || attributes == KVM_MEMORY_ATTRIBUTE_PRIVATE, \
+ "Update me to support multiple attributes!")
+
static inline void vm_set_memory_attributes(struct kvm_vm *vm, uint64_t gpa,
uint64_t size, uint64_t attributes)
{
@@ -409,18 +427,11 @@ static inline void vm_set_memory_attributes(struct kvm_vm *vm, uint64_t gpa,
};
TEST_REQUIRE_SET_MEMORY_ATTRIBUTES2();
-
- /*
- * KVM_SET_MEMORY_ATTRIBUTES2 overwrites _all_ attributes. These flows
- * need significant enhancements to support multiple attributes.
- */
- TEST_ASSERT(!attributes || attributes == KVM_MEMORY_ATTRIBUTE_PRIVATE,
- "Update me to support multiple attributes!");
+ TEST_ASSERT_SUPPORTED_ATTRIBUTES(attributes);
vm_ioctl(vm, KVM_SET_MEMORY_ATTRIBUTES2, &attr);
}
-
static inline void vm_mem_set_private(struct kvm_vm *vm, uint64_t gpa,
uint64_t size)
{
@@ -433,6 +444,66 @@ static inline void vm_mem_set_shared(struct kvm_vm *vm, uint64_t gpa,
vm_set_memory_attributes(vm, gpa, size, 0);
}
+static inline int __gmem_set_memory_attributes(int fd, loff_t offset,
+ uint64_t size,
+ uint64_t attributes,
+ loff_t *error_offset)
+{
+ struct kvm_memory_attributes2 attr = {
+ .attributes = attributes,
+ .offset = offset,
+ .size = size,
+ .flags = 0,
+ };
+ int r;
+
+ TEST_ASSERT_SUPPORTED_ATTRIBUTES(attributes);
+
+ r = __gmem_ioctl(fd, KVM_SET_MEMORY_ATTRIBUTES2, &attr);
+ if (r)
+ *error_offset = attr.error_offset;
+ return r;
+}
+
+static inline int __gmem_set_private(int fd, loff_t offset, uint64_t size,
+ loff_t *error_offset)
+{
+ return __gmem_set_memory_attributes(fd, offset, size,
+ KVM_MEMORY_ATTRIBUTE_PRIVATE,
+ error_offset);
+}
+
+static inline int __gmem_set_shared(int fd, loff_t offset, uint64_t size,
+ loff_t *error_offset)
+{
+ return __gmem_set_memory_attributes(fd, offset, size, 0, error_offset);
+}
+
+static inline void gmem_set_memory_attributes(int fd, loff_t offset,
+ uint64_t size, uint64_t attributes)
+{
+ struct kvm_memory_attributes2 attr = {
+ .attributes = attributes,
+ .offset = offset,
+ .size = size,
+ .flags = 0,
+ };
+
+ TEST_ASSERT_SUPPORTED_ATTRIBUTES(attributes);
+
+ gmem_ioctl(fd, KVM_SET_MEMORY_ATTRIBUTES2, &attr);
+}
+
+static inline void gmem_set_private(int fd, loff_t offset, uint64_t size)
+{
+ gmem_set_memory_attributes(fd, offset, size, KVM_MEMORY_ATTRIBUTE_PRIVATE);
+}
+
+static inline void gmem_set_shared(int fd, loff_t offset, uint64_t size)
+{
+ gmem_set_memory_attributes(fd, offset, size, 0);
+}
+
void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t gpa, uint64_t size,
bool punch_hole);
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 19/37] KVM: selftests: guest_memfd: Test basic single-page conversion flow
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (17 preceding siblings ...)
2025-10-17 20:11 ` [RFC PATCH v1 18/37] KVM: selftests: Add helpers for calling ioctls on guest_memfd Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 20/37] KVM: selftests: guest_memfd: Test conversion flow when INIT_SHARED Ackerley Tng
` (17 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
Add a selftest for the guest_memfd memory attribute conversion ioctls.
The test starts the guest_memfd as all-private (the default state), and
verifies the basic flow of converting a single page to shared and then back
to private.
Add infrastructure that supports extensions to other conversion flow
tests. This infrastructure will be used in upcoming patches for other
conversion tests.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
tools/testing/selftests/kvm/Makefile.kvm | 1 +
.../kvm/guest_memfd_conversions_test.c | 207 ++++++++++++++++++
2 files changed, 208 insertions(+)
create mode 100644 tools/testing/selftests/kvm/guest_memfd_conversions_test.c
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 148d427ff24be..ddc1bdd51b834 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -141,6 +141,7 @@ TEST_GEN_PROGS_x86 += access_tracking_perf_test
TEST_GEN_PROGS_x86 += coalesced_io_test
TEST_GEN_PROGS_x86 += dirty_log_perf_test
TEST_GEN_PROGS_x86 += guest_memfd_test
+TEST_GEN_PROGS_x86 += guest_memfd_conversions_test
TEST_GEN_PROGS_x86 += hardware_disable_test
TEST_GEN_PROGS_x86 += memslot_modification_stress_test
TEST_GEN_PROGS_x86 += memslot_perf_test
diff --git a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
new file mode 100644
index 0000000000000..e0370e92e1b24
--- /dev/null
+++ b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, Google LLC.
+ */
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <linux/align.h>
+#include <linux/kvm.h>
+#include <linux/sizes.h>
+
+#include "kvm_util.h"
+#include "kselftest_harness.h"
+#include "test_util.h"
+#include "ucall_common.h"
+
+FIXTURE(gmem_conversions) {
+ struct kvm_vcpu *vcpu;
+ int gmem_fd;
+ /* HVA of the first byte of the memory mmap()-ed from gmem_fd. */
+ char *mem;
+};
+
+typedef FIXTURE_DATA(gmem_conversions) test_data_t;
+
+FIXTURE_SETUP(gmem_conversions) { }
+
+static uint64_t page_size;
+
+static void guest_do_rmw(void);
+#define GUEST_MEMFD_SHARING_TEST_GVA 0x90000000ULL
+
+/*
+ * Defer setup until the individual test is invoked so that tests can specify
+ * the number of pages and flags for the guest_memfd instance.
+ */
+static void gmem_conversions_do_setup(test_data_t *t, int nr_pages,
+ int gmem_flags)
+{
+ const struct vm_shape shape = {
+ .mode = VM_MODE_DEFAULT,
+ .type = KVM_X86_SW_PROTECTED_VM,
+ };
+ /*
+ * Use high GPA above APIC_DEFAULT_PHYS_BASE to avoid clashing with
+ * APIC_DEFAULT_PHYS_BASE.
+ */
+ const uint64_t gpa = SZ_4G;
+ const uint32_t slot = 1;
+ struct kvm_vm *vm;
+
+ vm = __vm_create_shape_with_one_vcpu(shape, &t->vcpu, nr_pages, guest_do_rmw);
+
+ vm_mem_add(vm, VM_MEM_SRC_SHMEM, gpa, slot, nr_pages,
+ KVM_MEM_GUEST_MEMFD, -1, 0, gmem_flags);
+
+ t->gmem_fd = kvm_slot_to_fd(vm, slot);
+ t->mem = addr_gpa2hva(vm, gpa);
+ virt_map(vm, GUEST_MEMFD_SHARING_TEST_GVA, gpa, nr_pages);
+}
+
+static void gmem_conversions_do_teardown(test_data_t *t)
+{
+ /* No need to close gmem_fd, it's owned by the VM structure. */
+ kvm_vm_free(t->vcpu->vm);
+}
+
+FIXTURE_TEARDOWN(gmem_conversions)
+{
+ gmem_conversions_do_teardown(self);
+}
+
+/*
+ * In these test definition macros, __nr_pages and nr_pages are used to set up
+ * the total number of pages in the guest_memfd under test. This will be
+ * available in the test definitions as nr_pages.
+ */
+
+#define __GMEM_CONVERSION_TEST(test, __nr_pages, flags) \
+static void __gmem_conversions_##test(test_data_t *t, int nr_pages); \
+ \
+TEST_F(gmem_conversions, test) \
+{ \
+ gmem_conversions_do_setup(self, __nr_pages, flags); \
+ __gmem_conversions_##test(self, __nr_pages); \
+} \
+static void __gmem_conversions_##test(test_data_t *t, int nr_pages) \
+
+#define GMEM_CONVERSION_TEST(test, __nr_pages, flags) \
+ __GMEM_CONVERSION_TEST(test, __nr_pages, (flags) | GUEST_MEMFD_FLAG_MMAP)
+
+#define __GMEM_CONVERSION_TEST_INIT_PRIVATE(test, __nr_pages) \
+ GMEM_CONVERSION_TEST(test, __nr_pages, 0)
+
+#define GMEM_CONVERSION_TEST_INIT_PRIVATE(test) \
+ __GMEM_CONVERSION_TEST_INIT_PRIVATE(test, 1)
+
+struct guest_check_data {
+ void *mem;
+ char expected_val;
+ char write_val;
+};
+static struct guest_check_data guest_data;
+
+static void guest_do_rmw(void)
+{
+ for (;;) {
+ char *mem = READ_ONCE(guest_data.mem);
+
+ GUEST_ASSERT_EQ(READ_ONCE(*mem), READ_ONCE(guest_data.expected_val));
+ WRITE_ONCE(*mem, READ_ONCE(guest_data.write_val));
+
+ GUEST_SYNC(0);
+ }
+}
+
+static void run_guest_do_rmw(struct kvm_vcpu *vcpu, loff_t pgoff,
+ char expected_val, char write_val)
+{
+ struct ucall uc;
+ int r;
+
+ guest_data.mem = (void *)GUEST_MEMFD_SHARING_TEST_GVA + pgoff * page_size;
+ guest_data.expected_val = expected_val;
+ guest_data.write_val = write_val;
+ sync_global_to_guest(vcpu->vm, guest_data);
+
+ for (;;) {
+ r = __vcpu_run(vcpu);
+ if (!r && get_ucall(vcpu, &uc) == UCALL_PRINTF) {
+ REPORT_GUEST_PRINTF(uc);
+ continue;
+ }
+ if (r == -1 && errno == EINTR)
+ continue;
+ break;
+ }
+
+ TEST_ASSERT_EQ(r, 0);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ case UCALL_SYNC:
+ break;
+ case UCALL_PRINTF:
+ default:
+ TEST_FAIL("Unexpected ucall %lu", uc.cmd);
+ }
+}
+
+static void host_do_rmw(char *mem, loff_t pgoff, char expected_val,
+ char write_val)
+{
+ TEST_ASSERT_EQ(READ_ONCE(mem[pgoff * page_size]), expected_val);
+ WRITE_ONCE(mem[pgoff * page_size], write_val);
+}
+
+static void test_private(test_data_t *t, loff_t pgoff, char starting_val,
+ char write_val)
+{
+ TEST_EXPECT_SIGBUS(WRITE_ONCE(t->mem[pgoff * page_size], write_val));
+ run_guest_do_rmw(t->vcpu, pgoff, starting_val, write_val);
+ TEST_EXPECT_SIGBUS(READ_ONCE(t->mem[pgoff * page_size]));
+}
+
+static void test_convert_to_private(test_data_t *t, loff_t pgoff,
+ char starting_val, char write_val)
+{
+ gmem_set_private(t->gmem_fd, pgoff * page_size, page_size);
+ test_private(t, pgoff, starting_val, write_val);
+}
+
+static void test_shared(test_data_t *t, loff_t pgoff, char starting_val,
+ char host_write_val, char write_val)
+{
+ host_do_rmw(t->mem, pgoff, starting_val, host_write_val);
+ run_guest_do_rmw(t->vcpu, pgoff, host_write_val, write_val);
+ TEST_ASSERT_EQ(READ_ONCE(t->mem[pgoff * page_size]), write_val);
+}
+
+static void test_convert_to_shared(test_data_t *t, loff_t pgoff,
+ char starting_val, char host_write_val,
+ char write_val)
+{
+ gmem_set_shared(t->gmem_fd, pgoff * page_size, page_size);
+ test_shared(t, pgoff, starting_val, host_write_val, write_val);
+}
+
+GMEM_CONVERSION_TEST_INIT_PRIVATE(init_private)
+{
+ test_private(t, 0, 0, 'A');
+ test_convert_to_shared(t, 0, 'A', 'B', 'C');
+ test_convert_to_private(t, 0, 'C', 'E');
+}
+
+
+int main(int argc, char *argv[])
+{
+ TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM));
+ TEST_REQUIRE(kvm_check_cap(KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES) &
+ KVM_MEMORY_ATTRIBUTE_PRIVATE);
+
+ page_size = getpagesize();
+
+ return test_harness_run(argc, argv);
+}
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 20/37] KVM: selftests: guest_memfd: Test conversion flow when INIT_SHARED
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (18 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 19/37] KVM: selftests: guest_memfd: Test basic single-page conversion flow Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 21/37] KVM: selftests: guest_memfd: Test indexing in guest_memfd Ackerley Tng
` (16 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
Add a test case to verify that conversions between private and shared
memory work correctly when the memory is initially created as shared.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
.../selftests/kvm/guest_memfd_conversions_test.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
index e0370e92e1b24..d57e66ee11310 100644
--- a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
@@ -95,6 +95,12 @@ static void __gmem_conversions_##test(test_data_t *t, int nr_pages) \
#define GMEM_CONVERSION_TEST_INIT_PRIVATE(test) \
__GMEM_CONVERSION_TEST_INIT_PRIVATE(test, 1)
+#define __GMEM_CONVERSION_TEST_INIT_SHARED(test, __nr_pages) \
+ GMEM_CONVERSION_TEST(test, __nr_pages, GUEST_MEMFD_FLAG_INIT_SHARED)
+
+#define GMEM_CONVERSION_TEST_INIT_SHARED(test) \
+ __GMEM_CONVERSION_TEST_INIT_SHARED(test, 1)
+
struct guest_check_data {
void *mem;
char expected_val;
@@ -194,6 +200,12 @@ GMEM_CONVERSION_TEST_INIT_PRIVATE(init_private)
test_convert_to_private(t, 0, 'C', 'E');
}
+GMEM_CONVERSION_TEST_INIT_SHARED(init_shared)
+{
+ test_shared(t, 0, 0, 'A', 'B');
+ test_convert_to_private(t, 0, 'B', 'C');
+ test_convert_to_shared(t, 0, 'C', 'D', 'E');
+}
int main(int argc, char *argv[])
{
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 21/37] KVM: selftests: guest_memfd: Test indexing in guest_memfd
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (19 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 20/37] KVM: selftests: guest_memfd: Test conversion flow when INIT_SHARED Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 22/37] KVM: selftests: guest_memfd: Test conversion before allocation Ackerley Tng
` (15 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
The existing guest_memfd conversion tests only use single-page memory
regions. This provides no coverage for multi-page guest_memfd objects,
specifically whether KVM correctly handles the page index for conversion
operations. An incorrect implementation could, for example, always operate
on the first page regardless of the index provided.
Add a new test case to verify that conversions between private and shared
memory correctly target the specified page within a multi-page guest_memfd.
To support this test, add a new GMEM_CONVERSION_MULTIPAGE_TEST_INIT_SHARED
macro that handles setting up and tearing down the VM for each page
iteration. The teardown logic is adjusted to prevent a double-free in this
new scenario.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
.../kvm/guest_memfd_conversions_test.c | 56 +++++++++++++++++++
1 file changed, 56 insertions(+)
diff --git a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
index d57e66ee11310..54e7deec992d4 100644
--- a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
@@ -63,6 +63,9 @@ static void gmem_conversions_do_teardown(test_data_t *t)
{
/* No need to close gmem_fd, it's owned by the VM structure. */
kvm_vm_free(t->vcpu->vm);
+
+ /* NULL this out to avoid second free on full teardown in multipage tests. */
+ t->vcpu->vm = NULL;
}
FIXTURE_TEARDOWN(gmem_conversions)
@@ -101,6 +104,29 @@ static void __gmem_conversions_##test(test_data_t *t, int nr_pages) \
#define GMEM_CONVERSION_TEST_INIT_SHARED(test) \
__GMEM_CONVERSION_TEST_INIT_SHARED(test, 1)
+/*
+ * Repeats test over nr_pages in a guest_memfd of size nr_pages, providing each
+ * test iteration with test_page, the index of the page under test in
+ * guest_memfd. test_page takes values 0..(nr_pages - 1) inclusive.
+ */
+#define GMEM_CONVERSION_MULTIPAGE_TEST_INIT_SHARED(test, __nr_pages) \
+static void __gmem_conversions_multipage_##test(test_data_t *t, int nr_pages, \
+ const int test_page); \
+ \
+TEST_F(gmem_conversions, test) \
+{ \
+ const uint64_t flags = GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED; \
+ int i; \
+ \
+ for (i = 0; i < __nr_pages; ++i) { \
+ gmem_conversions_do_setup(self, __nr_pages, flags); \
+ __gmem_conversions_multipage_##test(self, __nr_pages, i); \
+ gmem_conversions_do_teardown(self); \
+ } \
+} \
+static void __gmem_conversions_multipage_##test(test_data_t *t, int nr_pages, \
+ const int test_page)
+
struct guest_check_data {
void *mem;
char expected_val;
@@ -207,6 +233,36 @@ GMEM_CONVERSION_TEST_INIT_SHARED(init_shared)
test_convert_to_shared(t, 0, 'C', 'D', 'E');
}
+/*
+ * Test indexing of pages within guest_memfd, using test data that is a multiple
+ * of page index.
+ */
+GMEM_CONVERSION_MULTIPAGE_TEST_INIT_SHARED(indexing, 4)
+{
+ int i;
+
+ /*
+ * Start with the highest index, to catch any errors when, perhaps, the
+ * first page is returned even for the last index.
+ */
+ for (i = nr_pages - 1; i >= 0; --i)
+ test_shared(t, i, 0, i, i * 2);
+
+ for (i = 0; i < nr_pages; ++i) {
+ if (i == test_page)
+ test_convert_to_private(t, i, i * 2, i * 4);
+ else
+ test_shared(t, i, i * 2, i * 3, i * 4);
+ }
+
+ for (i = 0; i < nr_pages; ++i) {
+ if (i == test_page)
+ test_convert_to_shared(t, i, i * 4, i * 5, i * 6);
+ else
+ test_shared(t, i, i * 4, i * 5, i * 6);
+ }
+}
+
int main(int argc, char *argv[])
{
TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM));
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 22/37] KVM: selftests: guest_memfd: Test conversion before allocation
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (20 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 21/37] KVM: selftests: guest_memfd: Test indexing in guest_memfd Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 23/37] KVM: selftests: guest_memfd: Convert with allocated folios in different layouts Ackerley Tng
` (14 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
Add two test cases to the guest_memfd conversions selftest to cover
the scenario where a conversion is requested before any memory has been
allocated in the guest_memfd region.
The KVM_SET_MEMORY_ATTRIBUTES2 ioctl can be called on a memory region at any
time. If the guest has not yet faulted in any pages for that region, the
kernel must record the conversion request and apply the requested state
when the pages are eventually allocated.
The new tests cover both conversion directions.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
.../selftests/kvm/guest_memfd_conversions_test.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
index 54e7deec992d4..3b222009227c3 100644
--- a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
@@ -263,6 +263,20 @@ GMEM_CONVERSION_MULTIPAGE_TEST_INIT_SHARED(indexing, 4)
}
}
+/*
+ * Test that even if there are no folios yet, conversion requests are recorded
+ * in guest_memfd.
+ */
+GMEM_CONVERSION_TEST_INIT_SHARED(before_allocation_shared)
+{
+ test_convert_to_private(t, 0, 0, 'A');
+}
+
+GMEM_CONVERSION_TEST_INIT_PRIVATE(before_allocation_private)
+{
+ test_convert_to_shared(t, 0, 0, 'A', 'B');
+}
+
int main(int argc, char *argv[])
{
TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM));
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 23/37] KVM: selftests: guest_memfd: Convert with allocated folios in different layouts
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (21 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 22/37] KVM: selftests: guest_memfd: Test conversion before allocation Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 24/37] KVM: selftests: guest_memfd: Test precision of conversion Ackerley Tng
` (13 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
Add a guest_memfd selftest to verify that memory conversions work
correctly with allocated folios in different layouts.
By iterating through which pages are initially faulted, the test covers
various layouts of contiguous allocated and unallocated regions, exercising
conversion with different range layouts.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
.../kvm/guest_memfd_conversions_test.c | 30 +++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
index 3b222009227c3..b42b1b27cb727 100644
--- a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
@@ -277,6 +277,36 @@ GMEM_CONVERSION_TEST_INIT_PRIVATE(before_allocation_private)
test_convert_to_shared(t, 0, 0, 'A', 'B');
}
+/*
+ * Test that when some of the folios in the conversion range are allocated,
+ * conversion requests are handled correctly in guest_memfd. Vary the ranges
+ * allocated before conversion, using test_page, to cover various layouts of
+ * contiguous allocated and unallocated regions.
+ */
+GMEM_CONVERSION_MULTIPAGE_TEST_INIT_SHARED(unallocated_folios, 8)
+{
+ const int second_page_to_fault = 4;
+ int i;
+
+ /*
+ * Fault in 2 of the pages to test filemap range operations, or just 1
+ * page when test_page == second_page_to_fault.
+ */
+ host_do_rmw(t->mem, test_page, 0, 'A');
+ if (test_page != second_page_to_fault)
+ host_do_rmw(t->mem, second_page_to_fault, 0, 'A');
+
+ gmem_set_private(t->gmem_fd, 0, nr_pages * page_size);
+ for (i = 0; i < nr_pages; ++i) {
+ char expected = (i == test_page || i == second_page_to_fault) ? 'A' : 0;
+
+ test_private(t, i, expected, 'B');
+ }
+
+ for (i = 0; i < nr_pages; ++i)
+ test_convert_to_shared(t, i, 'B', 'C', 'D');
+}
+
int main(int argc, char *argv[])
{
TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM));
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 24/37] KVM: selftests: guest_memfd: Test precision of conversion
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (22 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 23/37] KVM: selftests: guest_memfd: Convert with allocated folios in different layouts Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 25/37] KVM: selftests: guest_memfd: Test that truncation does not change shared/private status Ackerley Tng
` (12 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
Enhance the guest_memfd indexing selftest to also verify the precision of
memory conversions between private and shared.
The existing test converted a single page within a multi-page mapping but
did not explicitly check the state of the surrounding pages after the
conversion loop.
Add checks to confirm that converting a single page from shared to private
only affects the target page. Iterate through all other pages in the
guest_memfd region to ensure they remain in their original shared state,
thus verifying that the conversion operation is precise and does not have
unintended side effects.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
.../selftests/kvm/guest_memfd_conversions_test.c | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
index b42b1b27cb727..43efe4af1403c 100644
--- a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
@@ -235,7 +235,8 @@ GMEM_CONVERSION_TEST_INIT_SHARED(init_shared)
/*
* Test indexing of pages within guest_memfd, using test data that is a multiple
- * of page index.
+ * of page index. Also test the precision of conversion, that it does not
+ * affect surrounding pages.
*/
GMEM_CONVERSION_MULTIPAGE_TEST_INIT_SHARED(indexing, 4)
{
@@ -255,12 +256,20 @@ GMEM_CONVERSION_MULTIPAGE_TEST_INIT_SHARED(indexing, 4)
test_shared(t, i, i * 2, i * 3, i * 4);
}
+ /* Confirm that only one page was converted */
for (i = 0; i < nr_pages; ++i) {
if (i == test_page)
- test_convert_to_shared(t, i, i * 4, i * 5, i * 6);
+ test_private(t, i, i * 4, i * 6);
else
test_shared(t, i, i * 4, i * 5, i * 6);
}
+
+ for (i = 0; i < nr_pages; ++i) {
+ if (i == test_page)
+ test_convert_to_shared(t, i, i * 6, i * 7, i * 8);
+ else
+ test_shared(t, i, i * 6, i * 7, i * 8);
+ }
}
/*
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 25/37] KVM: selftests: guest_memfd: Test that truncation does not change shared/private status
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (23 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 24/37] KVM: selftests: guest_memfd: Test precision of conversion Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 26/37] KVM: selftests: guest_memfd: Test that shared/private status is consistent across processes Ackerley Tng
` (11 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
Add a test to verify that deallocating a page in a guest memfd region via
fallocate() with FALLOC_FL_PUNCH_HOLE does not alter the shared or private
status of the corresponding memory range.
When a page backing a guest memfd mapping is deallocated, e.g., by punching
a hole or truncating the file, and then subsequently faulted back in, the
new page must inherit the correct shared/private status tracked by
guest_memfd.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
.../selftests/kvm/guest_memfd_conversions_test.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
index 43efe4af1403c..907d415d72315 100644
--- a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
@@ -10,6 +10,7 @@
#include <linux/sizes.h>
#include "kvm_util.h"
+#include "kvm_syscalls.h"
#include "kselftest_harness.h"
#include "test_util.h"
#include "ucall_common.h"
@@ -316,6 +317,19 @@ GMEM_CONVERSION_MULTIPAGE_TEST_INIT_SHARED(unallocated_folios, 8)
test_convert_to_shared(t, i, 'B', 'C', 'D');
}
+/* Truncation should not affect shared/private status. */
+GMEM_CONVERSION_TEST_INIT_SHARED(truncate)
+{
+ host_do_rmw(t->mem, 0, 0, 'A');
+ kvm_fallocate(t->gmem_fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size);
+ host_do_rmw(t->mem, 0, 0, 'A');
+
+ test_convert_to_private(t, 0, 'A', 'B');
+
+ kvm_fallocate(t->gmem_fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size);
+ test_private(t, 0, 0, 'A');
+}
+
int main(int argc, char *argv[])
{
TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM));
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 26/37] KVM: selftests: guest_memfd: Test that shared/private status is consistent across processes
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (24 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 25/37] KVM: selftests: guest_memfd: Test that truncation does not change shared/private status Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 23:33 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 27/37] KVM: selftests: guest_memfd: Test conversion with elevated page refcount Ackerley Tng
` (10 subsequent siblings)
36 siblings, 1 reply; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
Add a test to verify that a guest_memfd's shared/private status is
consistent across processes.
The test forks a child process after creating the shared guest_memfd
region so that the second process exists alongside the main process for the
entire test.
The processes then take turns to access memory to check that the
shared/private status is consistent across processes.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
.../kvm/guest_memfd_conversions_test.c | 74 +++++++++++++++++++
1 file changed, 74 insertions(+)
diff --git a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
index 907d415d72315..e6abf2d30c62d 100644
--- a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
@@ -330,6 +330,80 @@ GMEM_CONVERSION_TEST_INIT_SHARED(truncate)
test_private(t, 0, 0, 'A');
}
+/* Test that shared/private memory protections work and are seen from any process. */
+GMEM_CONVERSION_TEST_INIT_SHARED(forked_accesses)
+{
+ /*
+ * No races are intended in this test, shared memory is only used to
+ * coordinate between processes.
+ */
+ static enum {
+ STATE_INIT,
+ STATE_CHECK_SHARED,
+ STATE_DONE_CHECKING_SHARED,
+ STATE_CHECK_PRIVATE,
+ STATE_DONE_CHECKING_PRIVATE,
+ } *test_state;
+ pid_t child_pid;
+
+ test_state = kvm_mmap(sizeof(*test_state), PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, -1);
+
+#define TEST_STATE_AWAIT(__state) \
+ while (READ_ONCE(*test_state) != __state) { \
+ if (child_pid != 0) { \
+ int status; \
+ pid_t pid; \
+ do { \
+ pid = waitpid(child_pid, &status, WNOHANG); \
+ } while (pid == -1 && errno == EINTR); \
+ if (pid == -1) \
+ TEST_FAIL("Couldn't check child status."); \
+ else if (pid != 0) \
+ TEST_FAIL("Child exited prematurely."); \
+ } \
+ }
+
+#define TEST_STATE_SET(__state) WRITE_ONCE(*test_state, __state)
+
+ child_pid = fork();
+ TEST_ASSERT(child_pid != -1, "fork failed");
+
+ if (child_pid == 0) {
+ const char inconsequential = 0xdd;
+
+ TEST_STATE_AWAIT(STATE_CHECK_SHARED);
+
+ /*
+ * This maps the pages into the child process as well, and tests
+ * that the conversion process will unmap the guest_memfd memory
+ * from all processes.
+ */
+ host_do_rmw(t->mem, 0, 0xB, 0xC);
+
+ TEST_STATE_SET(STATE_DONE_CHECKING_SHARED);
+ TEST_STATE_AWAIT(STATE_CHECK_PRIVATE);
+
+ TEST_EXPECT_SIGBUS(READ_ONCE(t->mem[0]));
+ TEST_EXPECT_SIGBUS(WRITE_ONCE(t->mem[0], inconsequential));
+
+ TEST_STATE_SET(STATE_DONE_CHECKING_PRIVATE);
+ exit(0);
+ }
+
+ test_shared(t, 0, 0, 0xA, 0xB);
+
+ TEST_STATE_SET(STATE_CHECK_SHARED);
+ TEST_STATE_AWAIT(STATE_DONE_CHECKING_SHARED);
+
+ test_convert_to_private(t, 0, 0xC, 0xD);
+
+ TEST_STATE_SET(STATE_CHECK_PRIVATE);
+ TEST_STATE_AWAIT(STATE_DONE_CHECKING_PRIVATE);
+
+ kvm_munmap(test_state, sizeof(*test_state));
+}
+
int main(int argc, char *argv[])
{
TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM));
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* Re: [RFC PATCH v1 26/37] KVM: selftests: guest_memfd: Test that shared/private status is consistent across processes
2025-10-17 20:12 ` [RFC PATCH v1 26/37] KVM: selftests: guest_memfd: Test that shared/private status is consistent across processes Ackerley Tng
@ 2025-10-17 23:33 ` Ackerley Tng
0 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 23:33 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: akpm, binbin.wu, bp, brauner, chao.p.peng, chenhuacai, corbet,
dave.hansen, dave.hansen, david, dmatlack, erdemaktas, fan.du,
fvdl, haibo1.xu, hannes, hch, hpa, hughd, ira.weiny,
isaku.yamahata, jack, james.morse, jarkko, jgg, jgowans, jhubbard,
jroedel, jthoughton, jun.miao, kai.huang, keirf, kent.overstreet,
liam.merwick, maciej.wieczor-retman, mail, maobibo,
mathieu.desnoyers, maz, mhiramat, mhocko, mic, michael.roth,
mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz, oliver.upton,
palmer, pankaj.gupta, paul.walmsley, pbonzini, peterx, pgonda,
prsampat, pvorel, qperret, richard.weiyang, rick.p.edgecombe,
rientjes, rostedt, roypat, rppt, seanjc, shakeel.butt, shuah,
steven.price, steven.sistare, suzuki.poulose, tabba, tglx,
thomas.lendacky, vannapurve, vbabka, viro, vkuznets, wei.w.wang,
will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu, yuzenghui,
zhiquan1.li
Ackerley Tng <ackerleytng@google.com> writes:
> From: Sean Christopherson <seanjc@google.com>
>
> Add a test to verify that a guest_memfd's shared/private status is
> consistent across processes.
>
Missed copying Sean's note from [1]. Rephrased:
Test that on shared to private conversion, any shared pages previously
mapped in any process are unmapped from all processes.
[1] https://lore.kernel.org/all/aN7U1ewx8dNOKl1n@google.com/
> The test forks a child process after creating the shared guest_memfd
> region so that the second process exists alongside the main process for the
> entire test.
>
> The processes then take turns to access memory to check that the
> shared/private status is consistent across processes.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Co-developed-by: Ackerley Tng <ackerleytng@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> ---
> .../kvm/guest_memfd_conversions_test.c | 74 +++++++++++++++++++
> 1 file changed, 74 insertions(+)
>
>
> [...snip...]
>
^ permalink raw reply [flat|nested] 56+ messages in thread
* [RFC PATCH v1 27/37] KVM: selftests: guest_memfd: Test conversion with elevated page refcount
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (25 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 26/37] KVM: selftests: guest_memfd: Test that shared/private status is consistent across processes Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 28/37] KVM: selftests: Reset shared memory after hole-punching Ackerley Tng
` (9 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
Add a selftest to verify that converting a shared guest_memfd page to a
private page fails if the page has an elevated reference count.
When KVM converts a shared page to a private one, it expects the page to
have a reference count equal to the reference counts taken by the
filemap. If another kernel subsystem holds a reference to the page, for
example via pin_user_pages(), the conversion must be aborted.
This test uses the gup_test debugfs interface (which requires
CONFIG_GUP_TEST) to call pin_user_pages() on a specific page, artificially
increasing its reference count. It then attempts to convert a range of
pages, including the pinned page, from shared to private.
The test asserts that both bulk and single-page conversion attempts
correctly fail with EAGAIN for the pinned page. After the page is unpinned,
the test verifies that subsequent conversions succeed.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
.../kvm/guest_memfd_conversions_test.c | 82 +++++++++++++++++++
1 file changed, 82 insertions(+)
diff --git a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
index e6abf2d30c62d..856166f1b1dfc 100644
--- a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
@@ -14,6 +14,7 @@
#include "kselftest_harness.h"
#include "test_util.h"
#include "ucall_common.h"
+#include "../../../../mm/gup_test.h"
FIXTURE(gmem_conversions) {
struct kvm_vcpu *vcpu;
@@ -404,6 +405,87 @@ GMEM_CONVERSION_TEST_INIT_SHARED(forked_accesses)
kvm_munmap(test_state, sizeof(*test_state));
}
+static int gup_test_fd;
+
+static void pin_pages(void *vaddr, uint64_t size)
+{
+ const struct pin_longterm_test args = {
+ .addr = (uint64_t)vaddr,
+ .size = size,
+ .flags = PIN_LONGTERM_TEST_FLAG_USE_WRITE,
+ };
+
+ gup_test_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+ TEST_REQUIRE(gup_test_fd >= 0);
+
+ TEST_ASSERT_EQ(ioctl(gup_test_fd, PIN_LONGTERM_TEST_START, &args), 0);
+}
+
+static void unpin_pages(void)
+{
+ if (gup_test_fd > 0)
+ TEST_ASSERT_EQ(ioctl(gup_test_fd, PIN_LONGTERM_TEST_STOP), 0);
+}
+
+static void test_convert_to_private_fails(test_data_t *t, loff_t pgoff,
+ size_t nr_pages,
+ loff_t expected_error_offset)
+{
+ loff_t offset = pgoff * page_size;
+ loff_t error_offset = -1ul;
+ int ret;
+
+ do {
+ ret = __gmem_set_private(t->gmem_fd, offset,
+ nr_pages * page_size, &error_offset);
+ } while (ret == -1 && errno == EINTR);
+ TEST_ASSERT(ret == -1 && errno == EAGAIN,
+ "Wanted EAGAIN on page %lu, got %d (ret = %d)", pgoff,
+ errno, ret);
+ TEST_ASSERT_EQ(error_offset, expected_error_offset);
+}
+
+/*
+ * This test depends on CONFIG_GUP_TEST to provide a kernel module that exposes
+ * pin_user_pages() to userspace.
+ */
+GMEM_CONVERSION_MULTIPAGE_TEST_INIT_SHARED(elevated_refcount, 4)
+{
+ int i;
+
+ pin_pages(t->mem + test_page * page_size, page_size);
+
+ for (i = 0; i < nr_pages; i++)
+ test_shared(t, i, 0, 'A', 'B');
+
+ /*
+ * Converting in bulk should fail as long as any page in the range has
+ * unexpected refcounts.
+ */
+ test_convert_to_private_fails(t, 0, nr_pages, test_page * page_size);
+
+ for (i = 0; i < nr_pages; i++) {
+ /*
+ * Converting page-wise should also fail as long as any page in
+ * the range has unexpected refcounts.
+ */
+ if (i == test_page)
+ test_convert_to_private_fails(t, i, 1, test_page * page_size);
+ else
+ test_convert_to_private(t, i, 'B', 'C');
+ }
+
+ unpin_pages();
+
+ gmem_set_private(t->gmem_fd, 0, nr_pages * page_size);
+
+ for (i = 0; i < nr_pages; i++) {
+ char expected = i == test_page ? 'B' : 'C';
+
+ test_private(t, i, expected, 'D');
+ }
+}
+
int main(int argc, char *argv[])
{
TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM));
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 28/37] KVM: selftests: Reset shared memory after hole-punching
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (26 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 27/37] KVM: selftests: guest_memfd: Test conversion with elevated page refcount Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 29/37] KVM: selftests: Add selftests global for guest memory attributes capability Ackerley Tng
` (8 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
The private_mem_conversions_test resets shared memory to an initial
pattern at the end of each test iteration. This reset is currently
performed before (re)mapping the pages as shared.
FALLOC_FL_PUNCH_HOLE indirectly zeroes memory, since the old folios are
released and newly allocated folios are zeroed. This "clobbers" the intended
initial pattern, leaving the memory as all-zeroes for the next
iteration.
Move the memset() to occur after the hole-punch operation to ensure
the memory is correctly re-initialized with the desired pattern. While
at it, update the memset() to reset the entire data region, not just
the portion used in the last loop, to provide a fully clean slate for
the next iteration.
This was not observed before because guest_memfd was only used for
private memory, hence shared memory contents were not zeroed by the
hole punch operation.
Opportunistically add a test/check that truncation zeroes memory.
Fixes: 43f623f350ce1 ("KVM: selftests: Add x86-only selftest for private memory conversions")
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
.../selftests/kvm/x86/private_mem_conversions_test.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
index 41f6b38f04071..814187d06fcca 100644
--- a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
+++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
@@ -202,15 +202,20 @@ static void guest_test_explicit_conversion(uint64_t base_gpa, bool do_fallocate)
guest_sync_shared(gpa, size, p3, p4);
memcmp_g(gpa, p4, size);
- /* Reset the shared memory back to the initial pattern. */
- memset((void *)gpa, init_p, size);
-
/*
* Free (via PUNCH_HOLE) *all* private memory so that the next
* iteration starts from a clean slate, e.g. with respect to
* whether or not there are pages/folios in guest_mem.
*/
guest_map_shared(base_gpa, PER_CPU_DATA_SIZE, true);
+
+ /*
+ * Test that fallocate(PUNCH_HOLE) zeroes memory, then reset the
+ * entire block back to the initial pattern for the next
+ * GUEST_STAGE.
+ */
+ memcmp_g(base_gpa, 0, PER_CPU_DATA_SIZE);
+ memset((void *)base_gpa, init_p, PER_CPU_DATA_SIZE);
}
}
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 29/37] KVM: selftests: Add selftests global for guest memory attributes capability
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (27 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 28/37] KVM: selftests: Reset shared memory after hole-punching Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 30/37] KVM: selftests: Provide function to look up guest_memfd details from gpa Ackerley Tng
` (7 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
Add a global variable, kvm_has_gmem_attributes, to make the result of
checking for KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES available to all tests.
kvm_has_gmem_attributes is true if KVM tracks memory attributes per
guest_memfd, as opposed to tracking them at the VM level.
This global variable is meant to be used by the host only.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
tools/testing/selftests/kvm/include/test_util.h | 2 ++
tools/testing/selftests/kvm/lib/kvm_util.c | 4 ++++
2 files changed, 6 insertions(+)
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index b4872ba8ed124..2871a42928471 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -113,6 +113,8 @@ struct guest_random_state {
extern uint32_t guest_random_seed;
extern struct guest_random_state guest_rng;
+extern bool kvm_has_gmem_attributes;
+
struct guest_random_state new_guest_random_state(uint32_t seed);
uint32_t guest_random_u32(struct guest_random_state *state);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 19c0445c0b296..c9c59f3ecd14f 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -24,6 +24,8 @@ uint32_t guest_random_seed;
struct guest_random_state guest_rng;
static uint32_t last_guest_seed;
+bool kvm_has_gmem_attributes;
+
static size_t vcpu_mmap_sz(void);
int __open_path_or_exit(const char *path, int flags, const char *enoent_help)
@@ -2321,6 +2323,8 @@ void __attribute((constructor)) kvm_selftest_init(void)
guest_random_seed = last_guest_seed = random();
pr_info("Random seed: 0x%x\n", guest_random_seed);
+ kvm_has_gmem_attributes = kvm_has_cap(KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES);
+
kvm_selftest_arch_init();
}
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 30/37] KVM: selftests: Provide function to look up guest_memfd details from gpa
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (28 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 29/37] KVM: selftests: Add selftests global for guest memory attributes capability Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 31/37] KVM: selftests: Provide common function to set memory attributes Ackerley Tng
` (6 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
Introduce a new helper, kvm_gpa_to_guest_memfd(), to find the
guest_memfd-related details of a memory region that contains a given guest
physical address (GPA).
The function returns the file descriptor for the memfd, the offset into
the file that corresponds to the GPA, and the number of bytes remaining
in the region from that GPA.
kvm_gpa_to_guest_memfd() was factored out from vm_guest_mem_fallocate();
refactor vm_guest_mem_fallocate() to use the new helper.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
.../testing/selftests/kvm/include/kvm_util.h | 3 ++
tools/testing/selftests/kvm/lib/kvm_util.c | 34 ++++++++++++-------
2 files changed, 24 insertions(+), 13 deletions(-)
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index dd26a41106fae..e9c2696770cf0 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -404,6 +404,9 @@ static inline void vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0)
vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap);
}
+int kvm_gpa_to_guest_memfd(struct kvm_vm *vm, vm_paddr_t gpa, off_t *fd_offset,
+ uint64_t *nr_bytes);
+
#define TEST_REQUIRE_SET_MEMORY_ATTRIBUTES2() \
__TEST_REQUIRE(kvm_has_cap(KVM_CAP_MEMORY_ATTRIBUTES2), \
"KVM selftests now require KVM_SET_MEMORY_ATTRIBUTES2")
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index c9c59f3ecd14f..cb73566fdf153 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1244,27 +1244,19 @@ void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size,
bool punch_hole)
{
const int mode = FALLOC_FL_KEEP_SIZE | (punch_hole ? FALLOC_FL_PUNCH_HOLE : 0);
- struct userspace_mem_region *region;
uint64_t end = base + size;
uint64_t gpa, len;
off_t fd_offset;
- int ret;
+ int fd, ret;
for (gpa = base; gpa < end; gpa += len) {
- uint64_t offset;
+ fd = kvm_gpa_to_guest_memfd(vm, gpa, &fd_offset, &len);
+ len = min(end - gpa, len);
- region = userspace_mem_region_find(vm, gpa, gpa);
- TEST_ASSERT(region && region->region.flags & KVM_MEM_GUEST_MEMFD,
- "Private memory region not found for GPA 0x%lx", gpa);
-
- offset = gpa - region->region.guest_phys_addr;
- fd_offset = region->region.guest_memfd_offset + offset;
- len = min_t(uint64_t, end - gpa, region->region.memory_size - offset);
-
- ret = fallocate(region->region.guest_memfd, mode, fd_offset, len);
+ ret = fallocate(fd, mode, fd_offset, len);
TEST_ASSERT(!ret, "fallocate() failed to %s at %lx (len = %lu), fd = %d, mode = %x, offset = %lx",
punch_hole ? "punch hole" : "allocate", gpa, len,
- region->region.guest_memfd, mode, fd_offset);
+ fd, mode, fd_offset);
}
}
@@ -1673,6 +1665,22 @@ void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa)
return (void *) ((uintptr_t) region->host_alias + offset);
}
+/*
+ * Look up the guest_memfd details for the memory region containing @gpa.
+ *
+ * Asserts that @gpa lies in a region created with KVM_MEM_GUEST_MEMFD.
+ *
+ * @fd_offset: set to the offset into the guest_memfd that corresponds to @gpa.
+ * @nr_bytes:  set to the number of bytes from @gpa to the end of the region.
+ *
+ * Returns the guest_memfd file descriptor bound to the region.
+ */
+int kvm_gpa_to_guest_memfd(struct kvm_vm *vm, vm_paddr_t gpa, off_t *fd_offset,
+			   uint64_t *nr_bytes)
+{
+	struct userspace_mem_region *region;
+	vm_paddr_t gpa_offset;
+
+	region = userspace_mem_region_find(vm, gpa, gpa);
+	TEST_ASSERT(region && region->region.flags & KVM_MEM_GUEST_MEMFD,
+		    "guest_memfd memory region not found for GPA 0x%lx", gpa);
+
+	gpa_offset = gpa - region->region.guest_phys_addr;
+	*fd_offset = region->region.guest_memfd_offset + gpa_offset;
+	*nr_bytes = region->region.memory_size - gpa_offset;
+
+	/*
+	 * Return the guest_memfd bound to the memslot, as the fallocate() path
+	 * this was factored out of did.  region->fd is the fd used for the host
+	 * mmap() and may be -1.
+	 */
+	return region->region.guest_memfd;
+}
+
/* Create an interrupt controller chip for the specified VM. */
void vm_create_irqchip(struct kvm_vm *vm)
{
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread* [RFC PATCH v1 31/37] KVM: selftests: Provide common function to set memory attributes
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (29 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 30/37] KVM: selftests: Provide function to look up guest_memfd details from gpa Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 32/37] KVM: selftests: Check fd/flags provided to mmap() when setting up memslot Ackerley Tng
` (5 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
Introduce vm_mem_set_memory_attributes(), which handles setting of memory
attributes for a range of guest physical addresses, regardless of whether
the attributes should be set via guest_memfd or via the memory attributes
at the VM level.
Refactor existing vm_mem_set_{shared,private} functions to use the new
function.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
.../testing/selftests/kvm/include/kvm_util.h | 44 ++++++++++++++-----
1 file changed, 32 insertions(+), 12 deletions(-)
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index e9c2696770cf0..9f5338bd82b24 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -435,18 +435,6 @@ static inline void vm_set_memory_attributes(struct kvm_vm *vm, uint64_t gpa,
vm_ioctl(vm, KVM_SET_MEMORY_ATTRIBUTES2, &attr);
}
-static inline void vm_mem_set_private(struct kvm_vm *vm, uint64_t gpa,
- uint64_t size)
-{
- vm_set_memory_attributes(vm, gpa, size, KVM_MEMORY_ATTRIBUTE_PRIVATE);
-}
-
-static inline void vm_mem_set_shared(struct kvm_vm *vm, uint64_t gpa,
- uint64_t size)
-{
- vm_set_memory_attributes(vm, gpa, size, 0);
-}
-
static inline int __gmem_set_memory_attributes(int fd, loff_t offset,
uint64_t size,
uint64_t attributes,
@@ -507,6 +495,38 @@ static inline void gmem_set_shared(int fd, loff_t offset, uint64_t size)
gmem_set_memory_attributes(fd, offset, size, 0);
}
+/*
+ * Set memory attributes on the guest physical range [gpa, gpa + size).
+ *
+ * When kvm_has_gmem_attributes is set, the attributes are applied through the
+ * guest_memfd of each memory region the range touches, one region per
+ * iteration.  Otherwise the whole range is passed to
+ * vm_set_memory_attributes(), which sets attributes at the VM level.
+ */
+static inline void vm_mem_set_memory_attributes(struct kvm_vm *vm, uint64_t gpa,
+						uint64_t size, uint64_t attrs)
+{
+	if (kvm_has_gmem_attributes) {
+		uint64_t end = gpa + size;
+		uint64_t addr, len;
+		off_t fd_offset;
+		int fd;
+
+		for (addr = gpa; addr < end; addr += len) {
+			/*
+			 * Resolve the region containing the current address;
+			 * looking up @gpa on every pass would keep reusing the
+			 * first region's fd and offset.
+			 */
+			fd = kvm_gpa_to_guest_memfd(vm, addr, &fd_offset, &len);
+
+			/* Clamp to the end of the requested range. */
+			len = min(end - addr, len);
+
+			gmem_set_memory_attributes(fd, fd_offset, len, attrs);
+		}
+	} else {
+		vm_set_memory_attributes(vm, gpa, size, attrs);
+	}
+}
+
+static inline void vm_mem_set_private(struct kvm_vm *vm, uint64_t gpa,
+ uint64_t size)
+{
+ vm_mem_set_memory_attributes(vm, gpa, size, KVM_MEMORY_ATTRIBUTE_PRIVATE);
+}
+
+static inline void vm_mem_set_shared(struct kvm_vm *vm, uint64_t gpa,
+ uint64_t size)
+{
+ vm_mem_set_memory_attributes(vm, gpa, size, 0);
+}
+
void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t gpa, uint64_t size,
bool punch_hole);
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread* [RFC PATCH v1 32/37] KVM: selftests: Check fd/flags provided to mmap() when setting up memslot
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (30 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 31/37] KVM: selftests: Provide common function to set memory attributes Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 33/37] KVM: selftests: Make TEST_EXPECT_SIGBUS thread-safe Ackerley Tng
` (4 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
Assert that a valid fd provided to mmap() is accompanied by MAP_SHARED.
With an invalid fd (usually used for anonymous mappings), there are no
constraints on mmap() flags.
Add this check to make sure that when a guest_memfd is used as region->fd,
the flag provided to mmap() will include MAP_SHARED.
Signed-off-by: Sean Christopherson <seanjc@google.com>
[Rephrase assertion message.]
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
tools/testing/selftests/kvm/lib/kvm_util.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index cb73566fdf153..8603bd5c705ed 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1057,6 +1057,9 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
region->fd = kvm_memfd_alloc(region->mmap_size,
src_type == VM_MEM_SRC_SHARED_HUGETLB);
+ TEST_ASSERT(region->fd == -1 || backing_src_is_shared(src_type),
+ "A valid fd provided to mmap() must be accompanied by MAP_SHARED.");
+
mmap_offset = flags & KVM_MEM_GUEST_MEMFD ? gmem_offset : 0;
region->mmap_start = __kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE,
vm_mem_backing_src_alias(src_type)->flag,
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread* [RFC PATCH v1 33/37] KVM: selftests: Make TEST_EXPECT_SIGBUS thread-safe
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (31 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 32/37] KVM: selftests: Check fd/flags provided to mmap() when setting up memslot Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 34/37] KVM: selftests: Update private_mem_conversions_test to mmap() guest_memfd Ackerley Tng
` (3 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
The TEST_EXPECT_SIGBUS macro is not thread-safe as it uses a global
sigjmp_buf and installs a global SIGBUS signal handler. If multiple threads
execute the macro concurrently, they will race on installing the signal
handler and stomp on other threads' jump buffers, leading to incorrect test
behavior.
Make TEST_EXPECT_SIGBUS thread-safe with the following changes:
Share the KVM tests' global signal handler. sigaction() applies to all
threads; without sharing a global signal handler, one thread may have
removed the signal handler that another thread added, hence leading to
unexpected signals.
The alternative of layering signal handlers was considered, but calling
sigaction() within TEST_EXPECT_SIGBUS() necessarily creates a race. To
avoid adding new setup and teardown routines to do sigaction() and keep
usage of TEST_EXPECT_SIGBUS() simple, share the KVM tests' global signal
handler.
Opportunistically rename report_unexpected_signal to
catchall_signal_handler.
To continue to only expect SIGBUS within specific regions of code, use a
thread-specific variable, expecting_sigbus, to replace installing and
removing signal handlers.
Make the execution environment for the thread, sigjmp_buf, a
thread-specific variable.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
.../testing/selftests/kvm/include/test_util.h | 27 +++++++++----------
tools/testing/selftests/kvm/lib/kvm_util.c | 18 +++++++++----
tools/testing/selftests/kvm/lib/test_util.c | 7 -----
3 files changed, 25 insertions(+), 27 deletions(-)
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index 2871a42928471..0e4e6f7dab8fb 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -80,22 +80,19 @@ do { \
__builtin_unreachable(); \
} while (0)
-extern sigjmp_buf expect_sigbus_jmpbuf;
-void expect_sigbus_handler(int signum);
+extern __thread sigjmp_buf expect_sigbus_jmpbuf;
+extern __thread bool expecting_sigbus;
-#define TEST_EXPECT_SIGBUS(action) \
-do { \
- struct sigaction sa_old, sa_new = { \
- .sa_handler = expect_sigbus_handler, \
- }; \
- \
- sigaction(SIGBUS, &sa_new, &sa_old); \
- if (sigsetjmp(expect_sigbus_jmpbuf, 1) == 0) { \
- action; \
- TEST_FAIL("'%s' should have triggered SIGBUS", #action); \
- } \
- sigaction(SIGBUS, &sa_old, NULL); \
-} while (0)
+/*
+ * Run @action and fail the test unless it raises SIGBUS.
+ *
+ * No signal handler is installed here.  The macro sets the per-thread
+ * expecting_sigbus flag and saves the execution context in the per-thread
+ * expect_sigbus_jmpbuf; the SIGBUS handler is expected to siglongjmp() back
+ * to that context while the flag is set.  If @action completes without
+ * jumping back, TEST_FAIL() is invoked.
+ */
+#define TEST_EXPECT_SIGBUS(action) \
+ do { \
+ expecting_sigbus = true; \
+ if (sigsetjmp(expect_sigbus_jmpbuf, 1) == 0) { \
+ action; \
+ TEST_FAIL("'%s' should have triggered SIGBUS", \
+ #action); \
+ } \
+ expecting_sigbus = false; \
+ } while (0)
size_t parse_size(const char *size);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 8603bd5c705ed..41169e8cbf8af 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -2302,13 +2302,20 @@ __weak void kvm_selftest_arch_init(void)
{
}
-static void report_unexpected_signal(int signum)
+__thread sigjmp_buf expect_sigbus_jmpbuf;
+__thread bool expecting_sigbus;
+
+static void catchall_signal_handler(int signum)
{
+ switch (signum) {
+ case SIGBUS: {
+ if (expecting_sigbus)
+ siglongjmp(expect_sigbus_jmpbuf, 1);
+
+ TEST_FAIL("Unexpected SIGBUS (%d)\n", signum);
+ }
#define KVM_CASE_SIGNUM(sig) \
case sig: TEST_FAIL("Unexpected " #sig " (%d)\n", signum)
-
- switch (signum) {
- KVM_CASE_SIGNUM(SIGBUS);
KVM_CASE_SIGNUM(SIGSEGV);
KVM_CASE_SIGNUM(SIGILL);
KVM_CASE_SIGNUM(SIGFPE);
@@ -2320,12 +2327,13 @@ static void report_unexpected_signal(int signum)
void __attribute((constructor)) kvm_selftest_init(void)
{
struct sigaction sig_sa = {
- .sa_handler = report_unexpected_signal,
+ .sa_handler = catchall_signal_handler,
};
/* Tell stdout not to buffer its content. */
setbuf(stdout, NULL);
+ expecting_sigbus = false;
sigaction(SIGBUS, &sig_sa, NULL);
sigaction(SIGSEGV, &sig_sa, NULL);
sigaction(SIGILL, &sig_sa, NULL);
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index 8a1848586a857..03eb99af9b8de 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -18,13 +18,6 @@
#include "test_util.h"
-sigjmp_buf expect_sigbus_jmpbuf;
-
-void __attribute__((used)) expect_sigbus_handler(int signum)
-{
- siglongjmp(expect_sigbus_jmpbuf, 1);
-}
-
/*
* Random number generator that is usable from guest code. This is the
* Park-Miller LCG using standard constants.
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread* [RFC PATCH v1 34/37] KVM: selftests: Update private_mem_conversions_test to mmap() guest_memfd
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (32 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 33/37] KVM: selftests: Make TEST_EXPECT_SIGBUS thread-safe Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 35/37] KVM: selftests: Add script to exercise private_mem_conversions_test Ackerley Tng
` (2 subsequent siblings)
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
Update the private memory conversions selftest to also test conversions
that are done "in-place" via per-guest_memfd memory attributes. In-place
conversions require the host to be able to mmap() the guest_memfd so that
the host and guest can share the same backing physical memory.
This includes several updates, that are conditioned on the system
supporting per-guest_memfd attributes (kvm_has_gmem_attributes):
1. Set up guest_memfd requesting MMAP and INIT_SHARED.
2. With in-place conversions, the host's mapping points directly to the
guest's memory. When the guest converts a region to private, host access
to that region is blocked. Update the test to expect a SIGBUS when
attempting to access the host virtual address (HVA) of private memory.
3. Use vm_mem_set_memory_attributes(), which chooses how to set memory
attributes based on whether kvm_has_gmem_attributes.
Restrict the test to using VM_MEM_SRC_SHMEM because guest_memfd's required
mmap() flags and page sizes happen to align with those of
VM_MEM_SRC_SHMEM. As long as VM_MEM_SRC_SHMEM is used for src_type,
vm_mem_add() works as intended.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
.../kvm/x86/private_mem_conversions_test.c | 44 ++++++++++++++++---
1 file changed, 37 insertions(+), 7 deletions(-)
diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
index 814187d06fcca..6730923af830c 100644
--- a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
+++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
@@ -309,8 +309,8 @@ static void handle_exit_hypercall(struct kvm_vcpu *vcpu)
vm_guest_mem_fallocate(vm, gpa, size, map_shared);
if (set_attributes)
- vm_set_memory_attributes(vm, gpa, size,
- map_shared ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE);
+ vm_mem_set_memory_attributes(vm, gpa, size,
+ map_shared ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE);
run->hypercall.ret = 0;
}
@@ -354,8 +354,20 @@ static void *__test_mem_conversions(void *__vcpu)
size_t nr_bytes = min_t(size_t, vm->page_size, size - i);
uint8_t *hva = addr_gpa2hva(vm, gpa + i);
- /* In all cases, the host should observe the shared data. */
- memcmp_h(hva, gpa + i, uc.args[3], nr_bytes);
+ /*
+ * When using per-guest_memfd memory attributes,
+ * i.e. in-place conversion, host accesses will
+ * point at guest memory and should SIGBUS when
+ * guest memory is private. When using per-VM
+ * attributes, i.e. separate backing for shared
+ * vs. private, the host should always observe
+ * the shared data.
+ */
+ if (kvm_has_gmem_attributes &&
+ uc.args[0] == SYNC_PRIVATE)
+ TEST_EXPECT_SIGBUS(READ_ONCE(*hva));
+ else
+ memcmp_h(hva, gpa + i, uc.args[3], nr_bytes);
/* For shared, write the new pattern to guest memory. */
if (uc.args[0] == SYNC_SHARED)
@@ -384,6 +396,7 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t
const size_t slot_size = memfd_size / nr_memslots;
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
pthread_t threads[KVM_MAX_VCPUS];
+ uint64_t gmem_flags;
struct kvm_vm *vm;
int memfd, i;
@@ -399,12 +412,17 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t
vm_enable_cap(vm, KVM_CAP_EXIT_HYPERCALL, (1 << KVM_HC_MAP_GPA_RANGE));
- memfd = vm_create_guest_memfd(vm, memfd_size, 0);
+ if (kvm_has_gmem_attributes)
+ gmem_flags = GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED;
+ else
+ gmem_flags = 0;
+
+ memfd = vm_create_guest_memfd(vm, memfd_size, gmem_flags);
for (i = 0; i < nr_memslots; i++)
vm_mem_add(vm, src_type, BASE_DATA_GPA + slot_size * i,
BASE_DATA_SLOT + i, slot_size / vm->page_size,
- KVM_MEM_GUEST_MEMFD, memfd, slot_size * i, 0);
+ KVM_MEM_GUEST_MEMFD, memfd, slot_size * i, gmem_flags);
for (i = 0; i < nr_vcpus; i++) {
uint64_t gpa = BASE_DATA_GPA + i * per_cpu_size;
@@ -454,17 +472,29 @@ static void usage(const char *cmd)
int main(int argc, char *argv[])
{
- enum vm_mem_backing_src_type src_type = DEFAULT_VM_MEM_SRC;
+ enum vm_mem_backing_src_type src_type;
uint32_t nr_memslots = 1;
uint32_t nr_vcpus = 1;
int opt;
TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM));
+ src_type = kvm_has_gmem_attributes ? VM_MEM_SRC_SHMEM :
+ DEFAULT_VM_MEM_SRC;
+
while ((opt = getopt(argc, argv, "hm:s:n:")) != -1) {
switch (opt) {
case 's':
src_type = parse_backing_src_type(optarg);
+ if (kvm_has_gmem_attributes && src_type != VM_MEM_SRC_SHMEM) {
+ printf("Overriding mem_type to %s to test in-place conversions\n",
+ vm_mem_backing_src_alias(VM_MEM_SRC_SHMEM)->name);
+ /*
+ * Use VM_MEM_SRC_SHMEM, whose size and mmap flags
+ * align with those of guest_memfd.
+ */
+ src_type = VM_MEM_SRC_SHMEM;
+ }
break;
case 'n':
nr_vcpus = atoi_positive("nr_vcpus", optarg);
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread* [RFC PATCH v1 35/37] KVM: selftests: Add script to exercise private_mem_conversions_test
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (33 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 34/37] KVM: selftests: Update private_mem_conversions_test to mmap() guest_memfd Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 36/37] KVM: selftests: Update pre-fault test to work with per-guest_memfd attributes Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 37/37] KVM: selftests: Update private memory exits test work with per-gmem attributes Ackerley Tng
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
Add a wrapper script to simplify running the private_mem_conversions_test
with a variety of configurations. Manually invoking the test for all
supported memory backing source types is tedious.
The script automatically detects the availability of 2MB and 1GB hugepages
and builds a list of source types to test. It then iterates through the
list, running the test for each type with both a single memslot and
multiple memslots.
This makes it easier to get comprehensive test coverage across different
memory configurations.
Use python to be able to issue an ioctl to /dev/kvm.
Update .gitignore to allowlist python scripts.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
tools/testing/selftests/kvm/.gitignore | 1 +
.../kvm/x86/private_mem_conversions_test.py | 159 ++++++++++++++++++
2 files changed, 160 insertions(+)
create mode 100755 tools/testing/selftests/kvm/x86/private_mem_conversions_test.py
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 1d41a046a7bfd..d7e9c1d97e376 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -4,6 +4,7 @@
!*.c
!*.h
!*.S
+!*.py
!*.sh
!.gitignore
!config
diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.py b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.py
new file mode 100755
index 0000000000000..32421ae824d64
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Wrapper script which runs different test setups of
+# private_mem_conversions_test.
+#
+# Copyright (C) 2025, Google LLC.
+
+import os
+import fcntl
+import sys
+import subprocess
+
+
+NUM_VCPUS_TO_TEST = 4
+NUM_MEMSLOTS_TO_TEST = NUM_VCPUS_TO_TEST
+
+# Required pages are based on the test setup in the C code.
+# These static requirements are set to the maximum required for
+# NUM_VCPUS_TO_TEST, over all the hugetlb-related tests
+REQUIRED_NUM_2M_HUGEPAGES = 1024 * NUM_VCPUS_TO_TEST
+REQUIRED_NUM_1G_HUGEPAGES = 2 * NUM_VCPUS_TO_TEST
+
+
+def get_hugepage_count(page_size_kb: int) -> int:
+ """Reads the current number of hugepages available for a given size."""
+ try:
+ path = f"/sys/kernel/mm/hugepages/hugepages-{page_size_kb}kB/nr_hugepages"
+ with open(path, 'r') as f:
+ return int(f.read().strip())
+ except (FileNotFoundError, ValueError):
+ return 0
+
+
+def get_default_hugepage_size_in_kb():
+ """Reads the default hugepage size from /proc/meminfo."""
+ try:
+ with open("/proc/meminfo", 'r') as f:
+ for line in f:
+ if line.startswith("Hugepagesize:"):
+ parts = line.split()
+ if len(parts) >= 2 and parts[1].isdigit():
+ return int(parts[1])
+ except FileNotFoundError:
+ return None
+
+
+def run_tests(executable_path: str, src_type: str, num_memslots: int, num_vcpus: int) -> None:
+ """Runs the test executable with different arguments."""
+ print(f"Running tests for backing source type: {src_type}")
+
+ command1 = [executable_path, "-s", src_type, "-m", str(num_memslots)]
+ print(" ".join(command1))
+ _ = subprocess.run(command1, check=True)
+
+ command2 = [executable_path, "-s", src_type, "-m", str(num_memslots), "-n", str(num_vcpus)]
+ print(" ".join(command2))
+ _ = subprocess.run(command2, check=True)
+
+
+def kvm_check_cap(capability: int) -> int:
+ KVM_CHECK_EXTENSION = 0xAE03
+ KVM_DEVICE = '/dev/kvm'
+
+ if not os.path.exists(KVM_DEVICE):
+ print(f"Error: KVM device not found at {KVM_DEVICE}. Is the 'kvm' module loaded?")
+ return -1
+
+ try:
+ fd = os.open(KVM_DEVICE, os.O_RDWR)
+
+ # Issue the ioctl: fcntl.ioctl(fd, request, arg)
+ # request is KVM_CHECK_EXTENSION (0xAE03)
+ # arg is the capability constant (e.g., KVM_CAP_COALESCED_MMIO)
+ result = fcntl.ioctl(fd, KVM_CHECK_EXTENSION, capability)
+
+ os.close(fd)
+ return result
+ except OSError as e:
+ print(f"Error issuing KVM ioctl on {KVM_DEVICE}: {e}", file=sys.stderr)
+ if fd > 0:
+ os.close(fd)
+ return -1
+
+
+def kvm_has_gmem_attributes() -> bool:
+ KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES = 245
+
+ return kvm_check_cap(KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES) > 0
+
+
+def get_backing_source_types() -> list[str]:
+ hugepage_2mb_count = get_hugepage_count(2048)
+ hugepage_2mb_enabled = hugepage_2mb_count >= REQUIRED_NUM_2M_HUGEPAGES
+ hugepage_1gb_count = get_hugepage_count(1048576)
+ hugepage_1gb_enabled = hugepage_1gb_count >= REQUIRED_NUM_1G_HUGEPAGES
+
+ default_hugepage_size_kb = get_default_hugepage_size_in_kb()
+ hugepage_default_enabled = False
+ if default_hugepage_size_kb == 2048:
+ hugepage_default_enabled = hugepage_2mb_enabled
+ elif default_hugepage_size_kb == 1048576:
+ hugepage_default_enabled = hugepage_1gb_enabled
+
+ backing_src_types: list[str] = ["anonymous", "anonymous_thp"]
+
+ if hugepage_default_enabled:
+ backing_src_types.append("anonymous_hugetlb")
+ else:
+ print("skipping anonymous_hugetlb backing source type")
+
+ if hugepage_2mb_enabled:
+ backing_src_types.append("anonymous_hugetlb_2mb")
+ else:
+ print("skipping anonymous_hugetlb_2mb backing source type")
+
+ if hugepage_1gb_enabled:
+ backing_src_types.append("anonymous_hugetlb_1gb")
+ else:
+ print("skipping anonymous_hugetlb_1gb backing source type")
+
+ backing_src_types.append("shmem")
+
+ if hugepage_default_enabled:
+ backing_src_types.append("shared_hugetlb")
+ else:
+ print("skipping shared_hugetlb backing source type")
+
+ return backing_src_types
+
+
+def main():
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ test_executable = os.path.join(script_dir, "private_mem_conversions_test")
+
+ if not os.path.exists(test_executable):
+ print(f"Error: Test executable not found at '{test_executable}'", file=sys.stderr)
+ sys.exit(1)
+
+ return_code = 0
+
+ backing_src_types = ["shmem"] if kvm_has_gmem_attributes() else get_backing_source_types()
+ try:
+ for i, src_type in enumerate(backing_src_types):
+ if i > 0:
+ print()
+ run_tests(test_executable, src_type, NUM_MEMSLOTS_TO_TEST, NUM_VCPUS_TO_TEST)
+ except subprocess.CalledProcessError as e:
+ print(f"Test failed for source type '{src_type}'. Command: {' '.join(e.cmd)}", file=sys.stderr)
+ return_code = e.returncode
+ except Exception as e:
+ print(f"An unexpected error occurred: {e}", file=sys.stderr)
+ return_code = 1
+
+ sys.exit(return_code)
+
+
+if __name__ == "__main__":
+ main()
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread* [RFC PATCH v1 36/37] KVM: selftests: Update pre-fault test to work with per-guest_memfd attributes
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (34 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 35/37] KVM: selftests: Add script to exercise private_mem_conversions_test Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
2025-10-17 20:12 ` [RFC PATCH v1 37/37] KVM: selftests: Update private memory exits test work with per-gmem attributes Ackerley Tng
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
Skip setting memory to private in the pre-fault memory test when using
per-gmem memory attributes, as memory is initialized to private by default
for guest_memfd, and using vm_mem_set_private() on a guest_memfd instance
requires creating guest_memfd with GUEST_MEMFD_FLAG_MMAP (which is totally
doable, but would need to be conditional and is ultimately unnecessary).
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
tools/testing/selftests/kvm/pre_fault_memory_test.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/kvm/pre_fault_memory_test.c b/tools/testing/selftests/kvm/pre_fault_memory_test.c
index 6db75946a4f89..6bb5e52f6d948 100644
--- a/tools/testing/selftests/kvm/pre_fault_memory_test.c
+++ b/tools/testing/selftests/kvm/pre_fault_memory_test.c
@@ -188,7 +188,7 @@ static void __test_pre_fault_memory(unsigned long vm_type, bool private)
private ? KVM_MEM_GUEST_MEMFD : 0);
virt_map(vm, gva, gpa, TEST_NPAGES);
- if (private)
+ if (!kvm_has_gmem_attributes && private)
vm_mem_set_private(vm, gpa, TEST_SIZE);
pre_fault_memory(vcpu, gpa, 0, SZ_2M, 0, private);
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread
* [RFC PATCH v1 37/37] KVM: selftests: Update private memory exits test work with per-gmem attributes
2025-10-17 20:11 [RFC PATCH v1 00/37] guest_memfd: In-place conversion support Ackerley Tng
` (35 preceding siblings ...)
2025-10-17 20:12 ` [RFC PATCH v1 36/37] KVM: selftests: Update pre-fault test to work with per-guest_memfd attributes Ackerley Tng
@ 2025-10-17 20:12 ` Ackerley Tng
36 siblings, 0 replies; 56+ messages in thread
From: Ackerley Tng @ 2025-10-17 20:12 UTC (permalink / raw)
To: cgroups, kvm, linux-doc, linux-fsdevel, linux-kernel,
linux-kselftest, linux-mm, linux-trace-kernel, x86
Cc: ackerleytng, akpm, binbin.wu, bp, brauner, chao.p.peng,
chenhuacai, corbet, dave.hansen, dave.hansen, david, dmatlack,
erdemaktas, fan.du, fvdl, haibo1.xu, hannes, hch, hpa, hughd,
ira.weiny, isaku.yamahata, jack, james.morse, jarkko, jgg,
jgowans, jhubbard, jroedel, jthoughton, jun.miao, kai.huang,
keirf, kent.overstreet, liam.merwick, maciej.wieczor-retman, mail,
maobibo, mathieu.desnoyers, maz, mhiramat, mhocko, mic,
michael.roth, mingo, mlevitsk, mpe, muchun.song, nikunj, nsaenz,
oliver.upton, palmer, pankaj.gupta, paul.walmsley, pbonzini,
peterx, pgonda, prsampat, pvorel, qperret, richard.weiyang,
rick.p.edgecombe, rientjes, rostedt, roypat, rppt, seanjc,
shakeel.butt, shuah, steven.price, steven.sistare, suzuki.poulose,
tabba, tglx, thomas.lendacky, vannapurve, vbabka, viro, vkuznets,
wei.w.wang, will, willy, wyihan, xiaoyao.li, yan.y.zhao, yilun.xu,
yuzenghui, zhiquan1.li
From: Sean Christopherson <seanjc@google.com>
Skip setting memory to private in the private memory exits test when using
per-gmem memory attributes, as memory is initialized to private by default
for guest_memfd, and using vm_mem_set_private() on a guest_memfd instance
requires creating guest_memfd with GUEST_MEMFD_FLAG_MMAP (which is totally
doable, but would need to be conditional and is ultimately unnecessary).
Expect an emulated MMIO instead of a memory fault exit when attributes are
per-gmem, as deleting the memslot effectively drops the private status,
i.e. the GPA becomes shared and thus supports emulated MMIO.
Skip the "memslot not private" test entirely, as private vs. shared state
for x86 software-protected VMs comes from the memory attributes themselves,
and so when doing in-place conversions there can never be a disconnect
between the expected and actual states.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
.../kvm/x86/private_mem_kvm_exits_test.c | 36 +++++++++++++++----
1 file changed, 30 insertions(+), 6 deletions(-)
diff --git a/tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c b/tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c
index 13e72fcec8dd2..10be67441d457 100644
--- a/tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c
+++ b/tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c
@@ -62,8 +62,9 @@ static void test_private_access_memslot_deleted(void)
virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES);
- /* Request to access page privately */
- vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE);
+ /* Request to access page privately. */
+ if (!kvm_has_gmem_attributes)
+ vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE);
pthread_create(&vm_thread, NULL,
(void *(*)(void *))run_vcpu_get_exit_reason,
@@ -74,10 +75,26 @@ static void test_private_access_memslot_deleted(void)
pthread_join(vm_thread, &thread_return);
exit_reason = (uint32_t)(uint64_t)thread_return;
- TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT);
- TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE);
- TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA);
- TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE);
+ /*
+ * If attributes are tracked per-gmem, deleting the memslot that points
+ * at the gmem instance effectively makes the memory shared, and so the
+ * read should trigger emulated MMIO.
+ *
+ * If attributes are tracked per-VM, deleting the memslot shouldn't
+ * affect the private attribute, and so KVM should generate a memory
+ * fault exit (emulated MMIO on private GPAs is disallowed).
+ */
+ if (kvm_has_gmem_attributes) {
+ TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MMIO);
+ TEST_ASSERT_EQ(vcpu->run->mmio.phys_addr, EXITS_TEST_GPA);
+ TEST_ASSERT_EQ(vcpu->run->mmio.len, sizeof(uint64_t));
+ TEST_ASSERT_EQ(vcpu->run->mmio.is_write, false);
+ } else {
+ TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT);
+ TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE);
+ TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA);
+ TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE);
+ }
kvm_vm_free(vm);
}
@@ -88,6 +105,13 @@ static void test_private_access_memslot_not_private(void)
struct kvm_vcpu *vcpu;
uint32_t exit_reason;
+ /*
+ * Accessing non-private memory as private with a software-protected VM
+ * isn't possible when doing in-place conversions.
+ */
+ if (kvm_has_gmem_attributes)
+ return;
+
vm = vm_create_shape_with_one_vcpu(protected_vm_shape, &vcpu,
guest_repeatedly_read);
--
2.51.0.858.gf9c4a03a3a-goog
^ permalink raw reply related [flat|nested] 56+ messages in thread