From: Ackerley Tng <ackerleytng@google.com>
To: aik@amd.com, andrew.jones@linux.dev, binbin.wu@linux.intel.com,
brauner@kernel.org, chao.p.peng@linux.intel.com,
david@kernel.org, ira.weiny@intel.com, jmattson@google.com,
jroedel@suse.de, jthoughton@google.com, michael.roth@amd.com,
oupton@kernel.org, pankaj.gupta@amd.com, qperret@google.com,
rick.p.edgecombe@intel.com, rientjes@google.com,
shivankg@amd.com, steven.price@arm.com, tabba@google.com,
willy@infradead.org, wyihan@google.com, yan.y.zhao@intel.com,
forkloop@google.com, pratyush@kernel.org,
suzuki.poulose@arm.com, aneesh.kumar@kernel.org,
Paolo Bonzini <pbonzini@redhat.com>,
Sean Christopherson <seanjc@google.com>,
Thomas Gleixner <tglx@kernel.org>, Ingo Molnar <mingo@redhat.com>,
Borislav Petkov <bp@alien8.de>,
Dave Hansen <dave.hansen@linux.intel.com>,
x86@kernel.org, "H. Peter Anvin" <hpa@zytor.com>,
Steven Rostedt <rostedt@goodmis.org>,
Masami Hiramatsu <mhiramat@kernel.org>,
Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
Jonathan Corbet <corbet@lwn.net>,
Shuah Khan <skhan@linuxfoundation.org>,
Shuah Khan <shuah@kernel.org>,
Vishal Annapurve <vannapurve@google.com>,
Jason Gunthorpe <jgg@ziepe.ca>,
Vlastimil Babka <vbabka@kernel.org>
Cc: kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
linux-trace-kernel@vger.kernel.org, linux-doc@vger.kernel.org,
linux-kselftest@vger.kernel.org,
Ackerley Tng <ackerleytng@google.com>
Subject: [PATCH RFC v3 01/43] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
Date: Fri, 13 Mar 2026 06:12:40 +0000 [thread overview]
Message-ID: <20260313-gmem-inplace-conversion-v3-1-5fc12a70ec89@google.com> (raw)
In-Reply-To: <20260313-gmem-inplace-conversion-v3-0-5fc12a70ec89@google.com>
From: Sean Christopherson <seanjc@google.com>
Start plumbing in guest_memfd support for in-place private<=>shared
conversions by tracking attributes via a maple tree. KVM currently tracks
private vs. shared attributes on a per-VM basis, which made sense when a
guest_memfd _only_ supported private memory, but tracking per-VM simply
can't work for in-place conversions as the shareability of a given page
needs to be per-gmem_inode, not per-VM.
Use the filemap invalidation lock to protect the maple tree, as taking the
lock for read when faulting in memory (for userspace or the guest) isn't
expected to result in meaningful contention, and using a separate lock
would add significant complexity (avoid deadlock is quite difficult).
Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Vishal Annapurve <vannapurve@google.com>
Signed-off-by: Vishal Annapurve <vannapurve@google.com>
Co-developed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Fuad Tabba <tabba@google.com>
---
virt/kvm/guest_memfd.c | 134 +++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 118 insertions(+), 16 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 017d84a7adf37..67709af03f39d 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
+#include <linux/maple_tree.h>
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>
@@ -32,6 +33,7 @@ struct gmem_inode {
struct inode vfs_inode;
u64 flags;
+ struct maple_tree attributes;
};
static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
@@ -59,6 +61,31 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
return gfn - slot->base_gfn + slot->gmem.pgoff;
}
+static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
+{
+ struct maple_tree *mt = &GMEM_I(inode)->attributes;
+ void *entry = mtree_load(mt, index);
+
+ /*
+ * The lock _must_ be held for lookups, as some maple tree operations,
+ * e.g. append, are unsafe (return inaccurate information) with respect
+ * to concurrent RCU-protected lookups.
+ */
+ lockdep_assert(mt_lock_is_held(mt));
+
+ return WARN_ON_ONCE(!entry) ? 0 : xa_to_value(entry);
+}
+
+static bool kvm_gmem_is_private_mem(struct inode *inode, pgoff_t index)
+{
+ return kvm_gmem_get_attributes(inode, index) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+}
+
+static bool kvm_gmem_is_shared_mem(struct inode *inode, pgoff_t index)
+{
+ return !kvm_gmem_is_private_mem(inode, index);
+}
+
static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
pgoff_t index, struct folio *folio)
{
@@ -397,10 +424,13 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
return VM_FAULT_SIGBUS;
- if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
- return VM_FAULT_SIGBUS;
+ filemap_invalidate_lock_shared(inode->i_mapping);
+ if (kvm_gmem_is_shared_mem(inode, vmf->pgoff))
+ folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+ else
+ folio = ERR_PTR(-EACCES);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
- folio = kvm_gmem_get_folio(inode, vmf->pgoff);
if (IS_ERR(folio)) {
if (PTR_ERR(folio) == -EAGAIN)
return VM_FAULT_RETRY;
@@ -556,6 +586,51 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
return true;
}
+static int kvm_gmem_init_inode(struct inode *inode, loff_t size, u64 flags)
+{
+ struct gmem_inode *gi = GMEM_I(inode);
+ MA_STATE(mas, &gi->attributes, 0, (size >> PAGE_SHIFT) - 1);
+ u64 attrs;
+ int r;
+
+ inode->i_op = &kvm_gmem_iops;
+ inode->i_mapping->a_ops = &kvm_gmem_aops;
+ inode->i_mode |= S_IFREG;
+ inode->i_size = size;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+
+ /*
+ * guest_memfd memory is neither migratable nor swappable: set
+ * inaccessible to gate off both.
+ */
+ mapping_set_inaccessible(inode->i_mapping);
+ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+
+ gi->flags = flags;
+
+ mt_set_external_lock(&gi->attributes,
+ &inode->i_mapping->invalidate_lock);
+
+ /*
+ * Store default attributes for the entire gmem instance. Ensuring every
+ * index is represented in the maple tree at all times simplifies the
+ * conversion and merging logic.
+ */
+ attrs = gi->flags & GUEST_MEMFD_FLAG_INIT_SHARED ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+ /*
+ * Acquire the invalidation lock purely to make lockdep happy. The
+ * maple tree library expects all stores to be protected via the lock,
+ * and the library can't know when the tree is reachable only by the
+ * caller, as is the case here.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+ r = mas_store_gfp(&mas, xa_mk_value(attrs), GFP_KERNEL);
+ filemap_invalidate_unlock(inode->i_mapping);
+
+ return r;
+}
+
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
static const char *name = "[kvm-gmem]";
@@ -586,16 +661,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
goto err_fops;
}
- inode->i_op = &kvm_gmem_iops;
- inode->i_mapping->a_ops = &kvm_gmem_aops;
- inode->i_mode |= S_IFREG;
- inode->i_size = size;
- mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
- mapping_set_inaccessible(inode->i_mapping);
- /* Unmovable mappings are supposed to be marked unevictable as well. */
- WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
-
- GMEM_I(inode)->flags = flags;
+ err = kvm_gmem_init_inode(inode, size, flags);
+ if (err)
+ goto err_inode;
file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
if (IS_ERR(file)) {
@@ -797,9 +865,13 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
if (!file)
return -EFAULT;
+ filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
+
folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
- if (IS_ERR(folio))
- return PTR_ERR(folio);
+ if (IS_ERR(folio)) {
+ r = PTR_ERR(folio);
+ goto out;
+ }
if (!folio_test_uptodate(folio)) {
clear_highpage(folio_page(folio, 0));
@@ -815,6 +887,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
else
folio_put(folio);
+out:
+ filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
@@ -944,13 +1018,41 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
mpol_shared_policy_init(&gi->policy, NULL);
+ /*
+ * Memory attributes are protected by the filemap invalidation lock, but
+ * the lock structure isn't available at this time. Immediately mark
+ * maple tree as using external locking so that accessing the tree
+ * before it's fully initialized results in NULL pointer dereferences
+ * and not more subtle bugs.
+ */
+ mt_init_flags(&gi->attributes, MT_FLAGS_LOCK_EXTERN);
+
gi->flags = 0;
return &gi->vfs_inode;
}
static void kvm_gmem_destroy_inode(struct inode *inode)
{
- mpol_free_shared_policy(&GMEM_I(inode)->policy);
+ struct gmem_inode *gi = GMEM_I(inode);
+
+ mpol_free_shared_policy(&gi->policy);
+
+ /*
+ * Note! Checking for an empty tree is functionally necessary
+ * to avoid explosions if the tree hasn't been fully
+ * initialized, i.e. if the inode is being destroyed before
+ * guest_memfd can set the external lock, lockdep would find
+ * that the tree's internal ma_lock was not held.
+ */
+ if (!mtree_empty(&gi->attributes)) {
+ /*
+ * Acquire the invalidation lock purely to make lockdep happy,
+ * the inode is unreachable at this point.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+ __mt_destroy(&gi->attributes);
+ filemap_invalidate_unlock(inode->i_mapping);
+ }
}
static void kvm_gmem_free_inode(struct inode *inode)
--
2.53.0.851.ga537e3e6e9-goog
next prev parent reply other threads:[~2026-03-13 6:12 UTC|newest]
Thread overview: 45+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-13 6:12 [PATCH RFC v3 00/43] guest_memfd: In-place conversion support Ackerley Tng
2026-03-13 6:12 ` Ackerley Tng [this message]
2026-03-13 6:12 ` [PATCH RFC v3 02/43] KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 03/43] KVM: Enumerate support for PRIVATE memory iff kvm_arch_has_private_mem is defined Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 04/43] KVM: Stub in ability to disable per-VM memory attribute tracking Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 05/43] KVM: guest_memfd: Wire up kvm_get_memory_attributes() to per-gmem attributes Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 06/43] KVM: guest_memfd: Update kvm_gmem_populate() to use gmem attributes Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 07/43] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2 Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 08/43] KVM: guest_memfd: Enable INIT_SHARED on guest_memfd for x86 Coco VMs Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 09/43] KVM: guest_memfd: Add support for KVM_SET_MEMORY_ATTRIBUTES2 Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 10/43] KVM: guest_memfd: Handle lru_add fbatch refcounts during conversion safety check Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 11/43] KVM: Move KVM_VM_MEMORY_ATTRIBUTES config definition to x86 Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 12/43] KVM: Let userspace disable per-VM mem attributes, enable per-gmem attributes Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 13/43] KVM: selftests: Create gmem fd before "regular" fd when adding memslot Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 14/43] KVM: selftests: Rename guest_memfd{,_offset} to gmem_{fd,offset} Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 15/43] KVM: selftests: Add support for mmap() on guest_memfd in core library Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 16/43] KVM: selftests: Add selftests global for guest memory attributes capability Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 17/43] KVM: selftests: Update framework to use KVM_SET_MEMORY_ATTRIBUTES2 Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 18/43] KVM: selftests: Add helpers for calling ioctls on guest_memfd Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 19/43] KVM: selftests: Test using guest_memfd for guest private memory Ackerley Tng
2026-03-13 6:12 ` [PATCH RFC v3 20/43] KVM: selftests: Test basic single-page conversion flow Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 21/43] KVM: selftests: Test conversion flow when INIT_SHARED Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 22/43] KVM: selftests: Test indexing in guest_memfd Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 23/43] KVM: selftests: Test conversion before allocation Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 24/43] KVM: selftests: Convert with allocated folios in different layouts Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 25/43] KVM: selftests: Test precision of conversion Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 26/43] KVM: selftests: Test that truncation does not change shared/private status Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 27/43] KVM: selftests: Test that shared/private status is consistent across processes Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 28/43] KVM: selftests: Test conversion with elevated page refcount Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 29/43] KVM: selftests: Reset shared memory after hole-punching Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 30/43] KVM: selftests: Provide function to look up guest_memfd details from gpa Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 31/43] KVM: selftests: Provide common function to set memory attributes Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 32/43] KVM: selftests: Check fd/flags provided to mmap() when setting up memslot Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 33/43] KVM: selftests: Make TEST_EXPECT_SIGBUS thread-safe Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 34/43] KVM: selftests: Update private_mem_conversions_test to mmap() guest_memfd Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 35/43] KVM: selftests: Add script to exercise private_mem_conversions_test Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 36/43] KVM: selftests: Update pre-fault test to work with per-guest_memfd attributes Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 37/43] KVM: selftests: Update private memory exits test work with per-gmem attributes Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 38/43] KVM: guest_memfd: Introduce default handlers for content modes Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 39/43] KVM: guest_memfd: Apply content modes while setting memory attributes Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 40/43] KVM: x86: Add support for applying content modes Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 41/43] KVM: x86: Support content mode ZERO for TDX Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 42/43] KVM: selftests: Allow flags to be specified in set_memory_attributes functions Ackerley Tng
2026-03-13 6:13 ` [PATCH RFC v3 43/43] KVM: selftests: Update tests to use flag-enabled library functions Ackerley Tng
2026-03-13 15:45 ` [PATCH RFC v3 00/43] guest_memfd: In-place conversion support Sean Christopherson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260313-gmem-inplace-conversion-v3-1-5fc12a70ec89@google.com \
--to=ackerleytng@google.com \
--cc=aik@amd.com \
--cc=andrew.jones@linux.dev \
--cc=aneesh.kumar@kernel.org \
--cc=binbin.wu@linux.intel.com \
--cc=bp@alien8.de \
--cc=brauner@kernel.org \
--cc=chao.p.peng@linux.intel.com \
--cc=corbet@lwn.net \
--cc=dave.hansen@linux.intel.com \
--cc=david@kernel.org \
--cc=forkloop@google.com \
--cc=hpa@zytor.com \
--cc=ira.weiny@intel.com \
--cc=jgg@ziepe.ca \
--cc=jmattson@google.com \
--cc=jroedel@suse.de \
--cc=jthoughton@google.com \
--cc=kvm@vger.kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-trace-kernel@vger.kernel.org \
--cc=mathieu.desnoyers@efficios.com \
--cc=mhiramat@kernel.org \
--cc=michael.roth@amd.com \
--cc=mingo@redhat.com \
--cc=oupton@kernel.org \
--cc=pankaj.gupta@amd.com \
--cc=pbonzini@redhat.com \
--cc=pratyush@kernel.org \
--cc=qperret@google.com \
--cc=rick.p.edgecombe@intel.com \
--cc=rientjes@google.com \
--cc=rostedt@goodmis.org \
--cc=seanjc@google.com \
--cc=shivankg@amd.com \
--cc=shuah@kernel.org \
--cc=skhan@linuxfoundation.org \
--cc=steven.price@arm.com \
--cc=suzuki.poulose@arm.com \
--cc=tabba@google.com \
--cc=tglx@kernel.org \
--cc=vannapurve@google.com \
--cc=vbabka@kernel.org \
--cc=willy@infradead.org \
--cc=wyihan@google.com \
--cc=x86@kernel.org \
--cc=yan.y.zhao@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox