From: Sean Christopherson <seanjc@google.com>
To: Marc Zyngier <maz@kernel.org>,
Oliver Upton <oliver.upton@linux.dev>,
Paolo Bonzini <pbonzini@redhat.com>,
Sean Christopherson <seanjc@google.com>
Cc: linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
David Hildenbrand <david@redhat.com>,
Fuad Tabba <tabba@google.com>,
Ackerley Tng <ackerleytng@google.com>,
Shivank Garg <shivankg@amd.com>,
Ashish Kalra <ashish.kalra@amd.com>,
Vlastimil Babka <vbabka@suse.cz>
Subject: [PATCH v12 05/12] KVM: guest_memfd: Enforce NUMA mempolicy using shared policy
Date: Tue, 7 Oct 2025 15:14:13 -0700 [thread overview]
Message-ID: <20251007221420.344669-6-seanjc@google.com> (raw)
In-Reply-To: <20251007221420.344669-1-seanjc@google.com>
From: Shivank Garg <shivankg@amd.com>
Previously, guest-memfd allocations followed local NUMA node id in absence
of process mempolicy, resulting in arbitrary memory allocation.
Moreover, mbind() couldn't be used by the VMM as guest memory wasn't
mapped into userspace when allocation occurred.
Enable NUMA policy support by implementing vm_ops for guest-memfd mmap
operation. This allows the VMM to map the memory and use mbind() to set the
desired NUMA policy. The policy is stored in the inode structure via
kvm_gmem_inode_info, as memory policy is a property of the memory (struct
inode) itself. The policy is then retrieved via mpol_shared_policy_lookup()
and passed to filemap_grab_folio_mpol() to ensure that allocations follow
the specified memory policy.
This enables the VMM to control guest memory NUMA placement by calling
mbind() on the mapped memory regions, providing fine-grained control over
guest memory allocation across NUMA nodes.
The policy change only affect future allocations and does not migrate
existing memory. This matches mbind(2)'s default behavior which affects
only new allocations unless overridden with MPOL_MF_MOVE/MPOL_MF_MOVE_ALL
flags, which are not supported for guest_memfd as it is unmovable.
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Shivank Garg <shivankg@amd.com>
Tested-by: Ashish Kalra <ashish.kalra@amd.com>
[sean: document the ABI impact of the ->get_policy() hook]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
virt/kvm/guest_memfd.c | 69 ++++++++++++++++++++++++++++++++++++++++--
1 file changed, 67 insertions(+), 2 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index cc3b25155726..95267c92983b 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
+#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>
@@ -27,6 +28,7 @@ struct gmem_file {
};
struct gmem_inode {
+ struct shared_policy policy;
struct inode vfs_inode;
};
@@ -112,6 +114,19 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
return r;
}
+static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
+ pgoff_t index)
+{
+#ifdef CONFIG_NUMA
+ struct mempolicy *mpol;
+
+ mpol = mpol_shared_policy_lookup(&gi->policy, index);
+ return mpol ? mpol : get_task_policy(current);
+#else
+ return NULL;
+#endif
+}
+
/*
* Returns a locked folio on success. The caller is responsible for
* setting the up-to-date flag before the memory is mapped into the guest.
@@ -124,7 +139,25 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
/* TODO: Support huge pages. */
- return filemap_grab_folio(inode->i_mapping, index);
+ struct mempolicy *policy;
+ struct folio *folio;
+
+ /*
+ * Fast-path: See if folio is already present in mapping to avoid
+ * policy_lookup.
+ */
+ folio = __filemap_get_folio(inode->i_mapping, index,
+ FGP_LOCK | FGP_ACCESSED, 0);
+ if (!IS_ERR(folio))
+ return folio;
+
+ policy = kvm_gmem_get_folio_policy(GMEM_I(inode), index);
+ folio = __filemap_get_folio_mpol(inode->i_mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mapping_gfp_mask(inode->i_mapping), policy);
+ mpol_cond_put(policy);
+
+ return folio;
}
static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
@@ -413,8 +446,38 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
return ret;
}
+#ifdef CONFIG_NUMA
+static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+
+ return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
+}
+
+static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
+ unsigned long addr, pgoff_t *pgoff)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+
+ *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
+
+ /*
+ * Note! Directly return whatever the lookup returns, do NOT return
+ * the current task's policy as is done when looking up the policy for
+ * a specific folio. Kernel ABI for get_mempolicy() is to return
+ * MPOL_DEFAULT when there is no defined policy, not whatever the
+ * default policy resolves to.
+ */
+ return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
+}
+#endif /* CONFIG_NUMA */
+
static const struct vm_operations_struct kvm_gmem_vm_ops = {
- .fault = kvm_gmem_fault_user_mapping,
+ .fault = kvm_gmem_fault_user_mapping,
+#ifdef CONFIG_NUMA
+ .get_policy = kvm_gmem_get_policy,
+ .set_policy = kvm_gmem_set_policy,
+#endif
};
static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
@@ -867,11 +930,13 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
if (!gi)
return NULL;
+ mpol_shared_policy_init(&gi->policy, NULL);
return &gi->vfs_inode;
}
static void kvm_gmem_destroy_inode(struct inode *inode)
{
+ mpol_free_shared_policy(&GMEM_I(inode)->policy);
}
static void kvm_gmem_free_inode(struct inode *inode)
--
2.51.0.710.ga91ca5db03-goog
next prev parent reply other threads:[~2025-10-07 22:14 UTC|newest]
Thread overview: 34+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-10-07 22:14 [PATCH v12 00/12] KVM: guest_memfd: Add NUMA mempolicy support Sean Christopherson
2025-10-07 22:14 ` [PATCH v12 01/12] KVM: guest_memfd: Rename "struct kvm_gmem" to "struct gmem_file" Sean Christopherson
2025-10-08 5:25 ` Garg, Shivank
2025-10-09 21:08 ` Ackerley Tng
2025-10-10 15:07 ` David Hildenbrand
2025-10-07 22:14 ` [PATCH v12 02/12] KVM: guest_memfd: Add macro to iterate over gmem_files for a mapping/inode Sean Christopherson
2025-10-08 5:30 ` Garg, Shivank
2025-10-09 21:27 ` Ackerley Tng
2025-10-07 22:14 ` [PATCH v12 03/12] KVM: guest_memfd: Use guest mem inodes instead of anonymous inodes Sean Christopherson
2025-10-07 22:14 ` [PATCH v12 04/12] KVM: guest_memfd: Add slab-allocated inode cache Sean Christopherson
2025-10-09 21:39 ` Ackerley Tng
2025-10-09 22:16 ` Ackerley Tng
2025-10-07 22:14 ` Sean Christopherson [this message]
2025-10-09 22:15 ` [PATCH v12 05/12] KVM: guest_memfd: Enforce NUMA mempolicy using shared policy Ackerley Tng
2025-10-10 7:57 ` Garg, Shivank
2025-10-10 20:33 ` Sean Christopherson
2025-10-10 21:57 ` Ackerley Tng
2025-10-12 20:00 ` Garg, Shivank
2025-10-15 16:56 ` Sean Christopherson
2025-10-07 22:14 ` [PATCH v12 06/12] KVM: selftests: Define wrappers for common syscalls to assert success Sean Christopherson
2025-10-09 21:44 ` Ackerley Tng
2025-10-07 22:14 ` [PATCH v12 07/12] KVM: selftests: Report stacktraces SIGBUS, SIGSEGV, SIGILL, and SIGFPE by default Sean Christopherson
2025-10-09 22:31 ` Ackerley Tng
2025-10-07 22:14 ` [PATCH v12 08/12] KVM: selftests: Add additional equivalents to libnuma APIs in KVM's numaif.h Sean Christopherson
2025-10-09 22:34 ` Ackerley Tng
2025-10-07 22:14 ` [PATCH v12 09/12] KVM: selftests: Use proper uAPI headers to pick up mempolicy.h definitions Sean Christopherson
2025-10-10 17:59 ` Ackerley Tng
2025-10-07 22:14 ` [PATCH v12 10/12] KVM: selftests: Add helpers to probe for NUMA support, and multi-node systems Sean Christopherson
2025-10-07 22:14 ` [PATCH v12 11/12] KVM: selftests: Add guest_memfd tests for mmap and NUMA policy support Sean Christopherson
2025-10-09 23:08 ` Ackerley Tng
2025-10-07 22:14 ` [PATCH v12 12/12] KVM: guest_memfd: Add gmem_inode.flags field instead of using i_private Sean Christopherson
2025-10-09 20:58 ` [PATCH v12 00/12] KVM: guest_memfd: Add NUMA mempolicy support Ackerley Tng
2025-10-10 4:59 ` Garg, Shivank
2025-10-10 17:56 ` Ackerley Tng
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251007221420.344669-6-seanjc@google.com \
--to=seanjc@google.com \
--cc=ackerleytng@google.com \
--cc=ashish.kalra@amd.com \
--cc=david@redhat.com \
--cc=kvm@vger.kernel.org \
--cc=kvmarm@lists.linux.dev \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-kernel@vger.kernel.org \
--cc=maz@kernel.org \
--cc=oliver.upton@linux.dev \
--cc=pbonzini@redhat.com \
--cc=shivankg@amd.com \
--cc=tabba@google.com \
--cc=vbabka@suse.cz \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox