public inbox for linux-kernel@vger.kernel.org
* [PATCH 0/3] KVM: gmem: track preparedness a page at a time
@ 2024-11-08 15:50 Paolo Bonzini
  2024-11-08 15:50 ` [PATCH 1/3] KVM: gmem: allocate private data for the gmem inode Paolo Bonzini
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Paolo Bonzini @ 2024-11-08 15:50 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: michael.roth, seanjc

With support for large pages in gmem, it may happen that part of the gmem
is mapped with large pages and part with 4k pages.  For example, if a
conversion happens on a small region within a large page, the large
page has to be smashed into small pages even if backed by a large folio.
Each of the small pages will have its own state of preparedness,
which makes it harder to use the uptodate flag for preparedness.

Just switch to a bitmap in the inode's i_private data.  This is
a stupidly large amount of code: first, because we need to add back
a gmem filesystem just to plug in custom inode operations; second,
because bitmap operations have to be atomic.  Apart from that, it's
not too hard.
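
For reference, the gist is one bit per 4k page hanging off the inode,
along these lines (simplified from patch 3; the single-page query helper
is made up for illustration and is not part of the series):

	struct kvm_gmem_inode {
		unsigned long flags;
		unsigned long prepared[];	/* one bit per PAGE_SIZE page */
	};

	/* Hypothetical single-page query; reads the word atomically. */
	static bool kvm_gmem_page_is_prepared(struct kvm_gmem_inode *i_gmem,
					      pgoff_t index)
	{
		unsigned long word = READ_ONCE(i_gmem->prepared[BIT_WORD(index)]);

		return word & BIT_MASK(index);
	}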

Please review!

Paolo

Paolo Bonzini (3):
  KVM: gmem: allocate private data for the gmem inode
  KVM: gmem: adjust functions to query preparedness state
  KVM: gmem: track preparedness a page at a time

 include/uapi/linux/magic.h |   1 +
 virt/kvm/guest_memfd.c     | 243 ++++++++++++++++++++++++++++++++++---
 virt/kvm/kvm_main.c        |   7 +-
 virt/kvm/kvm_mm.h          |   8 +-
 4 files changed, 237 insertions(+), 22 deletions(-)

-- 
2.43.5



* [PATCH 1/3] KVM: gmem: allocate private data for the gmem inode
  2024-11-08 15:50 [PATCH 0/3] KVM: gmem: track preparedness a page at a time Paolo Bonzini
@ 2024-11-08 15:50 ` Paolo Bonzini
  2024-11-14  0:14   ` Sean Christopherson
  2024-11-08 15:50 ` [PATCH 2/3] KVM: gmem: add a complete set of functions to query page preparedness Paolo Bonzini
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 9+ messages in thread
From: Paolo Bonzini @ 2024-11-08 15:50 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: michael.roth, seanjc

In preparation for removing the usage of the uptodate flag,
reintroduce the gmem filesystem type.  We need it in order to
free the private inode information.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/uapi/linux/magic.h |   1 +
 virt/kvm/guest_memfd.c     | 117 +++++++++++++++++++++++++++++++++----
 virt/kvm/kvm_main.c        |   7 ++-
 virt/kvm/kvm_mm.h          |   8 ++-
 4 files changed, 119 insertions(+), 14 deletions(-)

diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index bb575f3ab45e..d856dd6a7ed9 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -103,5 +103,6 @@
 #define DEVMEM_MAGIC		0x454d444d	/* "DMEM" */
 #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
 #define PID_FS_MAGIC		0x50494446	/* "PIDF" */
+#define KVM_GUEST_MEM_MAGIC	0x474d454d	/* "GMEM" */
 
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 8f079a61a56d..3ea5a7597fd4 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,9 +4,74 @@
 #include <linux/kvm_host.h>
 #include <linux/pagemap.h>
 #include <linux/anon_inodes.h>
+#include <linux/pseudo_fs.h>
 
 #include "kvm_mm.h"
 
+/* Do all the filesystem crap just for evict_inode... */
+
+static struct vfsmount *kvm_gmem_mnt __read_mostly;
+
+static void gmem_evict_inode(struct inode *inode)
+{
+	kvfree(inode->i_private);
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+}
+
+static const struct super_operations gmem_super_operations = {
+	.drop_inode	= generic_delete_inode,
+	.evict_inode    = gmem_evict_inode,
+	.statfs         = simple_statfs,
+};
+
+static int gmem_init_fs_context(struct fs_context *fc)
+{
+	struct pseudo_fs_context *ctx = init_pseudo(fc, KVM_GUEST_MEM_MAGIC);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->ops = &gmem_super_operations;
+	return 0;
+}
+
+static struct file_system_type kvm_gmem_fs_type = {
+	.name           = "kvm_gmemfs",
+	.init_fs_context = gmem_init_fs_context,
+	.kill_sb        = kill_anon_super,
+};
+
+static struct file *kvm_gmem_create_file(const char *name, const struct file_operations *fops)
+{
+	struct inode *inode;
+	struct file *file;
+
+	if (fops->owner && !try_module_get(fops->owner))
+		return ERR_PTR(-ENOENT);
+
+	inode = alloc_anon_inode(kvm_gmem_mnt->mnt_sb);
+	if (IS_ERR(inode)) {
+		file = ERR_CAST(inode);
+		goto err;
+	}
+	file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, fops);
+	if (IS_ERR(file))
+		goto err_iput;
+
+	return file;
+
+err_iput:
+	iput(inode);
+err:
+	module_put(fops->owner);
+	return file;
+}
+
+
+struct kvm_gmem_inode {
+	unsigned long flags;
+};
+
 struct kvm_gmem {
 	struct kvm *kvm;
 	struct xarray bindings;
@@ -308,9 +373,31 @@ static struct file_operations kvm_gmem_fops = {
 	.fallocate	= kvm_gmem_fallocate,
 };
 
-void kvm_gmem_init(struct module *module)
+int kvm_gmem_init(struct module *module)
 {
+	int ret;
+
+	ret = register_filesystem(&kvm_gmem_fs_type);
+	if (ret) {
+		pr_err("kvm-gmem: cannot register file system (%d)\n", ret);
+		return ret;
+	}
+
+	kvm_gmem_mnt = kern_mount(&kvm_gmem_fs_type);
+	if (IS_ERR(kvm_gmem_mnt)) {
+		pr_err("kvm-gmem: kernel mount failed (%ld)\n", PTR_ERR(kvm_gmem_mnt));
+		return PTR_ERR(kvm_gmem_mnt);
+	}
+
 	kvm_gmem_fops.owner = module;
+
+	return 0;
+}
+
+void kvm_gmem_exit(void)
+{
+	kern_unmount(kvm_gmem_mnt);
+	unregister_filesystem(&kvm_gmem_fs_type);
 }
 
 static int kvm_gmem_migrate_folio(struct address_space *mapping,
@@ -394,15 +481,23 @@ static const struct inode_operations kvm_gmem_iops = {
 
 static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 {
-	const char *anon_name = "[kvm-gmem]";
+	const char *gmem_name = "[kvm-gmem]";
+	struct kvm_gmem_inode *i_gmem;
 	struct kvm_gmem *gmem;
 	struct inode *inode;
 	struct file *file;
 	int fd, err;
 
+	i_gmem = kvzalloc(sizeof(struct kvm_gmem_inode), GFP_KERNEL);
+	if (!i_gmem)
+		return -ENOMEM;
+	i_gmem->flags = flags;
+
 	fd = get_unused_fd_flags(0);
-	if (fd < 0)
-		return fd;
+	if (fd < 0) {
+		err = fd;
+		goto err_i_gmem;
+	}
 
 	gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
 	if (!gmem) {
@@ -410,19 +505,19 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 		goto err_fd;
 	}
 
-	file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
-					 O_RDWR, NULL);
+	file = kvm_gmem_create_file(gmem_name, &kvm_gmem_fops);
 	if (IS_ERR(file)) {
 		err = PTR_ERR(file);
 		goto err_gmem;
 	}
 
+	inode = file->f_inode;
+
+	file->f_mapping = inode->i_mapping;
+	file->private_data = gmem;
 	file->f_flags |= O_LARGEFILE;
 
-	inode = file->f_inode;
-	WARN_ON(file->f_mapping != inode->i_mapping);
-
-	inode->i_private = (void *)(unsigned long)flags;
+	inode->i_private = i_gmem;
 	inode->i_op = &kvm_gmem_iops;
 	inode->i_mapping->a_ops = &kvm_gmem_aops;
 	inode->i_mode |= S_IFREG;
@@ -444,6 +539,8 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 	kfree(gmem);
 err_fd:
 	put_unused_fd(fd);
+err_i_gmem:
+	kvfree(i_gmem);
 	return err;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 279e03029ce1..8b7b4e0eb639 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -6504,7 +6504,9 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 	if (WARN_ON_ONCE(r))
 		goto err_vfio;
 
-	kvm_gmem_init(module);
+	r = kvm_gmem_init(module);
+	if (r)
+		goto err_gmem;
 
 	r = kvm_init_virtualization();
 	if (r)
@@ -6525,6 +6527,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 err_register:
 	kvm_uninit_virtualization();
 err_virt:
+	kvm_gmem_exit();
+err_gmem:
 	kvm_vfio_ops_exit();
 err_vfio:
 	kvm_async_pf_deinit();
@@ -6556,6 +6560,7 @@ void kvm_exit(void)
 	for_each_possible_cpu(cpu)
 		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
 	kmem_cache_destroy(kvm_vcpu_cache);
+	kvm_gmem_exit();
 	kvm_vfio_ops_exit();
 	kvm_async_pf_deinit();
 	kvm_irqfd_exit();
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 715f19669d01..91e4202574a8 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -36,15 +36,17 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
 #endif /* HAVE_KVM_PFNCACHE */
 
 #ifdef CONFIG_KVM_PRIVATE_MEM
-void kvm_gmem_init(struct module *module);
+int kvm_gmem_init(struct module *module);
+void kvm_gmem_exit(void);
 int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
 int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
 		  unsigned int fd, loff_t offset);
 void kvm_gmem_unbind(struct kvm_memory_slot *slot);
 #else
-static inline void kvm_gmem_init(struct module *module)
+static inline void kvm_gmem_exit(void) {}
+static inline int kvm_gmem_init(struct module *module)
 {
-
+	return 0;
 }
 
 static inline int kvm_gmem_bind(struct kvm *kvm,
-- 
2.43.5




* [PATCH 2/3] KVM: gmem: add a complete set of functions to query page preparedness
  2024-11-08 15:50 [PATCH 0/3] KVM: gmem: track preparedness a page at a time Paolo Bonzini
  2024-11-08 15:50 ` [PATCH 1/3] KVM: gmem: allocate private data for the gmem inode Paolo Bonzini
@ 2024-11-08 15:50 ` Paolo Bonzini
  2024-11-14  1:42   ` Sean Christopherson
  2024-11-08 15:50 ` [PATCH 3/3] KVM: gmem: track preparedness a page at a time Paolo Bonzini
  2024-11-08 16:32 ` [PATCH 2.5/3] KVM: gmem: limit hole-punching to ranges within the file Paolo Bonzini
  3 siblings, 1 reply; 9+ messages in thread
From: Paolo Bonzini @ 2024-11-08 15:50 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: michael.roth, seanjc

In preparation for moving preparedness out of the folio flags, pass
the struct file* or struct inode* down to kvm_gmem_mark_prepared,
as well as the offset within the gmem file.  Introduce new functions
to unprepare pages on punch-hole, and to query the state.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/guest_memfd.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 3ea5a7597fd4..416e02a00cae 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -107,18 +107,28 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
 	return 0;
 }
 
-static inline void kvm_gmem_mark_prepared(struct folio *folio)
+static void kvm_gmem_mark_prepared(struct file *file, pgoff_t index, struct folio *folio)
 {
 	folio_mark_uptodate(folio);
 }
 
+static void kvm_gmem_mark_range_unprepared(struct inode *inode, pgoff_t index, pgoff_t npages)
+{
+}
+
+static bool kvm_gmem_is_prepared(struct file *file, pgoff_t index, struct folio *folio)
+{
+	return folio_test_uptodate(folio);
+}
+
 /*
  * Process @folio, which contains @gfn, so that the guest can use it.
  * The folio must be locked and the gfn must be contained in @slot.
  * On successful return the guest sees a zero page so as to avoid
  * leaking host data and the up-to-date flag is set.
  */
-static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
+static int kvm_gmem_prepare_folio(struct kvm *kvm, struct file *file,
+				  struct kvm_memory_slot *slot,
 				  gfn_t gfn, struct folio *folio)
 {
 	unsigned long nr_pages, i;
@@ -147,7 +157,7 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
 	index = ALIGN_DOWN(index, 1 << folio_order(folio));
 	r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
 	if (!r)
-		kvm_gmem_mark_prepared(folio);
+		kvm_gmem_mark_prepared(file, index, folio);
 
 	return r;
 }
@@ -231,6 +241,7 @@ static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		kvm_gmem_invalidate_begin(gmem, start, end);
 
 	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
+	kvm_gmem_mark_range_unprepared(inode, start, end - start);
 
 	list_for_each_entry(gmem, gmem_list, entry)
 		kvm_gmem_invalidate_end(gmem, start, end);
@@ -682,7 +693,7 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
 	if (max_order)
 		*max_order = 0;
 
-	*is_prepared = folio_test_uptodate(folio);
+	*is_prepared = kvm_gmem_is_prepared(file, index, folio);
 	return folio;
 }
 
@@ -704,7 +715,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 	}
 
 	if (!is_prepared)
-		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
+		r = kvm_gmem_prepare_folio(kvm, file, slot, gfn, folio);
 
 	folio_unlock(folio);
 	if (r < 0)
@@ -781,8 +792,10 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
 
 		p = src ? src + i * PAGE_SIZE : NULL;
 		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
-		if (!ret)
-			kvm_gmem_mark_prepared(folio);
+		if (!ret) {
+			pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
+			kvm_gmem_mark_prepared(file, index, folio);
+		}
 
 put_folio_and_exit:
 		folio_put(folio);
-- 
2.43.5




* [PATCH 3/3] KVM: gmem: track preparedness a page at a time
  2024-11-08 15:50 [PATCH 0/3] KVM: gmem: track preparedness a page at a time Paolo Bonzini
  2024-11-08 15:50 ` [PATCH 1/3] KVM: gmem: allocate private data for the gmem inode Paolo Bonzini
  2024-11-08 15:50 ` [PATCH 2/3] KVM: gmem: add a complete set of functions to query page preparedness Paolo Bonzini
@ 2024-11-08 15:50 ` Paolo Bonzini
  2024-11-14  1:31   ` Sean Christopherson
  2024-11-08 16:32 ` [PATCH 2.5/3] KVM: gmem: limit hole-punching to ranges within the file Paolo Bonzini
  3 siblings, 1 reply; 9+ messages in thread
From: Paolo Bonzini @ 2024-11-08 15:50 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: michael.roth, seanjc

With support for large pages in gmem, it may happen that part of the gmem
is mapped with large pages and part with 4k pages.  For example, if a
conversion happens on a small region within a large page, the large
page has to be smashed into small pages even if backed by a large folio.
Each of the small pages will have its own state of preparedness,
which makes it harder to use the uptodate flag for preparedness.

Just switch to a bitmap in the inode's i_private data.  This is
a bit gnarly because ordinary bitmap operations in Linux are not
atomic, but otherwise not too hard.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/guest_memfd.c | 103 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 100 insertions(+), 3 deletions(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 416e02a00cae..e08503dfdd8a 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -68,8 +68,13 @@ static struct file *kvm_gmem_create_file(const char *name, const struct file_ope
 }
 
 
+#define KVM_GMEM_INODE_SIZE(size)			\
+	struct_size_t(struct kvm_gmem_inode, prepared,	\
+		      DIV_ROUND_UP(size, PAGE_SIZE * BITS_PER_LONG))
+
 struct kvm_gmem_inode {
 	unsigned long flags;
+	unsigned long prepared[];
 };
 
 struct kvm_gmem {
@@ -107,18 +112,110 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
 	return 0;
 }
 
+/*
+ * The bitmap of prepared pages has to be accessed atomically, because
+ * preparation is not protected by any guest.  This unfortunately means
+ * that we cannot use regular bitmap operations.
+ *
+ * The logic becomes a bit simpler for set and test, which operate a
+ * folio at a time and therefore can assume that the range is naturally
+ * aligned (meaning that either it is smaller than a word, or it is does
+ * not include fractions of a word).  For punch-hole operations however
+ * there is all the complexity.
+ */
+
+static void bitmap_set_atomic_word(unsigned long *p, unsigned long start, unsigned long len)
+{
+	unsigned long mask_to_set =
+		BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);
+
+	atomic_long_or(mask_to_set, (atomic_long_t *)p);
+}
+
+static void bitmap_clear_atomic_word(unsigned long *p, unsigned long start, unsigned long len)
+{
+	unsigned long mask_to_set =
+		BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);
+
+	atomic_long_andnot(mask_to_set, (atomic_long_t *)p);
+}
+
+static bool bitmap_test_allset_word(unsigned long *p, unsigned long start, unsigned long len)
+{
+	unsigned long mask_to_set =
+		BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);
+
+	return (*p & mask_to_set) == mask_to_set;
+}
+
 static void kvm_gmem_mark_prepared(struct file *file, pgoff_t index, struct folio *folio)
 {
-	folio_mark_uptodate(folio);
+	struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)file->f_inode->i_private;
+	unsigned long *p = i_gmem->prepared + BIT_WORD(index);
+	unsigned long npages = folio_nr_pages(folio);
+
+	/* Folios must be naturally aligned */
+	WARN_ON_ONCE(index & (npages - 1));
+	index &= ~(npages - 1);
+
+	/* Clear page before updating bitmap.  */
+	smp_wmb();
+
+	if (npages < BITS_PER_LONG) {
+		bitmap_set_atomic_word(p, index, npages);
+	} else {
+		BUILD_BUG_ON(BITS_PER_LONG != 64);
+		memset64((u64 *)p, ~0, BITS_TO_LONGS(npages));
+	}
 }
 
 static void kvm_gmem_mark_range_unprepared(struct inode *inode, pgoff_t index, pgoff_t npages)
 {
+	struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)inode->i_private;
+	unsigned long *p = i_gmem->prepared + BIT_WORD(index);
+
+	index &= BITS_PER_LONG - 1;
+	if (index) {
+		int first_word_count = min(npages, BITS_PER_LONG - index);
+		bitmap_clear_atomic_word(p, index, first_word_count);
+		npages -= first_word_count;
+		p++;
+	}
+
+	if (npages > BITS_PER_LONG) {
+		BUILD_BUG_ON(BITS_PER_LONG != 64);
+		memset64((u64 *)p, 0, BITS_TO_LONGS(npages));
+		p += BIT_WORD(npages);
+		npages &= BITS_PER_LONG - 1;
+	}
+
+	if (npages)
+		bitmap_clear_atomic_word(p++, 0, npages);
 }
 
 static bool kvm_gmem_is_prepared(struct file *file, pgoff_t index, struct folio *folio)
 {
-	return folio_test_uptodate(folio);
+	struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)file->f_inode->i_private;
+	unsigned long *p = i_gmem->prepared + BIT_WORD(index);
+	unsigned long npages = folio_nr_pages(folio);
+	bool ret;
+
+	/* Folios must be naturally aligned */
+	WARN_ON_ONCE(index & (npages - 1));
+	index &= ~(npages - 1);
+
+	if (npages < BITS_PER_LONG) {
+		ret = bitmap_test_allset_word(p, index, npages);
+	} else {
+		for (; npages > 0; npages -= BITS_PER_LONG)
+			if (*p++ != ~0)
+				break;
+		ret = (npages == 0);
+	}
+
+	/* Synchronize with kvm_gmem_mark_prepared().  */
+	smp_rmb();
+	return ret;
 }
 
 /*
@@ -499,7 +596,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 	struct file *file;
 	int fd, err;
 
-	i_gmem = kvzalloc(sizeof(struct kvm_gmem_inode), GFP_KERNEL);
+	i_gmem = kvzalloc(KVM_GMEM_INODE_SIZE(size), GFP_KERNEL);
 	if (!i_gmem)
 		return -ENOMEM;
 	i_gmem->flags = flags;
-- 
2.43.5



* [PATCH 2.5/3] KVM: gmem: limit hole-punching to ranges within the file
  2024-11-08 15:50 [PATCH 0/3] KVM: gmem: track preparedness a page at a time Paolo Bonzini
                   ` (2 preceding siblings ...)
  2024-11-08 15:50 ` [PATCH 3/3] KVM: gmem: track preparedness a page at a time Paolo Bonzini
@ 2024-11-08 16:32 ` Paolo Bonzini
  3 siblings, 0 replies; 9+ messages in thread
From: Paolo Bonzini @ 2024-11-08 16:32 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: michael.roth, seanjc

Do not pass out-of-bounds values to kvm_gmem_mark_range_unprepared().

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/guest_memfd.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

	Sent separately because I thought this was a bug in the current code
	as well but, on closer look, the current code is fine because the
	ksys_fallocate() path checks that there is no overflow.
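
	For reference, the overflow check in question is in the
	vfs_fallocate() path and looks roughly like this (quoted from
	memory from fs/open.c):

		/* Check for wrap through zero too */
		if (((offset + len) > inode->i_sb->s_maxbytes) ||
		    ((offset + len) < 0))
			return -EFBIG;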

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 412d49c6d491..7dc89ceef782 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -324,10 +324,17 @@ static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
 static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 {
 	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
-	pgoff_t start = offset >> PAGE_SHIFT;
-	pgoff_t end = (offset + len) >> PAGE_SHIFT;
+	loff_t size = i_size_read(inode);
+	pgoff_t start, end;
 	struct kvm_gmem *gmem;
 
+	if (offset > size)
+		return 0;
+
+	len = min(size - offset, len);
+	start = offset >> PAGE_SHIFT;
+	end = (offset + len) >> PAGE_SHIFT;
+
 	/*
 	 * Bindings must be stable across invalidation to ensure the start+end
 	 * are balanced.
-- 
2.43.5



* Re: [PATCH 1/3] KVM: gmem: allocate private data for the gmem inode
  2024-11-08 15:50 ` [PATCH 1/3] KVM: gmem: allocate private data for the gmem inode Paolo Bonzini
@ 2024-11-14  0:14   ` Sean Christopherson
  0 siblings, 0 replies; 9+ messages in thread
From: Sean Christopherson @ 2024-11-14  0:14 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, michael.roth

+Ackerley, who's also working on resurrecting the file system[*].  At a glance,
there appear to be non-trivial differences, e.g. Ackerley's version has a call
to security_inode_init_security_anon().  I've paged out much of the inode stuff,
so I trust Ackerley's judgment far, far more than my own :-)

[*] https://lore.kernel.org/all/d1940d466fc69472c8b6dda95df2e0522b2d8744.1726009989.git.ackerleytng@google.com
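
For illustration, wiring that into this patch's kvm_gmem_create_file()
might look roughly like the below; the placement is a guess, not
Ackerley's actual code:

	struct qstr qname = QSTR_INIT(name, strlen(name));
	int err;

	inode = alloc_anon_inode(kvm_gmem_mnt->mnt_sb);
	if (IS_ERR(inode)) {
		file = ERR_CAST(inode);
		goto err;
	}

	/* Hypothetical: let LSMs label the pseudo-fs inode. */
	err = security_inode_init_security_anon(inode, &qname, NULL);
	if (err) {
		file = ERR_PTR(err);
		goto err_iput;
	}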

On Fri, Nov 08, 2024, Paolo Bonzini wrote:
> In preparation for removing the usage of the uptodate flag,
> reintroduce the gmem filesystem type.  We need it in order to
> free the private inode information.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  include/uapi/linux/magic.h |   1 +
>  virt/kvm/guest_memfd.c     | 117 +++++++++++++++++++++++++++++++++----
>  virt/kvm/kvm_main.c        |   7 ++-
>  virt/kvm/kvm_mm.h          |   8 ++-
>  4 files changed, 119 insertions(+), 14 deletions(-)
> 
> diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> index bb575f3ab45e..d856dd6a7ed9 100644
> --- a/include/uapi/linux/magic.h
> +++ b/include/uapi/linux/magic.h
> @@ -103,5 +103,6 @@
>  #define DEVMEM_MAGIC		0x454d444d	/* "DMEM" */
>  #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
>  #define PID_FS_MAGIC		0x50494446	/* "PIDF" */
> +#define KVM_GUEST_MEM_MAGIC	0x474d454d	/* "GMEM" */
>  
>  #endif /* __LINUX_MAGIC_H__ */
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 8f079a61a56d..3ea5a7597fd4 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -4,9 +4,74 @@
>  #include <linux/kvm_host.h>
>  #include <linux/pagemap.h>
>  #include <linux/anon_inodes.h>
> +#include <linux/pseudo_fs.h>
>  
>  #include "kvm_mm.h"
>  
> +/* Do all the filesystem crap just for evict_inode... */
> +
> +static struct vfsmount *kvm_gmem_mnt __read_mostly;
> +
> +static void gmem_evict_inode(struct inode *inode)
> +{
> +	kvfree(inode->i_private);
> +	truncate_inode_pages_final(&inode->i_data);
> +	clear_inode(inode);
> +}
> +
> +static const struct super_operations gmem_super_operations = {
> +	.drop_inode	= generic_delete_inode,
> +	.evict_inode    = gmem_evict_inode,
> +	.statfs         = simple_statfs,
> +};
> +
> +static int gmem_init_fs_context(struct fs_context *fc)
> +{
> +	struct pseudo_fs_context *ctx = init_pseudo(fc, KVM_GUEST_MEM_MAGIC);
> +	if (!ctx)
> +		return -ENOMEM;
> +
> +	ctx->ops = &gmem_super_operations;
> +	return 0;
> +}
> +
> +static struct file_system_type kvm_gmem_fs_type = {
> +	.name           = "kvm_gmemfs",
> +	.init_fs_context = gmem_init_fs_context,
> +	.kill_sb        = kill_anon_super,
> +};
> +
> +static struct file *kvm_gmem_create_file(const char *name, const struct file_operations *fops)
> +{
> +	struct inode *inode;
> +	struct file *file;
> +
> +	if (fops->owner && !try_module_get(fops->owner))
> +		return ERR_PTR(-ENOENT);
> +
> +	inode = alloc_anon_inode(kvm_gmem_mnt->mnt_sb);
> +	if (IS_ERR(inode)) {
> +		file = ERR_CAST(inode);
> +		goto err;
> +	}
> +	file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, fops);
> +	if (IS_ERR(file))
> +		goto err_iput;
> +
> +	return file;
> +
> +err_iput:
> +	iput(inode);
> +err:
> +	module_put(fops->owner);
> +	return file;
> +}
> +
> +
> +struct kvm_gmem_inode {
> +	unsigned long flags;
> +};
> +
>  struct kvm_gmem {
>  	struct kvm *kvm;
>  	struct xarray bindings;
> @@ -308,9 +373,31 @@ static struct file_operations kvm_gmem_fops = {
>  	.fallocate	= kvm_gmem_fallocate,
>  };
>  
> -void kvm_gmem_init(struct module *module)
> +int kvm_gmem_init(struct module *module)
>  {
> +	int ret;
> +
> +	ret = register_filesystem(&kvm_gmem_fs_type);
> +	if (ret) {
> +		pr_err("kvm-gmem: cannot register file system (%d)\n", ret);
> +		return ret;
> +	}
> +
> +	kvm_gmem_mnt = kern_mount(&kvm_gmem_fs_type);
> +	if (IS_ERR(kvm_gmem_mnt)) {
> +		pr_err("kvm-gmem: kernel mount failed (%ld)\n", PTR_ERR(kvm_gmem_mnt));
> +		return PTR_ERR(kvm_gmem_mnt);
> +	}
> +
>  	kvm_gmem_fops.owner = module;
> +
> +	return 0;
> +}
> +
> +void kvm_gmem_exit(void)
> +{
> +	kern_unmount(kvm_gmem_mnt);
> +	unregister_filesystem(&kvm_gmem_fs_type);
>  }
>  
>  static int kvm_gmem_migrate_folio(struct address_space *mapping,
> @@ -394,15 +481,23 @@ static const struct inode_operations kvm_gmem_iops = {
>  
>  static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>  {
> -	const char *anon_name = "[kvm-gmem]";
> +	const char *gmem_name = "[kvm-gmem]";
> +	struct kvm_gmem_inode *i_gmem;
>  	struct kvm_gmem *gmem;
>  	struct inode *inode;
>  	struct file *file;
>  	int fd, err;
>  
> +	i_gmem = kvzalloc(sizeof(struct kvm_gmem_inode), GFP_KERNEL);
> +	if (!i_gmem)
> +		return -ENOMEM;
> +	i_gmem->flags = flags;
> +
>  	fd = get_unused_fd_flags(0);
> -	if (fd < 0)
> -		return fd;
> +	if (fd < 0) {
> +		err = fd;
> +		goto err_i_gmem;
> +	}
>  
>  	gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
>  	if (!gmem) {
> @@ -410,19 +505,19 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>  		goto err_fd;
>  	}
>  
> -	file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
> -					 O_RDWR, NULL);
> +	file = kvm_gmem_create_file(gmem_name, &kvm_gmem_fops);
>  	if (IS_ERR(file)) {
>  		err = PTR_ERR(file);
>  		goto err_gmem;
>  	}
>  
> +	inode = file->f_inode;
> +
> +	file->f_mapping = inode->i_mapping;
> +	file->private_data = gmem;
>  	file->f_flags |= O_LARGEFILE;
>  
> -	inode = file->f_inode;
> -	WARN_ON(file->f_mapping != inode->i_mapping);
> -
> -	inode->i_private = (void *)(unsigned long)flags;
> +	inode->i_private = i_gmem;
>  	inode->i_op = &kvm_gmem_iops;
>  	inode->i_mapping->a_ops = &kvm_gmem_aops;
>  	inode->i_mode |= S_IFREG;
> @@ -444,6 +539,8 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>  	kfree(gmem);
>  err_fd:
>  	put_unused_fd(fd);
> +err_i_gmem:
> +	kvfree(i_gmem);
>  	return err;
>  }
>  
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 279e03029ce1..8b7b4e0eb639 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -6504,7 +6504,9 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
>  	if (WARN_ON_ONCE(r))
>  		goto err_vfio;
>  
> -	kvm_gmem_init(module);
> +	r = kvm_gmem_init(module);
> +	if (r)
> +		goto err_gmem;
>  
>  	r = kvm_init_virtualization();
>  	if (r)
> @@ -6525,6 +6527,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
>  err_register:
>  	kvm_uninit_virtualization();
>  err_virt:
> +	kvm_gmem_exit();
> +err_gmem:
>  	kvm_vfio_ops_exit();
>  err_vfio:
>  	kvm_async_pf_deinit();
> @@ -6556,6 +6560,7 @@ void kvm_exit(void)
>  	for_each_possible_cpu(cpu)
>  		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
>  	kmem_cache_destroy(kvm_vcpu_cache);
> +	kvm_gmem_exit();
>  	kvm_vfio_ops_exit();
>  	kvm_async_pf_deinit();
>  	kvm_irqfd_exit();
> diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
> index 715f19669d01..91e4202574a8 100644
> --- a/virt/kvm/kvm_mm.h
> +++ b/virt/kvm/kvm_mm.h
> @@ -36,15 +36,17 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
>  #endif /* HAVE_KVM_PFNCACHE */
>  
>  #ifdef CONFIG_KVM_PRIVATE_MEM
> -void kvm_gmem_init(struct module *module);
> +int kvm_gmem_init(struct module *module);
> +void kvm_gmem_exit(void);
>  int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
>  int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
>  		  unsigned int fd, loff_t offset);
>  void kvm_gmem_unbind(struct kvm_memory_slot *slot);
>  #else
> -static inline void kvm_gmem_init(struct module *module)
> +static inline void kvm_gmem_exit(void) {}
> +static inline int kvm_gmem_init(struct module *module)
>  {
> -
> +	return 0;
>  }
>  
>  static inline int kvm_gmem_bind(struct kvm *kvm,
> -- 
> 2.43.5
> 
> 


* Re: [PATCH 3/3] KVM: gmem: track preparedness a page at a time
  2024-11-08 15:50 ` [PATCH 3/3] KVM: gmem: track preparedness a page at a time Paolo Bonzini
@ 2024-11-14  1:31   ` Sean Christopherson
  2024-11-14  1:41     ` Sean Christopherson
  0 siblings, 1 reply; 9+ messages in thread
From: Sean Christopherson @ 2024-11-14  1:31 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, michael.roth

On Fri, Nov 08, 2024, Paolo Bonzini wrote:
> With support for large pages in gmem, it may happen that part of the gmem
> is mapped with large pages and part with 4k pages.  For example, if a
> conversion happens on a small region within a large page, the large
> page has to be smashed into small pages even if backed by a large folio.
> Each of the small pages will have its own state of preparedness,
> which makes it harder to use the uptodate flag for preparedness.
> 
> Just switch to a bitmap in the inode's i_private data.  This is
> a bit gnarly because ordinary bitmap operations in Linux are not
> atomic, but otherwise not too hard.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  virt/kvm/guest_memfd.c | 103 +++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 100 insertions(+), 3 deletions(-)
> 
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 416e02a00cae..e08503dfdd8a 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -68,8 +68,13 @@ static struct file *kvm_gmem_create_file(const char *name, const struct file_ope
>  }
>  
>  
> +#define KVM_GMEM_INODE_SIZE(size)			\
> +	struct_size_t(struct kvm_gmem_inode, prepared,	\
> +		      DIV_ROUND_UP(size, PAGE_SIZE * BITS_PER_LONG))
> +
>  struct kvm_gmem_inode {
>  	unsigned long flags;
> +	unsigned long prepared[];
>  };
>  
>  struct kvm_gmem {
> @@ -107,18 +112,110 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
>  	return 0;
>  }
>  
> +/*
> + * The bitmap of prepared pages has to be accessed atomically, because
> + * preparation is not protected by any guest.

Not entirely sure what you intended to say here.  KVM should never rely on the
guest to provide protection for anything :-)

> +   This unfortunately means that we cannot use regular bitmap operations.

No "we" please.

> + * The logic becomes a bit simpler for set and test, which operate a
> + * folio at a time and therefore can assume that the range is naturally
> + * aligned (meaning that either it is smaller than a word, or it does
> + * not include fractions of a word).  For punch-hole operations however
> + * there is all the complexity.
> + */

This entire comment feels like it belongs in the changelog.

> +
> +static void bitmap_set_atomic_word(unsigned long *p, unsigned long start, unsigned long len)

I vote to keep wrapping at ~80 chars by default.  I have no objection to running
slightly past 80 when the result is a net postive, and in some cases prefer to do
so, but as a general rule I like wrapping at ~80 better than wrapping at ~100.

> +{
> +	unsigned long mask_to_set =
> +		BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);

Align the right hand side?  E.g.

	unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start) &
				    BITMAP_LAST_WORD_MASK(start + len);

Though I think it makes sense to add a macro to generate the mask, maybe with an
assert that the inputs are sane?  E.g. something like BITMAP_WORD_MASK().
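
Untested, but something like this, perhaps (the WARN is a guess at what
a sane assert could look like):

	/* Mask covering bits [start, start + len) within a single word. */
	#define BITMAP_WORD_MASK(start, len)					\
	({									\
		WARN_ON_ONCE((start) % BITS_PER_LONG + (len) > BITS_PER_LONG);	\
		BITMAP_FIRST_WORD_MASK(start) &					\
			BITMAP_LAST_WORD_MASK((start) + (len));			\
	})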

> +
> +	atomic_long_or(mask_to_set, (atomic_long_t *)p);
> +}
> +
> +static void bitmap_clear_atomic_word(unsigned long *p, unsigned long start, unsigned long len)

If these are going to stay in KVM, I think it would be better to force the caller
to cast to atomic_long_t.  Actually, I think kvm_gmem_inode.prepared itself should
be an array of atomic_long_t.  More below.

> +{
> +	unsigned long mask_to_set =

mask_to_clear, though if we add a macro, this could probably be something like:

	atomic_long_andnot(BITMAP_WORD_MASK(start, len), p);

> +		BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);
> +
> +	atomic_long_andnot(mask_to_set, (atomic_long_t *)p);
> +}
> +
> +static bool bitmap_test_allset_word(unsigned long *p, unsigned long start, unsigned long len)
> +{
> +	unsigned long mask_to_set =

mask_to_test, though using a macro would make the local variables go away.  At
that point, it might even make sense to forego the helpers entirely.

> +		BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);
> +
> +	return (*p & mask_to_set) == mask_to_set;

Dereferencing *p should be READ_ONCE(), no?  And if everything is tracked as
atomic_long_t, then KVM is forced to do the right thing via atomic_long_read().
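
E.g. (a sketch of the suggestion, untested):

	struct kvm_gmem_inode {
		unsigned long flags;
		atomic_long_t prepared[];
	};

	static bool bitmap_test_allset_word(atomic_long_t *p, unsigned long start,
					    unsigned long len)
	{
		unsigned long mask = BITMAP_FIRST_WORD_MASK(start) &
				     BITMAP_LAST_WORD_MASK(start + len);

		return (atomic_long_read(p) & mask) == mask;
	}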

> +}
> +
>  static void kvm_gmem_mark_prepared(struct file *file, pgoff_t index, struct folio *folio)
>  {
> -	folio_mark_uptodate(folio);
> +	struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)file->f_inode->i_private;
> +	unsigned long *p = i_gmem->prepared + BIT_WORD(index);
> +	unsigned long npages = folio_nr_pages(folio);
> +
> +	/* Folios must be naturally aligned */
> +	WARN_ON_ONCE(index & (npages - 1));
> +	index &= ~(npages - 1);
> +
> +	/* Clear page before updating bitmap.  */
> +	smp_wmb();
> +
> +	if (npages < BITS_PER_LONG) {
> +		bitmap_set_atomic_word(p, index, npages);
> +	} else {
> +		BUILD_BUG_ON(BITS_PER_LONG != 64);
> +		memset64((u64 *)p, ~0, BITS_TO_LONGS(npages));

More code that could be deduplicated (unprepared path below).

But more importantly, I'm pretty sure the entire lockless approach is unsafe.

Callers of kvm_gmem_get_pfn() do not take any locks that protect kvm_gmem_mark_prepared()
from racing with kvm_gmem_mark_range_unprepared().  kvm_mmu_invalidate_begin()
prevents KVM from installing a stage-2 mapping, i.e. from consuming the PFN, but
it doesn't prevent kvm_gmem_mark_prepared() from being called.  And
sev_handle_rmp_fault() never checks mmu_notifiers, which is probably fine?  But
sketchy.

Oof.  Side topic.  sev_handle_rmp_fault() is suspect for other reasons.  It does
its own lookup of the PFN, and so could see an entirely different PFN than was
resolved by kvm_mmu_page_fault().  And thanks to KVM's wonderful 0/1/-errno
behavior, sev_handle_rmp_fault() is invoked even when KVM wants to retry the
fault, e.g. due to an active MMU invalidation.

Anyways, KVM wouldn't _immediately_ consume bad data, as the invalidation
would block the current page fault.  But clobbering i_gmem->prepared would result
in a missed "prepare" phase if the hole-punch region were restored.

One idea would be to use a rwlock to protect updates to the bitmap (setters can
generally stomp on each other).  And to avoid contention whenever possible in
page fault paths, only take the lock if the page is not up-to-date, because again
kvm_mmu_invalidate_{begin,end}() protects against UAF, it's purely updates to the
bitmap that need extra protection.

Note, using mmu_invalidate_retry_gfn() is unsafe, because it must be called
under mmu_lock to ensure it doesn't get false negatives.
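
Very roughly (a hypothetical sketch; the bodies elide the existing atomic
bit twiddling, and the follow-up below concludes the folio lock already
serializes readers):

	struct kvm_gmem_inode {
		unsigned long flags;
		rwlock_t prepared_lock;		/* guards clears vs. sets */
		unsigned long prepared[];
	};

	static void kvm_gmem_mark_prepared(struct file *file, pgoff_t index,
					   struct folio *folio)
	{
		struct kvm_gmem_inode *i_gmem = file->f_inode->i_private;

		/* Setters can stomp on each other, so shared is enough. */
		read_lock(&i_gmem->prepared_lock);
		/* ... atomically set the folio's bits, as in the patch ... */
		read_unlock(&i_gmem->prepared_lock);
	}

	static void kvm_gmem_mark_range_unprepared(struct inode *inode,
						   pgoff_t index, pgoff_t npages)
	{
		struct kvm_gmem_inode *i_gmem = inode->i_private;

		/* Exclusive, so no setter can clobber a just-cleared range. */
		write_lock(&i_gmem->prepared_lock);
		/* ... clear the range's bits, as in the patch ... */
		write_unlock(&i_gmem->prepared_lock);
	}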

> +	}
>  }
>  
>  static void kvm_gmem_mark_range_unprepared(struct inode *inode, pgoff_t index, pgoff_t npages)
>  {
> +	struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)inode->i_private;
> +	unsigned long *p = i_gmem->prepared + BIT_WORD(index);
> +
> +	index &= BITS_PER_LONG - 1;
> +	if (index) {
> +		int first_word_count = min(npages, BITS_PER_LONG - index);

Newline.

> +		bitmap_clear_atomic_word(p, index, first_word_count);
> +		npages -= first_word_count;
> +		p++;
> +	}
> +
> +	if (npages > BITS_PER_LONG) {
> +		BUILD_BUG_ON(BITS_PER_LONG != 64);
> +		memset64((u64 *)p, 0, BITS_TO_LONGS(npages));
> +		p += BIT_WORD(npages);
> +		npages &= BITS_PER_LONG - 1;
> +	}
> +
> +	if (npages)
> +		bitmap_clear_atomic_word(p++, 0, npages);
>  }
>  
>  static bool kvm_gmem_is_prepared(struct file *file, pgoff_t index, struct folio *folio)
>  {
> -	return folio_test_uptodate(folio);
> +	struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)file->f_inode->i_private;
> +	unsigned long *p = i_gmem->prepared + BIT_WORD(index);
> +	unsigned long npages = folio_nr_pages(folio);
> +	bool ret;
> +
> +	/* Folios must be naturally aligned */
> +	WARN_ON_ONCE(index & (npages - 1));
> +	index &= ~(npages - 1);
> +
> +	if (npages < BITS_PER_LONG) {
> +		ret = bitmap_test_allset_word(p, index, npages);
> +	} else {
> +		for (; npages > 0; npages -= BITS_PER_LONG)

for-loop needs curly braces.

> +			if (*p++ != ~0)

Presumably a READ_ONCE() here as well.

> +				break;
> +		ret = (npages == 0);
> +	}
> +
> +	/* Synchronize with kvm_gmem_mark_prepared().  */
> +	smp_rmb();
> +	return ret;
>  }
>  
>  /*
> @@ -499,7 +596,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>  	struct file *file;
>  	int fd, err;
>  
> -	i_gmem = kvzalloc(sizeof(struct kvm_gmem_inode), GFP_KERNEL);
> +	i_gmem = kvzalloc(KVM_GMEM_INODE_SIZE(size), GFP_KERNEL);
>  	if (!i_gmem)
>  		return -ENOMEM;
>  	i_gmem->flags = flags;
> -- 
> 2.43.5
> 


* Re: [PATCH 3/3] KVM: gmem: track preparedness a page at a time
  2024-11-14  1:31   ` Sean Christopherson
@ 2024-11-14  1:41     ` Sean Christopherson
  0 siblings, 0 replies; 9+ messages in thread
From: Sean Christopherson @ 2024-11-14  1:41 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, michael.roth

On Wed, Nov 13, 2024, Sean Christopherson wrote:
> On Fri, Nov 08, 2024, Paolo Bonzini wrote:
> >  static void kvm_gmem_mark_prepared(struct file *file, pgoff_t index, struct folio *folio)
> >  {
> > -	folio_mark_uptodate(folio);
> > +	struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)file->f_inode->i_private;
> > +	unsigned long *p = i_gmem->prepared + BIT_WORD(index);
> > +	unsigned long npages = folio_nr_pages(folio);
> > +
> > +	/* Folios must be naturally aligned */
> > +	WARN_ON_ONCE(index & (npages - 1));
> > +	index &= ~(npages - 1);
> > +
> > +	/* Clear page before updating bitmap.  */
> > +	smp_wmb();
> > +
> > +	if (npages < BITS_PER_LONG) {
> > +		bitmap_set_atomic_word(p, index, npages);
> > +	} else {
> > +		BUILD_BUG_ON(BITS_PER_LONG != 64);
> > +		memset64((u64 *)p, ~0, BITS_TO_LONGS(npages));
> 
> More code that could be deduplicated (unprepared path below).
> 
> But more importantly, I'm pretty sure the entire lockless approach is unsafe.
> 
> Callers of kvm_gmem_get_pfn() do not take any locks that protect kvm_gmem_mark_prepared()
> from racing with kvm_gmem_mark_range_unprepared().  kvm_mmu_invalidate_begin()
> prevents KVM from installing a stage-2 mapping, i.e. from consuming the PFN, but
> it doesn't prevent kvm_gmem_mark_prepared() from being called.  And
> sev_handle_rmp_fault() never checks mmu_notifiers, which is probably fine?  But
> sketchy.
> 
> Oof.  Side topic.  sev_handle_rmp_fault() is suspect for other reasons.  It does
> its own lookup of the PFN, and so could see an entirely different PFN than was
> resolved by kvm_mmu_page_fault().  And thanks to KVM's wonderful 0/1/-errno
> behavior, sev_handle_rmp_fault() is invoked even when KVM wants to retry the
> fault, e.g. due to an active MMU invalidation.
> 
> Anyways, KVM wouldn't _immediately_ consume bad data, as the invalidation
> would block the current page fault.  But clobbering i_gmem->prepared would result
> in a missed "prepare" phase if the hole-punch region were restored.
> 
> One idea would be to use a rwlock to protect updates to the bitmap (setters can
> generally stomp on each other).  And to avoid contention whenever possible in
> page fault paths, only take the lock if the page is not up-to-date, because again
> kvm_mmu_invalidate_{begin,end}() protects against UAF, it's purely updates to the
> bitmap that need extra protection.

Actually, there's no point in having a rwlock, because readers are serialized on
the folio's lock.  And KVM absolutely relies on that already, because otherwise
multiple vCPUs could see an unprepared folio, and clear_highpage() could end up
racing with writes from the vCPU.
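
For context, patch 2's kvm_gmem_get_pfn() already does the
check-and-prepare with the folio locked:

	/* folio comes back locked from __kvm_gmem_get_pfn(). */
	if (!is_prepared)
		r = kvm_gmem_prepare_folio(kvm, file, slot, gfn, folio);

	folio_unlock(folio);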

> Note, using mmu_invalidate_retry_gfn() is unsafe, because it must be called
> under mmu_lock to ensure it doesn't get false negatives.


* Re: [PATCH 2/3] KVM: gmem: add a complete set of functions to query page preparedness
  2024-11-08 15:50 ` [PATCH 2/3] KVM: gmem: add a complete set of functions to query page preparedness Paolo Bonzini
@ 2024-11-14  1:42   ` Sean Christopherson
  0 siblings, 0 replies; 9+ messages in thread
From: Sean Christopherson @ 2024-11-14  1:42 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, michael.roth

On Fri, Nov 08, 2024, Paolo Bonzini wrote:
> In preparation for moving preparedness out of the folio flags, pass
> the struct file* or struct inode* down to kvm_gmem_mark_prepared,
> as well as the offset within the gmem file.  Introduce new functions
> to unprepare pages on punch-hole, and to query the state.

...

> +static bool kvm_gmem_is_prepared(struct file *file, pgoff_t index, struct folio *folio)
> +{
> +	return folio_test_uptodate(folio);
> +}
> +
>  /*
>   * Process @folio, which contains @gfn, so that the guest can use it.
>   * The folio must be locked and the gfn must be contained in @slot.
>   * On successful return the guest sees a zero page so as to avoid
>   * leaking host data and the up-to-date flag is set.
>   */
> -static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> +static int kvm_gmem_prepare_folio(struct kvm *kvm, struct file *file,
> +				  struct kvm_memory_slot *slot,
>  				  gfn_t gfn, struct folio *folio)
>  {
>  	unsigned long nr_pages, i;
> @@ -147,7 +157,7 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	index = ALIGN_DOWN(index, 1 << folio_order(folio));
>  	r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
>  	if (!r)
> -		kvm_gmem_mark_prepared(folio);
> +		kvm_gmem_mark_prepared(file, index, folio);
>  
>  	return r;
>  }
> @@ -231,6 +241,7 @@ static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  		kvm_gmem_invalidate_begin(gmem, start, end);
>  
>  	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
> +	kvm_gmem_mark_range_unprepared(inode, start, end - start);
>  
>  	list_for_each_entry(gmem, gmem_list, entry)
>  		kvm_gmem_invalidate_end(gmem, start, end);
> @@ -682,7 +693,7 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
>  	if (max_order)
>  		*max_order = 0;
>  
> -	*is_prepared = folio_test_uptodate(folio);
> +	*is_prepared = kvm_gmem_is_prepared(file, index, folio);
>  	return folio;
>  }
>  
> @@ -704,7 +715,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	}
>  
>  	if (!is_prepared)
> -		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> +		r = kvm_gmem_prepare_folio(kvm, file, slot, gfn, folio);

This is broken when the next patch comes along.  If KVM encounters a partially
prepared folio, i.e. a folio with some prepared pages and some unprepared pages,
then KVM needs to zero only the unprepared pages.  But kvm_gmem_prepare_folio()
zeroes everything.

static int kvm_gmem_prepare_folio(struct kvm *kvm, struct file *file,
				  struct kvm_memory_slot *slot,
				  gfn_t gfn, struct folio *folio)
{
	unsigned long nr_pages, i;
	pgoff_t index;
	int r;

	nr_pages = folio_nr_pages(folio);
	for (i = 0; i < nr_pages; i++)
		clear_highpage(folio_page(folio, i));
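
A fix might zero page by page, skipping bits that are already set, e.g.
(kvm_gmem_is_page_prepared is a hypothetical single-page query, not a
helper from this series):

	nr_pages = folio_nr_pages(folio);
	index = gfn - slot->base_gfn + slot->gmem.pgoff;
	index = ALIGN_DOWN(index, 1 << folio_order(folio));
	for (i = 0; i < nr_pages; i++) {
		if (kvm_gmem_is_page_prepared(file, index + i))
			continue;
		clear_highpage(folio_page(folio, i));
	}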


