Linux-HyperV List
 help / color / mirror / Atom feed
* [PATCH v4 05/21] mm: switch the rmap lock held option off in compat layer
From: Lorenzo Stoakes (Oracle) @ 2026-03-20 22:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
	Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
	Alexandre Torgue, Miquel Raynal, Richard Weinberger,
	Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
	David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
	Jan Kara, David Hildenbrand, Liam R . Howlett, Vlastimil Babka,
	Mike Rapoport, Suren Baghdasaryan, Michal Hocko, Jann Horn,
	Pedro Falcato, linux-kernel, linux-doc, linux-hyperv, linux-stm32,
	linux-arm-kernel, linux-mtd, linux-staging, linux-scsi,
	target-devel, linux-afs, linux-fsdevel, linux-mm, Ryan Roberts
In-Reply-To: <cover.1774045440.git.ljs@kernel.org>

In the mmap_prepare compatibility layer, we don't need to hold the rmap
lock, as we are being called from an .mmap handler.

The .mmap_prepare hook, when invoked in the VMA logic, is called prior to
the VMA being instantiated, but the completion hook is called after the VMA
is linked into the maple tree, meaning rmap walkers can reach it.

The mmap hook does not link the VMA into the tree, so this cannot happen.

Therefore it's safe to simply disable this in the mmap_prepare
compatibility layer.

Also update VMA tests code to reflect current compatibility layer state.

Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
---
 mm/util.c                       |  6 ++++-
 tools/testing/vma/include/dup.h | 42 +++++++++++++++++----------------
 2 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/mm/util.c b/mm/util.c
index a2cfa0d77c35..182f0f5cc400 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1204,6 +1204,7 @@ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)

 		.action.type = MMAP_NOTHING, /* Default */
 	};
+	struct mmap_action *action = &desc.action;
 	int err;

 	err = vfs_mmap_prepare(file, &desc);
@@ -1214,8 +1215,11 @@ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
 	if (err)
 		return err;

+	/* being invoked from .mmap means we don't have to enforce this. */
+	action->hide_from_rmap_until_complete = false;
+
 	set_vma_from_desc(vma, &desc);
-	err = mmap_action_complete(vma, &desc.action);
+	err = mmap_action_complete(vma, action);
 	if (err) {
 		const size_t len = vma_pages(vma) << PAGE_SHIFT;

diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
index 26c6c3255a94..c62d3998e922 100644
--- a/tools/testing/vma/include/dup.h
+++ b/tools/testing/vma/include/dup.h
@@ -1256,8 +1256,17 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma)
 static inline void set_vma_from_desc(struct vm_area_struct *vma,
 		struct vm_area_desc *desc);

-static inline int __compat_vma_mmap(const struct file_operations *f_op,
-		struct file *file, struct vm_area_struct *vma)
+static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
+{
+	return file->f_op->mmap_prepare(desc);
+}
+
+static inline unsigned long vma_pages(struct vm_area_struct *vma)
+{
+	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+}
+
+static inline int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct vm_area_desc desc = {
 		.mm = vma->vm_mm,
@@ -1272,9 +1281,10 @@ static inline int __compat_vma_mmap(const struct file_operations *f_op,

 		.action.type = MMAP_NOTHING, /* Default */
 	};
+	struct mmap_action *action = &desc.action;
 	int err;

-	err = f_op->mmap_prepare(&desc);
+	err = vfs_mmap_prepare(file, &desc);
 	if (err)
 		return err;

@@ -1282,28 +1292,25 @@ static inline int __compat_vma_mmap(const struct file_operations *f_op,
 	if (err)
 		return err;

+	/* being invoked from .mmap means we don't have to enforce this. */
+	action->hide_from_rmap_until_complete = false;
+
 	set_vma_from_desc(vma, &desc);
-	return mmap_action_complete(vma, &desc.action);
-}
+	err = mmap_action_complete(vma, action);
+	if (err) {
+		const size_t len = vma_pages(vma) << PAGE_SHIFT;

-static inline int compat_vma_mmap(struct file *file,
-		struct vm_area_struct *vma)
-{
-	return __compat_vma_mmap(file->f_op, file, vma);
+		do_munmap(current->mm, vma->vm_start, len, NULL);
+	}
+	return err;
 }

-
 static inline void vma_iter_init(struct vma_iterator *vmi,
 		struct mm_struct *mm, unsigned long addr)
 {
 	mas_init(&vmi->mas, &mm->mm_mt, addr);
 }

-static inline unsigned long vma_pages(struct vm_area_struct *vma)
-{
-	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-}
-
 static inline void mmap_assert_locked(struct mm_struct *);
 static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
 						unsigned long start_addr,
@@ -1473,11 +1480,6 @@ static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
 	return file->f_op->mmap(file, vma);
 }

-static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
-{
-	return file->f_op->mmap_prepare(desc);
-}
-
 static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
 {
 	/* Changing an anonymous vma with this is illegal */
--
2.53.0

^ permalink raw reply related

* [PATCH v4 04/21] mm: avoid deadlock when holding rmap on mmap_prepare error
From: Lorenzo Stoakes (Oracle) @ 2026-03-20 22:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
	Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
	Alexandre Torgue, Miquel Raynal, Richard Weinberger,
	Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
	David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
	Jan Kara, David Hildenbrand, Liam R . Howlett, Vlastimil Babka,
	Mike Rapoport, Suren Baghdasaryan, Michal Hocko, Jann Horn,
	Pedro Falcato, linux-kernel, linux-doc, linux-hyperv, linux-stm32,
	linux-arm-kernel, linux-mtd, linux-staging, linux-scsi,
	target-devel, linux-afs, linux-fsdevel, linux-mm, Ryan Roberts
In-Reply-To: <cover.1774045440.git.ljs@kernel.org>

Commit ac0a3fc9c07d ("mm: add ability to take further action in
vm_area_desc") added the ability for drivers to instruct mm to take actions
after the .mmap_prepare callback is complete.

To make life simpler and safer, this is done before the VMA/mmap write lock
is dropped but when the VMA is completely established.

So on error, we simply munmap() the VMA.

As part of this implementation, unfortunately a horrible hack had to be
implemented to support some questionable behaviour hugetlb relies upon -
that is that the file rmap lock is held until the operation is complete.

The implementation, for convenience, did this in mmap_action_finish() so
both the VMA and mmap_prepare compatibility layer paths would have this
correctly handled.

However, it turns out there is a mistake here - the rmap lock cannot be
held on munmap, as free_pgtables() -> unlink_file_vma_batch_add() ->
unlink_file_vma_batch_process() takes the file rmap lock.

We therefore currently have a deadlock issue that might arise.

Resolve this by leaving it to callers to handle the unmap.

The compatibility layer does not support this rmap behaviour, so we simply
have it unmap on error after calling mmap_action_complete().

In the VMA implementation, we only perform the unmap after the rmap lock is
dropped.

This resolves the issue by ensuring the rmap lock is always dropped when
the unmap occurs.

Fixes: ac0a3fc9c07d ("mm: add ability to take further action in vm_area_desc")
Cc: <stable@vger.kernel.org>
Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
---
 mm/util.c | 12 +++++++-----
 mm/vma.c  | 13 ++++++++++---
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/mm/util.c b/mm/util.c
index 73c97a748d8e..a2cfa0d77c35 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1215,7 +1215,13 @@ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
 		return err;
 
 	set_vma_from_desc(vma, &desc);
-	return mmap_action_complete(vma, &desc.action);
+	err = mmap_action_complete(vma, &desc.action);
+	if (err) {
+		const size_t len = vma_pages(vma) << PAGE_SHIFT;
+
+		do_munmap(current->mm, vma->vm_start, len, NULL);
+	}
+	return err;
 }
 EXPORT_SYMBOL(compat_vma_mmap);
 
@@ -1316,10 +1322,6 @@ static int mmap_action_finish(struct vm_area_struct *vma,
 	 * invoked if we do NOT merge, so we only clean up the VMA we created.
 	 */
 	if (err) {
-		const size_t len = vma_pages(vma) << PAGE_SHIFT;
-
-		do_munmap(current->mm, vma->vm_start, len, NULL);
-
 		if (action->error_hook) {
 			/* We may want to filter the error. */
 			err = action->error_hook(err);
diff --git a/mm/vma.c b/mm/vma.c
index ee91f2b76acf..3fc5fe4f1a7c 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -2736,9 +2736,9 @@ static int call_action_complete(struct mmap_state *map,
 				struct mmap_action *action,
 				struct vm_area_struct *vma)
 {
-	int ret;
+	int err;
 
-	ret = mmap_action_complete(vma, action);
+	err = mmap_action_complete(vma, action);
 
 	/* If we held the file rmap we need to release it. */
 	if (map->hold_file_rmap_lock) {
@@ -2746,7 +2746,14 @@ static int call_action_complete(struct mmap_state *map,
 
 		i_mmap_unlock_write(file->f_mapping);
 	}
-	return ret;
+
+	if (err) {
+		const size_t len = vma_pages(vma) << PAGE_SHIFT;
+
+		do_munmap(current->mm, vma->vm_start, len, NULL);
+	}
+
+	return err;
 }
 
 static unsigned long __mmap_region(struct file *file, unsigned long addr,
-- 
2.53.0


^ permalink raw reply related

* [PATCH v4 03/21] mm: document vm_operations_struct->open the same as close()
From: Lorenzo Stoakes (Oracle) @ 2026-03-20 22:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
	Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
	Alexandre Torgue, Miquel Raynal, Richard Weinberger,
	Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
	David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
	Jan Kara, David Hildenbrand, Liam R . Howlett, Vlastimil Babka,
	Mike Rapoport, Suren Baghdasaryan, Michal Hocko, Jann Horn,
	Pedro Falcato, linux-kernel, linux-doc, linux-hyperv, linux-stm32,
	linux-arm-kernel, linux-mtd, linux-staging, linux-scsi,
	target-devel, linux-afs, linux-fsdevel, linux-mm, Ryan Roberts
In-Reply-To: <cover.1774045440.git.ljs@kernel.org>

Describe when the operation is invoked and the context in which it is
invoked, matching the description already added for vm_op->close().

While we're here, update all outdated references to an 'area' field for
VMAs to the more consistent 'vma'.

Acked-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>
Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
---
 include/linux/mm.h              | 15 ++++++++++-----
 tools/testing/vma/include/dup.h | 15 ++++++++++-----
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1e63b3a44a47..da94edb287cd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -766,15 +766,20 @@ struct vm_uffd_ops;
  * to the functions called when a no-page or a wp-page exception occurs.
  */
 struct vm_operations_struct {
-	void (*open)(struct vm_area_struct * area);
+	/**
+	 * @open: Called when a VMA is remapped, split or forked. Not called
+	 * upon first mapping a VMA.
+	 * Context: User context.  May sleep.  Caller holds mmap_lock.
+	 */
+	void (*open)(struct vm_area_struct *vma);
 	/**
 	 * @close: Called when the VMA is being removed from the MM.
 	 * Context: User context.  May sleep.  Caller holds mmap_lock.
 	 */
-	void (*close)(struct vm_area_struct * area);
+	void (*close)(struct vm_area_struct *vma);
 	/* Called any time before splitting to check if it's allowed */
-	int (*may_split)(struct vm_area_struct *area, unsigned long addr);
-	int (*mremap)(struct vm_area_struct *area);
+	int (*may_split)(struct vm_area_struct *vma, unsigned long addr);
+	int (*mremap)(struct vm_area_struct *vma);
 	/*
 	 * Called by mprotect() to make driver-specific permission
 	 * checks before mprotect() is finalised.   The VMA must not
@@ -786,7 +791,7 @@ struct vm_operations_struct {
 	vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
 	vm_fault_t (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
-	unsigned long (*pagesize)(struct vm_area_struct * area);
+	unsigned long (*pagesize)(struct vm_area_struct *vma);
 
 	/* notification that a previously read-only page is about to become
 	 * writable, if an error is returned it will cause a SIGBUS */
diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
index 477a5be65dd2..26c6c3255a94 100644
--- a/tools/testing/vma/include/dup.h
+++ b/tools/testing/vma/include/dup.h
@@ -632,15 +632,20 @@ struct vm_area_struct {
 } __randomize_layout;
 
 struct vm_operations_struct {
-	void (*open)(struct vm_area_struct * area);
+	/**
+	 * @open: Called when a VMA is remapped, split or forked. Not called
+	 * upon first mapping a VMA.
+	 * Context: User context.  May sleep.  Caller holds mmap_lock.
+	 */
+	void (*open)(struct vm_area_struct *vma);
 	/**
 	 * @close: Called when the VMA is being removed from the MM.
 	 * Context: User context.  May sleep.  Caller holds mmap_lock.
 	 */
-	void (*close)(struct vm_area_struct * area);
+	void (*close)(struct vm_area_struct *vma);
 	/* Called any time before splitting to check if it's allowed */
-	int (*may_split)(struct vm_area_struct *area, unsigned long addr);
-	int (*mremap)(struct vm_area_struct *area);
+	int (*may_split)(struct vm_area_struct *vma, unsigned long addr);
+	int (*mremap)(struct vm_area_struct *vma);
 	/*
 	 * Called by mprotect() to make driver-specific permission
 	 * checks before mprotect() is finalised.   The VMA must not
@@ -652,7 +657,7 @@ struct vm_operations_struct {
 	vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
 	vm_fault_t (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
-	unsigned long (*pagesize)(struct vm_area_struct * area);
+	unsigned long (*pagesize)(struct vm_area_struct *vma);
 
 	/* notification that a previously read-only page is about to become
 	 * writable, if an error is returned it will cause a SIGBUS */
-- 
2.53.0


^ permalink raw reply related

* [PATCH v4 02/21] mm: add documentation for the mmap_prepare file operation callback
From: Lorenzo Stoakes (Oracle) @ 2026-03-20 22:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
	Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
	Alexandre Torgue, Miquel Raynal, Richard Weinberger,
	Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
	David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
	Jan Kara, David Hildenbrand, Liam R . Howlett, Vlastimil Babka,
	Mike Rapoport, Suren Baghdasaryan, Michal Hocko, Jann Horn,
	Pedro Falcato, linux-kernel, linux-doc, linux-hyperv, linux-stm32,
	linux-arm-kernel, linux-mtd, linux-staging, linux-scsi,
	target-devel, linux-afs, linux-fsdevel, linux-mm, Ryan Roberts
In-Reply-To: <cover.1774045440.git.ljs@kernel.org>

This documentation makes it easier for a driver/file system implementer to
correctly use this callback.

It covers the fundamentals, whilst intentionally leaving the less lovely
possible actions one might take undocumented (for instance - the
success_hook, error_hook fields in mmap_action).

The document also covers the new VMA flags implementation which is the
only one which will work correctly with mmap_prepare.

Acked-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>
Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
---
 Documentation/filesystems/index.rst        |   1 +
 Documentation/filesystems/mmap_prepare.rst | 142 +++++++++++++++++++++
 2 files changed, 143 insertions(+)
 create mode 100644 Documentation/filesystems/mmap_prepare.rst

diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index f4873197587d..6cbc3e0292ae 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -29,6 +29,7 @@ algorithms work.
    fiemap
    files
    locks
+   mmap_prepare
    multigrain-ts
    mount_api
    quota
diff --git a/Documentation/filesystems/mmap_prepare.rst b/Documentation/filesystems/mmap_prepare.rst
new file mode 100644
index 000000000000..ae484d371861
--- /dev/null
+++ b/Documentation/filesystems/mmap_prepare.rst
@@ -0,0 +1,142 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========================
+mmap_prepare callback HOWTO
+===========================
+
+Introduction
+============
+
+The ``struct file->f_op->mmap()`` callback has been deprecated as it is both a
+stability and security risk, and doesn't always permit the merging of adjacent
+mappings resulting in unnecessary memory fragmentation.
+
+It has been replaced with the ``file->f_op->mmap_prepare()`` callback which
+solves these problems.
+
+This hook is called right at the beginning of setting up the mapping, and
+importantly it is invoked *before* any merging of adjacent mappings has taken
+place.
+
+If an error arises upon mapping, it might arise after this callback has been
+invoked, therefore it should be treated as effectively stateless.
+
+That is - no resources should be allocated nor state updated to reflect that a
+mapping has been established, as the mapping may either be merged, or fail to be
+mapped after the callback is complete.
+
+How To Use
+==========
+
+In your driver's struct file_operations, specify an ``mmap_prepare``
+callback rather than an ``mmap`` one, e.g. for ext4:
+
+.. code-block:: C
+
+    const struct file_operations ext4_file_operations = {
+        ...
+        .mmap_prepare    = ext4_file_mmap_prepare,
+    };
+
+This has a signature of ``int (*mmap_prepare)(struct vm_area_desc *)``.
+
+Examining the struct vm_area_desc type:
+
+.. code-block:: C
+
+    struct vm_area_desc {
+        /* Immutable state. */
+        const struct mm_struct *const mm;
+        struct file *const file; /* May vary from vm_file in stacked callers. */
+        unsigned long start;
+        unsigned long end;
+
+        /* Mutable fields. Populated with initial state. */
+        pgoff_t pgoff;
+        struct file *vm_file;
+        vma_flags_t vma_flags;
+        pgprot_t page_prot;
+
+        /* Write-only fields. */
+        const struct vm_operations_struct *vm_ops;
+        void *private_data;
+
+        /* Take further action? */
+        struct mmap_action action;
+    };
+
+This is straightforward - you have all the fields you need to set up the
+mapping, and you can update the mutable and writable fields, for instance:
+
+.. code-block:: C
+
+    static int ext4_file_mmap_prepare(struct vm_area_desc *desc)
+    {
+        int ret;
+        struct file *file = desc->file;
+        struct inode *inode = file->f_mapping->host;
+
+        ...
+
+        file_accessed(file);
+        if (IS_DAX(file_inode(file))) {
+            desc->vm_ops = &ext4_dax_vm_ops;
+            vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT);
+        } else {
+            desc->vm_ops = &ext4_file_vm_ops;
+        }
+        return 0;
+    }
+
+Importantly, you no longer have to dance around with reference counts or locks
+when updating these fields - **you can simply go ahead and change them**.
+
+Everything is taken care of by the mapping code.
+
+VMA Flags
+---------
+
+Along with ``mmap_prepare``, VMA flags have undergone an overhaul. Where before
+you would invoke one of vm_flags_init(), vm_flags_reset(), vm_flags_set(),
+vm_flags_clear(), and vm_flags_mod() to modify flags (and to have the
+locking done correctly for you), this is no longer necessary.
+
+Also, the legacy approach of specifying VMA flags via ``VM_READ``, ``VM_WRITE``,
+etc. - i.e. using a ``VM_xxx`` macro - has changed too.
+
+When implementing mmap_prepare(), reference flags by their bit number, defined
+as a ``VMA_xxx_BIT`` macro, e.g. ``VMA_READ_BIT``, ``VMA_WRITE_BIT`` etc.,
+and use one of (where ``desc`` is a pointer to struct vm_area_desc):
+
+* ``vma_desc_test_any(desc, ...)`` - Specify a comma-separated list of flags
+  you wish to test for (whether _any_ are set), e.g. - ``vma_desc_test_any(
+  desc, VMA_WRITE_BIT, VMA_MAYWRITE_BIT)`` - returns ``true`` if either are set,
+  otherwise ``false``.
+* ``vma_desc_set_flags(desc, ...)`` - Update the VMA descriptor flags to set
+  additional flags specified by a comma-separated list,
+  e.g. - ``vma_desc_set_flags(desc, VMA_PFNMAP_BIT, VMA_IO_BIT)``.
+* ``vma_desc_clear_flags(desc, ...)`` - Update the VMA descriptor flags to clear
+  flags specified by a comma-separated list, e.g. - ``vma_desc_clear_flags(
+  desc, VMA_WRITE_BIT, VMA_MAYWRITE_BIT)``.
+
+Actions
+=======
+
+You can now very easily have actions be performed upon a mapping once set up by
+utilising simple helper functions invoked upon the struct vm_area_desc
+pointer. These are:
+
+* mmap_action_remap() - Remaps a range consisting only of PFNs for a specific
+  range of a set size, starting at a given virtual address and PFN.
+
+* mmap_action_remap_full() - Same as mmap_action_remap(), only remaps the
+  entire mapping from ``start_pfn`` onward.
+
+* mmap_action_ioremap() - Same as mmap_action_remap(), only performs an I/O
+  remap.
+
+* mmap_action_ioremap_full() - Same as mmap_action_ioremap(), only remaps
+  the entire mapping from ``start_pfn`` onward.
+
+**NOTE:** The ``action`` field should never normally be manipulated directly,
+rather you ought to use one of these helpers.
-- 
2.53.0


^ permalink raw reply related

* [PATCH v4 01/21] mm: various small mmap_prepare cleanups
From: Lorenzo Stoakes (Oracle) @ 2026-03-20 22:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
	Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
	Alexandre Torgue, Miquel Raynal, Richard Weinberger,
	Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
	David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
	Jan Kara, David Hildenbrand, Liam R . Howlett, Vlastimil Babka,
	Mike Rapoport, Suren Baghdasaryan, Michal Hocko, Jann Horn,
	Pedro Falcato, linux-kernel, linux-doc, linux-hyperv, linux-stm32,
	linux-arm-kernel, linux-mtd, linux-staging, linux-scsi,
	target-devel, linux-afs, linux-fsdevel, linux-mm, Ryan Roberts
In-Reply-To: <cover.1774045440.git.ljs@kernel.org>

Rather than passing arbitrary fields, pass a vm_area_desc pointer to the
mmap prepare functions, and an action and vma pointer to the mmap complete
functions, in order to put all the action-specific logic in the function
actually doing the work.

Additionally, allow mmap prepare functions to return an error so we can
error out as soon as possible if there is something logically incorrect in
the input.

Update remap_pfn_range_prepare() to properly check the input range for the
CoW case.

Also remove io_remap_pfn_range_complete(), as we can simply set up the
fields correctly in io_remap_pfn_range_prepare() and use
remap_pfn_range_complete() for this.

While we're here, make remap_pfn_range_prepare_vma() a little neater, and
pass mmap_action directly to call_action_complete().

Then, update compat_vma_mmap() to perform its logic directly, as
__compat_vma_mmap() is not used by anything so we don't need to export it.

Also update compat_vma_mmap() to use vfs_mmap_prepare() rather than
calling the mmap_prepare op directly.

Finally, update the VMA userland tests to reflect the changes.

Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
---
 include/linux/fs.h                |   2 -
 include/linux/mm.h                |   7 +-
 mm/internal.h                     |  32 ++++----
 mm/memory.c                       |  45 +++++++----
 mm/util.c                         | 121 +++++++++++++-----------------
 mm/vma.c                          |  24 +++---
 tools/testing/vma/include/dup.h   |   7 +-
 tools/testing/vma/include/stubs.h |   8 +-
 8 files changed, 126 insertions(+), 120 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8b3dd145b25e..a2628a12bd2b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2058,8 +2058,6 @@ static inline bool can_mmap_file(struct file *file)
 	return true;
 }
 
-int __compat_vma_mmap(const struct file_operations *f_op,
-		struct file *file, struct vm_area_struct *vma);
 int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
 
 static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 42cc40aa63d9..1e63b3a44a47 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4320,10 +4320,9 @@ static inline void mmap_action_ioremap_full(struct vm_area_desc *desc,
 	mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc));
 }
 
-void mmap_action_prepare(struct mmap_action *action,
-			 struct vm_area_desc *desc);
-int mmap_action_complete(struct mmap_action *action,
-			 struct vm_area_struct *vma);
+int mmap_action_prepare(struct vm_area_desc *desc);
+int mmap_action_complete(struct vm_area_struct *vma,
+			 struct mmap_action *action);
 
 /* Look up the first VMA which exactly match the interval vm_start ... vm_end */
 static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
diff --git a/mm/internal.h b/mm/internal.h
index 708d240b4198..0256ca44115a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1793,26 +1793,28 @@ int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
 void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm);
 int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm);
 
-void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn);
-int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
-		unsigned long pfn, unsigned long size, pgprot_t pgprot);
+int remap_pfn_range_prepare(struct vm_area_desc *desc);
+int remap_pfn_range_complete(struct vm_area_struct *vma,
+			     struct mmap_action *action);
 
-static inline void io_remap_pfn_range_prepare(struct vm_area_desc *desc,
-		unsigned long orig_pfn, unsigned long size)
+static inline int io_remap_pfn_range_prepare(struct vm_area_desc *desc)
 {
+	struct mmap_action *action = &desc->action;
+	const unsigned long orig_pfn = action->remap.start_pfn;
+	const pgprot_t orig_pgprot = action->remap.pgprot;
+	const unsigned long size = action->remap.size;
 	const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size);
+	int err;
 
-	return remap_pfn_range_prepare(desc, pfn);
-}
+	action->remap.start_pfn = pfn;
+	action->remap.pgprot = pgprot_decrypted(orig_pgprot);
+	err = remap_pfn_range_prepare(desc);
+	if (err)
+		return err;
 
-static inline int io_remap_pfn_range_complete(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long orig_pfn, unsigned long size,
-		pgprot_t orig_prot)
-{
-	const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size);
-	const pgprot_t prot = pgprot_decrypted(orig_prot);
-
-	return remap_pfn_range_complete(vma, addr, pfn, size, prot);
+	/* Remap does the actual work. */
+	action->type = MMAP_REMAP_PFN;
+	return 0;
 }
 
 #ifdef CONFIG_MMU_NOTIFIER
diff --git a/mm/memory.c b/mm/memory.c
index 219b9bf6cae0..9dec67a18116 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3099,26 +3099,34 @@ static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 }
 #endif
 
-void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn)
+int remap_pfn_range_prepare(struct vm_area_desc *desc)
 {
-	/*
-	 * We set addr=VMA start, end=VMA end here, so this won't fail, but we
-	 * check it again on complete and will fail there if specified addr is
-	 * invalid.
-	 */
-	get_remap_pgoff(vma_desc_is_cow_mapping(desc), desc->start, desc->end,
-			desc->start, desc->end, pfn, &desc->pgoff);
+	const struct mmap_action *action = &desc->action;
+	const unsigned long start = action->remap.start;
+	const unsigned long end = start + action->remap.size;
+	const unsigned long pfn = action->remap.start_pfn;
+	const bool is_cow = vma_desc_is_cow_mapping(desc);
+	int err;
+
+	err = get_remap_pgoff(is_cow, start, end, desc->start, desc->end, pfn,
+			      &desc->pgoff);
+	if (err)
+		return err;
+
 	vma_desc_set_flags_mask(desc, VMA_REMAP_FLAGS);
+	return 0;
 }
 
-static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr,
-		unsigned long pfn, unsigned long size)
+static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma,
+				       unsigned long addr, unsigned long pfn,
+				       unsigned long size)
 {
-	unsigned long end = addr + PAGE_ALIGN(size);
+	const unsigned long end = addr + PAGE_ALIGN(size);
+	const bool is_cow = is_cow_mapping(vma->vm_flags);
 	int err;
 
-	err = get_remap_pgoff(is_cow_mapping(vma->vm_flags), addr, end,
-			      vma->vm_start, vma->vm_end, pfn, &vma->vm_pgoff);
+	err = get_remap_pgoff(is_cow, addr, end, vma->vm_start, vma->vm_end,
+			      pfn, &vma->vm_pgoff);
 	if (err)
 		return err;
 
@@ -3151,10 +3159,15 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 }
 EXPORT_SYMBOL(remap_pfn_range);
 
-int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
-		unsigned long pfn, unsigned long size, pgprot_t prot)
+int remap_pfn_range_complete(struct vm_area_struct *vma,
+			     struct mmap_action *action)
 {
-	return do_remap_pfn_range(vma, addr, pfn, size, prot);
+	const unsigned long start = action->remap.start;
+	const unsigned long pfn = action->remap.start_pfn;
+	const unsigned long size = action->remap.size;
+	const pgprot_t prot = action->remap.pgprot;
+
+	return do_remap_pfn_range(vma, start, pfn, size, prot);
 }
 
 /**
diff --git a/mm/util.c b/mm/util.c
index ce7ae80047cf..73c97a748d8e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1163,43 +1163,6 @@ void flush_dcache_folio(struct folio *folio)
 EXPORT_SYMBOL(flush_dcache_folio);
 #endif
 
-/**
- * __compat_vma_mmap() - See description for compat_vma_mmap()
- * for details. This is the same operation, only with a specific file operations
- * struct which may or may not be the same as vma->vm_file->f_op.
- * @f_op: The file operations whose .mmap_prepare() hook is specified.
- * @file: The file which backs or will back the mapping.
- * @vma: The VMA to apply the .mmap_prepare() hook to.
- * Returns: 0 on success or error.
- */
-int __compat_vma_mmap(const struct file_operations *f_op,
-		struct file *file, struct vm_area_struct *vma)
-{
-	struct vm_area_desc desc = {
-		.mm = vma->vm_mm,
-		.file = file,
-		.start = vma->vm_start,
-		.end = vma->vm_end,
-
-		.pgoff = vma->vm_pgoff,
-		.vm_file = vma->vm_file,
-		.vma_flags = vma->flags,
-		.page_prot = vma->vm_page_prot,
-
-		.action.type = MMAP_NOTHING, /* Default */
-	};
-	int err;
-
-	err = f_op->mmap_prepare(&desc);
-	if (err)
-		return err;
-
-	mmap_action_prepare(&desc.action, &desc);
-	set_vma_from_desc(vma, &desc);
-	return mmap_action_complete(&desc.action, vma);
-}
-EXPORT_SYMBOL(__compat_vma_mmap);
-
 /**
  * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an
  * existing VMA and execute any requested actions.
@@ -1228,7 +1191,31 @@ EXPORT_SYMBOL(__compat_vma_mmap);
  */
 int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	return __compat_vma_mmap(file->f_op, file, vma);
+	struct vm_area_desc desc = {
+		.mm = vma->vm_mm,
+		.file = file,
+		.start = vma->vm_start,
+		.end = vma->vm_end,
+
+		.pgoff = vma->vm_pgoff,
+		.vm_file = vma->vm_file,
+		.vma_flags = vma->flags,
+		.page_prot = vma->vm_page_prot,
+
+		.action.type = MMAP_NOTHING, /* Default */
+	};
+	int err;
+
+	err = vfs_mmap_prepare(file, &desc);
+	if (err)
+		return err;
+
+	err = mmap_action_prepare(&desc);
+	if (err)
+		return err;
+
+	set_vma_from_desc(vma, &desc);
+	return mmap_action_complete(vma, &desc.action);
 }
 EXPORT_SYMBOL(compat_vma_mmap);
 
@@ -1320,8 +1307,8 @@ void snapshot_page(struct page_snapshot *ps, const struct page *page)
 	}
 }
 
-static int mmap_action_finish(struct mmap_action *action,
-		const struct vm_area_struct *vma, int err)
+static int mmap_action_finish(struct vm_area_struct *vma,
+			      struct mmap_action *action, int err)
 {
 	/*
 	 * If an error occurs, unmap the VMA altogether and return an error. We
@@ -1353,37 +1340,38 @@ static int mmap_action_finish(struct mmap_action *action,
 /**
  * mmap_action_prepare - Perform preparatory setup for an VMA descriptor
  * action which need to be performed.
- * @desc: The VMA descriptor to prepare for @action.
- * @action: The action to perform.
+ * @desc: The VMA descriptor to prepare for its @desc->action.
+ *
+ * Returns: %0 on success, otherwise error.
  */
-void mmap_action_prepare(struct mmap_action *action,
-			 struct vm_area_desc *desc)
+int mmap_action_prepare(struct vm_area_desc *desc)
 {
-	switch (action->type) {
+	switch (desc->action.type) {
 	case MMAP_NOTHING:
-		break;
+		return 0;
 	case MMAP_REMAP_PFN:
-		remap_pfn_range_prepare(desc, action->remap.start_pfn);
-		break;
+		return remap_pfn_range_prepare(desc);
 	case MMAP_IO_REMAP_PFN:
-		io_remap_pfn_range_prepare(desc, action->remap.start_pfn,
-					   action->remap.size);
-		break;
+		return io_remap_pfn_range_prepare(desc);
 	}
+
+	WARN_ON_ONCE(1);
+	return -EINVAL;
 }
 EXPORT_SYMBOL(mmap_action_prepare);
 
 /**
  * mmap_action_complete - Execute VMA descriptor action.
- * @action: The action to perform.
  * @vma: The VMA to perform the action upon.
+ * @action: The action to perform.
  *
  * Similar to mmap_action_prepare().
  *
  * Return: 0 on success, or error, at which point the VMA will be unmapped.
  */
-int mmap_action_complete(struct mmap_action *action,
-			 struct vm_area_struct *vma)
+int mmap_action_complete(struct vm_area_struct *vma,
+			 struct mmap_action *action)
+
 {
 	int err = 0;
 
@@ -1391,25 +1379,22 @@ int mmap_action_complete(struct mmap_action *action,
 	case MMAP_NOTHING:
 		break;
 	case MMAP_REMAP_PFN:
-		err = remap_pfn_range_complete(vma, action->remap.start,
-				action->remap.start_pfn, action->remap.size,
-				action->remap.pgprot);
+		err = remap_pfn_range_complete(vma, action);
 		break;
 	case MMAP_IO_REMAP_PFN:
-		err = io_remap_pfn_range_complete(vma, action->remap.start,
-				action->remap.start_pfn, action->remap.size,
-				action->remap.pgprot);
+		/* Should have been delegated. */
+		WARN_ON_ONCE(1);
+		err = -EINVAL;
 		break;
 	}
 
-	return mmap_action_finish(action, vma, err);
+	return mmap_action_finish(vma, action, err);
 }
 EXPORT_SYMBOL(mmap_action_complete);
 #else
-void mmap_action_prepare(struct mmap_action *action,
-			struct vm_area_desc *desc)
+int mmap_action_prepare(struct vm_area_desc *desc)
 {
-	switch (action->type) {
+	switch (desc->action.type) {
 	case MMAP_NOTHING:
 		break;
 	case MMAP_REMAP_PFN:
@@ -1417,11 +1402,13 @@ void mmap_action_prepare(struct mmap_action *action,
 		WARN_ON_ONCE(1); /* nommu cannot handle these. */
 		break;
 	}
+
+	return 0;
 }
 EXPORT_SYMBOL(mmap_action_prepare);
 
-int mmap_action_complete(struct mmap_action *action,
-			struct vm_area_struct *vma)
+int mmap_action_complete(struct vm_area_struct *vma,
+			 struct mmap_action *action)
 {
 	int err = 0;
 
@@ -1436,7 +1423,7 @@ int mmap_action_complete(struct mmap_action *action,
 		break;
 	}
 
-	return mmap_action_finish(action, vma, err);
+	return mmap_action_finish(vma, action, err);
 }
 EXPORT_SYMBOL(mmap_action_complete);
 #endif
diff --git a/mm/vma.c b/mm/vma.c
index 8cccaeb8ccbb..ee91f2b76acf 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -2641,15 +2641,18 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
 	vma_set_page_prot(vma);
 }
 
-static void call_action_prepare(struct mmap_state *map,
-				struct vm_area_desc *desc)
+static int call_action_prepare(struct mmap_state *map,
+			       struct vm_area_desc *desc)
 {
-	struct mmap_action *action = &desc->action;
+	int err;
 
-	mmap_action_prepare(action, desc);
+	err = mmap_action_prepare(desc);
+	if (err)
+		return err;
 
-	if (action->hide_from_rmap_until_complete)
+	if (desc->action.hide_from_rmap_until_complete)
 		map->hold_file_rmap_lock = true;
+	return 0;
 }
 
 /*
@@ -2673,7 +2676,9 @@ static int call_mmap_prepare(struct mmap_state *map,
 	if (err)
 		return err;
 
-	call_action_prepare(map, desc);
+	err = call_action_prepare(map, desc);
+	if (err)
+		return err;
 
 	/* Update fields permitted to be changed. */
 	map->pgoff = desc->pgoff;
@@ -2728,13 +2733,12 @@ static bool can_set_ksm_flags_early(struct mmap_state *map)
 }
 
 static int call_action_complete(struct mmap_state *map,
-				struct vm_area_desc *desc,
+				struct mmap_action *action,
 				struct vm_area_struct *vma)
 {
-	struct mmap_action *action = &desc->action;
 	int ret;
 
-	ret = mmap_action_complete(action, vma);
+	ret = mmap_action_complete(vma, action);
 
 	/* If we held the file rmap we need to release it. */
 	if (map->hold_file_rmap_lock) {
@@ -2796,7 +2800,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
 	__mmap_complete(&map, vma);
 
 	if (have_mmap_prepare && allocated_new) {
-		error = call_action_complete(&map, &desc, vma);
+		error = call_action_complete(&map, &desc.action, vma);
 
 		if (error)
 			return error;
diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
index b69eefba4cf7..477a5be65dd2 100644
--- a/tools/testing/vma/include/dup.h
+++ b/tools/testing/vma/include/dup.h
@@ -1273,9 +1273,12 @@ static inline int __compat_vma_mmap(const struct file_operations *f_op,
 	if (err)
 		return err;
 
-	mmap_action_prepare(&desc.action, &desc);
+	err = mmap_action_prepare(&desc);
+	if (err)
+		return err;
+
 	set_vma_from_desc(vma, &desc);
-	return mmap_action_complete(&desc.action, vma);
+	return mmap_action_complete(vma, &desc.action);
 }
 
 static inline int compat_vma_mmap(struct file *file,
diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h
index 5afb0afe2d48..a30b8bc84955 100644
--- a/tools/testing/vma/include/stubs.h
+++ b/tools/testing/vma/include/stubs.h
@@ -81,13 +81,13 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma)
 {
 }
 
-static inline void mmap_action_prepare(struct mmap_action *action,
-					   struct vm_area_desc *desc)
+static inline int mmap_action_prepare(struct vm_area_desc *desc)
 {
+	return 0;
 }
 
-static inline int mmap_action_complete(struct mmap_action *action,
-					   struct vm_area_struct *vma)
+static inline int mmap_action_complete(struct vm_area_struct *vma,
+				       struct mmap_action *action)
 {
 	return 0;
 }
-- 
2.53.0


^ permalink raw reply related

* [PATCH v4 00/21] mm: expand mmap_prepare functionality and usage
From: Lorenzo Stoakes (Oracle) @ 2026-03-20 22:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
	Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
	Alexandre Torgue, Miquel Raynal, Richard Weinberger,
	Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
	David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
	Jan Kara, David Hildenbrand, Liam R . Howlett, Vlastimil Babka,
	Mike Rapoport, Suren Baghdasaryan, Michal Hocko, Jann Horn,
	Pedro Falcato, linux-kernel, linux-doc, linux-hyperv, linux-stm32,
	linux-arm-kernel, linux-mtd, linux-staging, linux-scsi,
	target-devel, linux-afs, linux-fsdevel, linux-mm, Ryan Roberts

This series expands the mmap_prepare functionality, which is intended to
replace the deprecated f_op->mmap hook which has been the source of bugs
and security issues for some time.

This series starts with some cleanup of existing mmap_prepare logic, then
adds documentation for the mmap_prepare call to make it easier for
filesystem and driver writers to understand how it works.

It then importantly adds a vm_ops->mapped hook, a key feature that was
missing from mmap_prepare previously - this is invoked when a driver which
specifies mmap_prepare has successfully been mapped but not merged with
another VMA.

mmap_prepare is invoked prior to a merge being attempted, so you cannot
manipulate state such as reference counts as if it were a new mapping.

The vm_ops->mapped hook allows a driver to perform tasks required at this
stage, and provides symmetry against subsequent vm_ops->open,close calls.

The series uses this to correct the afs implementation which wrongly
manipulated reference count at mmap_prepare time.

It then adds an mmap_prepare equivalent of vm_iomap_memory() -
mmap_action_simple_ioremap(), then uses this to update a number of drivers.

It then splits out the mmap_prepare compatibility layer (which allows for
invocation of mmap_prepare hooks in an mmap() hook) in such a way as to
allow for more incremental implementation of mmap_prepare hooks.

It then uses this to extend mmap_prepare usage in drivers.

Finally it adds an mmap_prepare equivalent of vm_map_pages(), which lays
the foundation for future work which will extend mmap_prepare to DMA
coherent mappings.

v4:
* Added partial revert of AFS as per Vlasta. Labelled as hotfix.
* Updated subsequent afs patch to apply against this version of AFS.
* Reverted rmap_lock_held changes to util.c, mm.h mmap_action_complete()
  etc. as per Vlasta.
* Added hotfix to fix issue with rmap lock held over munmap() as per
  Vlasta. Labelled as hotfix.
* Force-disable the rmap lock hold feature in the compatibility layer
  because being run under the mmap hook eliminates the need for it.
* Removed superfluous map->hold_file_rmap_lock field.
* Moved handling of rmap lock and unmapping to mmap_action_complete().
* Removed unmap_vma_locked() as previous added patches render it
  unnecessary.
* Removed __compat_vma_mapped() from compatibility layer and
  call_vma_mapped() from VMA layer and made it part of mmap_action_finish()
  for all callers.
* Propagated changes to VMA tests.
* Updated mmap_action_map_kernel_pages[_full]() patch to add missing
  mmap_complete() noop switch enum value as per Nathan.
* Fixed a doc issue in the mmap_prepare docs - reference
  vma_desc_test_flags() rather than _any().
* Rearranged logic so the vm_ops->mapped hook is called before the success
  hook, but this should have no impact.

v3:
* Propagated tags (thanks Suren, Richard!)
* Updated 12/16 to correctly clear the vm_area_desc data structure in
  set_desc_from_vma() as per Joshua Hahn (thanks! :)
* Fixed type in 12/16 as per Suren (cheers!)
* Fixed up 6/16 to use mmap_action_ioremap_full() in simple_ioremap_prepare() as
  suggested by Suren.
* Also fixed up 6/16 to call io_remap_pfn_range_prepare() direct rather than
  mmap_action_prepare() as per Suren.
* Also fixed up 6/16 to pass vm_len rather than vm_[start, end] to
  __simple_ioremap_prep() as per Suren (thanks for all the above! :)
* Fixed issue in rmap lock being held - we were referencing a vma->vm_file after
  the VMA was unmapped, so UAF. Avoid that. Also do_munmap() relies on rmap lock
  NOT being held or may deadlock, so extend functionality to ensure we drop it
  when it is held on error paths.
* Updated 'area' -> 'vma' variable in 3/16 in VMA test dup.h.
* Fixed up reference to __compat_vma_mmap() in 12/16 commit message.
* Updated 1/16 to no longer duplicatively apply io_remap_pfn_range_pfn().
* Updated 1/16 to delegate I/O remap complete to remap complete logic.
* Fixed various typos in 12/16.
* Fixed stale comment typos in 13/16.
* Fixed commit msg and comment typos in 14/16.
* Removed accidental sneak peek to future functionality in 15/16 commit message
  :).
* Fixed up field names to be identical in VMA tests + mm_types.h in 6/16,
  15/16.
https://lore.kernel.org/all/cover.1773944114.git.ljs@kernel.org/

v2:
* Rebased on
  https://lore.kernel.org/all/cover.1773665966.git.ljs@kernel.org/ to make
  Andrew's life easier :)
* Folded all interim fixes into series (thanks Randy for many doc fixes!)
* As per Suren, removed a comment about allocations too small to fail.
* As per Randy, fixed up typo in documentation for vm_area_desc.
* Fixed mmap_action_prepare() not returning if invalid action->type
  specified, as updated from Andrew's interim fix (thanks!) and also
  reported by kernel test bot.
* Updated mmap_action_prepare() and specific prepare functions to only
  pass vm_area_desc parameter as per Suren.
* Fixed up whitespace as per Suren.
* Updated vm_op->open comment in vm_operations_struct to reference forking
  as per Suren.
* Added a commit to check that input range is within VMA on remap as per
  Suren (this also covers I/O remap and all other cases already asserted).
* Updated AFS to not incorrectly reference count on mmap prepare as per
  Usama.
* Also updated various static AFS functions to be consistent with each
  other.
* Updated AFS commit message to reflect mmap_prepare being before any VMA
  merging as per Suren.
* Updated __compat_vma_mapped() to check for NULL vm_ops as per Usama.
* Updated __compat_vma_mapped() to not reference an unmapped VMA's fields
  as per Usama.
* Updated __vma_check_mmap_hook() to check for NULL vm_ops as per Usama.
* Dropped comment about preferring mmap_prepare as seems overly confusing,
  as per Suren.
* Updated the mmap lock assert in unmap_vma_locked() to a write lock assert
  as per Suren.
* Copied vm_ops->open comment over to VMA tests in appropriate patch as per
  Suren.
* Updated mmap_prepare documentation to reflect the fact that no resources
  should be allocated upon mmap_prepare.
* Updated mmap_prepare documentation to reference the vm_ops->mapped
  callback.
* Fixed stray markdown '## How to use' in documentation.
* Fixed bug reported by kernel test bot re: overlooked
  vma_desc_test_flags() -> vma_desc_test() in MTD driver for nommu.
https://lore.kernel.org/linux-mm/cover.1773695307.git.ljs@kernel.org/

v1:
https://lore.kernel.org/linux-mm/cover.1773346620.git.ljs@kernel.org/

Lorenzo Stoakes (Oracle) (21):
  mm: various small mmap_prepare cleanups
  mm: add documentation for the mmap_prepare file operation callback
  mm: document vm_operations_struct->open the same as close()
  mm: avoid deadlock when holding rmap on mmap_prepare error
  mm: switch the rmap lock held option off in compat layer
  mm/vma: remove superfluous map->hold_file_rmap_lock
  mm: have mmap_action_complete() handle the rmap lock and unmap
  mm: add vm_ops->mapped hook
  fs: afs: revert mmap_prepare() change
  fs: afs: restore mmap_prepare implementation
  mm: add mmap_action_simple_ioremap()
  misc: open-dice: replace deprecated mmap hook with mmap_prepare
  hpet: replace deprecated mmap hook with mmap_prepare
  mtdchar: replace deprecated mmap hook with mmap_prepare, clean up
  stm: replace deprecated mmap hook with mmap_prepare
  staging: vme_user: replace deprecated mmap hook with mmap_prepare
  mm: allow handling of stacked mmap_prepare hooks in more drivers
  drivers: hv: vmbus: replace deprecated mmap hook with mmap_prepare
  uio: replace deprecated mmap hook with mmap_prepare in uio_info
  mm: add mmap_action_map_kernel_pages[_full]()
  mm: on remap assert that input range within the proposed VMA

 Documentation/driver-api/vme.rst           |   2 +-
 Documentation/filesystems/index.rst        |   1 +
 Documentation/filesystems/mmap_prepare.rst | 168 ++++++++++++++
 drivers/char/hpet.c                        |  12 +-
 drivers/hv/hyperv_vmbus.h                  |   4 +-
 drivers/hv/vmbus_drv.c                     |  31 ++-
 drivers/hwtracing/stm/core.c               |  31 ++-
 drivers/misc/open-dice.c                   |  19 +-
 drivers/mtd/mtdchar.c                      |  21 +-
 drivers/staging/vme_user/vme.c             |  20 +-
 drivers/staging/vme_user/vme.h             |   2 +-
 drivers/staging/vme_user/vme_user.c        |  51 +++--
 drivers/target/target_core_user.c          |  26 ++-
 drivers/uio/uio.c                          |  10 +-
 drivers/uio/uio_hv_generic.c               |  11 +-
 fs/afs/file.c                              |  36 ++-
 include/linux/fs.h                         |  14 +-
 include/linux/hyperv.h                     |   4 +-
 include/linux/mm.h                         | 158 ++++++++++++-
 include/linux/mm_types.h                   |  17 +-
 include/linux/uio_driver.h                 |   4 +-
 mm/internal.h                              |  46 +++-
 mm/memory.c                                | 175 ++++++++++----
 mm/util.c                                  | 251 ++++++++++++++-------
 mm/vma.c                                   |  48 ++--
 mm/vma.h                                   |   2 +-
 tools/testing/vma/include/dup.h            | 134 +++++++----
 tools/testing/vma/include/stubs.h          |   8 +-
 28 files changed, 956 insertions(+), 350 deletions(-)
 create mode 100644 Documentation/filesystems/mmap_prepare.rst

--
2.53.0

^ permalink raw reply

* Re: [PATCH v3 15/16] mm: add mmap_action_map_kernel_pages[_full]()
From: Lorenzo Stoakes (Oracle) @ 2026-03-20 22:14 UTC (permalink / raw)
  To: Nathan Chancellor
  Cc: Andrew Morton, Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
	Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
	Alexandre Torgue, Miquel Raynal, Richard Weinberger,
	Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
	David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
	Jan Kara, David Hildenbrand, Liam R . Howlett, Vlastimil Babka,
	Mike Rapoport, Suren Baghdasaryan, Michal Hocko, Jann Horn,
	Pedro Falcato, linux-kernel, linux-doc, linux-hyperv, linux-stm32,
	linux-arm-kernel, linux-mtd, linux-staging, linux-scsi,
	target-devel, linux-afs, linux-fsdevel, linux-mm, Ryan Roberts
In-Reply-To: <20260320210812.GA3988975@ax162>

On Fri, Mar 20, 2026 at 02:08:12PM -0700, Nathan Chancellor wrote:
> Hi Lorenzo,
>
> On Thu, Mar 19, 2026 at 06:23:39PM +0000, Lorenzo Stoakes (Oracle) wrote:
> > A user can invoke mmap_action_map_kernel_pages() to specify that the
> > mapping should map kernel pages starting from desc->start of a specified
> > number of pages specified in an array.
> >
> > In order to implement this, adjust mmap_action_prepare() to be able to
> > return an error code, as it makes sense to assert that the specified
> > parameters are valid as quickly as possible as well as updating the VMA
> > flags to include VMA_MIXEDMAP_BIT as necessary.
> >
> > This provides an mmap_prepare equivalent of vm_insert_pages().  We
> > additionally update the existing vm_insert_pages() code to use
> > range_in_vma() and add a new range_in_vma_desc() helper function for the
> > mmap_prepare case, sharing the code between the two in range_is_subset().
> >
> > We add both mmap_action_map_kernel_pages() and
> > mmap_action_map_kernel_pages_full() to allow for both partial and full VMA
> > mappings.
> >
> > We update the documentation to reflect the new features.
> >
> > Finally, we update the VMA tests accordingly to reflect the changes.
> >
> > Reviewed-by: Suren Baghdasaryan <surenb@google.com>
> > Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> ...
> > diff --git a/mm/util.c b/mm/util.c
> > index 8cf59267a9ac..682d0d24e1c6 100644
> > --- a/mm/util.c
> > +++ b/mm/util.c
> > @@ -1446,6 +1446,8 @@ int mmap_action_prepare(struct vm_area_desc *desc)
> >  		return io_remap_pfn_range_prepare(desc);
> >  	case MMAP_SIMPLE_IO_REMAP:
> >  		return simple_ioremap_prepare(desc);
> > +	case MMAP_MAP_KERNEL_PAGES:
> > +		return map_kernel_pages_prepare(desc);
> >  	}
> >
> >  	WARN_ON_ONCE(1);
> > @@ -1476,6 +1478,9 @@ int mmap_action_complete(struct vm_area_struct *vma,
> >  	case MMAP_REMAP_PFN:
> >  		err = remap_pfn_range_complete(vma, action);
> >  		break;
> > +	case MMAP_MAP_KERNEL_PAGES:
> > +		err = map_kernel_pages_complete(vma, action);
> > +		break;
> >  	case MMAP_IO_REMAP_PFN:
> >  	case MMAP_SIMPLE_IO_REMAP:
> >  		/* Should have been delegated. */
> > @@ -1497,6 +1502,7 @@ int mmap_action_prepare(struct vm_area_desc *desc)
> >  	case MMAP_REMAP_PFN:
> >  	case MMAP_IO_REMAP_PFN:
> >  	case MMAP_SIMPLE_IO_REMAP:
> > +	case MMAP_MAP_KERNEL_PAGES:
> >  		WARN_ON_ONCE(1); /* nommu cannot handle these. */
> >  		break;
> >  	}
>
> Not sure if it has been reported/addressed yet but it looks like
> mmap_action_complete() was missed here, as pointed out by clang:
>
>   $ make -skj"$(nproc)" ARCH=arm LLVM=1 mrproper allnoconfig mm/util.o
>   mm/util.c:1520:10: warning: enumeration value 'MMAP_MAP_KERNEL_PAGES' not handled in switch [-Wswitch]
>    1520 |         switch (action->type) {
>         |                 ^~~~~~~~~~~~
>
> I assume
>
> diff --git a/mm/util.c b/mm/util.c
> index 682d0d24e1c6..c41c119a5a74 100644
> --- a/mm/util.c
> +++ b/mm/util.c
> @@ -1523,6 +1523,7 @@ int mmap_action_complete(struct vm_area_struct *vma,
>  	case MMAP_REMAP_PFN:
>  	case MMAP_IO_REMAP_PFN:
>  	case MMAP_SIMPLE_IO_REMAP:
> +	case MMAP_MAP_KERNEL_PAGES:
>  		WARN_ON_ONCE(1); /* nommu cannot handle this. */
>
>  		err = -EINVAL;
> --
>
> should be the fix?
>
> Cheers,
> Nathan

Thanks, will fix, working on a respin now anyway :)

Cheers, Lorenzo

^ permalink raw reply

* Re: [PATCH v3 15/16] mm: add mmap_action_map_kernel_pages[_full]()
From: Nathan Chancellor @ 2026-03-20 21:08 UTC (permalink / raw)
  To: Lorenzo Stoakes (Oracle)
  Cc: Andrew Morton, Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
	Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
	Alexandre Torgue, Miquel Raynal, Richard Weinberger,
	Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
	David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
	Jan Kara, David Hildenbrand, Liam R . Howlett, Vlastimil Babka,
	Mike Rapoport, Suren Baghdasaryan, Michal Hocko, Jann Horn,
	Pedro Falcato, linux-kernel, linux-doc, linux-hyperv, linux-stm32,
	linux-arm-kernel, linux-mtd, linux-staging, linux-scsi,
	target-devel, linux-afs, linux-fsdevel, linux-mm, Ryan Roberts
In-Reply-To: <54ff3670662e10a66ce0c1a13c0ae93b99a5f201.1773944114.git.ljs@kernel.org>

Hi Lorenzo,

On Thu, Mar 19, 2026 at 06:23:39PM +0000, Lorenzo Stoakes (Oracle) wrote:
> A user can invoke mmap_action_map_kernel_pages() to specify that the
> mapping should map kernel pages starting from desc->start of a specified
> number of pages specified in an array.
> 
> In order to implement this, adjust mmap_action_prepare() to be able to
> return an error code, as it makes sense to assert that the specified
> parameters are valid as quickly as possible as well as updating the VMA
> flags to include VMA_MIXEDMAP_BIT as necessary.
> 
> This provides an mmap_prepare equivalent of vm_insert_pages().  We
> additionally update the existing vm_insert_pages() code to use
> range_in_vma() and add a new range_in_vma_desc() helper function for the
> mmap_prepare case, sharing the code between the two in range_is_subset().
> 
> We add both mmap_action_map_kernel_pages() and
> mmap_action_map_kernel_pages_full() to allow for both partial and full VMA
> mappings.
> 
> We update the documentation to reflect the new features.
> 
> Finally, we update the VMA tests accordingly to reflect the changes.
> 
> Reviewed-by: Suren Baghdasaryan <surenb@google.com>
> Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
...
> diff --git a/mm/util.c b/mm/util.c
> index 8cf59267a9ac..682d0d24e1c6 100644
> --- a/mm/util.c
> +++ b/mm/util.c
> @@ -1446,6 +1446,8 @@ int mmap_action_prepare(struct vm_area_desc *desc)
>  		return io_remap_pfn_range_prepare(desc);
>  	case MMAP_SIMPLE_IO_REMAP:
>  		return simple_ioremap_prepare(desc);
> +	case MMAP_MAP_KERNEL_PAGES:
> +		return map_kernel_pages_prepare(desc);
>  	}
>  
>  	WARN_ON_ONCE(1);
> @@ -1476,6 +1478,9 @@ int mmap_action_complete(struct vm_area_struct *vma,
>  	case MMAP_REMAP_PFN:
>  		err = remap_pfn_range_complete(vma, action);
>  		break;
> +	case MMAP_MAP_KERNEL_PAGES:
> +		err = map_kernel_pages_complete(vma, action);
> +		break;
>  	case MMAP_IO_REMAP_PFN:
>  	case MMAP_SIMPLE_IO_REMAP:
>  		/* Should have been delegated. */
> @@ -1497,6 +1502,7 @@ int mmap_action_prepare(struct vm_area_desc *desc)
>  	case MMAP_REMAP_PFN:
>  	case MMAP_IO_REMAP_PFN:
>  	case MMAP_SIMPLE_IO_REMAP:
> +	case MMAP_MAP_KERNEL_PAGES:
>  		WARN_ON_ONCE(1); /* nommu cannot handle these. */
>  		break;
>  	}

Not sure if it has been reported/addressed yet but it looks like
mmap_action_complete() was missed here, as pointed out by clang:

  $ make -skj"$(nproc)" ARCH=arm LLVM=1 mrproper allnoconfig mm/util.o
  mm/util.c:1520:10: warning: enumeration value 'MMAP_MAP_KERNEL_PAGES' not handled in switch [-Wswitch]
   1520 |         switch (action->type) {
        |                 ^~~~~~~~~~~~

I assume

diff --git a/mm/util.c b/mm/util.c
index 682d0d24e1c6..c41c119a5a74 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1523,6 +1523,7 @@ int mmap_action_complete(struct vm_area_struct *vma,
 	case MMAP_REMAP_PFN:
 	case MMAP_IO_REMAP_PFN:
 	case MMAP_SIMPLE_IO_REMAP:
+	case MMAP_MAP_KERNEL_PAGES:
 		WARN_ON_ONCE(1); /* nommu cannot handle this. */
 
 		err = -EINVAL;
--

should be the fix?

Cheers,
Nathan

^ permalink raw reply related

* Re: [PATCH v3 04/16] mm: add vm_ops->mapped hook
From: Lorenzo Stoakes (Oracle) @ 2026-03-20 20:43 UTC (permalink / raw)
  To: Vlastimil Babka (SUSE)
  Cc: Andrew Morton, Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
	Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
	Alexandre Torgue, Miquel Raynal, Richard Weinberger,
	Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
	David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
	Jan Kara, David Hildenbrand, Liam R . Howlett, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Jann Horn, Pedro Falcato,
	linux-kernel, linux-doc, linux-hyperv, linux-mtd, linux-staging,
	linux-scsi, target-devel, linux-afs, linux-fsdevel, linux-mm,
	Ryan Roberts
In-Reply-To: <c9068fbb-a23c-4342-8638-3b11897a57cb@kernel.org>

On Fri, Mar 20, 2026 at 07:26:37PM +0100, Vlastimil Babka (SUSE) wrote:
> On 3/19/26 19:23, Lorenzo Stoakes (Oracle) wrote:
> > Previously, when a driver needed to do something like establish a
> > reference count, it could do so in the mmap hook in the knowledge that the
> > mapping would succeed.
> >
> > With the introduction of f_op->mmap_prepare this is no longer the case, as
> > it is invoked prior to actually establishing the mapping.
> >
> > mmap_prepare is not appropriate for this kind of thing as it is called
> > before any merge might take place, and after which an error might occur
> > meaning resources could be leaked.
> >
> > To take this into account, introduce a new vm_ops->mapped callback which
> > is invoked when the VMA is first mapped (though notably - not when it is
> > merged - which is correct and mirrors existing mmap/open/close behaviour).
> >
> > > We do better than vm_ops->open() here, as this callback can return an
> > error, at which point the VMA will be unmapped.
> >
> > Note that vm_ops->mapped() is invoked after any mmap action is complete
> > (such as I/O remapping).
> >
> > We intentionally do not expose the VMA at this point, exposing only the
> > fields that could be used, and an output parameter in case the operation
> > needs to update the vma->vm_private_data field.
> >
> > In order to deal with stacked filesystems which invoke inner filesystem's
> > mmap() invocations, add __compat_vma_mapped() and invoke it on vfs_mmap()
> > (via compat_vma_mmap()) to ensure that the mapped callback is handled when
> > an mmap() caller invokes a nested filesystem's mmap_prepare() callback.
> >
> > We can now also remove call_action_complete() and invoke
> > mmap_action_complete() directly, as we separate out the rmap lock logic.
> >
> > The rmap lock logic, which was added in order to keep hugetlb working (!)
> > to allow for the rmap lock to be held longer, needs to be propagated to the
> > error paths on mmap complete and mapped hook error paths.
> >
> > This is because do_munmap() might otherwise deadlock with the rmap being
> > held, so instead we unlock at the point of unmap.
>
> Hmm but that was also true prior to this series? So is this a bugfix? Should
> it be a stable hotfix done outside of the series before the refactoring?

Yup, will send a hotfix.

Thanks, Lorenzo

^ permalink raw reply

* [PATCH ethtool-next] netlink: settings: add netlink support for RX CQE Coalescing params
From: Haiyang Zhang @ 2026-03-20 20:31 UTC (permalink / raw)
  To: mkubecek, linux-hyperv, netdev; +Cc: haiyangz, paulros

From: Haiyang Zhang <haiyangz@microsoft.com>

Add support to get/set RX CQE Coalescing parameters, including the max frames
and time out value in nanoseconds.

(Headers: dc3d720e12f6 "net: ethtool: add ethtool COALESCE_RX_CQE_FRAMES/NSECS")

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 ethtool.8.in                  |  2 ++
 ethtool.c                     |  2 ++
 netlink/coalesce.c            | 17 +++++++++++++++++
 netlink/desc-ethtool.c        |  2 ++
 shell-completion/bash/ethtool |  2 ++
 5 files changed, 25 insertions(+)

diff --git a/ethtool.8.in b/ethtool.8.in
index e10a252..fe3c0ec 100644
--- a/ethtool.8.in
+++ b/ethtool.8.in
@@ -198,6 +198,8 @@ ethtool \- query or control network driver and hardware settings
 .BN tx\-aggr\-max\-bytes
 .BN tx\-aggr\-max\-frames
 .BN tx\-aggr\-time\-usecs
+.BN rx\-cqe\-frames
+.BN rx\-cqe\-nsecs
 .HP
 .B ethtool \-g|\-\-show\-ring
 .I devname
diff --git a/ethtool.c b/ethtool.c
index c9c1502..2444d85 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -5925,6 +5925,8 @@ static const struct option args[] = {
 			  "		[tx-aggr-max-bytes N]\n"
 			  "		[tx-aggr-max-frames N]\n"
 			  "		[tx-aggr-time-usecs N]\n"
+			  "		[rx-cqe-frames N]\n"
+			  "		[rx-cqe-nsecs N]\n"
 	},
 	{
 		.opts	= "-g|--show-ring",
diff --git a/netlink/coalesce.c b/netlink/coalesce.c
index bc8b57b..f36b8e8 100644
--- a/netlink/coalesce.c
+++ b/netlink/coalesce.c
@@ -96,6 +96,11 @@ int coalesce_reply_cb(const struct nlmsghdr *nlhdr, void *data)
 	show_u32("tx-aggr-time-usecs", "tx-aggr-time-usecs:\t",
 		 tb[ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS]);
 	show_cr();
+	show_u32("rx-cqe-frames", "rx-cqe-frames:\t\t",
+		 tb[ETHTOOL_A_COALESCE_RX_CQE_FRAMES]);
+	show_u32("rx-cqe-nsecs", "rx-cqe-nsecs:\t\t",
+		 tb[ETHTOOL_A_COALESCE_RX_CQE_NSECS]);
+	show_cr();
 
 	close_json_object();
 
@@ -292,6 +297,18 @@ static const struct param_parser scoalesce_params[] = {
 		.handler	= nl_parse_direct_u32,
 		.min_argc	= 1,
 	},
+	{
+		.arg		= "rx-cqe-frames",
+		.type		= ETHTOOL_A_COALESCE_RX_CQE_FRAMES,
+		.handler	= nl_parse_direct_u32,
+		.min_argc	= 1,
+	},
+	{
+		.arg		= "rx-cqe-nsecs",
+		.type		= ETHTOOL_A_COALESCE_RX_CQE_NSECS,
+		.handler	= nl_parse_direct_u32,
+		.min_argc	= 1,
+	},
 	{}
 };
 
diff --git a/netlink/desc-ethtool.c b/netlink/desc-ethtool.c
index 8289190..08d94de 100644
--- a/netlink/desc-ethtool.c
+++ b/netlink/desc-ethtool.c
@@ -249,6 +249,8 @@ static const struct pretty_nla_desc __coalesce_desc[] = {
 	NLATTR_DESC_U32(ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS),
 	NLATTR_DESC_NESTED(ETHTOOL_A_COALESCE_RX_PROFILE, profile),
 	NLATTR_DESC_NESTED(ETHTOOL_A_COALESCE_TX_PROFILE, profile),
+	NLATTR_DESC_U32(ETHTOOL_A_COALESCE_RX_CQE_FRAMES),
+	NLATTR_DESC_U32(ETHTOOL_A_COALESCE_RX_CQE_NSECS),
 };
 
 static const struct pretty_nla_desc __pause_stats_desc[] = {
diff --git a/shell-completion/bash/ethtool b/shell-completion/bash/ethtool
index 3c775a1..57c39c4 100644
--- a/shell-completion/bash/ethtool
+++ b/shell-completion/bash/ethtool
@@ -259,6 +259,8 @@ _ethtool_coalesce()
 		[tx-aggr-max-bytes]=1
 		[tx-aggr-max-frames]=1
 		[tx-aggr-time-usecs]=1
+		[rx-cqe-frames]=1
+		[rx-cqe-nsecs]=1
 	)
 
 	case "$prev" in
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH v3 04/16] mm: add vm_ops->mapped hook
From: Lorenzo Stoakes (Oracle) @ 2026-03-20 19:57 UTC (permalink / raw)
  To: Vlastimil Babka (SUSE)
  Cc: Andrew Morton, Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
	Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
	Alexandre Torgue, Miquel Raynal, Richard Weinberger,
	Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
	David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
	Jan Kara, David Hildenbrand, Liam R . Howlett, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Jann Horn, Pedro Falcato,
	linux-kernel, linux-doc, linux-hyperv, linux-mtd, linux-staging,
	linux-scsi, target-devel, linux-afs, linux-fsdevel, linux-mm,
	Ryan Roberts
In-Reply-To: <c9068fbb-a23c-4342-8638-3b11897a57cb@kernel.org>

On Fri, Mar 20, 2026 at 07:26:37PM +0100, Vlastimil Babka (SUSE) wrote:
> On 3/19/26 19:23, Lorenzo Stoakes (Oracle) wrote:
> > Previously, when a driver needed to do something like establish a
> > reference count, it could do so in the mmap hook in the knowledge that the
> > mapping would succeed.
> >
> > With the introduction of f_op->mmap_prepare this is no longer the case, as
> > it is invoked prior to actually establishing the mapping.
> >
> > mmap_prepare is not appropriate for this kind of thing as it is called
> > before any merge might take place, and after which an error might occur
> > meaning resources could be leaked.
> >
> > To take this into account, introduce a new vm_ops->mapped callback which
> > is invoked when the VMA is first mapped (though notably - not when it is
> > merged - which is correct and mirrors existing mmap/open/close behaviour).
> >
> > We do better than vm_ops->open() here, as this callback can return an
> > error, at which point the VMA will be unmapped.
> >
> > Note that vm_ops->mapped() is invoked after any mmap action is complete
> > (such as I/O remapping).
> >
> > We intentionally do not expose the VMA at this point, exposing only the
> > fields that could be used, and an output parameter in case the operation
> > needs to update the vma->vm_private_data field.
> >
> > In order to deal with stacked filesystems which invoke inner filesystem's
> > mmap() invocations, add __compat_vma_mapped() and invoke it on vfs_mmap()
> > (via compat_vma_mmap()) to ensure that the mapped callback is handled when
> > an mmap() caller invokes a nested filesystem's mmap_prepare() callback.
> >
> > We can now also remove call_action_complete() and invoke
> > mmap_action_complete() directly, as we separate out the rmap lock logic.
> >
> > The rmap lock logic, which was added in order to keep hugetlb working (!)
> > to allow for the rmap lock to be held longer, needs to be propagated to the
> > error paths on mmap complete and mapped hook error paths.
> >
> > This is because do_munmap() might otherwise deadlock with the rmap being
> > held, so instead we unlock at the point of unmap.
>
> Hmm but that was also true prior to this series? So is this a bugfix? Should
> it be a stable hotfix done outside of the series before the refactoring?
>
> > This is fine as any reliance on the rmap being held is irrelevant on error.
> >
> > While we're here, refactor mmap_action_finish() to avoid a big if (err)
> > branch.
> >
> > We also abstract unmapping of a VMA on mmap action completion into its own
> > helper function, unmap_vma_locked().
> >
> > Update the mmap_prepare documentation to describe the mapped hook and make
> > it clear what its intended use is.
> >
> > Additionally, update VMA userland test headers to reflect the change.
> >
> > Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> > ---
> >  Documentation/filesystems/mmap_prepare.rst |  15 +++
> >  include/linux/fs.h                         |   9 +-
> >  include/linux/mm.h                         |  20 +++-
> >  mm/internal.h                              |   8 ++
> >  mm/util.c                                  | 129 ++++++++++++++-------
> >  mm/vma.c                                   |  35 +++---
> >  tools/testing/vma/include/dup.h            |  27 ++++-
> >  tools/testing/vma/include/stubs.h          |   3 +-
> >  8 files changed, 186 insertions(+), 60 deletions(-)
> >
> > diff --git a/Documentation/filesystems/mmap_prepare.rst b/Documentation/filesystems/mmap_prepare.rst
> > index 65a1f094e469..20db474915da 100644
> > --- a/Documentation/filesystems/mmap_prepare.rst
> > +++ b/Documentation/filesystems/mmap_prepare.rst
> > @@ -25,6 +25,21 @@ That is - no resources should be allocated nor state updated to reflect that a
> >  mapping has been established, as the mapping may either be merged, or fail to be
> >  mapped after the callback is complete.
> >
> > +Mapped callback
> > +---------------
> > +
> > +If resources need to be allocated per-mapping, or state such as a reference
> > +count needs to be manipulated, this should be done using the ``vm_ops->mapped``
> > +hook, which itself should be set by the ->mmap_prepare hook.
> > +
> > +This callback is only invoked if a new mapping has been established and was not
> > +merged with any other, and is invoked at a point where no error may occur before
> > +the mapping is established.
> > +
> > +You may return an error to the callback itself, which will cause the mapping to
> > +become unmapped and an error returned to the mmap() caller. This is useful if
> > +resources need to be allocated, and that allocation might fail.
> > +
> >  How To Use
> >  ==========
> >
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index a2628a12bd2b..c390f5c667e3 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -2059,13 +2059,20 @@ static inline bool can_mmap_file(struct file *file)
> >  }
> >
> >  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
> > +int __vma_check_mmap_hook(struct vm_area_struct *vma);
> >
> >  static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
> >  {
> > +	int err;
> > +
> >  	if (file->f_op->mmap_prepare)
> >  		return compat_vma_mmap(file, vma);
> >
> > -	return file->f_op->mmap(file, vma);
> > +	err = file->f_op->mmap(file, vma);
> > +	if (err)
> > +		return err;
> > +
> > +	return __vma_check_mmap_hook(vma);
> >  }
> >
> >  static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index da94edb287cd..68dee1101313 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -777,6 +777,23 @@ struct vm_operations_struct {
> >  	 * Context: User context.  May sleep.  Caller holds mmap_lock.
> >  	 */
> >  	void (*close)(struct vm_area_struct *vma);
> > +	/**
> > +	 * @mapped: Called when the VMA is first mapped in the MM. Not called if
> > +	 * the new VMA is merged with an adjacent VMA.
> > +	 *
> > +	 * The @vm_private_data field is an output field allowing the user to
> > +	 * modify vma->vm_private_data as necessary.
> > +	 *
> > +	 * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
> > +	 * set from f_op->mmap.
> > +	 *
> > +	 * Returns %0 on success, or an error otherwise. On error, the VMA will
> > +	 * be unmapped.
> > +	 *
> > +	 * Context: User context.  May sleep.  Caller holds mmap_lock.
> > +	 */
> > +	int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
> > +		      const struct file *file, void **vm_private_data);
> >  	/* Called any time before splitting to check if it's allowed */
> >  	int (*may_split)(struct vm_area_struct *vma, unsigned long addr);
> >  	int (*mremap)(struct vm_area_struct *vma);
> > @@ -4327,7 +4344,8 @@ static inline void mmap_action_ioremap_full(struct vm_area_desc *desc,
> >
> >  int mmap_action_prepare(struct vm_area_desc *desc);
> >  int mmap_action_complete(struct vm_area_struct *vma,
> > -			 struct mmap_action *action);
> > +			 struct mmap_action *action,
> > +			 bool rmap_lock_held);
> >
> >  /* Look up the first VMA which exactly match the interval vm_start ... vm_end */
> >  static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
> > diff --git a/mm/internal.h b/mm/internal.h
> > index 0256ca44115a..e0f554178143 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -202,6 +202,14 @@ static inline void vma_close(struct vm_area_struct *vma)
> >  /* unmap_vmas is in mm/memory.c */
> >  void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
> >
> > +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> > +{
> > +	const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > +
> > +	mmap_assert_write_locked(vma->vm_mm);
> > +	do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> > +}
> > +
> >  #ifdef CONFIG_MMU
> >
> >  static inline void get_anon_vma(struct anon_vma *anon_vma)
> > diff --git a/mm/util.c b/mm/util.c
> > index 73c97a748d8e..fc1bd8a8f3ea 100644
> > --- a/mm/util.c
> > +++ b/mm/util.c
> > @@ -1163,6 +1163,54 @@ void flush_dcache_folio(struct folio *folio)
> >  EXPORT_SYMBOL(flush_dcache_folio);
> >  #endif
> >
> > +static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
> > +{
> > +	struct vm_area_desc desc = {
> > +		.mm = vma->vm_mm,
> > +		.file = file,
> > +		.start = vma->vm_start,
> > +		.end = vma->vm_end,
> > +
> > +		.pgoff = vma->vm_pgoff,
> > +		.vm_file = vma->vm_file,
> > +		.vma_flags = vma->flags,
> > +		.page_prot = vma->vm_page_prot,
> > +
> > +		.action.type = MMAP_NOTHING, /* Default */
> > +	};
> > +	int err;
> > +
> > +	err = vfs_mmap_prepare(file, &desc);
> > +	if (err)
> > +		return err;
> > +
> > +	err = mmap_action_prepare(&desc);
> > +	if (err)
> > +		return err;
> > +
> > +	set_vma_from_desc(vma, &desc);
> > +	return mmap_action_complete(vma, &desc.action, /*rmap_lock_held=*/false);
>
> Patch 1 removed this function and this one reinstates it with some
> modifications. Could patch 1 only remove the export and otherwise both do
> only the necessary modifications?

No, I'd rather do it this way because the function is now doing something
different and for different reasons.

Let's not just churn respins for no reason. If it's not breaking anything
let's just leave it as is please.

>
> > +}
> > +
> > +static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
> > +{
> > +	const struct vm_operations_struct *vm_ops = vma->vm_ops;
> > +	void *vm_private_data = vma->vm_private_data;
> > +	int err;
> > +
> > +	if (!vm_ops || !vm_ops->mapped)
> > +		return 0;
> > +
> > +	err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, file,
> > +			     &vm_private_data);
> > +	if (err)
> > +		unmap_vma_locked(vma);
> > +	else if (vm_private_data != vma->vm_private_data)
> > +		vma->vm_private_data = vm_private_data;
> > +
> > +	return err;
> > +}
> > +
> >  /**
> >   * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an
> >   * existing VMA and execute any requested actions.
> > @@ -1191,34 +1239,26 @@ EXPORT_SYMBOL(flush_dcache_folio);
> >   */
> >  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
> >  {
> > -	struct vm_area_desc desc = {
> > -		.mm = vma->vm_mm,
> > -		.file = file,
> > -		.start = vma->vm_start,
> > -		.end = vma->vm_end,
> > -
> > -		.pgoff = vma->vm_pgoff,
> > -		.vm_file = vma->vm_file,
> > -		.vma_flags = vma->flags,
> > -		.page_prot = vma->vm_page_prot,
> > -
> > -		.action.type = MMAP_NOTHING, /* Default */
> > -	};
> >  	int err;
> >
> > -	err = vfs_mmap_prepare(file, &desc);
> > +	err = __compat_vma_mmap(file, vma);
> >  	if (err)
> >  		return err;
> >
> > -	err = mmap_action_prepare(&desc);
> > -	if (err)
> > -		return err;
> > -
> > -	set_vma_from_desc(vma, &desc);
> > -	return mmap_action_complete(vma, &desc.action);
> > +	return __compat_vma_mapped(file, vma);
> >  }
> >  EXPORT_SYMBOL(compat_vma_mmap);
> >
> > +int __vma_check_mmap_hook(struct vm_area_struct *vma)
> > +{
> > +	/* vm_ops->mapped is not valid if mmap() is specified. */
> > +	if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped))
> > +		return -EINVAL;
> > +
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL(__vma_check_mmap_hook);
> > +
> >  static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
> >  			 const struct page *page)
> >  {
> > @@ -1308,32 +1348,31 @@ void snapshot_page(struct page_snapshot *ps, const struct page *page)
> >  }
> >
> >  static int mmap_action_finish(struct vm_area_struct *vma,
> > -			      struct mmap_action *action, int err)
> > +			      struct mmap_action *action, int err,
> > +			      bool rmap_lock_held)
> >  {
> > +	if (rmap_lock_held)
> > +		i_mmap_unlock_write(vma->vm_file->f_mapping);
>
> Should this be moved below the if (!err) ?

Yup sigh, the success hook needs this hack to be held.

This is all because hugetlb does stupid things.

The one case in which it uses generic hooks rather than stupid 'if
(hugetlb) { ... }' branches, and it does stuff it shouldn't (of course! :)

>
> Otherwise I think we unlock prematurely, and can even try to unlock twice -
> here and in call_mapped_hook(). And we want to unlock only if we're about to
> munmap, right?

No, we want to unlock at the end of the operation regardless. But this can
unlock twice because I'm not threading this crap through enough clearly.

Let me rethink this I guess. I hate hugetlb.

>
> > +
> > +	if (!err) {
> > +		if (action->success_hook)
> > +			return action->success_hook(vma);
> > +		return 0;
> > +	}
> > +
> >  	/*
> >  	 * If an error occurs, unmap the VMA altogether and return an error. We
> >  	 * only clear the newly allocated VMA, since this function is only
> >  	 * invoked if we do NOT merge, so we only clean up the VMA we created.
> >  	 */
> > -	if (err) {
> > -		const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > -
> > -		do_munmap(current->mm, vma->vm_start, len, NULL);
> > -
> > -		if (action->error_hook) {
> > -			/* We may want to filter the error. */
> > -			err = action->error_hook(err);
> > -
> > -			/* The caller should not clear the error. */
> > -			VM_WARN_ON_ONCE(!err);
> > -		}
> > -		return err;
> > +	unmap_vma_locked(vma);
> > +	if (action->error_hook) {
> > +		/* We may want to filter the error. */
> > +		err = action->error_hook(err);
> > +		/* The caller should not clear the error. */
> > +		VM_WARN_ON_ONCE(!err);
> >  	}
> > -
> > -	if (action->success_hook)
> > -		return action->success_hook(vma);
> > -
> > -	return 0;
> > +	return err;
> >  }
> >
> >  #ifdef CONFIG_MMU
> > @@ -1364,13 +1403,15 @@ EXPORT_SYMBOL(mmap_action_prepare);
> >   * mmap_action_complete - Execute VMA descriptor action.
> >   * @vma: The VMA to perform the action upon.
> >   * @action: The action to perform.
> > + * @rmap_lock_held: Is the file rmap lock held?
> >   *
> >   * Similar to mmap_action_prepare().
> >   *
> >   * Return: 0 on success, or error, at which point the VMA will be unmapped.
> >   */
> >  int mmap_action_complete(struct vm_area_struct *vma,
> > -			 struct mmap_action *action)
> > +			 struct mmap_action *action,
> > +			 bool rmap_lock_held)
> >
> >  {
> >  	int err = 0;
> > @@ -1388,7 +1429,8 @@ int mmap_action_complete(struct vm_area_struct *vma,
> >  		break;
> >  	}
> >
> > -	return mmap_action_finish(vma, action, err);
> > +	return mmap_action_finish(vma, action, err,
> > +				  rmap_lock_held);
> >  }
> >  EXPORT_SYMBOL(mmap_action_complete);
> >  #else
> > @@ -1408,7 +1450,8 @@ int mmap_action_prepare(struct vm_area_desc *desc)
> >  EXPORT_SYMBOL(mmap_action_prepare);
> >
> >  int mmap_action_complete(struct vm_area_struct *vma,
> > -			 struct mmap_action *action)
> > +			 struct mmap_action *action,
> > +			 bool rmap_lock_held)
> >  {
> >  	int err = 0;
> >
> > @@ -1423,7 +1466,7 @@ int mmap_action_complete(struct vm_area_struct *vma,
> >  		break;
> >  	}
> >
> > -	return mmap_action_finish(vma, action, err);
> > +	return mmap_action_finish(vma, action, err, rmap_lock_held);
> >  }
> >  EXPORT_SYMBOL(mmap_action_complete);
> >  #endif
> > diff --git a/mm/vma.c b/mm/vma.c
> > index 2a86c7575000..a27d1278ea6d 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -2731,21 +2731,28 @@ static bool can_set_ksm_flags_early(struct mmap_state *map)
> >  	return false;
> >  }
> >
> > -static int call_action_complete(struct mmap_state *map,
> > -				struct mmap_action *action,
> > -				struct vm_area_struct *vma)
> > +static int call_mapped_hook(struct mmap_state *map,
> > +			    struct vm_area_struct *vma)
> >  {
> > -	int ret;
> > -
> > -	ret = mmap_action_complete(vma, action);
> > +	const struct vm_operations_struct *vm_ops = vma->vm_ops;
> > +	void *vm_private_data = vma->vm_private_data;
> > +	int err;
> >
> > -	/* If we held the file rmap we need to release it. */
> > -	if (map->hold_file_rmap_lock) {
> > -		struct file *file = vma->vm_file;
> > +	if (!vm_ops || !vm_ops->mapped)
> > +		return 0;
> > +	err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff,
> > +			     vma->vm_file, &vm_private_data);
> > +	if (err) {
> > +		if (map->hold_file_rmap_lock)
> > +			i_mmap_unlock_write(vma->vm_file->f_mapping);
> >
> > -		i_mmap_unlock_write(file->f_mapping);
> > +		unmap_vma_locked(vma);
> > +		return err;
> >  	}
> > -	return ret;
> > +	/* Update private data if changed. */
> > +	if (vm_private_data != vma->vm_private_data)
> > +		vma->vm_private_data = vm_private_data;
> > +	return 0;
> >  }
> >
> >  static unsigned long __mmap_region(struct file *file, unsigned long addr,
> > @@ -2799,8 +2806,10 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
> >  	__mmap_complete(&map, vma);
> >
> >  	if (have_mmap_prepare && allocated_new) {
> > -		error = call_action_complete(&map, &desc.action, vma);
> > -
> > +		error = mmap_action_complete(vma, &desc.action,
> > +					     map.hold_file_rmap_lock);
> > +		if (!error)
> > +			error = call_mapped_hook(&map, vma);
>
> And if neither of those above end up doing i_mmap_unlock_write(), we should
> do it here? I think currently the misplaced unlock in mmap_action_finish()

I don't think it's misplaced, I think it's a much better place for it than
here, and otherwise we have yet more hacks upon hacks threaded throughout
the code.

We don't actually need the hugetlb hack to hold the lock over .mapped but I
guess we should do that anyway just for the sake of consistency.

But obviously call_mapped_hook() is trying to unlock anyway.

> masks the lack of it here, otherwise bots would already notice. Loss of

And AI... it would require hitting a .mapped error I guess.

> locking coverage (due to premature unlock) or the risk of double unlock is
> probably harder to trigger. Or maybe a later patch in the series happens to
> fix the issues, so it's just a bisection hazard here. Or I'm completely wrong.

Nope I messed it up, will fix.

>
> >  		if (error)
> >  			return error;
> >  	}
> > diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
> > index 8ae525ed1738..aa34966cbc62 100644
> > --- a/tools/testing/vma/include/dup.h
> > +++ b/tools/testing/vma/include/dup.h
> > @@ -643,6 +643,23 @@ struct vm_operations_struct {
> >  	 * Context: User context.  May sleep.  Caller holds mmap_lock.
> >  	 */
> >  	void (*close)(struct vm_area_struct *vma);
> > +	/**
> > +	 * @mapped: Called when the VMA is first mapped in the MM. Not called if
> > +	 * the new VMA is merged with an adjacent VMA.
> > +	 *
> > +	 * The @vm_private_data field is an output field allowing the user to
> > +	 * modify vma->vm_private_data as necessary.
> > +	 *
> > +	 * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
> > +	 * set from f_op->mmap.
> > +	 *
> > +	 * Returns %0 on success, or an error otherwise. On error, the VMA will
> > +	 * be unmapped.
> > +	 *
> > +	 * Context: User context.  May sleep.  Caller holds mmap_lock.
> > +	 */
> > +	int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
> > +		      const struct file *file, void **vm_private_data);
> >  	/* Called any time before splitting to check if it's allowed */
> >  	int (*may_split)(struct vm_area_struct *vma, unsigned long addr);
> >  	int (*mremap)(struct vm_area_struct *vma);
> > @@ -1281,7 +1298,7 @@ static inline int __compat_vma_mmap(const struct file_operations *f_op,
> >  		return err;
> >
> >  	set_vma_from_desc(vma, &desc);
> > -	return mmap_action_complete(vma, &desc.action);
> > +	return mmap_action_complete(vma, &desc.action, /*rmap_lock_held=*/false);
> >  }
> >
> >  static inline int compat_vma_mmap(struct file *file,
> > @@ -1500,3 +1517,11 @@ static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags)
> >
> >  	return vm_get_page_prot(vm_flags);
> >  }
> > +
> > +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> > +{
> > +	const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > +
> > +	mmap_assert_write_locked(vma->vm_mm);
> > +	do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> > +}
> > diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h
> > index a30b8bc84955..d1c3d4ddb5e9 100644
> > --- a/tools/testing/vma/include/stubs.h
> > +++ b/tools/testing/vma/include/stubs.h
> > @@ -87,7 +87,8 @@ static inline int mmap_action_prepare(struct vm_area_desc *desc)
> >  }
> >
> >  static inline int mmap_action_complete(struct vm_area_struct *vma,
> > -				       struct mmap_action *action)
> > +				       struct mmap_action *action,
> > +				       bool rmap_lock_held)
> >  {
> >  	return 0;
> >  }
>

Thanks, Lorenzo

^ permalink raw reply

* Re: [PATCH v3 05/16] fs: afs: correctly drop reference count on mapping failure
From: Lorenzo Stoakes (Oracle) @ 2026-03-20 19:01 UTC (permalink / raw)
  To: Vlastimil Babka (SUSE)
  Cc: Andrew Morton, Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
	Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
	Alexandre Torgue, Miquel Raynal, Richard Weinberger,
	Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
	David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
	Jan Kara, David Hildenbrand, Liam R . Howlett, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Jann Horn, Pedro Falcato,
	linux-kernel, linux-doc, linux-hyperv, linux-mtd, linux-staging,
	linux-scsi, target-devel, linux-afs, linux-fsdevel, linux-mm,
	Ryan Roberts
In-Reply-To: <608ba54c-f19e-4e27-8142-0870f91d6514@kernel.org>

On Fri, Mar 20, 2026 at 07:57:29PM +0100, Vlastimil Babka (SUSE) wrote:
> On 3/19/26 19:23, Lorenzo Stoakes (Oracle) wrote:
> > Commit 9d5403b1036c ("fs: convert most other generic_file_*mmap() users to
> > .mmap_prepare()") updated AFS to use the mmap_prepare callback in favour
> > of the deprecated mmap callback.
> >
> > However, it did not account for the fact that mmap_prepare is called
> > pre-merge, and may then be merged, nor that mmap_prepare can fail to map
> > due to an out of memory error.
>
> So that means a file can become pinned forever? OOM is probably only a
> problem with fault injection in practice, but the merge case can happen. And
> 9d5403b1036c is pre-6.18 LTS. Are we going to need Fixes: and Cc: stable then?

That'd require backporting all of the .mapped functionality and half of this
series, I don't think that's really practical.

I guess I can do a manual backport of a partial revert.

Thanks, Lorenzo

^ permalink raw reply

* Re: [PATCH v3 05/16] fs: afs: correctly drop reference count on mapping failure
From: Vlastimil Babka (SUSE) @ 2026-03-20 18:57 UTC (permalink / raw)
  To: Lorenzo Stoakes (Oracle), Andrew Morton
  Cc: Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
	Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
	Alexandre Torgue, Miquel Raynal, Richard Weinberger,
	Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
	David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
	Jan Kara, David Hildenbrand, Liam R . Howlett, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Jann Horn, Pedro Falcato,
	linux-kernel, linux-doc, linux-hyperv, linux-mtd, linux-staging,
	linux-scsi, target-devel, linux-afs, linux-fsdevel, linux-mm,
	Ryan Roberts
In-Reply-To: <018cd0d8b2dae44de6d3952527e754e52ef02da8.1773944114.git.ljs@kernel.org>

On 3/19/26 19:23, Lorenzo Stoakes (Oracle) wrote:
> Commit 9d5403b1036c ("fs: convert most other generic_file_*mmap() users to
> .mmap_prepare()") updated AFS to use the mmap_prepare callback in favour
> of the deprecated mmap callback.
> 
> However, it did not account for the fact that mmap_prepare is called
> pre-merge, and may then be merged, nor that mmap_prepare can fail to map
> due to an out of memory error.

So that means a file can become pinned forever? OOM is probably only a
problem with fault injection in practice, but the merge case can happen. And
9d5403b1036c is pre-6.18 LTS. Are we going to need Fixes: and Cc: stable then?

> Both of those are cases in which we should not be incrementing a reference
> count.
> 
> With the newly added vm_ops->mapped callback available, we can simply
> defer this operation to that callback which is only invoked once the
> mapping is successfully in place (but not yet visible to userspace as the
> mmap and VMA write locks are held).
> 
> Therefore add afs_mapped() to implement this callback for AFS, and remove
> the code doing so in afs_mmap_prepare().
> 
> Also update afs_vm_open(), afs_vm_close() and afs_vm_map_pages() to be
> consistent in how the vnode is accessed.
> 
> Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> ---
>  fs/afs/file.c | 36 ++++++++++++++++++++++++++----------
>  1 file changed, 26 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/afs/file.c b/fs/afs/file.c
> index f609366fd2ac..85696ac984cc 100644
> --- a/fs/afs/file.c
> +++ b/fs/afs/file.c
> @@ -28,6 +28,8 @@ static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos,
>  static void afs_vm_open(struct vm_area_struct *area);
>  static void afs_vm_close(struct vm_area_struct *area);
>  static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff);
> +static int afs_mapped(unsigned long start, unsigned long end, pgoff_t pgoff,
> +		      const struct file *file, void **vm_private_data);
>  
>  const struct file_operations afs_file_operations = {
>  	.open		= afs_open,
> @@ -61,6 +63,7 @@ const struct address_space_operations afs_file_aops = {
>  };
>  
>  static const struct vm_operations_struct afs_vm_ops = {
> +	.mapped		= afs_mapped,
>  	.open		= afs_vm_open,
>  	.close		= afs_vm_close,
>  	.fault		= filemap_fault,
> @@ -494,32 +497,45 @@ static void afs_drop_open_mmap(struct afs_vnode *vnode)
>   */
>  static int afs_file_mmap_prepare(struct vm_area_desc *desc)
>  {
> -	struct afs_vnode *vnode = AFS_FS_I(file_inode(desc->file));
>  	int ret;
>  
> -	afs_add_open_mmap(vnode);
> -
>  	ret = generic_file_mmap_prepare(desc);
> -	if (ret == 0)
> -		desc->vm_ops = &afs_vm_ops;
> -	else
> -		afs_drop_open_mmap(vnode);
> +	if (ret)
> +		return ret;
> +
> +	desc->vm_ops = &afs_vm_ops;
>  	return ret;
>  }
>  
> +static int afs_mapped(unsigned long start, unsigned long end, pgoff_t pgoff,
> +		      const struct file *file, void **vm_private_data)
> +{
> +	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
> +
> +	afs_add_open_mmap(vnode);
> +	return 0;
> +}
> +
>  static void afs_vm_open(struct vm_area_struct *vma)
>  {
> -	afs_add_open_mmap(AFS_FS_I(file_inode(vma->vm_file)));
> +	struct file *file = vma->vm_file;
> +	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
> +
> +	afs_add_open_mmap(vnode);
>  }
>  
>  static void afs_vm_close(struct vm_area_struct *vma)
>  {
> -	afs_drop_open_mmap(AFS_FS_I(file_inode(vma->vm_file)));
> +	struct file *file = vma->vm_file;
> +	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
> +
> +	afs_drop_open_mmap(vnode);
>  }
>  
>  static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff)
>  {
> -	struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file));
> +	struct file *file = vmf->vma->vm_file;
> +	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
>  
>  	if (afs_check_validity(vnode))
>  		return filemap_map_pages(vmf, start_pgoff, end_pgoff);


^ permalink raw reply

* Re: [PATCH net-next, v3] net: mana: Force full-page RX buffers for 4K page size on specific systems.
From: Dipayaan Roy @ 2026-03-20 18:37 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	pabeni, leon, longli, kotaranov, horms, shradhagupta, ssengar,
	ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, dipayanroy
In-Reply-To: <20260314125053.41d6221b@kernel.org>

On Sat, Mar 14, 2026 at 12:50:53PM -0700, Jakub Kicinski wrote:
> On Tue, 10 Mar 2026 21:00:49 -0700 Dipayaan Roy wrote:
> > On certain systems configured with 4K PAGE_SIZE, utilizing page_pool
> > fragments for RX buffers results in a significant throughput regression.
> > Profiling reveals that this regression correlates with high overhead in the
> > fragment allocation and reference counting paths on these specific
> > platforms, rendering the multi-buffer-per-page strategy counterproductive.
> 
> Can you say more ? We could technically take two references on the page
> right away if MTU is small and avoid some of the cost.

There is a 15-20% shortfall in achieving line rate for MANA (180+ Gbps)
on a particular ARM64 SKU. The issue is specific to this processor SKU —
it is not seen on other ARM64 SKUs (e.g., GB200) or x86 SKUs. Critically, the
regression only manifests beyond 16 TCP connections, which strongly indicates
that it occurs only under high contention and traffic.

  no. of     | rx buf backed       | rx buf backed
 connections | with page fragments | with full page
-------------+---------------------+---------------
           4 |         139 Gbps    |     138 Gbps
           8 |         140 Gbps    |     162 Gbps
          16 |         186 Gbps    |     186 Gbps
          32 |         136 Gbps    |     183 Gbps
          48 |         159 Gbps    |     185 Gbps
          64 |         165 Gbps    |     184 Gbps
         128 |         170 Gbps    |     180 Gbps
 
The HW team is still working on a root-cause analysis (RCA) of this HW behaviour.

Regarding "We could technically take two references on the page right
away", are you suggesting moving the page reference counting logic into the
driver instead of relying on page pool?

> 
> The driver doesn't seem to set skb->truesize accordingly after this
> change. So you're lying to the stack about how much memory each packet
> consumes. This is a blocker for the change.
> 
ACK. I will send out a separate patch with a Fixes tag to fix the skb
truesize.

> > To mitigate this, bypass the page_pool fragment path and force a single RX
> > packet per page allocation when all the following conditions are met:
> >   1. The system is configured with a 4K PAGE_SIZE.
> >   2. A processor-specific quirk is detected via SMBIOS Type 4 data.
> 
> I don't think we want the kernel to be in the business of carrying
> matching on platform names and providing optimal config by default.
> This sort of logic needs to live in user space or the hypervisor 
> (which can then pass a single bit to the driver to enable the behavior)
> 
As per our internal discussion, the hypervisor cannot provide the CPU
version info (in VM as well as in bare metal offerings).

On handling it from the user side, are you suggesting introducing a new
ethtool private flag and having udev rules set that private flag so that the
driver switches to full-page RX buffers? Given the wide range of distros
supported, this might be harder to maintain/backport.

Also, the DMI parsing design was influenced by other net wireless
drivers such as /wireless/ath/ath10k/core.c. If this approach is not
acceptable for the MANA driver then we will have to take an alternate route
based on the discussion right above.

> > This approach restores expected line-rate performance by ensuring
> > predictable RX refill behavior on affected hardware.
> > 
> > There is no behavioral change for systems using larger page sizes
> > (16K/64K), or platforms where this processor-specific quirk does not
> > apply.
> -- 
> pw-bot: cr

Thank you for your comments Jakub, and also for pointing out the skb
truesize issue. I am sending out a separate patch to fix the skb truesize issue.

Regards
Dipayaan Roy


^ permalink raw reply

* Re: [PATCH v3 04/16] mm: add vm_ops->mapped hook
From: Vlastimil Babka (SUSE) @ 2026-03-20 18:26 UTC (permalink / raw)
  To: Lorenzo Stoakes (Oracle), Andrew Morton
  Cc: Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
	Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
	Alexandre Torgue, Miquel Raynal, Richard Weinberger,
	Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
	David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
	Jan Kara, David Hildenbrand, Liam R . Howlett, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Jann Horn, Pedro Falcato,
	linux-kernel, linux-doc, linux-hyperv, linux-mtd, linux-staging,
	linux-scsi, target-devel, linux-afs, linux-fsdevel, linux-mm,
	Ryan Roberts
In-Reply-To: <a97366fa6f22a0ca1340cfd2b0d4df87c80ac80a.1773944114.git.ljs@kernel.org>

On 3/19/26 19:23, Lorenzo Stoakes (Oracle) wrote:
> Previously, when a driver needed to do something like establish a
> reference count, it could do so in the mmap hook in the knowledge that the
> mapping would succeed.
> 
> With the introduction of f_op->mmap_prepare this is no longer the case, as
> it is invoked prior to actually establishing the mapping.
> 
> mmap_prepare is not appropriate for this kind of thing as it is called
> before any merge might take place, and after which an error might occur
> meaning resources could be leaked.
> 
> To take this into account, introduce a new vm_ops->mapped callback which
> is invoked when the VMA is first mapped (though notably - not when it is
> merged - which is correct and mirrors existing mmap/open/close behaviour).
> 
> We do better than vm_ops->open() here, as this callback can return an
> error, at which point the VMA will be unmapped.
> 
> Note that vm_ops->mapped() is invoked after any mmap action is complete
> (such as I/O remapping).
> 
> We intentionally do not expose the VMA at this point, exposing only the
> fields that could be used, and an output parameter in case the operation
> needs to update the vma->vm_private_data field.
> 
> In order to deal with stacked filesystems which invoke inner filesystem's
> mmap() invocations, add __compat_vma_mapped() and invoke it on vfs_mmap()
> (via compat_vma_mmap()) to ensure that the mapped callback is handled when
> an mmap() caller invokes a nested filesystem's mmap_prepare() callback.
> 
> We can now also remove call_action_complete() and invoke
> mmap_action_complete() directly, as we separate out the rmap lock logic.
> 
> The rmap lock logic, which was added in order to keep hugetlb working (!)
> to allow for the rmap lock to be held longer, needs to be propagated to the
> error paths on mmap complete and mapped hook error paths.
> 
> This is because do_munmap() might otherwise deadlock with the rmap being
> held, so instead we unlock at the point of unmap.

Hmm but that was also true prior to this series? So is this a bugfix? Should
it be a stable hotfix done outside of the series before the refactoring?

> This is fine as any reliance on the rmap being held is irrelevant on error.
> 
> While we're here, refactor mmap_action_finish() to avoid a big if (err)
> branch.
> 
> We also abstract unmapping of a VMA on mmap action completion into its own
> helper function, unmap_vma_locked().
> 
> Update the mmap_prepare documentation to describe the mapped hook and make
> it clear what its intended use is.
> 
> Additionally, update VMA userland test headers to reflect the change.
> 
> Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> ---
>  Documentation/filesystems/mmap_prepare.rst |  15 +++
>  include/linux/fs.h                         |   9 +-
>  include/linux/mm.h                         |  20 +++-
>  mm/internal.h                              |   8 ++
>  mm/util.c                                  | 129 ++++++++++++++-------
>  mm/vma.c                                   |  35 +++---
>  tools/testing/vma/include/dup.h            |  27 ++++-
>  tools/testing/vma/include/stubs.h          |   3 +-
>  8 files changed, 186 insertions(+), 60 deletions(-)
> 
> diff --git a/Documentation/filesystems/mmap_prepare.rst b/Documentation/filesystems/mmap_prepare.rst
> index 65a1f094e469..20db474915da 100644
> --- a/Documentation/filesystems/mmap_prepare.rst
> +++ b/Documentation/filesystems/mmap_prepare.rst
> @@ -25,6 +25,21 @@ That is - no resources should be allocated nor state updated to reflect that a
>  mapping has been established, as the mapping may either be merged, or fail to be
>  mapped after the callback is complete.
>  
> +Mapped callback
> +---------------
> +
> +If resources need to be allocated per-mapping, or state such as a reference
> +count needs to be manipulated, this should be done using the ``vm_ops->mapped``
> +hook, which itself should be set by the ->mmap_prepare hook.
> +
> +This callback is only invoked if a new mapping has been established and was not
> +merged with any other, and is invoked at a point where no error may occur before
> +the mapping is established.
> +
> +You may return an error to the callback itself, which will cause the mapping to
> +become unmapped and an error returned to the mmap() caller. This is useful if
> +resources need to be allocated, and that allocation might fail.
> +
>  How To Use
>  ==========
>  
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index a2628a12bd2b..c390f5c667e3 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2059,13 +2059,20 @@ static inline bool can_mmap_file(struct file *file)
>  }
>  
>  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
> +int __vma_check_mmap_hook(struct vm_area_struct *vma);
>  
>  static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
>  {
> +	int err;
> +
>  	if (file->f_op->mmap_prepare)
>  		return compat_vma_mmap(file, vma);
>  
> -	return file->f_op->mmap(file, vma);
> +	err = file->f_op->mmap(file, vma);
> +	if (err)
> +		return err;
> +
> +	return __vma_check_mmap_hook(vma);
>  }
>  
>  static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index da94edb287cd..68dee1101313 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -777,6 +777,23 @@ struct vm_operations_struct {
>  	 * Context: User context.  May sleep.  Caller holds mmap_lock.
>  	 */
>  	void (*close)(struct vm_area_struct *vma);
> +	/**
> +	 * @mapped: Called when the VMA is first mapped in the MM. Not called if
> +	 * the new VMA is merged with an adjacent VMA.
> +	 *
> +	 * The @vm_private_data field is an output field allowing the user to
> +	 * modify vma->vm_private_data as necessary.
> +	 *
> +	 * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
> +	 * set from f_op->mmap.
> +	 *
> +	 * Returns %0 on success, or an error otherwise. On error, the VMA will
> +	 * be unmapped.
> +	 *
> +	 * Context: User context.  May sleep.  Caller holds mmap_lock.
> +	 */
> +	int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
> +		      const struct file *file, void **vm_private_data);
>  	/* Called any time before splitting to check if it's allowed */
>  	int (*may_split)(struct vm_area_struct *vma, unsigned long addr);
>  	int (*mremap)(struct vm_area_struct *vma);
> @@ -4327,7 +4344,8 @@ static inline void mmap_action_ioremap_full(struct vm_area_desc *desc,
>  
>  int mmap_action_prepare(struct vm_area_desc *desc);
>  int mmap_action_complete(struct vm_area_struct *vma,
> -			 struct mmap_action *action);
> +			 struct mmap_action *action,
> +			 bool rmap_lock_held);
>  
>  /* Look up the first VMA which exactly match the interval vm_start ... vm_end */
>  static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
> diff --git a/mm/internal.h b/mm/internal.h
> index 0256ca44115a..e0f554178143 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -202,6 +202,14 @@ static inline void vma_close(struct vm_area_struct *vma)
>  /* unmap_vmas is in mm/memory.c */
>  void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
>  
> +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> +{
> +	const size_t len = vma_pages(vma) << PAGE_SHIFT;
> +
> +	mmap_assert_write_locked(vma->vm_mm);
> +	do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> +}
> +
>  #ifdef CONFIG_MMU
>  
>  static inline void get_anon_vma(struct anon_vma *anon_vma)
> diff --git a/mm/util.c b/mm/util.c
> index 73c97a748d8e..fc1bd8a8f3ea 100644
> --- a/mm/util.c
> +++ b/mm/util.c
> @@ -1163,6 +1163,54 @@ void flush_dcache_folio(struct folio *folio)
>  EXPORT_SYMBOL(flush_dcache_folio);
>  #endif
>  
> +static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	struct vm_area_desc desc = {
> +		.mm = vma->vm_mm,
> +		.file = file,
> +		.start = vma->vm_start,
> +		.end = vma->vm_end,
> +
> +		.pgoff = vma->vm_pgoff,
> +		.vm_file = vma->vm_file,
> +		.vma_flags = vma->flags,
> +		.page_prot = vma->vm_page_prot,
> +
> +		.action.type = MMAP_NOTHING, /* Default */
> +	};
> +	int err;
> +
> +	err = vfs_mmap_prepare(file, &desc);
> +	if (err)
> +		return err;
> +
> +	err = mmap_action_prepare(&desc);
> +	if (err)
> +		return err;
> +
> +	set_vma_from_desc(vma, &desc);
> +	return mmap_action_complete(vma, &desc.action, /*rmap_lock_held=*/false);

Patch 1 removed this function and this one reinstates it with some
modifications. Could patch 1 only remove the export and otherwise both do
only the necessary modifications?

> +}
> +
> +static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
> +{
> +	const struct vm_operations_struct *vm_ops = vma->vm_ops;
> +	void *vm_private_data = vma->vm_private_data;
> +	int err;
> +
> +	if (!vm_ops || !vm_ops->mapped)
> +		return 0;
> +
> +	err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, file,
> +			     &vm_private_data);
> +	if (err)
> +		unmap_vma_locked(vma);
> +	else if (vm_private_data != vma->vm_private_data)
> +		vma->vm_private_data = vm_private_data;
> +
> +	return err;
> +}
> +
>  /**
>   * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an
>   * existing VMA and execute any requested actions.
> @@ -1191,34 +1239,26 @@ EXPORT_SYMBOL(flush_dcache_folio);
>   */
>  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
>  {
> -	struct vm_area_desc desc = {
> -		.mm = vma->vm_mm,
> -		.file = file,
> -		.start = vma->vm_start,
> -		.end = vma->vm_end,
> -
> -		.pgoff = vma->vm_pgoff,
> -		.vm_file = vma->vm_file,
> -		.vma_flags = vma->flags,
> -		.page_prot = vma->vm_page_prot,
> -
> -		.action.type = MMAP_NOTHING, /* Default */
> -	};
>  	int err;
>  
> -	err = vfs_mmap_prepare(file, &desc);
> +	err = __compat_vma_mmap(file, vma);
>  	if (err)
>  		return err;
>  
> -	err = mmap_action_prepare(&desc);
> -	if (err)
> -		return err;
> -
> -	set_vma_from_desc(vma, &desc);
> -	return mmap_action_complete(vma, &desc.action);
> +	return __compat_vma_mapped(file, vma);
>  }
>  EXPORT_SYMBOL(compat_vma_mmap);
>  
> +int __vma_check_mmap_hook(struct vm_area_struct *vma)
> +{
> +	/* vm_ops->mapped is not valid if mmap() is specified. */
> +	if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped))
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(__vma_check_mmap_hook);
> +
>  static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
>  			 const struct page *page)
>  {
> @@ -1308,32 +1348,31 @@ void snapshot_page(struct page_snapshot *ps, const struct page *page)
>  }
>  
>  static int mmap_action_finish(struct vm_area_struct *vma,
> -			      struct mmap_action *action, int err)
> +			      struct mmap_action *action, int err,
> +			      bool rmap_lock_held)
>  {
> +	if (rmap_lock_held)
> +		i_mmap_unlock_write(vma->vm_file->f_mapping);

Should this be moved below the if (!err) ?

Otherwise I think we unlock prematurely, and can even try to unlock twice -
here and in call_mapped_hook(). And we want to unlock only if we're about to
munmap, right?

> +
> +	if (!err) {
> +		if (action->success_hook)
> +			return action->success_hook(vma);
> +		return 0;
> +	}
> +
>  	/*
>  	 * If an error occurs, unmap the VMA altogether and return an error. We
>  	 * only clear the newly allocated VMA, since this function is only
>  	 * invoked if we do NOT merge, so we only clean up the VMA we created.
>  	 */
> -	if (err) {
> -		const size_t len = vma_pages(vma) << PAGE_SHIFT;
> -
> -		do_munmap(current->mm, vma->vm_start, len, NULL);
> -
> -		if (action->error_hook) {
> -			/* We may want to filter the error. */
> -			err = action->error_hook(err);
> -
> -			/* The caller should not clear the error. */
> -			VM_WARN_ON_ONCE(!err);
> -		}
> -		return err;
> +	unmap_vma_locked(vma);
> +	if (action->error_hook) {
> +		/* We may want to filter the error. */
> +		err = action->error_hook(err);
> +		/* The caller should not clear the error. */
> +		VM_WARN_ON_ONCE(!err);
>  	}
> -
> -	if (action->success_hook)
> -		return action->success_hook(vma);
> -
> -	return 0;
> +	return err;
>  }
>  
>  #ifdef CONFIG_MMU
> @@ -1364,13 +1403,15 @@ EXPORT_SYMBOL(mmap_action_prepare);
>   * mmap_action_complete - Execute VMA descriptor action.
>   * @vma: The VMA to perform the action upon.
>   * @action: The action to perform.
> + * @rmap_lock_held: Is the file rmap lock held?
>   *
>   * Similar to mmap_action_prepare().
>   *
>   * Return: 0 on success, or error, at which point the VMA will be unmapped.
>   */
>  int mmap_action_complete(struct vm_area_struct *vma,
> -			 struct mmap_action *action)
> +			 struct mmap_action *action,
> +			 bool rmap_lock_held)
>  
>  {
>  	int err = 0;
> @@ -1388,7 +1429,8 @@ int mmap_action_complete(struct vm_area_struct *vma,
>  		break;
>  	}
>  
> -	return mmap_action_finish(vma, action, err);
> +	return mmap_action_finish(vma, action, err,
> +				  rmap_lock_held);
>  }
>  EXPORT_SYMBOL(mmap_action_complete);
>  #else
> @@ -1408,7 +1450,8 @@ int mmap_action_prepare(struct vm_area_desc *desc)
>  EXPORT_SYMBOL(mmap_action_prepare);
>  
>  int mmap_action_complete(struct vm_area_struct *vma,
> -			 struct mmap_action *action)
> +			 struct mmap_action *action,
> +			 bool rmap_lock_held)
>  {
>  	int err = 0;
>  
> @@ -1423,7 +1466,7 @@ int mmap_action_complete(struct vm_area_struct *vma,
>  		break;
>  	}
>  
> -	return mmap_action_finish(vma, action, err);
> +	return mmap_action_finish(vma, action, err, rmap_lock_held);
>  }
>  EXPORT_SYMBOL(mmap_action_complete);
>  #endif
> diff --git a/mm/vma.c b/mm/vma.c
> index 2a86c7575000..a27d1278ea6d 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -2731,21 +2731,28 @@ static bool can_set_ksm_flags_early(struct mmap_state *map)
>  	return false;
>  }
>  
> -static int call_action_complete(struct mmap_state *map,
> -				struct mmap_action *action,
> -				struct vm_area_struct *vma)
> +static int call_mapped_hook(struct mmap_state *map,
> +			    struct vm_area_struct *vma)
>  {
> -	int ret;
> -
> -	ret = mmap_action_complete(vma, action);
> +	const struct vm_operations_struct *vm_ops = vma->vm_ops;
> +	void *vm_private_data = vma->vm_private_data;
> +	int err;
>  
> -	/* If we held the file rmap we need to release it. */
> -	if (map->hold_file_rmap_lock) {
> -		struct file *file = vma->vm_file;
> +	if (!vm_ops || !vm_ops->mapped)
> +		return 0;
> +	err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff,
> +			     vma->vm_file, &vm_private_data);
> +	if (err) {
> +		if (map->hold_file_rmap_lock)
> +			i_mmap_unlock_write(vma->vm_file->f_mapping);
>  
> -		i_mmap_unlock_write(file->f_mapping);
> +		unmap_vma_locked(vma);
> +		return err;
>  	}
> -	return ret;
> +	/* Update private data if changed. */
> +	if (vm_private_data != vma->vm_private_data)
> +		vma->vm_private_data = vm_private_data;
> +	return 0;
>  }
>  
>  static unsigned long __mmap_region(struct file *file, unsigned long addr,
> @@ -2799,8 +2806,10 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
>  	__mmap_complete(&map, vma);
>  
>  	if (have_mmap_prepare && allocated_new) {
> -		error = call_action_complete(&map, &desc.action, vma);
> -
> +		error = mmap_action_complete(vma, &desc.action,
> +					     map.hold_file_rmap_lock);
> +		if (!error)
> +			error = call_mapped_hook(&map, vma);

And if neither of those above end up doing i_mmap_unlock_write(), we should
do it here? I think currently the misplaced unlock in mmap_action_finish()
masks the lack of it here, otherwise bots would already notice. Loss of
locking coverage (due to premature unlock) or the risk of double unlock is
probably harder to trigger. Or maybe a later patch in the series happens to
fix the issues, so it's just a bisection hazard here. Or I'm completely wrong.

>  		if (error)
>  			return error;
>  	}
> diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
> index 8ae525ed1738..aa34966cbc62 100644
> --- a/tools/testing/vma/include/dup.h
> +++ b/tools/testing/vma/include/dup.h
> @@ -643,6 +643,23 @@ struct vm_operations_struct {
>  	 * Context: User context.  May sleep.  Caller holds mmap_lock.
>  	 */
>  	void (*close)(struct vm_area_struct *vma);
> +	/**
> +	 * @mapped: Called when the VMA is first mapped in the MM. Not called if
> +	 * the new VMA is merged with an adjacent VMA.
> +	 *
> +	 * The @vm_private_data field is an output field allowing the user to
> +	 * modify vma->vm_private_data as necessary.
> +	 *
> +	 * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
> +	 * set from f_op->mmap.
> +	 *
> +	 * Returns %0 on success, or an error otherwise. On error, the VMA will
> +	 * be unmapped.
> +	 *
> +	 * Context: User context.  May sleep.  Caller holds mmap_lock.
> +	 */
> +	int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
> +		      const struct file *file, void **vm_private_data);
>  	/* Called any time before splitting to check if it's allowed */
>  	int (*may_split)(struct vm_area_struct *vma, unsigned long addr);
>  	int (*mremap)(struct vm_area_struct *vma);
> @@ -1281,7 +1298,7 @@ static inline int __compat_vma_mmap(const struct file_operations *f_op,
>  		return err;
>  
>  	set_vma_from_desc(vma, &desc);
> -	return mmap_action_complete(vma, &desc.action);
> +	return mmap_action_complete(vma, &desc.action, /*rmap_lock_held=*/false);
>  }
>  
>  static inline int compat_vma_mmap(struct file *file,
> @@ -1500,3 +1517,11 @@ static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags)
>  
>  	return vm_get_page_prot(vm_flags);
>  }
> +
> +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> +{
> +	const size_t len = vma_pages(vma) << PAGE_SHIFT;
> +
> +	mmap_assert_write_locked(vma->vm_mm);
> +	do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> +}
> diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h
> index a30b8bc84955..d1c3d4ddb5e9 100644
> --- a/tools/testing/vma/include/stubs.h
> +++ b/tools/testing/vma/include/stubs.h
> @@ -87,7 +87,8 @@ static inline int mmap_action_prepare(struct vm_area_desc *desc)
>  }
>  
>  static inline int mmap_action_complete(struct vm_area_struct *vma,
> -				       struct mmap_action *action)
> +				       struct mmap_action *action,
> +				       bool rmap_lock_held)
>  {
>  	return 0;
>  }


^ permalink raw reply

* Re: [PATCH v3 03/16] mm: document vm_operations_struct->open the same as close()
From: Vlastimil Babka (SUSE) @ 2026-03-20 17:30 UTC (permalink / raw)
  To: Lorenzo Stoakes (Oracle), Andrew Morton
  Cc: Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
	Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
	Alexandre Torgue, Miquel Raynal, Richard Weinberger,
	Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
	David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
	Jan Kara, David Hildenbrand, Liam R . Howlett, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Jann Horn, Pedro Falcato,
	linux-kernel, linux-doc, linux-hyperv, linux-mtd, linux-staging,
	linux-scsi, target-devel, linux-afs, linux-fsdevel, linux-mm,
	Ryan Roberts
In-Reply-To: <808919eaae0b682ec631301b3c06d85c62ba428d.1773944114.git.ljs@kernel.org>

On 3/19/26 19:23, Lorenzo Stoakes (Oracle) wrote:
> Describe when the operation is invoked and the context in which it is
> invoked, matching the description already added for vm_op->close().
> 
> While we're here, update all outdated references to an 'area' field for
> VMAs to the more consistent 'vma'.
> 
> Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>

Acked-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>


^ permalink raw reply

* Re: [PATCH v3 02/16] mm: add documentation for the mmap_prepare file operation callback
From: Vlastimil Babka (SUSE) @ 2026-03-20 17:23 UTC (permalink / raw)
  To: Lorenzo Stoakes (Oracle), Andrew Morton
  Cc: Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
	Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
	Alexandre Torgue, Miquel Raynal, Richard Weinberger,
	Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
	David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
	Jan Kara, David Hildenbrand, Liam R . Howlett, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Jann Horn, Pedro Falcato,
	linux-kernel, linux-doc, linux-hyperv, linux-mtd, linux-staging,
	linux-scsi, target-devel, linux-afs, linux-fsdevel, linux-mm,
	Ryan Roberts
In-Reply-To: <172ef809d9976b067bba4cd9d2b78410c6c6d03d.1773944114.git.ljs@kernel.org>

On 3/19/26 19:23, Lorenzo Stoakes (Oracle) wrote:
> This documentation makes it easier for a driver/file system implementer to
> correctly use this callback.
> 
> It covers the fundamentals, whilst intentionally leaving the less lovely
> possible actions one might take undocumented (for instance - the
> success_hook, error_hook fields in mmap_action).
> 
> The document also covers the new VMA flags implementation which is the
> only one which will work correctly with mmap_prepare.
> 
> Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>

Acked-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>


^ permalink raw reply

* Re: [PATCH v3 01/16] mm: various small mmap_prepare cleanups
From: Vlastimil Babka (SUSE) @ 2026-03-20 17:10 UTC (permalink / raw)
  To: Lorenzo Stoakes (Oracle), Andrew Morton
  Cc: Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
	Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
	Alexandre Torgue, Miquel Raynal, Richard Weinberger,
	Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
	David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
	Jan Kara, David Hildenbrand, Liam R . Howlett, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Jann Horn, Pedro Falcato,
	linux-kernel, linux-doc, linux-hyperv, linux-stm32,
	linux-arm-kernel, linux-mtd, linux-staging, linux-scsi,
	target-devel, linux-afs, linux-fsdevel, linux-mm, Ryan Roberts
In-Reply-To: <498a579bfbcbb8b0e4a9c39243b4454347f03a46.1773944114.git.ljs@kernel.org>

On 3/19/26 19:23, Lorenzo Stoakes (Oracle) wrote:
> Rather than passing arbitrary fields, pass a vm_area_desc pointer to mmap
> prepare functions, and an action and vma pointer to mmap
> complete in order to put all the action-specific logic in the function
> actually doing the work.
> 
> Additionally, allow mmap prepare functions to return an error so we can
> error out as soon as possible if there is something logically incorrect in
> the input.
> 
> Update remap_pfn_range_prepare() to properly check the input range for the
> CoW case.
> 
> Also remove io_remap_pfn_range_complete(), as we can simply set up the
> fields correctly in io_remap_pfn_range_prepare() and use
> remap_pfn_range_complete() for this.
> 
> While we're here, make remap_pfn_range_prepare_vma() a little neater, and
> pass mmap_action directly to call_action_complete().
> 
> Then, update compat_vma_mmap() to perform its logic directly, as
> __compat_vma_mmap() is not used by anything so we don't need to export it.
> 
> Also update compat_vma_mmap() to use vfs_mmap_prepare() rather than
> calling the mmap_prepare op directly.
> 
> Finally, update the VMA userland tests to reflect the changes.
> 
> Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>

Acked-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>


^ permalink raw reply

* Re: [GIT PULL] Hyper-V fixes for v7.0-rc5
From: pr-tracker-bot @ 2026-03-20 16:25 UTC (permalink / raw)
  To: Wei Liu
  Cc: Linus Torvalds, Wei Liu, Linux on Hyper-V List, Linux Kernel List,
	kys, haiyangz, decui, longli
In-Reply-To: <20260320051524.GA759166@liuwe-devbox-debian-v2.local>

The pull request you sent on Fri, 20 Mar 2026 05:15:24 +0000:

> ssh://git@gitolite.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git tags/hyperv-fixes-signed-20260319

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/c3d13784d5b200fc4b4a1f5d5f5585b8e3a5777e

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html

^ permalink raw reply

* [PATCH net-next] net: mana: Use at least SZ_4K in doorbell ID range check
From: Erni Sri Satya Vennela @ 2026-03-20 12:21 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, horms, shradhagupta, dipayanroy,
	shirazsaleem, kotaranov, yury.norov, kees, linux-hyperv, netdev,
	linux-kernel
  Cc: Erni Sri Satya Vennela

mana_gd_ring_doorbell() accesses doorbell offsets up to 0xFF8 + 8 = 4KB
within a doorbell page. When db_page_size is zero, the validation check
in mana_gd_register_device() reduces to:
  db_page_off + 0 > bar0_size
which passes, even though mana_gd_ring_doorbell() will access
[db_page_off, db_page_off + 4KB) and may go beyond BAR0.

Use max(SZ_4K, db_page_size) in the range check so that a zero or
unexpectedly small db_page_size still results in a rejection when the
doorbell page would fall outside BAR0.

Fixes: 89fe91c65992 ("net: mana: hardening: Validate doorbell ID from GDMA_REGISTER_DEVICE response")
Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
---
 drivers/net/ethernet/microsoft/mana/gdma_main.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 2ba1fa3336f9..49ea3dcbf74a 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -4,6 +4,7 @@
 #include <linux/debugfs.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/sizes.h>
 #include <linux/utsname.h>
 #include <linux/version.h>
 #include <linux/msi.h>
@@ -1255,6 +1256,7 @@ int mana_gd_register_device(struct gdma_dev *gd)
 	struct gdma_context *gc = gd->gdma_context;
 	struct gdma_register_device_resp resp = {};
 	struct gdma_general_req req = {};
+	u64 db_page_sz;
 	int err;
 
 	gd->pdid = INVALID_PDID;
@@ -1278,8 +1280,14 @@ int mana_gd_register_device(struct gdma_dev *gd)
 	 *   addr = db_page_base + db_page_size * db_id
 	 *        = (bar0_va + db_page_off) + (db_page_size * db_id)
 	 * So we need: db_page_off + db_page_size * (db_id + 1) <= bar0_size
+	 *
+	 * mana_gd_ring_doorbell() always accesses [offset, offset + 4KB),
+	 * so use at least SZ_4K to catch a zero or small db_page_size.
 	 */
-	if (gc->db_page_off + gc->db_page_size * ((u64)resp.db_id + 1) > gc->bar0_size) {
+	db_page_sz = max_t(u64, SZ_4K, gc->db_page_size);
+
+	if (gc->db_page_off + db_page_sz * ((u64)resp.db_id + 1) >
+	    gc->bar0_size) {
 		dev_err(gc->dev, "Doorbell ID %u out of range\n", resp.db_id);
 		return -EPROTO;
 	}
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH net-next v4] net: mana: Expose hardware diagnostic info via debugfs
From: Simon Horman @ 2026-03-20  9:55 UTC (permalink / raw)
  To: Erni Sri Satya Vennela
  Cc: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, kotaranov, shradhagupta, shirazsaleem,
	dipayanroy, yury.norov, kees, ssengar, gargaditya, linux-hyperv,
	netdev, linux-kernel, linux-rdma
In-Reply-To: <20260319070926.1459515-1-ernis@linux.microsoft.com>

On Thu, Mar 19, 2026 at 12:09:13AM -0700, Erni Sri Satya Vennela wrote:
> Add debugfs entries to expose hardware configuration and diagnostic
> information that aids in debugging driver initialization and runtime
> operations without adding noise to dmesg.
> 
> The debugfs directory creation and removal for each PCI device is
> integrated into mana_gd_setup() and mana_gd_cleanup_device()
> respectively, so that all callers (probe, remove, suspend, resume,
> shutdown) share a single code path.
> 
> Device-level entries (under /sys/kernel/debug/mana/<slot>/):
>   - num_msix_usable, max_num_queues: Max resources from hardware
>   - gdma_protocol_ver, pf_cap_flags1: VF version negotiation results
>   - num_vports, bm_hostmode: Device configuration
> 
> Per-vPort entries (under /sys/kernel/debug/mana/<slot>/vportN/):
>   - port_handle: Hardware vPort handle
>   - max_sq, max_rq: Max queues from vPort config
>   - indir_table_sz: Indirection table size
>   - steer_rx, steer_rss, steer_update_tab, steer_cqe_coalescing:
>     Last applied steering configuration parameters
> 
> Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
> ---
> Changes in v4:
> * Rebase and fix conflicts.
> Changes in v3:
> * Rename mana_gd_cleanup to mana_gd_cleanup_device.
> * Add creation of debugfs entries in mana_gd_setup.
> * Add removal of debugfs entries in mana_gd_cleanup_device.
> * Remove bm_hostmode and num_vports from debugfs in mana_remove itself,
>   because "ac" gets freed before debugfs_remove_recursive, to avoid
>   Use-After-Free error.
> * Add "goto out:" in mana_cfg_vport_steering to avoid populating apc
>   values when resp.hdr.status is non-zero.

Thanks for the updates.

Reviewed-by: Simon Horman <horms@kernel.org>


^ permalink raw reply

* Re: [PATCH v2] PCI: hv: Set default NUMA node to 0 for devices without affinity info
From: Wei Liu @ 2026-03-20  5:17 UTC (permalink / raw)
  To: Long Li
  Cc: K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Lorenzo Pieralisi, Krzysztof Wilczyński,
	Manivannan Sadhasivam, Bjorn Helgaas, Rob Herring, Michael Kelley,
	linux-hyperv, linux-pci, linux-kernel
In-Reply-To: <20260316210742.1240128-1-longli@microsoft.com>

On Mon, Mar 16, 2026 at 02:07:42PM -0700, Long Li wrote:
> When hv_pci_assign_numa_node() processes a device that does not have
> HV_PCI_DEVICE_FLAG_NUMA_AFFINITY set or has an out-of-range
> virtual_numa_node, the device NUMA node is left unset. On x86_64,
> the uninitialized default happens to be 0, but on ARM64 it is
> NUMA_NO_NODE (-1).
> 
> Tests show that when no NUMA information is available from the Hyper-V
> host, devices perform best when assigned to node 0. With NUMA_NO_NODE
> the kernel may spread work across NUMA nodes, which degrades
> performance on Hyper-V, particularly for high-throughput devices like
> MANA.
> 
> Always set the device NUMA node to 0 before the conditional NUMA
> affinity check, so that devices get a performant default when the host
> provides no NUMA information, and behavior is consistent on both
> x86_64 and ARM64.
> 
> Fixes: 999dd956d838 ("PCI: hv: Add support for protocol 1.3 and support PCI_BUS_RELATIONS2")
> Signed-off-by: Long Li <longli@microsoft.com>

I can pick this up next week. PCI maintainers, if you want this to go
through your tree instead, please let me know.

Wei

> ---
> Changes in v2:
> - Rewrite commit message to focus on performance as the primary
>   motivation: NUMA_NO_NODE causes the kernel to spread work across
>   NUMA nodes, degrading performance on Hyper-V
> 
>  drivers/pci/controller/pci-hyperv.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
> 
> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
> index 2c7a406b4ba8..38a790f642a1 100644
> --- a/drivers/pci/controller/pci-hyperv.c
> +++ b/drivers/pci/controller/pci-hyperv.c
> @@ -2485,6 +2485,14 @@ static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus)
>  		if (!hv_dev)
>  			continue;
>  
> +		/*
> +		 * If the Hyper-V host doesn't provide a NUMA node for the
> +		 * device, default to node 0. With NUMA_NO_NODE the kernel
> +		 * may spread work across NUMA nodes, which degrades
> +		 * performance on Hyper-V.
> +		 */
> +		set_dev_node(&dev->dev, 0);
> +
>  		if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY &&
>  		    hv_dev->desc.virtual_numa_node < num_possible_nodes())
>  			/*
> -- 
> 2.43.0
> 

^ permalink raw reply

* [GIT PULL] Hyper-V fixes for v7.0-rc5
From: Wei Liu @ 2026-03-20  5:15 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Wei Liu, Linux on Hyper-V List, Linux Kernel List, kys, haiyangz,
	decui, longli

Hi Linus

The following changes since commit 6de23f81a5e08be8fbf5e8d7e9febc72a5b5f27f:

  Linux 7.0-rc1 (2026-02-22 13:18:59 -0800)

are available in the Git repository at:

  ssh://git@gitolite.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git tags/hyperv-fixes-signed-20260319

for you to fetch changes up to c0e296f257671ba10249630fe58026f29e4804d9:

  mshv: Fix error handling in mshv_region_pin (2026-03-18 16:18:49 +0000)

----------------------------------------------------------------
hyperv-fixes for v7.0-rc5
 - Fix ARM64 MSHV support (Anirudh Rayabharam)
 - Fix MSHV driver memory handling issues (Stanislav Kinsburskii)
 - Update maintainers for Hyper-V DRM driver (Saurabh Sengar)
 - Misc cleanup in MSHV crashdump code (Ard Biesheuvel, Uros Bizjak)
 - Minor improvements to MSHV code (Mukesh R, Wei Liu)
 - Revert not yet released MSHV scrub partition hypercall (Wei Liu)
----------------------------------------------------------------
Anirudh Rayabharam (Microsoft) (2):
      mshv: refactor synic init and cleanup
      mshv: add arm64 support for doorbell & intercept SINTs

Ard Biesheuvel (1):
      x86/hyperv: Use __naked attribute to fix stackless C function

Mukesh R (1):
      mshv: pass struct mshv_user_mem_region by reference

Saurabh Sengar (1):
      MAINTAINERS: Update maintainers for Hyper-V DRM driver

Stanislav Kinsburskii (2):
      mshv: Fix use-after-free in mshv_map_user_memory error path
      mshv: Fix error handling in mshv_region_pin

Uros Bizjak (3):
      x86/hyperv: Save segment registers directly to memory in hv_hvcrash_ctxt_save()
      x86/hyperv: Use current_stack_pointer to avoid asm() in hv_hvcrash_ctxt_save()
      x86/hyperv: Use any general-purpose register when saving %cr2 and %cr8

Wei Liu (2):
      x86/hyperv: print out reserved vectors in hexadecimal
      Revert "mshv: expose the scrub partition hypercall"

 MAINTAINERS                    |   4 +-
 arch/x86/hyperv/hv_crash.c     | 118 +++++++++++++-------------
 arch/x86/kernel/cpu/mshyperv.c |   5 +-
 drivers/hv/mshv_regions.c      |   6 +-
 drivers/hv/mshv_root.h         |   5 +-
 drivers/hv/mshv_root_main.c    |  93 +++++---------------
 drivers/hv/mshv_synic.c        | 188 +++++++++++++++++++++++++++++++++++++----
 include/hyperv/hvgdk_mini.h    |   3 +-
 8 files changed, 270 insertions(+), 152 deletions(-)

^ permalink raw reply

* Re: [PATCH v3 00/16] mm: expand mmap_prepare functionality and usage
From: Andrew Morton @ 2026-03-19 20:31 UTC (permalink / raw)
  To: Lorenzo Stoakes (Oracle)
  Cc: Jonathan Corbet, Clemens Ladisch, Arnd Bergmann,
	Greg Kroah-Hartman, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Alexander Shishkin, Maxime Coquelin,
	Alexandre Torgue, Miquel Raynal, Richard Weinberger,
	Vignesh Raghavendra, Bodo Stroesser, Martin K . Petersen,
	David Howells, Marc Dionne, Alexander Viro, Christian Brauner,
	Jan Kara, David Hildenbrand, Liam R . Howlett, Vlastimil Babka,
	Mike Rapoport, Suren Baghdasaryan, Michal Hocko, Jann Horn,
	Pedro Falcato, linux-kernel, linux-doc, linux-hyperv, linux-stm32,
	linux-arm-kernel, linux-mtd, linux-staging, linux-scsi,
	target-devel, linux-afs, linux-fsdevel, linux-mm, Ryan Roberts
In-Reply-To: <cover.1773944114.git.ljs@kernel.org>

On Thu, 19 Mar 2026 18:23:24 +0000 "Lorenzo Stoakes (Oracle)" <ljs@kernel.org> wrote:

> This series expands the mmap_prepare functionality, which is intended to
> replace the deprecated f_op->mmap hook which has been the source of bugs
> and security issues for some time.
> 

Thanks, I updated mm.git's mm-new branch to this version.
> 
> v3:
> * Propagated tags (thanks Suren, Richard!)
> * Updated 12/16 to correctly clear the vm_area_desc data structure in
>   set_desc_from_vma() as per Joshua Hahn (thanks! :)
> * Fixed type in 12/16 as per Suren (cheers!)
> * Fixed up 6/16 to use mmap_action_ioremap_full() in simple_ioremap_prepare() as
>   suggested by Suren.
> * Also fixed up 6/16 to call io_remap_pfn_range_prepare() direct rather than
>   mmap_action_prepare() as per Suren.
> * Also fixed up 6/16 to pass vm_len rather than vm_[start, end] to
>   __simple_ioremap_prep() as per Suren (thanks for all the above! :)
> * Fixed issue in rmap lock being held - we were referencing a vma->vm_file after
>   the VMA was unmapped, so UAF. Avoid that. Also do_munmap() relies on rmap lock
>   NOT being held or may deadlock, so extend functionality to ensure we drop it
>   when it is held on error paths.
> * Updated 'area' -> 'vma' variable in 3/16 in VMA test dup.h.
> * Fixed up reference to __compat_vma_mmap() in 12/16 commit message.
> * Updated 1/16 to no longer duplicatively apply io_remap_pfn_range_pfn().
> * Updated 1/16 to delegate I/O remap complete to remap complete logic.
> * Fixed various typos in 12/16.
> * Fixed stale comment typos in 13/16.
> * Fixed commit msg and comment typos in 14/16.
> * Removed accidental sneak peek at future functionality in 15/16 commit message
>   :).
> * Fixed up field names to be identical in VMA tests + mm_types.h in 6/16,
>   15/16.

Here's how v3 altered mm.git:


 drivers/hv/vmbus_drv.c            |    6 +-
 drivers/target/target_core_user.c |    2 
 include/linux/mm.h                |    3 -
 include/linux/uio_driver.h        |    2 
 mm/internal.h                     |   22 ++++-----
 mm/memory.c                       |   16 +++----
 mm/util.c                         |   62 ++++++++++++++--------------
 mm/vma.c                          |   24 +++-------
 tools/testing/vma/include/dup.h   |   19 ++++----
 tools/testing/vma/include/stubs.h |    3 -
 10 files changed, 77 insertions(+), 82 deletions(-)

--- a/drivers/hv/vmbus_drv.c~b
+++ a/drivers/hv/vmbus_drv.c
@@ -1955,8 +1955,8 @@ static int hv_mmap_ring_buffer_wrapper(s
 	int err;
 
 	/*
-	 * hv_(create|remove)_ring_sysfs implementation ensures that mmap_ring_buffer
-	 * is not NULL.
+	 * hv_(create|remove)_ring_sysfs implementation ensures that
+	 * mmap_prepare_ring_buffer is not NULL.
 	 */
 	compat_set_desc_from_vma(&desc, filp, vma);
 	err = channel->mmap_prepare_ring_buffer(channel, &desc);
@@ -2055,7 +2055,7 @@ static const struct kobj_type vmbus_chan
 /**
  * hv_create_ring_sysfs() - create "ring" sysfs entry corresponding to ring buffers for a channel.
  * @channel: Pointer to vmbus_channel structure
- * @hv_mmap_ring_buffer: function pointer for initializing the function to be called on mmap
+ * @hv_mmap_prepare_ring_buffer: function pointer for initializing the function to be called on mmap
  *                       channel's "ring" sysfs node, which is for the ring buffer of that channel.
  *                       Function pointer is of below type:
  *                       int (*hv_mmap_prepare_ring_buffer)(struct vmbus_channel *channel,
--- a/drivers/target/target_core_user.c~b
+++ a/drivers/target/target_core_user.c
@@ -1865,7 +1865,7 @@ static int tcmu_vma_mapped(unsigned long
 {
 	struct tcmu_dev *udev = *vm_private_data;
 
-	pr_debug("vma_open\n");
+	pr_debug("vma_mapped\n");
 
 	kref_get(&udev->kref);
 	return 0;
--- a/include/linux/mm.h~b
+++ a/include/linux/mm.h
@@ -4405,7 +4405,8 @@ static inline void mmap_action_map_kerne
 
 int mmap_action_prepare(struct vm_area_desc *desc);
 int mmap_action_complete(struct vm_area_struct *vma,
-			 struct mmap_action *action);
+			 struct mmap_action *action,
+			 bool rmap_lock_held);
 
 /* Look up the first VMA which exactly match the interval vm_start ... vm_end */
 static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
--- a/include/linux/uio_driver.h~b
+++ a/include/linux/uio_driver.h
@@ -97,7 +97,7 @@ struct uio_device {
  * @irq_flags:		flags for request_irq()
  * @priv:		optional private data
  * @handler:		the device's irq handler
- * @mmap_prepare:	mmap_pepare operation for this uio device
+ * @mmap_prepare:	mmap_prepare operation for this uio device
  * @open:		open operation for this uio device
  * @release:		release operation for this uio device
  * @irqcontrol:		disable/enable irqs when 0/1 is written to /dev/uioX
--- a/mm/internal.h~b
+++ a/mm/internal.h
@@ -1805,29 +1805,25 @@ int remap_pfn_range_prepare(struct vm_ar
 int remap_pfn_range_complete(struct vm_area_struct *vma,
 			     struct mmap_action *action);
 int simple_ioremap_prepare(struct vm_area_desc *desc);
-/* No simple_ioremap_complete, is ultimately handled by remap complete. */
 
 static inline int io_remap_pfn_range_prepare(struct vm_area_desc *desc)
 {
 	struct mmap_action *action = &desc->action;
 	const unsigned long orig_pfn = action->remap.start_pfn;
+	const pgprot_t orig_pgprot = action->remap.pgprot;
 	const unsigned long size = action->remap.size;
 	const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size);
+	int err;
 
 	action->remap.start_pfn = pfn;
-	return remap_pfn_range_prepare(desc);
-}
-
-static inline int io_remap_pfn_range_complete(struct vm_area_struct *vma,
-					      struct mmap_action *action)
-{
-	const unsigned long size = action->remap.size;
-	const unsigned long orig_pfn = action->remap.start_pfn;
-	const pgprot_t orig_prot = vma->vm_page_prot;
+	action->remap.pgprot = pgprot_decrypted(orig_pgprot);
+	err = remap_pfn_range_prepare(desc);
+	if (err)
+		return err;
 
-	action->remap.pgprot = pgprot_decrypted(orig_prot);
-	action->remap.start_pfn  = io_remap_pfn_range_pfn(orig_pfn, size);
-	return remap_pfn_range_complete(vma, action);
+	/* Remap does the actual work. */
+	action->type = MMAP_REMAP_PFN;
+	return 0;
 }
 
 #ifdef CONFIG_MMU_NOTIFIER
--- a/mm/memory.c~b
+++ a/mm/memory.c
@@ -3207,11 +3207,10 @@ int remap_pfn_range_complete(struct vm_a
 	return do_remap_pfn_range(vma, start, pfn, size, prot);
 }
 
-static int __simple_ioremap_prep(unsigned long vm_start, unsigned long vm_end,
-				 pgoff_t vm_pgoff, phys_addr_t start_phys,
-				 unsigned long size, unsigned long *pfnp)
+static int __simple_ioremap_prep(unsigned long vm_len, pgoff_t vm_pgoff,
+				 phys_addr_t start_phys, unsigned long size,
+				 unsigned long *pfnp)
 {
-	const unsigned long vm_len = vm_end - vm_start;
 	unsigned long pfn, pages;
 
 	/* Check that the physical memory area passed in looks valid */
@@ -3250,14 +3249,14 @@ int simple_ioremap_prepare(struct vm_are
 	unsigned long pfn;
 	int err;
 
-	err = __simple_ioremap_prep(desc->start, desc->end, desc->pgoff,
+	err = __simple_ioremap_prep(vma_desc_size(desc), desc->pgoff,
 				    start, size, &pfn);
 	if (err)
 		return err;
 
 	/* The I/O remap logic does the heavy lifting. */
-	mmap_action_ioremap(desc, desc->start, pfn, vma_desc_size(desc));
-	return mmap_action_prepare(desc);
+	mmap_action_ioremap_full(desc, pfn);
+	return io_remap_pfn_range_prepare(desc);
 }
 
 /**
@@ -3283,8 +3282,7 @@ int vm_iomap_memory(struct vm_area_struc
 	unsigned long pfn;
 	int err;
 
-	err = __simple_ioremap_prep(vm_start, vm_end, vma->vm_pgoff, start,
-				    len, &pfn);
+	err = __simple_ioremap_prep(vm_len, vma->vm_pgoff, start, len, &pfn);
 	if (err)
 		return err;
 
--- a/mm/util.c~b
+++ a/mm/util.c
@@ -1181,6 +1181,8 @@ void compat_set_desc_from_vma(struct vm_
 			      const struct file *file,
 			      const struct vm_area_struct *vma)
 {
+	memset(desc, 0, sizeof(*desc));
+
 	desc->mm = vma->vm_mm;
 	desc->file = (struct file *)file;
 	desc->start = vma->vm_start;
@@ -1224,7 +1226,7 @@ static int __compat_vma_mapped(struct fi
  * @vma: The VMA to which @desc should be applied.
  *
  * The function assumes that you have obtained a VMA descriptor @desc from
- * compt_set_desc_from_vma(), and already executed the mmap_prepare() hook upon
+ * compat_set_desc_from_vma(), and already executed the mmap_prepare() hook upon
  * it.
  *
  * It then performs any specified mmap actions, and invokes the vm_ops->mapped()
@@ -1249,7 +1251,8 @@ int __compat_vma_mmap(struct vm_area_des
 	/* Update the VMA from the descriptor. */
 	compat_set_vma_from_desc(vma, desc);
 	/* Complete any specified mmap actions. */
-	err = mmap_action_complete(vma, &desc->action);
+	err = mmap_action_complete(vma, &desc->action,
+				   /*rmap_lock_held=*/false);
 	if (err)
 		return err;
 
@@ -1397,29 +1400,31 @@ again:
 }
 
 static int mmap_action_finish(struct vm_area_struct *vma,
-			      struct mmap_action *action, int err)
+			      struct mmap_action *action, int err,
+			      bool rmap_lock_held)
 {
+	if (rmap_lock_held)
+		i_mmap_unlock_write(vma->vm_file->f_mapping);
+
+	if (!err) {
+		if (action->success_hook)
+			return action->success_hook(vma);
+		return 0;
+	}
+
 	/*
 	 * If an error occurs, unmap the VMA altogether and return an error. We
 	 * only clear the newly allocated VMA, since this function is only
 	 * invoked if we do NOT merge, so we only clean up the VMA we created.
 	 */
-	if (err) {
-		unmap_vma_locked(vma);
-		if (action->error_hook) {
-			/* We may want to filter the error. */
-			err = action->error_hook(err);
-
-			/* The caller should not clear the error. */
-			VM_WARN_ON_ONCE(!err);
-		}
-		return err;
+	unmap_vma_locked(vma);
+	if (action->error_hook) {
+		/* We may want to filter the error. */
+		err = action->error_hook(err);
+		/* The caller should not clear the error. */
+		VM_WARN_ON_ONCE(!err);
 	}
-
-	if (action->success_hook)
-		return action->success_hook(vma);
-
-	return 0;
+	return err;
 }
 
 #ifdef CONFIG_MMU
@@ -1454,13 +1459,15 @@ EXPORT_SYMBOL(mmap_action_prepare);
  * mmap_action_complete - Execute VMA descriptor action.
  * @vma: The VMA to perform the action upon.
  * @action: The action to perform.
+ * @rmap_lock_held: Is the file rmap lock held?
  *
  * Similar to mmap_action_prepare().
  *
  * Return: 0 on success, or error, at which point the VMA will be unmapped.
  */
 int mmap_action_complete(struct vm_area_struct *vma,
-			 struct mmap_action *action)
+			 struct mmap_action *action,
+			 bool rmap_lock_held)
 
 {
 	int err = 0;
@@ -1471,23 +1478,19 @@ int mmap_action_complete(struct vm_area_
 	case MMAP_REMAP_PFN:
 		err = remap_pfn_range_complete(vma, action);
 		break;
-	case MMAP_IO_REMAP_PFN:
-		err = io_remap_pfn_range_complete(vma, action);
-		break;
 	case MMAP_MAP_KERNEL_PAGES:
 		err = map_kernel_pages_complete(vma, action);
 		break;
+	case MMAP_IO_REMAP_PFN:
 	case MMAP_SIMPLE_IO_REMAP:
-		/*
-		 * The simple I/O remap should have been delegated to an I/O
-		 * remap.
-		 */
+		/* Should have been delegated. */
 		WARN_ON_ONCE(1);
 		err = -EINVAL;
 		break;
 	}
 
-	return mmap_action_finish(vma, action, err);
+	return mmap_action_finish(vma, action, err,
+				  rmap_lock_held);
 }
 EXPORT_SYMBOL(mmap_action_complete);
 #else
@@ -1509,7 +1512,8 @@ int mmap_action_prepare(struct vm_area_d
 EXPORT_SYMBOL(mmap_action_prepare);
 
 int mmap_action_complete(struct vm_area_struct *vma,
-			 struct mmap_action *action)
+			 struct mmap_action *action,
+			 bool rmap_lock_held)
 {
 	int err = 0;
 
@@ -1525,7 +1529,7 @@ int mmap_action_complete(struct vm_area_
 		break;
 	}
 
-	return mmap_action_finish(vma, action, err);
+	return mmap_action_finish(vma, action, err, rmap_lock_held);
 }
 EXPORT_SYMBOL(mmap_action_complete);
 #endif
--- a/mm/vma.c~b
+++ a/mm/vma.c
@@ -2732,7 +2732,8 @@ static bool can_set_ksm_flags_early(stru
 	return false;
 }
 
-static int call_mapped_hook(struct vm_area_struct *vma)
+static int call_mapped_hook(struct mmap_state *map,
+			    struct vm_area_struct *vma)
 {
 	const struct vm_operations_struct *vm_ops = vma->vm_ops;
 	void *vm_private_data = vma->vm_private_data;
@@ -2743,6 +2744,9 @@ static int call_mapped_hook(struct vm_ar
 	err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff,
 			     vma->vm_file, &vm_private_data);
 	if (err) {
+		if (map->hold_file_rmap_lock)
+			i_mmap_unlock_write(vma->vm_file->f_mapping);
+
 		unmap_vma_locked(vma);
 		return err;
 	}
@@ -2752,17 +2756,6 @@ static int call_mapped_hook(struct vm_ar
 	return 0;
 }
 
-static void maybe_drop_file_rmap_lock(struct mmap_state *map,
-				      struct vm_area_struct *vma)
-{
-	struct file *file;
-
-	if (!map->hold_file_rmap_lock)
-		return;
-	file = vma->vm_file;
-	i_mmap_unlock_write(file->f_mapping);
-}
-
 static unsigned long __mmap_region(struct file *file, unsigned long addr,
 		unsigned long len, vma_flags_t vma_flags,
 		unsigned long pgoff, struct list_head *uf)
@@ -2814,11 +2807,10 @@ static unsigned long __mmap_region(struc
 	__mmap_complete(&map, vma);
 
 	if (have_mmap_prepare && allocated_new) {
-		error = mmap_action_complete(vma, &desc.action);
+		error = mmap_action_complete(vma, &desc.action,
+					     map.hold_file_rmap_lock);
 		if (!error)
-			error = call_mapped_hook(vma);
-
-		maybe_drop_file_rmap_lock(&map, vma);
+			error = call_mapped_hook(&map, vma);
 		if (error)
 			return error;
 	}
--- a/tools/testing/vma/include/dup.h~b
+++ a/tools/testing/vma/include/dup.h
@@ -470,13 +470,13 @@ struct mmap_action {
 			pgprot_t pgprot;
 		} remap;
 		struct {
-			phys_addr_t start;
-			unsigned long len;
+			phys_addr_t start_phys_addr;
+			unsigned long size;
 		} simple_ioremap;
 		struct {
 			unsigned long start;
 			struct page **pages;
-			unsigned long num;
+			unsigned long nr_pages;
 			pgoff_t pgoff;
 		} map_kernel;
 	};
@@ -648,7 +648,7 @@ struct vm_operations_struct {
 	 * upon first mapping a VMA.
 	 * Context: User context.  May sleep.  Caller holds mmap_lock.
 	 */
-	void (*open)(struct vm_area_struct * area);
+	void (*open)(struct vm_area_struct *vma);
 	/**
 	 * @close: Called when the VMA is being removed from the MM.
 	 * Context: User context.  May sleep.  Caller holds mmap_lock.
@@ -672,8 +672,8 @@ struct vm_operations_struct {
 	int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
 		      const struct file *file, void **vm_private_data);
 	/* Called any time before splitting to check if it's allowed */
-	int (*may_split)(struct vm_area_struct *area, unsigned long addr);
-	int (*mremap)(struct vm_area_struct *area);
+	int (*may_split)(struct vm_area_struct *vma, unsigned long addr);
+	int (*mremap)(struct vm_area_struct *vma);
 	/*
 	 * Called by mprotect() to make driver-specific permission
 	 * checks before mprotect() is finalised.   The VMA must not
@@ -685,7 +685,7 @@ struct vm_operations_struct {
 	vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
 	vm_fault_t (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
-	unsigned long (*pagesize)(struct vm_area_struct * area);
+	unsigned long (*pagesize)(struct vm_area_struct *vma);
 
 	/* notification that a previously read-only page is about to become
 	 * writable, if an error is returned it will cause a SIGBUS */
@@ -1288,6 +1288,8 @@ static inline void compat_set_desc_from_
 			      const struct file *file,
 			      const struct vm_area_struct *vma)
 {
+	memset(desc, 0, sizeof(*desc));
+
 	desc->mm = vma->vm_mm;
 	desc->file = (struct file *)file;
 	desc->start = vma->vm_start;
@@ -1342,7 +1344,8 @@ static inline int __compat_vma_mmap(stru
 	/* Update the VMA from the descriptor. */
 	compat_set_vma_from_desc(vma, desc);
 	/* Complete any specified mmap actions. */
-	err = mmap_action_complete(vma, &desc->action);
+	err = mmap_action_complete(vma, &desc->action,
+				   /*rmap_lock_held=*/false);
 	if (err)
 		return err;
 
--- a/tools/testing/vma/include/stubs.h~b
+++ a/tools/testing/vma/include/stubs.h
@@ -87,7 +87,8 @@ static inline int mmap_action_prepare(st
 }
 
 static inline int mmap_action_complete(struct vm_area_struct *vma,
-				       struct mmap_action *action)
+				       struct mmap_action *action,
+				       bool rmap_lock_held)
 {
 	return 0;
 }
_


^ permalink raw reply

* [PATCH 55/55] drivers: hv: dxgkrnl: Code cleanup for upstream submission
From: Eric Curtin @ 2026-03-19 20:25 UTC (permalink / raw)
  To: linux-hyperv; +Cc: linux-kernel, iourit, wei.liu, decui, haiyangz
In-Reply-To: <20260319202509.63802-1-eric.curtin@docker.com>

Address issues raised in previous LKML submission attempts (v1-v3):

- Replace deprecated one-element arrays [1] with C99 flexible arrays []
  in dxgvmbus.h and dxgkrnl.h
- Replace %px with %p in DXG_TRACE calls (avoids exposing kernel layout)
- Remove unnecessary braces from single-statement if blocks
- Remove LINUX_VERSION_CODE guard: max_pkt_size exists in all supported kernels
- Remove unused linux/version.h include from dxgkrnl.h
- Fix whitespace (space before tab) in dxgvmbus.h and d3dkmthk.h
- Replace DXG_ERR non-debug macro do{}while(0) with direct dev_err call
- Change -EBADE to -ENODEV for global channel duplicate detection
- Remove MODULE_VERSION as it is not recommended for in-tree drivers
- Add explanatory comment to guid_to_luid() cast
- Update MAINTAINERS email to iourit@linux.microsoft.com

Signed-off-by: Iouri Tarassov <iourit@linux.microsoft.com>
---
 MAINTAINERS                     |  2 +-
 drivers/hv/dxgkrnl/dxgadapter.c |  8 ++++----
 drivers/hv/dxgkrnl/dxgkrnl.h    | 13 +++++--------
 drivers/hv/dxgkrnl/dxgmodule.c  |  5 ++---
 drivers/hv/dxgkrnl/dxgvmbus.c   |  5 +----
 drivers/hv/dxgkrnl/dxgvmbus.h   | 26 +++++++++++++-------------
 drivers/hv/dxgkrnl/hmgr.c       |  3 +--
 include/uapi/misc/d3dkmthk.h    |  2 +-
 8 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 4fe0b3501931..493c65a02b80 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9772,7 +9772,7 @@ F:	drivers/mtd/hyperbus/
 F:	include/linux/mtd/hyperbus.h
 
 Hyper-V vGPU DRIVER
-M:	Iouri Tarassov <iourit@microsoft.com>
+M:	Iouri Tarassov <iourit@linux.microsoft.com>
 L:	linux-hyperv@vger.kernel.org
 S:	Supported
 F:	drivers/hv/dxgkrnl/
diff --git a/drivers/hv/dxgkrnl/dxgadapter.c b/drivers/hv/dxgkrnl/dxgadapter.c
index 6d3cabb24e6f..d395fdcb63fa 100644
--- a/drivers/hv/dxgkrnl/dxgadapter.c
+++ b/drivers/hv/dxgkrnl/dxgadapter.c
@@ -136,7 +136,7 @@ void dxgadapter_release(struct kref *refcount)
 	struct dxgadapter *adapter;
 
 	adapter = container_of(refcount, struct dxgadapter, adapter_kref);
-	DXG_TRACE("Destroying adapter: %px", adapter);
+	DXG_TRACE("Destroying adapter: %p", adapter);
 	kfree(adapter);
 }
 
@@ -271,7 +271,7 @@ struct dxgdevice *dxgdevice_create(struct dxgadapter *adapter,
 			kref_put(&device->device_kref, dxgdevice_release);
 			device = NULL;
 		} else {
-			DXG_TRACE("dxgdevice created: %px", device);
+			DXG_TRACE("dxgdevice created: %p", device);
 		}
 	}
 	return device;
@@ -720,7 +720,7 @@ void dxgdevice_release(struct kref *refcount)
 	struct dxgdevice *device;
 
 	device = container_of(refcount, struct dxgdevice, device_kref);
-	DXG_TRACE("Destroying device: %px", device);
+	DXG_TRACE("Destroying device: %p", device);
 	kref_put(&device->adapter->adapter_kref, dxgadapter_release);
 	kfree(device);
 }
@@ -1103,7 +1103,7 @@ int dxgprocess_adapter_add_device(struct dxgprocess *process,
 
 void dxgprocess_adapter_remove_device(struct dxgdevice *device)
 {
-	DXG_TRACE("Removing device: %px", device);
+	DXG_TRACE("Removing device: %p", device);
 	mutex_lock(&device->adapter_info->device_list_mutex);
 	if (device->device_list_entry.next) {
 		list_del(&device->device_list_entry);
diff --git a/drivers/hv/dxgkrnl/dxgkrnl.h b/drivers/hv/dxgkrnl/dxgkrnl.h
index d816a875d5ab..4a4605f45736 100644
--- a/drivers/hv/dxgkrnl/dxgkrnl.h
+++ b/drivers/hv/dxgkrnl/dxgkrnl.h
@@ -27,7 +27,6 @@
 #include <linux/pci.h>
 #include <linux/hyperv.h>
 #include <uapi/misc/d3dkmthk.h>
-#include <linux/version.h>
 #include "misc.h"
 #include "hmgr.h"
 #include <uapi/misc/d3dkmthk.h>
@@ -719,7 +718,7 @@ bool dxgresource_is_active(struct dxgresource *res);
 
 struct privdata {
 	u32 data_size;
-	u8 data[1];
+	u8 data[];
 };
 
 struct dxgallocation {
@@ -769,9 +768,9 @@ long dxgk_unlocked_ioctl(struct file *f, unsigned int p1, unsigned long p2);
 
 int dxg_unmap_iospace(void *va, u32 size);
 /*
- * The convention is that VNBus instance id is a GUID, but the host sets
- * the lower part of the value to the host adapter LUID. The function
- * provides the necessary conversion.
+ * The convention is that VMBus instance id is a GUID, but the host sets
+ * the lower part of the value to the host adapter LUID. The cast reads
+ * the first sizeof(winluid) bytes of the GUID as a winluid value.
  */
 static inline void guid_to_luid(guid_t *guid, struct winluid *luid)
 {
@@ -1029,9 +1028,7 @@ void dxgk_validate_ioctls(void);
 #else
 
 #define DXG_TRACE(...)
-#define DXG_ERR(fmt, ...) do {					\
-	dev_err(DXGDEV, "%s: " fmt, __func__, ##__VA_ARGS__);	\
-} while (0)
+#define DXG_ERR(fmt, ...)	dev_err(DXGDEV, "%s: " fmt, __func__, ##__VA_ARGS__)
 
 #endif /* DEBUG */
 
diff --git a/drivers/hv/dxgkrnl/dxgmodule.c b/drivers/hv/dxgkrnl/dxgmodule.c
index c2a4a2a2136f..435dc60511b8 100644
--- a/drivers/hv/dxgkrnl/dxgmodule.c
+++ b/drivers/hv/dxgkrnl/dxgmodule.c
@@ -158,7 +158,7 @@ static void dxg_signal_dma_fence(struct dxghostevent *eventhdr)
 {
 	struct dxgsyncpoint *event = (struct dxgsyncpoint *)eventhdr;
 
-	DXG_TRACE("syncpoint: %px, fence: %lld", event, event->fence_value);
+	DXG_TRACE("syncpoint: %p, fence: %lld", event, event->fence_value);
 	event->fence_value++;
 	list_del(&eventhdr->host_event_list_entry);
 	dma_fence_signal(&event->base);
@@ -788,7 +788,7 @@ static int dxg_probe_vmbus(struct hv_device *hdev,
 		if (dxgglobal->hdev) {
 			/* This device should appear only once */
 			DXG_ERR("global channel already exists");
-			ret = -EBADE;
+			ret = -ENODEV;
 			goto error;
 		}
 		dxgglobal->hdev = hdev;
@@ -969,4 +969,3 @@ module_exit(dxg_drv_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Microsoft Dxgkrnl virtual compute device Driver");
-MODULE_VERSION("2.0.3");
diff --git a/drivers/hv/dxgkrnl/dxgvmbus.c b/drivers/hv/dxgkrnl/dxgvmbus.c
index abb6d2af89ac..4b1ccaac440c 100644
--- a/drivers/hv/dxgkrnl/dxgvmbus.c
+++ b/drivers/hv/dxgkrnl/dxgvmbus.c
@@ -246,9 +246,7 @@ int dxgvmbuschannel_init(struct dxgvmbuschannel *ch, struct hv_device *hdev)
 		goto cleanup;
 	}
 
-#if KERNEL_VERSION(5, 15, 0) <= LINUX_VERSION_CODE
 	hdev->channel->max_pkt_size = DXG_MAX_VM_BUS_PACKET_SIZE;
-#endif
 	ret = vmbus_open(hdev->channel, RING_BUFSIZE, RING_BUFSIZE,
 			 NULL, 0, dxgvmbuschannel_receive, ch);
 	if (ret) {
@@ -1482,9 +1480,8 @@ int create_existing_sysmem(struct dxgdevice *device,
 				   dxgalloc->pages);
 	if (ret1 != npages) {
 		DXG_ERR("get_user_pages_fast failed: %d", ret1);
-		if (ret1 > 0 && ret1 < npages) {
+		if (ret1 > 0 && ret1 < npages)
 			unpin_user_pages(dxgalloc->pages, ret1);
-		}
 		vfree(dxgalloc->pages);
 		dxgalloc->pages = NULL;
 		ret = -ENOMEM;
diff --git a/drivers/hv/dxgkrnl/dxgvmbus.h b/drivers/hv/dxgkrnl/dxgvmbus.h
index a7e625b2f896..22246826d2f1 100644
--- a/drivers/hv/dxgkrnl/dxgvmbus.h
+++ b/drivers/hv/dxgkrnl/dxgvmbus.h
@@ -313,12 +313,12 @@ struct dxgkvmb_command_queryadapterinfo {
 	struct dxgkvmb_command_vgpu_to_host hdr;
 	enum kmtqueryadapterinfotype	query_type;
 	u32				private_data_size;
-	u8				private_data[1];
+	u8				private_data[];
 };
 
 struct dxgkvmb_command_queryadapterinfo_return {
 	struct ntstatus			status;
-	u8				private_data[1];
+	u8				private_data[];
 };
 
 /* Returns ntstatus */
@@ -391,7 +391,7 @@ struct dxgkvmb_command_makeresident {
 	struct d3dkmthandle		paging_queue;
 	struct d3dddi_makeresident_flags flags;
 	u32				alloc_count;
-	struct d3dkmthandle		allocations[1];
+	struct d3dkmthandle		allocations[];
 };
 
 struct dxgkvmb_command_makeresident_return {
@@ -405,7 +405,7 @@ struct dxgkvmb_command_evict {
 	struct d3dkmthandle		device;
 	struct d3dddi_evict_flags	flags;
 	u32				alloc_count;
-	struct d3dkmthandle		allocations[1];
+	struct d3dkmthandle		allocations[];
 };
 
 struct dxgkvmb_command_evict_return {
@@ -476,7 +476,7 @@ struct dxgkvmb_command_updategpuvirtualaddress {
 	struct d3dkmthandle		fence_object;
 	u32				num_operations;
 	u32				flags;
-	struct d3dddi_updategpuvirtualaddress_operation operations[1];
+	struct d3dddi_updategpuvirtualaddress_operation operations[];
 };
 
 struct dxgkvmb_command_queryclockcalibration {
@@ -627,7 +627,7 @@ struct dxgkvmb_command_destroyallocation {
 	struct d3dkmthandle		resource;
 	u32				alloc_count;
 	struct d3dddicb_destroyallocation2flags flags;
-	struct d3dkmthandle		allocations[1];
+	struct d3dkmthandle		allocations[];
 };
 
 struct dxgkvmb_command_createcontextvirtual {
@@ -639,7 +639,7 @@ struct dxgkvmb_command_createcontextvirtual {
 	struct d3dddi_createcontextflags flags;
 	enum d3dkmt_clienthint		client_hint;
 	u32				priv_drv_data_size;
-	u8				priv_drv_data[1];
+	u8				priv_drv_data[];
 };
 
 /* The command returns ntstatus */
@@ -768,7 +768,7 @@ struct dxgkvmb_command_offerallocations {
 	enum d3dkmt_offer_priority	priority;
 	struct d3dkmt_offer_flags	flags;
 	bool				resources;
-	struct d3dkmthandle		allocations[1];
+	struct d3dkmthandle		allocations[];
 };
 
 struct dxgkvmb_command_reclaimallocations {
@@ -778,13 +778,13 @@ struct dxgkvmb_command_reclaimallocations {
 	u32				allocation_count;
 	bool				resources;
 	bool				write_results;
-	struct d3dkmthandle		allocations[1];
+	struct d3dkmthandle		allocations[];
 };
 
 struct dxgkvmb_command_reclaimallocations_return {
 	u64				paging_fence_value;
 	struct ntstatus			status;
-	enum d3dddi_reclaim_result	discarded[1];
+	enum d3dddi_reclaim_result	discarded[];
 };
 
 /* Returns ntstatus */
@@ -804,7 +804,7 @@ struct dxgkvmb_command_createhwqueue {
 	struct d3dkmthandle		context;
 	struct d3dddi_createhwqueueflags flags;
 	u32				priv_drv_data_size;
-	char				priv_drv_data[1];
+	char				priv_drv_data[];
 };
 
 /* The command returns ntstatus */
@@ -833,7 +833,7 @@ struct dxgkvmb_command_escape {
 	struct d3dddi_escapeflags	flags;
 	u32				priv_drv_data_size;
 	struct d3dkmthandle		context;
-	u8				priv_drv_data[1];
+	u8				priv_drv_data[];
 };
 
 struct dxgkvmb_command_queryvideomemoryinfo {
@@ -879,7 +879,7 @@ struct dxgk_feature_desc {
 	struct {
 		u16 supported		: 1;
 		u16 virtualization_mode : 3;
-		u16 global 		: 1;
+		u16 global		: 1;
 		u16 driver_feature	: 1;
 		u16 internal		: 1;
 		u16 reserved		: 9;
diff --git a/drivers/hv/dxgkrnl/hmgr.c b/drivers/hv/dxgkrnl/hmgr.c
index 059f94307a0e..95879f59133e 100644
--- a/drivers/hv/dxgkrnl/hmgr.c
+++ b/drivers/hv/dxgkrnl/hmgr.c
@@ -467,9 +467,8 @@ void hmgrtable_free_handle(struct hmgrtable *table, enum hmgrentry_type t,
 			entry->next_free_index = i;
 		}
 		table->free_handle_list_tail = i;
-		if (table->free_handle_list_head == HMGRTABLE_INVALID_INDEX) {
+		if (table->free_handle_list_head == HMGRTABLE_INVALID_INDEX)
 			table->free_handle_list_head = i;
-		}
 	} else {
 		DXG_ERR("Invalid handle to free: %d %x", i, h.v);
 	}
diff --git a/include/uapi/misc/d3dkmthk.h b/include/uapi/misc/d3dkmthk.h
index db40e8ff40b0..a58b2513dfd3 100644
--- a/include/uapi/misc/d3dkmthk.h
+++ b/include/uapi/misc/d3dkmthk.h
@@ -1612,7 +1612,7 @@ struct d3dkmt_opensyncobjectfromsyncfile {
 };
 
 struct d3dkmt_enumprocesses {
-	struct winluid 		adapter_luid;
+	struct winluid		adapter_luid;
 #ifdef __KERNEL__
 	__u32			*buffer;
 #else

^ permalink raw reply related


This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.