[PATCH 3/3] mm: read remote memory without the mmap lock where possible

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: Rik van Riel <riel@surriel.com>,
	x86@kernel.org, linux-mm@kvack.org,
	"Thomas Gleixner" <tglx@kernel.org>,
	"Ingo Molnar" <mingo@redhat.com>,
	"Dmitry Ilvokhin" <d@ilvokhin.com>,
	"Borislav Petkov" <bp@alien8.de>,
	"Dave Hansen" <dave.hansen@linux.intel.com>,
	"Andrew Morton" <akpm@linux-foundation.org>,
	"David Hildenbrand" <david@kernel.org>,
	"Lorenzo Stoakes" <ljs@kernel.org>,
	"Liam R. Howlett" <liam@infradead.org>,
	"Vlastimil Babka" <vbabka@kernel.org>,
	"Suren Baghdasaryan" <surenb@google.com>
Subject: [PATCH 3/3] mm: read remote memory without the mmap lock where possible
Date: Tue, 16 Jun 2026 15:03:00 -0400	[thread overview]
Message-ID: <20260616190300.1509639-4-riel@surriel.com> (raw)
In-Reply-To: <20260616190300.1509639-1-riel@surriel.com>

__access_remote_vm() takes mmap_read_lock() for the entire transfer and
uses get_user_pages_remote(), which faults pages in.  For the common
case of reading memory that is already resident -- /proc/PID/cmdline,
/proc/PID/environ, ptrace PEEK of resident pages -- the mmap lock is
unnecessary and is badly contended on large machines.

Add an opportunistic, read-only fast path that transfers what it can
without the mmap lock.  For each address it takes the per-VMA lock with
lock_vma_under_rcu(), re-checks the read-side VMA permissions, and uses
folio_walk_start(..., FW_VMA_LOCKED) to grab a short-lived reference to
a present page before copying it out.  Anything non-trivial -- a not-
present page (needs faulting), a hugetlb or VM_IO/VM_PFNMAP mapping, or
a race with a VMA writer -- falls back to the existing mmap_lock path
for the remainder.

untagged_addr_remote() asserts the mmap lock, so add an unlocked variant
for the fast path; the untag mask is a stable per-mm value.

Only reads are handled here; writes keep using the slow path.

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Rik van Riel <riel@surriel.com>
---
 arch/x86/include/asm/uaccess_64.h |  12 +++
 include/linux/uaccess.h           |  11 ++
 mm/memory.c                       | 166 +++++++++++++++++++++++++++++-
 3 files changed, 188 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 4a52497ba6a1..c6fac900a747 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -51,6 +51,18 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
 	(__force __typeof__(addr))__untagged_addr_remote(mm, __addr);	\
 })
 
+/* Same as __untagged_addr_remote(), but usable without the mmap lock held. */
+static inline unsigned long __untagged_addr_remote_unlocked(struct mm_struct *mm,
+							    unsigned long addr)
+{
+	return addr & READ_ONCE((mm)->context.untag_mask);
+}
+
+#define untagged_addr_remote_unlocked(mm, addr)	({			\
+	unsigned long __addr = (__force unsigned long)(addr);		\
+	(__force __typeof__(addr))__untagged_addr_remote_unlocked(mm, __addr); \
+})
+
 #endif
 
 #define valid_user_address(x) \
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 8a264662b242..c8c83372c9d8 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -34,6 +34,17 @@
 })
 #endif
 
+/*
+ * Like untagged_addr_remote(), but for callers that stabilize @mm by other
+ * means (e.g. a per-VMA lock) and must not assert the mmap lock.
+ */
+#ifndef untagged_addr_remote_unlocked
+#define untagged_addr_remote_unlocked(mm, addr)	({	\
+	(void)(mm);					\
+	untagged_addr(addr);				\
+})
+#endif
+
 #ifdef masked_user_access_begin
  #define can_do_masked_user_access() 1
 # ifndef masked_user_write_access_begin
diff --git a/mm/memory.c b/mm/memory.c
index 86a973119bd4..0b23b82eaa18 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -42,6 +42,8 @@
 #include <linux/kernel_stat.h>
 #include <linux/mm.h>
 #include <linux/mm_inline.h>
+#include <linux/secretmem.h>
+#include <linux/pagewalk.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/sched/task.h>
@@ -7062,6 +7064,153 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 EXPORT_SYMBOL_GPL(generic_access_phys);
 #endif
 
+/*
+ * The fast path uses folio_walk_start(FW_VMA_LOCKED), which needs the per-VMA
+ * lock and RCU-freed page tables to walk page tables without the mmap lock.
+ */
+#if defined(CONFIG_PER_VMA_LOCK) && defined(CONFIG_MMU_GATHER_RCU_TABLE_FREE)
+/*
+ * Opportunistic lockless fast path for __access_remote_vm() reads.
+ *
+ * Memory already resident in @mm can be read without taking the heavily
+ * contended mmap_lock: a per-VMA lock stabilizes the VMA, and folio_walk_start()
+ * with FW_VMA_LOCKED grabs a short-lived reference to a present page via an
+ * RCU/PTL protected page table walk (relying on MMU_GATHER_RCU_TABLE_FREE).
+ *
+ * Anything that would require faulting a page in, touching a hugetlb or
+ * VM_IO/VM_PFNMAP mapping, or that races a VMA writer is left to the mmap_lock
+ * path in __access_remote_vm().  Only reads are handled here.
+ *
+ * Returns the number of bytes transferred via the fast path.
+ */
+static int access_remote_vm_fast(struct mm_struct *mm, unsigned long addr,
+				 void *buf, int len, unsigned int gup_flags)
+{
+	void *old_buf = buf;
+
+	addr = untagged_addr_remote_unlocked(mm, addr);
+
+	while (len) {
+		struct vm_area_struct *vma;
+		vm_flags_t vm_flags;
+
+		vma = lock_vma_under_rcu(mm, addr);
+		if (!vma)
+			break;
+
+		/*
+		 * Mirror the read-side permission checks of check_vma_flags(),
+		 * and exclude what FW_VMA_LOCKED cannot handle (hugetlb) or what
+		 * needs the ->access() handler (VM_IO/VM_PFNMAP).  Checked once
+		 * per VMA; anything not positively allowed falls back to the
+		 * slow path, which re-validates everything.
+		 */
+		vm_flags = vma->vm_flags;
+		if ((vm_flags & (VM_IO | VM_PFNMAP)) ||
+		    is_vm_hugetlb_page(vma) || vma_is_secretmem(vma) ||
+		    (!(vm_flags & VM_READ) &&
+		     (!(gup_flags & FOLL_FORCE) || !(vm_flags & VM_MAYREAD)))) {
+			vma_end_read(vma);
+			break;
+		}
+
+		/*
+		 * Copy as much of this VMA as we can without re-acquiring the
+		 * per-VMA lock; re-lock only when @addr leaves the VMA.
+		 */
+		while (len && addr < vma->vm_end) {
+			struct folio_walk fw;
+			struct folio *folio;
+			struct page *page;
+			unsigned long entry_size, entry_left, folio_left, span;
+			unsigned long copied, idx0;
+			int offset;
+
+			folio = folio_walk_start(&fw, vma, addr, FW_VMA_LOCKED);
+			if (!folio) {
+				vma_end_read(vma);
+				goto out;
+			}
+			page = fw.page;
+			if (!page) {
+				folio_walk_end(&fw, vma);
+				vma_end_read(vma);
+				goto out;
+			}
+			/* Pin the folio so it stays valid after the PTL is dropped. */
+			folio_get(folio);
+			folio_walk_end(&fw, vma);
+
+			/*
+			 * folio_walk_start() validated exactly one mapping entry,
+			 * which covers a contiguous, present run of this folio:
+			 * PAGE_SIZE for a pte, PMD_SIZE for a pmd leaf, PUD_SIZE
+			 * for a pud leaf.  Copy up to the end of that entry,
+			 * bounded by the folio, the VMA and len, so a huge mapping
+			 * is handled in one walk instead of per page.
+			 */
+			offset = offset_in_page(addr);
+			switch (fw.level) {
+			case FW_LEVEL_PUD:
+				entry_size = PUD_SIZE;
+				break;
+			case FW_LEVEL_PMD:
+				entry_size = PMD_SIZE;
+				break;
+			default:
+				entry_size = PAGE_SIZE;
+				break;
+			}
+			entry_left = entry_size - (addr & (entry_size - 1));
+			idx0 = folio_page_idx(folio, page);
+			folio_left = ((folio_nr_pages(folio) - idx0) << PAGE_SHIFT) -
+				     offset;
+			span = min3((unsigned long)len, entry_left, folio_left);
+			span = min(span, vma->vm_end - addr);
+
+			/*
+			 * Copy the span page-by-page: kmap_local_folio() maps one
+			 * page on HIGHMEM and copy_from_user_page() flushes per
+			 * page on aliasing caches, but the page tables are not
+			 * re-walked.  The span borrows the single folio reference
+			 * taken above, so each mapping is dropped with
+			 * kunmap_local() (not folio_release_kmap(), which would
+			 * also drop a folio reference per page).
+			 */
+			for (copied = 0; copied < span; ) {
+				unsigned long foff = offset + copied;
+				unsigned long pidx = idx0 + (foff >> PAGE_SHIFT);
+				int poff = foff & ~PAGE_MASK;
+				int chunk = min_t(unsigned long, span - copied,
+						  PAGE_SIZE - poff);
+				void *maddr = kmap_local_folio(folio,
+						pidx << PAGE_SHIFT);
+
+				copy_from_user_page(vma, folio_page(folio, pidx),
+						    addr + copied, buf + copied,
+						    maddr + poff, chunk);
+				kunmap_local(maddr);
+				copied += chunk;
+			}
+
+			folio_put(folio);
+			len -= span;
+			buf += span;
+			addr += span;
+		}
+		vma_end_read(vma);
+	}
+out:
+	return buf - old_buf;
+}
+#else
+static int access_remote_vm_fast(struct mm_struct *mm, unsigned long addr,
+				 void *buf, int len, unsigned int gup_flags)
+{
+	return 0;
+}
+#endif /* CONFIG_PER_VMA_LOCK && CONFIG_MMU_GATHER_RCU_TABLE_FREE */
+
 /*
  * Access another process' address space as given in mm.
  */
@@ -7071,8 +7220,23 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
 	void *old_buf = buf;
 	int write = gup_flags & FOLL_WRITE;
 
+	/*
+	 * Try the lockless fast path for reads first; it transfers what it can
+	 * from resident memory without taking mmap_lock, and leaves the
+	 * remainder (if any) to the slow path below.
+	 */
+	if (!write) {
+		int done = access_remote_vm_fast(mm, addr, buf, len, gup_flags);
+
+		addr += done;
+		buf += done;
+		len -= done;
+		if (!len)
+			return buf - old_buf;
+	}
+
 	if (mmap_read_lock_killable(mm))
-		return 0;
+		return buf - old_buf;
 
 	/* Untag the address before looking up the VMA */
 	addr = untagged_addr_remote(mm, addr);
-- 
2.53.0-Meta

next prev parent reply	other threads:[~2026-06-16 19:03 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-16 19:02 [PATCH 0/3] mm: __access_remote_vm with per-VMA lock Rik van Riel
2026-06-16 19:02 ` [PATCH 1/3] x86/mm: use READ_ONCE/WRITE_ONCE for mm->context.untag_mask Rik van Riel
2026-06-18 16:40   ` Usama Arif
2026-06-16 19:02 ` [PATCH 2/3] mm/pagewalk: let folio_walk_start() run under the per-VMA lock Rik van Riel
2026-06-16 19:03 ` Rik van Riel [this message]
2026-06-17  6:19   ` [PATCH 3/3] mm: read remote memory without the mmap lock where possible Suren Baghdasaryan
2026-06-18 17:01   ` Usama Arif
2026-06-18 17:07     ` David Hildenbrand (Arm)
2026-06-18 17:22       ` Usama Arif
2026-06-17  1:10 ` [PATCH 0/3] mm: __access_remote_vm with per-VMA lock Suren Baghdasaryan
2026-06-17  9:42   ` David Hildenbrand (Arm)
2026-06-17 13:33     ` Suren Baghdasaryan
2026-06-18 20:37       ` David Hildenbrand (Arm)

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:4a52497ba6a dfblob:c6fac900a74 dfblob:8a264662b24
dfblob:c8c83372c9d dfblob:86a973119bd dfblob:0b23b82eaa1 )
 OR (
bs:"[PATCH 3/3] mm: read remote memory without the mmap lock where possible" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260616190300.1509639-4-riel@surriel.com \
    --to=riel@surriel.com \
    --cc=akpm@linux-foundation.org \
    --cc=bp@alien8.de \
    --cc=d@ilvokhin.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@kernel.org \
    --cc=liam@infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ljs@kernel.org \
    --cc=mingo@redhat.com \
    --cc=surenb@google.com \
    --cc=tglx@kernel.org \
    --cc=vbabka@kernel.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.