From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
To: akpm@linux-foundation.org, rppt@kernel.org, peterx@redhat.com,
david@kernel.org
Cc: ljs@kernel.org, surenb@google.com, vbabka@kernel.org,
Liam.Howlett@oracle.com, ziy@nvidia.com, corbet@lwn.net,
skhan@linuxfoundation.org, seanjc@google.com,
pbonzini@redhat.com, jthoughton@google.com, aarcange@redhat.com,
sj@kernel.org, usama.arif@linux.dev, linux-mm@kvack.org,
linux-kernel@vger.kernel.org, linux-doc@vger.kernel.org,
linux-kselftest@vger.kernel.org, kvm@vger.kernel.org,
kernel-team@meta.com, "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Subject: [PATCH v4 09/14] mm/userfaultfd: add RWP fault delivery and expose UFFDIO_REGISTER_MODE_RWP
Date: Mon, 25 May 2026 12:37:23 +0100 [thread overview]
Message-ID: <20260525113737.1942478-10-kas@kernel.org> (raw)
In-Reply-To: <20260525113737.1942478-1-kas@kernel.org>
Wire the fault side of read-write protection tracking and turn the
userspace interface on.
An RWP-protected PTE is mapped with PAGE_NONE and has the uffd bit set.
The PAGE_NONE protection triggers a fault on any access; the uffd bit
distinguishes the entry from plain mprotect(PROT_NONE) or NUMA hinting.
Fault dispatch, per level:
  PTE      handle_pte_fault()   -> do_uffd_rwp()
  PMD      __handle_mm_fault()  -> do_huge_pmd_uffd_rwp()
  hugetlb  hugetlb_fault()      -> hugetlb_handle_userfault()
The RWP branches gate on userfaultfd_pte_rwp() / userfaultfd_huge_pmd_rwp()
(VM_UFFD_RWP plus the uffd bit) and fall through to do_numa_page() /
do_huge_pmd_numa_page() otherwise. Each delivers a
UFFD_PAGEFAULT_FLAG_RWP message through handle_userfault(); the handler
resolves it with UFFDIO_RWPROTECT clearing MODE_RWP.
userfaultfd_must_wait() and userfaultfd_huge_must_wait() add matching
protnone+uffd checks so sync-mode fault handlers keep blocking until the
RWP protection is resolved.
Expose the UAPI:
  UFFDIO_REGISTER_MODE_RWP -> UFFD_API_REGISTER_MODES
  UFFD_FEATURE_RWP         -> UFFD_API_FEATURES
  _UFFDIO_RWPROTECT        -> UFFD_API_RANGE_IOCTLS,
                              UFFD_API_RANGE_IOCTLS_BASIC
UFFD_FEATURE_RWP is masked out at UFFDIO_API time when PROT_NONE is
not available, when VM_UFFD_RWP aliases VM_NONE (32-bit), or when
pgtable_supports_uffd() reports that the uffd PTE bit is not usable at
runtime, so userspace never sees an advertised-but-broken feature.
Works on anonymous, shmem, and hugetlb memory.
Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
include/linux/huge_mm.h | 7 +++++++
include/linux/userfaultfd_k.h | 24 ++++++++++++++++++++++++
include/uapi/linux/userfaultfd.h | 12 ++++++++----
mm/huge_memory.c | 5 +++++
mm/hugetlb.c | 11 +++++++++++
mm/memory.c | 21 +++++++++++++++++++--
mm/userfaultfd.c | 32 ++++++++++++++++++++++++++++++--
7 files changed, 104 insertions(+), 8 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index edece3e26985..fe48d76957fb 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -529,6 +529,8 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
+vm_fault_t do_huge_pmd_uffd_rwp(struct vm_fault *vmf);
+
vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf);
extern struct folio *huge_zero_folio;
@@ -716,6 +718,11 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
return NULL;
}
+static inline vm_fault_t do_huge_pmd_uffd_rwp(struct vm_fault *vmf)
+{
+ return 0;
+}
+
static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
return 0;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index f78d5d370d0a..d8f5f400c8ef 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -233,6 +233,18 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
return userfaultfd_wp(vma) && pmd_uffd(pmd);
}
+static inline bool userfaultfd_pte_rwp(struct vm_area_struct *vma,
+ pte_t pte)
+{
+ return userfaultfd_rwp(vma) && pte_uffd(pte);
+}
+
+static inline bool userfaultfd_huge_pmd_rwp(struct vm_area_struct *vma,
+ pmd_t pmd)
+{
+ return userfaultfd_rwp(vma) && pmd_uffd(pmd);
+}
+
static inline bool userfaultfd_armed(struct vm_area_struct *vma)
{
return vma->vm_flags & __VM_UFFD_FLAGS;
@@ -363,6 +375,18 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
return false;
}
+static inline bool userfaultfd_pte_rwp(struct vm_area_struct *vma,
+ pte_t pte)
+{
+ return false;
+}
+
+static inline bool userfaultfd_huge_pmd_rwp(struct vm_area_struct *vma,
+ pmd_t pmd)
+{
+ return false;
+}
+
static inline bool userfaultfd_armed(struct vm_area_struct *vma)
{
return false;
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 7b78aa3b5318..d803e76d47ad 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -25,7 +25,8 @@
#define UFFD_API ((__u64)0xAA)
#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING | \
UFFDIO_REGISTER_MODE_WP | \
- UFFDIO_REGISTER_MODE_MINOR)
+ UFFDIO_REGISTER_MODE_MINOR | \
+ UFFDIO_REGISTER_MODE_RWP)
#define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
UFFD_FEATURE_EVENT_FORK | \
UFFD_FEATURE_EVENT_REMAP | \
@@ -42,7 +43,8 @@
UFFD_FEATURE_WP_UNPOPULATED | \
UFFD_FEATURE_POISON | \
UFFD_FEATURE_WP_ASYNC | \
- UFFD_FEATURE_MOVE)
+ UFFD_FEATURE_MOVE | \
+ UFFD_FEATURE_RWP)
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
@@ -54,13 +56,15 @@
(__u64)1 << _UFFDIO_MOVE | \
(__u64)1 << _UFFDIO_WRITEPROTECT | \
(__u64)1 << _UFFDIO_CONTINUE | \
- (__u64)1 << _UFFDIO_POISON)
+ (__u64)1 << _UFFDIO_POISON | \
+ (__u64)1 << _UFFDIO_RWPROTECT)
#define UFFD_API_RANGE_IOCTLS_BASIC \
((__u64)1 << _UFFDIO_WAKE | \
(__u64)1 << _UFFDIO_COPY | \
(__u64)1 << _UFFDIO_WRITEPROTECT | \
(__u64)1 << _UFFDIO_CONTINUE | \
- (__u64)1 << _UFFDIO_POISON)
+ (__u64)1 << _UFFDIO_POISON | \
+ (__u64)1 << _UFFDIO_RWPROTECT)
/*
* Valid ioctl command number range with this API is from 0x00 to
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8620ba92263f..cd32bd51e311 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2289,6 +2289,11 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
return pmd_dirty(pmd);
}
+vm_fault_t do_huge_pmd_uffd_rwp(struct vm_fault *vmf)
+{
+ return handle_userfault(vmf, VM_UFFD_RWP);
+}
+
/* NUMA hinting page fault entry point for trans huge pmds */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8555810cd42e..f63718296cc2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6062,6 +6062,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_mutex;
}
+ /*
+ * Protnone hugetlb PTEs with the uffd bit are used by
+ * userfaultfd RWP for access tracking. Plain PROT_NONE (without the
+ * marker) is not an RWP fault and is not expected on hugetlb (no
+ * NUMA hinting), so let normal hugetlb fault handling proceed.
+ */
+ if (pte_protnone(vmf.orig_pte) && vma_is_accessible(vma) &&
+ userfaultfd_rwp(vma) && huge_pte_uffd(vmf.orig_pte)) {
+ return hugetlb_handle_userfault(&vmf, mapping, VM_UFFD_RWP);
+ }
+
/*
* If we are going to COW/unshare the mapping later, we examine the
* pending reservations for this page now. This will ensure that any
diff --git a/mm/memory.c b/mm/memory.c
index e4ae5350db41..3e393881031d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6135,6 +6135,12 @@ static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_stru
}
}
+static vm_fault_t do_uffd_rwp(struct vm_fault *vmf)
+{
+ pte_unmap(vmf->pte);
+ return handle_userfault(vmf, VM_UFFD_RWP);
+}
+
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -6410,8 +6416,16 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
if (!pte_present(vmf->orig_pte))
return do_swap_page(vmf);
- if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
+ if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) {
+ /*
+ * RWP-protected PTEs are protnone plus the uffd bit. On a
+ * VM_UFFD_RWP VMA, a protnone PTE without the uffd bit is
+ * NUMA hinting and must still fall through to do_numa_page().
+ */
+ if (userfaultfd_pte_rwp(vmf->vma, vmf->orig_pte))
+ return do_uffd_rwp(vmf);
return do_numa_page(vmf);
+ }
spin_lock(vmf->ptl);
entry = vmf->orig_pte;
@@ -6525,8 +6539,11 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
return 0;
}
if (pmd_trans_huge(vmf.orig_pmd)) {
- if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+ if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) {
+ if (userfaultfd_huge_pmd_rwp(vma, vmf.orig_pmd))
+ return do_huge_pmd_uffd_rwp(&vmf);
return do_huge_pmd_numa_page(&vmf);
+ }
if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
!pmd_write(vmf.orig_pmd)) {
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 78eb63702649..b966df47800c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -2650,6 +2650,12 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
*/
if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
return true;
+ /*
+ * PTE is still RW-protected (protnone with uffd bit), wait for
+ * resolution. Plain PROT_NONE without the marker is not an RWP fault.
+ */
+ if (pte_protnone(pte) && huge_pte_uffd(pte) && (reason & VM_UFFD_RWP))
+ return true;
return false;
}
@@ -2710,8 +2716,14 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
if (!pmd_present(_pmd))
return false;
- if (pmd_trans_huge(_pmd))
- return !pmd_write(_pmd) && (reason & VM_UFFD_WP);
+ if (pmd_trans_huge(_pmd)) {
+ if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
+ return true;
+ if (pmd_protnone(_pmd) && pmd_uffd(_pmd) &&
+ (reason & VM_UFFD_RWP))
+ return true;
+ return false;
+ }
pte = pte_offset_map(pmd, address);
if (!pte)
@@ -2736,6 +2748,13 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
*/
if (!pte_write(ptent) && (reason & VM_UFFD_WP))
goto out;
+ /*
+ * PTE is still RW-protected (protnone with uffd bit), wait for
+ * userspace to resolve. Plain PROT_NONE without the marker is not
+ * an RWP fault.
+ */
+ if (pte_protnone(ptent) && pte_uffd(ptent) && (reason & VM_UFFD_RWP))
+ goto out;
ret = false;
out:
@@ -4477,6 +4496,15 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
}
+ /*
+ * RWP needs both PROT_NONE support and the uffd PTE bit. The
+ * VM_UFFD_RWP check covers compile-time unavailability; the
+ * pgtable_supports_uffd() check covers runtime (e.g. riscv
+ * without the SVRSW60T59B extension) where the PTE bit is declared
+ * but not actually usable.
+ */
+ if (VM_UFFD_RWP == VM_NONE || !pgtable_supports_uffd())
+ uffdio_api.features &= ~UFFD_FEATURE_RWP;
ret = -EINVAL;
if (features & ~uffdio_api.features)
--
2.54.0
next prev parent reply other threads:[~2026-05-25 11:39 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-25 11:37 [PATCH v4 00/14] userfaultfd: working set tracking for VM guest memory Kiryl Shutsemau (Meta)
2026-05-25 11:37 ` [PATCH v4 01/14] mm: decouple protnone helpers from CONFIG_NUMA_BALANCING Kiryl Shutsemau (Meta)
[not found] ` <20260525120640.328441F000E9@smtp.kernel.org>
2026-05-25 14:41 ` Kiryl Shutsemau
2026-05-25 11:37 ` [PATCH v4 02/14] mm: rename uffd-wp PTE bit macros to uffd Kiryl Shutsemau (Meta)
2026-05-25 11:37 ` [PATCH v4 03/14] mm: rename uffd-wp PTE accessors " Kiryl Shutsemau (Meta)
[not found] ` <20260525120513.C51E91F00A3A@smtp.kernel.org>
2026-05-25 14:43 ` Kiryl Shutsemau
2026-05-25 11:37 ` [PATCH v4 04/14] mm: add VM_UFFD_RWP VMA flag Kiryl Shutsemau (Meta)
[not found] ` <20260525121916.831DA1F000E9@smtp.kernel.org>
2026-05-25 14:59 ` Kiryl Shutsemau
2026-05-25 11:37 ` [PATCH v4 05/14] mm: add MM_CP_UFFD_RWP change_protection() flag Kiryl Shutsemau (Meta)
[not found] ` <20260525121319.3B03D1F000E9@smtp.kernel.org>
2026-05-25 15:03 ` Kiryl Shutsemau
2026-05-25 11:37 ` [PATCH v4 06/14] mm: preserve RWP marker across PTE rewrites Kiryl Shutsemau (Meta)
[not found] ` <20260525120819.C18561F000E9@smtp.kernel.org>
2026-05-25 15:07 ` Kiryl Shutsemau
2026-05-25 11:37 ` [PATCH v4 07/14] mm: handle VM_UFFD_RWP in khugepaged, rmap, and GUP Kiryl Shutsemau (Meta)
[not found] ` <20260525131923.11A1B1F000E9@smtp.kernel.org>
2026-05-25 15:18 ` Kiryl Shutsemau
2026-05-25 11:37 ` [PATCH v4 08/14] userfaultfd: add UFFDIO_REGISTER_MODE_RWP and UFFDIO_RWPROTECT plumbing Kiryl Shutsemau (Meta)
[not found] ` <20260525121111.E857E1F000E9@smtp.kernel.org>
2026-05-25 15:19 ` Kiryl Shutsemau
2026-05-25 11:37 ` Kiryl Shutsemau (Meta) [this message]
[not found] ` <20260525121858.57D0B1F000E9@smtp.kernel.org>
2026-05-25 15:27 ` [PATCH v4 09/14] mm/userfaultfd: add RWP fault delivery and expose UFFDIO_REGISTER_MODE_RWP Kiryl Shutsemau
2026-05-25 11:37 ` [PATCH v4 10/14] mm/pagemap: add PAGE_IS_ACCESSED for RWP tracking Kiryl Shutsemau (Meta)
[not found] ` <20260525122659.BB52A1F000E9@smtp.kernel.org>
2026-05-25 15:29 ` Kiryl Shutsemau
2026-05-25 11:37 ` [PATCH v4 11/14] userfaultfd: add UFFD_FEATURE_RWP_ASYNC for async fault resolution Kiryl Shutsemau (Meta)
[not found] ` <20260525123622.DEF511F000E9@smtp.kernel.org>
2026-05-25 15:35 ` Kiryl Shutsemau
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260525113737.1942478-10-kas@kernel.org \
--to=kas@kernel.org \
--cc=Liam.Howlett@oracle.com \
--cc=aarcange@redhat.com \
--cc=akpm@linux-foundation.org \
--cc=corbet@lwn.net \
--cc=david@kernel.org \
--cc=jthoughton@google.com \
--cc=kernel-team@meta.com \
--cc=kvm@vger.kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=ljs@kernel.org \
--cc=pbonzini@redhat.com \
--cc=peterx@redhat.com \
--cc=rppt@kernel.org \
--cc=seanjc@google.com \
--cc=sj@kernel.org \
--cc=skhan@linuxfoundation.org \
--cc=surenb@google.com \
--cc=usama.arif@linux.dev \
--cc=vbabka@kernel.org \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox