All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Xu <peterx@redhat.com>,
	David Hildenbrand <david@kernel.org>,
	Lorenzo Stoakes <ljs@kernel.org>, Mike Rapoport <rppt@kernel.org>,
	Suren Baghdasaryan <surenb@google.com>,
	Vlastimil Babka <vbabka@kernel.org>,
	"Liam R . Howlett" <Liam.Howlett@oracle.com>,
	Zi Yan <ziy@nvidia.com>, Jonathan Corbet <corbet@lwn.net>,
	Shuah Khan <skhan@linuxfoundation.org>,
	Sean Christopherson <seanjc@google.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org, linux-kselftest@vger.kernel.org,
	kvm@vger.kernel.org, "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Subject: [RFC, PATCH 03/12] userfaultfd: implement UFFDIO_DEACTIVATE ioctl
Date: Tue, 14 Apr 2026 15:23:37 +0100	[thread overview]
Message-ID: <20260414142354.1465950-4-kas@kernel.org> (raw)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>

UFFDIO_DEACTIVATE marks pages as deactivated within a VM_UFFD_MINOR
range:

- Anonymous memory: set PTEs (and huge PMDs) to protnone via
  change_protection() with MM_CP_UFFD_DEACTIVATE. Pages stay resident
  and keep their PFNs; only the access permissions are removed.
  MM_CP_UFFD_DEACTIVATE is handled independently of MM_CP_PROT_NUMA:
  it skips the folio_can_map_prot_numa() filter and is not subject to
  the CONFIG_NUMA_BALANCING guard.

- Shared shmem/hugetlbfs: zap PTEs via zap_page_range_single().
  Pages stay in page cache.

- Private hugetlb: rejected with -EINVAL. There is no page cache to
  fall back on, so zapping the PTEs would destroy the page content.

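Cleanup on unregister/close: for anonymous VM_UFFD_MINOR VMAs,
userfaultfd_clear_vma() restores protnone PTEs to normal permissions
via change_protection(MM_CP_TRY_CHANGE_WRITABLE), so that deactivated
pages do not stay permanently inaccessible.

mfill_atomic() is adjusted as well: MFILL_ATOMIC_CONTINUE is no longer
rejected for anonymous VMAs, and the !pmd_present() and
pmd_trans_huge() checks are split into two separate tests that both
still return -EEXIST.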

Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
 fs/userfaultfd.c              | 35 ++++++++++++++++
 include/linux/mm.h            |  2 +
 include/linux/userfaultfd_k.h |  2 +
 mm/huge_memory.c              |  9 ++--
 mm/mprotect.c                 |  9 +++-
 mm/userfaultfd.c              | 78 +++++++++++++++++++++++++++++++++--
 6 files changed, 127 insertions(+), 8 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 8d508ad19e89..b317c9854b86 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1441,6 +1441,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
 			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
 
+		/* DEACTIVATE is only supported for MINOR ranges. */
+		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
+			ioctls_out &= ~((__u64)1 << _UFFDIO_DEACTIVATE);
+
 		/*
 		 * Now that we scanned all vmas we can already tell
 		 * userland which ioctls methods are guaranteed to
@@ -1788,6 +1792,34 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 	return ret;
 }
 
+static int userfaultfd_deactivate(struct userfaultfd_ctx *ctx,
+				  unsigned long arg)
+{
+	int ret;
+	struct uffdio_range uffdio_range;
+
+	if (atomic_read(&ctx->mmap_changing))
+		return -EAGAIN;
+
+	if (copy_from_user(&uffdio_range, (void __user *)arg,
+			   sizeof(uffdio_range)))
+		return -EFAULT;
+
+	ret = validate_range(ctx->mm, uffdio_range.start, uffdio_range.len);
+	if (ret)
+		return ret;
+
+	if (mmget_not_zero(ctx->mm)) {
+		ret = mdeactivate_range(ctx, uffdio_range.start,
+					uffdio_range.len);
+		mmput(ctx->mm);
+	} else {
+		return -ESRCH;
+	}
+
+	return ret;
+}
+
 static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 {
 	__s64 ret;
@@ -2108,6 +2140,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
 	case UFFDIO_POISON:
 		ret = userfaultfd_poison(ctx, arg);
 		break;
+	case UFFDIO_DEACTIVATE:
+		ret = userfaultfd_deactivate(ctx, arg);
+		break;
 	}
 	return ret;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index abb4963c1f06..fc2841264d56 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3036,6 +3036,8 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 #define  MM_CP_UFFD_WP_RESOLVE             (1UL << 3) /* Resolve wp */
 #define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
 					    MM_CP_UFFD_WP_RESOLVE)
+/* Whether this change is for uffd deactivation */
+#define  MM_CP_UFFD_DEACTIVATE             (1UL << 4)
 
 bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index d1d4ed4a08b0..c94b5c5b5f24 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -130,6 +130,8 @@ extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
 			       unsigned long len, bool enable_wp);
 extern long uffd_wp_range(struct vm_area_struct *vma,
 			  unsigned long start, unsigned long len, bool enable_wp);
+extern int mdeactivate_range(struct userfaultfd_ctx *ctx, unsigned long start,
+			     unsigned long len);
 
 /* move_pages */
 void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b298cba853ab..2ad736ff007c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2563,6 +2563,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	pmd_t oldpmd, entry;
 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+	bool uffd_deactivate = cp_flags & MM_CP_UFFD_DEACTIVATE;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 	int ret = 1;
@@ -2582,8 +2583,11 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		goto unlock;
 	}
 
-	if (prot_numa) {
+	/* Already protnone — nothing to do for either NUMA or uffd */
+	if ((prot_numa || uffd_deactivate) && pmd_protnone(*pmd))
+		goto unlock;
 
+	if (prot_numa) {
 		/*
 		 * Avoid trapping faults against the zero page. The read-only
 		 * data is likely to be read-cached on the local CPU and
@@ -2592,9 +2596,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		if (is_huge_zero_pmd(*pmd))
 			goto unlock;
 
-		if (pmd_protnone(*pmd))
-			goto unlock;
-
 		if (!folio_can_map_prot_numa(pmd_folio(*pmd), vma,
 					     vma_is_single_threaded_private(vma)))
 			goto unlock;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c0571445bef7..7c612a680014 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -220,6 +220,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 	long pages = 0;
 	bool is_private_single_threaded;
 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+	bool uffd_deactivate = cp_flags & MM_CP_UFFD_DEACTIVATE;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 	int nr_ptes;
@@ -245,7 +246,8 @@ static long change_pte_range(struct mmu_gather *tlb,
 			pte_t ptent;
 
 			/* Already in the desired state. */
-			if (prot_numa && pte_protnone(oldpte))
+			if ((prot_numa || uffd_deactivate) &&
+			    pte_protnone(oldpte))
 				continue;
 
 			page = vm_normal_page(vma, addr, oldpte);
@@ -255,6 +257,8 @@ static long change_pte_range(struct mmu_gather *tlb,
 			/*
 			 * Avoid trapping faults against the zero or KSM
 			 * pages. See similar comment in change_huge_pmd.
+			 * Skip this filter for uffd deactivation which
+			 * must set protnone regardless of NUMA placement.
 			 */
 			if (prot_numa &&
 			    !folio_can_map_prot_numa(folio, vma,
@@ -651,6 +655,9 @@ long change_protection(struct mmu_gather *tlb,
 	WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA);
 #endif
 
+	if (cp_flags & MM_CP_UFFD_DEACTIVATE)
+		newprot = PAGE_NONE;
+
 	if (is_vm_hugetlb_page(vma))
 		pages = hugetlb_change_protection(vma, start, end, newprot,
 						  cp_flags);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index dba1ea26fdfe..3373b11b9d83 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -775,7 +775,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;
-	if (!vma_is_shmem(dst_vma) &&
+	if (!vma_is_shmem(dst_vma) && !vma_is_anonymous(dst_vma) &&
 	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
 		goto out_unlock;
 
@@ -797,13 +797,16 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 			break;
 		}
 		dst_pmdval = pmdp_get_lockless(dst_pmd);
+		if (unlikely(!pmd_present(dst_pmdval))) {
+			err = -EEXIST;
+			break;
+		}
 		/*
 		 * If the dst_pmd is THP don't override it and just be strict.
 		 * (This includes the case where the PMD used to be THP and
 		 * changed back to none after __pte_alloc().)
 		 */
-		if (unlikely(!pmd_present(dst_pmdval) ||
-				pmd_trans_huge(dst_pmdval))) {
+		if (unlikely(pmd_trans_huge(dst_pmdval))) {
 			err = -EEXIST;
 			break;
 		}
@@ -996,6 +999,65 @@ int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
 	return err;
 }
 
+int mdeactivate_range(struct userfaultfd_ctx *ctx, unsigned long start,
+		      unsigned long len)
+{
+	struct mm_struct *dst_mm = ctx->mm;
+	unsigned long end = start + len;
+	struct vm_area_struct *dst_vma;
+	long err;
+	VMA_ITERATOR(vmi, dst_mm, start);
+
+	VM_WARN_ON_ONCE(start & ~PAGE_MASK);
+	VM_WARN_ON_ONCE(len & ~PAGE_MASK);
+	VM_WARN_ON_ONCE(start + len <= start);
+
+	guard(mmap_read_lock)(dst_mm);
+	guard(rwsem_read)(&ctx->map_changing_lock);
+
+	if (atomic_read(&ctx->mmap_changing))
+		return -EAGAIN;
+
+	err = -ENOENT;
+	for_each_vma_range(vmi, dst_vma, end) {
+		unsigned long vma_start = max(dst_vma->vm_start, start);
+		unsigned long vma_end = min(dst_vma->vm_end, end);
+
+		if (!userfaultfd_minor(dst_vma)) {
+			err = -ENOENT;
+			break;
+		}
+
+		/*
+		 * Private hugetlb has no page cache to fall back on —
+		 * zapping PTEs would destroy page content.
+		 */
+		if (is_vm_hugetlb_page(dst_vma) &&
+		    !(dst_vma->vm_flags & VM_SHARED)) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (vma_is_anonymous(dst_vma)) {
+			/* Anonymous: set protnone, pages stay resident */
+			struct mmu_gather tlb;
+
+			tlb_gather_mmu(&tlb, dst_mm);
+			err = change_protection(&tlb, dst_vma, vma_start,
+						vma_end,
+						MM_CP_UFFD_DEACTIVATE);
+			tlb_finish_mmu(&tlb);
+			if (err < 0)
+				break;
+		} else {
+			/* Shared shmem/hugetlb: zap PTEs, pages stay in page cache */
+			zap_page_range_single(dst_vma, vma_start,
+					      vma_end - vma_start, NULL);
+		}
+		err = 0;
+	}
+	return err;
+}
 
 void double_pt_lock(spinlock_t *ptl1,
 		    spinlock_t *ptl2)
@@ -1988,6 +2050,16 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
 	if (userfaultfd_wp(vma))
 		uffd_wp_range(vma, start, end - start, false);
 
+	/* Restore protnone PTEs to normal permissions */
+	if (userfaultfd_minor(vma) && vma_is_anonymous(vma)) {
+		struct mmu_gather tlb;
+
+		tlb_gather_mmu(&tlb, vma->vm_mm);
+		change_protection(&tlb, vma, start, end,
+				  MM_CP_TRY_CHANGE_WRITABLE);
+		tlb_finish_mmu(&tlb);
+	}
+
 	ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
 				    vma->vm_flags & ~__VM_UFFD_FLAGS,
 				    NULL_VM_UFFD_CTX, give_up_on_oom);
-- 
2.51.2


  parent reply	other threads:[~2026-04-14 14:24 UTC|newest]

Thread overview: 52+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-14 14:23 [RFC, PATCH 00/12] userfaultfd: working set tracking for VM guest memory Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 01/12] userfaultfd: define UAPI constants for anonymous minor faults Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 02/12] userfaultfd: add UFFD_FEATURE_MINOR_ANON registration support Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` Kiryl Shutsemau (Meta) [this message]
2026-04-14 14:23 ` [RFC, PATCH 04/12] userfaultfd: UFFDIO_CONTINUE for anonymous memory Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 05/12] mm: intercept protnone faults on VM_UFFD_MINOR anonymous VMAs Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 06/12] userfaultfd: auto-resolve shmem and hugetlbfs minor faults in async mode Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 07/12] sched/numa: skip scanning anonymous VM_UFFD_MINOR VMAs Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 08/12] userfaultfd: enable UFFD_FEATURE_MINOR_ANON Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 09/12] mm/pagemap: add PAGE_IS_UFFD_DEACTIVATED to PAGEMAP_SCAN Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 10/12] userfaultfd: add UFFDIO_SET_MODE for runtime sync/async toggle Kiryl Shutsemau (Meta)
2026-04-15 15:08   ` Usama Arif
2026-04-16 13:27     ` Kiryl Shutsemau
2026-04-14 14:23 ` [RFC, PATCH 11/12] selftests/mm: add userfaultfd anonymous minor fault tests Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 12/12] Documentation/userfaultfd: document working set tracking Kiryl Shutsemau (Meta)
2026-04-14 15:28 ` [RFC, PATCH 00/12] userfaultfd: working set tracking for VM guest memory Peter Xu
2026-04-14 17:08   ` Kiryl Shutsemau
2026-04-14 17:45     ` Peter Xu
2026-04-14 15:37 ` David Hildenbrand (Arm)
2026-04-14 17:10   ` Kiryl Shutsemau
2026-04-16 13:49     ` Kiryl Shutsemau
2026-04-16 18:32       ` David Hildenbrand (Arm)
2026-04-16 20:25         ` Kiryl Shutsemau
2026-04-17 11:02           ` Kiryl Shutsemau
2026-04-17 11:43           ` David Hildenbrand (Arm)
2026-04-17 12:26             ` Kiryl Shutsemau
2026-04-19 14:33               ` Kiryl Shutsemau
2026-04-21 13:03                 ` David Hildenbrand (Arm)
2026-04-21 14:33                   ` Kiryl Shutsemau
2026-04-22  9:27                     ` Kiryl Shutsemau
2026-04-22 18:27                       ` David Hildenbrand (Arm)
2026-04-22 18:39                     ` David Hildenbrand (Arm)
2026-04-23 14:27                       ` Kiryl Shutsemau
2026-04-23 14:50                         ` Peter Xu
2026-04-23 18:08                           ` Kiryl Shutsemau
2026-04-23 18:57                             ` Peter Xu
2026-04-23 19:25                               ` David Hildenbrand (Arm)
2026-04-23 20:10                                 ` Peter Xu
2026-04-24 11:37                                   ` Kiryl Shutsemau
2026-04-24 12:59                                     ` Peter Xu
2026-04-25  5:56                                   ` David Hildenbrand (Arm)
2026-04-24  0:26                               ` SeongJae Park
2026-04-24 11:55                                 ` Peter Xu
2026-04-24 23:59                                   ` SeongJae Park
2026-04-24 10:34                               ` Kiryl Shutsemau
2026-04-24 11:51                                 ` Peter Xu
2026-04-24 13:49                                   ` Kiryl Shutsemau
2026-04-24 15:55                                     ` Peter Xu
2026-04-24 16:09                                       ` Peter Xu
2026-04-27 10:52                                       ` Kiryl Shutsemau
2026-04-25  6:05                                     ` David Hildenbrand (Arm)
2026-04-27 10:23                                       ` Kiryl Shutsemau

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260414142354.1465950-4-kas@kernel.org \
    --to=kas@kernel.org \
    --cc=Liam.Howlett@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=corbet@lwn.net \
    --cc=david@kernel.org \
    --cc=kvm@vger.kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ljs@kernel.org \
    --cc=pbonzini@redhat.com \
    --cc=peterx@redhat.com \
    --cc=rppt@kernel.org \
    --cc=seanjc@google.com \
    --cc=skhan@linuxfoundation.org \
    --cc=surenb@google.com \
    --cc=vbabka@kernel.org \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.