From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Xu <peterx@redhat.com>,
	David Hildenbrand <david@kernel.org>,
	Lorenzo Stoakes <ljs@kernel.org>, Mike Rapoport <rppt@kernel.org>,
	Suren Baghdasaryan <surenb@google.com>,
	Vlastimil Babka <vbabka@kernel.org>,
	"Liam R . Howlett" <Liam.Howlett@oracle.com>,
	Zi Yan <ziy@nvidia.com>, Jonathan Corbet <corbet@lwn.net>,
	Shuah Khan <skhan@linuxfoundation.org>,
	Sean Christopherson <seanjc@google.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org, linux-kselftest@vger.kernel.org,
	kvm@vger.kernel.org, "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Subject: [RFC, PATCH 04/12] userfaultfd: UFFDIO_CONTINUE for anonymous memory
Date: Tue, 14 Apr 2026 15:23:38 +0100
Message-ID: <20260414142354.1465950-5-kas@kernel.org>
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>

Allow UFFDIO_CONTINUE on anonymous VMAs with VM_UFFD_MINOR. For shmem,
CONTINUE installs a PTE from the page cache. For anonymous memory, the
page is already mapped via a protnone PTE, so CONTINUE restores the
original VMA permissions.
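
For context, a minimal sketch of the userspace registration side
(assuming the UFFD_FEATURE_MINOR_ANON bit added earlier in this series;
the helper name and the omitted error handling are illustrative only):

  #include <fcntl.h>
  #include <stddef.h>
  #include <sys/ioctl.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <linux/userfaultfd.h>

  /* Register an anonymous range for minor faults; returns the uffd or -1. */
  static int register_minor_anon(void *addr, size_t len)
  {
          int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
          struct uffdio_api api = {
                  .api = UFFD_API,
                  /* Feature bit introduced earlier in this series. */
                  .features = UFFD_FEATURE_MINOR_ANON,
          };
          struct uffdio_register reg = {
                  .range = { .start = (unsigned long)addr, .len = len },
                  .mode  = UFFDIO_REGISTER_MODE_MINOR,
          };

          if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) ||
              ioctl(uffd, UFFDIO_REGISTER, &reg))
                  return -1;
          return uffd;
  }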

PTE level: mfill_atomic_pte_continue_anon() walks to the PTE, verifies
it is protnone, and restores the VMA permissions. Rename the shmem path
to mfill_atomic_pte_continue_shmem() for clarity.

PMD/THP level: mfill_atomic_pmd_continue_anon() restores the protnone
PMD's permissions in place, without splitting. PMD races are handled by
returning -EAGAIN and retrying in the mfill_atomic() loop.

Add protnone PTE/PMD checks to userfaultfd_must_wait() so that
synchronous minor faults block until userspace resolves them.
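
For the synchronous case, a handler thread would observe the minor-fault
event and resolve it roughly like this (a sketch against the existing
UFFDIO_CONTINUE UAPI; the helper name and single-page resolution are
illustrative):

  #include <sys/ioctl.h>
  #include <unistd.h>
  #include <linux/userfaultfd.h>

  /* Read one fault event and resolve an anonymous minor fault in place. */
  static int resolve_minor_fault(int uffd, unsigned long page_size)
  {
          struct uffd_msg msg;
          struct uffdio_continue cont = { 0 };

          if (read(uffd, &msg, sizeof(msg)) != (ssize_t)sizeof(msg))
                  return -1;
          if (msg.event != UFFD_EVENT_PAGEFAULT ||
              !(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR))
                  return -1;

          /* Restores the original permissions on the protnone PTE/PMD. */
          cont.range.start = msg.arg.pagefault.address & ~(page_size - 1);
          cont.range.len   = page_size;
          return ioctl(uffd, UFFDIO_CONTINUE, &cont);
  }

UFFDIO_CONTINUE_MODE_WP can additionally be set in .mode to leave the
mapping write-protected, which is what the MFILL_ATOMIC_WP checks in
this patch handle.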

Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
 fs/userfaultfd.c |  9 +++++-
 mm/userfaultfd.c | 82 ++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 84 insertions(+), 7 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index b317c9854b86..43064238fd8d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -340,8 +340,11 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
 	if (!pmd_present(_pmd))
 		return false;
 
-	if (pmd_trans_huge(_pmd))
+	if (pmd_trans_huge(_pmd)) {
+		if (pmd_protnone(_pmd) && (reason & VM_UFFD_MINOR))
+			return true;
 		return !pmd_write(_pmd) && (reason & VM_UFFD_WP);
+	}
 
 	pte = pte_offset_map(pmd, address);
 	if (!pte)
@@ -366,6 +369,9 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
 	 */
 	if (!pte_write(ptent) && (reason & VM_UFFD_WP))
 		goto out;
+	/* PTE is still protnone (deactivated), wait for userspace to resolve. */
+	if (pte_protnone(ptent) && (reason & VM_UFFD_MINOR))
+		goto out;
 
 	ret = false;
 out:
@@ -1820,6 +1826,7 @@ static int userfaultfd_deactivate(struct userfaultfd_ctx *ctx,
 	return ret;
 }
 
+
 static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 {
 	__s64 ret;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 3373b11b9d83..4c52fa5d1608 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -380,8 +380,61 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
 	return ret;
 }
 
-/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
-static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
+static int mfill_atomic_pte_continue_anon(pmd_t *dst_pmd,
+					  struct vm_area_struct *dst_vma,
+					  unsigned long dst_addr,
+					  uffd_flags_t flags)
+{
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	int ret = -EFAULT;
+
+	ptep = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
+	if (!ptep)
+		return ret;
+
+	pte = ptep_get(ptep);
+	if (!pte_protnone(pte))
+		goto out_unlock;
+
+	pte = pte_modify(pte, dst_vma->vm_page_prot);
+	pte = pte_mkyoung(pte);
+	if (flags & MFILL_ATOMIC_WP)
+		pte = pte_wrprotect(pte);
+	set_pte_at(dst_vma->vm_mm, dst_addr, ptep, pte);
+	update_mmu_cache(dst_vma, dst_addr, ptep);
+	ret = 0;
+out_unlock:
+	pte_unmap_unlock(ptep, ptl);
+	return ret;
+}
+
+static int mfill_atomic_pmd_continue_anon(struct mm_struct *mm,
+					  struct vm_area_struct *vma,
+					  unsigned long addr,
+					  pmd_t *pmd, pmd_t orig_pmd,
+					  uffd_flags_t flags)
+{
+	spinlock_t *ptl;
+	pmd_t entry;
+
+	ptl = pmd_lock(mm, pmd);
+	if (unlikely(!pmd_same(pmdp_get(pmd), orig_pmd))) {
+		spin_unlock(ptl);
+		return -EAGAIN;
+	}
+
+	entry = pmd_modify(orig_pmd, vma->vm_page_prot);
+	entry = pmd_mkyoung(entry);
+	if (flags & MFILL_ATOMIC_WP)
+		entry = pmd_wrprotect(entry);
+	set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, entry);
+	update_mmu_cache_pmd(vma, addr, pmd);
+	spin_unlock(ptl);
+	return 0;
+}
+
+static int mfill_atomic_pte_continue_shmem(pmd_t *dst_pmd,
 				     struct vm_area_struct *dst_vma,
 				     unsigned long dst_addr,
 				     uffd_flags_t flags)
@@ -667,7 +720,10 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
 	ssize_t err;
 
 	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
-		return mfill_atomic_pte_continue(dst_pmd, dst_vma,
+		if (vma_is_anonymous(dst_vma))
+			return mfill_atomic_pte_continue_anon(dst_pmd, dst_vma,
+							      dst_addr, flags);
+		return mfill_atomic_pte_continue_shmem(dst_pmd, dst_vma,
 						 dst_addr, flags);
 	} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
 		return mfill_atomic_pte_poison(dst_pmd, dst_vma,
@@ -802,11 +858,25 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 			break;
 		}
 		/*
-		 * If the dst_pmd is THP don't override it and just be strict.
-		 * (This includes the case where the PMD used to be THP and
-		 * changed back to none after __pte_alloc().)
+		 * THP PMD: for anon CONTINUE, restore protnone PMD
+		 * permissions in place. For other operations, reject.
 		 */
 		if (unlikely(pmd_trans_huge(dst_pmdval))) {
+			if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
+			    vma_is_anonymous(dst_vma) &&
+			    pmd_protnone(dst_pmdval)) {
+				err = mfill_atomic_pmd_continue_anon(
+					dst_mm, dst_vma, dst_addr,
+					dst_pmd, dst_pmdval, flags);
+				if (err == -EAGAIN)
+					continue; /* PMD changed, re-read it */
+				if (err)
+					break;
+				dst_addr += HPAGE_PMD_SIZE;
+				src_addr += HPAGE_PMD_SIZE;
+				copied += HPAGE_PMD_SIZE;
+				continue;
+			}
 			err = -EEXIST;
 			break;
 		}
-- 
2.51.2


Thread overview: 52+ messages
2026-04-14 14:23 [RFC, PATCH 00/12] userfaultfd: working set tracking for VM guest memory Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 01/12] userfaultfd: define UAPI constants for anonymous minor faults Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 02/12] userfaultfd: add UFFD_FEATURE_MINOR_ANON registration support Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 03/12] userfaultfd: implement UFFDIO_DEACTIVATE ioctl Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` Kiryl Shutsemau (Meta) [this message]
2026-04-14 14:23 ` [RFC, PATCH 05/12] mm: intercept protnone faults on VM_UFFD_MINOR anonymous VMAs Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 06/12] userfaultfd: auto-resolve shmem and hugetlbfs minor faults in async mode Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 07/12] sched/numa: skip scanning anonymous VM_UFFD_MINOR VMAs Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 08/12] userfaultfd: enable UFFD_FEATURE_MINOR_ANON Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 09/12] mm/pagemap: add PAGE_IS_UFFD_DEACTIVATED to PAGEMAP_SCAN Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 10/12] userfaultfd: add UFFDIO_SET_MODE for runtime sync/async toggle Kiryl Shutsemau (Meta)
2026-04-15 15:08   ` Usama Arif
2026-04-16 13:27     ` Kiryl Shutsemau
2026-04-14 14:23 ` [RFC, PATCH 11/12] selftests/mm: add userfaultfd anonymous minor fault tests Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 12/12] Documentation/userfaultfd: document working set tracking Kiryl Shutsemau (Meta)
2026-04-14 15:28 ` [RFC, PATCH 00/12] userfaultfd: working set tracking for VM guest memory Peter Xu
2026-04-14 17:08   ` Kiryl Shutsemau
2026-04-14 17:45     ` Peter Xu
2026-04-14 15:37 ` David Hildenbrand (Arm)
2026-04-14 17:10   ` Kiryl Shutsemau
2026-04-16 13:49     ` Kiryl Shutsemau
2026-04-16 18:32       ` David Hildenbrand (Arm)
2026-04-16 20:25         ` Kiryl Shutsemau
2026-04-17 11:02           ` Kiryl Shutsemau
2026-04-17 11:43           ` David Hildenbrand (Arm)
2026-04-17 12:26             ` Kiryl Shutsemau
2026-04-19 14:33               ` Kiryl Shutsemau
2026-04-21 13:03                 ` David Hildenbrand (Arm)
2026-04-21 14:33                   ` Kiryl Shutsemau
2026-04-22  9:27                     ` Kiryl Shutsemau
2026-04-22 18:27                       ` David Hildenbrand (Arm)
2026-04-22 18:39                     ` David Hildenbrand (Arm)
2026-04-23 14:27                       ` Kiryl Shutsemau
2026-04-23 14:50                         ` Peter Xu
2026-04-23 18:08                           ` Kiryl Shutsemau
2026-04-23 18:57                             ` Peter Xu
2026-04-23 19:25                               ` David Hildenbrand (Arm)
2026-04-23 20:10                                 ` Peter Xu
2026-04-24 11:37                                   ` Kiryl Shutsemau
2026-04-24 12:59                                     ` Peter Xu
2026-04-25  5:56                                   ` David Hildenbrand (Arm)
2026-04-24  0:26                               ` SeongJae Park
2026-04-24 11:55                                 ` Peter Xu
2026-04-24 23:59                                   ` SeongJae Park
2026-04-24 10:34                               ` Kiryl Shutsemau
2026-04-24 11:51                                 ` Peter Xu
2026-04-24 13:49                                   ` Kiryl Shutsemau
2026-04-24 15:55                                     ` Peter Xu
2026-04-24 16:09                                       ` Peter Xu
2026-04-27 10:52                                       ` Kiryl Shutsemau
2026-04-25  6:05                                     ` David Hildenbrand (Arm)
2026-04-27 10:23                                       ` Kiryl Shutsemau
