[PATCH 3/3] mm/uffd: Detect pgtable allocation failures

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: Peter Xu <peterx@redhat.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: Mike Kravetz <mike.kravetz@oracle.com>,
	Muchun Song <songmuchun@bytedance.com>,
	peterx@redhat.com, Nadav Amit <nadav.amit@gmail.com>,
	Andrea Arcangeli <aarcange@redhat.com>,
	David Hildenbrand <david@redhat.com>,
	James Houghton <jthoughton@google.com>,
	Axel Rasmussen <axelrasmussen@google.com>,
	Andrew Morton <akpm@linux-foundation.org>
Subject: [PATCH 3/3] mm/uffd: Detect pgtable allocation failures
Date: Wed,  4 Jan 2023 17:52:07 -0500	[thread overview]
Message-ID: <20230104225207.1066932-4-peterx@redhat.com> (raw)
In-Reply-To: <20230104225207.1066932-1-peterx@redhat.com>

Before this patch, when there's any pgtable allocation issues happened
during change_protection(), the error will be ignored from the syscall.
For shmem, there will be an error dumped into the host dmesg.  Two issues
with that:

  (1) Doing a trace dump when allocation fails is not anything close to
      grace..

  (2) The user should be notified with any kind of such error, so the user
      can trap it and decide what to do next, either by retrying, or stop
      the process properly, or anything else.

For userfault users, this will change the API of UFFDIO_WRITEPROTECT when
pgtable allocation failure happened.  It should not normally break anyone,
though.  If it breaks, then in good ways.

One man-page update will be on the way to introduce the new -ENOMEM for
UFFDIO_WRITEPROTECT.  Not marking stable so we keep the old behavior on the
5.19-till-now kernels.

Reported-by: James Houghton <jthoughton@google.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/linux/userfaultfd_k.h |  2 +-
 mm/hugetlb.c                  |  6 ++-
 mm/mempolicy.c                |  2 +-
 mm/mprotect.c                 | 69 +++++++++++++++++++++++------------
 mm/userfaultfd.c              | 16 +++++---
 5 files changed, 62 insertions(+), 33 deletions(-)

diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 9df0b9a762cc..3767f18114ef 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -73,7 +73,7 @@ extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
 extern int mwriteprotect_range(struct mm_struct *dst_mm,
 			       unsigned long start, unsigned long len,
 			       bool enable_wp, atomic_t *mmap_changing);
-extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma,
+extern long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma,
 			  unsigned long start, unsigned long len, bool enable_wp);
 
 /* mm helpers */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 84bc665c7c86..d82d97e03eae 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6658,8 +6658,10 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 			 * pre-allocations to install pte markers.
 			 */
 			ptep = huge_pte_alloc(mm, vma, address, psize);
-			if (!ptep)
+			if (!ptep) {
+				pages = -ENOMEM;
 				break;
+			}
 		}
 		ptl = huge_pte_lock(h, mm, ptep);
 		if (huge_pmd_unshare(mm, vma, address, ptep)) {
@@ -6749,7 +6751,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 	hugetlb_vma_unlock_write(vma);
 	mmu_notifier_invalidate_range_end(&range);
 
-	return pages << h->order;
+	return pages > 0 ? (pages << h->order) : pages;
 }
 
 /* Return true if reservation was successful, false otherwise.  */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a86b8f15e2f0..85a34f1f3ab8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -636,7 +636,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
 	tlb_gather_mmu(&tlb, vma->vm_mm);
 
 	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
-	if (nr_updated)
+	if (nr_updated > 0)
 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 
 	tlb_finish_mmu(&tlb);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 0af22ab59ea8..ade0d5f85a36 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -330,28 +330,34 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags)
 /*
  * If wr-protecting the range for file-backed, populate pgtable for the case
  * when pgtable is empty but page cache exists.  When {pte|pmd|...}_alloc()
- * failed it means no memory, we don't have a better option but stop.
+ * failed we treat it the same way as pgtable allocation failures during
+ * page faults by kicking OOM and returning error.
  */
 #define  change_pmd_prepare(vma, pmd, cp_flags)				\
-	do {								\
+	({								\
+		long err = 0;						\
 		if (unlikely(uffd_wp_protect_file(vma, cp_flags))) {	\
-			if (WARN_ON_ONCE(pte_alloc(vma->vm_mm, pmd)))	\
-				break;					\
+			if (pte_alloc(vma->vm_mm, pmd))			\
+				err = -ENOMEM;				\
 		}							\
-	} while (0)
+		err;							\
+	})
+
 /*
  * This is the general pud/p4d/pgd version of change_pmd_prepare(). We need to
  * have separate change_pmd_prepare() because pte_alloc() returns 0 on success,
  * while {pmd|pud|p4d}_alloc() returns the valid pointer on success.
  */
 #define  change_prepare(vma, high, low, addr, cp_flags)			\
-	do {								\
-		if (unlikely(uffd_wp_protect_file(vma, cp_flags))) {	\
-			low##_t *p = low##_alloc(vma->vm_mm, high, addr); \
-			if (WARN_ON_ONCE(p == NULL))			\
-				break;					\
-		}							\
-	} while (0)
+	  ({								\
+		  long err = 0;						\
+		  if (unlikely(uffd_wp_protect_file(vma, cp_flags))) {	\
+			  low##_t *p = low##_alloc(vma->vm_mm, high, addr); \
+			  if (p == NULL)				\
+				  err = -ENOMEM;			\
+		  }							\
+		  err;							\
+	  })
 
 static inline long change_pmd_range(struct mmu_gather *tlb,
 		struct vm_area_struct *vma, pud_t *pud, unsigned long addr,
@@ -367,11 +373,15 @@ static inline long change_pmd_range(struct mmu_gather *tlb,
 
 	pmd = pmd_offset(pud, addr);
 	do {
-		long this_pages;
+		long ret;
 
 		next = pmd_addr_end(addr, end);
 
-		change_pmd_prepare(vma, pmd, cp_flags);
+		ret = change_pmd_prepare(vma, pmd, cp_flags);
+		if (ret) {
+			pages = ret;
+			break;
+		}
 		/*
 		 * Automatic NUMA balancing walks the tables with mmap_lock
 		 * held for read. It's possible a parallel update to occur
@@ -401,7 +411,11 @@ static inline long change_pmd_range(struct mmu_gather *tlb,
 				 * cleared; make sure pmd populated if
 				 * necessary, then fall-through to pte level.
 				 */
-				change_pmd_prepare(vma, pmd, cp_flags);
+				ret = change_pmd_prepare(vma, pmd, cp_flags);
+				if (ret) {
+					pages = ret;
+					break;
+				}
 			} else {
 				/*
 				 * change_huge_pmd() does not defer TLB flushes,
@@ -422,9 +436,8 @@ static inline long change_pmd_range(struct mmu_gather *tlb,
 			}
 			/* fall through, the trans huge pmd just split */
 		}
-		this_pages = change_pte_range(tlb, vma, pmd, addr, next,
-					      newprot, cp_flags);
-		pages += this_pages;
+		pages += change_pte_range(tlb, vma, pmd, addr, next,
+					  newprot, cp_flags);
 next:
 		cond_resched();
 	} while (pmd++, addr = next, addr != end);
@@ -443,12 +456,14 @@ static inline long change_pud_range(struct mmu_gather *tlb,
 {
 	pud_t *pud;
 	unsigned long next;
-	long pages = 0;
+	long pages = 0, ret;
 
 	pud = pud_offset(p4d, addr);
 	do {
 		next = pud_addr_end(addr, end);
-		change_prepare(vma, pud, pmd, addr, cp_flags);
+		ret = change_prepare(vma, pud, pmd, addr, cp_flags);
+		if (ret)
+			return ret;
 		if (pud_none_or_clear_bad(pud))
 			continue;
 		pages += change_pmd_range(tlb, vma, pud, addr, next, newprot,
@@ -464,12 +479,14 @@ static inline long change_p4d_range(struct mmu_gather *tlb,
 {
 	p4d_t *p4d;
 	unsigned long next;
-	long pages = 0;
+	long pages = 0, ret;
 
 	p4d = p4d_offset(pgd, addr);
 	do {
 		next = p4d_addr_end(addr, end);
-		change_prepare(vma, p4d, pud, addr, cp_flags);
+		ret = change_prepare(vma, p4d, pud, addr, cp_flags);
+		if (ret)
+			return ret;
 		if (p4d_none_or_clear_bad(p4d))
 			continue;
 		pages += change_pud_range(tlb, vma, p4d, addr, next, newprot,
@@ -486,14 +503,18 @@ static long change_protection_range(struct mmu_gather *tlb,
 	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
 	unsigned long next;
-	long pages = 0;
+	long pages = 0, ret;
 
 	BUG_ON(addr >= end);
 	pgd = pgd_offset(mm, addr);
 	tlb_start_vma(tlb, vma);
 	do {
 		next = pgd_addr_end(addr, end);
-		change_prepare(vma, pgd, p4d, addr, cp_flags);
+		ret = change_prepare(vma, pgd, p4d, addr, cp_flags);
+		if (ret) {
+			pages = ret;
+			break;
+		}
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
 		pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 65ad172add27..53c3d916ff66 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -710,11 +710,12 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
 			      mmap_changing, 0);
 }
 
-void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
+long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
 		   unsigned long start, unsigned long len, bool enable_wp)
 {
 	unsigned int mm_cp_flags;
 	struct mmu_gather tlb;
+	long ret;
 
 	if (enable_wp)
 		mm_cp_flags = MM_CP_UFFD_WP;
@@ -730,8 +731,10 @@ void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
 	if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
 		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
 	tlb_gather_mmu(&tlb, dst_mm);
-	change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
+	ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
 	tlb_finish_mmu(&tlb);
+
+	return ret;
 }
 
 int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
@@ -740,7 +743,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
 {
 	struct vm_area_struct *dst_vma;
 	unsigned long page_mask;
-	int err;
+	long err;
 
 	/*
 	 * Sanitize the command parameters:
@@ -779,9 +782,12 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
 			goto out_unlock;
 	}
 
-	uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp);
+	err = uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp);
+
+	/* Return 0 on success, <0 on failures */
+	if (err > 0)
+		err = 0;
 
-	err = 0;
 out_unlock:
 	mmap_read_unlock(dst_mm);
 	return err;
-- 
2.37.3

next prev parent reply	other threads:[~2023-01-04 22:52 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-01-04 22:52 [PATCH 0/3] mm/uffd: Fix missing markers on hugetlb Peter Xu
2023-01-04 22:52 ` [PATCH 1/3] mm/hugetlb: Pre-allocate pgtable pages for uffd wr-protects Peter Xu
2023-01-05  1:50   ` James Houghton
2023-01-05  8:39   ` David Hildenbrand
2023-01-05 18:37   ` Mike Kravetz
2023-01-04 22:52 ` [PATCH 2/3] mm/mprotect: Use long for page accountings and retval Peter Xu
2023-01-05  1:51   ` James Houghton
2023-01-05  8:44   ` David Hildenbrand
2023-01-05 19:22     ` Peter Xu
2023-01-09  8:04       ` David Hildenbrand
2023-01-05 18:48   ` Mike Kravetz
2023-01-04 22:52 ` Peter Xu [this message]
2023-01-05  1:52   ` [PATCH 3/3] mm/uffd: Detect pgtable allocation failures James Houghton
2023-01-05  3:10   ` Nadav Amit
2023-01-05  8:59     ` David Hildenbrand
2023-01-05 18:01       ` Nadav Amit
2023-01-05 19:51         ` Peter Xu
2023-01-18 21:51           ` Nadav Amit
2023-01-09  8:36         ` David Hildenbrand
2023-01-05  8:47   ` David Hildenbrand
2023-01-05  8:16 ` [PATCH 0/3] mm/uffd: Fix missing markers on hugetlb David Hildenbrand

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:9df0b9a762c dfblob:3767f18114e dfblob:84bc665c7c8
dfblob:d82d97e03ea dfblob:a86b8f15e2f dfblob:85a34f1f3ab
dfblob:0af22ab59ea dfblob:ade0d5f85a3 dfblob:65ad172add2
dfblob:53c3d916ff6 )
 OR (
bs:"[PATCH 3/3] mm/uffd: Detect pgtable allocation failures" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230104225207.1066932-4-peterx@redhat.com \
    --to=peterx@redhat.com \
    --cc=aarcange@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=axelrasmussen@google.com \
    --cc=david@redhat.com \
    --cc=jthoughton@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mike.kravetz@oracle.com \
    --cc=nadav.amit@gmail.com \
    --cc=songmuchun@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).