From: Peter Xu <peterx@redhat.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: James Houghton <jthoughton@google.com>,
Jann Horn <jannh@google.com>,
peterx@redhat.com, Andrew Morton <akpm@linux-foundation.org>,
Andrea Arcangeli <aarcange@redhat.com>,
Rik van Riel <riel@surriel.com>,
Nadav Amit <nadav.amit@gmail.com>,
Miaohe Lin <linmiaohe@huawei.com>,
Muchun Song <songmuchun@bytedance.com>,
Mike Kravetz <mike.kravetz@oracle.com>,
David Hildenbrand <david@redhat.com>
Subject: [PATCH 04/10] mm/hugetlb: Move swap entry handling into vma lock when faulted
Date: Tue, 29 Nov 2022 14:35:20 -0500
Message-ID: <20221129193526.3588187-5-peterx@redhat.com>
In-Reply-To: <20221129193526.3588187-1-peterx@redhat.com>
In hugetlb_fault(), there used to be a special path at the entrance that
handled swap entries using huge_pte_offset(). That is unsafe, because for
a pmd-sharable range huge_pte_offset() can walk into freed pgtables when
no lock protects the pgtable from being freed after a pmd unshare.
The simplest way to make this safe is to move the swap entry handling to
after the vma lock is held. We may now need to take the fault mutex for
migration or hwpoison entries too (and the vma lock as well, but that one
is required anyway); neither of them is a hot path, though.
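Roughly, the order of checks in hugetlb_fault() becomes the following
(a condensed sketch only, with error handling and the rest of the fault
path omitted; see the hugetlb.c hunk below for the real code):

    mutex_lock(&hugetlb_fault_mutex_table[hash]);
    hugetlb_vma_lock_read(vma);
    ...
    entry = huge_ptep_get(ptep);
    if (!pte_present(entry)) {
            if (is_hugetlb_entry_migration(entry)) {
                    /* drop the fault mutex, keep the vma lock for the wait */
                    mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                    migration_entry_wait_huge(vma, ptep);
                    return 0;
            }
            /* hwpoison: set the error and fall through to out_mutex */
    }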
Note that the vma lock cannot be released in hugetlb_fault() when the
migration entry is detected, because migration_entry_wait_huge() will use
the pgtable page again (it takes the pgtable lock), so that access also
needs to be protected by the vma lock. Modify migration_entry_wait_huge()
so that it must be called with the vma read lock held, and release that
lock properly in __migration_entry_wait_huge().
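From the caller's point of view the locking becomes asymmetric, roughly
like below (sketch only, not the literal code):

    hugetlb_vma_lock_read(vma);
    ...
    /* takes over the vma read lock and releases it internally */
    migration_entry_wait_huge(vma, ptep);
    /* no hugetlb_vma_unlock_read() here */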
Signed-off-by: Peter Xu <peterx@redhat.com>
---
include/linux/swapops.h | 6 ++++--
mm/hugetlb.c | 32 +++++++++++++++-----------------
mm/migrate.c | 25 +++++++++++++++++++++----
3 files changed, 40 insertions(+), 23 deletions(-)
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 27ade4f22abb..09b22b169a71 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -335,7 +335,8 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address);
#ifdef CONFIG_HUGETLB_PAGE
-extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl);
+extern void __migration_entry_wait_huge(struct vm_area_struct *vma,
+ pte_t *ptep, spinlock_t *ptl);
extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte);
#endif /* CONFIG_HUGETLB_PAGE */
#else /* CONFIG_MIGRATION */
@@ -364,7 +365,8 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address) { }
#ifdef CONFIG_HUGETLB_PAGE
-static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { }
+static inline void __migration_entry_wait_huge(struct vm_area_struct *vma,
+ pte_t *ptep, spinlock_t *ptl) { }
static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { }
#endif /* CONFIG_HUGETLB_PAGE */
static inline int is_writable_migration_entry(swp_entry_t entry)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dfe677fadaf8..776e34ccf029 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5826,22 +5826,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int need_wait_lock = 0;
unsigned long haddr = address & huge_page_mask(h);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
- if (ptep) {
- /*
- * Since we hold no locks, ptep could be stale. That is
- * OK as we are only making decisions based on content and
- * not actually modifying content here.
- */
- entry = huge_ptep_get(ptep);
- if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait_huge(vma, ptep);
- return 0;
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(hstate_index(h));
- }
-
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
@@ -5888,8 +5872,22 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
* properly handle it.
*/
- if (!pte_present(entry))
+ if (!pte_present(entry)) {
+ if (unlikely(is_hugetlb_entry_migration(entry))) {
+ /*
+ * Release fault lock first because the vma lock is
+ * needed to guard the huge_pte_lockptr() later in
+ * migration_entry_wait_huge(). The vma lock will
+ * be released there.
+ */
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ migration_entry_wait_huge(vma, ptep);
+ return 0;
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ ret = VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
+ }
/*
* If we are going to COW/unshare the mapping later, we examine the
diff --git a/mm/migrate.c b/mm/migrate.c
index 267ad0d073ae..c13c828d34f3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -326,24 +326,41 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
}
#ifdef CONFIG_HUGETLB_PAGE
-void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl)
+void __migration_entry_wait_huge(struct vm_area_struct *vma,
+ pte_t *ptep, spinlock_t *ptl)
{
pte_t pte;
+ /*
+ * The vma read lock must be taken, which will be released before
+ * the function returns.  It makes sure the pgtable page (along
+ * with its spin lock) won't be freed in parallel.
+ */
+ hugetlb_vma_assert_locked(vma);
+
spin_lock(ptl);
pte = huge_ptep_get(ptep);
- if (unlikely(!is_hugetlb_entry_migration(pte)))
+ if (unlikely(!is_hugetlb_entry_migration(pte))) {
spin_unlock(ptl);
- else
+ hugetlb_vma_unlock_read(vma);
+ } else {
+ /*
+ * If migration entry existed, safe to release vma lock
+ * here because the pgtable page won't be freed without the
+ * pgtable lock released. See comment right above pgtable
+ * lock release in migration_entry_wait_on_locked().
+ */
+ hugetlb_vma_unlock_read(vma);
migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
+ }
}
void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
{
spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
- __migration_entry_wait_huge(pte, ptl);
+ __migration_entry_wait_huge(vma, pte, ptl);
}
#endif
--
2.37.3