From: Peter Xu <peterx@redhat.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: Lorenzo Stoakes <lstoakes@gmail.com>,
John Hubbard <jhubbard@nvidia.com>,
Andrew Morton <akpm@linux-foundation.org>,
Mike Rapoport <rppt@kernel.org>,
David Hildenbrand <david@redhat.com>,
peterx@redhat.com, Yang Shi <shy828301@gmail.com>,
Andrea Arcangeli <aarcange@redhat.com>,
Vlastimil Babka <vbabka@suse.cz>,
"Kirill A . Shutemov" <kirill@shutemov.name>,
James Houghton <jthoughton@google.com>,
Matthew Wilcox <willy@infradead.org>,
Mike Kravetz <mike.kravetz@oracle.com>,
Hugh Dickins <hughd@google.com>, Jason Gunthorpe <jgg@nvidia.com>
Subject: [PATCH v3 6/8] mm/gup: Retire follow_hugetlb_page()
Date: Fri, 23 Jun 2023 10:29:34 -0400 [thread overview]
Message-ID: <20230623142936.268456-7-peterx@redhat.com> (raw)
In-Reply-To: <20230623142936.268456-1-peterx@redhat.com>
Now __get_user_pages() should be well prepared to handle thp completely,
as long as hugetlb gup requests even without the hugetlb's special path.
Time to retire follow_hugetlb_page().
Tweak misc comments to reflect reality of follow_hugetlb_page()'s removal.
Signed-off-by: Peter Xu <peterx@redhat.com>
---
fs/userfaultfd.c | 2 +-
include/linux/hugetlb.h | 12 ---
mm/gup.c | 19 ----
mm/hugetlb.c | 224 ----------------------------------------
4 files changed, 1 insertion(+), 256 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 7cecd49e078b..ae711f1d7a83 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -427,7 +427,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
*
* We also don't do userfault handling during
* coredumping. hugetlbfs has the special
- * follow_hugetlb_page() to skip missing pages in the
+ * hugetlb_follow_page_mask() to skip missing pages in the
* FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
* the no_page_table() helper in follow_page_mask(), but the
* shmem_vm_ops->fault method is invoked even during
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 2e2d89e79d6c..bb5024718fc1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -133,9 +133,6 @@ int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
unsigned int *page_mask);
-long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
- struct page **, unsigned long *, unsigned long *,
- long, unsigned int, int *);
void unmap_hugepage_range(struct vm_area_struct *,
unsigned long, unsigned long, struct page *,
zap_flags_t);
@@ -305,15 +302,6 @@ static inline struct page *hugetlb_follow_page_mask(
BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/
}
-static inline long follow_hugetlb_page(struct mm_struct *mm,
- struct vm_area_struct *vma, struct page **pages,
- unsigned long *position, unsigned long *nr_pages,
- long i, unsigned int flags, int *nonblocking)
-{
- BUG();
- return 0;
-}
-
static inline int copy_hugetlb_page_range(struct mm_struct *dst,
struct mm_struct *src,
struct vm_area_struct *dst_vma,
diff --git a/mm/gup.c b/mm/gup.c
index 22e32cff9ac7..ac61244e6fca 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -775,9 +775,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
* Call hugetlb_follow_page_mask for hugetlb vmas as it will use
* special hugetlb page table walking code. This eliminates the
* need to check for hugetlb entries in the general walking code.
- *
- * hugetlb_follow_page_mask is only for follow_page() handling here.
- * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
*/
if (is_vm_hugetlb_page(vma))
return hugetlb_follow_page_mask(vma, address, flags,
@@ -1138,22 +1135,6 @@ static long __get_user_pages(struct mm_struct *mm,
ret = check_vma_flags(vma, gup_flags);
if (ret)
goto out;
-
- if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages,
- &start, &nr_pages, i,
- gup_flags, locked);
- if (!*locked) {
- /*
- * We've got a VM_FAULT_RETRY
- * and we've lost mmap_lock.
- * We must stop here.
- */
- BUG_ON(gup_flags & FOLL_NOWAIT);
- goto out;
- }
- continue;
- }
}
retry:
/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b4973edef9f2..50a3579782a5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5721,7 +5721,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Return whether there is a pagecache page to back given address within VMA.
- * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
*/
static bool hugetlbfs_pagecache_present(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
@@ -6422,37 +6421,6 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
}
#endif /* CONFIG_USERFAULTFD */
-static void record_subpages(struct page *page, struct vm_area_struct *vma,
- int refs, struct page **pages)
-{
- int nr;
-
- for (nr = 0; nr < refs; nr++) {
- if (likely(pages))
- pages[nr] = nth_page(page, nr);
- }
-}
-
-static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma,
- unsigned int flags, pte_t *pte,
- bool *unshare)
-{
- pte_t pteval = huge_ptep_get(pte);
-
- *unshare = false;
- if (is_swap_pte(pteval))
- return true;
- if (huge_pte_write(pteval))
- return false;
- if (flags & FOLL_WRITE)
- return true;
- if (gup_must_unshare(vma, flags, pte_page(pteval))) {
- *unshare = true;
- return true;
- }
- return false;
-}
-
struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
unsigned int *page_mask)
@@ -6522,198 +6490,6 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
return page;
}
-long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, unsigned long *position,
- unsigned long *nr_pages, long i, unsigned int flags,
- int *locked)
-{
- unsigned long pfn_offset;
- unsigned long vaddr = *position;
- unsigned long remainder = *nr_pages;
- struct hstate *h = hstate_vma(vma);
- int err = -EFAULT, refs;
-
- while (vaddr < vma->vm_end && remainder) {
- pte_t *pte;
- spinlock_t *ptl = NULL;
- bool unshare = false;
- int absent;
- struct page *page;
-
- /*
- * If we have a pending SIGKILL, don't keep faulting pages and
- * potentially allocating memory.
- */
- if (fatal_signal_pending(current)) {
- remainder = 0;
- break;
- }
-
- hugetlb_vma_lock_read(vma);
- /*
- * Some archs (sparc64, sh*) have multiple pte_ts to
- * each hugepage. We have to make sure we get the
- * first, for the page indexing below to work.
- *
- * Note that page table lock is not held when pte is null.
- */
- pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
- huge_page_size(h));
- if (pte)
- ptl = huge_pte_lock(h, mm, pte);
- absent = !pte || huge_pte_none(huge_ptep_get(pte));
-
- /*
- * When coredumping, it suits get_dump_page if we just return
- * an error where there's an empty slot with no huge pagecache
- * to back it. This way, we avoid allocating a hugepage, and
- * the sparse dumpfile avoids allocating disk blocks, but its
- * huge holes still show up with zeroes where they need to be.
- */
- if (absent && (flags & FOLL_DUMP) &&
- !hugetlbfs_pagecache_present(h, vma, vaddr)) {
- if (pte)
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- remainder = 0;
- break;
- }
-
- /*
- * We need call hugetlb_fault for both hugepages under migration
- * (in which case hugetlb_fault waits for the migration,) and
- * hwpoisoned hugepages (in which case we need to prevent the
- * caller from accessing to them.) In order to do this, we use
- * here is_swap_pte instead of is_hugetlb_entry_migration and
- * is_hugetlb_entry_hwpoisoned. This is because it simply covers
- * both cases, and because we can't follow correct pages
- * directly from any kind of swap entries.
- */
- if (absent ||
- __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) {
- vm_fault_t ret;
- unsigned int fault_flags = 0;
-
- if (pte)
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
-
- if (flags & FOLL_WRITE)
- fault_flags |= FAULT_FLAG_WRITE;
- else if (unshare)
- fault_flags |= FAULT_FLAG_UNSHARE;
- if (locked) {
- fault_flags |= FAULT_FLAG_ALLOW_RETRY |
- FAULT_FLAG_KILLABLE;
- if (flags & FOLL_INTERRUPTIBLE)
- fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
- }
- if (flags & FOLL_NOWAIT)
- fault_flags |= FAULT_FLAG_ALLOW_RETRY |
- FAULT_FLAG_RETRY_NOWAIT;
- if (flags & FOLL_TRIED) {
- /*
- * Note: FAULT_FLAG_ALLOW_RETRY and
- * FAULT_FLAG_TRIED can co-exist
- */
- fault_flags |= FAULT_FLAG_TRIED;
- }
- ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
- if (ret & VM_FAULT_ERROR) {
- err = vm_fault_to_errno(ret, flags);
- remainder = 0;
- break;
- }
- if (ret & VM_FAULT_RETRY) {
- if (locked &&
- !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
- *locked = 0;
- *nr_pages = 0;
- /*
- * VM_FAULT_RETRY must not return an
- * error, it will return zero
- * instead.
- *
- * No need to update "position" as the
- * caller will not check it after
- * *nr_pages is set to 0.
- */
- return i;
- }
- continue;
- }
-
- pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
- page = pte_page(huge_ptep_get(pte));
-
- VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
- !PageAnonExclusive(page), page);
-
- /*
- * If subpage information not requested, update counters
- * and skip the same_page loop below.
- */
- if (!pages && !pfn_offset &&
- (vaddr + huge_page_size(h) < vma->vm_end) &&
- (remainder >= pages_per_huge_page(h))) {
- vaddr += huge_page_size(h);
- remainder -= pages_per_huge_page(h);
- i += pages_per_huge_page(h);
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- continue;
- }
-
- /* vaddr may not be aligned to PAGE_SIZE */
- refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
- (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
-
- if (pages)
- record_subpages(nth_page(page, pfn_offset),
- vma, refs,
- likely(pages) ? pages + i : NULL);
-
- if (pages) {
- /*
- * try_grab_folio() should always succeed here,
- * because: a) we hold the ptl lock, and b) we've just
- * checked that the huge page is present in the page
- * tables. If the huge page is present, then the tail
- * pages must also be present. The ptl prevents the
- * head page and tail pages from being rearranged in
- * any way. As this is hugetlb, the pages will never
- * be p2pdma or not longterm pinable. So this page
- * must be available at this point, unless the page
- * refcount overflowed:
- */
- if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
- flags))) {
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- remainder = 0;
- err = -ENOMEM;
- break;
- }
- }
-
- vaddr += (refs << PAGE_SHIFT);
- remainder -= refs;
- i += refs;
-
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- }
- *nr_pages = remainder;
- /*
- * setting position is actually required only if remainder is
- * not zero but it's faster not to add a "if (remainder)"
- * branch.
- */
- *position = vaddr;
-
- return i ? i : err;
-}
-
long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
--
2.40.1
next prev parent reply other threads:[~2023-06-23 14:29 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-06-23 14:29 [PATCH v3 0/8] mm/gup: Unify hugetlb, speed up thp Peter Xu
2023-06-23 14:29 ` [PATCH v3 1/8] mm/hugetlb: Handle FOLL_DUMP well in follow_page_mask() Peter Xu
2023-06-23 14:29 ` [PATCH v3 2/8] mm/hugetlb: Prepare hugetlb_follow_page_mask() for FOLL_PIN Peter Xu
2023-06-26 8:06 ` David Hildenbrand
2023-06-26 16:23 ` Peter Xu
2023-06-23 14:29 ` [PATCH v3 3/8] mm/hugetlb: Add page_mask for hugetlb_follow_page_mask() Peter Xu
2023-06-23 14:29 ` [PATCH v3 4/8] mm/gup: Cleanup next_page handling Peter Xu
2023-06-23 14:29 ` [PATCH v3 5/8] mm/gup: Accelerate thp gup even for "pages != NULL" Peter Xu
2023-06-23 14:29 ` Peter Xu [this message]
2023-06-26 8:08 ` [PATCH v3 6/8] mm/gup: Retire follow_hugetlb_page() David Hildenbrand
2023-06-23 14:29 ` [PATCH v3 7/8] selftests/mm: Add -a to run_vmtests.sh Peter Xu
2023-06-26 8:10 ` David Hildenbrand
2023-06-26 16:24 ` Peter Xu
2023-06-26 20:24 ` John Hubbard
2023-06-26 20:40 ` Peter Xu
2023-06-23 14:29 ` [PATCH v3 8/8] selftests/mm: Add gup test matrix in run_vmtests.sh Peter Xu
2023-06-26 8:11 ` David Hildenbrand
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230623142936.268456-7-peterx@redhat.com \
--to=peterx@redhat.com \
--cc=aarcange@redhat.com \
--cc=akpm@linux-foundation.org \
--cc=david@redhat.com \
--cc=hughd@google.com \
--cc=jgg@nvidia.com \
--cc=jhubbard@nvidia.com \
--cc=jthoughton@google.com \
--cc=kirill@shutemov.name \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=lstoakes@gmail.com \
--cc=mike.kravetz@oracle.com \
--cc=rppt@kernel.org \
--cc=shy828301@gmail.com \
--cc=vbabka@suse.cz \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).