From: James Houghton <jthoughton@google.com>
To: Mike Kravetz <mike.kravetz@oracle.com>,
Muchun Song <songmuchun@bytedance.com>,
Peter Xu <peterx@redhat.com>
Cc: David Hildenbrand <david@redhat.com>,
David Rientjes <rientjes@google.com>,
Axel Rasmussen <axelrasmussen@google.com>,
Mina Almasry <almasrymina@google.com>,
Jue Wang <juew@google.com>,
Manish Mishra <manish.mishra@nutanix.com>,
"Dr . David Alan Gilbert" <dgilbert@redhat.com>,
linux-mm@kvack.org, linux-kernel@vger.kernel.org,
James Houghton <jthoughton@google.com>
Subject: [RFC PATCH 14/26] hugetlb: add HGM support for hugetlb_fault and hugetlb_no_page
Date: Fri, 24 Jun 2022 17:36:44 +0000 [thread overview]
Message-ID: <20220624173656.2033256-15-jthoughton@google.com> (raw)
In-Reply-To: <20220624173656.2033256-1-jthoughton@google.com>
This patch is the first main functional HugeTLB change. Together, these
changes allow the HugeTLB fault path to handle faults on HGM-enabled
VMAs. Two main behaviors are now possible:
1. Faults can be passed to handle_userfault. (Userspace will want to
use UFFD_FEATURE_REAL_ADDRESS to get the real address to know which
region they should call UFFDIO_CONTINUE on later.)
2. Faults on pages that have been partially mapped (and userfaultfd is
not being used) will get mapped at the largest possible size.
For example, if a 1G page has been partially mapped at 2M, and we
fault on an unmapped 2M section, hugetlb_no_page will create a 2M
PMD to map the faulting address.
This commit does not handle hugetlb_wp right now, and it doesn't handle
HugeTLB page migration and swap entries.
Signed-off-by: James Houghton <jthoughton@google.com>
---
include/linux/hugetlb.h | 12 ++++
mm/hugetlb.c | 121 +++++++++++++++++++++++++++++++---------
2 files changed, 106 insertions(+), 27 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 321f5745d87f..ac4ac8fbd901 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1185,6 +1185,9 @@ enum split_mode {
#ifdef CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING
/* If HugeTLB high-granularity mappings are enabled for this VMA. */
bool hugetlb_hgm_enabled(struct vm_area_struct *vma);
+int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long start,
+ unsigned long end);
int huge_pte_alloc_high_granularity(struct hugetlb_pte *hpte,
struct mm_struct *mm,
struct vm_area_struct *vma,
@@ -1197,6 +1200,15 @@ static inline bool hugetlb_hgm_enabled(struct vm_area_struct *vma)
{
return false;
}
+
+static inline
+int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+{
+ BUG();
+}
+
static inline int huge_pte_alloc_high_granularity(struct hugetlb_pte *hpte,
struct mm_struct *mm,
struct vm_area_struct *vma,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6e0c5fbfe32c..da30621656b8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5605,18 +5605,24 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping, pgoff_t idx,
- unsigned long address, pte_t *ptep,
+ unsigned long address, struct hugetlb_pte *hpte,
pte_t old_pte, unsigned int flags)
{
struct hstate *h = hstate_vma(vma);
vm_fault_t ret = VM_FAULT_SIGBUS;
int anon_rmap = 0;
unsigned long size;
- struct page *page;
+ struct page *page, *subpage;
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
+ unsigned long haddr_hgm = address & hugetlb_pte_mask(hpte);
bool new_page, new_pagecache_page = false;
+ /*
+ * This page is getting mapped for the first time, in which case we
+ * want to increment its mapcount.
+ */
+ bool new_mapping = hpte->shift == huge_page_shift(h);
/*
* Currently, we are forced to kill the process in the event the
@@ -5665,9 +5671,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* here. Before returning error, get ptl and make
* sure there really is no pte entry.
*/
- ptl = huge_pte_lock(h, mm, ptep);
+ ptl = hugetlb_pte_lock(mm, hpte);
ret = 0;
- if (huge_pte_none(huge_ptep_get(ptep)))
+ if (hugetlb_pte_none(hpte))
ret = vmf_error(PTR_ERR(page));
spin_unlock(ptl);
goto out;
@@ -5731,18 +5737,25 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
vma_end_reservation(h, vma, haddr);
}
- ptl = huge_pte_lock(h, mm, ptep);
+ ptl = hugetlb_pte_lock(mm, hpte);
ret = 0;
/* If pte changed from under us, retry */
- if (!pte_same(huge_ptep_get(ptep), old_pte))
+ if (!pte_same(hugetlb_ptep_get(hpte), old_pte))
goto backout;
- if (anon_rmap) {
- ClearHPageRestoreReserve(page);
- hugepage_add_new_anon_rmap(page, vma, haddr);
- } else
- page_dup_file_rmap(page, true);
- new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+ if (new_mapping) {
+ /* Only increment this page's mapcount if we are mapping it
+ * for the first time.
+ */
+ if (anon_rmap) {
+ ClearHPageRestoreReserve(page);
+ hugepage_add_new_anon_rmap(page, vma, haddr);
+ } else
+ page_dup_file_rmap(page, true);
+ }
+
+ subpage = hugetlb_find_subpage(h, page, haddr_hgm);
+ new_pte = make_huge_pte(vma, subpage, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
/*
* If this pte was previously wr-protected, keep it wr-protected even
@@ -5750,12 +5763,13 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
*/
if (unlikely(pte_marker_uffd_wp(old_pte)))
new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
- set_huge_pte_at(mm, haddr, ptep, new_pte);
+ set_huge_pte_at(mm, haddr_hgm, hpte->ptep, new_pte);
- hugetlb_count_add(pages_per_huge_page(h), mm);
+ hugetlb_count_add(hugetlb_pte_size(hpte) / PAGE_SIZE, mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+ BUG_ON(hugetlb_pte_size(hpte) != huge_page_size(h));
/* Optimization, do the COW without a second fault */
- ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
+ ret = hugetlb_wp(mm, vma, address, hpte->ptep, flags, page, ptl);
}
spin_unlock(ptl);
@@ -5816,11 +5830,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
u32 hash;
pgoff_t idx;
struct page *page = NULL;
+ struct page *subpage = NULL;
struct page *pagecache_page = NULL;
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
int need_wait_lock = 0;
unsigned long haddr = address & huge_page_mask(h);
+ unsigned long haddr_hgm;
+ bool hgm_enabled = hugetlb_hgm_enabled(vma);
+ struct hugetlb_pte hpte;
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (ptep) {
@@ -5866,11 +5884,22 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
- entry = huge_ptep_get(ptep);
+ hugetlb_pte_populate(&hpte, ptep, huge_page_shift(h));
+
+ if (hgm_enabled) {
+ ret = hugetlb_walk_to(mm, &hpte, address,
+ PAGE_SIZE, /*stop_at_none=*/true);
+ if (ret) {
+ ret = vmf_error(ret);
+ goto out_mutex;
+ }
+ }
+
+ entry = hugetlb_ptep_get(&hpte);
/* PTE markers should be handled the same way as none pte */
- if (huge_pte_none_mostly(entry)) {
- ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
- entry, flags);
+ if (hugetlb_pte_none_mostly(&hpte)) {
+ ret = hugetlb_no_page(mm, vma, mapping, idx, address, &hpte,
+ entry, flags);
goto out_mutex;
}
@@ -5908,14 +5937,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vma, haddr);
}
- ptl = huge_pte_lock(h, mm, ptep);
+ ptl = hugetlb_pte_lock(mm, &hpte);
/* Check for a racing update before calling hugetlb_wp() */
- if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
+ if (unlikely(!pte_same(entry, hugetlb_ptep_get(&hpte))))
goto out_ptl;
+ /* haddr_hgm is the base address of the region that hpte maps. */
+ haddr_hgm = address & hugetlb_pte_mask(&hpte);
+
/* Handle userfault-wp first, before trying to lock more pages */
- if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
+ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(hugetlb_ptep_get(&hpte)) &&
(flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
struct vm_fault vmf = {
.vma = vma,
@@ -5939,7 +5971,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* pagecache_page, so here we need take the former one
* when page != pagecache_page or !pagecache_page.
*/
- page = pte_page(entry);
+ subpage = pte_page(entry);
+ page = compound_head(subpage);
if (page != pagecache_page)
if (!trylock_page(page)) {
need_wait_lock = 1;
@@ -5950,7 +5983,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!huge_pte_write(entry)) {
- ret = hugetlb_wp(mm, vma, address, ptep, flags,
+ BUG_ON(hugetlb_pte_size(&hpte) != huge_page_size(h));
+ ret = hugetlb_wp(mm, vma, address, hpte.ptep, flags,
pagecache_page, ptl);
goto out_put_page;
} else if (likely(flags & FAULT_FLAG_WRITE)) {
@@ -5958,9 +5992,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
}
entry = pte_mkyoung(entry);
- if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
+ if (huge_ptep_set_access_flags(vma, haddr_hgm, hpte.ptep, entry,
flags & FAULT_FLAG_WRITE))
- update_mmu_cache(vma, haddr, ptep);
+ update_mmu_cache(vma, haddr_hgm, hpte.ptep);
out_put_page:
if (page != pagecache_page)
unlock_page(page);
@@ -6951,7 +6985,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
pte = (pte_t *)pmd_alloc(mm, pud, addr);
}
}
- BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
+ if (!hugetlb_hgm_enabled(vma))
+ BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
return pte;
}
@@ -7057,6 +7092,38 @@ static unsigned int __shift_for_hstate(struct hstate *h)
(tmp_h) <= &hstates[hugetlb_max_hstate]; \
(tmp_h)++)
+/*
+ * Allocate a HugeTLB PTE that maps as much of [start, end) as possible with a
+ * single page table entry. The allocated HugeTLB PTE is returned in hpte.
+ */
+int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+{
+ struct hstate *h = hstate_vma(vma), *tmp_h;
+ unsigned int shift;
+ int ret;
+
+ for_each_hgm_shift(h, tmp_h, shift) {
+ unsigned long sz = 1UL << shift;
+
+ if (!IS_ALIGNED(start, sz) || start + sz > end)
+ continue;
+ ret = huge_pte_alloc_high_granularity(hpte, mm, vma, start,
+ shift, HUGETLB_SPLIT_NONE,
+ /*write_locked=*/false);
+ if (ret)
+ return ret;
+
+ if (hpte->shift > shift)
+ return -EEXIST;
+
+ BUG_ON(hpte->shift != shift);
+ return 0;
+ }
+ return -EINVAL;
+}
+
/*
* Given a particular address, split the HugeTLB PTE that currently maps it
* so that, for the given address, the PTE that maps it is `desired_shift`.
--
2.37.0.rc0.161.g10f37bed90-goog
next prev parent reply other threads:[~2022-06-24 17:37 UTC|newest]
Thread overview: 125+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-06-24 17:36 [RFC PATCH 00/26] hugetlb: Introduce HugeTLB high-granularity mapping James Houghton
2022-06-24 17:36 ` [RFC PATCH 01/26] hugetlb: make hstate accessor functions const James Houghton
2022-06-24 18:43 ` Mina Almasry
[not found] ` <e55f90f5-ba14-5d6e-8f8f-abf731b9095e@nutanix.com>
2022-06-27 12:09 ` manish.mishra
2022-06-28 17:08 ` James Houghton
2022-06-29 6:18 ` Muchun Song
2022-06-24 17:36 ` [RFC PATCH 02/26] hugetlb: sort hstates in hugetlb_init_hstates James Houghton
2022-06-24 18:51 ` Mina Almasry
2022-06-27 12:08 ` manish.mishra
2022-06-28 15:35 ` James Houghton
2022-06-27 18:42 ` Mike Kravetz
2022-06-28 15:40 ` James Houghton
2022-06-29 6:39 ` Muchun Song
2022-06-29 21:06 ` Mike Kravetz
2022-06-29 21:13 ` James Houghton
2022-06-24 17:36 ` [RFC PATCH 03/26] hugetlb: add make_huge_pte_with_shift James Houghton
2022-06-24 19:01 ` Mina Almasry
2022-06-27 12:13 ` manish.mishra
2022-06-24 17:36 ` [RFC PATCH 04/26] hugetlb: make huge_pte_lockptr take an explicit shift argument James Houghton
2022-06-27 12:26 ` manish.mishra
2022-06-27 20:51 ` Mike Kravetz
2022-06-28 15:29 ` James Houghton
2022-06-29 6:09 ` Muchun Song
2022-06-29 21:03 ` Mike Kravetz
2022-06-29 21:39 ` James Houghton
2022-06-29 22:24 ` Mike Kravetz
2022-06-30 9:35 ` Muchun Song
2022-06-30 16:23 ` James Houghton
2022-06-30 17:40 ` Mike Kravetz
2022-07-01 3:32 ` Muchun Song
2022-06-24 17:36 ` [RFC PATCH 05/26] hugetlb: add CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING James Houghton
2022-06-27 12:28 ` manish.mishra
2022-06-28 20:03 ` Mina Almasry
2022-06-24 17:36 ` [RFC PATCH 06/26] mm: make free_p?d_range functions public James Houghton
2022-06-27 12:31 ` manish.mishra
2022-06-28 20:35 ` Mike Kravetz
2022-07-12 20:52 ` James Houghton
2022-06-24 17:36 ` [RFC PATCH 07/26] hugetlb: add hugetlb_pte to track HugeTLB page table entries James Houghton
2022-06-27 12:47 ` manish.mishra
2022-06-29 16:28 ` James Houghton
2022-06-28 20:25 ` Mina Almasry
2022-06-29 16:42 ` James Houghton
2022-06-28 20:44 ` Mike Kravetz
2022-06-29 16:24 ` James Houghton
2022-07-11 23:32 ` Mike Kravetz
2022-07-12 9:42 ` Dr. David Alan Gilbert
2022-07-12 17:51 ` Mike Kravetz
2022-07-15 16:35 ` Peter Xu
2022-07-15 21:52 ` Axel Rasmussen
2022-07-15 23:03 ` Peter Xu
2022-09-08 17:38 ` Peter Xu
2022-09-08 17:54 ` James Houghton
2022-06-24 17:36 ` [RFC PATCH 08/26] hugetlb: add hugetlb_free_range to free PT structures James Houghton
2022-06-27 12:52 ` manish.mishra
2022-06-28 20:27 ` Mina Almasry
2022-06-24 17:36 ` [RFC PATCH 09/26] hugetlb: add hugetlb_hgm_enabled James Houghton
2022-06-27 12:55 ` manish.mishra
2022-06-28 20:33 ` Mina Almasry
2022-09-08 18:07 ` Peter Xu
2022-09-08 18:13 ` James Houghton
2022-06-24 17:36 ` [RFC PATCH 10/26] hugetlb: add for_each_hgm_shift James Houghton
2022-06-27 13:01 ` manish.mishra
2022-06-28 21:58 ` Mina Almasry
2022-07-07 21:39 ` Mike Kravetz
2022-07-08 15:52 ` James Houghton
2022-07-09 21:55 ` Mina Almasry
2022-06-24 17:36 ` [RFC PATCH 11/26] hugetlb: add hugetlb_walk_to to do PT walks James Houghton
2022-06-27 13:07 ` manish.mishra
2022-07-07 23:03 ` Mike Kravetz
2022-09-08 18:20 ` Peter Xu
2022-06-24 17:36 ` [RFC PATCH 12/26] hugetlb: add HugeTLB splitting functionality James Houghton
2022-06-27 13:50 ` manish.mishra
2022-06-29 16:10 ` James Houghton
2022-06-29 14:33 ` manish.mishra
2022-06-29 16:20 ` James Houghton
2022-06-24 17:36 ` [RFC PATCH 13/26] hugetlb: add huge_pte_alloc_high_granularity James Houghton
2022-06-29 14:11 ` manish.mishra
2022-06-24 17:36 ` James Houghton [this message]
2022-06-29 14:40 ` [RFC PATCH 14/26] hugetlb: add HGM support for hugetlb_fault and hugetlb_no_page manish.mishra
2022-06-29 15:56 ` James Houghton
2022-06-24 17:36 ` [RFC PATCH 15/26] hugetlb: make unmapping compatible with high-granularity mappings James Houghton
2022-07-19 10:19 ` manish.mishra
2022-07-19 15:58 ` James Houghton
2022-06-24 17:36 ` [RFC PATCH 16/26] hugetlb: make hugetlb_change_protection compatible with HGM James Houghton
2022-06-24 17:36 ` [RFC PATCH 17/26] hugetlb: update follow_hugetlb_page to support HGM James Houghton
2022-07-19 10:48 ` manish.mishra
2022-07-19 16:19 ` James Houghton
2022-06-24 17:36 ` [RFC PATCH 18/26] hugetlb: use struct hugetlb_pte for walk_hugetlb_range James Houghton
2022-06-24 17:36 ` [RFC PATCH 19/26] hugetlb: add HGM support for copy_hugetlb_page_range James Houghton
2022-07-11 23:41 ` Mike Kravetz
2022-07-12 17:19 ` James Houghton
2022-07-12 18:06 ` Mike Kravetz
2022-07-15 21:39 ` Axel Rasmussen
2022-06-24 17:36 ` [RFC PATCH 20/26] hugetlb: add support for high-granularity UFFDIO_CONTINUE James Houghton
2022-07-15 16:21 ` Peter Xu
2022-07-15 16:58 ` James Houghton
2022-07-15 17:20 ` Peter Xu
2022-07-20 20:58 ` James Houghton
2022-07-21 19:09 ` Peter Xu
2022-07-21 19:44 ` James Houghton
2022-07-21 19:53 ` Peter Xu
2022-06-24 17:36 ` [RFC PATCH 21/26] hugetlb: add hugetlb_collapse James Houghton
2022-06-24 17:36 ` [RFC PATCH 22/26] madvise: add uapi for HugeTLB HGM collapse: MADV_COLLAPSE James Houghton
2022-06-24 17:36 ` [RFC PATCH 23/26] userfaultfd: add UFFD_FEATURE_MINOR_HUGETLBFS_HGM James Houghton
2022-06-24 17:36 ` [RFC PATCH 24/26] arm64/hugetlb: add support for high-granularity mappings James Houghton
2022-06-24 17:36 ` [RFC PATCH 25/26] selftests: add HugeTLB HGM to userfaultfd selftest James Houghton
2022-06-24 17:36 ` [RFC PATCH 26/26] selftests: add HugeTLB HGM to KVM demand paging selftest James Houghton
2022-06-24 18:29 ` [RFC PATCH 00/26] hugetlb: Introduce HugeTLB high-granularity mapping Matthew Wilcox
2022-06-27 16:36 ` James Houghton
2022-06-27 17:56 ` Dr. David Alan Gilbert
2022-06-27 20:31 ` James Houghton
2022-06-28 0:04 ` Nadav Amit
2022-06-30 19:21 ` Peter Xu
2022-07-01 5:54 ` Nadav Amit
2022-06-28 8:20 ` Dr. David Alan Gilbert
2022-06-30 16:09 ` Peter Xu
2022-06-24 18:41 ` Mina Almasry
2022-06-27 16:27 ` James Houghton
2022-06-28 14:17 ` Muchun Song
2022-06-28 17:26 ` Mina Almasry
2022-06-28 17:56 ` Dr. David Alan Gilbert
2022-06-29 18:31 ` James Houghton
2022-06-29 20:39 ` Axel Rasmussen
2022-06-24 18:47 ` Matthew Wilcox
2022-06-27 16:48 ` James Houghton
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220624173656.2033256-15-jthoughton@google.com \
--to=jthoughton@google.com \
--cc=almasrymina@google.com \
--cc=axelrasmussen@google.com \
--cc=david@redhat.com \
--cc=dgilbert@redhat.com \
--cc=juew@google.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=manish.mishra@nutanix.com \
--cc=mike.kravetz@oracle.com \
--cc=peterx@redhat.com \
--cc=rientjes@google.com \
--cc=songmuchun@bytedance.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).