From: Ryan Roberts <ryan.roberts@arm.com>
To: Andrew Morton <akpm@linux-foundation.org>,
"Matthew Wilcox (Oracle)" <willy@infradead.org>,
"Yin, Fengwei" <fengwei.yin@intel.com>,
Yu Zhao <yuzhao@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>,
linux-mm@kvack.org, linux-arm-kernel@lists.infradead.org
Subject: [RFC PATCH 5/6] mm: Allocate large folios for anonymous memory
Date: Fri, 17 Mar 2023 10:58:01 +0000 [thread overview]
Message-ID: <20230317105802.2634004-6-ryan.roberts@arm.com> (raw)
In-Reply-To: <20230317105802.2634004-1-ryan.roberts@arm.com>
Add the machinery to determine what order of folio to allocate within
do_anonymous_page() and deal with racing faults to the same region.
TODO: For now, the maximum order is set to 4. This should probably be
set per-vma based on factors, and adjusted dynamically.
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
mm/memory.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 124 insertions(+), 16 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index c9e09415ee18..3d01eab46d9c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4013,6 +4013,77 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
return ret;
}
+/*
+ * Returns index of first pte that is not none, or nr if all are none.
+ */
+static int check_all_ptes_none(pte_t *pte, int nr)
+{
+ int i;
+
+ for (i = 0; i < nr; i++) {
+ if (!pte_none(*pte++))
+ return i;
+ }
+
+ return nr;
+}
+
+static void calc_anonymous_folio_order(struct vm_fault *vmf,
+ int *order_out,
+ unsigned long *addr_out)
+{
+ /*
+ * The aim here is to determine what size of folio we should allocate
+ * for this fault. Factors include:
+ * - Folio must be naturally aligned within VA space
+ * - Folio must not breach boundaries of vma
+ * - Folio must be fully contained inside one pmd entry
+ * - Folio must not overlap any non-none ptes
+ * - Order must not be higher than *order_out upon entry
+ *
+ * Note that the caller may or may not choose to lock the pte. If
+ * unlocked, the calculation should be considered an estimate that will
+ * need to be validated under the lock.
+ */
+
+ struct vm_area_struct *vma = vmf->vma;
+ int nr;
+ int order = min(*order_out, PMD_SHIFT - PAGE_SHIFT);
+ unsigned long addr;
+ pte_t *pte;
+ pte_t *first_set = NULL;
+ int ret;
+
+ for (; order > 0; order--) {
+ nr = 1 << order;
+ addr = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
+ pte = vmf->pte - ((vmf->address - addr) >> PAGE_SHIFT);
+
+ /* Check vma bounds. */
+ if (addr < vma->vm_start ||
+ addr + nr * PAGE_SIZE > vma->vm_end)
+ continue;
+
+ /* All ptes covered by order already known to be none. */
+ if (pte + nr <= first_set)
+ break;
+
+ /* Already found set pte in range covered by order. */
+ if (pte <= first_set)
+ continue;
+
+ /* Need to check if all the ptes are none. */
+ ret = check_all_ptes_none(pte, nr);
+ if (ret == nr)
+ break;
+
+ first_set = pte + ret;
+ }
+
+ *order_out = order;
+ *addr_out = order > 0 ? addr : vmf->address;
+}
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -4024,6 +4095,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
struct folio *folio;
vm_fault_t ret = 0;
pte_t entry;
+ unsigned long addr;
+ int order = 4; // TODO: Policy for maximum folio order.
+ int pgcount;
/* File mapping without ->vm_ops ? */
if (vma->vm_flags & VM_SHARED)
@@ -4065,24 +4139,41 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_MISSING);
}
- goto setpte;
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+ goto unlock;
}
- /* Allocate our own private page. */
+retry:
+ /*
+ * Estimate the folio order to allocate. We are not under the ptl here
+ * so this estimate needs to be re-checked later once we have the lock.
+ */
+ vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+ calc_anonymous_folio_order(vmf, &order, &addr);
+ pte_unmap(vmf->pte);
+
+ /* Allocate our own private folio. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- folio = try_vma_alloc_zeroed_movable_folio(vma, vmf->address, 0);
+ folio = try_vma_alloc_zeroed_movable_folio(vma, addr, order);
if (!folio)
goto oom;
+ /* We may have been granted less than we asked for. */
+ order = folio_order(folio);
+ pgcount = folio_nr_pages(folio);
+
if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
goto oom_free_page;
- cgroup_throttle_swaprate(&folio->page, GFP_KERNEL);
+ folio_throttle_swaprate(folio, GFP_KERNEL);
/*
* The memory barrier inside __folio_mark_uptodate makes sure that
- * preceding stores to the page contents become visible before
- * the set_pte_at() write.
+ * preceding stores to the folio contents become visible before
+ * the set_ptes() write.
*/
__folio_mark_uptodate(folio);
@@ -4091,11 +4182,26 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
- &vmf->ptl);
- if (!pte_none(*vmf->pte)) {
- update_mmu_tlb(vma, vmf->address, vmf->pte);
- goto release;
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
+
+ /*
+ * Ensure our estimate above is still correct; we could have raced with
+ * another thread to service a fault in the region.
+ */
+ if (check_all_ptes_none(vmf->pte, pgcount) != pgcount) {
+ pte_t *pte = vmf->pte + ((vmf->address - addr) >> PAGE_SHIFT);
+
+ /* If faulting pte was allocated by another, exit early. */
+ if (!pte_none(*pte)) {
+ update_mmu_tlb(vma, vmf->address, pte);
+ goto release;
+ }
+
+ /* Else try again, with a lower order. */
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ folio_put(folio);
+ order--;
+ goto retry;
}
ret = check_stable_address_space(vma->vm_mm);
@@ -4109,14 +4215,16 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
return handle_userfault(vmf, VM_UFFD_MISSING);
}
- inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
- folio_add_new_anon_rmap(folio, vma, vmf->address);
+ folio_ref_add(folio, pgcount - 1);
+
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, pgcount);
+ folio_add_new_anon_rmap_range(folio, vma, addr);
folio_add_lru_vma(folio, vma);
-setpte:
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
+
+ set_ptes(vma->vm_mm, addr, vmf->pte, entry, pgcount);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ update_mmu_cache_range(vma, addr, vmf->pte, pgcount);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
--
2.25.1
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
WARNING: multiple messages have this Message-ID (diff)
From: Ryan Roberts <ryan.roberts@arm.com>
To: Andrew Morton <akpm@linux-foundation.org>,
"Matthew Wilcox (Oracle)" <willy@infradead.org>,
"Yin, Fengwei" <fengwei.yin@intel.com>,
Yu Zhao <yuzhao@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>,
linux-mm@kvack.org, linux-arm-kernel@lists.infradead.org
Subject: [RFC PATCH 5/6] mm: Allocate large folios for anonymous memory
Date: Fri, 17 Mar 2023 10:58:01 +0000 [thread overview]
Message-ID: <20230317105802.2634004-6-ryan.roberts@arm.com> (raw)
In-Reply-To: <20230317105802.2634004-1-ryan.roberts@arm.com>
Add the machinery to determine what order of folio to allocate within
do_anonymous_page() and deal with racing faults to the same region.
TODO: For now, the maximum order is set to 4. This should probably be
set per-vma based on factors, and adjusted dynamically.
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
mm/memory.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 124 insertions(+), 16 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index c9e09415ee18..3d01eab46d9c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4013,6 +4013,77 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
return ret;
}
+/*
+ * Returns index of first pte that is not none, or nr if all are none.
+ */
+static int check_all_ptes_none(pte_t *pte, int nr)
+{
+ int i;
+
+ for (i = 0; i < nr; i++) {
+ if (!pte_none(*pte++))
+ return i;
+ }
+
+ return nr;
+}
+
+static void calc_anonymous_folio_order(struct vm_fault *vmf,
+ int *order_out,
+ unsigned long *addr_out)
+{
+ /*
+ * The aim here is to determine what size of folio we should allocate
+ * for this fault. Factors include:
+ * - Folio must be naturally aligned within VA space
+ * - Folio must not breach boundaries of vma
+ * - Folio must be fully contained inside one pmd entry
+ * - Folio must not overlap any non-none ptes
+ * - Order must not be higher than *order_out upon entry
+ *
+ * Note that the caller may or may not choose to lock the pte. If
+ * unlocked, the calculation should be considered an estimate that will
+ * need to be validated under the lock.
+ */
+
+ struct vm_area_struct *vma = vmf->vma;
+ int nr;
+ int order = min(*order_out, PMD_SHIFT - PAGE_SHIFT);
+ unsigned long addr;
+ pte_t *pte;
+ pte_t *first_set = NULL;
+ int ret;
+
+ for (; order > 0; order--) {
+ nr = 1 << order;
+ addr = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
+ pte = vmf->pte - ((vmf->address - addr) >> PAGE_SHIFT);
+
+ /* Check vma bounds. */
+ if (addr < vma->vm_start ||
+ addr + nr * PAGE_SIZE > vma->vm_end)
+ continue;
+
+ /* All ptes covered by order already known to be none. */
+ if (pte + nr <= first_set)
+ break;
+
+ /* Already found set pte in range covered by order. */
+ if (pte <= first_set)
+ continue;
+
+ /* Need to check if all the ptes are none. */
+ ret = check_all_ptes_none(pte, nr);
+ if (ret == nr)
+ break;
+
+ first_set = pte + ret;
+ }
+
+ *order_out = order;
+ *addr_out = order > 0 ? addr : vmf->address;
+}
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -4024,6 +4095,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
struct folio *folio;
vm_fault_t ret = 0;
pte_t entry;
+ unsigned long addr;
+ int order = 4; // TODO: Policy for maximum folio order.
+ int pgcount;
/* File mapping without ->vm_ops ? */
if (vma->vm_flags & VM_SHARED)
@@ -4065,24 +4139,41 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_MISSING);
}
- goto setpte;
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+ goto unlock;
}
- /* Allocate our own private page. */
+retry:
+ /*
+ * Estimate the folio order to allocate. We are not under the ptl here
+ * so this estimate needs to be re-checked later once we have the lock.
+ */
+ vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+ calc_anonymous_folio_order(vmf, &order, &addr);
+ pte_unmap(vmf->pte);
+
+ /* Allocate our own private folio. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- folio = try_vma_alloc_zeroed_movable_folio(vma, vmf->address, 0);
+ folio = try_vma_alloc_zeroed_movable_folio(vma, addr, order);
if (!folio)
goto oom;
+ /* We may have been granted less than we asked for. */
+ order = folio_order(folio);
+ pgcount = folio_nr_pages(folio);
+
if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
goto oom_free_page;
- cgroup_throttle_swaprate(&folio->page, GFP_KERNEL);
+ folio_throttle_swaprate(folio, GFP_KERNEL);
/*
* The memory barrier inside __folio_mark_uptodate makes sure that
- * preceding stores to the page contents become visible before
- * the set_pte_at() write.
+ * preceding stores to the folio contents become visible before
+ * the set_ptes() write.
*/
__folio_mark_uptodate(folio);
@@ -4091,11 +4182,26 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
- &vmf->ptl);
- if (!pte_none(*vmf->pte)) {
- update_mmu_tlb(vma, vmf->address, vmf->pte);
- goto release;
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
+
+ /*
+ * Ensure our estimate above is still correct; we could have raced with
+ * another thread to service a fault in the region.
+ */
+ if (check_all_ptes_none(vmf->pte, pgcount) != pgcount) {
+ pte_t *pte = vmf->pte + ((vmf->address - addr) >> PAGE_SHIFT);
+
+ /* If faulting pte was allocated by another, exit early. */
+ if (!pte_none(*pte)) {
+ update_mmu_tlb(vma, vmf->address, pte);
+ goto release;
+ }
+
+ /* Else try again, with a lower order. */
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ folio_put(folio);
+ order--;
+ goto retry;
}
ret = check_stable_address_space(vma->vm_mm);
@@ -4109,14 +4215,16 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
return handle_userfault(vmf, VM_UFFD_MISSING);
}
- inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
- folio_add_new_anon_rmap(folio, vma, vmf->address);
+ folio_ref_add(folio, pgcount - 1);
+
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, pgcount);
+ folio_add_new_anon_rmap_range(folio, vma, addr);
folio_add_lru_vma(folio, vma);
-setpte:
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
+
+ set_ptes(vma->vm_mm, addr, vmf->pte, entry, pgcount);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ update_mmu_cache_range(vma, addr, vmf->pte, pgcount);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
--
2.25.1
next prev parent reply other threads:[~2023-03-17 10:59 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-03-17 10:57 [RFC PATCH 0/6] variable-order, large folios for anonymous memory Ryan Roberts
2023-03-17 10:57 ` Ryan Roberts
2023-03-17 10:57 ` [RFC PATCH 1/6] mm: Expose clear_huge_page() unconditionally Ryan Roberts
2023-03-17 10:57 ` Ryan Roberts
2023-03-17 10:57 ` [RFC PATCH 2/6] mm: pass gfp flags and order to vma_alloc_zeroed_movable_folio() Ryan Roberts
2023-03-17 10:57 ` Ryan Roberts
2023-03-17 10:57 ` [RFC PATCH 3/6] mm: Introduce try_vma_alloc_zeroed_movable_folio() Ryan Roberts
2023-03-17 10:57 ` Ryan Roberts
2023-03-17 10:58 ` [RFC PATCH 4/6] mm: Implement folio_add_new_anon_rmap_range() Ryan Roberts
2023-03-17 10:58 ` Ryan Roberts
2023-03-22 6:59 ` Yin Fengwei
2023-03-22 6:59 ` Yin Fengwei
2023-03-22 7:10 ` Yin Fengwei
2023-03-22 7:10 ` Yin Fengwei
2023-03-22 7:42 ` Ryan Roberts
2023-03-22 7:42 ` Ryan Roberts
2023-03-17 10:58 ` Ryan Roberts [this message]
2023-03-17 10:58 ` [RFC PATCH 5/6] mm: Allocate large folios for anonymous memory Ryan Roberts
2023-03-17 10:58 ` [RFC PATCH 6/6] WORKAROUND: Don't split large folios on madvise Ryan Roberts
2023-03-17 10:58 ` Ryan Roberts
2023-03-22 8:19 ` Yin Fengwei
2023-03-22 8:19 ` Yin Fengwei
2023-03-22 8:59 ` Ryan Roberts
2023-03-22 8:59 ` Ryan Roberts
2023-03-22 12:03 ` [RFC PATCH 0/6] variable-order, large folios for anonymous memory Ryan Roberts
2023-03-22 12:03 ` Ryan Roberts
2023-03-22 13:36 ` Yin, Fengwei
2023-03-22 13:36 ` Yin, Fengwei
2023-03-22 14:25 ` Ryan Roberts
2023-03-22 14:25 ` Ryan Roberts
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230317105802.2634004-6-ryan.roberts@arm.com \
--to=ryan.roberts@arm.com \
--cc=akpm@linux-foundation.org \
--cc=fengwei.yin@intel.com \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-mm@kvack.org \
--cc=willy@infradead.org \
--cc=yuzhao@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.