From: Dev Jain <dev.jain@arm.com>
To: akpm@linux-foundation.org, david@redhat.com, willy@infradead.org,
kirill.shutemov@linux.intel.com
Cc: npache@redhat.com, ryan.roberts@arm.com,
anshuman.khandual@arm.com, catalin.marinas@arm.com,
cl@gentwo.org, vbabka@suse.cz, mhocko@suse.com,
apopple@nvidia.com, dave.hansen@linux.intel.com, will@kernel.org,
baohua@kernel.org, jack@suse.cz, srivatsa@csail.mit.edu,
haowenchao22@gmail.com, hughd@google.com,
aneesh.kumar@kernel.org, yang@os.amperecomputing.com,
peterx@redhat.com, ioworker0@gmail.com,
wangkefeng.wang@huawei.com, ziy@nvidia.com, jglisse@google.com,
surenb@google.com, vishal.moola@gmail.com, zokeefe@google.com,
zhengqi.arch@bytedance.com, jhubbard@nvidia.com,
21cnbao@gmail.com, linux-mm@kvack.org,
linux-kernel@vger.kernel.org, Dev Jain <dev.jain@arm.com>
Subject: [PATCH v2 07/17] khugepaged: Scan PTEs order-wise
Date: Tue, 11 Feb 2025 16:43:16 +0530 [thread overview]
Message-ID: <20250211111326.14295-8-dev.jain@arm.com> (raw)
In-Reply-To: <20250211111326.14295-1-dev.jain@arm.com>
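Scan the PTEs order-wise, using the mask of suitable orders for this VMA
derived in conjunction with the sysfs THP settings. For each order, scale
down the khugepaged tunables (max_ptes_none, max_ptes_shared and
max_ptes_swap) in proportion to the order being scanned; this scaling is to
be changed in subsequent patches. If the scan or the collapse fails at a
given order, we drop down to the next lower order in the mask. Otherwise, we
skip over the collapsed range, try to jump to the highest possible order
permitted by the alignment of the next address, and then start a fresh scan.
Note that madvise(MADV_COLLAPSE) has not been generalized: it still starts
at PMD order, irrespective of the sysfs settings.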
Signed-off-by: Dev Jain <dev.jain@arm.com>
---
mm/khugepaged.c | 97 ++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 83 insertions(+), 14 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 498cb5ad9ff1..fbfd8a78ef51 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -21,6 +21,7 @@
#include <linux/shmem_fs.h>
#include <linux/dax.h>
#include <linux/ksm.h>
+#include <linux/count_zeros.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -1295,36 +1296,57 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int result = SCAN_FAIL, referenced = 0;
- int none_or_zero = 0, shared = 0;
- struct page *page = NULL;
struct folio *folio = NULL;
- unsigned long _address;
+ int result = SCAN_FAIL;
spinlock_t *ptl;
- int node = NUMA_NO_NODE, unmapped = 0;
+ unsigned int max_ptes_shared, max_ptes_none, max_ptes_swap;
+ int referenced, shared, none_or_zero, unmapped;
+ unsigned long _address, orig_address = address;
+ int node = NUMA_NO_NODE;
bool writable = false;
+ unsigned long orders, orig_orders;
+ int order, prev_order;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ orders = thp_vma_allowable_orders(vma, vma->vm_flags,
+ TVA_IN_PF | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL_ANON);
+ orders = thp_vma_suitable_orders(vma, address, orders);
+ orig_orders = orders;
+ order = highest_order(orders);
+
+ /* MADV_COLLAPSE needs to work irrespective of sysfs setting */
+ if (!cc->is_khugepaged)
+ order = HPAGE_PMD_ORDER;
+
+scan_pte_range:
+
+ max_ptes_shared = khugepaged_max_ptes_shared >> (HPAGE_PMD_ORDER - order);
+ max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
+ max_ptes_swap = khugepaged_max_ptes_swap >> (HPAGE_PMD_ORDER - order);
+ referenced = 0, shared = 0, none_or_zero = 0, unmapped = 0;
+
+ /* Check pmd after taking mmap lock */
result = find_pmd_or_thp_or_none(mm, address, &pmd);
if (result != SCAN_SUCCEED)
goto out;
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
+
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte) {
result = SCAN_PMD_NULL;
goto out;
}
- for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
+ for (_address = address, _pte = pte; _pte < pte + (1UL << order);
_pte++, _address += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
if (is_swap_pte(pteval)) {
++unmapped;
if (!cc->is_khugepaged ||
- unmapped <= khugepaged_max_ptes_swap) {
+ unmapped <= max_ptes_swap) {
/*
* Always be strict with uffd-wp
* enabled swap entries. Please see
@@ -1345,7 +1367,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
++none_or_zero;
if (!userfaultfd_armed(vma) &&
(!cc->is_khugepaged ||
- none_or_zero <= khugepaged_max_ptes_none)) {
+ none_or_zero <= max_ptes_none)) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
@@ -1369,12 +1391,11 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
if (pte_write(pteval))
writable = true;
- page = vm_normal_page(vma, _address, pteval);
- if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
+ folio = vm_normal_folio(vma, _address, pteval);
+ if (unlikely(!folio) || unlikely(folio_is_zone_device(folio))) {
result = SCAN_PAGE_NULL;
goto out_unmap;
}
- folio = page_folio(page);
if (!folio_test_anon(folio)) {
result = SCAN_PAGE_ANON;
@@ -1390,7 +1411,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
if (folio_likely_mapped_shared(folio)) {
++shared;
if (cc->is_khugepaged &&
- shared > khugepaged_max_ptes_shared) {
+ shared > max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
goto out_unmap;
@@ -1447,7 +1468,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
result = SCAN_PAGE_RO;
} else if (cc->is_khugepaged &&
(!referenced ||
- (unmapped && referenced < HPAGE_PMD_NR / 2))) {
+ (unmapped && referenced < (1UL << order) / 2))) {
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
@@ -1456,10 +1477,58 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
pte_unmap_unlock(pte, ptl);
if (result == SCAN_SUCCEED) {
result = collapse_huge_page(mm, address, referenced,
- unmapped, HPAGE_PMD_ORDER, cc);
+ unmapped, order, cc);
/* collapse_huge_page will return with the mmap_lock released */
*mmap_locked = false;
+ /* Skip over this range and decide order */
+ if (result == SCAN_SUCCEED)
+ goto decide_order;
+ }
+ if (result != SCAN_SUCCEED) {
+
+ /* Go to the next order */
+ prev_order = order;
+ order = next_order(&orders, order);
+ if (order < 2) {
+ /* Skip over this range, and decide order */
+ _address = address + (PAGE_SIZE << prev_order);
+ _pte = pte + (1UL << prev_order);
+ goto decide_order;
+ }
+ goto maybe_mmap_lock;
}
+
+decide_order:
+ /* Immediately exit on exhaustion of range */
+ if (_address == orig_address + (PAGE_SIZE << HPAGE_PMD_ORDER))
+ goto out;
+
+ /* Get highest order possible starting from address */
+ order = count_trailing_zeros(_address >> PAGE_SHIFT);
+
+ orders = orig_orders & ((1UL << (order + 1)) - 1);
+ if (!(orders & (1UL << order)))
+ order = next_order(&orders, order);
+
+ /* This should never happen, since we are on an aligned address */
+ BUG_ON(cc->is_khugepaged && order < 2);
+
+ address = _address;
+ pte = _pte;
+
+maybe_mmap_lock:
+ if (!(*mmap_locked)) {
+ mmap_read_lock(mm);
+ *mmap_locked = true;
+ /* Validate VMA after retaking mmap_lock */
+ result = hugepage_vma_revalidate(mm, address, true, &vma,
+ order, cc);
+ if (result != SCAN_SUCCEED) {
+ mmap_read_unlock(mm);
+ goto out;
+ }
+ }
+ goto scan_pte_range;
out:
trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
none_or_zero, result, unmapped);
--
2.30.2
next prev parent reply other threads:[~2025-02-11 11:14 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-02-11 11:13 [PATCH v2 00/17] khugepaged: Asynchronous mTHP collapse Dev Jain
2025-02-11 11:13 ` [PATCH v2 01/17] khugepaged: Generalize alloc_charge_folio() Dev Jain
2025-02-11 11:13 ` [PATCH v2 02/17] khugepaged: Generalize hugepage_vma_revalidate() Dev Jain
2025-02-11 11:13 ` [PATCH v2 03/17] khugepaged: Generalize __collapse_huge_page_swapin() Dev Jain
2025-02-11 11:13 ` [PATCH v2 04/17] khugepaged: Generalize __collapse_huge_page_isolate() Dev Jain
2025-02-11 11:13 ` [PATCH v2 05/17] khugepaged: Generalize __collapse_huge_page_copy() Dev Jain
2025-02-11 11:13 ` [PATCH v2 06/17] khugepaged: Abstract PMD-THP collapse Dev Jain
2025-02-11 11:13 ` Dev Jain [this message]
2025-02-11 11:13 ` [PATCH v2 08/17] khugepaged: Introduce vma_collapse_anon_folio() Dev Jain
2025-02-11 11:13 ` [PATCH v2 09/17] khugepaged: Define collapse policy if a larger folio is already mapped Dev Jain
2025-02-11 11:13 ` [PATCH v2 10/17] khugepaged: Exit early on fully-mapped aligned mTHP Dev Jain
2025-02-11 11:13 ` [PATCH v2 11/17] khugepaged: Enable sysfs to control order of collapse Dev Jain
2025-02-11 11:13 ` [PATCH v2 12/17] khugepaged: Enable variable-sized VMA collapse Dev Jain
2025-02-11 11:13 ` [PATCH v2 13/17] khugepaged: Lock all VMAs mapping the PTE table Dev Jain
2025-02-11 11:13 ` [PATCH v2 14/17] khugepaged: Reset scan address to correct alignment Dev Jain
2025-02-11 11:13 ` [PATCH v2 15/17] khugepaged: Delay cond_resched() Dev Jain
2025-02-11 11:13 ` [PATCH v2 16/17] khugepaged: Implement strict policy for mTHP collapse Dev Jain
2025-02-11 11:13 ` [PATCH v2 17/17] Documentation: transhuge: Define khugepaged mTHP collapse policy Dev Jain
2025-02-11 23:23 ` [PATCH v2 00/17] khugepaged: Asynchronous mTHP collapse Andrew Morton
2025-02-12 4:18 ` Dev Jain
2025-02-15 1:47 ` Nico Pache
2025-02-15 7:36 ` Dev Jain
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250211111326.14295-8-dev.jain@arm.com \
--to=dev.jain@arm.com \
--cc=21cnbao@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=aneesh.kumar@kernel.org \
--cc=anshuman.khandual@arm.com \
--cc=apopple@nvidia.com \
--cc=baohua@kernel.org \
--cc=catalin.marinas@arm.com \
--cc=cl@gentwo.org \
--cc=dave.hansen@linux.intel.com \
--cc=david@redhat.com \
--cc=haowenchao22@gmail.com \
--cc=hughd@google.com \
--cc=ioworker0@gmail.com \
--cc=jack@suse.cz \
--cc=jglisse@google.com \
--cc=jhubbard@nvidia.com \
--cc=kirill.shutemov@linux.intel.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mhocko@suse.com \
--cc=npache@redhat.com \
--cc=peterx@redhat.com \
--cc=ryan.roberts@arm.com \
--cc=srivatsa@csail.mit.edu \
--cc=surenb@google.com \
--cc=vbabka@suse.cz \
--cc=vishal.moola@gmail.com \
--cc=wangkefeng.wang@huawei.com \
--cc=will@kernel.org \
--cc=willy@infradead.org \
--cc=yang@os.amperecomputing.com \
--cc=zhengqi.arch@bytedance.com \
--cc=ziy@nvidia.com \
--cc=zokeefe@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.