From: Nico Pache <npache@redhat.com>
To: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
linux-mm@kvack.org, linux-trace-kernel@vger.kernel.org
Cc: aarcange@redhat.com, akpm@linux-foundation.org,
anshuman.khandual@arm.com, apopple@nvidia.com, baohua@kernel.org,
baolin.wang@linux.alibaba.com, byungchul@sk.com,
catalin.marinas@arm.com, cl@gentwo.org, corbet@lwn.net,
dave.hansen@linux.intel.com, david@kernel.org, dev.jain@arm.com,
gourry@gourry.net, hannes@cmpxchg.org, hughd@google.com,
jack@suse.cz, jackmanb@google.com, jannh@google.com,
jglisse@google.com, joshua.hahnjy@gmail.com, kas@kernel.org,
lance.yang@linux.dev, liam@infradead.org, ljs@kernel.org,
mathieu.desnoyers@efficios.com, matthew.brost@intel.com,
mhiramat@kernel.org, mhocko@suse.com, npache@redhat.com,
peterx@redhat.com, pfalcato@suse.de, rakie.kim@sk.com,
raquini@redhat.com, rdunlap@infradead.org,
richard.weiyang@gmail.com, rientjes@google.com,
rostedt@goodmis.org, rppt@kernel.org, ryan.roberts@arm.com,
shivankg@amd.com, sunnanyong@huawei.com, surenb@google.com,
thomas.hellstrom@linux.intel.com, tiwai@suse.de,
usamaarif642@gmail.com, vbabka@suse.cz, vishal.moola@gmail.com,
wangkefeng.wang@huawei.com, will@kernel.org, willy@infradead.org,
yang@os.amperecomputing.com, ying.huang@linux.alibaba.com,
ziy@nvidia.com, zokeefe@google.com,
Usama Arif <usama.arif@linux.dev>
Subject: [PATCH mm-unstable v17 03/14] mm/khugepaged: rework max_ptes_* handling with helper functions
Date: Mon, 11 May 2026 12:58:03 -0600
Message-ID: <20260511185817.686831-4-npache@redhat.com>
In-Reply-To: <20260511185817.686831-1-npache@redhat.com>
Rework all of the max_ptes_* handling into helper functions. This improves
code readability and will later be used to implement the mTHP handling of
these variables.
With these changes, all of the madvise_collapse() special casing (it does
not respect the sysfs tunables) is abstracted away from the functions that
use these limits, and the helpers will be used later in this series to
cleanly restrict the mTHP collapse behavior.
No functional change is intended; however, the sysfs variables are now read
only once per scan, whereas before they were read on each loop iteration.
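As a rough illustration of the resulting pattern (a hypothetical sketch, not
code from this patch; example_scan() is a made-up function), each scan
routine reads its limit once up front via the helper, and the per-PTE loop
only compares its counter against that precomputed value:

	/*
	 * Hypothetical sketch: the limit is computed once per scan, and
	 * MADV_COLLAPSE (!cc->is_khugepaged) effectively disables it
	 * because the helper returns HPAGE_PMD_NR in that case.
	 */
	static enum scan_result example_scan(struct collapse_control *cc,
					     struct vm_area_struct *vma,
					     pte_t *pte)
	{
		const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
		unsigned int none_or_zero = 0;
		pte_t *_pte;

		for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++) {
			if (pte_none_or_zero(ptep_get(_pte)) &&
			    ++none_or_zero > max_ptes_none)
				return SCAN_EXCEED_NONE_PTE;
		}
		return SCAN_SUCCEED;
	}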
Suggested-by: David Hildenbrand <david@kernel.org>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Acked-by: Usama Arif <usama.arif@linux.dev>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 118 +++++++++++++++++++++++++++++++++---------------
1 file changed, 82 insertions(+), 36 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f0e29d5c7b1f..f68853b3caa7 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -348,6 +348,62 @@ static bool pte_none_or_zero(pte_t pte)
return pte_present(pte) && is_zero_pfn(pte_pfn(pte));
}
+/**
+ * collapse_max_ptes_none - Calculate maximum allowed none-page or zero-page
+ * PTEs for the given collapse operation.
+ * @cc: The collapse control struct
+ * @vma: The vma to check for userfaultfd
+ *
+ * Return: Maximum number of none-page or zero-page PTEs allowed for the
+ * collapse operation.
+ */
+static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
+ struct vm_area_struct *vma)
+{
+	/* If the vma is userfaultfd-armed, allow no none-page or zero-page PTEs. */
+ if (vma && userfaultfd_armed(vma))
+ return 0;
+	/* For MADV_COLLAPSE, allow any none-page or zero-page PTEs. */
+ if (!cc->is_khugepaged)
+ return HPAGE_PMD_NR;
+	/* For all other cases, respect the user-defined maximum. */
+ return khugepaged_max_ptes_none;
+}
+
+/**
+ * collapse_max_ptes_shared - Calculate maximum allowed PTEs that map shared
+ * anonymous pages for the given collapse operation.
+ * @cc: The collapse control struct
+ *
+ * Return: Maximum number of PTEs that map shared anonymous pages for the
+ * collapse operation.
+ */
+static unsigned int collapse_max_ptes_shared(struct collapse_control *cc)
+{
+	/*
+	 * For MADV_COLLAPSE, do not restrict the number of PTEs that map
+	 * shared anonymous pages.
+	 */
+ if (!cc->is_khugepaged)
+ return HPAGE_PMD_NR;
+ return khugepaged_max_ptes_shared;
+}
+
+/**
+ * collapse_max_ptes_swap - Calculate the maximum allowed non-present PTEs
+ * or non-present pagecache entries for the given collapse operation.
+ * @cc: The collapse control struct
+ *
+ * Return: Maximum number of non-present PTEs or non-present pagecache
+ * entries allowed for the collapse operation.
+ */
+static unsigned int collapse_max_ptes_swap(struct collapse_control *cc)
+{
+	/*
+	 * For MADV_COLLAPSE, do not restrict the number of PTE entries or
+	 * pagecache entries that are non-present.
+	 */
+ if (!cc->is_khugepaged)
+ return HPAGE_PMD_NR;
+ return khugepaged_max_ptes_swap;
+}
+
int hugepage_madvise(struct vm_area_struct *vma,
vm_flags_t *vm_flags, int advice)
{
@@ -546,21 +602,19 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
pte_t *_pte;
int none_or_zero = 0, shared = 0, referenced = 0;
enum scan_result result = SCAN_FAIL;
+ unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
+ unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
if (pte_none_or_zero(pteval)) {
- ++none_or_zero;
- if (!userfaultfd_armed(vma) &&
- (!cc->is_khugepaged ||
- none_or_zero <= khugepaged_max_ptes_none)) {
- continue;
- } else {
+ if (++none_or_zero > max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
goto out;
}
+ continue;
}
if (!pte_present(pteval)) {
result = SCAN_PTE_NON_PRESENT;
@@ -591,9 +645,7 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
/* See collapse_scan_pmd(). */
if (folio_maybe_mapped_shared(folio)) {
- ++shared;
- if (cc->is_khugepaged &&
- shared > khugepaged_max_ptes_shared) {
+ if (++shared > max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
goto out;
@@ -1261,6 +1313,9 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long start_addr,
bool *lock_dropped, struct collapse_control *cc)
{
+ const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
+ const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
+ const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
pmd_t *pmd;
pte_t *pte, *_pte;
int none_or_zero = 0, shared = 0, referenced = 0;
@@ -1294,36 +1349,29 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
pte_t pteval = ptep_get(_pte);
if (pte_none_or_zero(pteval)) {
- ++none_or_zero;
- if (!userfaultfd_armed(vma) &&
- (!cc->is_khugepaged ||
- none_or_zero <= khugepaged_max_ptes_none)) {
- continue;
- } else {
+ if (++none_or_zero > max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
goto out_unmap;
}
+ continue;
}
if (!pte_present(pteval)) {
- ++unmapped;
- if (!cc->is_khugepaged ||
- unmapped <= khugepaged_max_ptes_swap) {
- /*
- * Always be strict with uffd-wp
- * enabled swap entries. Please see
- * comment below for pte_uffd_wp().
- */
- if (pte_swp_uffd_wp_any(pteval)) {
- result = SCAN_PTE_UFFD_WP;
- goto out_unmap;
- }
- continue;
- } else {
+ if (++unmapped > max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
goto out_unmap;
}
+ /*
+ * Always be strict with uffd-wp
+ * enabled swap entries. Please see
+ * comment below for pte_uffd_wp().
+ */
+ if (pte_swp_uffd_wp_any(pteval)) {
+ result = SCAN_PTE_UFFD_WP;
+ goto out_unmap;
+ }
+ continue;
}
if (pte_uffd_wp(pteval)) {
/*
@@ -1366,9 +1414,7 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
* is shared.
*/
if (folio_maybe_mapped_shared(folio)) {
- ++shared;
- if (cc->is_khugepaged &&
- shared > khugepaged_max_ptes_shared) {
+ if (++shared > max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
goto out_unmap;
@@ -2323,6 +2369,8 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
unsigned long addr, struct file *file, pgoff_t start,
struct collapse_control *cc)
{
+ const unsigned int max_ptes_none = collapse_max_ptes_none(cc, NULL);
+ const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
struct folio *folio = NULL;
struct address_space *mapping = file->f_mapping;
XA_STATE(xas, &mapping->i_pages, start);
@@ -2341,8 +2389,7 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
if (xa_is_value(folio)) {
swap += 1 << xas_get_order(&xas);
- if (cc->is_khugepaged &&
- swap > khugepaged_max_ptes_swap) {
+ if (swap > max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
break;
@@ -2413,8 +2460,7 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
cc->progress += HPAGE_PMD_NR;
if (result == SCAN_SUCCEED) {
- if (cc->is_khugepaged &&
- present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+ if (present < HPAGE_PMD_NR - max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
} else {
--
2.54.0
Thread overview: 21+ messages
2026-05-11 18:58 [PATCH mm-unstable v17 00/14] khugepaged: mTHP support Nico Pache
2026-05-11 18:58 ` [PATCH mm-unstable v17 01/14] mm/khugepaged: generalize hugepage_vma_revalidate for " Nico Pache
2026-05-11 18:58 ` [PATCH mm-unstable v17 02/14] mm/khugepaged: generalize alloc_charge_folio() Nico Pache
2026-05-11 18:58 ` Nico Pache [this message]
2026-05-12 4:44 ` [PATCH mm-unstable v17 03/14] mm/khugepaged: rework max_ptes_* handling with helper functions Lance Yang
2026-05-12 7:29 ` David Hildenbrand (Arm)
2026-05-11 18:58 ` [PATCH mm-unstable v17 04/14] mm/khugepaged: generalize __collapse_huge_page_* for mTHP support Nico Pache
2026-05-12 7:42 ` Lance Yang
2026-05-11 18:58 ` [PATCH mm-unstable v17 05/14] mm/khugepaged: require collapse_huge_page to enter/exit with the lock dropped Nico Pache
2026-05-12 7:42 ` David Hildenbrand (Arm)
2026-05-11 18:58 ` [PATCH mm-unstable v17 06/14] mm/khugepaged: generalize collapse_huge_page for mTHP collapse Nico Pache
2026-05-11 18:58 ` [PATCH mm-unstable v17 07/14] mm/khugepaged: skip collapsing mTHP to smaller orders Nico Pache
2026-05-11 18:58 ` [PATCH mm-unstable v17 08/14] mm/khugepaged: add per-order mTHP collapse failure statistics Nico Pache
2026-05-11 18:58 ` [PATCH mm-unstable v17 09/14] mm/khugepaged: improve tracepoints for mTHP orders Nico Pache
2026-05-11 18:58 ` [PATCH mm-unstable v17 10/14] mm/khugepaged: introduce collapse_allowable_orders helper function Nico Pache
2026-05-11 18:58 ` [PATCH mm-unstable v17 11/14] mm/khugepaged: Introduce mTHP collapse support Nico Pache
2026-05-12 15:44 ` Wei Yang
2026-05-11 18:58 ` [PATCH mm-unstable v17 12/14] mm/khugepaged: avoid unnecessary mTHP collapse attempts Nico Pache
2026-05-11 18:58 ` [PATCH mm-unstable v17 13/14] mm/khugepaged: run khugepaged for all orders Nico Pache
2026-05-11 18:58 ` [PATCH mm-unstable v17 14/14] Documentation: mm: update the admin guide for mTHP collapse Nico Pache
2026-05-11 21:04 ` [PATCH mm-unstable v17 00/14] khugepaged: mTHP support Andrew Morton