* [PATCH v12 mm-new 01/15] khugepaged: rename hpage_collapse_* to collapse_*
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-11-08 1:42 ` Wei Yang
2025-10-22 18:37 ` [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse Nico Pache
` (14 subsequent siblings)
15 siblings, 1 reply; 77+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato
The hpage_collapse_* functions are used by both madvise_collapse and
khugepaged. Remove the unnecessary hpage prefix to shorten the
function names.
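For quick reference, the full set of renames in this patch, taken from
the diff below:

    hpage_collapse_test_exit()            -> collapse_test_exit()
    hpage_collapse_test_exit_or_disable() -> collapse_test_exit_or_disable()
    hpage_collapse_scan_abort()           -> collapse_scan_abort()
    hpage_collapse_find_target_node()     -> collapse_find_target_node()
    hpage_collapse_scan_pmd()             -> collapse_scan_pmd()
    hpage_collapse_scan_file()            -> collapse_scan_file()
    khugepaged_scan_mm_slot()             -> collapse_scan_mm_slot()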
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 73 ++++++++++++++++++++++++-------------------------
mm/mremap.c | 2 +-
2 files changed, 37 insertions(+), 38 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 5b7276bc14b1..6c4abc7f45cf 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -395,14 +395,14 @@ void __init khugepaged_destroy(void)
kmem_cache_destroy(mm_slot_cache);
}
-static inline int hpage_collapse_test_exit(struct mm_struct *mm)
+static inline int collapse_test_exit(struct mm_struct *mm)
{
return atomic_read(&mm->mm_users) == 0;
}
-static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
+static inline int collapse_test_exit_or_disable(struct mm_struct *mm)
{
- return hpage_collapse_test_exit(mm) ||
+ return collapse_test_exit(mm) ||
mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
}
@@ -436,7 +436,7 @@ void __khugepaged_enter(struct mm_struct *mm)
int wakeup;
/* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
+ VM_BUG_ON_MM(collapse_test_exit(mm), mm);
if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm)))
return;
@@ -490,7 +490,7 @@ void __khugepaged_exit(struct mm_struct *mm)
} else if (slot) {
/*
* This is required to serialize against
- * hpage_collapse_test_exit() (which is guaranteed to run
+ * collapse_test_exit() (which is guaranteed to run
* under mmap sem read mode). Stop here (after we return all
* pagetables will be destroyed) until khugepaged has finished
* working on the pagetables under the mmap_lock.
@@ -580,7 +580,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
folio = page_folio(page);
VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
- /* See hpage_collapse_scan_pmd(). */
+ /* See collapse_scan_pmd(). */
if (folio_maybe_mapped_shared(folio)) {
++shared;
if (cc->is_khugepaged &&
@@ -831,7 +831,7 @@ struct collapse_control khugepaged_collapse_control = {
.is_khugepaged = true,
};
-static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
+static bool collapse_scan_abort(int nid, struct collapse_control *cc)
{
int i;
@@ -866,7 +866,7 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
}
#ifdef CONFIG_NUMA
-static int hpage_collapse_find_target_node(struct collapse_control *cc)
+static int collapse_find_target_node(struct collapse_control *cc)
{
int nid, target_node = 0, max_value = 0;
@@ -885,7 +885,7 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc)
return target_node;
}
#else
-static int hpage_collapse_find_target_node(struct collapse_control *cc)
+static int collapse_find_target_node(struct collapse_control *cc)
{
return 0;
}
@@ -906,7 +906,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
TVA_FORCED_COLLAPSE;
- if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
+ if (unlikely(collapse_test_exit_or_disable(mm)))
return SCAN_ANY_PROCESS;
*vmap = vma = find_vma(mm, address);
@@ -979,7 +979,7 @@ static int check_pmd_still_valid(struct mm_struct *mm,
/*
* Bring missing pages in from swap, to complete THP collapse.
- * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
+ * Only done if collapse_scan_pmd believes it is worthwhile.
*
* Called and returns without pte mapped or spinlocks held.
* Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
@@ -1065,7 +1065,7 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
{
gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
GFP_TRANSHUGE);
- int node = hpage_collapse_find_target_node(cc);
+ int node = collapse_find_target_node(cc);
struct folio *folio;
folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
@@ -1244,10 +1244,10 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
return result;
}
-static int hpage_collapse_scan_pmd(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long start_addr, bool *mmap_locked,
- struct collapse_control *cc)
+static int collapse_scan_pmd(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long start_addr, bool *mmap_locked,
+ struct collapse_control *cc)
{
pmd_t *pmd;
pte_t *pte, *_pte;
@@ -1355,7 +1355,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* hit record.
*/
node = folio_nid(folio);
- if (hpage_collapse_scan_abort(node, cc)) {
+ if (collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
goto out_unmap;
}
@@ -1421,7 +1421,7 @@ static void collect_mm_slot(struct mm_slot *slot)
lockdep_assert_held(&khugepaged_mm_lock);
- if (hpage_collapse_test_exit(mm)) {
+ if (collapse_test_exit(mm)) {
/* free mm_slot */
hash_del(&slot->hash);
list_del(&slot->mm_node);
@@ -1741,7 +1741,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
continue;
- if (hpage_collapse_test_exit(mm))
+ if (collapse_test_exit(mm))
continue;
/*
* When a vma is registered with uffd-wp, we cannot recycle
@@ -2263,9 +2263,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
return result;
}
-static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
- struct file *file, pgoff_t start,
- struct collapse_control *cc)
+static int collapse_scan_file(struct mm_struct *mm, unsigned long addr,
+ struct file *file, pgoff_t start,
+ struct collapse_control *cc)
{
struct folio *folio = NULL;
struct address_space *mapping = file->f_mapping;
@@ -2320,7 +2320,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
}
node = folio_nid(folio);
- if (hpage_collapse_scan_abort(node, cc)) {
+ if (collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
folio_put(folio);
break;
@@ -2370,7 +2370,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
return result;
}
-static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
+static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
struct collapse_control *cc)
__releases(&khugepaged_mm_lock)
__acquires(&khugepaged_mm_lock)
@@ -2405,7 +2405,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
goto breakouterloop_mmap_lock;
progress++;
- if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
+ if (unlikely(collapse_test_exit_or_disable(mm)))
goto breakouterloop;
vma_iter_init(&vmi, mm, khugepaged_scan.address);
@@ -2413,7 +2413,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
unsigned long hstart, hend;
cond_resched();
- if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
+ if (unlikely(collapse_test_exit_or_disable(mm))) {
progress++;
break;
}
@@ -2434,7 +2434,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
bool mmap_locked = true;
cond_resched();
- if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
+ if (unlikely(collapse_test_exit_or_disable(mm)))
goto breakouterloop;
VM_BUG_ON(khugepaged_scan.address < hstart ||
@@ -2447,12 +2447,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
mmap_read_unlock(mm);
mmap_locked = false;
- *result = hpage_collapse_scan_file(mm,
+ *result = collapse_scan_file(mm,
khugepaged_scan.address, file, pgoff, cc);
fput(file);
if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
mmap_read_lock(mm);
- if (hpage_collapse_test_exit_or_disable(mm))
+ if (collapse_test_exit_or_disable(mm))
goto breakouterloop;
*result = collapse_pte_mapped_thp(mm,
khugepaged_scan.address, false);
@@ -2461,7 +2461,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
mmap_read_unlock(mm);
}
} else {
- *result = hpage_collapse_scan_pmd(mm, vma,
+ *result = collapse_scan_pmd(mm, vma,
khugepaged_scan.address, &mmap_locked, cc);
}
@@ -2494,7 +2494,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
* Release the current mm_slot if this mm is about to die, or
* if we scanned all vmas of this mm.
*/
- if (hpage_collapse_test_exit(mm) || !vma) {
+ if (collapse_test_exit(mm) || !vma) {
/*
* Make sure that if mm_users is reaching zero while
* khugepaged runs here, khugepaged_exit will find
@@ -2545,8 +2545,8 @@ static void khugepaged_do_scan(struct collapse_control *cc)
pass_through_head++;
if (khugepaged_has_work() &&
pass_through_head < 2)
- progress += khugepaged_scan_mm_slot(pages - progress,
- &result, cc);
+ progress += collapse_scan_mm_slot(pages - progress,
+ &result, cc);
else
progress = pages;
spin_unlock(&khugepaged_mm_lock);
@@ -2787,12 +2787,11 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
mmap_read_unlock(mm);
mmap_locked = false;
- result = hpage_collapse_scan_file(mm, addr, file, pgoff,
- cc);
+ result = collapse_scan_file(mm, addr, file, pgoff, cc);
fput(file);
} else {
- result = hpage_collapse_scan_pmd(mm, vma, addr,
- &mmap_locked, cc);
+ result = collapse_scan_pmd(mm, vma, addr,
+ &mmap_locked, cc);
}
if (!mmap_locked)
*lock_dropped = true;
diff --git a/mm/mremap.c b/mm/mremap.c
index bd7314898ec5..e2a1793b43ce 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -240,7 +240,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
goto out;
}
/*
- * Now new_pte is none, so hpage_collapse_scan_file() path can not find
+ * Now new_pte is none, so collapse_scan_file() path can not find
* this by traversing file->f_mapping, so there is no concurrency with
* retract_page_tables(). In addition, we already hold the exclusive
* mmap_lock, so this new_pte page is stable, so there is no need to get
--
2.51.0
* Re: [PATCH v12 mm-new 01/15] khugepaged: rename hpage_collapse_* to collapse_*
2025-10-22 18:37 ` [PATCH v12 mm-new 01/15] khugepaged: rename hpage_collapse_* to collapse_* Nico Pache
@ 2025-11-08 1:42 ` Wei Yang
0 siblings, 0 replies; 77+ messages in thread
From: Wei Yang @ 2025-11-08 1:42 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:03PM -0600, Nico Pache wrote:
>The hpage_collapse_* functions are used by both madvise_collapse and
>khugepaged. Remove the unnecessary hpage prefix to shorten the
>function names.
>
>Reviewed-by: Lance Yang <lance.yang@linux.dev>
>Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
>Reviewed-by: Zi Yan <ziy@nvidia.com>
>Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>Acked-by: David Hildenbrand <david@redhat.com>
>Signed-off-by: Nico Pache <npache@redhat.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
--
Wei Yang
Help you, Help me
* [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
2025-10-22 18:37 ` [PATCH v12 mm-new 01/15] khugepaged: rename hpage_collapse_* to collapse_* Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-27 9:00 ` Lance Yang
` (2 more replies)
2025-10-22 18:37 ` [PATCH v12 mm-new 03/15] khugepaged: generalize hugepage_vma_revalidate for mTHP support Nico Pache
` (13 subsequent siblings)
15 siblings, 3 replies; 77+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato
The khugepaged daemon and madvise_collapse have two different
implementations that do almost the same thing.
Create collapse_single_pmd to increase code reuse and create an entry
point for these two users.
Refactor madvise_collapse and collapse_scan_mm_slot to use the new
collapse_single_pmd function. This introduces a minor behavioral change
that most likely fixes an undiscovered bug: the current implementation of
khugepaged tests collapse_test_exit_or_disable before calling
collapse_pte_mapped_thp, but we weren't doing so in the madvise_collapse
case. By unifying these two callers, madvise_collapse now also performs
this check. We also modify the return value to be SCAN_ANY_PROCESS, which
properly indicates that this process is no longer valid to operate on.
We also guard the khugepaged_pages_collapsed variable to ensure it's only
incremented for khugepaged.
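For readers skimming the diff below: the helper's contract is that
*mmap_locked must reflect whether mmap_lock is held on entry, and it is
updated on every exit path. A minimal caller sketch (names are from this
series; this is illustrative, not a hunk from the patch):

    bool mmap_locked = true;	/* entered with mmap_read_lock(mm) held */
    int result;

    result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
    /*
     * collapse_single_pmd() may have dropped and re-taken the lock;
     * *mmap_locked tells the caller whether it is still held, which
     * madvise_collapse() uses to set *lock_dropped.
     */
    if (!mmap_locked)
        *lock_dropped = true;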
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 97 ++++++++++++++++++++++++++-----------------------
1 file changed, 52 insertions(+), 45 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6c4abc7f45cf..36e31d99e507 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2370,6 +2370,53 @@ static int collapse_scan_file(struct mm_struct *mm, unsigned long addr,
return result;
}
+/*
+ * Try to collapse a single PMD starting at a PMD aligned addr, and return
+ * the results.
+ */
+static int collapse_single_pmd(unsigned long addr,
+ struct vm_area_struct *vma, bool *mmap_locked,
+ struct collapse_control *cc)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int result;
+ struct file *file;
+ pgoff_t pgoff;
+
+ if (vma_is_anonymous(vma)) {
+ result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cc);
+ goto end;
+ }
+
+ file = get_file(vma->vm_file);
+ pgoff = linear_page_index(vma, addr);
+
+ mmap_read_unlock(mm);
+ *mmap_locked = false;
+ result = collapse_scan_file(mm, addr, file, pgoff, cc);
+ fput(file);
+ if (result != SCAN_PTE_MAPPED_HUGEPAGE)
+ goto end;
+
+ mmap_read_lock(mm);
+ *mmap_locked = true;
+ if (collapse_test_exit_or_disable(mm)) {
+ mmap_read_unlock(mm);
+ *mmap_locked = false;
+ return SCAN_ANY_PROCESS;
+ }
+ result = collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
+ if (result == SCAN_PMD_MAPPED)
+ result = SCAN_SUCCEED;
+ mmap_read_unlock(mm);
+ *mmap_locked = false;
+
+end:
+ if (cc->is_khugepaged && result == SCAN_SUCCEED)
+ ++khugepaged_pages_collapsed;
+ return result;
+}
+
static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
struct collapse_control *cc)
__releases(&khugepaged_mm_lock)
@@ -2440,34 +2487,9 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (!vma_is_anonymous(vma)) {
- struct file *file = get_file(vma->vm_file);
- pgoff_t pgoff = linear_page_index(vma,
- khugepaged_scan.address);
-
- mmap_read_unlock(mm);
- mmap_locked = false;
- *result = collapse_scan_file(mm,
- khugepaged_scan.address, file, pgoff, cc);
- fput(file);
- if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
- mmap_read_lock(mm);
- if (collapse_test_exit_or_disable(mm))
- goto breakouterloop;
- *result = collapse_pte_mapped_thp(mm,
- khugepaged_scan.address, false);
- if (*result == SCAN_PMD_MAPPED)
- *result = SCAN_SUCCEED;
- mmap_read_unlock(mm);
- }
- } else {
- *result = collapse_scan_pmd(mm, vma,
- khugepaged_scan.address, &mmap_locked, cc);
- }
-
- if (*result == SCAN_SUCCEED)
- ++khugepaged_pages_collapsed;
+ *result = collapse_single_pmd(khugepaged_scan.address,
+ vma, &mmap_locked, cc);
/* move to next address */
khugepaged_scan.address += HPAGE_PMD_SIZE;
progress += HPAGE_PMD_NR;
@@ -2781,34 +2803,19 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
- if (!vma_is_anonymous(vma)) {
- struct file *file = get_file(vma->vm_file);
- pgoff_t pgoff = linear_page_index(vma, addr);
- mmap_read_unlock(mm);
- mmap_locked = false;
- result = collapse_scan_file(mm, addr, file, pgoff, cc);
- fput(file);
- } else {
- result = collapse_scan_pmd(mm, vma, addr,
- &mmap_locked, cc);
- }
+ result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
+
if (!mmap_locked)
*lock_dropped = true;
-handle_result:
switch (result) {
case SCAN_SUCCEED:
case SCAN_PMD_MAPPED:
++thps;
break;
- case SCAN_PTE_MAPPED_HUGEPAGE:
- BUG_ON(mmap_locked);
- mmap_read_lock(mm);
- result = collapse_pte_mapped_thp(mm, addr, true);
- mmap_read_unlock(mm);
- goto handle_result;
/* Whitelisted set of results where continuing OK */
+ case SCAN_PTE_MAPPED_HUGEPAGE:
case SCAN_PMD_NULL:
case SCAN_PTE_NON_PRESENT:
case SCAN_PTE_UFFD_WP:
--
2.51.0
* Re: [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse
2025-10-22 18:37 ` [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse Nico Pache
@ 2025-10-27 9:00 ` Lance Yang
2025-10-27 15:44 ` Lorenzo Stoakes
2025-11-08 1:44 ` Wei Yang
2 siblings, 0 replies; 77+ messages in thread
From: Lance Yang @ 2025-10-27 9:00 UTC (permalink / raw)
To: Nico Pache
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, linux-mm, linux-trace-kernel,
willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
linux-kernel, vishal.moola, thomas.hellstrom, linux-doc, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, vbabka, rppt,
jannh, pfalcato
On 2025/10/23 02:37, Nico Pache wrote:
> The khugepaged daemon and madvise_collapse have two different
> implementations that do almost the same thing.
>
> Create collapse_single_pmd to increase code reuse and create an entry
> point for these two users.
>
> Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> collapse_single_pmd function. This introduces a minor behavioral change
> that most likely fixes an undiscovered bug: the current implementation of
> khugepaged tests collapse_test_exit_or_disable before calling
> collapse_pte_mapped_thp, but we weren't doing so in the madvise_collapse
> case. By unifying these two callers, madvise_collapse now also performs
> this check. We also modify the return value to be SCAN_ANY_PROCESS, which
> properly indicates that this process is no longer valid to operate on.
>
> We also guard the khugepaged_pages_collapsed variable to ensure it's only
> incremented for khugepaged.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Acked-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
Nice cleanup! LGTM.
Reviewed-by: Lance Yang <lance.yang@linux.dev>
> mm/khugepaged.c | 97 ++++++++++++++++++++++++++-----------------------
> 1 file changed, 52 insertions(+), 45 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 6c4abc7f45cf..36e31d99e507 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -2370,6 +2370,53 @@ static int collapse_scan_file(struct mm_struct *mm, unsigned long addr,
> return result;
> }
>
> +/*
> + * Try to collapse a single PMD starting at a PMD aligned addr, and return
> + * the results.
> + */
> +static int collapse_single_pmd(unsigned long addr,
> + struct vm_area_struct *vma, bool *mmap_locked,
> + struct collapse_control *cc)
> +{
> + struct mm_struct *mm = vma->vm_mm;
> + int result;
> + struct file *file;
> + pgoff_t pgoff;
> +
> + if (vma_is_anonymous(vma)) {
> + result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cc);
> + goto end;
> + }
> +
> + file = get_file(vma->vm_file);
> + pgoff = linear_page_index(vma, addr);
> +
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
> + result = collapse_scan_file(mm, addr, file, pgoff, cc);
> + fput(file);
> + if (result != SCAN_PTE_MAPPED_HUGEPAGE)
> + goto end;
> +
> + mmap_read_lock(mm);
> + *mmap_locked = true;
> + if (collapse_test_exit_or_disable(mm)) {
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
> + return SCAN_ANY_PROCESS;
> + }
> + result = collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
> + if (result == SCAN_PMD_MAPPED)
> + result = SCAN_SUCCEED;
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
> +
> +end:
> + if (cc->is_khugepaged && result == SCAN_SUCCEED)
> + ++khugepaged_pages_collapsed;
> + return result;
> +}
> +
> static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
> struct collapse_control *cc)
> __releases(&khugepaged_mm_lock)
> @@ -2440,34 +2487,9 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
> VM_BUG_ON(khugepaged_scan.address < hstart ||
> khugepaged_scan.address + HPAGE_PMD_SIZE >
> hend);
> - if (!vma_is_anonymous(vma)) {
> - struct file *file = get_file(vma->vm_file);
> - pgoff_t pgoff = linear_page_index(vma,
> - khugepaged_scan.address);
> -
> - mmap_read_unlock(mm);
> - mmap_locked = false;
> - *result = collapse_scan_file(mm,
> - khugepaged_scan.address, file, pgoff, cc);
> - fput(file);
> - if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
> - mmap_read_lock(mm);
> - if (collapse_test_exit_or_disable(mm))
> - goto breakouterloop;
> - *result = collapse_pte_mapped_thp(mm,
> - khugepaged_scan.address, false);
> - if (*result == SCAN_PMD_MAPPED)
> - *result = SCAN_SUCCEED;
> - mmap_read_unlock(mm);
> - }
> - } else {
> - *result = collapse_scan_pmd(mm, vma,
> - khugepaged_scan.address, &mmap_locked, cc);
> - }
> -
> - if (*result == SCAN_SUCCEED)
> - ++khugepaged_pages_collapsed;
>
> + *result = collapse_single_pmd(khugepaged_scan.address,
> + vma, &mmap_locked, cc);
> /* move to next address */
> khugepaged_scan.address += HPAGE_PMD_SIZE;
> progress += HPAGE_PMD_NR;
> @@ -2781,34 +2803,19 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> mmap_assert_locked(mm);
> memset(cc->node_load, 0, sizeof(cc->node_load));
> nodes_clear(cc->alloc_nmask);
> - if (!vma_is_anonymous(vma)) {
> - struct file *file = get_file(vma->vm_file);
> - pgoff_t pgoff = linear_page_index(vma, addr);
>
> - mmap_read_unlock(mm);
> - mmap_locked = false;
> - result = collapse_scan_file(mm, addr, file, pgoff, cc);
> - fput(file);
> - } else {
> - result = collapse_scan_pmd(mm, vma, addr,
> - &mmap_locked, cc);
> - }
> + result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
> +
> if (!mmap_locked)
> *lock_dropped = true;
>
> -handle_result:
> switch (result) {
> case SCAN_SUCCEED:
> case SCAN_PMD_MAPPED:
> ++thps;
> break;
> - case SCAN_PTE_MAPPED_HUGEPAGE:
> - BUG_ON(mmap_locked);
> - mmap_read_lock(mm);
> - result = collapse_pte_mapped_thp(mm, addr, true);
> - mmap_read_unlock(mm);
> - goto handle_result;
> /* Whitelisted set of results where continuing OK */
> + case SCAN_PTE_MAPPED_HUGEPAGE:
> case SCAN_PMD_NULL:
> case SCAN_PTE_NON_PRESENT:
> case SCAN_PTE_UFFD_WP:
* Re: [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse
2025-10-22 18:37 ` [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse Nico Pache
2025-10-27 9:00 ` Lance Yang
@ 2025-10-27 15:44 ` Lorenzo Stoakes
2025-11-08 1:44 ` Wei Yang
2 siblings, 0 replies; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-27 15:44 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Wed, Oct 22, 2025 at 12:37:04PM -0600, Nico Pache wrote:
> The khugepaged daemon and madvise_collapse have two different
> implementations that do almost the same thing.
>
> Create collapse_single_pmd to increase code reuse and create an entry
> point for these two users.
>
> Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> collapse_single_pmd function. This introduces a minor behavioral change
> that most likely fixes an undiscovered bug: the current implementation of
> khugepaged tests collapse_test_exit_or_disable before calling
> collapse_pte_mapped_thp, but we weren't doing so in the madvise_collapse
> case. By unifying these two callers, madvise_collapse now also performs
> this check. We also modify the return value to be SCAN_ANY_PROCESS, which
> properly indicates that this process is no longer valid to operate on.
>
> We also guard the khugepaged_pages_collapsed variable to ensure it's only
> incremented for khugepaged.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Acked-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
Thanks, this LGTM so:
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/khugepaged.c | 97 ++++++++++++++++++++++++++-----------------------
> 1 file changed, 52 insertions(+), 45 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 6c4abc7f45cf..36e31d99e507 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -2370,6 +2370,53 @@ static int collapse_scan_file(struct mm_struct *mm, unsigned long addr,
> return result;
> }
>
> +/*
> + * Try to collapse a single PMD starting at a PMD aligned addr, and return
> + * the results.
> + */
> +static int collapse_single_pmd(unsigned long addr,
> + struct vm_area_struct *vma, bool *mmap_locked,
> + struct collapse_control *cc)
> +{
> + struct mm_struct *mm = vma->vm_mm;
> + int result;
> + struct file *file;
> + pgoff_t pgoff;
> +
> + if (vma_is_anonymous(vma)) {
> + result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cc);
> + goto end;
> + }
> +
> + file = get_file(vma->vm_file);
> + pgoff = linear_page_index(vma, addr);
> +
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
> + result = collapse_scan_file(mm, addr, file, pgoff, cc);
> + fput(file);
> + if (result != SCAN_PTE_MAPPED_HUGEPAGE)
> + goto end;
> +
> + mmap_read_lock(mm);
> + *mmap_locked = true;
> + if (collapse_test_exit_or_disable(mm)) {
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
> + return SCAN_ANY_PROCESS;
> + }
> + result = collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
> + if (result == SCAN_PMD_MAPPED)
> + result = SCAN_SUCCEED;
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
> +
> +end:
> + if (cc->is_khugepaged && result == SCAN_SUCCEED)
> + ++khugepaged_pages_collapsed;
> + return result;
> +}
> +
> static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
> struct collapse_control *cc)
> __releases(&khugepaged_mm_lock)
> @@ -2440,34 +2487,9 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
> VM_BUG_ON(khugepaged_scan.address < hstart ||
> khugepaged_scan.address + HPAGE_PMD_SIZE >
> hend);
> - if (!vma_is_anonymous(vma)) {
> - struct file *file = get_file(vma->vm_file);
> - pgoff_t pgoff = linear_page_index(vma,
> - khugepaged_scan.address);
> -
> - mmap_read_unlock(mm);
> - mmap_locked = false;
> - *result = collapse_scan_file(mm,
> - khugepaged_scan.address, file, pgoff, cc);
> - fput(file);
> - if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
> - mmap_read_lock(mm);
> - if (collapse_test_exit_or_disable(mm))
> - goto breakouterloop;
> - *result = collapse_pte_mapped_thp(mm,
> - khugepaged_scan.address, false);
> - if (*result == SCAN_PMD_MAPPED)
> - *result = SCAN_SUCCEED;
> - mmap_read_unlock(mm);
> - }
> - } else {
> - *result = collapse_scan_pmd(mm, vma,
> - khugepaged_scan.address, &mmap_locked, cc);
> - }
> -
> - if (*result == SCAN_SUCCEED)
> - ++khugepaged_pages_collapsed;
>
> + *result = collapse_single_pmd(khugepaged_scan.address,
> + vma, &mmap_locked, cc);
> /* move to next address */
> khugepaged_scan.address += HPAGE_PMD_SIZE;
> progress += HPAGE_PMD_NR;
> @@ -2781,34 +2803,19 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> mmap_assert_locked(mm);
> memset(cc->node_load, 0, sizeof(cc->node_load));
> nodes_clear(cc->alloc_nmask);
> - if (!vma_is_anonymous(vma)) {
> - struct file *file = get_file(vma->vm_file);
> - pgoff_t pgoff = linear_page_index(vma, addr);
>
> - mmap_read_unlock(mm);
> - mmap_locked = false;
> - result = collapse_scan_file(mm, addr, file, pgoff, cc);
> - fput(file);
> - } else {
> - result = collapse_scan_pmd(mm, vma, addr,
> - &mmap_locked, cc);
> - }
> + result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
> +
> if (!mmap_locked)
> *lock_dropped = true;
>
> -handle_result:
> switch (result) {
> case SCAN_SUCCEED:
> case SCAN_PMD_MAPPED:
> ++thps;
> break;
> - case SCAN_PTE_MAPPED_HUGEPAGE:
> - BUG_ON(mmap_locked);
> - mmap_read_lock(mm);
> - result = collapse_pte_mapped_thp(mm, addr, true);
> - mmap_read_unlock(mm);
> - goto handle_result;
> /* Whitelisted set of results where continuing OK */
> + case SCAN_PTE_MAPPED_HUGEPAGE:
> case SCAN_PMD_NULL:
> case SCAN_PTE_NON_PRESENT:
> case SCAN_PTE_UFFD_WP:
> --
> 2.51.0
>
* Re: [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse
2025-10-22 18:37 ` [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse Nico Pache
2025-10-27 9:00 ` Lance Yang
2025-10-27 15:44 ` Lorenzo Stoakes
@ 2025-11-08 1:44 ` Wei Yang
2 siblings, 0 replies; 77+ messages in thread
From: Wei Yang @ 2025-11-08 1:44 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:04PM -0600, Nico Pache wrote:
>The khugepaged daemon and madvise_collapse have two different
>implementations that do almost the same thing.
>
>Create collapse_single_pmd to increase code reuse and create an entry
>point for these two users.
>
>Refactor madvise_collapse and collapse_scan_mm_slot to use the new
>collapse_single_pmd function. This introduces a minor behavioral change
>that most likely fixes an undiscovered bug: the current implementation of
>khugepaged tests collapse_test_exit_or_disable before calling
>collapse_pte_mapped_thp, but we weren't doing so in the madvise_collapse
>case. By unifying these two callers, madvise_collapse now also performs
>this check. We also modify the return value to be SCAN_ANY_PROCESS, which
>properly indicates that this process is no longer valid to operate on.
>
>We also guard the khugepaged_pages_collapsed variable to ensure it's only
>incremented for khugepaged.
>
>Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>Acked-by: David Hildenbrand <david@redhat.com>
>Signed-off-by: Nico Pache <npache@redhat.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
One nit below.
>---
> mm/khugepaged.c | 97 ++++++++++++++++++++++++++-----------------------
> 1 file changed, 52 insertions(+), 45 deletions(-)
>
>diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>index 6c4abc7f45cf..36e31d99e507 100644
>--- a/mm/khugepaged.c
>+++ b/mm/khugepaged.c
>@@ -2370,6 +2370,53 @@ static int collapse_scan_file(struct mm_struct *mm, unsigned long addr,
> return result;
> }
>
>+/*
>+ * Try to collapse a single PMD starting at a PMD aligned addr, and return
>+ * the results.
>+ */
>+static int collapse_single_pmd(unsigned long addr,
>+ struct vm_area_struct *vma, bool *mmap_locked,
>+ struct collapse_control *cc)
>+{
>+ struct mm_struct *mm = vma->vm_mm;
>+ int result;
>+ struct file *file;
>+ pgoff_t pgoff;
>+
>+ if (vma_is_anonymous(vma)) {
>+ result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cc);
>+ goto end;
>+ }
>+
>+ file = get_file(vma->vm_file);
>+ pgoff = linear_page_index(vma, addr);
>+
>+ mmap_read_unlock(mm);
>+ *mmap_locked = false;
>+ result = collapse_scan_file(mm, addr, file, pgoff, cc);
>+ fput(file);
>+ if (result != SCAN_PTE_MAPPED_HUGEPAGE)
>+ goto end;
>+
>+ mmap_read_lock(mm);
>+ *mmap_locked = true;
>+ if (collapse_test_exit_or_disable(mm)) {
>+ mmap_read_unlock(mm);
>+ *mmap_locked = false;
>+ return SCAN_ANY_PROCESS;
>+ }
>+ result = collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
>+ if (result == SCAN_PMD_MAPPED)
>+ result = SCAN_SUCCEED;
>+ mmap_read_unlock(mm);
>+ *mmap_locked = false;
In all of these cases we end up setting mmap_locked to false, so I'm not
sure it is worth adjusting it to true in between.
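A sketch of the simplification this nit points at (hypothetical, not a
posted change): since every path after the re-lock drops mmap_lock again
before returning, the intermediate *mmap_locked = true could be omitted
and the flag simply left false throughout:

    mmap_read_lock(mm);
    /* *mmap_locked stays false: all paths below unlock before returning */
    if (collapse_test_exit_or_disable(mm)) {
        mmap_read_unlock(mm);
        return SCAN_ANY_PROCESS;
    }
    result = collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
    if (result == SCAN_PMD_MAPPED)
        result = SCAN_SUCCEED;
    mmap_read_unlock(mm);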
--
Wei Yang
Help you, Help me
* [PATCH v12 mm-new 03/15] khugepaged: generalize hugepage_vma_revalidate for mTHP support
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
2025-10-22 18:37 ` [PATCH v12 mm-new 01/15] khugepaged: rename hpage_collapse_* to collapse_* Nico Pache
2025-10-22 18:37 ` [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-27 9:02 ` Lance Yang
2025-11-08 1:54 ` Wei Yang
2025-10-22 18:37 ` [PATCH v12 mm-new 04/15] khugepaged: generalize alloc_charge_folio() Nico Pache
` (12 subsequent siblings)
15 siblings, 2 replies; 77+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato
For khugepaged to support different mTHP orders, we must generalize
hugepage_vma_revalidate() to check that the PMD range is not shared by
another VMA and that the requested order is enabled.
No functional change in this patch. Also correct a comment about the
functionality of the revalidation.
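A sketch of the call pattern (the PMD-order call is what this patch's
diff does; the order-parameterized call is illustrative of where the
series is heading):

    /* All callers in this patch still revalidate at PMD order: */
    result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
                                     HPAGE_PMD_ORDER);

    /*
     * A later mTHP collapse would pass its own order. The helper always
     * checks thp_vma_suitable_order(vma, address, PMD_ORDER) so that the
     * enclosing PMD range is not shared with another VMA, and then
     * thp_vma_allowable_orders(vma, vma->vm_flags, type, BIT(order)).
     */
    result = hugepage_vma_revalidate(mm, address, true, &vma, cc, order);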
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Co-developed-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 36e31d99e507..6cf8700823f9 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -893,14 +893,13 @@ static int collapse_find_target_node(struct collapse_control *cc)
/*
* If mmap_lock temporarily dropped, revalidate vma
- * before taking mmap_lock.
+ * after taking the mmap_lock again.
* Returns enum scan_result value.
*/
static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
- bool expect_anon,
- struct vm_area_struct **vmap,
- struct collapse_control *cc)
+ bool expect_anon, struct vm_area_struct **vmap,
+ struct collapse_control *cc, unsigned int order)
{
struct vm_area_struct *vma;
enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
@@ -913,15 +912,16 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
if (!vma)
return SCAN_VMA_NULL;
+ /* Always check the PMD order to ensure it's not shared by another VMA */
if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
return SCAN_ADDRESS_RANGE;
- if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER))
+ if (!thp_vma_allowable_orders(vma, vma->vm_flags, type, BIT(order)))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
* remapped to file after khugepaged reaquired the mmap_lock.
*
- * thp_vma_allowable_order may return true for qualified file
+ * thp_vma_allowable_orders may return true for qualified file
* vmas.
*/
if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
@@ -1117,7 +1117,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
goto out_nolock;
mmap_read_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+ result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
+ HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
@@ -1151,7 +1152,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
* mmap_lock.
*/
mmap_write_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+ result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
+ HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out_up_write;
/* check if the pmd is still valid */
@@ -2792,7 +2794,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
mmap_read_lock(mm);
mmap_locked = true;
result = hugepage_vma_revalidate(mm, addr, false, &vma,
- cc);
+ cc, HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED) {
last_fail = result;
goto out_nolock;
--
2.51.0
* Re: [PATCH v12 mm-new 03/15] khugepaged: generalize hugepage_vma_revalidate for mTHP support
2025-10-22 18:37 ` [PATCH v12 mm-new 03/15] khugepaged: generalize hugepage_vma_revalidate for mTHP support Nico Pache
@ 2025-10-27 9:02 ` Lance Yang
2025-11-08 1:54 ` Wei Yang
1 sibling, 0 replies; 77+ messages in thread
From: Lance Yang @ 2025-10-27 9:02 UTC (permalink / raw)
To: Nico Pache
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, linux-kernel, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, linux-doc, linux-mm,
linux-trace-kernel, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, vbabka, rppt, jannh, pfalcato
On 2025/10/23 02:37, Nico Pache wrote:
> For khugepaged to support different mTHP orders, we must generalize
> hugepage_vma_revalidate() to check that the PMD range is not shared by
> another VMA and that the requested order is enabled.
>
> No functional change in this patch. Also correct a comment about the
> functionality of the revalidation.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Acked-by: David Hildenbrand <david@redhat.com>
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
LGTM!
Reviewed-by: Lance Yang <lance.yang@linux.dev>
> mm/khugepaged.c | 20 +++++++++++---------
> 1 file changed, 11 insertions(+), 9 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 36e31d99e507..6cf8700823f9 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -893,14 +893,13 @@ static int collapse_find_target_node(struct collapse_control *cc)
>
> /*
> * If mmap_lock temporarily dropped, revalidate vma
> - * before taking mmap_lock.
> + * after taking the mmap_lock again.
> * Returns enum scan_result value.
> */
>
> static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> - bool expect_anon,
> - struct vm_area_struct **vmap,
> - struct collapse_control *cc)
> + bool expect_anon, struct vm_area_struct **vmap,
> + struct collapse_control *cc, unsigned int order)
> {
> struct vm_area_struct *vma;
> enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
> @@ -913,15 +912,16 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> if (!vma)
> return SCAN_VMA_NULL;
>
> + /* Always check the PMD order to ensure it's not shared by another VMA */
> if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
> return SCAN_ADDRESS_RANGE;
> - if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER))
> + if (!thp_vma_allowable_orders(vma, vma->vm_flags, type, BIT(order)))
> return SCAN_VMA_CHECK;
> /*
> * Anon VMA expected, the address may be unmapped then
> * remapped to file after khugepaged reaquired the mmap_lock.
> *
> - * thp_vma_allowable_order may return true for qualified file
> + * thp_vma_allowable_orders may return true for qualified file
> * vmas.
> */
> if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
> @@ -1117,7 +1117,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> goto out_nolock;
>
> mmap_read_lock(mm);
> - result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
> + result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> + HPAGE_PMD_ORDER);
> if (result != SCAN_SUCCEED) {
> mmap_read_unlock(mm);
> goto out_nolock;
> @@ -1151,7 +1152,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> * mmap_lock.
> */
> mmap_write_lock(mm);
> - result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
> + result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> + HPAGE_PMD_ORDER);
> if (result != SCAN_SUCCEED)
> goto out_up_write;
> /* check if the pmd is still valid */
> @@ -2792,7 +2794,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> mmap_read_lock(mm);
> mmap_locked = true;
> result = hugepage_vma_revalidate(mm, addr, false, &vma,
> - cc);
> + cc, HPAGE_PMD_ORDER);
> if (result != SCAN_SUCCEED) {
> last_fail = result;
> goto out_nolock;
* Re: [PATCH v12 mm-new 03/15] khugepaged: generalize hugepage_vma_revalidate for mTHP support
2025-10-22 18:37 ` [PATCH v12 mm-new 03/15] khugepaged: generalize hugepage_vma_revalidate for mTHP support Nico Pache
2025-10-27 9:02 ` Lance Yang
@ 2025-11-08 1:54 ` Wei Yang
1 sibling, 0 replies; 77+ messages in thread
From: Wei Yang @ 2025-11-08 1:54 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:05PM -0600, Nico Pache wrote:
>For khugepaged to support different mTHP orders, we must generalize
>hugepage_vma_revalidate() to check that the PMD range is not shared by
>another VMA and that the requested order is enabled.
>
>No functional change in this patch. Also correct a comment about the
>functionality of the revalidation.
>
>Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>Acked-by: David Hildenbrand <david@redhat.com>
>Co-developed-by: Dev Jain <dev.jain@arm.com>
>Signed-off-by: Dev Jain <dev.jain@arm.com>
>Signed-off-by: Nico Pache <npache@redhat.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
--
Wei Yang
Help you, Help me
* [PATCH v12 mm-new 04/15] khugepaged: generalize alloc_charge_folio()
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (2 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 03/15] khugepaged: generalize hugepage_vma_revalidate for mTHP support Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-27 9:05 ` Lance Yang
2025-11-08 2:34 ` Wei Yang
2025-10-22 18:37 ` [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support Nico Pache
` (11 subsequent siblings)
15 siblings, 2 replies; 77+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato
From: Dev Jain <dev.jain@arm.com>
Pass order to alloc_charge_folio() and update mTHP statistics.
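Assuming the standard per-size sysfs layout described in transhuge.rst,
the two new counters surface for each supported order, e.g. for 2MiB
THPs:

    /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/stats/collapse_alloc
    /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/stats/collapse_alloc_failed

Note that the legacy THP_COLLAPSE_ALLOC* vm events remain PMD-only,
while the per-order mTHP counters are bumped for every order (see the
alloc_charge_folio() hunk below).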
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Co-developed-by: Nico Pache <npache@redhat.com>
Signed-off-by: Nico Pache <npache@redhat.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
---
Documentation/admin-guide/mm/transhuge.rst | 8 ++++++++
include/linux/huge_mm.h | 2 ++
mm/huge_memory.c | 4 ++++
mm/khugepaged.c | 17 +++++++++++------
4 files changed, 25 insertions(+), 6 deletions(-)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 1654211cc6cf..13269a0074d4 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -634,6 +634,14 @@ anon_fault_fallback_charge
instead falls back to using huge pages with lower orders or
small pages even though the allocation was successful.
+collapse_alloc
+ is incremented every time a huge page is successfully allocated for a
+ khugepaged collapse.
+
+collapse_alloc_failed
+ is incremented every time a huge page allocation fails during a
+ khugepaged collapse.
+
zswpout
is incremented every time a huge page is swapped out to zswap in one
piece without splitting.
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7698b3542c4f..3d29624c4f3f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -128,6 +128,8 @@ enum mthp_stat_item {
MTHP_STAT_ANON_FAULT_ALLOC,
MTHP_STAT_ANON_FAULT_FALLBACK,
MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
+ MTHP_STAT_COLLAPSE_ALLOC,
+ MTHP_STAT_COLLAPSE_ALLOC_FAILED,
MTHP_STAT_ZSWPOUT,
MTHP_STAT_SWPIN,
MTHP_STAT_SWPIN_FALLBACK,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 370ecfd6a182..0063d1ba926e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -620,6 +620,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+DEFINE_MTHP_STAT_ATTR(collapse_alloc, MTHP_STAT_COLLAPSE_ALLOC);
+DEFINE_MTHP_STAT_ATTR(collapse_alloc_failed, MTHP_STAT_COLLAPSE_ALLOC_FAILED);
DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
@@ -685,6 +687,8 @@ static struct attribute *any_stats_attrs[] = {
#endif
&split_attr.attr,
&split_failed_attr.attr,
+ &collapse_alloc_attr.attr,
+ &collapse_alloc_failed_attr.attr,
NULL,
};
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6cf8700823f9..36ee659acfbb 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1061,21 +1061,26 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
}
static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
- struct collapse_control *cc)
+ struct collapse_control *cc, unsigned int order)
{
gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
GFP_TRANSHUGE);
int node = collapse_find_target_node(cc);
struct folio *folio;
- folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
+ folio = __folio_alloc(gfp, order, node, &cc->alloc_nmask);
if (!folio) {
*foliop = NULL;
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ if (order == HPAGE_PMD_ORDER)
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC_FAILED);
return SCAN_ALLOC_HUGE_PAGE_FAIL;
}
- count_vm_event(THP_COLLAPSE_ALLOC);
+ if (order == HPAGE_PMD_ORDER)
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC);
+
if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
folio_put(folio);
*foliop = NULL;
@@ -1112,7 +1117,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
*/
mmap_read_unlock(mm);
- result = alloc_charge_folio(&folio, mm, cc);
+ result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out_nolock;
@@ -1850,7 +1855,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
- result = alloc_charge_folio(&new_folio, mm, cc);
+ result = alloc_charge_folio(&new_folio, mm, cc, HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out;
--
2.51.0
* Re: [PATCH v12 mm-new 04/15] khugepaged: generalize alloc_charge_folio()
2025-10-22 18:37 ` [PATCH v12 mm-new 04/15] khugepaged: generalize alloc_charge_folio() Nico Pache
@ 2025-10-27 9:05 ` Lance Yang
2025-11-08 2:34 ` Wei Yang
1 sibling, 0 replies; 77+ messages in thread
From: Lance Yang @ 2025-10-27 9:05 UTC (permalink / raw)
To: Nico Pache, dev.jain
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, linux-trace-kernel, willy, peterx, wangkefeng.wang,
linux-mm, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, linux-kernel, cl,
jglisse, surenb, zokeefe, hannes, rientjes, mhocko, rdunlap,
hughd, richard.weiyang, vbabka, rppt, jannh, pfalcato, linux-doc
On 2025/10/23 02:37, Nico Pache wrote:
> From: Dev Jain <dev.jain@arm.com>
>
> Pass order to alloc_charge_folio() and update mTHP statistics.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Acked-by: David Hildenbrand <david@redhat.com>
> Co-developed-by: Nico Pache <npache@redhat.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
Cool! LGTM.
Reviewed-by: Lance Yang <lance.yang@linux.dev>
> Documentation/admin-guide/mm/transhuge.rst | 8 ++++++++
> include/linux/huge_mm.h | 2 ++
> mm/huge_memory.c | 4 ++++
> mm/khugepaged.c | 17 +++++++++++------
> 4 files changed, 25 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index 1654211cc6cf..13269a0074d4 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -634,6 +634,14 @@ anon_fault_fallback_charge
> instead falls back to using huge pages with lower orders or
> small pages even though the allocation was successful.
>
> +collapse_alloc
> + is incremented every time a huge page is successfully allocated for a
> + khugepaged collapse.
> +
> +collapse_alloc_failed
> + is incremented every time a huge page allocation fails during a
> + khugepaged collapse.
> +
> zswpout
> is incremented every time a huge page is swapped out to zswap in one
> piece without splitting.
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 7698b3542c4f..3d29624c4f3f 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -128,6 +128,8 @@ enum mthp_stat_item {
> MTHP_STAT_ANON_FAULT_ALLOC,
> MTHP_STAT_ANON_FAULT_FALLBACK,
> MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> + MTHP_STAT_COLLAPSE_ALLOC,
> + MTHP_STAT_COLLAPSE_ALLOC_FAILED,
> MTHP_STAT_ZSWPOUT,
> MTHP_STAT_SWPIN,
> MTHP_STAT_SWPIN_FALLBACK,
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 370ecfd6a182..0063d1ba926e 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -620,6 +620,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
> DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
> DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> +DEFINE_MTHP_STAT_ATTR(collapse_alloc, MTHP_STAT_COLLAPSE_ALLOC);
> +DEFINE_MTHP_STAT_ATTR(collapse_alloc_failed, MTHP_STAT_COLLAPSE_ALLOC_FAILED);
> DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
> DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
> DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
> @@ -685,6 +687,8 @@ static struct attribute *any_stats_attrs[] = {
> #endif
> &split_attr.attr,
> &split_failed_attr.attr,
> + &collapse_alloc_attr.attr,
> + &collapse_alloc_failed_attr.attr,
> NULL,
> };
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 6cf8700823f9..36ee659acfbb 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1061,21 +1061,26 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
> }
>
> static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> - struct collapse_control *cc)
> + struct collapse_control *cc, unsigned int order)
> {
> gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
> GFP_TRANSHUGE);
> int node = collapse_find_target_node(cc);
> struct folio *folio;
>
> - folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
> + folio = __folio_alloc(gfp, order, node, &cc->alloc_nmask);
> if (!folio) {
> *foliop = NULL;
> - count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
> + if (order == HPAGE_PMD_ORDER)
> + count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC_FAILED);
> return SCAN_ALLOC_HUGE_PAGE_FAIL;
> }
>
> - count_vm_event(THP_COLLAPSE_ALLOC);
> + if (order == HPAGE_PMD_ORDER)
> + count_vm_event(THP_COLLAPSE_ALLOC);
> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC);
> +
> if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
> folio_put(folio);
> *foliop = NULL;
> @@ -1112,7 +1117,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> */
> mmap_read_unlock(mm);
>
> - result = alloc_charge_folio(&folio, mm, cc);
> + result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> if (result != SCAN_SUCCEED)
> goto out_nolock;
>
> @@ -1850,7 +1855,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
> VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
>
> - result = alloc_charge_folio(&new_folio, mm, cc);
> + result = alloc_charge_folio(&new_folio, mm, cc, HPAGE_PMD_ORDER);
> if (result != SCAN_SUCCEED)
> goto out;
>
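As a minimal illustration (my sketch, not part of the patch; the
hugepages-2048kB directory assumes x86-64 with 4K base pages), the new
counter can be read from the per-size stats directory like so:

#include <stdio.h>

int main(void)
{
        const char *path = "/sys/kernel/mm/transparent_hugepage/"
                           "hugepages-2048kB/stats/collapse_alloc";
        char buf[64];
        FILE *f = fopen(path, "r");

        if (!f)
                return 1;       /* stat not exposed on this kernel */
        if (fgets(buf, sizeof(buf), f))
                printf("collapse_alloc: %s", buf);
        fclose(f);
        return 0;
}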
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 04/15] khugepaged: generalize alloc_charge_folio()
2025-10-22 18:37 ` [PATCH v12 mm-new 04/15] khugepaged: generalize alloc_charge_folio() Nico Pache
2025-10-27 9:05 ` Lance Yang
@ 2025-11-08 2:34 ` Wei Yang
1 sibling, 0 replies; 77+ messages in thread
From: Wei Yang @ 2025-11-08 2:34 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:06PM -0600, Nico Pache wrote:
>From: Dev Jain <dev.jain@arm.com>
>
>Pass order to alloc_charge_folio() and update mTHP statistics.
>
>Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>Acked-by: David Hildenbrand <david@redhat.com>
>Co-developed-by: Nico Pache <npache@redhat.com>
>Signed-off-by: Nico Pache <npache@redhat.com>
>Signed-off-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
--
Wei Yang
Help you, Help me
^ permalink raw reply [flat|nested] 77+ messages in thread
* [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (3 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 04/15] khugepaged: generalize alloc_charge_folio() Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-27 9:17 ` Lance Yang
` (2 more replies)
2025-10-22 18:37 ` [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function Nico Pache
` (10 subsequent siblings)
15 siblings, 3 replies; 77+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato
Generalize the __collapse_huge_page_* functions over the collapse order
to support future mTHP collapse.
mTHP collapse will not honor the khugepaged_max_ptes_shared or
khugepaged_max_ptes_swap parameters, and will fail if it encounters a
shared or swapped entry.
No functional changes in this patch.
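For reference, a small userspace sketch (my illustration; assumes 4K base
pages, so HPAGE_PMD_ORDER = 9) of the order arithmetic the generalized
helpers now share, i.e. nr_pages = 1UL << order and a range length of
PAGE_SIZE << order:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define HPAGE_PMD_ORDER 9

int main(void)
{
        unsigned int order;

        for (order = 2; order <= HPAGE_PMD_ORDER; order++) {
                unsigned long nr_pages = 1UL << order;  /* PTEs covered */
                unsigned long len = PAGE_SIZE << order; /* bytes covered */

                printf("order %u: %4lu pages, %6lu KiB\n",
                       order, nr_pages, len >> 10);
        }
        return 0;
}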
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: David Hildenbrand <david@redhat.com>
Co-developed-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 78 ++++++++++++++++++++++++++++++-------------------
1 file changed, 48 insertions(+), 30 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 36ee659acfbb..4ccebf5dda97 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -537,25 +537,25 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
}
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
- unsigned long start_addr,
- pte_t *pte,
- struct collapse_control *cc,
- struct list_head *compound_pagelist)
+ unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
+ unsigned int order, struct list_head *compound_pagelist)
{
struct page *page = NULL;
struct folio *folio = NULL;
unsigned long addr = start_addr;
pte_t *_pte;
int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
+ const unsigned long nr_pages = 1UL << order;
+ int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+ for (_pte = pte; _pte < pte + nr_pages;
_pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
if (pte_none_or_zero(pteval)) {
++none_or_zero;
if (!userfaultfd_armed(vma) &&
(!cc->is_khugepaged ||
- none_or_zero <= khugepaged_max_ptes_none)) {
+ none_or_zero <= max_ptes_none)) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
@@ -583,8 +583,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
/* See collapse_scan_pmd(). */
if (folio_maybe_mapped_shared(folio)) {
++shared;
- if (cc->is_khugepaged &&
- shared > khugepaged_max_ptes_shared) {
+ /*
+ * TODO: Support shared pages without leading to further
+ * mTHP collapses. Currently bringing in new pages via
+ * shared may cause a future higher order collapse on a
+ * rescan of the same range.
+ */
+ if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
+ shared > khugepaged_max_ptes_shared)) {
result = SCAN_EXCEED_SHARED_PTE;
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
goto out;
@@ -677,18 +683,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
static void __collapse_huge_page_copy_succeeded(pte_t *pte,
- struct vm_area_struct *vma,
- unsigned long address,
- spinlock_t *ptl,
- struct list_head *compound_pagelist)
+ struct vm_area_struct *vma, unsigned long address,
+ spinlock_t *ptl, unsigned int order,
+ struct list_head *compound_pagelist)
{
- unsigned long end = address + HPAGE_PMD_SIZE;
+ unsigned long end = address + (PAGE_SIZE << order);
struct folio *src, *tmp;
pte_t pteval;
pte_t *_pte;
unsigned int nr_ptes;
+ const unsigned long nr_pages = 1UL << order;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
+ for (_pte = pte; _pte < pte + nr_pages; _pte += nr_ptes,
address += nr_ptes * PAGE_SIZE) {
nr_ptes = 1;
pteval = ptep_get(_pte);
@@ -741,13 +747,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
}
static void __collapse_huge_page_copy_failed(pte_t *pte,
- pmd_t *pmd,
- pmd_t orig_pmd,
- struct vm_area_struct *vma,
- struct list_head *compound_pagelist)
+ pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
+ unsigned int order, struct list_head *compound_pagelist)
{
spinlock_t *pmd_ptl;
-
+ const unsigned long nr_pages = 1UL << order;
/*
* Re-establish the PMD to point to the original page table
* entry. Restoring PMD needs to be done prior to releasing
@@ -761,7 +765,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
* Release both raw and compound pages isolated
* in __collapse_huge_page_isolate.
*/
- release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
+ release_pte_pages(pte, pte + nr_pages, compound_pagelist);
}
/*
@@ -781,16 +785,16 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
*/
static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
- unsigned long address, spinlock_t *ptl,
+ unsigned long address, spinlock_t *ptl, unsigned int order,
struct list_head *compound_pagelist)
{
unsigned int i;
int result = SCAN_SUCCEED;
-
+ const unsigned long nr_pages = 1UL << order;
/*
* Copying pages' contents is subject to memory poison at any iteration.
*/
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0; i < nr_pages; i++) {
pte_t pteval = ptep_get(pte + i);
struct page *page = folio_page(folio, i);
unsigned long src_addr = address + i * PAGE_SIZE;
@@ -809,10 +813,10 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
if (likely(result == SCAN_SUCCEED))
__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
- compound_pagelist);
+ order, compound_pagelist);
else
__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
- compound_pagelist);
+ order, compound_pagelist);
return result;
}
@@ -985,13 +989,12 @@ static int check_pmd_still_valid(struct mm_struct *mm,
* Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
*/
static int __collapse_huge_page_swapin(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long start_addr, pmd_t *pmd,
- int referenced)
+ struct vm_area_struct *vma, unsigned long start_addr,
+ pmd_t *pmd, int referenced, unsigned int order)
{
int swapped_in = 0;
vm_fault_t ret = 0;
- unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
+ unsigned long addr, end = start_addr + (PAGE_SIZE << order);
int result;
pte_t *pte = NULL;
spinlock_t *ptl;
@@ -1022,6 +1025,19 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
if (!is_swap_pte(vmf.orig_pte))
continue;
+ /*
+ * TODO: Support swapin without leading to further mTHP
+ * collapses. Currently bringing in new pages via swapin may
+ * cause a future higher order collapse on a rescan of the same
+ * range.
+ */
+ if (order != HPAGE_PMD_ORDER) {
+ pte_unmap(pte);
+ mmap_read_unlock(mm);
+ result = SCAN_EXCEED_SWAP_PTE;
+ goto out;
+ }
+
vmf.pte = pte;
vmf.ptl = ptl;
ret = do_swap_page(&vmf);
@@ -1142,7 +1158,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
* that case. Continuing to collapse causes inconsistency.
*/
result = __collapse_huge_page_swapin(mm, vma, address, pmd,
- referenced);
+ referenced, HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out_nolock;
}
@@ -1190,6 +1206,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
if (pte) {
result = __collapse_huge_page_isolate(vma, address, pte, cc,
+ HPAGE_PMD_ORDER,
&compound_pagelist);
spin_unlock(pte_ptl);
} else {
@@ -1220,6 +1237,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
vma, address, pte_ptl,
+ HPAGE_PMD_ORDER,
&compound_pagelist);
pte_unmap(pte);
if (unlikely(result != SCAN_SUCCEED))
--
2.51.0
^ permalink raw reply related [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support
2025-10-22 18:37 ` [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support Nico Pache
@ 2025-10-27 9:17 ` Lance Yang
2025-10-27 16:00 ` Lorenzo Stoakes
2025-11-08 3:01 ` Wei Yang
2 siblings, 0 replies; 77+ messages in thread
From: Lance Yang @ 2025-10-27 9:17 UTC (permalink / raw)
To: Nico Pache
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, linux-kernel, baohua, willy, peterx,
wangkefeng.wang, linux-mm, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, vbabka, rppt, jannh, pfalcato,
linux-trace-kernel, linux-doc
On 2025/10/23 02:37, Nico Pache wrote:
> generalize the order of the __collapse_huge_page_* functions
> to support future mTHP collapse.
>
> mTHP collapse will not honor the khugepaged_max_ptes_shared or
> khugepaged_max_ptes_swap parameters, and will fail if it encounters a
> shared or swapped entry.
Yeah, IMHO, it's the right call to avoid the complexity of potential
"collapse creep" at this stage and get the core functionality right first ;)
>
> No functional changes in this patch.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Acked-by: David Hildenbrand <david@redhat.com>
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
Cool! LGTM.
Reviewed-by: Lance Yang <lance.yang@linux.dev>
> mm/khugepaged.c | 78 ++++++++++++++++++++++++++++++-------------------
> 1 file changed, 48 insertions(+), 30 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 36ee659acfbb..4ccebf5dda97 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -537,25 +537,25 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
> }
>
> static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> - unsigned long start_addr,
> - pte_t *pte,
> - struct collapse_control *cc,
> - struct list_head *compound_pagelist)
> + unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
> + unsigned int order, struct list_head *compound_pagelist)
> {
> struct page *page = NULL;
> struct folio *folio = NULL;
> unsigned long addr = start_addr;
> pte_t *_pte;
> int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
> + const unsigned long nr_pages = 1UL << order;
> + int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>
> - for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
> + for (_pte = pte; _pte < pte + nr_pages;
> _pte++, addr += PAGE_SIZE) {
> pte_t pteval = ptep_get(_pte);
> if (pte_none_or_zero(pteval)) {
> ++none_or_zero;
> if (!userfaultfd_armed(vma) &&
> (!cc->is_khugepaged ||
> - none_or_zero <= khugepaged_max_ptes_none)) {
> + none_or_zero <= max_ptes_none)) {
> continue;
> } else {
> result = SCAN_EXCEED_NONE_PTE;
> @@ -583,8 +583,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> /* See collapse_scan_pmd(). */
> if (folio_maybe_mapped_shared(folio)) {
> ++shared;
> - if (cc->is_khugepaged &&
> - shared > khugepaged_max_ptes_shared) {
> + /*
> + * TODO: Support shared pages without leading to further
> + * mTHP collapses. Currently bringing in new pages via
> + * shared may cause a future higher order collapse on a
> + * rescan of the same range.
> + */
> + if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
> + shared > khugepaged_max_ptes_shared)) {
> result = SCAN_EXCEED_SHARED_PTE;
> count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> goto out;
> @@ -677,18 +683,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> }
>
> static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> - struct vm_area_struct *vma,
> - unsigned long address,
> - spinlock_t *ptl,
> - struct list_head *compound_pagelist)
> + struct vm_area_struct *vma, unsigned long address,
> + spinlock_t *ptl, unsigned int order,
> + struct list_head *compound_pagelist)
> {
> - unsigned long end = address + HPAGE_PMD_SIZE;
> + unsigned long end = address + (PAGE_SIZE << order);
> struct folio *src, *tmp;
> pte_t pteval;
> pte_t *_pte;
> unsigned int nr_ptes;
> + const unsigned long nr_pages = 1UL << order;
>
> - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
> + for (_pte = pte; _pte < pte + nr_pages; _pte += nr_ptes,
> address += nr_ptes * PAGE_SIZE) {
> nr_ptes = 1;
> pteval = ptep_get(_pte);
> @@ -741,13 +747,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> }
>
> static void __collapse_huge_page_copy_failed(pte_t *pte,
> - pmd_t *pmd,
> - pmd_t orig_pmd,
> - struct vm_area_struct *vma,
> - struct list_head *compound_pagelist)
> + pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> + unsigned int order, struct list_head *compound_pagelist)
> {
> spinlock_t *pmd_ptl;
> -
> + const unsigned long nr_pages = 1UL << order;
> /*
> * Re-establish the PMD to point to the original page table
> * entry. Restoring PMD needs to be done prior to releasing
> @@ -761,7 +765,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> * Release both raw and compound pages isolated
> * in __collapse_huge_page_isolate.
> */
> - release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
> + release_pte_pages(pte, pte + nr_pages, compound_pagelist);
> }
>
> /*
> @@ -781,16 +785,16 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> */
> static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
> pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> - unsigned long address, spinlock_t *ptl,
> + unsigned long address, spinlock_t *ptl, unsigned int order,
> struct list_head *compound_pagelist)
> {
> unsigned int i;
> int result = SCAN_SUCCEED;
> -
> + const unsigned long nr_pages = 1UL << order;
> /*
> * Copying pages' contents is subject to memory poison at any iteration.
> */
> - for (i = 0; i < HPAGE_PMD_NR; i++) {
> + for (i = 0; i < nr_pages; i++) {
> pte_t pteval = ptep_get(pte + i);
> struct page *page = folio_page(folio, i);
> unsigned long src_addr = address + i * PAGE_SIZE;
> @@ -809,10 +813,10 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
>
> if (likely(result == SCAN_SUCCEED))
> __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
> - compound_pagelist);
> + order, compound_pagelist);
> else
> __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
> - compound_pagelist);
> + order, compound_pagelist);
>
> return result;
> }
> @@ -985,13 +989,12 @@ static int check_pmd_still_valid(struct mm_struct *mm,
> * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
> */
> static int __collapse_huge_page_swapin(struct mm_struct *mm,
> - struct vm_area_struct *vma,
> - unsigned long start_addr, pmd_t *pmd,
> - int referenced)
> + struct vm_area_struct *vma, unsigned long start_addr,
> + pmd_t *pmd, int referenced, unsigned int order)
> {
> int swapped_in = 0;
> vm_fault_t ret = 0;
> - unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
> + unsigned long addr, end = start_addr + (PAGE_SIZE << order);
> int result;
> pte_t *pte = NULL;
> spinlock_t *ptl;
> @@ -1022,6 +1025,19 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
> if (!is_swap_pte(vmf.orig_pte))
> continue;
>
> + /*
> + * TODO: Support swapin without leading to further mTHP
> + * collapses. Currently bringing in new pages via swapin may
> + * cause a future higher order collapse on a rescan of the same
> + * range.
> + */
> + if (order != HPAGE_PMD_ORDER) {
> + pte_unmap(pte);
> + mmap_read_unlock(mm);
> + result = SCAN_EXCEED_SWAP_PTE;
> + goto out;
> + }
> +
> vmf.pte = pte;
> vmf.ptl = ptl;
> ret = do_swap_page(&vmf);
> @@ -1142,7 +1158,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> * that case. Continuing to collapse causes inconsistency.
> */
> result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> - referenced);
> + referenced, HPAGE_PMD_ORDER);
> if (result != SCAN_SUCCEED)
> goto out_nolock;
> }
> @@ -1190,6 +1206,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> if (pte) {
> result = __collapse_huge_page_isolate(vma, address, pte, cc,
> + HPAGE_PMD_ORDER,
> &compound_pagelist);
> spin_unlock(pte_ptl);
> } else {
> @@ -1220,6 +1237,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>
> result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> vma, address, pte_ptl,
> + HPAGE_PMD_ORDER,
> &compound_pagelist);
> pte_unmap(pte);
> if (unlikely(result != SCAN_SUCCEED))
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support
2025-10-22 18:37 ` [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support Nico Pache
2025-10-27 9:17 ` Lance Yang
@ 2025-10-27 16:00 ` Lorenzo Stoakes
2025-11-08 3:01 ` Wei Yang
2 siblings, 0 replies; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-27 16:00 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Wed, Oct 22, 2025 at 12:37:07PM -0600, Nico Pache wrote:
> generalize the order of the __collapse_huge_page_* functions
> to support future mTHP collapse.
>
> mTHP collapse will not honor the khugepaged_max_ptes_shared or
> khugepaged_max_ptes_swap parameters, and will fail if it encounters a
> shared or swapped entry.
>
> No functional changes in this patch.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Acked-by: David Hildenbrand <david@redhat.com>
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
Thanks for addressing the v10 stuff (didn't check at v11).
Overall LGTM, so:
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Few minor nits below.
> ---
> mm/khugepaged.c | 78 ++++++++++++++++++++++++++++++-------------------
> 1 file changed, 48 insertions(+), 30 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 36ee659acfbb..4ccebf5dda97 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -537,25 +537,25 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
> }
>
> static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> - unsigned long start_addr,
> - pte_t *pte,
> - struct collapse_control *cc,
> - struct list_head *compound_pagelist)
> + unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
> + unsigned int order, struct list_head *compound_pagelist)
This series isn't the right place for it, but god do we need helper structs in
this code... :)
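Purely as an illustration (this struct is hypothetical, nothing the
series proposes), something like:

struct collapse_ctx {
        struct vm_area_struct *vma;     /* VMA being collapsed */
        unsigned long start_addr;       /* start of candidate range */
        unsigned int order;             /* target collapse order */
        struct list_head *compound_pagelist;
};

would let the __collapse_huge_page_*() helpers take one pointer instead
of an ever-growing argument list.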
> {
> struct page *page = NULL;
> struct folio *folio = NULL;
> unsigned long addr = start_addr;
> pte_t *_pte;
> int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
> + const unsigned long nr_pages = 1UL << order;
> + int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
Nit, but we should const-ify this too.
>
> - for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
> + for (_pte = pte; _pte < pte + nr_pages;
> _pte++, addr += PAGE_SIZE) {
> pte_t pteval = ptep_get(_pte);
> if (pte_none_or_zero(pteval)) {
> ++none_or_zero;
> if (!userfaultfd_armed(vma) &&
> (!cc->is_khugepaged ||
> - none_or_zero <= khugepaged_max_ptes_none)) {
> + none_or_zero <= max_ptes_none)) {
> continue;
> } else {
> result = SCAN_EXCEED_NONE_PTE;
> @@ -583,8 +583,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> /* See collapse_scan_pmd(). */
> if (folio_maybe_mapped_shared(folio)) {
> ++shared;
> - if (cc->is_khugepaged &&
> - shared > khugepaged_max_ptes_shared) {
> + /*
> + * TODO: Support shared pages without leading to further
> + * mTHP collapses. Currently bringing in new pages via
> + * shared may cause a future higher order collapse on a
> + * rescan of the same range.
> + */
Yeah, I wish we could address this in some other way, but given the
mire of THP code, putting this comment here for now is probably the only
sensible option.
> + if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
> + shared > khugepaged_max_ptes_shared)) {
> result = SCAN_EXCEED_SHARED_PTE;
> count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> goto out;
> @@ -677,18 +683,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> }
>
> static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> - struct vm_area_struct *vma,
> - unsigned long address,
> - spinlock_t *ptl,
> - struct list_head *compound_pagelist)
> + struct vm_area_struct *vma, unsigned long address,
> + spinlock_t *ptl, unsigned int order,
> + struct list_head *compound_pagelist)
> {
> - unsigned long end = address + HPAGE_PMD_SIZE;
> + unsigned long end = address + (PAGE_SIZE << order);
> struct folio *src, *tmp;
> pte_t pteval;
> pte_t *_pte;
> unsigned int nr_ptes;
> + const unsigned long nr_pages = 1UL << order;
>
> - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
> + for (_pte = pte; _pte < pte + nr_pages; _pte += nr_ptes,
> address += nr_ptes * PAGE_SIZE) {
> nr_ptes = 1;
> pteval = ptep_get(_pte);
> @@ -741,13 +747,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> }
>
> static void __collapse_huge_page_copy_failed(pte_t *pte,
> - pmd_t *pmd,
> - pmd_t orig_pmd,
> - struct vm_area_struct *vma,
> - struct list_head *compound_pagelist)
> + pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> + unsigned int order, struct list_head *compound_pagelist)
> {
> spinlock_t *pmd_ptl;
> -
> + const unsigned long nr_pages = 1UL << order;
> /*
> * Re-establish the PMD to point to the original page table
> * entry. Restoring PMD needs to be done prior to releasing
> @@ -761,7 +765,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> * Release both raw and compound pages isolated
> * in __collapse_huge_page_isolate.
> */
> - release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
> + release_pte_pages(pte, pte + nr_pages, compound_pagelist);
> }
>
> /*
> @@ -781,16 +785,16 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> */
> static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
> pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> - unsigned long address, spinlock_t *ptl,
> + unsigned long address, spinlock_t *ptl, unsigned int order,
> struct list_head *compound_pagelist)
> {
> unsigned int i;
> int result = SCAN_SUCCEED;
> -
> + const unsigned long nr_pages = 1UL << order;
> /*
> * Copying pages' contents is subject to memory poison at any iteration.
> */
> - for (i = 0; i < HPAGE_PMD_NR; i++) {
> + for (i = 0; i < nr_pages; i++) {
> pte_t pteval = ptep_get(pte + i);
> struct page *page = folio_page(folio, i);
> unsigned long src_addr = address + i * PAGE_SIZE;
> @@ -809,10 +813,10 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
>
> if (likely(result == SCAN_SUCCEED))
> __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
> - compound_pagelist);
> + order, compound_pagelist);
> else
> __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
> - compound_pagelist);
> + order, compound_pagelist);
>
> return result;
> }
> @@ -985,13 +989,12 @@ static int check_pmd_still_valid(struct mm_struct *mm,
> * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
> */
> static int __collapse_huge_page_swapin(struct mm_struct *mm,
> - struct vm_area_struct *vma,
> - unsigned long start_addr, pmd_t *pmd,
> - int referenced)
> + struct vm_area_struct *vma, unsigned long start_addr,
> + pmd_t *pmd, int referenced, unsigned int order)
Nit, super nit really, but since other __collapse_huge_page_*() functions have
..., order, param) as their last parameters, perhaps worth flipping referenced +
order here?
Not a big deal though.
> {
> int swapped_in = 0;
> vm_fault_t ret = 0;
> - unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
> + unsigned long addr, end = start_addr + (PAGE_SIZE << order);
> int result;
> pte_t *pte = NULL;
> spinlock_t *ptl;
> @@ -1022,6 +1025,19 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
> if (!is_swap_pte(vmf.orig_pte))
> continue;
>
> + /*
> + * TODO: Support swapin without leading to further mTHP
> + * collapses. Currently bringing in new pages via swapin may
> + * cause a future higher order collapse on a rescan of the same
> + * range.
> + */
Same comment as above re: this, i.e. that it's a pity but probably unavoidable
for now.
> + if (order != HPAGE_PMD_ORDER) {
> + pte_unmap(pte);
> + mmap_read_unlock(mm);
> + result = SCAN_EXCEED_SWAP_PTE;
> + goto out;
> + }
> +
> vmf.pte = pte;
> vmf.ptl = ptl;
> ret = do_swap_page(&vmf);
> @@ -1142,7 +1158,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> * that case. Continuing to collapse causes inconsistency.
> */
> result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> - referenced);
> + referenced, HPAGE_PMD_ORDER);
> if (result != SCAN_SUCCEED)
> goto out_nolock;
> }
> @@ -1190,6 +1206,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> if (pte) {
> result = __collapse_huge_page_isolate(vma, address, pte, cc,
> + HPAGE_PMD_ORDER,
> &compound_pagelist);
> spin_unlock(pte_ptl);
> } else {
> @@ -1220,6 +1237,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>
> result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> vma, address, pte_ptl,
> + HPAGE_PMD_ORDER,
> &compound_pagelist);
> pte_unmap(pte);
> if (unlikely(result != SCAN_SUCCEED))
> --
> 2.51.0
>
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support
2025-10-22 18:37 ` [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support Nico Pache
2025-10-27 9:17 ` Lance Yang
2025-10-27 16:00 ` Lorenzo Stoakes
@ 2025-11-08 3:01 ` Wei Yang
2 siblings, 0 replies; 77+ messages in thread
From: Wei Yang @ 2025-11-08 3:01 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:07PM -0600, Nico Pache wrote:
>generalize the order of the __collapse_huge_page_* functions
>to support future mTHP collapse.
>
>mTHP collapse will not honor the khugepaged_max_ptes_shared or
>khugepaged_max_ptes_swap parameters, and will fail if it encounters a
>shared or swapped entry.
>
>No functional changes in this patch.
>
>Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>Acked-by: David Hildenbrand <david@redhat.com>
>Co-developed-by: Dev Jain <dev.jain@arm.com>
>Signed-off-by: Dev Jain <dev.jain@arm.com>
>Signed-off-by: Nico Pache <npache@redhat.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
--
Wei Yang
Help you, Help me
^ permalink raw reply [flat|nested] 77+ messages in thread
* [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (4 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-27 17:53 ` Lorenzo Stoakes
2025-10-22 18:37 ` [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse Nico Pache
` (9 subsequent siblings)
15 siblings, 1 reply; 77+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato
The current mechanism for determining mTHP collapse scales the
khugepaged_max_ptes_none value based on the target order. This
introduces an undesirable feedback loop, or "creep", when max_ptes_none
is set to a value greater than HPAGE_PMD_NR / 2.
With this configuration, a successful collapse to order N will populate
enough pages to satisfy the collapse condition on order N+1 on the next
scan. This leads to unnecessary work and memory churn.
To fix this issue, introduce a helper function that caps max_ptes_none
at HPAGE_PMD_NR / 2 - 1 (255 with a 4K page size). The function also
scales max_ptes_none down by shifting it right by (HPAGE_PMD_ORDER -
target collapse order).
The limits can be bypassed by passing full_scan=true. This is useful for
madvise_collapse (which ignores the limits) and, in the case of
collapse_scan_pmd(), allows the full PMD range to be scanned when mTHP
collapse is available.
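To illustrate the resulting limits, here is a userspace approximation of
the helper (my sketch; assumes a 4K page size, so HPAGE_PMD_ORDER = 9 and
HPAGE_PMD_NR = 512) run against the default max_ptes_none of 511:

#include <stdio.h>

#define HPAGE_PMD_ORDER 9
#define HPAGE_PMD_NR (1 << HPAGE_PMD_ORDER)

static unsigned int scaled_max_ptes_none(unsigned int max_ptes_none,
                                         unsigned int order)
{
        if (max_ptes_none > HPAGE_PMD_NR / 2 - 1)
                max_ptes_none = HPAGE_PMD_NR / 2 - 1;   /* cap at 255 */
        return max_ptes_none >> (HPAGE_PMD_ORDER - order);
}

int main(void)
{
        unsigned int order;

        /* prints 7, 15, 31, 63 and 127 for orders 4 through 8 */
        for (order = 4; order <= 8; order++)
                printf("order %u -> max_ptes_none %u\n",
                       order, scaled_max_ptes_none(511, order));
        return 0;
}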
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
1 file changed, 34 insertions(+), 1 deletion(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 4ccebf5dda97..286c3a7afdee 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
wake_up_interruptible(&khugepaged_wait);
}
+/**
+ * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
+ * @order: The folio order being collapsed to
+ * @full_scan: Whether this is a full scan (ignore limits)
+ *
+ * For madvise-triggered collapses (full_scan=true), all limits are bypassed
+ * and allow up to HPAGE_PMD_NR - 1 empty PTEs.
+ *
+ * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
+ * khugepaged_max_ptes_none value.
+ *
+ * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
+ * order, but cap it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
+ *
+ * Return: Maximum number of empty PTEs allowed for the collapse operation
+ */
+static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
+{
+ unsigned int max_ptes_none;
+
+ /* ignore max_ptes_none limits */
+ if (full_scan)
+ return HPAGE_PMD_NR - 1;
+
+ if (order == HPAGE_PMD_ORDER)
+ return khugepaged_max_ptes_none;
+
+ max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
+
+ return max_ptes_none >> (HPAGE_PMD_ORDER - order);
+
+}
+
void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
@@ -546,7 +579,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
pte_t *_pte;
int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
const unsigned long nr_pages = 1UL << order;
- int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
+ int max_ptes_none = collapse_max_ptes_none(order, !cc->is_khugepaged);
for (_pte = pte; _pte < pte + nr_pages;
_pte++, addr += PAGE_SIZE) {
--
2.51.0
^ permalink raw reply related [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-22 18:37 ` [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function Nico Pache
@ 2025-10-27 17:53 ` Lorenzo Stoakes
2025-10-28 10:09 ` Baolin Wang
2025-10-28 13:36 ` Nico Pache
0 siblings, 2 replies; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-27 17:53 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Wed, Oct 22, 2025 at 12:37:08PM -0600, Nico Pache wrote:
> The current mechanism for determining mTHP collapse scales the
> khugepaged_max_ptes_none value based on the target order. This
> introduces an undesirable feedback loop, or "creep", when max_ptes_none
> is set to a value greater than HPAGE_PMD_NR / 2.
>
> With this configuration, a successful collapse to order N will populate
> enough pages to satisfy the collapse condition on order N+1 on the next
> scan. This leads to unnecessary work and memory churn.
>
> To fix this issue introduce a helper function that caps the max_ptes_none
> to HPAGE_PMD_NR / 2 - 1 (255 on 4k page size). The function also scales
> the max_ptes_none number by the (PMD_ORDER - target collapse order).
>
> The limits can be ignored by passing full_scan=true, this is useful for
> madvise_collapse (which ignores limits), or in the case of
> collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
> collapse is available.
>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
> 1 file changed, 34 insertions(+), 1 deletion(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 4ccebf5dda97..286c3a7afdee 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
> wake_up_interruptible(&khugepaged_wait);
> }
>
> +/**
> + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
> + * @order: The folio order being collapsed to
> + * @full_scan: Whether this is a full scan (ignore limits)
> + *
> + * For madvise-triggered collapses (full_scan=true), all limits are bypassed
> + * and allow up to HPAGE_PMD_NR - 1 empty PTEs.
> + *
> + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
> + * khugepaged_max_ptes_none value.
> + *
> + * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
> + * order, but caps it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
> + *
> + * Return: Maximum number of empty PTEs allowed for the collapse operation
> + */
> +static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> +{
> + unsigned int max_ptes_none;
> +
> + /* ignore max_ptes_none limits */
> + if (full_scan)
> + return HPAGE_PMD_NR - 1;
> +
> + if (order == HPAGE_PMD_ORDER)
> + return khugepaged_max_ptes_none;
> +
> + max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
Not to beat a dead horse re: the v11 commentary, but I thought we were going
to implement David's idea re: the new 'eagerness' tunable, and yet we're now just
implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
I'm still really quite uncomfortable with us silently capping this value.
If we're putting forward theoretical ideas that are to be later built upon, this
series should be an RFC.
But if we really intend to silently ignore user input the problem is that then
becomes established uAPI.
I think it's _sensible_ to avoid this mTHP escalation problem, but the issue is
visibility I think.
I think people are going to find it odd that you set it to something, but then
get something else.
As an alternative we could have a new sysfs field:
/sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
That shows the cap clearly.
In fact, it could be read-only... and just expose it to the user. That reduces
complexity.
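Something along these lines, following the existing khugepaged sysfs
attribute style (a rough sketch only; the attribute name and the min_t()
cap are my assumptions, mirroring the helper in this patch):

static ssize_t max_mthp_ptes_none_show(struct kobject *kobj,
                                       struct kobj_attribute *attr,
                                       char *buf)
{
        /* report the effective cap applied to mTHP collapses */
        return sysfs_emit(buf, "%u\n",
                          min_t(unsigned int, khugepaged_max_ptes_none,
                                HPAGE_PMD_NR / 2 - 1));
}
static struct kobj_attribute max_mthp_ptes_none_attr =
        __ATTR_RO(max_mthp_ptes_none);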
We can then bring in eagerness later and have the same situation of
max_ptes_none being a parameter that exists (plus this additional read-only
parameter).
> +
> + return max_ptes_none >> (HPAGE_PMD_ORDER - order);
> +
> +}
> +
> void khugepaged_enter_vma(struct vm_area_struct *vma,
> vm_flags_t vm_flags)
> {
> @@ -546,7 +579,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> pte_t *_pte;
> int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
> const unsigned long nr_pages = 1UL << order;
> - int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
> + int max_ptes_none = collapse_max_ptes_none(order, !cc->is_khugepaged);
>
> for (_pte = pte; _pte < pte + nr_pages;
> _pte++, addr += PAGE_SIZE) {
> --
> 2.51.0
>
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-27 17:53 ` Lorenzo Stoakes
@ 2025-10-28 10:09 ` Baolin Wang
2025-10-28 13:57 ` Nico Pache
2025-10-28 17:07 ` Lorenzo Stoakes
2025-10-28 13:36 ` Nico Pache
1 sibling, 2 replies; 77+ messages in thread
From: Baolin Wang @ 2025-10-28 10:09 UTC (permalink / raw)
To: Lorenzo Stoakes, Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
Liam.Howlett, ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato
On 2025/10/28 01:53, Lorenzo Stoakes wrote:
> On Wed, Oct 22, 2025 at 12:37:08PM -0600, Nico Pache wrote:
>> The current mechanism for determining mTHP collapse scales the
>> khugepaged_max_ptes_none value based on the target order. This
>> introduces an undesirable feedback loop, or "creep", when max_ptes_none
>> is set to a value greater than HPAGE_PMD_NR / 2.
>>
>> With this configuration, a successful collapse to order N will populate
>> enough pages to satisfy the collapse condition on order N+1 on the next
>> scan. This leads to unnecessary work and memory churn.
>>
>> To fix this issue introduce a helper function that caps the max_ptes_none
>> to HPAGE_PMD_NR / 2 - 1 (255 on 4k page size). The function also scales
>> the max_ptes_none number by the (PMD_ORDER - target collapse order).
>>
>> The limits can be ignored by passing full_scan=true, this is useful for
>> madvise_collapse (which ignores limits), or in the case of
>> collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
>> collapse is available.
>>
>> Signed-off-by: Nico Pache <npache@redhat.com>
>> ---
>> mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
>> 1 file changed, 34 insertions(+), 1 deletion(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 4ccebf5dda97..286c3a7afdee 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
>> wake_up_interruptible(&khugepaged_wait);
>> }
>>
>> +/**
>> + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
>> + * @order: The folio order being collapsed to
>> + * @full_scan: Whether this is a full scan (ignore limits)
>> + *
>> + * For madvise-triggered collapses (full_scan=true), all limits are bypassed
>> + * and allow up to HPAGE_PMD_NR - 1 empty PTEs.
>> + *
>> + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
>> + * khugepaged_max_ptes_none value.
>> + *
>> + * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
>> + * order, but cap it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
>> + *
>> + * Return: Maximum number of empty PTEs allowed for the collapse operation
>> + */
>> +static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
>> +{
>> + unsigned int max_ptes_none;
>> +
>> + /* ignore max_ptes_none limits */
>> + if (full_scan)
>> + return HPAGE_PMD_NR - 1;
>> +
>> + if (order == HPAGE_PMD_ORDER)
>> + return khugepaged_max_ptes_none;
>> +
>> + max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
>
> I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
>
> I'm still really quite uncomfortable with us silently capping this value.
>
> If we're putting forward theoretical ideas that are to be later built upon, this
> series should be an RFC.
>
> But if we really intend to silently ignore user input the problem is that then
> becomes established uAPI.
>
> I think it's _sensible_ to avoid this mTHP escalation problem, but the issue is
> visibility I think.
>
> I think people are going to find it odd that you set it to something, but then
> get something else.
>
> As an alternative we could have a new sysfs field:
>
> /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
>
> That shows the cap clearly.
>
> In fact, it could be read-only... and just expose it to the user. That reduces
> complexity.
>
> We can then bring in eagerness later and have the same situation of
> max_ptes_none being a parameter that exists (plus this additional read-only
> parameter).
We all know that ultimately using David's suggestion to add the
'eagerness' tunable parameter is the best approach, but for now, we need
an initial version to support mTHP collapse (as we've already discussed
extensively here:)).
I don't like the idea of adding another and potentially confusing
'max_mthp_ptes_none' interface, which might make it more difficult to
accommodate the 'eagerness' parameter in the future.
If Nico's current proposal still doesn't satisfy everyone, I personally
lean towards David's earlier simplified approach:
max_ptes_none == 511 -> collapse mTHP always
max_ptes_none != 511 -> collapse mTHP only if all PTEs are non-none/zero
Let's first have an initial approach in place, which will also simplify
the following addition of the 'eagerness' tunable parameter.
Nico, Lorenzo, and David, what do you think?
Code should be:
static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
{
        /* ignore max_ptes_none limits */
        if (full_scan)
                return HPAGE_PMD_NR - 1;

        if (order == HPAGE_PMD_ORDER)
                return khugepaged_max_ptes_none;

        /*
         * For mTHP collapse, we can simplify the logic:
         * max_ptes_none == 511 -> collapse mTHP always
         * max_ptes_none != 511 -> collapse mTHP only if all PTEs are
         *                         non-none/zero
         */
        if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
                return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);

        return 0;
}
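Concretely (assuming a 4K page size, so HPAGE_PMD_NR = 512): with
max_ptes_none == 511 an order-8 collapse would still allow 511 >> 1 = 255
empty PTEs, while with any other setting the helper returns 0, i.e. only
fully populated ranges are collapsed to mTHPs.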
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 10:09 ` Baolin Wang
@ 2025-10-28 13:57 ` Nico Pache
2025-10-28 17:07 ` Lorenzo Stoakes
1 sibling, 0 replies; 77+ messages in thread
From: Nico Pache @ 2025-10-28 13:57 UTC (permalink / raw)
To: Baolin Wang
Cc: Lorenzo Stoakes, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, david, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Tue, Oct 28, 2025 at 4:10 AM Baolin Wang
<baolin.wang@linux.alibaba.com> wrote:
>
>
>
> On 2025/10/28 01:53, Lorenzo Stoakes wrote:
> > On Wed, Oct 22, 2025 at 12:37:08PM -0600, Nico Pache wrote:
> >> The current mechanism for determining mTHP collapse scales the
> >> khugepaged_max_ptes_none value based on the target order. This
> >> introduces an undesirable feedback loop, or "creep", when max_ptes_none
> >> is set to a value greater than HPAGE_PMD_NR / 2.
> >>
> >> With this configuration, a successful collapse to order N will populate
> >> enough pages to satisfy the collapse condition on order N+1 on the next
> >> scan. This leads to unnecessary work and memory churn.
> >>
> >> To fix this issue introduce a helper function that caps the max_ptes_none
> >> to HPAGE_PMD_NR / 2 - 1 (255 on 4k page size). The function also scales
> >> the max_ptes_none number by the (PMD_ORDER - target collapse order).
> >>
> >> The limits can be ignored by passing full_scan=true, this is useful for
> >> madvise_collapse (which ignores limits), or in the case of
> >> collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
> >> collapse is available.
> >>
> >> Signed-off-by: Nico Pache <npache@redhat.com>
> >> ---
> >> mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
> >> 1 file changed, 34 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >> index 4ccebf5dda97..286c3a7afdee 100644
> >> --- a/mm/khugepaged.c
> >> +++ b/mm/khugepaged.c
> >> @@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
> >> wake_up_interruptible(&khugepaged_wait);
> >> }
> >>
> >> +/**
> >> + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
> >> + * @order: The folio order being collapsed to
> >> + * @full_scan: Whether this is a full scan (ignore limits)
> >> + *
> >> + * For madvise-triggered collapses (full_scan=true), all limits are bypassed
> >> + * and allow up to HPAGE_PMD_NR - 1 empty PTEs.
> >> + *
> >> + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
> >> + * khugepaged_max_ptes_none value.
> >> + *
> >> + * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
> >> + * order, but cap it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
> >> + *
> >> + * Return: Maximum number of empty PTEs allowed for the collapse operation
> >> + */
> >> +static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> >> +{
> >> + unsigned int max_ptes_none;
> >> +
> >> + /* ignore max_ptes_none limits */
> >> + if (full_scan)
> >> + return HPAGE_PMD_NR - 1;
> >> +
> >> + if (order == HPAGE_PMD_ORDER)
> >> + return khugepaged_max_ptes_none;
> >> +
> >> + max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
> >
> > I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> > to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> > implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
> >
> > I'm still really quite uncomfortable with us silently capping this value.
> >
> > If we're putting forward theoretical ideas that are to be later built upon, this
> > series should be an RFC.
> >
> > But if we really intend to silently ignore user input the problem is that then
> > becomes established uAPI.
> >
> > I think it's _sensible_ to avoid this mTHP escalation problem, but the issue is
> > visibility I think.
> >
> > I think people are going to find it odd that you set it to something, but then
> > get something else.
> >
> > As an alternative we could have a new sysfs field:
> >
> > /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
> >
> > That shows the cap clearly.
> >
> > In fact, it could be read-only... and just expose it to the user. That reduces
> > complexity.
> >
> > We can then bring in eagerness later and have the same situation of
> > max_ptes_none being a parameter that exists (plus this additional read-only
> > parameter).
>
Hey Baolin,
> We all know that ultimately using David's suggestion to add the
> 'eagerness' tunable parameter is the best approach, but for now, we need
> an initial version to support mTHP collapse (as we've already discussed
> extensively here:)).
>
> I don't like the idea of adding another and potentially confusing
> 'max_mthp_ptes_none' interface, which might make it more difficult to
> accommodate the 'eagerness' parameter in the future.
>
> If Nico's current proposal still doesn't satisfy everyone, I personally
> lean towards David's earlier simplified approach:
> max_ptes_none == 511 -> collapse mTHP always
> max_ptes_none != 511 -> collapse mTHP only if all PTEs are non-none/zero
>
> Let's first have an initial approach in place, which will also simplify
> the following addition of the 'eagerness' tunable parameter.
>
> Nico, Lorenzo, and David, what do you think?
I still believe capping it at PMD_NR/2 provides the right mix between
preventing the undesired behavior and keeping some degree of
tunability, given the admin guide suggests max_ptes_none is there to be
tuned. I would be willing to compromise and take this other approach
until "eagerness" is in place. However, I believe David's idea for
eagerness is to also cap max_ptes_none at PMD_NR/2 for the second-highest
eagerness level (i.e. 511, 255, ...). So in practice we
won't see any behavioral change when that series comes around,
whereas setting max_ptes_none=0 for mTHP initially and then adding
eagerness would result in a change in behavior from the initial
implementation.
With that said, Lorenzo, David, What's the final verdict?
-- Nico
>
> Code should be:
> static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> {
>         /* ignore max_ptes_none limits */
>         if (full_scan)
>                 return HPAGE_PMD_NR - 1;
>
>         if (order == HPAGE_PMD_ORDER)
>                 return khugepaged_max_ptes_none;
>
>         /*
>          * For mTHP collapse, we can simplify the logic:
>          * max_ptes_none == 511 -> collapse mTHP always
>          * max_ptes_none != 511 -> collapse mTHP only if all PTEs are
>          *                         non-none/zero
>          */
>         if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
>                 return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>
>         return 0;
> }
Side note: Thank you Baolin for your review/testing of the V12 :)
>
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 10:09 ` Baolin Wang
2025-10-28 13:57 ` Nico Pache
@ 2025-10-28 17:07 ` Lorenzo Stoakes
2025-10-28 17:56 ` David Hildenbrand
1 sibling, 1 reply; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 17:07 UTC (permalink / raw)
To: Baolin Wang
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm, linux-doc,
david, ziy, Liam.Howlett, ryan.roberts, dev.jain, corbet, rostedt,
mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Tue, Oct 28, 2025 at 06:09:43PM +0800, Baolin Wang wrote:
>
>
> On 2025/10/28 01:53, Lorenzo Stoakes wrote:
> > On Wed, Oct 22, 2025 at 12:37:08PM -0600, Nico Pache wrote:
> > > The current mechanism for determining mTHP collapse scales the
> > > khugepaged_max_ptes_none value based on the target order. This
> > > introduces an undesirable feedback loop, or "creep", when max_ptes_none
> > > is set to a value greater than HPAGE_PMD_NR / 2.
> > >
> > > With this configuration, a successful collapse to order N will populate
> > > enough pages to satisfy the collapse condition on order N+1 on the next
> > > scan. This leads to unnecessary work and memory churn.
> > >
> > > To fix this issue introduce a helper function that caps the max_ptes_none
> > > to HPAGE_PMD_NR / 2 - 1 (255 on 4k page size). The function also scales
> > > the max_ptes_none number by the (PMD_ORDER - target collapse order).
> > >
> > > The limits can be ignored by passing full_scan=true, this is useful for
> > > madvise_collapse (which ignores limits), or in the case of
> > > collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
> > > collapse is available.
> > >
> > > Signed-off-by: Nico Pache <npache@redhat.com>
> > > ---
> > > mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
> > > 1 file changed, 34 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > index 4ccebf5dda97..286c3a7afdee 100644
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
> > > wake_up_interruptible(&khugepaged_wait);
> > > }
> > >
> > > +/**
> > > + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
> > > + * @order: The folio order being collapsed to
> > > + * @full_scan: Whether this is a full scan (ignore limits)
> > > + *
> > > + * For madvise-triggered collapses (full_scan=true), all limits are bypassed
> > > + * and allow up to HPAGE_PMD_NR - 1 empty PTEs.
> > > + *
> > > + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
> > > + * khugepaged_max_ptes_none value.
> > > + *
> > > + * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
> > > + * order, but cap it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
> > > + *
> > > + * Return: Maximum number of empty PTEs allowed for the collapse operation
> > > + */
> > > +static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> > > +{
> > > + unsigned int max_ptes_none;
> > > +
> > > + /* ignore max_ptes_none limits */
> > > + if (full_scan)
> > > + return HPAGE_PMD_NR - 1;
> > > +
> > > + if (order == HPAGE_PMD_ORDER)
> > > + return khugepaged_max_ptes_none;
> > > +
> > > + max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
> >
> > I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> > to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> > implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
> >
> > I'm still really quite uncomfortable with us silently capping this value.
> >
> > If we're putting forward theoretical ideas that are to be later built upon, this
> > series should be an RFC.
> >
> > But if we really intend to silently ignore user input, the problem is that
> > this then becomes established uAPI.
> >
> > I think it's _sensible_ to avoid this mTHP escalation problem, but the issue is
> > visibility I think.
> >
> > I think people are going to find it odd that you set it to something, but then
> > get something else.
> >
> > As an alternative we could have a new sysfs field:
> >
> > /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
> >
> > That shows the cap clearly.
> >
> > In fact, it could be read-only... and just expose it to the user. That reduces
> > complexity.
> >
> > We can then bring in eagerness later and have the same situation of
> > max_ptes_none being a parameter that exists (plus this additional read-only
> > parameter).
>
> We all know that ultimately using David's suggestion to add the 'eagerness'
> tunable parameter is the best approach, but for now, we need an initial
> version to support mTHP collapse (as we've already discussed extensively
> here:)).
>
> I don't like the idea of adding another and potentially confusing
> 'max_mthp_ptes_none' interface, which might make it more difficult to
> accommodate the 'eagerness' parameter in the future.
See my reply to Nico; I disagree that it affects eagerness.
>
> If Nico's current proposal still doesn't satisfy everyone, I personally lean
It's not upstreamable. We cannot silently violate user expectations or silently
change behaviour like this.
> towards David's earlier simplified approach:
> max_ptes_none == 511 -> collapse mTHP always
> max_ptes_none != 511 -> collapse mTHP only if all PTEs are non-none/zero
Pretty sure David's suggestion was that max_ptes_none would literally get set to
511 if you specified 511, or 0 if you specified anything else.
Which would make things visible to users and not ignore their tunable setting,
which is the whole issue IMO.
But we can't do that, because we know at the very least Meta use small non-zero
values that they expect to be honoured.
So again we're stuck in the situation of max_ptes_none being ignored for mTHP
and users being totally unaware.
>
> Let's first have an initial approach in place, which will also simplify the
Well hang on, this isn't the same as 'do anything we like'.
It immediately becomes uAPI, and 'I'll do that later' often becomes 'I'll never
do that because I got too busy'.
Yes perhaps we have to wait for the eagerness parameter, but any interim
solution must be _solid_ and not do strange/unexpected things.
We've (and of course, it was a silly thing to do) provided the ability for users
to specify this max_ptes_none behaviour for khugepaged.
Suddenly putting an asterisk next to that like '*except mTHP where we totally
ignore you if you specify values we don't like' doesn't seem like a great way
forward.
As I said to Nico too, we _have_ to export and support max_ptes_none for uAPI
reasons. And presumably eagerness will want to specify different settings for
mTHP vs. PMD THP, so exposing this (read-only mind you) somehow isn't as crazy
as it might seem.
> following addition of the 'eagerness' tunable parameter.
>
> Nico, Lorenzo, and David, what do you think?
>
> Code should be:
> static unsigned int collapse_max_ptes_none(unsigned int order, bool
> full_scan)
> {
> unsigned int max_ptes_none;
>
> /* ignore max_ptes_none limits */
> if (full_scan)
> return HPAGE_PMD_NR - 1;
>
> if (order == HPAGE_PMD_ORDER)
> return khugepaged_max_ptes_none;
>
> /*
> * For mTHP collapse, we can simplify the logic:
> * max_ptes_none == 511 -> collapse mTHP always
> * max_ptes_none != 511 -> collapse mTHP only if all PTEs are
> * non-none/zero
> */
> if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
> return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER -
> order);
>
> return 0;
> }
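For concreteness, here is what that proposal would return on 4K pages
(HPAGE_PMD_NR = 512, HPAGE_PMD_ORDER = 9); these values simply follow from
the code above:

full_scan == true                -> 511, for every order
order == HPAGE_PMD_ORDER         -> khugepaged_max_ptes_none, unmodified
max_ptes_none == 511, order == 8 -> 511 >> 1 == 255
max_ptes_none == 511, order == 4 -> 511 >> 5 == 15
max_ptes_none != 511, order <  9 -> 0 (only fully populated ranges collapse)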
Thanks, Lorenzo
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 17:07 ` Lorenzo Stoakes
@ 2025-10-28 17:56 ` David Hildenbrand
2025-10-28 18:09 ` Lorenzo Stoakes
0 siblings, 1 reply; 77+ messages in thread
From: David Hildenbrand @ 2025-10-28 17:56 UTC (permalink / raw)
To: Lorenzo Stoakes, Baolin Wang
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm, linux-doc,
ziy, Liam.Howlett, ryan.roberts, dev.jain, corbet, rostedt,
mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
[...]
>
>> towards David's earlier simplified approach:
>> max_ptes_none == 511 -> collapse mTHP always
>> max_ptes_none != 511 -> collapse mTHP only if all PTEs are non-none/zero
>
> Pretty sure David's suggestion was that max_ptes_none would literally get set to
> 511 if you specified 511, or 0 if you specified anything else.
We had multiple incarnations of this approach, but the first one really was:
max_ptes_none == 511 -> collapse mTHP always
max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
And for the intermediate values
(1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
supported yet with other values
(2) treat it like max_ptes_none == 0 or (maybe better?) just disable
mTHP collapse
I still like that approach because it lets us defer solving the creep
problem later and doesn't add a silent capping.
Using intermediate max_ptes_none values is really only reasonable with
the deferred shrinker today. And that one does not support mTHP even
with this series, so it's future work either way.
--
Cheers
David / dhildenb
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 17:56 ` David Hildenbrand
@ 2025-10-28 18:09 ` Lorenzo Stoakes
2025-10-28 18:17 ` David Hildenbrand
0 siblings, 1 reply; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 18:09 UTC (permalink / raw)
To: David Hildenbrand
Cc: Baolin Wang, Nico Pache, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
(It'd be good if we could keep all the 'solutions' in one thread as I made a
detailed reply there and now all that will get lost across two threads but
*sigh* never mind. Insert rant about email development here.)
On Tue, Oct 28, 2025 at 06:56:10PM +0100, David Hildenbrand wrote:
> [...]
>
> >
> > > towards David's earlier simplified approach:
> > > max_ptes_none == 511 -> collapse mTHP always
> > > max_ptes_none != 511 -> collapse mTHP only if all PTEs are non-none/zero
> >
> > Pretty sure David's suggestion was that max_ptes_none would literally get set to
> > 511 if you specified 511, or 0 if you specified anything else.
>
> We had multiple incarnations of this approach, but the first one really was:
>
> max_ptes_none == 511 -> collapse mTHP always
But won't 511 mean we just 'creep' to maximum collapse again? Does that solve
anything?
> max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
>
> And for the intermediate values
>
> (1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
> supported yet with other values
It feels a bit much to issue a kernel warning every time somebody twiddles that
value, and it's kind of against user expectation a bit.
But maybe it's the least worst way of communicating things. It's still
absolutely gross.
> (2) treat it like max_ptes_none == 0 or (maybe better?) just disable mTHP
> collapse
Yeah disabling mTHP collapse for these values seems sane, but it also seems that
we should be capping for this to work correctly no?
Also I think all this probably violates requirements of users who want to have
different behaviour for mTHP and PMD THP.
The default is 511 so we're in creep territory even with the damn default :)
>
>
> I still like that approach because it lets us defer solving the creep
> problem later and doesn't add a silent capping.
I mean it seems you're more or less saying allow creep. Which I'm kind of ok
with for a first pass thing, and defer it for later.
>
> Using intermediate max_ptes_none values is really only reasonable with the
> deferred shrinker today. And that one does not support mTHP even with this
> series, so it's future work either way.
Right, that's a nice fact to be aware of.
>
> --
> Cheers
>
> David / dhildenb
>
Thanks, Lorenzo
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 18:09 ` Lorenzo Stoakes
@ 2025-10-28 18:17 ` David Hildenbrand
2025-10-28 18:41 ` Lorenzo Stoakes
0 siblings, 1 reply; 77+ messages in thread
From: David Hildenbrand @ 2025-10-28 18:17 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Baolin Wang, Nico Pache, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On 28.10.25 19:09, Lorenzo Stoakes wrote:
> (It'd be good if we could keep all the 'solutions' in one thread as I made a
> detailed reply there and now all that will get lost across two threads but
> *sigh* never mind. Insert rant about email development here.)
Yeah, I focused in my other mails on things to avoid creep while
allowing for mTHP collapse.
>
> On Tue, Oct 28, 2025 at 06:56:10PM +0100, David Hildenbrand wrote:
>> [...]
>>
>>>
>>>> towards David's earlier simplified approach:
>>>> max_ptes_none == 511 -> collapse mTHP always
>>>> max_ptes_none != 511 -> collapse mTHP only if all PTEs are non-none/zero
>>>
>>> Pretty sure David's suggestion was that max_ptes_none would literally get set to
>>> 511 if you specified 511, or 0 if you specified anything else.
>>
>> We had multiple incarnations of this approach, but the first one really was:
>>
>> max_ptes_none == 511 -> collapse mTHP always
>
> But won't 511 mean we just 'creep' to maximum collapse again? Does that solve
> anything?
No creep, because you'll always collapse.
Creep only happens if you wouldn't collapse a PMD without prior mTHP
collapse, but suddenly would in the same scenario simply because you had
prior mTHP collapse.
At least that's my understanding.
>
>> max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
>>
>> And for the intermediate values
>>
>> (1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
>> supported yet with other values
>
> It feels a bit much to issue a kernel warning every time somebody twiddles that
> value, and it's kind of against user expectation a bit.
pr_warn_once() is what I meant.
>
> But maybe it's the least worst way of communicating things. It's still
> absolutely gross.
>
>> (2) treat it like max_ptes_none == 0 or (maybe better?) just disable mTHP
>> collapse
>
> Yeah disabling mTHP collapse for these values seems sane, but it also seems that
> we should be capping for this to work correctly no?
I didn't get the interaction with capping, can you elaborate?
>
> Also I think all this probably violates requirements of users who want to have
> different behaviour for mTHP and PMD THP.
>
> The default is 511 so we're in creep territory even with the damn default :)
I don't think so, but maybe I am wrong.
--
Cheers
David / dhildenb
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 18:17 ` David Hildenbrand
@ 2025-10-28 18:41 ` Lorenzo Stoakes
2025-10-29 15:04 ` David Hildenbrand
0 siblings, 1 reply; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 18:41 UTC (permalink / raw)
To: David Hildenbrand
Cc: Baolin Wang, Nico Pache, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Tue, Oct 28, 2025 at 07:17:16PM +0100, David Hildenbrand wrote:
> On 28.10.25 19:09, Lorenzo Stoakes wrote:
> > (It'd be good if we could keep all the 'solutions' in one thread as I made a
> > detailed reply there and now all that will get lost across two threads but
> > *sigh* never mind. Insert rant about email development here.)
>
> Yeah, I focused in my other mails on things to avoid creep while allowing
> for mTHP collapse.
>
> >
> > On Tue, Oct 28, 2025 at 06:56:10PM +0100, David Hildenbrand wrote:
> > > [...]
> > >
> > > >
> > > > > towards David's earlier simplified approach:
> > > > > max_ptes_none == 511 -> collapse mTHP always
> > > > > max_ptes_none != 511 -> collapse mTHP only if all PTEs are non-none/zero
> > > >
> > > > Pretty sure David's suggestion was that max_ptes_none would literally get set to
> > > > 511 if you specified 511, or 0 if you specified anything else.
> > >
> > > We had multiple incarnations of this approach, but the first one really was:
> > >
> > > max_ptes_none == 511 -> collapse mTHP always
> >
> > But won't 511 mean we just 'creep' to maximum collapse again? Does that solve
> > anything?
>
> No creep, because you'll always collapse.
OK so in the 511 scenario, do we simply immediately collapse to the largest
possible _mTHP_ page size based on adjacent none/zero page entries in the
PTE, and _never_ collapse to PMD on this basis even if we do have sufficient
none/zero PTE entries to do so?
And only collapse to PMD size if we have sufficient adjacent PTE entries that
are populated?
Let's really nail this down actually so we can be super clear what the issue is
here.
>
> Creep only happens if you wouldn't collapse a PMD without prior mTHP
> collapse, but suddenly would in the same scenario simply because you had
> prior mTHP collapse.
>
> At least that's my understanding.
OK, that makes sense, is the logic (this may be part of the bit I haven't
reviewed yet tbh) then that for khugepaged mTHP we have the system where we
always require prior mTHP collapse _first_?
>
> >
> > > max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
> > >
> > > And for the intermediate values
> > >
> > > (1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
> > > supported yet with other values
> >
> > It feels a bit much to issue a kernel warning every time somebody twiddles that
> > value, and it's kind of against user expectation a bit.
>
> pr_warn_once() is what I meant.
Right, but even then it feels a bit extreme, warnings are pretty serious
things. Then again there's precedent for this, and it may be the least worst
solution.
I just picture a cloud provider turning this on with mTHP then getting their
monitoring team reporting some urgent communication about warnings in dmesg :)
>
> >
> > But maybe it's the least worst way of communicating things. It's still
> > absolutely gross.
> >
> > > (2) treat it like max_ptes_none == 0 or (maybe better?) just disable mTHP
> > > collapse
> >
> > Yeah disabling mTHP collapse for these values seems sane, but it also seems that
> > we should be capping for this to work correctly no?
>
> I didn't get the interaction with capping, can you elaborate?
I think that's addressed in the discussion above, once we clarify the creep
thing then the rest should fall out.
>
> >
> > Also I think all this probably violates requirements of users who want to have
> > different behaviour for mTHP and PMD THP.
> >
> > The default is 511 so we're in creep territory even with the damn default :)
>
> I don't think so, but maybe I am wrong.
Discussed above.
>
>
> --
> Cheers
>
> David / dhildenb
>
Thanks, Lorenzo
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 18:41 ` Lorenzo Stoakes
@ 2025-10-29 15:04 ` David Hildenbrand
2025-10-29 18:41 ` Lorenzo Stoakes
2025-10-29 20:45 ` Nico Pache
0 siblings, 2 replies; 77+ messages in thread
From: David Hildenbrand @ 2025-10-29 15:04 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Baolin Wang, Nico Pache, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
>>
>> No creep, because you'll always collapse.
>
> OK so in the 511 scenario, do we simply immediately collapse to the largest
> > possible _mTHP_ page size based on adjacent none/zero page entries in the
> PTE, and _never_ collapse to PMD on this basis even if we do have sufficient
> none/zero PTE entries to do so?
Right. And if we fail to allocate a PMD, we would collapse to smaller
sizes, and later, once a PMD is possible, collapse to a PMD.
But there is no creep, as we would have collapsed a PMD right from the
start either way.
>
> And only collapse to PMD size if we have sufficient adjacent PTE entries that
> are populated?
>
> Let's really nail this down actually so we can be super clear what the issue is
> here.
>
I hope what I wrote above made sense.
>
>>
>> Creep only happens if you wouldn't collapse a PMD without prior mTHP
>> collapse, but suddenly would in the same scenario simply because you had
>> prior mTHP collapse.
>>
>> At least that's my understanding.
>
> OK, that makes sense, is the logic (this may be part of the bit I haven't
> reviewed yet tbh) then that for khugepaged mTHP we have the system where we
> always require prior mTHP collapse _first_?
So I would describe creep as
"we would not collapse a PMD THP because max_ptes_none is violated, but
because we collapsed smaller mTHP THPs before, we essentially suddenly
have more PTEs that are not none-or-zero, making us suddenly collapse a
PMD THP at the same place".
Assume the following: max_ptes_none = 256
This means we would only collapse if at most half (256/512) of the PTEs
are none-or-zero.
But imagine the (simplified) PTE layout with PMD = 8 entries to simplify:
[ P Z P Z P Z Z Z ]
3 Present vs. 5 Zero -> do not collapse a PMD (8)
But assume we collapse smaller mTHP (2 entries) first
[ P P P P P P Z Z ]
We collapsed 3x "P Z" into "P P" because the ratio allowed for it.
Suddenly we have
6 Present vs 2 Zero and we collapse a PMD (8)
[ P P P P P P P P ]
That's the "creep" problem.
>
>>
>>>
>>>> max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
>>>>
>>>> And for the intermediate values
>>>>
>>>> (1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
>>>> supported yet with other values
>>>
>>> It feels a bit much to issue a kernel warning every time somebody twiddles that
>>> value, and it's kind of against user expectation a bit.
>>
>> pr_warn_once() is what I meant.
>
> Right, but even then it feels a bit extreme, warnings are pretty serious
> things. Then again there's precedent for this, and it may be the least worse
> solution.
>
> I just picture a cloud provider turning this on with mTHP then getting their
> monitoring team reporting some urgent communication about warnings in dmesg :)
I mean, one could make the states mutually exclusive, maybe?
Disallow enabling mTHP with max_ptes_none set to unsupported values and
the other way around.
That would probably be cleanest, although the implementation might get a
bit more involved (but it's solvable).
But the concern could be that there are configs that could suddenly
break: someone that set max_ptes_none and enabled mTHP.
I'll note that we could also consider only supporting "max_ptes_none =
511" (default) to start with.
The nice thing about that value is that it is fully supported with the
underused shrinker, because max_ptes_none=511 -> never shrink.
--
Cheers
David / dhildenb
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 15:04 ` David Hildenbrand
@ 2025-10-29 18:41 ` Lorenzo Stoakes
2025-10-29 21:10 ` Nico Pache
2025-10-29 20:45 ` Nico Pache
1 sibling, 1 reply; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-29 18:41 UTC (permalink / raw)
To: David Hildenbrand
Cc: Baolin Wang, Nico Pache, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Wed, Oct 29, 2025 at 04:04:06PM +0100, David Hildenbrand wrote:
> > >
> > > No creep, because you'll always collapse.
> >
> > OK so in the 511 scenario, do we simply immediately collapse to the largest
> > possible _mTHP_ page size based on adjacent none/zero page entries in the
> > PTE, and _never_ collapse to PMD on this basis even if we do have sufficient
> > none/zero PTE entries to do so?
>
> Right. And if we fail to allocate a PMD, we would collapse to smaller sizes,
> and later, once a PMD is possible, collapse to a PMD.
>
> But there is no creep, as we would have collapsed a PMD right from the start
> either way.
Hmm, would this mean at 511 mTHP collapse _across zero entries_ would only
ever collapse to PMD, except in cases where, for instance, PTE entries
belong to distinct VMAs and so you have to collapse to mTHP as a result?
Or IOW 'always collapse to the largest size you can I don't care if it
takes up more memory'
And at 0, we'd never collapse anything across zero entries, and only when
adjacent present entries can be collapsed to mTHP/PMD do we do so?
>
> >
> > And only collapse to PMD size if we have sufficient adjacent PTE entries that
> > are populated?
> >
> > Let's really nail this down actually so we can be super clear what the issue is
> > here.
> >
>
> I hope what I wrote above made sense.
Asking some q's still, probably more a me thing :)
>
> >
> > >
> > > Creep only happens if you wouldn't collapse a PMD without prior mTHP
> > > collapse, but suddenly would in the same scenario simply because you had
> > > prior mTHP collapse.
> > >
> > > At least that's my understanding.
> >
> > OK, that makes sense, is the logic (this may be part of the bit I haven't
> > reviewed yet tbh) then that for khugepaged mTHP we have the system where we
> > always require prior mTHP collapse _first_?
>
> So I would describe creep as
>
> "we would not collapse a PMD THP because max_ptes_none is violated, but
> because we collapsed smaller mTHP THPs before, we essentially suddenly have
> more PTEs that are not none-or-zero, making us suddenly collapse a PMD THP
> at the same place".
Yeah that makes sense.
>
> Assume the following: max_ptes_none = 256
>
> This means we would only collapse if at most half (256/512) of the PTEs are
> none-or-zero.
>
> But imagine the (simplified) PTE layout with PMD = 8 entries to simplify:
>
> [ P Z P Z P Z Z Z ]
>
> 3 Present vs. 5 Zero -> do not collapse a PMD (8)
OK I'm thinking this is more about /ratio/ than anything else.
PMD - <=50% - ok 5/8 = 62.5% no collapse.
>
> But assume we collapse smaller mTHP (2 entries) first
>
> [ P P P P P P Z Z ]
...512 KB mTHP (2 entries) - <= 50% means we can do...
>
> We collapsed 3x "P Z" into "P P" because the ratio allowed for it.
Yes so that's:
[ P Z P Z P Z Z Z ]
->
[ P P P P P P Z Z ]
Right?
>
> Suddenly we have
>
> 6 Present vs 2 Zero and we collapse a PMD (8)
>
> [ P P P P P P P P ]
>
> That's the "creep" problem.
I guess we try PMD collapse first then mTHP, but the worry is another pass
will collapse to PMD right?
Whereas < 50% ratio means we never end up 'propagating' or 'creeping' like
this because each collapse never provides enough reduction in zero entries
to allow for higher order collapse.
Hence the idea of capping at 255
>
> >
> > >
> > > >
> > > > > max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
> > > > >
> > > > > And for the intermediate values
> > > > >
> > > > > (1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
> > > > > supported yet with other values
> > > >
> > > > It feels a bit much to issue a kernel warning every time somebody twiddles that
> > > > value, and it's kind of against user expectation a bit.
> > >
> > > pr_warn_once() is what I meant.
> >
> > Right, but even then it feels a bit extreme, warnings are pretty serious
> > things. Then again there's precedent for this, and it may be the least worst
> > solution.
> >
> > I just picture a cloud provider turning this on with mTHP then getting their
> > monitoring team reporting some urgent communication about warnings in dmesg :)
>
> I mean, one could make the states mutually exclusive, maybe?
>
> Disallow enabling mTHP with max_ptes_none set to unsupported values and the
> other way around.
>
> That would probably be cleanest, although the implementation might get a bit
> more involved (but it's solvable).
>
> But the concern could be that there are configs that could suddenly break:
> someone that set max_ptes_none and enabled mTHP.
Yeah we could always return an error on setting to an unsupported value.
I mean pr_warn() is nasty but maybe necessary.
>
>
> I'll note that we could also consider only supporting "max_ptes_none = 511"
> (default) to start with.
>
> The nice thing about that value is that it is fully supported with the
> underused shrinker, because max_ptes_none=511 -> never shrink.
It feels like = 0 would be useful though?
>
> --
> Cheers
>
> David / dhildenb
>
Thanks, Lorenzo
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 18:41 ` Lorenzo Stoakes
@ 2025-10-29 21:10 ` Nico Pache
2025-10-30 18:03 ` Lorenzo Stoakes
0 siblings, 1 reply; 77+ messages in thread
From: Nico Pache @ 2025-10-29 21:10 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: David Hildenbrand, Baolin Wang, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Wed, Oct 29, 2025 at 12:42 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Wed, Oct 29, 2025 at 04:04:06PM +0100, David Hildenbrand wrote:
> > > >
> > > > No creep, because you'll always collapse.
> > >
> > > OK so in the 511 scenario, do we simply immediately collapse to the largest
> > > possible _mTHP_ page size based on adjacent none/zero page entries in the
> > > PTE, and _never_ collapse to PMD on this basis even if we do have sufficient
> > > none/zero PTE entries to do so?
> >
> > Right. And if we fail to allocate a PMD, we would collapse to smaller sizes,
> > and later, once a PMD is possible, collapse to a PMD.
> >
> > But there is no creep, as we would have collapsed a PMD right from the start
> > either way.
>
> Hmm, would this mean at 511 mTHP collapse _across zero entries_ would only
> ever collapse to PMD, except in cases where, for instance, PTE entries
> belong to distinct VMAs and so you have to collapse to mTHP as a result?
There are a few failure cases, like exceeding thresholds, or
allocation failures, but yes your assessment is correct.
At 511, the PMD collapse will be satisfied by a single PTE. If the
collapse fails, we will try both sides of the PMD (1024 KB, 1024 KB).
The one that contains the non-none PTE will collapse.
This is where the (HPAGE_PMD_ORDER - order) shift comes from.
Imagine the 511 case above:
511 >> (HPAGE_PMD_ORDER - 9) == 511 >> 0 == 511 max_ptes_none
511 >> (HPAGE_PMD_ORDER - 8) == 511 >> 1 == 255 max_ptes_none (1024 KB)
Both of these align to the order's size minus 1.
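Extending that arithmetic down the orders (4K pages, HPAGE_PMD_ORDER = 9,
ignoring the HPAGE_PMD_NR/2 - 1 cap as in the full_scan case):

order 9 (2048 KB, 512 PTEs): 511 >> 0 == 511
order 8 (1024 KB, 256 PTEs): 511 >> 1 == 255
order 7 ( 512 KB, 128 PTEs): 511 >> 2 == 127
order 6 ( 256 KB,  64 PTEs): 511 >> 3 ==  63
order 5 ( 128 KB,  32 PTEs): 511 >> 4 ==  31
order 4 (  64 KB,  16 PTEs): 511 >> 5 ==  15

i.e. at every order the limit is "all but one PTE may be none", which is
what aligning to the order's size minus 1 means.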
>
> Or IOW 'always collapse to the largest size you can I don't care if it
> takes up more memory'
>
> And at 0, we'd never collapse anything across zero entries, and only when
> adjacent present entries can be collapsed to mTHP/PMD do we do so?
Yep!
max_ptes_none = 0 + all mTHP sizes enabled gives you a really good
distribution of mTHP sizes in the system, as zero memory will be
wasted and the most optimal size (space-wise) will be found. At least
for the memory allocated through khugepaged. The Defer patchset I had
on top of this series was exactly for that purpose -- allow khugepaged
to determine all the THP usage in the system (other than madvise), and
allow granular control of memory waste.
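As a toy illustration of that distribution (8-entry PMD range, layout
invented, max_ptes_none = 0):

[ P P P P P P Z Z ]

The PMD check fails (2 none PTEs > 0), the first order-2 block is fully
populated and collapses, the second order-2 block fails, and its populated
order-1 half collapses on its own. You end up with one 4-entry folio and
one 2-entry folio, with zero wasted memory.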
>
> >
> > >
> > > And only collapse to PMD size if we have sufficient adjacent PTE entries that
> > > are populated?
> > >
> > > Let's really nail this down actually so we can be super clear what the issue is
> > > here.
> > >
> >
> > I hope what I wrote above made sense.
>
> Asking some q's still, probably more a me thing :)
>
> >
> > >
> > > >
> > > > Creep only happens if you wouldn't collapse a PMD without prior mTHP
> > > > collapse, but suddenly would in the same scenario simply because you had
> > > > prior mTHP collapse.
> > > >
> > > > At least that's my understanding.
> > >
> > > OK, that makes sense, is the logic (this may be part of the bit I haven't
> > > reviewed yet tbh) then that for khugepaged mTHP we have the system where we
> > > always require prior mTHP collapse _first_?
> >
> > So I would describe creep as
> >
> > "we would not collapse a PMD THP because max_ptes_none is violated, but
> > because we collapsed smaller mTHP THPs before, we essentially suddenly have
> > more PTEs that are not none-or-zero, making us suddenly collapse a PMD THP
> > at the same place".
>
> Yeah that makes sense.
>
> >
> > Assume the following: max_ptes_none = 256
> >
> > This means we would only collapse if at most half (256/512) of the PTEs are
> > none-or-zero.
> >
> > But imagine the (simplified) PTE layout with PMD = 8 entries to simplify:
> >
> > [ P Z P Z P Z Z Z ]
> >
> > 3 Present vs. 5 Zero -> do not collapse a PMD (8)
>
> OK I'm thinking this is more about /ratio/ than anything else.
>
> PMD - <=50% - ok 5/8 = 62.5% no collapse.
< 50%*.
At 50% it's 256, which is actually the worst-case scenario. But I read
further, and it seems like you grasped the issue.
>
> >
> > But assume we collapse smaller mTHP (2 entries) first
> >
> > [ P P P P P P Z Z ]
>
> ...512 KB mTHP (2 entries) - <= 50% means we can do...
>
> >
> > We collapsed 3x "P Z" into "P P" because the ratio allowed for it.
>
> Yes so that's:
>
> [ P Z P Z P Z Z Z ]
>
> ->
>
> [ P P P P P P Z Z ]
>
> Right?
>
> >
> > Suddenly we have
> >
> > 6 Present vs 2 Zero and we collapse a PMD (8)
> >
> > [ P P P P P P P P ]
> >
> > That's the "creep" problem.
>
> I guess we try PMD collapse first then mTHP, but the worry is another pass
> will collapse to PMD right?
>
>
> Whereas < 50% ratio means we never end up 'propagating' or 'creeping' like
> this because each collapse never provides enough reduction in zero entries
> to allow for higher order collapse.
>
> Hence the idea of capping at 255
Yep! We've discussed other solutions, like tracking collapsed pages,
or the solutions brought up by David. But this seemed like the most
logical to me, as it keeps some of the tunability. I now understand
the concern wasn't so much the capping, but rather the silent nature of
it, and the uAPI expectations surrounding enforcing such a limit (for
both past and future behavioral expectations).
>
> >
> > >
> > > >
> > > > >
> > > > > > max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
> > > > > >
> > > > > > And for the intermediate values
> > > > > >
> > > > > > (1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
> > > > > > supported yet with other values
> > > > >
> > > > > It feels a bit much to issue a kernel warning every time somebody twiddles that
> > > > > value, and it's kind of against user expectation a bit.
> > > >
> > > > pr_warn_once() is what I meant.
> > >
> > > Right, but even then it feels a bit extreme, warnings are pretty serious
> > > things. Then again there's precedent for this, and it may be the least worst
> > > solution.
> > >
> > > I just picture a cloud provider turning this on with mTHP then getting their
> > > monitoring team reporting some urgent communication about warnings in dmesg :)
> >
> > I mean, one could make the states mutually exclusive, maybe?
> >
> > Disallow enabling mTHP with max_ptes_none set to unsupported values and the
> > other way around.
> >
> > That would probably be cleanest, although the implementation might get a bit
> > more involved (but it's solvable).
> >
> > But the concern could be that there are configs that could suddenly break:
> > someone that set max_ptes_none and enabled mTHP.
>
> Yeah we could always return an error on setting to an unsupported value.
>
> I mean pr_warn() is nasty but maybe necessary.
>
> >
> >
> > I'll note that we could also consider only supporting "max_ptes_none = 511"
> > (default) to start with.
> >
> > The nice thing about that value is that it is fully supported with the
> > underused shrinker, because max_ptes_none=511 -> never shrink.
>
> It feels like = 0 would be useful though?
I personally think the default of 511 is wrong and should be on the
lower end of the scale. The exception being thp=always, where I
believe the kernel should treat it as 511.
But the second part of that would also violate the user's max_ptes_none
setting, so it's probably much harder in practice, and also not really
part of this series, just my opinion.
Cheers.
-- Nico
>
> >
> > --
> > Cheers
> >
> > David / dhildenb
> >
>
> Thanks, Lorenzo
>
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 21:10 ` Nico Pache
@ 2025-10-30 18:03 ` Lorenzo Stoakes
0 siblings, 0 replies; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-30 18:03 UTC (permalink / raw)
To: Nico Pache
Cc: David Hildenbrand, Baolin Wang, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Wed, Oct 29, 2025 at 03:10:19PM -0600, Nico Pache wrote:
> On Wed, Oct 29, 2025 at 12:42 PM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> >
> > On Wed, Oct 29, 2025 at 04:04:06PM +0100, David Hildenbrand wrote:
> > > > >
> > > > > No creep, because you'll always collapse.
> > > >
> > > > OK so in the 511 scenario, do we simply immediately collapse to the largest
> > > > possible _mTHP_ page size based on adjacent none/zero page entries in the
> > > > PTE, and _never_ collapse to PMD on this basis even if we do have sufficient
> > > > none/zero PTE entries to do so?
> > >
> > > Right. And if we fail to allocate a PMD, we would collapse to smaller sizes,
> > > and later, once a PMD is possible, collapse to a PMD.
> > >
> > > But there is no creep, as we would have collapsed a PMD right from the start
> > > either way.
> >
> > Hmm, would this mean at 511 mTHP collapse _across zero entries_ would only
> > ever collapse to PMD, except in cases where, for instance, PTE entries
> > belong to distinct VMAs and so you have to collapse to mTHP as a result?
>
> There are a few failure cases, like exceeding thresholds, or
> allocation failures, but yes your assessment is correct.
Yeah of course being mm there are thorny edge cases :) we do love those...
>
> At 511, the PMD collapse will be satisfied by a single PTE. If the
> collapse fails, we will try both sides of the PMD (1024 KB, 1024 KB).
> The one that contains the non-none PTE will collapse.
Right yes.
>
> This is where the (HPAGE_PMD_ORDER - order) shift comes from.
> Imagine the 511 case above:
> 511 >> (HPAGE_PMD_ORDER - 9) == 511 >> 0 == 511 max_ptes_none
> 511 >> (HPAGE_PMD_ORDER - 8) == 511 >> 1 == 255 max_ptes_none (1024 KB)
>
> Both of these align to the order's size minus 1.
Right.
>
> >
> > Or IOW 'always collapse to the largest size you can I don't care if it
> > takes up more memory'
> >
> > And at 0, we'd never collapse anything across zero entries, and only when
> > adjacent present entries can be collapsed to mTHP/PMD do we do so?
>
> Yep!
>
> max_ptes_none = 0 + all mTHP sizes enabled gives you a really good
> distribution of mTHP sizes in the system, as zero memory will be
> wasted and the most optimal size (space-wise) will be found. At least
> for the memory allocated through khugepaged. The Defer patchset I had
> on top of this series was exactly for that purpose -- allow khugepaged
> to determine all the THP usage in the system (other than madvise), and
> allow granular control of memory waste.
Yeah, well it's a trade off really isn't it on 'eagerness' to collapse
non-present entries :)
But we'll come back to that when David has time :)
>
> >
> > >
> > > >
> > > > And only collapse to PMD size if we have sufficient adjacent PTE entries that
> > > > are populated?
> > > >
> > > > Let's really nail this down actually so we can be super clear what the issue is
> > > > here.
> > > >
> > >
> > > I hope what I wrote above made sense.
> >
> > Asking some q's still, probably more a me thing :)
> >
> > >
> > > >
> > > > >
> > > > > Creep only happens if you wouldn't collapse a PMD without prior mTHP
> > > > > collapse, but suddenly would in the same scenario simply because you had
> > > > > prior mTHP collapse.
> > > > >
> > > > > At least that's my understanding.
> > > >
> > > > OK, that makes sense, is the logic (this may be part of the bit I haven't
> > > > reviewed yet tbh) then that for khugepaged mTHP we have the system where we
> > > > always require prior mTHP collapse _first_?
> > >
> > > So I would describe creep as
> > >
> > > "we would not collapse a PMD THP because max_ptes_none is violated, but
> > > because we collapsed smaller mTHP THPs before, we essentially suddenly have
> > > more PTEs that are not none-or-zero, making us suddenly collapse a PMD THP
> > > at the same place".
> >
> > Yeah that makes sense.
> >
> > >
> > > Assume the following: max_ptes_none = 256
> > >
> > > This means we would only collapse if at most half (256/512) of the PTEs are
> > > none-or-zero.
> > >
> > > But imagine the (simplified) PTE layout with PMD = 8 entries to simplify:
> > >
> > > [ P Z P Z P Z Z Z ]
> > >
> > > 3 Present vs. 5 Zero -> do not collapse a PMD (8)
> >
> > OK I'm thinking this is more about /ratio/ than anything else.
> >
> > PMD - <=50% - ok 5/8 = 62.5% no collapse.
>
> < 50%*.
>
> At 50% it's 256 which is actually the worst case scenario. But I read
> further, and it seems like you grasped the issue.
Yeah this is < 50% vs. <= 50% which are fundamentally different obviously :)
>
> >
> > >
> > > But sssume we collapse smaller mTHP (2 entries) first
> > >
> > > [ P P P P P P Z Z ]
> >
> > ...512 KB mTHP (2 entries) - <= 50% means we can do...
> >
> > >
> > > We collapsed 3x "P Z" into "P P" because the ratio allowed for it.
> >
> > Yes so that's:
> >
> > [ P Z P Z P Z Z Z ]
> >
> > ->
> >
> > [ P P P P P P Z Z ]
> >
> > Right?
> >
> > >
> > > Suddenly we have
> > >
> > > 6 Present vs 2 Zero and we collapse a PMD (8)
> > >
> > > [ P P P P P P P P ]
> > >
> > > That's the "creep" problem.
> >
> > I guess we try PMD collapse first then mTHP, but the worry is another pass
> > will collapse to PMD right?
> >
> >
> > Whereas < 50% ratio means we never end up 'propagating' or 'creeping' like
> > this because each collapse never provides enough reduction in zero entries
> > to allow for higher order collapse.
> >
> > Hence the idea of capping at 255
>
> Yep! We've discussed other solutions, like tracking collapsed pages,
> or the solutions brought up by David. But this seemed like the most
> logical to me, as it keeps some of the tunability. I now understand
> the concern wasn't so much the capping, but rather the silent nature of
> it, and the uAPI expectations surrounding enforcing such a limit (for
> both past and future behavioral expectations).
Yes, that's the primary concern on my side.
>
> >
> > >
> > > >
> > > > >
> > > > > >
> > > > > > > max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
> > > > > > >
> > > > > > > And for the intermediate values
> > > > > > >
> > > > > > > (1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
> > > > > > > supported yet with other values
> > > > > >
> > > > > > It feels a bit much to issue a kernel warning every time somebody twiddles that
> > > > > > value, and it's kind of against user expectation a bit.
> > > > >
> > > > > pr_warn_once() is what I meant.
> > > >
> > > > Right, but even then it feels a bit extreme, warnings are pretty serious
> > > > things. Then again there's precedent for this, and it may be the least worst
> > > > solution.
> > > >
> > > > I just picture a cloud provider turning this on with mTHP then getting their
> > > > monitoring team reporting some urgent communication about warnings in dmesg :)
> > >
> > > I mean, one could make the states mutually exclusive, maybe?
> > >
> > > Disallow enabling mTHP with max_ptes_none set to unsupported values and the
> > > other way around.
> > >
> > > That would probably be cleanest, although the implementation might get a bit
> > > more involved (but it's solvable).
> > >
> > > But the concern could be that there are configs that could suddenly break:
> > > someone that set max_ptes_none and enabled mTHP.
> >
> > Yeah we could always return an error on setting to an unsupported value.
> >
> > I mean pr_warn() is nasty but maybe necessary.
> >
> > >
> > >
> > > I'll note that we could also consider only supporting "max_ptes_none = 511"
> > > (default) to start with.
> > >
> > > The nice thing about that value is that it is fully supported with the
> > > underused shrinker, because max_ptes_none=511 -> never shrink.
> >
> > It feels like = 0 would be useful though?
>
> I personally think the default of 511 is wrong and should be on the
> lower end of the scale. The exception being thp=always, where I
> believe the kernel should treat it as 511.
I think that'd be confusing to have different behaviour for thp=always, and I'd
rather we didn't do that.
But ultimately it's all moot I think as these are all uAPI things now.
It was a mistake to even export this IMO, but that can't be helped now :)
>
> But the second part of that would also violate the user's max_ptes_none
> setting, so it's probably much harder in practice, and also not really
> part of this series, just my opinion.
I'm confused what you mean here?
In any case I think the 511/0 solution is the way forwards.
>
> Cheers.
> -- Nico
>
> >
> > >
> > > --
> > > Cheers
> > >
> > > David / dhildenb
> > >
> >
> > Thanks, Lorenzo
> >
>
Cheers, Lorenzo
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 15:04 ` David Hildenbrand
2025-10-29 18:41 ` Lorenzo Stoakes
@ 2025-10-29 20:45 ` Nico Pache
1 sibling, 0 replies; 77+ messages in thread
From: Nico Pache @ 2025-10-29 20:45 UTC (permalink / raw)
To: David Hildenbrand
Cc: Lorenzo Stoakes, Baolin Wang, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Wed, Oct 29, 2025 at 9:04 AM David Hildenbrand <david@redhat.com> wrote:
>
> >>
> >> No creep, because you'll always collapse.
> >
> > OK so in the 511 scenario, do we simply immediately collapse to the largest
> > possible _mTHP_ page size if based on adjacent none/zero page entries in the
> > PTE, and _never_ collapse to PMD on this basis even if we do have sufficient
> > none/zero PTE entries to do so?
>
> Right. And if we fail to allocate a PMD, we would collapse to smaller
> sizes, and later, once a PMD is possible, collapse to a PMD.
>
> But there is no creep, as we would have collapsed a PMD right from the
> start either way.
>
> >
> > And only collapse to PMD size if we have sufficient adjacent PTE entries that
> > are populated?
> >
> > Let's really nail this down actually so we can be super clear what the issue is
> > here.
> >
>
> I hope what I wrote above made sense.
>
> >
> >>
> >> Creep only happens if you wouldn't collapse a PMD without prior mTHP
> >> collapse, but suddenly would in the same scenario simply because you had
> >> prior mTHP collapse.
> >>
> >> At least that's my understanding.
> >
> > OK, that makes sense, is the logic (this may be part of the bit I haven't
> > reviewed yet tbh) then that for khugepaged mTHP we have the system where we
> > always require prior mTHP collapse _first_?
>
> So I would describe creep as
>
> "we would not collapse a PMD THP because max_ptes_none is violated, but
> because we collapsed smaller mTHP THPs before, we essentially suddenly
> have more PTEs that are not none-or-zero, making us suddenly collapse a
> PMD THP at the same place".
>
> Assume the following: max_ptes_none = 256
>
> This means we would only collapse if at most half (256/512) of the PTEs
> are none-or-zero.
>
> But imagine the (simplified) PTE layout with PMD = 8 entries to simplify:
>
> [ P Z P Z P Z Z Z ]
>
> 3 Present vs. 5 Zero -> do not collapse a PMD (8)
>
> But assume we collapse smaller mTHP (2 entries) first
>
> [ P P P P P P Z Z ]
>
> We collapsed 3x "P Z" into "P P" because the ratio allowed for it.
>
> Suddenly we have
>
> 6 Present vs 2 Zero and we collapse a PMD (8)
>
> [ P P P P P P P P ]
>
> That's the "creep" problem.
I'd like to add a little to this.
The worst-case scenario is all mTHP sizes enabled and a value of 256.
A 16 KB collapse would then lead all the way up to a PMD collapse,
stopping at each mTHP level on each subsequent scan of the
same PMD range. The larger the max_ptes_none value is, the fewer "stops"
it will make before reaching PMD size, but it will ultimately creep
up to a PMD. Hence the cap. At 511, a single non-none PTE in a range will
always satisfy the PMD collapse, so we will never attempt any other
orders (other than in the case of the collapse failing, which David
explains above).
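Spelling that out with the scaled thresholds (4K pages, HPAGE_PMD_NR = 512):
with max_ptes_none = 256 the limit at order n is 256 >> (9 - n), exactly half
of the 2^n PTEs. A collapse at order n leaves 2^n present PTEs, and an order
n+1 range only needs 2^(n+1) - 2^n = 2^n present PTEs to qualify, so the
freshly collapsed half alone satisfies the next check and every scan climbs
one order. Capped at 255, the PMD check needs 512 - 255 = 257 present PTEs,
which one collapsed 256-PTE half can never provide on its own.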
Hopefully that helps give some more insight into the creep problem.
Cheers
-- Nico
>
> >
> >>
> >>>
> >>>> max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
> >>>>
> >>>> And for the intermediate values
> >>>>
> >>>> (1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
> >>>> supported yet with other values
> >>>
> >>> It feels a bit much to issue a kernel warning every time somebody twiddles that
> >>> value, and it's kind of against user expectation a bit.
> >>
> >> pr_warn_once() is what I meant.
> >
> > Right, but even then it feels a bit extreme, warnings are pretty serious
> > things. Then again there's precedent for this, and it may be the least worst
> > solution.
> >
> > I just picture a cloud provider turning this on with mTHP then getting their
> > monitoring team reporting some urgent communication about warnings in dmesg :)
>
> I mean, one could make the states mutually exclusive, maybe?
>
> Disallow enabling mTHP with max_ptes_none set to unsupported values and
> the other way around.
>
> That would probably be cleanest, although the implementation might get a
> bit more involved (but it's solvable).
>
> But the concern could be that there are configs that could suddenly
> break: someone that set max_ptes_none and enabled mTHP.
>
>
> I'll note that we could also consider only supporting "max_ptes_none =
> 511" (default) to start with.
>
> The nice thing about that value is that it is fully supported with the
> underused shrinker, because max_ptes_none=511 -> never shrink.
>
> --
> Cheers
>
> David / dhildenb
>
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-27 17:53 ` Lorenzo Stoakes
2025-10-28 10:09 ` Baolin Wang
@ 2025-10-28 13:36 ` Nico Pache
2025-10-28 14:15 ` David Hildenbrand
2025-10-28 16:57 ` Lorenzo Stoakes
1 sibling, 2 replies; 77+ messages in thread
From: Nico Pache @ 2025-10-28 13:36 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Mon, Oct 27, 2025 at 11:54 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Wed, Oct 22, 2025 at 12:37:08PM -0600, Nico Pache wrote:
> > The current mechanism for determining mTHP collapse scales the
> > khugepaged_max_ptes_none value based on the target order. This
> > introduces an undesirable feedback loop, or "creep", when max_ptes_none
> > is set to a value greater than HPAGE_PMD_NR / 2.
> >
> > With this configuration, a successful collapse to order N will populate
> > enough pages to satisfy the collapse condition on order N+1 on the next
> > scan. This leads to unnecessary work and memory churn.
> >
> > To fix this issue, introduce a helper function that caps the max_ptes_none
> > to HPAGE_PMD_NR / 2 - 1 (255 on 4k page size). The function also scales
> > the max_ptes_none number by the (PMD_ORDER - target collapse order).
> >
> > The limits can be ignored by passing full_scan=true; this is useful for
> > madvise_collapse (which ignores limits) and, in the case of
> > collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
> > collapse is available.
> >
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> > mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
> > 1 file changed, 34 insertions(+), 1 deletion(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 4ccebf5dda97..286c3a7afdee 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
> > wake_up_interruptible(&khugepaged_wait);
> > }
> >
> > +/**
> > + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
> > + * @order: The folio order being collapsed to
> > + * @full_scan: Whether this is a full scan (ignore limits)
> > + *
> > + * For madvise-triggered collapses (full_scan=true), all limits are bypassed,
> > + * allowing up to HPAGE_PMD_NR - 1 empty PTEs.
> > + *
> > + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
> > + * khugepaged_max_ptes_none value.
> > + *
> > + * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
> > + * order, but cap it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
> > + *
> > + * Return: Maximum number of empty PTEs allowed for the collapse operation
> > + */
> > +static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> > +{
> > + unsigned int max_ptes_none;
> > +
> > + /* ignore max_ptes_none limits */
> > + if (full_scan)
> > + return HPAGE_PMD_NR - 1;
> > +
> > + if (order == HPAGE_PMD_ORDER)
> > + return khugepaged_max_ptes_none;
> > +
> > + max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
>
Hey Lorenzo,
> I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
I spoke to David and he said to continue forward with this series; the
"eagerness" tunable will take some time, and may require further
considerations/discussion.
>
> I'm still really quite uncomfortable with us silently capping this value.
>
> If we're putting forward theoretical ideas that are to be later built upon, this
> series should be an RFC.
>
> But if we really intend to silently ignore user input the problem is that then
> becomes established uAPI.
>
> I think it's _sensible_ to avoid this mTHP escalation problem, but the issue is
> visibility I think.
>
> I think people are going to find it odd that you set it to something, but then
> get something else.
The alternative solution is to not support max_ptes_none for mTHP
collapse and not allow none/zero pages. This is essentially "capping"
the value too.
>
> As an alternative we could have a new sysfs field:
>
> /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
>
> That shows the cap clearly.
>
> In fact, it could be read-only... and just expose it to the user. That reduces
> complexity.
I agree with Baolin here; adding another tunable will only increase
the complexity for our future goals, and also provides needless
insight into the internals when they cannot be customized.
Cheers,
-- Nico
>
> We can then bring in eagerness later and have the same situation of
> max_ptes_none being a parameter that exists (plus this additional read-only
> parameter).
>
> > +
> > + return max_ptes_none >> (HPAGE_PMD_ORDER - order);
> > +
> > +}
> > +
> > void khugepaged_enter_vma(struct vm_area_struct *vma,
> > vm_flags_t vm_flags)
> > {
> > @@ -546,7 +579,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > pte_t *_pte;
> > int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
> > const unsigned long nr_pages = 1UL << order;
> > - int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
> > + int max_ptes_none = collapse_max_ptes_none(order, !cc->is_khugepaged);
> >
> > for (_pte = pte; _pte < pte + nr_pages;
> > _pte++, addr += PAGE_SIZE) {
> > --
> > 2.51.0
> >
>
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 13:36 ` Nico Pache
@ 2025-10-28 14:15 ` David Hildenbrand
2025-10-28 17:29 ` Lorenzo Stoakes
2025-10-28 16:57 ` Lorenzo Stoakes
1 sibling, 1 reply; 77+ messages in thread
From: David Hildenbrand @ 2025-10-28 14:15 UTC (permalink / raw)
To: Nico Pache, Lorenzo Stoakes
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, ziy,
baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On 28.10.25 14:36, Nico Pache wrote:
> On Mon, Oct 27, 2025 at 11:54 AM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
>>
>> On Wed, Oct 22, 2025 at 12:37:08PM -0600, Nico Pache wrote:
>>> The current mechanism for determining mTHP collapse scales the
>>> khugepaged_max_ptes_none value based on the target order. This
>>> introduces an undesirable feedback loop, or "creep", when max_ptes_none
>>> is set to a value greater than HPAGE_PMD_NR / 2.
>>>
>>> With this configuration, a successful collapse to order N will populate
>>> enough pages to satisfy the collapse condition on order N+1 on the next
>>> scan. This leads to unnecessary work and memory churn.
>>>
>>> To fix this issue, introduce a helper function that caps the max_ptes_none
>>> to HPAGE_PMD_NR / 2 - 1 (255 on 4k page size). The function also scales
>>> the max_ptes_none number by the (PMD_ORDER - target collapse order).
>>>
>>> The limits can be ignored by passing full_scan=true; this is useful for
>>> madvise_collapse (which ignores limits), or in the case of
>>> collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
>>> collapse is available.
>>>
>>> Signed-off-by: Nico Pache <npache@redhat.com>
>>> ---
>>> mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
>>> 1 file changed, 34 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>> index 4ccebf5dda97..286c3a7afdee 100644
>>> --- a/mm/khugepaged.c
>>> +++ b/mm/khugepaged.c
>>> @@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
>>> wake_up_interruptible(&khugepaged_wait);
>>> }
>>>
>>> +/**
>>> + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
>>> + * @order: The folio order being collapsed to
>>> + * @full_scan: Whether this is a full scan (ignore limits)
>>> + *
>>> + * For madvise-triggered collapses (full_scan=true), all limits are bypassed,
>>> + * allowing up to HPAGE_PMD_NR - 1 empty PTEs.
>>> + *
>>> + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
>>> + * khugepaged_max_ptes_none value.
>>> + *
>>> + * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
>>> + * order, but cap it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
>>> + *
>>> + * Return: Maximum number of empty PTEs allowed for the collapse operation
>>> + */
>>> +static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
>>> +{
>>> + unsigned int max_ptes_none;
>>> +
>>> + /* ignore max_ptes_none limits */
>>> + if (full_scan)
>>> + return HPAGE_PMD_NR - 1;
>>> +
>>> + if (order == HPAGE_PMD_ORDER)
>>> + return khugepaged_max_ptes_none;
>>> +
>>> + max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
>>
>
> Hey Lorenzo,
>
>> I mean not to beat a dead horse re: v11 commentary, but I thought we were going
>> to implement David's idea re: the new 'eagerness' tunable, and again we're now just
>> implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
>
> I spoke to David and he said to continue forward with this series; the
> "eagerness" tunable will take some time, and may require further
> considerations/discussion.
Right, after talking to Johannes it got clearer that what we envisioned
with "eagerness" would not be like swappiness, and we will really have
to be careful here. I don't know yet when I will have time to look into
that.
If we want to avoid the implicit capping, I think there are the
following possible approaches
(1) Tolerate creep for now, maybe warning if the user configures it.
(2) Avoid creep by counting zero-filled pages towards none_or_zero.
(3) Have separate toggles for each THP size. Doesn't quite solve the
problem, only shifts it.
Anything else?
IIUC, creep is less of a problem when we have the underused shrinker
enabled: whatever we over-allocated can (unless longterm-pinned etc) get
reclaimed again.
So maybe having underused-shrinker support for mTHP as well would be a
solution to tackle (1) later?
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 14:15 ` David Hildenbrand
@ 2025-10-28 17:29 ` Lorenzo Stoakes
2025-10-28 17:36 ` Lorenzo Stoakes
2025-10-28 18:08 ` David Hildenbrand
0 siblings, 2 replies; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 17:29 UTC (permalink / raw)
To: David Hildenbrand
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm, linux-doc,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Tue, Oct 28, 2025 at 03:15:26PM +0100, David Hildenbrand wrote:
> On 28.10.25 14:36, Nico Pache wrote:
> > On Mon, Oct 27, 2025 at 11:54 AM Lorenzo Stoakes
> > <lorenzo.stoakes@oracle.com> wrote:
> > >
> > > On Wed, Oct 22, 2025 at 12:37:08PM -0600, Nico Pache wrote:
> > > > The current mechanism for determining mTHP collapse scales the
> > > > khugepaged_max_ptes_none value based on the target order. This
> > > > introduces an undesirable feedback loop, or "creep", when max_ptes_none
> > > > is set to a value greater than HPAGE_PMD_NR / 2.
> > > >
> > > > With this configuration, a successful collapse to order N will populate
> > > > enough pages to satisfy the collapse condition on order N+1 on the next
> > > > scan. This leads to unnecessary work and memory churn.
> > > >
> > > > To fix this issue, introduce a helper function that caps the max_ptes_none
> > > > to HPAGE_PMD_NR / 2 - 1 (255 on 4k page size). The function also scales
> > > > the max_ptes_none number by the (PMD_ORDER - target collapse order).
> > > >
> > > > The limits can be ignored by passing full_scan=true; this is useful for
> > > > madvise_collapse (which ignores limits), or in the case of
> > > > collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
> > > > collapse is available.
> > > >
> > > > Signed-off-by: Nico Pache <npache@redhat.com>
> > > > ---
> > > > mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
> > > > 1 file changed, 34 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > index 4ccebf5dda97..286c3a7afdee 100644
> > > > --- a/mm/khugepaged.c
> > > > +++ b/mm/khugepaged.c
> > > > @@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
> > > > wake_up_interruptible(&khugepaged_wait);
> > > > }
> > > >
> > > > +/**
> > > > + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
> > > > + * @order: The folio order being collapsed to
> > > > + * @full_scan: Whether this is a full scan (ignore limits)
> > > > + *
> > > > + * For madvise-triggered collapses (full_scan=true), all limits are bypassed,
> > > > + * allowing up to HPAGE_PMD_NR - 1 empty PTEs.
> > > > + *
> > > > + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
> > > > + * khugepaged_max_ptes_none value.
> > > > + *
> > > > + * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
> > > > + * order, but cap it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
> > > > + *
> > > > + * Return: Maximum number of empty PTEs allowed for the collapse operation
> > > > + */
> > > > +static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> > > > +{
> > > > + unsigned int max_ptes_none;
> > > > +
> > > > + /* ignore max_ptes_none limits */
> > > > + if (full_scan)
> > > > + return HPAGE_PMD_NR - 1;
> > > > +
> > > > + if (order == HPAGE_PMD_ORDER)
> > > > + return khugepaged_max_ptes_none;
> > > > +
> > > > + max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
> > >
> >
> > Hey Lorenzo,
> >
> > > I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> > > to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> > > implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
> >
> > I spoke to David and he said to continue forward with this series; the
> > "eagerness" tunable will take some time, and may require further
> > considerations/discussion.
>
> Right, after talking to Johannes it got clearer that what we envisioned with
I'm not sure that you meant to say go ahead with the series as-is with this
silent capping?
Either way we need better communication of this, because I wasn't aware that was
the plan for one, and it means this patch directly ignores review from 2
versions ago, which needs to be documented _somewhere_ so people aren't confused.
And it would maybe have allowed us to have this conversation ahead of time rather than
now.
> "eagerness" would not be like swappiness, and we will really have to be
> careful here. I don't know yet when I will have time to look into that.
I guess I missed this part of the conversation, what do you mean?
The whole concept is that we have a parameter whose value is _abstracted_ and
which we control what it means.
I'm not sure exactly why that would now be problematic? The fundamental concept
seems sound no? Last I remember of the conversation this was the case.
>
> If we want to avoid the implicit capping, I think there are the following
> possible approaches
>
> (1) Tolerate creep for now, maybe warning if the user configures it.
I mean this seems a viable option if there is pressure to land this series
before we have a viable uAPI for configuring this.
A part of me thinks we shouldn't rush series in for that reason though and
should require that we have a proper control here.
But I guess this approach is the least-worst as it leaves us with the most
options moving forwards.
> (2) Avoid creep by counting zero-filled pages towards none_or_zero.
Would this really make all that much difference?
> (3) Have separate toggles for each THP size. Doesn't quite solve the
> problem, only shifts it.
Yeah I did wonder about this as an alternative solution. But of course it then
makes it vague what the parent values means in respect of the individual levels,
unless we have an 'inherit' mode there too (possible).
It's going to be confusing though as max_ptes_none sits at the root khugepaged/
level and I don't think any other parameter from khugepaged/ is exposed at
individual page size levels.
And of course doing this means we
>
> Anything else?
Err... I mean I'm not sure if you missed it but I suggested an approach in the
sub-thread - exposing mthp_max_ptes_none as a _READ-ONLY_ field at:
/sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
Then we allow the capping, but simply document that we specify what the capped
value will be here for mTHP.
That struck me as the simplest way of getting this series landed without
necessarily violating any future eagerness which:
a. Must still support khugepaged/max_ptes_none - we aren't getting away from
this, it's uAPI.
b. Surely must want to do different things for mTHP in eagerness, so if we're
exposing some PTE value in max_ptes_none doing so in
khugepaged/mthp_max_ptes_none wouldn't be problematic (note again - it's
readonly so unlike max_ptes_none we don't have to worry about the other
direction).
HOWEVER, eagerness might want to change this behaviour per-mTHP size, in
which case perhaps mthp_max_ptes_none would be problematic in that it is some
kind of average.
Then again we could always revert to putting this parameter as in (3) in that
case, ugly but kinda viable.
>
> IIUC, creep is less of a problem when we have the underused shrinker
> enabled: whatever we over-allocated can (unless longterm-pinned etc) get
> reclaimed again.
>
> So maybe having underused-shrinker support for mTHP as well would be a
> solution to tackle (1) later?
How viable is this in the short term?
>
> --
> Cheers
>
> David / dhildenb
>
Another possible solution:
If mthp_max_ptes_none is not workable, we could have a toggle at, e.g.:
/sys/kernel/mm/transparent_hugepage/khugepaged/mthp_cap_collapse_none
As a simple boolean. If switched on then we document that it caps mTHP as
per Nico's suggestion.
That way we avoid the 'silent' issue I have with all this and it's an
explicit setting.
Cheers, Lorenzo
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 17:29 ` Lorenzo Stoakes
@ 2025-10-28 17:36 ` Lorenzo Stoakes
2025-10-28 18:08 ` David Hildenbrand
1 sibling, 0 replies; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 17:36 UTC (permalink / raw)
To: David Hildenbrand
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm, linux-doc,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Tue, Oct 28, 2025 at 05:29:59PM +0000, Lorenzo Stoakes wrote:
> >
> > If we want to avoid the implicit capping, I think there are the following
> > possible approaches
> >
> > (1) Tolerate creep for now, maybe warning if the user configures it.
>
> I mean this seems a viable option if there is pressure to land this series
> before we have a viable uAPI for configuring this.
>
> A part of me thinks we shouldn't rush series in for that reason though and
> should require that we have a proper control here.
>
> But I guess this approach is the least-worst as it leaves us with the most
> options moving forwards.
>
> > (2) Avoid creep by counting zero-filled pages towards none_or_zero.
>
> Would this really make all that much difference?
>
> > (3) Have separate toggles for each THP size. Doesn't quite solve the
> > problem, only shifts it.
>
> Yeah I did wonder about this as an alternative solution. But of course it then
> makes it vague what the parent values means in respect of the individual levels,
> unless we have an 'inherit' mode there too (possible).
>
> It's going to be confusing though as max_ptes_none sits at the root khugepaged/
> level and I don't think any other parameter from khugepaged/ is exposed at
> individual page size levels.
>
> And of course doing this means we
Oops didn't finish the thought!
Here it is:
And of course this means we continue to propagate this max_ptes_none concept
only now in more places which is yuck.
Unless you meant putting something other than max_ptes_none at different levels?
Cheers, Lorenzo
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 17:29 ` Lorenzo Stoakes
2025-10-28 17:36 ` Lorenzo Stoakes
@ 2025-10-28 18:08 ` David Hildenbrand
2025-10-28 18:59 ` Lorenzo Stoakes
1 sibling, 1 reply; 77+ messages in thread
From: David Hildenbrand @ 2025-10-28 18:08 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm, linux-doc,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
>>> Hey Lorenzo,
>>>
>>>> I mean not to beat a dead horse re: v11 commentary, but I thought we were going
>>>> to implement David's idea re: the new 'eagerness' tunable, and again we're now just
>>>> implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
>>>
>>> I spoke to David and he said to continue forward with this series; the
>>> "eagerness" tunable will take some time, and may require further
>>> considerations/discussion.
>>
>> Right, after talking to Johannes it got clearer that what we envisioned with
>
> I'm not sure that you meant to say go ahead with the series as-is with this
> silent capping?
No, "go ahead" as in "let's find some way forward that works for all and
is not too crazy".
[...]
>> "eagerness" would not be like swappiness, and we will really have to be
>> careful here. I don't know yet when I will have time to look into that.
>
> I guess I missed this part of the converastion, what do you mean?
Johannes raised issues with that on the list and afterwards we had an
offline discussion about some of the details and why something
unpredictable is not good.
>
> The whole concept is that we have a paramaeter whose value is _abstracted_ and
> which we control what it means.
>
> I'm not sure exactly why that would now be problematic? The fundamental concept
> seems sound no? Last I remember of the conversation this was the case.
The basic idea was to do something abstracted as swappiness. Turns out
"swappiness" is really something predictable, not something we can
randomly change how it behaves under the hood.
So we'd have to find something similar for "eagerness", and that's where
it stops being easy.
>
>>
>> If we want to avoid the implicit capping, I think there are the following
>> possible approaches
>>
>> (1) Tolerate creep for now, maybe warning if the user configures it.
>
> I mean this seems a viable option if there is pressure to land this series
> before we have a viable uAPI for configuring this.
>
> A part of me thinks we shouldn't rush series in for that reason though and
> should require that we have a proper control here.
>
> But I guess this approach is the least-worst as it leaves us with the most
> options moving forwards.
Yes. There is also the alternative of respecting only 0 / 511 for mTHP
collapse for now as discussed in the other thread.
>
>> (2) Avoid creep by counting zero-filled pages towards none_or_zero.
>
> Would this really make all that much difference?
It solves the creep problem I think, but it's a bit nasty IMHO.
>
>> (3) Have separate toggles for each THP size. Doesn't quite solve the
>> problem, only shifts it.
>
> Yeah I did wonder about this as an alternative solution. But of course it then
> > makes it vague what the parent value means in respect of the individual levels,
> unless we have an 'inherit' mode there too (possible).
>
> It's going to be confusing though as max_ptes_none sits at the root khugepaged/
> level and I don't think any other parameter from khugepaged/ is exposed at
> individual page size levels.
>
> And of course doing this means we
>
>>
>> Anything else?
>
> Err... I mean I'm not sure if you missed it but I suggested an approach in the
> sub-thread - exposing mthp_max_ptes_none as a _READ-ONLY_ field at:
>
> /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
>
> Then we allow the capping, but simply document that we specify what the capped
> value will be here for mTHP.
I did not have time to read the details on that so far.
It would be one solution forward. I dislike it because I think the whole
capping is an intermediate thing that can be (and likely must be, when
considering mTHP underused shrinking I think) solved in the future
differently. That's why I would prefer adding this only if there is no
other, simpler, way forward.
>
> That struck me as the simplest way of getting this series landed without
> necessarily violating any future eagerness which:
>
> a. Must still support khugepaged/max_ptes_none - we aren't getting away from
> this, it's uAPI.
>
> b. Surely must want to do different things for mTHP in eagerness, so if we're
> exposing some PTE value in max_ptes_none doing so in
> khugepaged/mthp_max_ptes_none wouldn't be problematic (note again - it's
> readonly so unlike max_ptes_none we don't have to worry about the other
> direction).
>
> HOWEVER, eagerness might want want to change this behaviour per-mTHP size, in
> which case perhaps mthp_max_ptes_none would be problematic in that it is some
> kind of average.
>
> Then again we could always revert to putting this parameter as in (3) in that
> case, ugly but kinda viable.
>
>>
>> IIUC, creep is less of a problem when we have the underused shrinker
>> enabled: whatever we over-allocated can (unless longterm-pinned etc) get
>> reclaimed again.
>>
>> So maybe having underused-shrinker support for mTHP as well would be a
>> solution to tackle (1) later?
>
> How viable is this in the short term?
I once started looking into it, but it will require quite some work,
because the lists will essentially include each and every (m)THP in the
system ... so I think we will need some redesign.
>
> Another possible solution:
>
> If mthp_max_ptes_none is not workable, we could have a toggle at, e.g.:
>
> /sys/kernel/mm/transparent_hugepage/khugepaged/mthp_cap_collapse_none
>
> As a simple boolean. If switched on then we document that it caps mTHP as
> per Nico's suggestion.
>
> That way we avoid the 'silent' issue I have with all this and it's an
> explicit setting.
Right, but it's another toggle I wish we wouldn't need. We could of
course also make it some compile-time option, but not sure if that's
really any better.
I'd hope we find an easy way forward that doesn't require new toggles,
at least for now ...
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 18:08 ` David Hildenbrand
@ 2025-10-28 18:59 ` Lorenzo Stoakes
2025-10-28 19:08 ` Lorenzo Stoakes
` (3 more replies)
0 siblings, 4 replies; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 18:59 UTC (permalink / raw)
To: David Hildenbrand
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm, linux-doc,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Tue, Oct 28, 2025 at 07:08:38PM +0100, David Hildenbrand wrote:
>
> > > > Hey Lorenzo,
> > > >
> > > > > I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> > > > > to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> > > > > implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
> > > >
> > > > I spoke to David and he said to continue forward with this series; the
> > > > "eagerness" tunable will take some time, and may require further
> > > > considerations/discussion.
> > >
> > > Right, after talking to Johannes it got clearer that what we envisioned with
> >
> > I'm not sure that you meant to say go ahead with the series as-is with this
> > silent capping?
>
> No, "go ahead" as in "let's find some way forward that works for all and is
> not too crazy".
Right we clearly needed to discuss that further at the time but that's moot now,
we're figuring it out now :)
>
> [...]
>
> > > "eagerness" would not be like swappiness, and we will really have to be
> > > careful here. I don't know yet when I will have time to look into that.
> >
> > I guess I missed this part of the conversation, what do you mean?
>
> Johannes raised issues with that on the list and afterwards we had an
> offline discussion about some of the details and why something unpredictable
> is not good.
Could we get these details on-list so we can discuss them? This doesn't have to
be urgent, but I would like to have a say in this or at least be part of the
conversation please.
>
> >
> > The whole concept is that we have a parameter whose value is _abstracted_ and
> > which we control what it means.
> >
> > I'm not sure exactly why that would now be problematic? The fundamental concept
> > seems sound no? Last I remember of the conversation this was the case.
>
> The basic idea was to do something abstracted as swappiness. Turns out
> "swappiness" is really something predictable, not something we can randomly
> change how it behaves under the hood.
>
> So we'd have to find something similar for "eagerness", and that's where it
> stops being easy.
I think we shouldn't be too stuck on
>
> >
> > >
> > > If we want to avoid the implicit capping, I think there are the following
> > > possible approaches
> > >
> > > (1) Tolerate creep for now, maybe warning if the user configures it.
> >
> > I mean this seems a viable option if there is pressure to land this series
> > before we have a viable uAPI for configuring this.
> >
> > A part of me thinks we shouldn't rush series in for that reason though and
> > should require that we have a proper control here.
> >
> > But I guess this approach is the least-worst as it leaves us with the most
> > options moving forwards.
>
> Yes. There is also the alternative of respecting only 0 / 511 for mTHP
> collapse for now as discussed in the other thread.
Yes I guess let's carry that on over there.
I mean this is why I said it's better to try to keep things in one thread :) but
anyway, we've forked and can't be helped now.
To be clear that was a criticism of - email development - not you.
It's _extremely easy_ to have this happen because one thread naturally leads to
a broader discussion of a given topic, whereas another has questions from
somebody else about the same topic, to which people reply and then... you have a
fork and it can't be helped.
I guess I'm saying it'd be good if we could say 'ok let's move this to X'.
But that's also broken in its own way, you can't stop people from replying in
the other thread still and yeah. It's a limitation of this model :)
>
> >
> > > (2) Avoid creep by counting zero-filled pages towards none_or_zero.
> >
> > Would this really make all that much difference?
>
> It solves the creep problem I think, but it's a bit nasty IMHO.
Ah because you'd end up with a bunch of zeroed pages from the prior mTHP
collapses, interesting...
Scanning for that does seem a bit nasty though yes...
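Roughly, per present PTE it would mean something like the below (just a
sketch of the idea, assuming the usual kmap_local()/memchr_inv() pattern;
no such helper exists in this series):

	/* Hypothetical: should this mapped page count towards none_or_zero? */
	static bool is_zero_filled(struct page *page)
	{
		void *kaddr = kmap_local_page(page);
		bool zero_filled = !memchr_inv(kaddr, 0, PAGE_SIZE);

		kunmap_local(kaddr);
		return zero_filled;
	}

i.e. an extra PAGE_SIZE scan for every present PTE khugepaged looks at,
which is where the nastiness comes in.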
>
> >
> > > (3) Have separate toggles for each THP size. Doesn't quite solve the
> > > problem, only shifts it.
> >
> > Yeah I did wonder about this as an alternative solution. But of course it then
> > makes it vague what the parent value means in respect of the individual levels,
> > unless we have an 'inherit' mode there too (possible).
> >
> > It's going to be confusing though as max_ptes_none sits at the root khugepaged/
> > level and I don't think any other parameter from khugepaged/ is exposed at
> > individual page size levels.
> >
> > And of course doing this means we
> >
> > >
> > > Anything else?
> >
> > Err... I mean I'm not sure if you missed it but I suggested an approach in the
> > sub-thread - exposing mthp_max_ptes_none as a _READ-ONLY_ field at:
> >
> > /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
> >
> > Then we allow the capping, but simply document that we specify what the capped
> > value will be here for mTHP.
>
> I did not have time to read the details on that so far.
OK. It is a bit nasty, yes. The idea is to find something that allows the
capping to work.
>
> It would be one solution forward. I dislike it because I think the whole
> capping is an intermediate thing that can be (and likely must be, when
> considering mTHP underused shrinking I think) solved in the future
> differently. That's why I would prefer adding this only if there is no
> other, simpler, way forward.
Yes I agree that if we could avoid it it'd be great.
Really I proposed this solution on the basis that we were somehow ok with the
capping.
If we can avoid it, that'd be ideal as it reduces complexity and 'unexpected'
behaviour.
We'll clarify on the other thread, but the 511/0 was compelling to me before as
a simplification, and if we can have a straightforward model of how mTHP
collapse across none/zero page PTEs behaves this is ideal.
The only question is w.r.t. warnings etc. but we can handle details there.
>
> >
> > That struck me as the simplest way of getting this series landed without
> > necessarily violating any future eagerness which:
> >
> > a. Must still support khugepaged/max_ptes_none - we aren't getting away from
> > this, it's uAPI.
> >
> > b. Surely must want to do different things for mTHP in eagerness, so if we're
> > exposing some PTE value in max_ptes_none doing so in
> > khugepaged/mthp_max_ptes_none wouldn't be problematic (note again - it's
> > readonly so unlike max_ptes_none we don't have to worry about the other
> > direction).
> >
> > HOWEVER, eagerness might want to change this behaviour per-mTHP size, in
> > which case perhaps mthp_max_ptes_none would be problematic in that it is some
> > kind of average.
> >
> > Then again we could always revert to putting this parameter as in (3) in that
> > case, ugly but kinda viable.
> >
> > >
> > > IIUC, creep is less of a problem when we have the underused shrinker
> > > enabled: whatever we over-allocated can (unless longterm-pinned etc) get
> > > reclaimed again.
> > >
> > > So maybe having underused-shrinker support for mTHP as well would be a
> > > solution to tackle (1) later?
> >
> > How viable is this in the short term?
>
> I once started looking into it, but it will require quite some work, because
> the lists will essentially include each and every (m)THP in the system ...
> so I think we will need some redesign.
Ack.
This aligns with non-0/511 settings being non-functional for mTHP atm anyway.
>
> >
> > Another possible solution:
> >
> > If mthp_max_ptes_none is not workable, we could have a toggle at, e.g.:
> >
> > /sys/kernel/mm/transparent_hugepage/khugepaged/mthp_cap_collapse_none
> >
> > As a simple boolean. If switched on then we document that it caps mTHP as
> > per Nico's suggestion.
> >
> > That way we avoid the 'silent' issue I have with all this and it's an
> > explicit setting.
>
> Right, but it's another toggle I wish we wouldn't need. We could of course
> also make it some compile-time option, but not sure if that's really any
> better.
>
> I'd hope we find an easy way forward that doesn't require new toggles, at
> least for now ...
Right, well I agree if we can make this 0/511 thing work, let's do that.
Toggles are just 'least worst' workarounds on the assumption of the need for capping.
>
> --
> Cheers
>
> David / dhildenb
>
Thanks, Lorenzo
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 18:59 ` Lorenzo Stoakes
@ 2025-10-28 19:08 ` Lorenzo Stoakes
2025-10-29 2:09 ` Baolin Wang
` (2 subsequent siblings)
3 siblings, 0 replies; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 19:08 UTC (permalink / raw)
To: David Hildenbrand
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm, linux-doc,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Tue, Oct 28, 2025 at 06:59:31PM +0000, Lorenzo Stoakes wrote:
> On Tue, Oct 28, 2025 at 07:08:38PM +0100, David Hildenbrand wrote:
> > >
> > > The whole concept is that we have a parameter whose value is _abstracted_ and
> > > which we control what it means.
> > >
> > > I'm not sure exactly why that would now be problematic? The fundamental concept
> > > seems sound no? Last I remember of the conversation this was the case.
> >
> > The basic idea was to do something abstracted as swappiness. Turns out
> > "swappiness" is really something predictable, not something we can randomly
> > change how it behaves under the hood.
> >
> > So we'd have to find something similar for "eagerness", and that's where it
> > stops being easy.
>
> I think we shouldn't be too stuck on
>
I really am the master of the unfinished sentence :)
I was going to say we shouldn't be too stuck on the analogy to swappiness and
just maintain the broad concept that eagerness is abstracted and we get to
determine what that looks like.
But absolutely I accept that it's highly sensitive and likely embodies a great
many moving parts and we must be cautious absolutely.
This is something that can be deferred for later.
Cheers, Lorenzo
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 18:59 ` Lorenzo Stoakes
2025-10-28 19:08 ` Lorenzo Stoakes
@ 2025-10-29 2:09 ` Baolin Wang
2025-10-29 2:49 ` Nico Pache
2025-10-29 18:55 ` Lorenzo Stoakes
2025-10-29 2:47 ` Nico Pache
2025-10-31 11:12 ` David Hildenbrand
3 siblings, 2 replies; 77+ messages in thread
From: Baolin Wang @ 2025-10-29 2:09 UTC (permalink / raw)
To: Lorenzo Stoakes, David Hildenbrand
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm, linux-doc,
ziy, Liam.Howlett, ryan.roberts, dev.jain, corbet, rostedt,
mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On 2025/10/29 02:59, Lorenzo Stoakes wrote:
> On Tue, Oct 28, 2025 at 07:08:38PM +0100, David Hildenbrand wrote:
>>
>>>>> Hey Lorenzo,
>>>>>
>>>>>> I mean not to beat a dead horse re: v11 commentary, but I thought we were going
>>>>>> to implement David's idea re: the new 'eagerness' tunable, and again we're now just
>>>>>> implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
>>>>>
>>>>> I spoke to David and he said to continue forward with this series; the
>>>>> "eagerness" tunable will take some time, and may require further
>>>>> considerations/discussion.
>>>>
>>>> Right, after talking to Johannes it got clearer that what we envisioned with
>>>
>>> I'm not sure that you meant to say go ahead with the series as-is with this
>>> silent capping?
>>
>> No, "go ahead" as in "let's find some way forward that works for all and is
>> not too crazy".
>
> Right we clearly needed to discuss that further at the time but that's moot now,
> we're figuring it out now :)
>
>>
>> [...]
>>
>>>> "eagerness" would not be like swappiness, and we will really have to be
>>>> careful here. I don't know yet when I will have time to look into that.
>>>
>>> I guess I missed this part of the conversation, what do you mean?
>>
>> Johannes raised issues with that on the list and afterwards we had an
>> offline discussion about some of the details and why something unpredictable
>> is not good.
>
> Could we get these details on-list so we can discuss them? This doesn't have to
> be urgent, but I would like to have a say in this or at least be part of the
> conversation please.
>
>>
>>>
>>> The whole concept is that we have a parameter whose value is _abstracted_ and
>>> which we control what it means.
>>>
>>> I'm not sure exactly why that would now be problematic? The fundamental concept
>>> seems sound no? Last I remember of the conversation this was the case.
>>
>> The basic idea was to do something abstracted as swappiness. Turns out
>> "swappiness" is really something predictable, not something we can randomly
>> change how it behaves under the hood.
>>
>> So we'd have to find something similar for "eagerness", and that's where it
>> stops being easy.
>
> I think we shouldn't be too stuck on
>
>>
>>>
>>>>
>>>> If we want to avoid the implicit capping, I think there are the following
>>>> possible approaches
>>>>
>>>> (1) Tolerate creep for now, maybe warning if the user configures it.
>>>
>>> I mean this seems a viable option if there is pressure to land this series
>>> before we have a viable uAPI for configuring this.
>>>
>>> A part of me thinks we shouldn't rush series in for that reason though and
>>> should require that we have a proper control here.
>>>
>>> But I guess this approach is the least-worst as it leaves us with the most
>>> options moving forwards.
>>
>> Yes. There is also the alternative of respecting only 0 / 511 for mTHP
>> collapse for now as discussed in the other thread.
>
> Yes I guess let's carry that on over there.
>
> I mean this is why I said it's better to try to keep things in one thread :) but
> anyway, we've forked and can't be helped now.
>
> To be clear that was a criticism of - email development - not you.
>
> It's _extremely easy_ to have this happen because one thread naturally leads to
> a broader discussion of a given topic, whereas another has questions from
> somebody else about the same topic, to which people reply and then... you have a
> fork and it can't be helped.
>
> I guess I'm saying it'd be good if we could say 'ok let's move this to X'.
>
> But that's also broken in its own way, you can't stop people from replying in
> the other thread still and yeah. It's a limitation of this model :)
>
>>
>>>
>>>> (2) Avoid creep by counting zero-filled pages towards none_or_zero.
>>>
>>> Would this really make all that much difference?
>>
>> It solves the creep problem I think, but it's a bit nasty IMHO.
>
> Ah because you'd end up with a bunch of zeroed pages from the prior mTHP
> collapses, interesting...
>
> Scanning for that does seem a bit nasty though yes...
>
>>
>>>
>>>> (3) Have separate toggles for each THP size. Doesn't quite solve the
>>>> problem, only shifts it.
>>>
>>> Yeah I did wonder about this as an alternative solution. But of course it then
>>> makes it vague what the parent value means in respect of the individual levels,
>>> unless we have an 'inherit' mode there too (possible).
>>>
>>> It's going to be confusing though as max_ptes_none sits at the root khugepaged/
>>> level and I don't think any other parameter from khugepaged/ is exposed at
>>> individual page size levels.
>>>
>>> And of course doing this means we
>>>
>>>>
>>>> Anything else?
>>>
>>> Err... I mean I'm not sure if you missed it but I suggested an approach in the
>>> sub-thread - exposing mthp_max_ptes_none as a _READ-ONLY_ field at:
>>>
>>> /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
>>>
>>> Then we allow the capping, but simply document that we specify what the capped
>>> value will be here for mTHP.
>>
>> I did not have time to read the details on that so far.
>
> OK. It is a bit nasty, yes. The idea is to find something that allows the
> capping to work.
>
>>
>> It would be one solution forward. I dislike it because I think the whole
>> capping is an intermediate thing that can be (and likely must be, when
>> considering mTHP underused shrinking I think) solved in the future
>> differently. That's why I would prefer adding this only if there is no
>> other, simpler, way forward.
>
> Yes I agree that if we could avoid it it'd be great.
>
> Really I proposed this solution on the basis that we were somehow ok with the
> capping.
>
> If we can avoid it, that'd be ideal as it reduces complexity and 'unexpected'
> behaviour.
>
> We'll clarify on the other thread, but the 511/0 was compelling to me before as
> a simplification, and if we can have a straightforward model of how mTHP
> collapse across none/zero page PTEs behaves this is ideal.
>
> The only question is w.r.t. warnings etc. but we can handle details there.
>
>>
>>>
>>> That struck me as the simplest way of getting this series landed without
>>> necessarily violating any future eagerness which:
>>>
>>> a. Must still support khugepaged/max_ptes_none - we aren't getting away from
>>> this, it's uAPI.
>>>
>>> b. Surely must want to do different things for mTHP in eagerness, so if we're
>>> exposing some PTE value in max_ptes_none doing so in
>>> khugepaged/mthp_max_ptes_none wouldn't be problematic (note again - it's
>>> readonly so unlike max_ptes_none we don't have to worry about the other
>>> direction).
>>>
>>> HOWEVER, eagerness might want to change this behaviour per-mTHP size, in
>>> which case perhaps mthp_max_ptes_none would be problematic in that it is some
>>> kind of average.
>>>
>>> Then again we could always revert to putting this parameter as in (3) in that
>>> case, ugly but kinda viable.
>>>
>>>>
>>>> IIUC, creep is less of a problem when we have the underused shrinker
>>>> enabled: whatever we over-allocated can (unless longterm-pinned etc) get
>>>> reclaimed again.
>>>>
>>>> So maybe having underused-shrinker support for mTHP as well would be a
>>>> solution to tackle (1) later?
>>>
>>> How viable is this in the short term?
>>
>> I once started looking into it, but it will require quite some work, because
>> the lists will essentially include each and every (m)THP in the system ...
>> so I think we will need some redesign.
>
> Ack.
>
> This aligns with non-0/511 settings being non-functional for mTHP atm anyway.
>
>>
>>>
>>> Another possible solution:
>>>
>>> If mthp_max_ptes_none is not workable, we could have a toggle at, e.g.:
>>>
>>> /sys/kernel/mm/transparent_hugepage/khugepaged/mthp_cap_collapse_none
>>>
>>> As a simple boolean. If switched on then we document that it caps mTHP as
>>> per Nico's suggestion.
>>>
>>> That way we avoid the 'silent' issue I have with all this and it's an
>>> explicit setting.
>>
>> Right, but it's another toggle I wish we wouldn't need. We could of course
>> also make it some compile-time option, but not sure if that's really any
>> better.
>>
>> I'd hope we find an easy way forward that doesn't require new toggles, at
>> least for now ...
>
> Right, well I agree if we can make this 0/511 thing work, let's do that.
>
> Toggles are just 'least worst' workarounds on the assumption of the need for capping.
I finally finished reading through the discussions across multiple
threads:), and it looks like we've reached a preliminary consensus (make
0/511 work). Great and thanks!
IIUC, the strategy is: configuring it to 511 means always enabling mTHP
collapse, configuring it to 0 means collapsing mTHP only if all PTEs are
non-none/zero, and for other values, we issue a warning and prohibit
mTHP collapse (avoiding Lorenzo's concern about silently changing
max_ptes_none). Then the implementation of collapse_max_ptes_none()
should be as follows:
static int collapse_max_ptes_none(unsigned int order, bool full_scan)
{
	/* ignore max_ptes_none limits */
	if (full_scan)
		return HPAGE_PMD_NR - 1;

	if (order == HPAGE_PMD_ORDER)
		return khugepaged_max_ptes_none;

	/*
	 * To prevent creeping towards larger order collapses for mTHP
	 * collapse, we restrict khugepaged_max_ptes_none to only 511 or 0,
	 * simplifying the logic. This means:
	 * max_ptes_none == 511 -> collapse mTHP always
	 * max_ptes_none == 0   -> collapse mTHP only if all PTEs are
	 *                         non-none/zero
	 */
	if (!khugepaged_max_ptes_none ||
	    khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
		return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);

	pr_warn_once("mTHP collapse only supports khugepaged_max_ptes_none configured as 0 or %d\n",
		     HPAGE_PMD_NR - 1);
	return -EINVAL;
}
So what do you think?
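(On the caller side this presumably means __collapse_huge_page_isolate()
and friends would treat the negative return as "skip this order" - a
hypothetical sketch, not something from the series:

	int max_ptes_none = collapse_max_ptes_none(order, !cc->is_khugepaged);

	/* Unsupported max_ptes_none value for mTHP: skip this order. */
	if (max_ptes_none < 0)
		return SCAN_FAIL;

with max_ptes_none switched from unsigned int to int so it can carry the
error.)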
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 2:09 ` Baolin Wang
@ 2025-10-29 2:49 ` Nico Pache
2025-10-29 18:55 ` Lorenzo Stoakes
1 sibling, 0 replies; 77+ messages in thread
From: Nico Pache @ 2025-10-29 2:49 UTC (permalink / raw)
To: Baolin Wang
Cc: Lorenzo Stoakes, David Hildenbrand, linux-kernel,
linux-trace-kernel, linux-mm, linux-doc, ziy, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato
On Tue, Oct 28, 2025 at 8:10 PM Baolin Wang
<baolin.wang@linux.alibaba.com> wrote:
>
>
>
> On 2025/10/29 02:59, Lorenzo Stoakes wrote:
> > On Tue, Oct 28, 2025 at 07:08:38PM +0100, David Hildenbrand wrote:
> >>
> >>>>> Hey Lorenzo,
> >>>>>
> >>>>>> I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> >>>>>> to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> >>>>>> implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
> >>>>>
> >>>>> I spoke to David and he said to continue forward with this series; the
> >>>>> "eagerness" tunable will take some time, and may require further
> >>>>> considerations/discussion.
> >>>>
> >>>> Right, after talking to Johannes it got clearer that what we envisioned with
> >>>
> >>> I'm not sure that you meant to say go ahead with the series as-is with this
> >>> silent capping?
> >>
> >> No, "go ahead" as in "let's find some way forward that works for all and is
> >> not too crazy".
> >
> > Right we clearly needed to discuss that further at the time but that's moot now,
> > we're figuring it out now :)
> >
> >>
> >> [...]
> >>
> >>>> "eagerness" would not be like swappiness, and we will really have to be
> >>>> careful here. I don't know yet when I will have time to look into that.
> >>>
> >>> I guess I missed this part of the conversation, what do you mean?
> >>
> >> Johannes raised issues with that on the list and afterwards we had an
> >> offline discussion about some of the details and why something unpredictable
> >> is not good.
> >
> > Could we get these details on-list so we can discuss them? This doesn't have to
> > be urgent, but I would like to have a say in this or at least be part of the
> > conversation please.
> >
> >>
> >>>
> >>> The whole concept is that we have a parameter whose value is _abstracted_ and
> >>> which we control what it means.
> >>>
> >>> I'm not sure exactly why that would now be problematic? The fundamental concept
> >>> seems sound no? Last I remember of the conversation this was the case.
> >>
> >> The basic idea was to do something abstracted as swappiness. Turns out
> >> "swappiness" is really something predictable, not something we can randomly
> >> change how it behaves under the hood.
> >>
> >> So we'd have to find something similar for "eagerness", and that's where it
> >> stops being easy.
> >
> > I think we shouldn't be too stuck on
> >
> >>
> >>>
> >>>>
> >>>> If we want to avoid the implicit capping, I think there are the following
> >>>> possible approaches
> >>>>
> >>>> (1) Tolerate creep for now, maybe warning if the user configures it.
> >>>
> >>> I mean this seems a viable option if there is pressure to land this series
> >>> before we have a viable uAPI for configuring this.
> >>>
> >>> A part of me thinks we shouldn't rush series in for that reason though and
> >>> should require that we have a proper control here.
> >>>
> >>> But I guess this approach is the least-worst as it leaves us with the most
> >>> options moving forwards.
> >>
> >> Yes. There is also the alternative of respecting only 0 / 511 for mTHP
> >> collapse for now as discussed in the other thread.
> >
> > Yes I guess let's carry that on over there.
> >
> > I mean this is why I said it's better to try to keep things in one thread :) but
> > anyway, we've forked and can't be helped now.
> >
> > To be clear that was a criticism of - email development - not you.
> >
> > It's _extremely easy_ to have this happen because one thread naturally leads to
> > a broader discussion of a given topic, whereas another has questions from
> > somebody else about the same topic, to which people reply and then... you have a
> > fork and it can't be helped.
> >
> > I guess I'm saying it'd be good if we could say 'ok let's move this to X'.
> >
> > But that's also broken in its own way, you can't stop people from replying in
> > the other thread still and yeah. It's a limitation of this model :)
> >
> >>
> >>>
> >>>> (2) Avoid creep by counting zero-filled pages towards none_or_zero.
> >>>
> >>> Would this really make all that much difference?
> >>
> >> It solves the creep problem I think, but it's a bit nasty IMHO.
> >
> > Ah because you'd end up with a bunch of zeroed pages from the prior mTHP
> > collapses, interesting...
> >
> > Scanning for that does seem a bit nasty though yes...
> >
> >>
> >>>
> >>>> (3) Have separate toggles for each THP size. Doesn't quite solve the
> >>>> problem, only shifts it.
> >>>
> >>> Yeah I did wonder about this as an alternative solution. But of course it then
> >>> makes it vague what the parent value means in respect of the individual levels,
> >>> unless we have an 'inherit' mode there too (possible).
> >>>
> >>> It's going to be confusing though as max_ptes_none sits at the root khugepaged/
> >>> level and I don't think any other parameter from khugepaged/ is exposed at
> >>> individual page size levels.
> >>>
> >>> And of course doing this means we
> >>>
> >>>>
> >>>> Anything else?
> >>>
> >>> Err... I mean I'm not sure if you missed it but I suggested an approach in the
> >>> sub-thread - exposing mthp_max_ptes_none as a _READ-ONLY_ field at:
> >>>
> >>> /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
> >>>
> >>> Then we allow the capping, but simply document that we specify what the capped
> >>> value will be here for mTHP.
> >>
> >> I did not have time to read the details on that so far.
> >
> > OK. It is a bit nasty, yes. The idea is to find something that allows the
> > capping to work.
> >
> >>
> >> It would be one solution forward. I dislike it because I think the whole
> >> capping is an intermediate thing that can be (and likely must be, when
> >> considering mTHP underused shrinking I think) solved in the future
> >> differently. That's why I would prefer adding this only if there is no
> >> other, simpler, way forward.
> >
> > Yes I agree that if we could avoid it it'd be great.
> >
> > Really I proposed this solution on the basis that we were somehow ok with the
> > capping.
> >
> > If we can avoid it, that'd be ideal as it reduces complexity and 'unexpected'
> > behaviour.
> >
> > We'll clarify on the other thread, but the 511/0 was compelling to me before as
> > a simplification, and if we can have a straightforward model of how mTHP
> > collapse across none/zero page PTEs behaves this is ideal.
> >
> > The only question is w.r.t. warnings etc. but we can handle details there.
> >
> >>
> >>>
> >>> That struck me as the simplest way of getting this series landed without
> >>> necessarily violating any future eagerness which:
> >>>
> >>> a. Must still support khugepaged/max_ptes_none - we aren't getting away from
> >>> this, it's uAPI.
> >>>
> >>> b. Surely must want to do different things for mTHP in eagerness, so if we're
> >>> exposing some PTE value in max_ptes_none doing so in
> >>> khugepaged/mthp_max_ptes_none wouldn't be problematic (note again - it's
> >>> readonly so unlike max_ptes_none we don't have to worry about the other
> >>> direction).
> >>>
> >>> HOWEVER, eagerness might want to change this behaviour per-mTHP size, in
> >>> which case perhaps mthp_max_ptes_none would be problematic in that it is some
> >>> kind of average.
> >>>
> >>> Then again we could always revert to putting this parameter as in (3) in that
> >>> case, ugly but kinda viable.
> >>>
> >>>>
> >>>> IIUC, creep is less of a problem when we have the underused shrinker
> >>>> enabled: whatever we over-allocated can (unless longterm-pinned etc) get
> >>>> reclaimed again.
> >>>>
> >>>> So maybe having underused-shrinker support for mTHP as well would be a
> >>>> solution to tackle (1) later?
> >>>
> >>> How viable is this in the short term?
> >>
> >> I once started looking into it, but it will require quite some work, because
> >> the lists will essentially include each and every (m)THP in the system ...
> >> so I think we will need some redesign.
> >
> > Ack.
> >
> > This aligns with non-0/511 settings being non-functional for mTHP atm anyway.
> >
> >>
> >>>
> >>> Another possible solution:
> >>>
> >>> If mthp_max_ptes_none is not workable, we could have a toggle at, e.g.:
> >>>
> >>> /sys/kernel/mm/transparent_hugepage/khugepaged/mthp_cap_collapse_none
> >>>
> >>> As a simple boolean. If switched on then we document that it caps mTHP as
> >>> per Nico's suggestion.
> >>>
> >>> That way we avoid the 'silent' issue I have with all this and it's an
> >>> explicit setting.
> >>
> >> Right, but it's another toggle I wish we wouldn't need. We could of course
> >> also make it some compile-time option, but not sure if that's really any
> >> better.
> >>
> >> I'd hope we find an easy way forward that doesn't require new toggles, at
> >> least for now ...
> >
> > Right, well I agree if we can make this 0/511 thing work, let's do that.
> >
> > Toggles are just 'least worst' workarounds on the assumption of the need for capping.
>
> I finally finished reading through the discussions across multiple
> threads:), and it looks like we've reached a preliminary consensus (make
> 0/511 work). Great and thanks!
>
> IIUC, the strategy is, configuring it to 511 means always enabling mTHP
> collapse, configuring it to 0 means collapsing mTHP only if all PTEs are
> non-none/zero, and for other values, we issue a warning and prohibit
> mTHP collapse (avoid Lorenzo's concern about silently changing
> max_ptes_none). Then the implementation for collapse_max_ptes_none()
> should be as follows:
>
> static int collapse_max_ptes_none(unsigned int order, bool full_scan)
> {
> 	/* ignore max_ptes_none limits */
> 	if (full_scan)
> 		return HPAGE_PMD_NR - 1;
>
> 	if (order == HPAGE_PMD_ORDER)
> 		return khugepaged_max_ptes_none;
>
> 	/*
> 	 * To prevent creeping towards larger order collapses for mTHP
> 	 * collapse, we restrict khugepaged_max_ptes_none to only 511 or 0,
> 	 * simplifying the logic. This means:
> 	 * max_ptes_none == 511 -> collapse mTHP always
> 	 * max_ptes_none == 0   -> collapse mTHP only if all PTEs are
> 	 *                         non-none/zero
> 	 */
> 	if (!khugepaged_max_ptes_none ||
> 	    khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
> 		return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>
> 	pr_warn_once("mTHP collapse only supports khugepaged_max_ptes_none configured as 0 or %d\n",
> 		     HPAGE_PMD_NR - 1);
> 	return -EINVAL;
> }
>
> So what do you think?
Yes, I'm glad we finally came to some consensus, despite it being a
less-than-ideal solution.
Hopefully the eagerness patchset re-introduces all the lost
functionality in the future.
Cheers
-- Nico
>
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 2:09 ` Baolin Wang
2025-10-29 2:49 ` Nico Pache
@ 2025-10-29 18:55 ` Lorenzo Stoakes
2025-10-29 21:14 ` Nico Pache
1 sibling, 1 reply; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-29 18:55 UTC (permalink / raw)
To: Baolin Wang
Cc: David Hildenbrand, Nico Pache, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Wed, Oct 29, 2025 at 10:09:43AM +0800, Baolin Wang wrote:
> I finally finished reading through the discussions across multiple
> threads:), and it looks like we've reached a preliminary consensus (make
> 0/511 work). Great and thanks!
Yes we're getting there :) it's a sincere effort to try to find a way to move
forwards.
>
> IIUC, the strategy is, configuring it to 511 means always enabling mTHP
> collapse, configuring it to 0 means collapsing mTHP only if all PTEs are
> non-none/zero, and for other values, we issue a warning and prohibit mTHP
> collapse (avoid Lorenzo's concern about silently changing max_ptes_none).
> Then the implementation for collapse_max_ptes_none() should be as follows:
>
> static int collapse_max_ptes_none(unsigned int order, bool full_scan)
> {
> 	/* ignore max_ptes_none limits */
> 	if (full_scan)
> 		return HPAGE_PMD_NR - 1;
>
> 	if (order == HPAGE_PMD_ORDER)
> 		return khugepaged_max_ptes_none;
>
> 	/*
> 	 * To prevent creeping towards larger order collapses for mTHP
> 	 * collapse, we restrict khugepaged_max_ptes_none to only 511 or 0,
> 	 * simplifying the logic. This means:
> 	 * max_ptes_none == 511 -> collapse mTHP always
> 	 * max_ptes_none == 0   -> collapse mTHP only if all PTEs are
> 	 *                         non-none/zero
> 	 */
> 	if (!khugepaged_max_ptes_none ||
> 	    khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
> 		return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>
> 	pr_warn_once("mTHP collapse only supports khugepaged_max_ptes_none configured as 0 or %d\n",
> 		     HPAGE_PMD_NR - 1);
> 	return -EINVAL;
> }
>
> So what do you think?
Yeah I think something like this.
Though I'd implement it more explicitly like:
/* Zero/non-present collapse disabled. */
if (!khugepaged_max_ptes_none)
return 0;
/* Collapse the maximum number of zero/non-present PTEs. */
if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
return (1 << order) - 1;
Then we can do away with this confusing (HPAGE_PMD_ORDER - order) stuff.
A quick check in google sheets suggests my maths is ok here but do correct me if
I'm wrong :)
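To spell that check out, here is a throwaway sketch (plain userspace C, not
kernel code; the two macros are redefined locally) confirming that for
max_ptes_none == 511 the existing shift form and the explicit
(1 << order) - 1 form agree at every order:

#include <assert.h>
#include <stdio.h>

#define HPAGE_PMD_ORDER 9                      /* 4K pages: a PMD maps 2M */
#define HPAGE_PMD_NR    (1 << HPAGE_PMD_ORDER) /* 512 PTEs per PMD */

int main(void)
{
	for (unsigned int order = 0; order <= HPAGE_PMD_ORDER; order++) {
		/* shift form used in the thread vs. the explicit form */
		unsigned int shifted = (HPAGE_PMD_NR - 1) >> (HPAGE_PMD_ORDER - order);
		unsigned int direct = (1u << order) - 1;

		printf("order %u: %u vs %u\n", order, shifted, direct);
		assert(shifted == direct);
	}
	return 0;
}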
Cheers, Lorenzo
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 18:55 ` Lorenzo Stoakes
@ 2025-10-29 21:14 ` Nico Pache
2025-10-30 1:15 ` Baolin Wang
0 siblings, 1 reply; 77+ messages in thread
From: Nico Pache @ 2025-10-29 21:14 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Baolin Wang, David Hildenbrand, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Wed, Oct 29, 2025 at 12:56 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Wed, Oct 29, 2025 at 10:09:43AM +0800, Baolin Wang wrote:
> > I finally finished reading through the discussions across multiple
> > threads:), and it looks like we've reached a preliminary consensus (make
> > 0/511 work). Great and thanks!
>
> Yes we're getting there :) it's a sincere effort to try to find a way to move
> forwards.
>
> >
> > IIUC, the strategy is, configuring it to 511 means always enabling mTHP
> > collapse, configuring it to 0 means collapsing mTHP only if all PTEs are
> > non-none/zero, and for other values, we issue a warning and prohibit mTHP
> > collapse (avoid Lorenzo's concern about silently changing max_ptes_none).
> > Then the implementation for collapse_max_ptes_none() should be as follows:
> >
> > static int collapse_max_ptes_none(unsigned int order, bool full_scan)
> > {
> > 	/* ignore max_ptes_none limits */
> > 	if (full_scan)
> > 		return HPAGE_PMD_NR - 1;
> >
> > 	if (order == HPAGE_PMD_ORDER)
> > 		return khugepaged_max_ptes_none;
> >
> > 	/*
> > 	 * To prevent creeping towards larger order collapses for mTHP
> > 	 * collapse, we restrict khugepaged_max_ptes_none to only 511 or 0,
> > 	 * simplifying the logic. This means:
> > 	 *   max_ptes_none == 511 -> collapse mTHP always
> > 	 *   max_ptes_none == 0   -> collapse mTHP only if all PTEs are
> > 	 *                           non-none/zero
> > 	 */
> > 	if (!khugepaged_max_ptes_none ||
> > 	    khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
> > 		return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
> >
> > 	pr_warn_once("mTHP collapse only supports khugepaged_max_ptes_none configured as 0 or %d\n",
> > 		     HPAGE_PMD_NR - 1);
> > 	return -EINVAL;
> > }
> >
> > So what do you think?
>
> Yeah I think something like this.
>
> Though I'd implement it more explicitly like:
>
> /* Zero/non-present collapse disabled. */
> if (!khugepaged_max_ptes_none)
> return 0;
>
> /* Collapse the maximum number of zero/non-present PTEs. */
> if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
> return (1 << order) - 1;
>
> Then we can do away with this confusing (HPAGE_PMD_ORDER - order) stuff.
This looks cleaner/more explicit given the limits we are enforcing!
I'll go for something like that.
>
> A quick check in google sheets suggests my maths is ok here but do correct me if
> I'm wrong :)
LGTM!
Thanks for all the reviews! I'm glad we were able to find a solution :)
-- Nico
>
> Cheers, Lorenzo
>
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 21:14 ` Nico Pache
@ 2025-10-30 1:15 ` Baolin Wang
0 siblings, 0 replies; 77+ messages in thread
From: Baolin Wang @ 2025-10-30 1:15 UTC (permalink / raw)
To: Nico Pache, Lorenzo Stoakes
Cc: David Hildenbrand, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On 2025/10/30 05:14, Nico Pache wrote:
> On Wed, Oct 29, 2025 at 12:56 PM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
>>
>> On Wed, Oct 29, 2025 at 10:09:43AM +0800, Baolin Wang wrote:
>>> I finally finished reading through the discussions across multiple
>>> threads:), and it looks like we've reached a preliminary consensus (make
>>> 0/511 work). Great and thanks!
>>
>> Yes we're getting there :) it's a sincere effort to try to find a way to move
>> forwards.
>>
>>>
>>> IIUC, the strategy is, configuring it to 511 means always enabling mTHP
>>> collapse, configuring it to 0 means collapsing mTHP only if all PTEs are
>>> non-none/zero, and for other values, we issue a warning and prohibit mTHP
>>> collapse (avoid Lorenzo's concern about silently changing max_ptes_none).
>>> Then the implementation for collapse_max_ptes_none() should be as follows:
>>>
>>> static int collapse_max_ptes_none(unsigned int order, bool full_scan)
>>> {
>>> 	/* ignore max_ptes_none limits */
>>> 	if (full_scan)
>>> 		return HPAGE_PMD_NR - 1;
>>>
>>> 	if (order == HPAGE_PMD_ORDER)
>>> 		return khugepaged_max_ptes_none;
>>>
>>> 	/*
>>> 	 * To prevent creeping towards larger order collapses for mTHP
>>> 	 * collapse, we restrict khugepaged_max_ptes_none to only 511 or 0,
>>> 	 * simplifying the logic. This means:
>>> 	 *   max_ptes_none == 511 -> collapse mTHP always
>>> 	 *   max_ptes_none == 0   -> collapse mTHP only if all PTEs are
>>> 	 *                           non-none/zero
>>> 	 */
>>> 	if (!khugepaged_max_ptes_none ||
>>> 	    khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
>>> 		return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>>>
>>> 	pr_warn_once("mTHP collapse only supports khugepaged_max_ptes_none configured as 0 or %d\n",
>>> 		     HPAGE_PMD_NR - 1);
>>> 	return -EINVAL;
>>> }
>>>
>>> So what do you think?
>>
>> Yeah I think something like this.
>>
>> Though I'd implement it more explicitly like:
>>
>> /* Zero/non-present collapse disabled. */
>> if (!khugepaged_max_ptes_none)
>> return 0;
>>
>> /* Collapse the maximum number of zero/non-present PTEs. */
>> if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
>> return (1 << order) - 1;
>>
>> Then we can do away with this confusing (HPAGE_PMD_ORDER - order) stuff.
>
> This looks cleaner/more explicit given the limits we are enforcing!
>
> I'll go for something like that.
>
>>
>> A quick check in google sheets suggests my maths is ok here but do correct me if
>> I'm wrong :)
>
> LGTM!
LGTM. Thanks.
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 18:59 ` Lorenzo Stoakes
2025-10-28 19:08 ` Lorenzo Stoakes
2025-10-29 2:09 ` Baolin Wang
@ 2025-10-29 2:47 ` Nico Pache
2025-10-29 18:58 ` Lorenzo Stoakes
2025-10-31 11:12 ` David Hildenbrand
3 siblings, 1 reply; 77+ messages in thread
From: Nico Pache @ 2025-10-29 2:47 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: David Hildenbrand, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Tue, Oct 28, 2025 at 1:00 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Tue, Oct 28, 2025 at 07:08:38PM +0100, David Hildenbrand wrote:
> >
> > > > > Hey Lorenzo,
> > > > >
> > > > > > I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> > > > > > to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> > > > > > implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
> > > > >
> > > > > I spoke to David and he said to continue forward with this series; the
> > > > > "eagerness" tunable will take some time, and may require further
> > > > > considerations/discussion.
> > > >
> > > > Right, after talking to Johannes it got clearer that what we envisioned with
> > >
> > > I'm not sure that you meant to say go ahead with the series as-is with this
> > > silent capping?
> >
> > No, "go ahead" as in "let's find some way forward that works for all and is
> > not too crazy".
>
> Right we clearly needed to discuss that further at the time but that's moot now,
> we're figuring it out now :)
>
> >
> > [...]
> >
> > > > "eagerness" would not be like swappiness, and we will really have to be
> > > > careful here. I don't know yet when I will have time to look into that.
> > >
> > > I guess I missed this part of the conversation, what do you mean?
> >
> > Johannes raised issues with that on the list and afterwards we had an
> > offline discussion about some of the details and why something unpredictable
> > is not good.
>
> Could we get these details on-list so we can discuss them? This doesn't have to
> be urgent, but I would like to have a say in this or at least be part of the
> conversation please.
>
> >
> > >
> > > The whole concept is that we have a parameter whose value is _abstracted_ and
> > > which we control what it means.
> > >
> > > I'm not sure exactly why that would now be problematic? The fundamental concept
> > > seems sound no? Last I remember of the conversation this was the case.
> >
> > The basic idea was to do something abstracted like swappiness. Turns out
> > "swappiness" is really something predictable, not something where we can
> > randomly change how it behaves under the hood.
> >
> > So we'd have to find something similar for "eagerness", and that's where it
> > stops being easy.
>
> I think we shouldn't be too stuck on
>
> >
> > >
> > > >
> > > > If we want to avoid the implicit capping, I think there are the following
> > > > possible approaches
> > > >
> > > > (1) Tolerate creep for now, maybe warning if the user configures it.
> > >
> > > I mean this seems a viable option if there is pressure to land this series
> > > before we have a viable uAPI for configuring this.
> > >
> > > A part of me thinks we shouldn't rush series in for that reason though and
> > > should require that we have a proper control here.
> > >
> > > But I guess this approach is the least-worst as it leaves us with the most
> > > options moving forwards.
> >
> > Yes. There is also the alternative of respecting only 0 / 511 for mTHP
> > collapse for now as discussed in the other thread.
>
> Yes I guess let's carry that on over there.
>
> I mean this is why I said it's better to try to keep things in one thread :) but
> anyway, we've forked and it can't be helped now.
>
> To be clear that was a criticism of - email development - not you.
>
> It's _extremely easy_ to have this happen because one thread naturally leads to
> a broader discussion of a given topic, whereas another has questions from
> somebody else about the same topic, to which people reply and then... you have a
> fork and it can't be helped.
>
> I guess I'm saying it'd be good if we could say 'ok let's move this to X'.
>
> But that's also broken in its own way, you can't stop people from replying in
> the other thread still and yeah. It's a limitation of this model :)
>
> >
> > >
> > > > (2) Avoid creep by counting zero-filled pages towards none_or_zero.
> > >
> > > Would this really make all that much difference?
> >
> > It solves the creep problem I think, but it's a bit nasty IMHO.
>
> Ah because you'd end up with a bunch of zeroed pages from the prior mTHP
> collapses, interesting...
>
> Scanning for that does seem a bit nasty though yes...
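For what it's worth, a rough sketch of what that check could look like
(illustrative only, not something this series does):

/*
 * Illustrative only: treat a present page whose contents are all zero
 * as none/zero for the max_ptes_none accounting.
 */
static bool page_is_zero_filled(struct page *page)
{
	void *kaddr = kmap_local_page(page);
	bool zero = !memchr_inv(kaddr, 0, PAGE_SIZE);

	kunmap_local(kaddr);
	return zero;
}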
>
> >
> > >
> > > > (3) Have separate toggles for each THP size. Doesn't quite solve the
> > > > problem, only shifts it.
> > >
> > > Yeah I did wonder about this as an alternative solution. But of course it then
> > > makes it vague what the parent value means in respect of the individual levels,
> > > unless we have an 'inherit' mode there too (possible).
> > >
> > > It's going to be confusing though as max_ptes_none sits at the root khugepaged/
> > > level and I don't think any other parameter from khugepaged/ is exposed at
> > > individual page size levels.
> > >
> > > And of course doing this means we
> > >
> > > >
> > > > Anything else?
> > >
> > > Err... I mean I'm not sure if you missed it but I suggested an approach in the
> > > sub-thread - exposing mthp_max_ptes_none as a _READ-ONLY_ field at:
> > >
> > > /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
> > >
> > > Then we allow the capping, but simply document that we specify what the capped
> > > value will be here for mTHP.
> >
> > I did not have time to read the details on that so far.
>
> OK. It is a bit nasty, yes. The idea is to find something that allows the
> capping to work.
>
> >
> > It would be one solution forward. I dislike it because I think the whole
> > capping is an intermediate thing that can be (and likely must be, when
> > considering mTHP underused shrinking I think) solved in the future
> > differently. That's why I would prefer adding this only if there is no
> > other, simpler, way forward.
>
> Yes I agree that if we could avoid it it'd be great.
>
> Really I proposed this solution on the basis that we were somehow ok with the
> capping.
>
> If we can avoid that'd be ideal as it reduces complexity and 'unexpected'
> behaviour.
>
> We'll clarify on the other thread, but the 511/0 was compelling to me before as
> a simplification, and if we can have a straightforward model of how mTHP
> collapse across none/zero page PTEs behaves this is ideal.
>
> The only question is w.r.t. warnings etc. but we can handle details there.
>
> >
> > >
> > > That struck me as the simplest way of getting this series landed without
> > > necessarily violating any future eagerness which:
> > >
> > > a. Must still support khugepaged/max_ptes_none - we aren't getting away from
> > > this, it's uAPI.
> > >
> > > b. Surely must want to do different things for mTHP in eagerness, so if we're
> > > exposing some PTE value in max_ptes_none doing so in
> > > khugepaged/mthp_max_ptes_none wouldn't be problematic (note again - it's
> > > readonly so unlike max_ptes_none we don't have to worry about the other
> > > direction).
> > >
> > > HOWEVER, eagerness might want want to change this behaviour per-mTHP size, in
> > > which case perhaps mthp_max_ptes_none would be problematic in that it is some
> > > kind of average.
> > >
> > > Then again we could always revert to putting this parameter as in (3) in that
> > > case, ugly but kinda viable.
> > >
> > > >
> > > > IIUC, creep is less of a problem when we have the underused shrinker
> > > > enabled: whatever we over-allocated can (unless longterm-pinned etc) get
> > > > reclaimed again.
> > > >
> > > > So maybe having underused-shrinker support for mTHP as well would be a
> > > > solution to tackle (1) later?
> > >
> > > How viable is this in the short term?
> >
> > I once started looking into it, but it will require quite some work, because
> > the lists will essentially include each and every (m)THP in the system ...
> > so I think we will need some redesign.
>
> Ack.
>
> This aligns with non-0/511 settings being non-functional for mTHP atm anyway.
>
> >
> > >
> > > Another possible solution:
> > >
> > > If mthp_max_ptes_none is not workable, we could have a toggle at, e.g.:
> > >
> > > /sys/kernel/mm/transparent_hugepage/khugepaged/mthp_cap_collapse_none
> > >
> > > As a simple boolean. If switched on then we document that it caps mTHP as
> > > per Nico's suggestion.
> > >
> > > That way we avoid the 'silent' issue I have with all this and it's an
> > > explicit setting.
> >
> > Right, but it's another toggle I wish we wouldn't need. We could of course
> > also make it some compile-time option, but not sure if that's really any
> > better.
> >
> > I'd hope we find an easy way forward that doesn't require new toggles, at
> > least for now ...
>
> Right, well I agree if we can make this 0/511 thing work, let's do that.
Ok, great, some consensus! I will go ahead with that solution.
Just to make sure we are all on the same page: will the max_ptes_none
value be treated as 0 for anything other than PMD collapse, except in
the case of 511? Or will max_ptes_none only work for mTHP collapse when
it is 0?
static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
{
	/* ignore max_ptes_none limits */
	if (full_scan)
		return HPAGE_PMD_NR - 1;

	if (order == HPAGE_PMD_ORDER)
		return khugepaged_max_ptes_none;

	if (khugepaged_max_ptes_none != HPAGE_PMD_NR - 1)
		return 0;

	return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
}
Here's the implementation for the first approach, looks like Baolin
was able to catch up and beat me to the other solution while I was
mulling over the thread lol
Cheers,
-- Nico
>
> Toggle are just 'least worst' workarounds on assumption of the need for capping.
>
> >
> > --
> > Cheers
> >
> > David / dhildenb
> >
>
> Thanks, Lorenzo
>
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 2:47 ` Nico Pache
@ 2025-10-29 18:58 ` Lorenzo Stoakes
2025-10-29 21:23 ` Nico Pache
0 siblings, 1 reply; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-29 18:58 UTC (permalink / raw)
To: Nico Pache
Cc: David Hildenbrand, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Tue, Oct 28, 2025 at 08:47:12PM -0600, Nico Pache wrote:
> On Tue, Oct 28, 2025 at 1:00 PM Lorenzo Stoakes
> > Right, well I agree if we can make this 0/511 thing work, let's do that.
>
> Ok, great, some consensus! I will go ahead with that solution.
:) awesome.
>
> Just to make sure we are all on the same page,
I am still stabilising my understanding of the creep issue; see the thread
where David kindly + patiently goes into detail. I think I am at a broad
(pre-examining the algorithm itself) understanding of this.
>
> will the max_ptes_none value be treated as 0 for anything other than
> PMD collapse, except in the case of 511? Or will max_ptes_none only
> work for mTHP collapse when it is 0?
511 implies always collapse zero/none, 0 implies never, as I understand it.
>
> static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> {
> 	/* ignore max_ptes_none limits */
> 	if (full_scan)
> 		return HPAGE_PMD_NR - 1;
>
> 	if (order == HPAGE_PMD_ORDER)
> 		return khugepaged_max_ptes_none;
>
> 	if (khugepaged_max_ptes_none != HPAGE_PMD_NR - 1)
> 		return 0;
>
> 	return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
> }
>
> Here's the implementation for the first approach, looks like Baolin
> was able to catch up and beat me to the other solution while I was
> mulling over the thread lol
Broadly looks similar to Baolin's, I made some suggestions over there
though!
>
> Cheers,
> -- Nico
Thanks, Lorenzo
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 18:58 ` Lorenzo Stoakes
@ 2025-10-29 21:23 ` Nico Pache
2025-10-30 10:15 ` Lorenzo Stoakes
0 siblings, 1 reply; 77+ messages in thread
From: Nico Pache @ 2025-10-29 21:23 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: David Hildenbrand, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Wed, Oct 29, 2025 at 12:59 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Tue, Oct 28, 2025 at 08:47:12PM -0600, Nico Pache wrote:
> > On Tue, Oct 28, 2025 at 1:00 PM Lorenzo Stoakes
> > > Right, well I agree if we can make this 0/511 thing work, let's do that.
> >
> > Ok, great, some consensus! I will go ahead with that solution.
>
> :) awesome.
>
> >
> > Just to make sure we are all on the same page,
>
> I am still stabilising my understanding of the creep issue; see the thread
> where David kindly + patiently goes into detail. I think I am at a broad
> (pre-examining the algorithm itself) understanding of this.
I added some details of the creep issue in my other replies, hopefully
that also helps!
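Briefly, a worked instance of the creep (numbers mine, assuming 4K pages so
HPAGE_PMD_NR == 512): with max_ptes_none == 384, above HPAGE_PMD_NR / 2,
the scaled order-8 limit is 384 >> 1 == 192, so a 256-PTE region with only
64 present pages qualifies for a 1M collapse and comes out fully populated.
On the next scan the enclosing PMD range now has at least 256 present PTEs,
and its 256 none PTEs sit under the 384 limit, so it collapses again to
order 9. Two scans turn 64 present pages into a full 2M THP, which is the
feedback loop the HPAGE_PMD_NR / 2 - 1 cap (and now the 0/511 restriction)
is meant to cut off.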
>
> >
> > will the max_ptes_none value be treated as 0 for anything other than
> > PMD collapse, except in the case of 511? Or will max_ptes_none only
> > work for mTHP collapse when it is 0?
>
> 511 implies always collapse zero/none, 0 implies never, as I understand it.
0 implies only collapse if a given mTHP size is fully occupied by
present PTEs. Since we start at PMD and work our way down we will
always end up with a PMD range of fully occupied mTHPs, potentially of
all different sizes.
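As a concrete walk-through (numbers mine): take a PMD range where PTEs
0-255 are present plus an aligned 32-PTE run at 256-287, with
max_ptes_none == 0. The order-9 attempt fails (224 none PTEs), the first
order-8 half (0-255) is fully present and collapses to a 1M mTHP, and the
scan keeps halving the remainder until the fully present order-5 block
(256-287) collapses to 128K. No none/zero PTEs are ever absorbed, yet
every fully occupied aligned block still gets collapsed.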
>
> >
> > static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> > {
> > 	/* ignore max_ptes_none limits */
> > 	if (full_scan)
> > 		return HPAGE_PMD_NR - 1;
> >
> > 	if (order == HPAGE_PMD_ORDER)
> > 		return khugepaged_max_ptes_none;
> >
> > 	if (khugepaged_max_ptes_none != HPAGE_PMD_NR - 1)
> > 		return 0;
> >
> > 	return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
> > }
> >
> > Here's the implementation for the first approach, looks like Baolin
> > was able to catch up and beat me to the other solution while I was
> > mulling over the thread lol
>
> Broadly looks similar to Baolin's, I made some suggestions over there
> though!
Thanks! They are both based on my current collapse_max_ptes_none! Just
a slight difference in behavior surrounding the two suggested
solutions by David.
I will still have to implement the logic for not attempting mTHP
collapses if it is any intermediate value (i.e. the function returns
-EINVAL).
-- Nico
>
> >
> > Cheers,
> > -- Nico
>
> Thanks, Lorenzo
>
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 21:23 ` Nico Pache
@ 2025-10-30 10:15 ` Lorenzo Stoakes
0 siblings, 0 replies; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-30 10:15 UTC (permalink / raw)
To: Nico Pache
Cc: David Hildenbrand, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Wed, Oct 29, 2025 at 03:23:27PM -0600, Nico Pache wrote:
> On Wed, Oct 29, 2025 at 12:59 PM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> >
> > On Tue, Oct 28, 2025 at 08:47:12PM -0600, Nico Pache wrote:
> > > On Tue, Oct 28, 2025 at 1:00 PM Lorenzo Stoakes
> > > > Right, well I agree if we can make this 0/511 thing work, let's do that.
> > >
> > > Ok, great, some consensus! I will go ahead with that solution.
> >
> > :) awesome.
> >
> > >
> > > Just to make sure we are all on the same page,
> >
> > I am still stabilising my understanding of the creep issue; see the thread
> > where David kindly + patiently goes into detail. I think I am at a broad
> > (pre-examining the algorithm itself) understanding of this.
>
> I added some details of the creep issue in my other replies, hopefully
> that also helps!
>
> >
> > >
> > > will the max_ptes_none value be treated as 0 for anything other than
> > > PMD collapse, except in the case of 511? Or will max_ptes_none only
> > > work for mTHP collapse when it is 0?
> >
> > 511 implies always collapse zero/none, 0 implies never, as I understand it.
>
> 0 implies only collapse if a given mTHP size is fully occupied by
> present PTEs. Since we start at PMD and work our way down we will
> always end up with a PMD range of fully occupied mTHPs, potentially of
> all different sizes.
Yeah this was my understanding. I mean terminology is tricky here (+ I am
probably not being entirely clear tbh), so I mean less '0 means no
collapse' and more '0 means no collapse of zero/none', which of course can
still allow collapse of present PTEs (within the same VMA).
>
> >
> > >
> > > static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> > > {
> > > 	/* ignore max_ptes_none limits */
> > > 	if (full_scan)
> > > 		return HPAGE_PMD_NR - 1;
> > >
> > > 	if (order == HPAGE_PMD_ORDER)
> > > 		return khugepaged_max_ptes_none;
> > >
> > > 	if (khugepaged_max_ptes_none != HPAGE_PMD_NR - 1)
> > > 		return 0;
> > >
> > > 	return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
> > > }
> > >
> > > Here's the implementation for the first approach, looks like Baolin
> > > was able to catch up and beat me to the other solution while I was
> > > mulling over the thread lol
> >
> > Broadly looks similar to Baolin's, I made some suggestions over there
> > though!
>
> Thanks! They are both based on my current collapse_max_ptes_none! Just
> a slight difference in behavior surrounding the two suggested
> solutions by David.
Yes which is convenient as it's less delta for you!
>
> I will still have to implement the logic for not attempting mTHP
> collapses if it is any intermediate value (i.e. the function returns
> -EINVAL).
Ack
>
> -- Nico
>
> >
> > >
> > > Cheers,
> > > -- Nico
> >
> > Thanks, Lorenzo
> >
>
Cheers, Lorenzo
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 18:59 ` Lorenzo Stoakes
` (2 preceding siblings ...)
2025-10-29 2:47 ` Nico Pache
@ 2025-10-31 11:12 ` David Hildenbrand
3 siblings, 0 replies; 77+ messages in thread
From: David Hildenbrand @ 2025-10-31 11:12 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm, linux-doc,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
>>>> "eagerness" would not be like swappiness, and we will really have to be
>>>> careful here. I don't know yet when I will have time to look into that.
>>>
>>> I guess I missed this part of the conversation, what do you mean?
>>
>> Johannes raised issues with that on the list and afterwards we had an
>> offline discussion about some of the details and why something unpredictable
>> is not good.
>
> Could we get these details on-list so we can discuss them? This doesn't have to
> be urgent, but I would like to have a say in this or at least be part of the
> conversation please.
Sorry, I only now found time to reply on this point. Johannes raised the
point in [1], and afterwards we went a bit into detail in an off-list
discussion.
In essence, I think he is right that this is something we have to be very
careful about. So it turned out to be something that will take a lot more
time+effort on my side than I originally thought, making it not
feasible in the short term given how far behind I already am on so many
other things.
So I concluded that it's probably best to have such an effort be
independent of this series. And in some ways it is either way, because
max_ptes_none is just a horrible interface given the values are
architecture dependent.
I'll be happy if we can focus in this series on the bare minimum initial
support, and avoid any magic (scaling / capping) as it all turned out to
be much more tricky (interaction with the deferred shrinker ...) than
most of us initially thought.
But I think we're already on the same page here, just wanted to share a
bit more detail on the max_ptes_none vs. eagerness idea.
[1] https://lkml.kernel.org/r/20250915134359.GA827803@cmpxchg.org
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 13:36 ` Nico Pache
2025-10-28 14:15 ` David Hildenbrand
@ 2025-10-28 16:57 ` Lorenzo Stoakes
2025-10-28 17:49 ` David Hildenbrand
1 sibling, 1 reply; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 16:57 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Tue, Oct 28, 2025 at 07:36:55AM -0600, Nico Pache wrote:
> On Mon, Oct 27, 2025 at 11:54 AM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> >
> > On Wed, Oct 22, 2025 at 12:37:08PM -0600, Nico Pache wrote:
> > > The current mechanism for determining mTHP collapse scales the
> > > khugepaged_max_ptes_none value based on the target order. This
> > > introduces an undesirable feedback loop, or "creep", when max_ptes_none
> > > is set to a value greater than HPAGE_PMD_NR / 2.
> > >
> > > With this configuration, a successful collapse to order N will populate
> > > enough pages to satisfy the collapse condition on order N+1 on the next
> > > scan. This leads to unnecessary work and memory churn.
> > >
> > > To fix this issue introduce a helper function that caps the max_ptes_none
> > > to HPAGE_PMD_NR / 2 - 1 (255 on 4k page size). The function also scales
> > > the max_ptes_none number by the (PMD_ORDER - target collapse order).
> > >
> > > The limits can be ignored by passing full_scan=true, this is useful for
> > > madvise_collapse (which ignores limits), or in the case of
> > > collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
> > > collapse is available.
> > >
> > > Signed-off-by: Nico Pache <npache@redhat.com>
> > > ---
> > > mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
> > > 1 file changed, 34 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > index 4ccebf5dda97..286c3a7afdee 100644
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
> > > wake_up_interruptible(&khugepaged_wait);
> > > }
> > >
> > > +/**
> > > + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
> > > + * @order: The folio order being collapsed to
> > > + * @full_scan: Whether this is a full scan (ignore limits)
> > > + *
> > > + * For madvise-triggered collapses (full_scan=true), all limits are bypassed
> > > + * and allow up to HPAGE_PMD_NR - 1 empty PTEs.
> > > + *
> > > + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
> > > + * khugepaged_max_ptes_none value.
> > > + *
> > > + * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
> > > + * order, but caps it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
> > > + *
> > > + * Return: Maximum number of empty PTEs allowed for the collapse operation
> > > + */
> > > +static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> > > +{
> > > + unsigned int max_ptes_none;
> > > +
> > > + /* ignore max_ptes_none limits */
> > > + if (full_scan)
> > > + return HPAGE_PMD_NR - 1;
> > > +
> > > + if (order == HPAGE_PMD_ORDER)
> > > + return khugepaged_max_ptes_none;
> > > +
> > > + max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
> >
>
> Hey Lorenzo,
>
> > I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> > to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> > implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
>
> I spoke to David and he said to continue forward with this series; the
> "eagerness" tunable will take some time, and may require further
> considerations/discussion.
It would be good to communicate this in the patch, I wasn't aware he had said go
ahead with it. Maybe I missed the mail.
Also others might not be aware. When you're explicitly ignoring prior
review from 2 versions ago you really do need to spell out why, at least for
civility's sake.
Apologies if there was communication I've forgotten about/missed. But
either way please can we very explicitly communicate these things.
>
> >
> > I'm still really quite uncomfortable with us silently capping this value.
> >
> > If we're putting forward theoretical ideas that are to be later built upon, this
> > series should be an RFC.
> >
> > But if we really intend to silently ignore user input the problem is that then
> > becomes established uAPI.
> >
> > I think it's _sensible_ to avoid this mTHP escalation problem, but the issue is
> > visibility I think.
> >
> > I think people are going to find it odd that you set it to something, but then
> > get something else.
>
> The alternative solution is to not support max_ptes_none for mTHP
> collapse and not allow none/zero pages. This is essentially "capping"
> the value too.
No that alternative equally _silently_ ignores the user-specified tunable,
which is my objection.
The problem you have here is max_ptes_none _defaults_ to a value that
violates the cap for mTHP (511).
So neither solution is workable.
>
> >
> > As an alternative we could have a new sysfs field:
> >
> > /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
> >
> > That shows the cap clearly.
> >
> > In fact, it could be read-only... and just expose it to the user. That reduces
> > complexity.
>
> I agree with Baolin here; adding another tunable will only increase
> the complexity for our future goals, and also provides needless
> insight into the internals when they can not be customized.
We already have needless insight into internals with max_ptes_none, which we can
never, ever remove due to uAPI, so that ship has sailed I'm afraid.
I don't personally think adding a read-only view of this data really makes
it that much worse.
Also if we do go ahead with eagerness, I expect we are going to want to
have different max_ptes_none values for mTHP/non-mTHP.
We _will_ need to convert between max_ptes_none and eagerness in some way
(though when eagerness comes along, we can start having 'detent' values,
that is if a user specifies max_ptes_none of 237 we could change it to 128
for instance) and as a result show eagerness _in terms of_ max_ptes_none.
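Purely as a hypothetical sketch (values and names invented here, nothing
agreed), the detents could be as dumb as a lookup table:

/*
 * Hypothetical detents mapping an abstract eagerness level (the array
 * index) onto a concrete max_ptes_none value. None of these numbers
 * come from the series.
 */
static const unsigned int eagerness_detents[] = {
	0,			/* never collapse none/zero PTEs */
	64,
	128,
	256,
	HPAGE_PMD_NR - 1,	/* always collapse */
};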
Since we _have_ to do this for uAPI reasons, it doesn't seem really that
harmful or adding to complexity to do the equivalent for a _read-only_
field for mTHP.
AFAIC this patch right now is not upstreamable for the simple reason of
violating user expectation (even if that expectation might be silly) and
_silently_ updating max_ptes_none for mTHP.
So this suggestion was designed to try to get us towards something
upstreamable.
So it's not a case of 'sorry I don't like that we can't do it' + we go
ahead with things as they are, it's a case of - we really need to find a
way to do this not-silently or AFAICT, the series is blocked on this until
this is resolved.
Perhaps we should have discussed 'what to do for v12' more on-list and
could have avoided this ahead of time...
Thanks, Lorenzo
>
> Cheers,
> -- Nico
>
> >
> > We can then bring in eagerness later and have the same situation of
> > max_ptes_none being a parameter that exists (plus this additional read-only
> > parameter).
> >
> > > +
> > > + return max_ptes_none >> (HPAGE_PMD_ORDER - order);
> > > +
> > > +}
> > > +
> > > void khugepaged_enter_vma(struct vm_area_struct *vma,
> > > vm_flags_t vm_flags)
> > > {
> > > @@ -546,7 +579,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > > pte_t *_pte;
> > > int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
> > > const unsigned long nr_pages = 1UL << order;
> > > - int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
> > > + int max_ptes_none = collapse_max_ptes_none(order, !cc->is_khugepaged);
> > >
> > > for (_pte = pte; _pte < pte + nr_pages;
> > > _pte++, addr += PAGE_SIZE) {
> > > --
> > > 2.51.0
> > >
> >
>
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 16:57 ` Lorenzo Stoakes
@ 2025-10-28 17:49 ` David Hildenbrand
2025-10-28 17:59 ` Lorenzo Stoakes
0 siblings, 1 reply; 77+ messages in thread
From: David Hildenbrand @ 2025-10-28 17:49 UTC (permalink / raw)
To: Lorenzo Stoakes, Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, ziy,
baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
>> Hey Lorenzo,
>>
>>> I mean not to beat a dead horse re: v11 commentary, but I thought we were going
>>> to implement David's idea re: the new 'eagerness' tunable, and again we're now just
>>> implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
>>
>> I spoke to David and he said to continue forward with this series; the
>> "eagerness" tunable will take some time, and may require further
>> considerations/discussion.
>
> It would be good to communicate this in the patch, I wasn't aware he had said go
> ahead with it. Maybe I missed the mail.
Just to clarify: yes, I think we should find a way to move forward with
this series without an eagerness toggle.
That doesn't imply that we'll be using the capping as proposed here (I
hate it, it's just tricky to work around it for now).
And ideally, we can do that without any temporary tunables, because I'm
sure it is a problem we can solve internally long-term.
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 17:49 ` David Hildenbrand
@ 2025-10-28 17:59 ` Lorenzo Stoakes
0 siblings, 0 replies; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 17:59 UTC (permalink / raw)
To: David Hildenbrand
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm, linux-doc,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Tue, Oct 28, 2025 at 06:49:48PM +0100, David Hildenbrand wrote:
> > > Hey Lorenzo,
> > >
> > > > I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> > > > to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> > > > implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
> > >
> > > I spoke to David and he said to continue forward with this series; the
> > > "eagerness" tunable will take some time, and may require further
> > > considerations/discussion.
> >
> > It would be good to communicate this in the patch, I wasn't aware he had said go
> > ahead with it. Maybe I missed the mail.
>
> Just to clarify: yes, I think we should find a way to move forward with this
> series without an eagerness toggle.
OK, let's please communicate this clearly in future. Maybe I missed the comms on
that.
>
> That doesn't imply that we'll be using the capping as proposed here (I hate
> it, it's just tricky to work around it for now).
OK well this is what I thought, that you hadn't meant that we should go ahead
with the logic completely unaltered from that which was explicitly pushed back
on in v10 I think.
We obviously need to figure out a way forward on this so let's get that
done as quickly as we can.
>
> And ideally, we can do that without any temporary tunables, because I'm sure
> it is a problem we can solve internally long-term.
I went into great detail replying on the relevant thread about this; let's
have that discussion there for sanity's sake.
>
> --
> Cheers
>
> David / dhildenb
>
Thanks, Lorenzo
^ permalink raw reply [flat|nested] 77+ messages in thread
* [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (5 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-27 3:25 ` Baolin Wang
2025-11-06 18:14 ` Lorenzo Stoakes
2025-10-22 18:37 ` [PATCH v12 mm-new 08/15] khugepaged: skip collapsing mTHP to smaller orders Nico Pache
` (8 subsequent siblings)
15 siblings, 2 replies; 77+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato
Pass an order and offset to collapse_huge_page to support collapsing anon
memory to arbitrary orders within a PMD. order indicates what mTHP size we
are attempting to collapse to, and offset indicates where in the PMD to
start the collapse attempt.
For non-PMD collapse we must leave the anon VMA write locked until after
we collapse the mTHP-- in the PMD case all the pages are isolated, but in
the mTHP case this is not true, and we must keep the lock to prevent
changes to the VMA from occurring.
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 108 ++++++++++++++++++++++++++++++------------------
1 file changed, 67 insertions(+), 41 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 286c3a7afdee..75e7ebdccc36 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1142,43 +1142,50 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
return SCAN_SUCCEED;
}
-static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
- int referenced, int unmapped,
- struct collapse_control *cc)
+static int collapse_huge_page(struct mm_struct *mm, unsigned long pmd_address,
+ int referenced, int unmapped, struct collapse_control *cc,
+ bool *mmap_locked, unsigned int order, unsigned long offset)
{
LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
- pte_t *pte;
+ pte_t *pte = NULL, mthp_pte;
pgtable_t pgtable;
struct folio *folio;
spinlock_t *pmd_ptl, *pte_ptl;
int result = SCAN_FAIL;
struct vm_area_struct *vma;
struct mmu_notifier_range range;
+ bool anon_vma_locked = false;
+ const unsigned long nr_pages = 1UL << order;
+ unsigned long mthp_address = pmd_address + offset * PAGE_SIZE;
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(pmd_address & ~HPAGE_PMD_MASK);
/*
* Before allocating the hugepage, release the mmap_lock read lock.
* The allocation can take potentially a long time if it involves
* sync compaction, and we do not need to hold the mmap_lock during
* that. We will recheck the vma after taking it again in write mode.
+ * If collapsing mTHPs we may have already released the read_lock.
*/
- mmap_read_unlock(mm);
+ if (*mmap_locked) {
+ mmap_read_unlock(mm);
+ *mmap_locked = false;
+ }
- result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
+ result = alloc_charge_folio(&folio, mm, cc, order);
if (result != SCAN_SUCCEED)
goto out_nolock;
mmap_read_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
- HPAGE_PMD_ORDER);
+ *mmap_locked = true;
+ result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order);
if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
}
- result = find_pmd_or_thp_or_none(mm, address, &pmd);
+ result = find_pmd_or_thp_or_none(mm, pmd_address, &pmd);
if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
@@ -1190,13 +1197,14 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
* released when it fails. So we jump out_nolock directly in
* that case. Continuing to collapse causes inconsistency.
*/
- result = __collapse_huge_page_swapin(mm, vma, address, pmd,
- referenced, HPAGE_PMD_ORDER);
+ result = __collapse_huge_page_swapin(mm, vma, mthp_address, pmd,
+ referenced, order);
if (result != SCAN_SUCCEED)
goto out_nolock;
}
mmap_read_unlock(mm);
+ *mmap_locked = false;
/*
* Prevent all access to pagetables with the exception of
* gup_fast later handled by the ptep_clear_flush and the VM
@@ -1206,20 +1214,20 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
* mmap_lock.
*/
mmap_write_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
- HPAGE_PMD_ORDER);
+ result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order);
if (result != SCAN_SUCCEED)
goto out_up_write;
/* check if the pmd is still valid */
vma_start_write(vma);
- result = check_pmd_still_valid(mm, address, pmd);
+ result = check_pmd_still_valid(mm, pmd_address, pmd);
if (result != SCAN_SUCCEED)
goto out_up_write;
anon_vma_lock_write(vma->anon_vma);
+ anon_vma_locked = true;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
- address + HPAGE_PMD_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, mthp_address,
+ mthp_address + (PAGE_SIZE << order));
mmu_notifier_invalidate_range_start(&range);
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
@@ -1231,24 +1239,21 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
* Parallel GUP-fast is fine since GUP-fast will back off when
* it detects PMD is changed.
*/
- _pmd = pmdp_collapse_flush(vma, address, pmd);
+ _pmd = pmdp_collapse_flush(vma, pmd_address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
tlb_remove_table_sync_one();
- pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
+ pte = pte_offset_map_lock(mm, &_pmd, mthp_address, &pte_ptl);
if (pte) {
- result = __collapse_huge_page_isolate(vma, address, pte, cc,
- HPAGE_PMD_ORDER,
- &compound_pagelist);
+ result = __collapse_huge_page_isolate(vma, mthp_address, pte, cc,
+ order, &compound_pagelist);
spin_unlock(pte_ptl);
} else {
result = SCAN_PMD_NULL;
}
if (unlikely(result != SCAN_SUCCEED)) {
- if (pte)
- pte_unmap(pte);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
/*
@@ -1258,21 +1263,21 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
*/
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
- anon_vma_unlock_write(vma->anon_vma);
goto out_up_write;
}
/*
- * All pages are isolated and locked so anon_vma rmap
- * can't run anymore.
+ * For PMD collapse all pages are isolated and locked so anon_vma
+ * rmap can't run anymore. For mTHP collapse we must hold the lock
*/
- anon_vma_unlock_write(vma->anon_vma);
+ if (order == HPAGE_PMD_ORDER) {
+ anon_vma_unlock_write(vma->anon_vma);
+ anon_vma_locked = false;
+ }
result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
- vma, address, pte_ptl,
- HPAGE_PMD_ORDER,
- &compound_pagelist);
- pte_unmap(pte);
+ vma, mthp_address, pte_ptl,
+ order, &compound_pagelist);
if (unlikely(result != SCAN_SUCCEED))
goto out_up_write;
@@ -1282,20 +1287,42 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
* write.
*/
__folio_mark_uptodate(folio);
- pgtable = pmd_pgtable(_pmd);
+ if (order == HPAGE_PMD_ORDER) {
+ pgtable = pmd_pgtable(_pmd);
- spin_lock(pmd_ptl);
- BUG_ON(!pmd_none(*pmd));
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
- map_anon_folio_pmd_nopf(folio, pmd, vma, address);
- spin_unlock(pmd_ptl);
+ spin_lock(pmd_ptl);
+ WARN_ON_ONCE(!pmd_none(*pmd));
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_address);
+ spin_unlock(pmd_ptl);
+ } else { /* mTHP collapse */
+ mthp_pte = mk_pte(folio_page(folio, 0), vma->vm_page_prot);
+ mthp_pte = maybe_mkwrite(pte_mkdirty(mthp_pte), vma);
+
+ spin_lock(pmd_ptl);
+ WARN_ON_ONCE(!pmd_none(*pmd));
+ folio_ref_add(folio, nr_pages - 1);
+ folio_add_new_anon_rmap(folio, vma, mthp_address, RMAP_EXCLUSIVE);
+ folio_add_lru_vma(folio, vma);
+ set_ptes(vma->vm_mm, mthp_address, pte, mthp_pte, nr_pages);
+ update_mmu_cache_range(NULL, vma, mthp_address, pte, nr_pages);
+
+ smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
+ pmd_populate(mm, pmd, pmd_pgtable(_pmd));
+ spin_unlock(pmd_ptl);
+ }
folio = NULL;
result = SCAN_SUCCEED;
out_up_write:
+ if (anon_vma_locked)
+ anon_vma_unlock_write(vma->anon_vma);
+ if (pte)
+ pte_unmap(pte);
mmap_write_unlock(mm);
out_nolock:
+ *mmap_locked = false;
if (folio)
folio_put(folio);
trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
@@ -1463,9 +1490,8 @@ static int collapse_scan_pmd(struct mm_struct *mm,
pte_unmap_unlock(pte, ptl);
if (result == SCAN_SUCCEED) {
result = collapse_huge_page(mm, start_addr, referenced,
- unmapped, cc);
- /* collapse_huge_page will return with the mmap_lock released */
- *mmap_locked = false;
+ unmapped, cc, mmap_locked,
+ HPAGE_PMD_ORDER, 0);
}
out:
trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
--
2.51.0
^ permalink raw reply related [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse
2025-10-22 18:37 ` [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse Nico Pache
@ 2025-10-27 3:25 ` Baolin Wang
2025-11-06 18:14 ` Lorenzo Stoakes
1 sibling, 0 replies; 77+ messages in thread
From: Baolin Wang @ 2025-10-27 3:25 UTC (permalink / raw)
To: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, lorenzo.stoakes, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On 2025/10/23 02:37, Nico Pache wrote:
> Pass an order and offset to collapse_huge_page to support collapsing anon
> memory to arbitrary orders within a PMD. order indicates what mTHP size we
> are attempting to collapse to, and offset indicates were in the PMD to
> start the collapse attempt.
>
> For non-PMD collapse we must leave the anon VMA write locked until after
> we collapse the mTHP-- in the PMD case all the pages are isolated, but in
> the mTHP case this is not true, and we must keep the lock to prevent
> changes to the VMA from occurring.
>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
LGTM. And passed my mTHP collapse testing cases. So:
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse
2025-10-22 18:37 ` [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse Nico Pache
2025-10-27 3:25 ` Baolin Wang
@ 2025-11-06 18:14 ` Lorenzo Stoakes
2025-11-07 3:09 ` Dev Jain
2025-11-07 19:33 ` Nico Pache
1 sibling, 2 replies; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-11-06 18:14 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Wed, Oct 22, 2025 at 12:37:09PM -0600, Nico Pache wrote:
> Pass an order and offset to collapse_huge_page to support collapsing anon
> memory to arbitrary orders within a PMD. order indicates what mTHP size we
> are attempting to collapse to, and offset indicates where in the PMD to
> start the collapse attempt.
>
> For non-PMD collapse we must leave the anon VMA write locked until after
> we collapse the mTHP-- in the PMD case all the pages are isolated, but in
NIT but is this -- a typo?
> the mTHP case this is not true, and we must keep the lock to prevent
> changes to the VMA from occurring.
>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> mm/khugepaged.c | 108 ++++++++++++++++++++++++++++++------------------
> 1 file changed, 67 insertions(+), 41 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 286c3a7afdee..75e7ebdccc36 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1142,43 +1142,50 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> return SCAN_SUCCEED;
> }
>
> -static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> - int referenced, int unmapped,
> - struct collapse_control *cc)
> +static int collapse_huge_page(struct mm_struct *mm, unsigned long pmd_address,
Presumably pmd_address is the PMD-aligned address?
> + int referenced, int unmapped, struct collapse_control *cc,
> + bool *mmap_locked, unsigned int order, unsigned long offset)
It'd be nice to pass through a helper struct at this point given how many
params there are, but perhaps we can deal with that in a follow up series.
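Something like the below (entirely hypothetical, field names invented) is
what I have in mind:

/* Hypothetical: bundle the collapse parameters to trim the argument list. */
struct collapse_request {
	unsigned long pmd_address;	/* PMD-aligned base address */
	unsigned long offset;		/* PTE offset of the mTHP within the PMD */
	unsigned int order;		/* target collapse order */
	int referenced;
	int unmapped;
};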
If PMD address is the PMD-aligned address, and mthp_address = pmd_address +
offset * PAGE_SIZE, couldn't we just pass in the mthp address and get the
PMD address by aligning down to PMD size and reduce the number of args by
1?
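i.e. derive it internally with something like (untested):

	unsigned long pmd_address = mthp_address & HPAGE_PMD_MASK;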
> {
> LIST_HEAD(compound_pagelist);
> pmd_t *pmd, _pmd;
> - pte_t *pte;
> + pte_t *pte = NULL, mthp_pte;
mthp_pte is only used in a single if () branch and can be declared there
AFAICT?
> pgtable_t pgtable;
> struct folio *folio;
> spinlock_t *pmd_ptl, *pte_ptl;
> int result = SCAN_FAIL;
> struct vm_area_struct *vma;
> struct mmu_notifier_range range;
> + bool anon_vma_locked = false;
> + const unsigned long nr_pages = 1UL << order;
> + unsigned long mthp_address = pmd_address + offset * PAGE_SIZE;
Do we ever update this? If not we can const-ify.
>
> - VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> + VM_BUG_ON(pmd_address & ~HPAGE_PMD_MASK);
NIT: Be nice to convert this to a VM_WARN_ON_ONCE(), as VM_BUG_ON() is not
right here.
>
> /*
> * Before allocating the hugepage, release the mmap_lock read lock.
> * The allocation can take potentially a long time if it involves
> * sync compaction, and we do not need to hold the mmap_lock during
> * that. We will recheck the vma after taking it again in write mode.
> + * If collapsing mTHPs we may have already released the read_lock.
> */
> - mmap_read_unlock(mm);
> + if (*mmap_locked) {
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
> + }
>
> - result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> + result = alloc_charge_folio(&folio, mm, cc, order);
> if (result != SCAN_SUCCEED)
> goto out_nolock;
>
> mmap_read_lock(mm);
> - result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> - HPAGE_PMD_ORDER);
> + *mmap_locked = true;
> + result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order);
> if (result != SCAN_SUCCEED) {
> mmap_read_unlock(mm);
I don't really love the semantics of 'sometimes we set *mmap_locked false
when we unlock, sometimes we rely on out_nolock doing it'.
Let's just set it false when we unlock and VM_WARN_ON_ONCE(*mmap_locked) in
out_nolock.
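I.e. (sketch):

	if (result != SCAN_SUCCEED) {
		mmap_read_unlock(mm);
		*mmap_locked = false;
		goto out_nolock;
	}
	...
out_nolock:
	VM_WARN_ON_ONCE(*mmap_locked);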
> goto out_nolock;
> }
>
> - result = find_pmd_or_thp_or_none(mm, address, &pmd);
> + result = find_pmd_or_thp_or_none(mm, pmd_address, &pmd);
> if (result != SCAN_SUCCEED) {
> mmap_read_unlock(mm);
> goto out_nolock;
> @@ -1190,13 +1197,14 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> * released when it fails. So we jump out_nolock directly in
> * that case. Continuing to collapse causes inconsistency.
> */
> - result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> - referenced, HPAGE_PMD_ORDER);
> + result = __collapse_huge_page_swapin(mm, vma, mthp_address, pmd,
> + referenced, order);
> if (result != SCAN_SUCCEED)
> goto out_nolock;
> }
>
> mmap_read_unlock(mm);
> + *mmap_locked = false;
> /*
> * Prevent all access to pagetables with the exception of
> * gup_fast later handled by the ptep_clear_flush and the VM
> @@ -1206,20 +1214,20 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> * mmap_lock.
> */
> mmap_write_lock(mm);
> - result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> - HPAGE_PMD_ORDER);
> + result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order);
> if (result != SCAN_SUCCEED)
> goto out_up_write;
> /* check if the pmd is still valid */
> vma_start_write(vma);
> - result = check_pmd_still_valid(mm, address, pmd);
> + result = check_pmd_still_valid(mm, pmd_address, pmd);
> if (result != SCAN_SUCCEED)
> goto out_up_write;
>
> anon_vma_lock_write(vma->anon_vma);
> + anon_vma_locked = true;
>
> - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
> - address + HPAGE_PMD_SIZE);
> + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, mthp_address,
> + mthp_address + (PAGE_SIZE << order));
> mmu_notifier_invalidate_range_start(&range);
>
> pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
> @@ -1231,24 +1239,21 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> * Parallel GUP-fast is fine since GUP-fast will back off when
> * it detects PMD is changed.
> */
> - _pmd = pmdp_collapse_flush(vma, address, pmd);
> + _pmd = pmdp_collapse_flush(vma, pmd_address, pmd);
Not your fault, but I so hate this _p** convention. One for a follow-up I
suppose.
> spin_unlock(pmd_ptl);
> mmu_notifier_invalidate_range_end(&range);
> tlb_remove_table_sync_one();
>
> - pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> + pte = pte_offset_map_lock(mm, &_pmd, mthp_address, &pte_ptl);
> if (pte) {
> - result = __collapse_huge_page_isolate(vma, address, pte, cc,
> - HPAGE_PMD_ORDER,
> - &compound_pagelist);
> + result = __collapse_huge_page_isolate(vma, mthp_address, pte, cc,
> + order, &compound_pagelist);
> spin_unlock(pte_ptl);
> } else {
> result = SCAN_PMD_NULL;
> }
>
> if (unlikely(result != SCAN_SUCCEED)) {
> - if (pte)
> - pte_unmap(pte);
OK I guess we drop this because it's handled in out_up_write. I assume no
issue keeping PTE mapped here?
> spin_lock(pmd_ptl);
> BUG_ON(!pmd_none(*pmd));
> /*
> @@ -1258,21 +1263,21 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> */
> pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> spin_unlock(pmd_ptl);
> - anon_vma_unlock_write(vma->anon_vma);
> goto out_up_write;
> }
>
> /*
> - * All pages are isolated and locked so anon_vma rmap
> - * can't run anymore.
> + * For PMD collapse all pages are isolated and locked so anon_vma
> + * rmap can't run anymore. For mTHP collapse we must hold the lock
> */
> - anon_vma_unlock_write(vma->anon_vma);
> + if (order == HPAGE_PMD_ORDER) {
> + anon_vma_unlock_write(vma->anon_vma);
> + anon_vma_locked = false;
> + }
>
> result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> - vma, address, pte_ptl,
> - HPAGE_PMD_ORDER,
> - &compound_pagelist);
> - pte_unmap(pte);
> + vma, mthp_address, pte_ptl,
> + order, &compound_pagelist);
Looking through __collapse_huge_page_copy() there doesn't seem to be any
issue with holding anon lock here.
> if (unlikely(result != SCAN_SUCCEED))
> goto out_up_write;
>
> @@ -1282,20 +1287,42 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> * write.
> */
> __folio_mark_uptodate(folio);
> - pgtable = pmd_pgtable(_pmd);
> + if (order == HPAGE_PMD_ORDER) {
> + pgtable = pmd_pgtable(_pmd);
>
> - spin_lock(pmd_ptl);
> - BUG_ON(!pmd_none(*pmd));
> - pgtable_trans_huge_deposit(mm, pmd, pgtable);
> - map_anon_folio_pmd_nopf(folio, pmd, vma, address);
> - spin_unlock(pmd_ptl);
> + spin_lock(pmd_ptl);
> + WARN_ON_ONCE(!pmd_none(*pmd));
> + pgtable_trans_huge_deposit(mm, pmd, pgtable);
> + map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_address);
> + spin_unlock(pmd_ptl);
> + } else { /* mTHP collapse */
As per above, let's just declare mthp_pte here.
> + mthp_pte = mk_pte(folio_page(folio, 0), vma->vm_page_prot);
Hm, so we make a PTE that references the first page of the folio? I guess
the folio will be an mTHP folio so we're just creating essentially a
first-page PTE that set_ptes() then advances across the whole range?
> + mthp_pte = maybe_mkwrite(pte_mkdirty(mthp_pte), vma);
In set_pte_range() we have a whole host of other checks like dirty,
uffd_wp, etc. I wonder if we need to consider those?
> +
> + spin_lock(pmd_ptl);
We're duplicating this in both branches, why not do outside if/else?
> + WARN_ON_ONCE(!pmd_none(*pmd));
Hmm so the PMD entry will still always be empty on mTHP collapse? Surely we
could be collapsing more than one mTHP into an existing PTE table no? I may
be missing something here/confused :)
> + folio_ref_add(folio, nr_pages - 1);
If we're setting the refcount here, where is the ref count being set in the
PMD path?
> + folio_add_new_anon_rmap(folio, vma, mthp_address, RMAP_EXCLUSIVE);
> + folio_add_lru_vma(folio, vma);
> + set_ptes(vma->vm_mm, mthp_address, pte, mthp_pte, nr_pages);
> + update_mmu_cache_range(NULL, vma, mthp_address, pte, nr_pages);
Prior to this change the only users of this are functions in memory.c, I
do wonder if this is the wrong abstraction here.
But maybe that's _yet another_ thing for a follow up (the THP code is a
mess).
> +
> + smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
Feels like we could avoid open-coding this by just using pmd_install()?
Also are we therefore missing a mm_inc_nr_ptes() invocation here, or do we
update mm->pgtables_bytes elsewhere?
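For reference, the body of pmd_install() in mm/memory.c is roughly (paraphrased
from a recent tree, so double-check):

	spinlock_t *ptl = pmd_lock(mm, pmd);

	if (likely(pmd_none(*pmd))) {	/* Has another populated it? */
		mm_inc_nr_ptes(mm);
		smp_wmb(); /* make PTEs visible before PMD */
		pmd_populate(mm, pmd, *pte);
		*pte = NULL;
	}
	spin_unlock(ptl);

Note the mm_inc_nr_ptes() there, hence the accounting question.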
> + pmd_populate(mm, pmd, pmd_pgtable(_pmd));
Why are we referencing pmd in PMD branch and _pmd here?
> + spin_unlock(pmd_ptl);
The PMD case does this stuff in map_anon_folio_pmd_nopf(), could we add one for
mTHP?
This function is already horribly overwrought (not your fault) so I'd like
to avoid adding open-coded blocks as much as possible.
> + }
>
> folio = NULL;
>
> result = SCAN_SUCCEED;
> out_up_write:
> + if (anon_vma_locked)
> + anon_vma_unlock_write(vma->anon_vma);
> + if (pte)
> + pte_unmap(pte);
> mmap_write_unlock(mm);
> out_nolock:
> + *mmap_locked = false;
See above comment about setting this prior to jumping to out_nolock.
> if (folio)
> folio_put(folio);
> trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
> @@ -1463,9 +1490,8 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> pte_unmap_unlock(pte, ptl);
> if (result == SCAN_SUCCEED) {
> result = collapse_huge_page(mm, start_addr, referenced,
> - unmapped, cc);
> - /* collapse_huge_page will return with the mmap_lock released */
> - *mmap_locked = false;
> + unmapped, cc, mmap_locked,
> + HPAGE_PMD_ORDER, 0);
> }
> out:
> trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
> --
> 2.51.0
>
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse
2025-11-06 18:14 ` Lorenzo Stoakes
@ 2025-11-07 3:09 ` Dev Jain
2025-11-07 9:18 ` Lorenzo Stoakes
2025-11-07 19:33 ` Nico Pache
1 sibling, 1 reply; 77+ messages in thread
From: Dev Jain @ 2025-11-07 3:09 UTC (permalink / raw)
To: Lorenzo Stoakes, Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, Liam.Howlett, ryan.roberts, corbet, rostedt,
mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
> ----------[snip]------------
>
>> +
>> + spin_lock(pmd_ptl);
> We're duplicating this in both branches, why not do outside if/else?
>
>> + WARN_ON_ONCE(!pmd_none(*pmd));
> Hmm so the PMD entry will still always be empty on mTHP collapse? Surely we
> could be collapsing more than one mTHP into an existing PTE table no? I may
> be missing something here/confused :)
After this code path isolates the PTE table, we don't want any other code path
doing "Hey, I see an empty PMD, let's install a PTE table here". One of the
reasons why all the heavy locking is required here.
Also, I want to ask a question about WARN vs BUG_ON: suppose that the
race I described above occurs. After khugepaged isolates the PTE table, someone
faults in a PTE table there, and eventually writes data in the underlying folios.
Then the buggy khugepaged nukes out that table and installs a new one, installing
an mTHP folio which had old data. How do we decide whether such a condition is
worthy of a BUG_ON (leading to system crash) vs letting this pass with WARN?
>
> ------------[snip]----------
>
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse
2025-11-07 3:09 ` Dev Jain
@ 2025-11-07 9:18 ` Lorenzo Stoakes
0 siblings, 0 replies; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-11-07 9:18 UTC (permalink / raw)
To: Dev Jain
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm, linux-doc,
david, ziy, baolin.wang, Liam.Howlett, ryan.roberts, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Fri, Nov 07, 2025 at 08:39:03AM +0530, Dev Jain wrote:
> > ----------[snip]------------
Please, when you snip, can you not snip away the code being referenced?
That's really unhelpful and now this sub-thread loses a ton of context...
> >
> > > +
> > > + spin_lock(pmd_ptl);
> > We're duplicating this in both branches, why not do outside if/else?
> >
> > > + WARN_ON_ONCE(!pmd_none(*pmd));
> > Hmm so the PMD entry will still always be empty on mTHP collapse? Surely we
> > could be collapsing more than one mTHP into an existing PTE table no? I may
> > be missing something here/confused :)
>
> After this code path isolates the PTE table, we don't want any other code path
> doing "Hey, I see an empty PMD, let's install a PTE table here". One of the
> reasons why all the heavy locking is required here.
That wasn't the question, the question was why we are not able to install mTHP
entries in an existing PTE table.
I'm obviously aware that we need to lock here.
>
> Also, I want to ask a question about WARN vs BUG_ON: suppose that the
> race I described above occurs. After khugepaged isolates the PTE table, someone
> faults in a PTE table there, and eventually writes data in the underlying folios.
> Then the buggy khugepaged nukes out that table and installs a new one, installing
> an mTHP folio which had old data. How do we decide whether such a condition is
> worthy of a BUG_ON (leading to system crash) vs letting this pass with WARN?
To all intents and purposes just use a WARN_ON(). A BUG_ON() is almost
never right. This has been done to death.
Probably the WARN_ON() should be a VM_WARN_ON_ONCE() because this is
something that should simply not be happening in practice.
Or we can make if (WARN_ON_ONCE(...)) abort, but then we complicate already
very complicated code.
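I.e. something like (sketch only; SCAN_FAIL as the result code is a
placeholder, and a real abort at that point would also need to unlock and
reinstall the old PTE table):

	if (WARN_ON_ONCE(!pmd_none(*pmd))) {
		result = SCAN_FAIL;
		goto out_up_write;
	}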
>
>
> >
> > ------------[snip]----------
> >
Thanks, Lorenzo
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse
2025-11-06 18:14 ` Lorenzo Stoakes
2025-11-07 3:09 ` Dev Jain
@ 2025-11-07 19:33 ` Nico Pache
1 sibling, 0 replies; 77+ messages in thread
From: Nico Pache @ 2025-11-07 19:33 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Thu, Nov 6, 2025 at 11:15 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Wed, Oct 22, 2025 at 12:37:09PM -0600, Nico Pache wrote:
> > Pass an order and offset to collapse_huge_page to support collapsing anon
> > memory to arbitrary orders within a PMD. order indicates what mTHP size we
> > are attempting to collapse to, and offset indicates where in the PMD to
> > start the collapse attempt.
> >
> > For non-PMD collapse we must leave the anon VMA write locked until after
> > we collapse the mTHP-- in the PMD case all the pages are isolated, but in
>
> NIT but is this -- a typo?
No, it's an em dash. I can replace it with a period if you'd like, but
both work in this context.
>
> > the mTHP case this is not true, and we must keep the lock to prevent
> > changes to the VMA from occurring.
> >
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> > mm/khugepaged.c | 108 ++++++++++++++++++++++++++++++------------------
> > 1 file changed, 67 insertions(+), 41 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 286c3a7afdee..75e7ebdccc36 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -1142,43 +1142,50 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> > return SCAN_SUCCEED;
> > }
> >
> > -static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > - int referenced, int unmapped,
> > - struct collapse_control *cc)
> > +static int collapse_huge_page(struct mm_struct *mm, unsigned long pmd_address,
>
> Presumably pmd_address is the PMD-aligned address?
>
> > + int referenced, int unmapped, struct collapse_control *cc,
> > + bool *mmap_locked, unsigned int order, unsigned long offset)
>
> It'd be nice to pass through a helper struct at this point having so many params
> but perhaps we can deal with that in a follow up series.
>
> If PMD address is the PMD-aligned address, and mthp_address = pmd_address +
> offset * PAGE_SIZE, couldn't we just pass in the mthp address and get the
> PMD address by aligning down to PMD size and reduce the number of args by
> 1?
Yeah that seems like a good idea. Thanks
>
> > {
> > LIST_HEAD(compound_pagelist);
> > pmd_t *pmd, _pmd;
> > - pte_t *pte;
> > + pte_t *pte = NULL, mthp_pte;
>
> mthp_pte is only used in a single if () branch and can be declared there
> AFAICT?
ack!
>
> > pgtable_t pgtable;
> > struct folio *folio;
> > spinlock_t *pmd_ptl, *pte_ptl;
> > int result = SCAN_FAIL;
> > struct vm_area_struct *vma;
> > struct mmu_notifier_range range;
> > + bool anon_vma_locked = false;
> > + const unsigned long nr_pages = 1UL << order;
> > + unsigned long mthp_address = pmd_address + offset * PAGE_SIZE;
>
> Do we ever update this? If not we can const-ify.
ack!
>
> >
> > - VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> > + VM_BUG_ON(pmd_address & ~HPAGE_PMD_MASK);
>
> NIT: Be nice to convert this to a VM_WARN_ON_ONCE(), as VM_BUG_ON() is not
> right here.
>
> >
> > /*
> > * Before allocating the hugepage, release the mmap_lock read lock.
> > * The allocation can take potentially a long time if it involves
> > * sync compaction, and we do not need to hold the mmap_lock during
> > * that. We will recheck the vma after taking it again in write mode.
> > + * If collapsing mTHPs we may have already released the read_lock.
> > */
> > - mmap_read_unlock(mm);
> > + if (*mmap_locked) {
> > + mmap_read_unlock(mm);
> > + *mmap_locked = false;
> > + }
> >
> > - result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> > + result = alloc_charge_folio(&folio, mm, cc, order);
> > if (result != SCAN_SUCCEED)
> > goto out_nolock;
> >
> > mmap_read_lock(mm);
> > - result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> > - HPAGE_PMD_ORDER);
> > + *mmap_locked = true;
> > + result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order);
> > if (result != SCAN_SUCCEED) {
> > mmap_read_unlock(mm);
>
> I don't really love the semantics of 'sometimes we set *mmap_locked false
> when we unlock, sometimes we rely on out_nolock doing it'.
>
> Let's just set it false when we unlock and VM_WARN_ON_ONCE(*mmap_locked) in
> out_nolock.
OK, that sounds like a good idea! Thanks
>
> > goto out_nolock;
> > }
> >
> > - result = find_pmd_or_thp_or_none(mm, address, &pmd);
> > + result = find_pmd_or_thp_or_none(mm, pmd_address, &pmd);
> > if (result != SCAN_SUCCEED) {
> > mmap_read_unlock(mm);
> > goto out_nolock;
> > @@ -1190,13 +1197,14 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > * released when it fails. So we jump out_nolock directly in
> > * that case. Continuing to collapse causes inconsistency.
> > */
> > - result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> > - referenced, HPAGE_PMD_ORDER);
> > + result = __collapse_huge_page_swapin(mm, vma, mthp_address, pmd,
> > + referenced, order);
> > if (result != SCAN_SUCCEED)
> > goto out_nolock;
> > }
> >
> > mmap_read_unlock(mm);
> > + *mmap_locked = false;
> > /*
> > * Prevent all access to pagetables with the exception of
> > * gup_fast later handled by the ptep_clear_flush and the VM
> > @@ -1206,20 +1214,20 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > * mmap_lock.
> > */
> > mmap_write_lock(mm);
> > - result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> > - HPAGE_PMD_ORDER);
> > + result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order);
> > if (result != SCAN_SUCCEED)
> > goto out_up_write;
> > /* check if the pmd is still valid */
> > vma_start_write(vma);
> > - result = check_pmd_still_valid(mm, address, pmd);
> > + result = check_pmd_still_valid(mm, pmd_address, pmd);
> > if (result != SCAN_SUCCEED)
> > goto out_up_write;
> >
> > anon_vma_lock_write(vma->anon_vma);
> > + anon_vma_locked = true;
> >
> > - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
> > - address + HPAGE_PMD_SIZE);
> > + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, mthp_address,
> > + mthp_address + (PAGE_SIZE << order));
> > mmu_notifier_invalidate_range_start(&range);
> >
> > pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
> > @@ -1231,24 +1239,21 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > * Parallel GUP-fast is fine since GUP-fast will back off when
> > * it detects PMD is changed.
> > */
> > - _pmd = pmdp_collapse_flush(vma, address, pmd);
> > + _pmd = pmdp_collapse_flush(vma, pmd_address, pmd);
>
> Not your fault, but I so hate this _p** convention. One for a follow-up I
> suppose.
>
> > spin_unlock(pmd_ptl);
> > mmu_notifier_invalidate_range_end(&range);
> > tlb_remove_table_sync_one();
> >
> > - pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> > + pte = pte_offset_map_lock(mm, &_pmd, mthp_address, &pte_ptl);
> > if (pte) {
> > - result = __collapse_huge_page_isolate(vma, address, pte, cc,
> > - HPAGE_PMD_ORDER,
> > - &compound_pagelist);
> > + result = __collapse_huge_page_isolate(vma, mthp_address, pte, cc,
> > + order, &compound_pagelist);
> > spin_unlock(pte_ptl);
> > } else {
> > result = SCAN_PMD_NULL;
> > }
> >
> > if (unlikely(result != SCAN_SUCCEED)) {
> > - if (pte)
> > - pte_unmap(pte);
>
> OK I guess we drop this because it's handled in out_up_write. I assume no
> issue keeping PTE mapped here?
Correct, I don't think there are any issues here. The checks for pte
and anon_vma_locked in out_up_write should keep everything in order.
>
> > spin_lock(pmd_ptl);
> > BUG_ON(!pmd_none(*pmd));
> > /*
> > @@ -1258,21 +1263,21 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > */
> > pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> > spin_unlock(pmd_ptl);
> > - anon_vma_unlock_write(vma->anon_vma);
> > goto out_up_write;
> > }
> >
> > /*
> > - * All pages are isolated and locked so anon_vma rmap
> > - * can't run anymore.
> > + * For PMD collapse all pages are isolated and locked so anon_vma
> > + * rmap can't run anymore. For mTHP collapse we must hold the lock
> > */
> > - anon_vma_unlock_write(vma->anon_vma);
> > + if (order == HPAGE_PMD_ORDER) {
> > + anon_vma_unlock_write(vma->anon_vma);
> > + anon_vma_locked = false;
> > + }
> >
> > result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> > - vma, address, pte_ptl,
> > - HPAGE_PMD_ORDER,
> > - &compound_pagelist);
> > - pte_unmap(pte);
> > + vma, mthp_address, pte_ptl,
> > + order, &compound_pagelist);
>
> Looking through __collapse_huge_page_copy() there doesn't seem to be any
> issue with holding anon lock here.
>
> > if (unlikely(result != SCAN_SUCCEED))
> > goto out_up_write;
> >
> > @@ -1282,20 +1287,42 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > * write.
> > */
> > __folio_mark_uptodate(folio);
> > - pgtable = pmd_pgtable(_pmd);
> > + if (order == HPAGE_PMD_ORDER) {
> > + pgtable = pmd_pgtable(_pmd);
> >
> > - spin_lock(pmd_ptl);
> > - BUG_ON(!pmd_none(*pmd));
> > - pgtable_trans_huge_deposit(mm, pmd, pgtable);
> > - map_anon_folio_pmd_nopf(folio, pmd, vma, address);
> > - spin_unlock(pmd_ptl);
> > + spin_lock(pmd_ptl);
> > + WARN_ON_ONCE(!pmd_none(*pmd));
> > + pgtable_trans_huge_deposit(mm, pmd, pgtable);
> > + map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_address);
> > + spin_unlock(pmd_ptl);
> > + } else { /* mTHP collapse */
>
> As per above, let's just declare mthp_pte here.
ack
>
> > + mthp_pte = mk_pte(folio_page(folio, 0), vma->vm_page_prot);
>
> Hm, so we make a PTE that references the first page of the folio? I guess
> the folio will be an mTHP folio so we're just creating essentially a
> first-page PTE that set_ptes() then advances across the whole range?
>
> > + mthp_pte = maybe_mkwrite(pte_mkdirty(mthp_pte), vma);
>
> In set_pte_range() we have a whole host of other checks like dirty,
> uffd_wp, etc. I wonder if we need to consider those?
I don't believe so, because those checks are coming from fault handling.
Here we are doing almost the same thing that the PMD case was doing
with some influence from do_anonymous_page()
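FWIW, the relevant part of do_anonymous_page() is roughly (paraphrased from a
recent tree):

	entry = mk_pte(&folio->page, vma->vm_page_prot);
	entry = pte_sw_mkyoung(entry);
	if (vma->vm_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry), vma);

i.e. the PTE is built fresh rather than inheriting fault-time state like
uffd_wp.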
>
> > +
> > + spin_lock(pmd_ptl);
>
> We're duplicating this in both branches, why not do outside if/else?
ack
>
> > + WARN_ON_ONCE(!pmd_none(*pmd));
>
> Hmm so the PMD entry will still always be empty on mTHP collapse? Surely we
> could be collapsing more than one mTHP into an existing PTE table no? I may
> be missing something here/confused :)
We remove the PMD entry to ensure no GUP-fast call can operate on this PMD.
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
* This removes any huge TLB entry from the CPU so we won't allow
* huge and small TLB entries for the same virtual address to
* avoid the risk of CPU bugs in that area.
*
* Parallel GUP-fast is fine since GUP-fast will back off when
* it detects PMD is changed.
*/
_pmd = pmdp_collapse_flush(vma, pmd_address, pmd);
pmdp_collapse_flush removes the PMD
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
In the PMD case we install a new PMD, in the mTHP case (and in the
failure cases), we reinstall the same PMD once we are done/exit.
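So the lifecycle is roughly:

	/* clear the PMD so GUP-fast and parallel faults back off */
	_pmd = pmdp_collapse_flush(vma, pmd_address, pmd);
	...
	/* PMD collapse: install the new huge mapping */
	map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_address);
	...
	/* mTHP collapse (and failure paths): put the old PTE table back */
	pmd_populate(mm, pmd, pmd_pgtable(_pmd));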
>
> > + folio_ref_add(folio, nr_pages - 1);
>
> If we're setting the refcount here, where is the ref count being set in the
> PMD path?
Both folios start out with a single ref. PMDs only need 1 ref,
while mTHPs need a ref for each PTE, hence the increment by
nr_pages - 1.
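E.g. for an order-4 mTHP:

	/* order = 4 -> nr_pages = 16; the folio already holds one ref,
	 * so folio_ref_add(folio, 16 - 1) leaves 16 refs, one per PTE. */
	folio_ref_add(folio, nr_pages - 1);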
>
> > + folio_add_new_anon_rmap(folio, vma, mthp_address, RMAP_EXCLUSIVE);
> > + folio_add_lru_vma(folio, vma);
> > + set_ptes(vma->vm_mm, mthp_address, pte, mthp_pte, nr_pages);
> > + update_mmu_cache_range(NULL, vma, mthp_address, pte, nr_pages);
>
> Prior to this change the only users of this are functions in memory.c, I
> do wonder if this is the wrong abstraction here.
>
> But maybe that's _yet another_ thing for a follow up (the THP code is a
> mess).
Yes, I tried to do something similar to the new
map_anon_folio_pmd_nopf, but it proved to be harder than expected. The
other cases that do similar operations all differ slightly so unifying
is going to be tricky/require more testing.
>
> > +
> > + smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
>
> Feels like we could avoid open-coding this by just using pmd_install()?
The locking seems to differ, which may make that tricky.
>
> Also are we therefore missing a mm_inc_nr_ptes() invocation here, or do we
> update mm->pgtables_bytes elsewhere?
If I understand correctly, we already accounted for the PTEs when
we alloc'd them and their parent PMD. Since we are operating on an
already allocated PMD, I don't think we need to handle accounting for
PMD or mTHP collapse. I'll spend some time confirming this before
posting.
>
>
> > + pmd_populate(mm, pmd, pmd_pgtable(_pmd));
>
> Why are we referencing pmd in PMD branch and _pmd here?
I explained it a little more above, but we are reinstalling the
original PMD entry, which was removed for GUP race reasons.
>
> > + spin_unlock(pmd_ptl);
>
> The PMD case does this stuff in map_anon_folio_pmd_nopf(), could we add one for
> mTHP?
Yes, but I believe we should clean it up afterwards. Unifying most of the
callers proved tricky.
>
> This function is already horribly overwrought (not your fault) so I'd like
> to avoid adding open-coded blocks as much as possible.
>
> > + }
> >
> > folio = NULL;
> >
> > result = SCAN_SUCCEED;
> > out_up_write:
> > + if (anon_vma_locked)
> > + anon_vma_unlock_write(vma->anon_vma);
> > + if (pte)
> > + pte_unmap(pte);
> > mmap_write_unlock(mm);
> > out_nolock:
> > + *mmap_locked = false;
>
> See above comment about setting this prior to jumping to out_nolock.
ack
Thanks for the reviews!
-- Nico
>
> > if (folio)
> > folio_put(folio);
> > trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
> > @@ -1463,9 +1490,8 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > pte_unmap_unlock(pte, ptl);
> > if (result == SCAN_SUCCEED) {
> > result = collapse_huge_page(mm, start_addr, referenced,
> > - unmapped, cc);
> > - /* collapse_huge_page will return with the mmap_lock released */
> > - *mmap_locked = false;
> > + unmapped, cc, mmap_locked,
> > + HPAGE_PMD_ORDER, 0);
> > }
> > out:
> > trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
> > --
> > 2.51.0
> >
>
^ permalink raw reply [flat|nested] 77+ messages in thread
* [PATCH v12 mm-new 08/15] khugepaged: skip collapsing mTHP to smaller orders
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (6 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-22 18:37 ` [PATCH v12 mm-new 09/15] khugepaged: add per-order mTHP collapse failure statistics Nico Pache
` (7 subsequent siblings)
15 siblings, 0 replies; 77+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato
khugepaged may try to collapse an mTHP to a smaller mTHP, resulting in
some pages being unmapped. Skip these cases until we have a way to check
if it's OK to collapse to a smaller mTHP size (like in the case of a
partially mapped folio).
This patch is inspired by Dev Jain's work on khugepaged mTHP support [1].
[1] https://lore.kernel.org/lkml/20241216165105.56185-11-dev.jain@arm.com/
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: David Hildenbrand <david@redhat.com>
Co-developed-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 75e7ebdccc36..d741af15e18c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -629,6 +629,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
}
+ /*
+ * TODO: In some cases of partially-mapped folios, we'd actually
+ * want to collapse.
+ */
+ if (order != HPAGE_PMD_ORDER && folio_order(folio) >= order) {
+ result = SCAN_PTE_MAPPED_HUGEPAGE;
+ goto out;
+ }
if (folio_test_large(folio)) {
struct folio *f;
--
2.51.0
^ permalink raw reply related [flat|nested] 77+ messages in thread
* [PATCH v12 mm-new 09/15] khugepaged: add per-order mTHP collapse failure statistics
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (7 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 08/15] khugepaged: skip collapsing mTHP to smaller orders Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-11-06 18:45 ` Lorenzo Stoakes
2025-10-22 18:37 ` [PATCH v12 mm-new 10/15] khugepaged: improve tracepoints for mTHP orders Nico Pache
` (6 subsequent siblings)
15 siblings, 1 reply; 77+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato
Add three new mTHP statistics to track collapse failures for different
orders when encountering swap PTEs, excessive none PTEs, and shared PTEs:
- collapse_exceed_swap_pte: Increment when mTHP collapse fails due to swap
PTEs
- collapse_exceed_none_pte: Counts when mTHP collapse fails due to
exceeding the none PTE threshold for the given order
- collapse_exceed_shared_pte: Counts when mTHP collapse fails due to shared
PTEs
These statistics complement the existing THP_SCAN_EXCEED_* events by
providing per-order granularity for mTHP collapse attempts. The stats are
exposed via sysfs under
`/sys/kernel/mm/transparent_hugepage/hugepages-*/stats/` for each
supported hugepage size.
As we currently don't support collapsing mTHPs that contain a swap or
shared entry, those statistics keep track of how often we are
encountering failed mTHP collapses due to these restrictions.
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
Documentation/admin-guide/mm/transhuge.rst | 23 ++++++++++++++++++++++
include/linux/huge_mm.h | 3 +++
mm/huge_memory.c | 7 +++++++
mm/khugepaged.c | 16 ++++++++++++---
4 files changed, 46 insertions(+), 3 deletions(-)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 13269a0074d4..7c71cda8aea1 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -709,6 +709,29 @@ nr_anon_partially_mapped
an anonymous THP as "partially mapped" and count it here, even though it
is not actually partially mapped anymore.
+collapse_exceed_none_pte
+ The number of anonymous mTHP pte ranges where the number of none PTEs
+ exceeded the max_ptes_none threshold. For mTHP collapse, khugepaged
+ checks a PMD region and tracks which PTEs are present. It then tries
+ to collapse to the largest enabled mTHP size. The allowed number of empty
+ PTEs is the max_ptes_none threshold scaled by the collapse order. This
+ counter records the number of times a collapse attempt was skipped for
+ this reason, and khugepaged moved on to try the next available mTHP size.
+
+collapse_exceed_swap_pte
+ The number of anonymous mTHP pte ranges which contain at least one swap
+ PTE. Currently khugepaged does not support collapsing mTHP regions
+ that contain a swap PTE. This counter can be used to monitor the
+ number of khugepaged mTHP collapses that failed due to the presence
+ of a swap PTE.
+
+collapse_exceed_shared_pte
+ The number of anonymous mTHP pte ranges which contain at least one shared
+ PTE. Currently khugepaged does not support collapsing mTHP pte ranges
+ that contain a shared PTE. This counter can be used to monitor the
+ number of khugepaged mTHP collapses that failed due to the presence
+ of a shared PTE.
+
As the system ages, allocating huge pages may be expensive as the
system uses memory compaction to copy data around memory to free a
huge page for use. There are some counters in ``/proc/vmstat`` to help
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 3d29624c4f3f..4b2773235041 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -144,6 +144,9 @@ enum mthp_stat_item {
MTHP_STAT_SPLIT_DEFERRED,
MTHP_STAT_NR_ANON,
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
+ MTHP_STAT_COLLAPSE_EXCEED_SWAP,
+ MTHP_STAT_COLLAPSE_EXCEED_NONE,
+ MTHP_STAT_COLLAPSE_EXCEED_SHARED,
__MTHP_STAT_COUNT
};
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0063d1ba926e..7335b92969d6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -638,6 +638,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
+
static struct attribute *anon_stats_attrs[] = {
&anon_fault_alloc_attr.attr,
@@ -654,6 +658,9 @@ static struct attribute *anon_stats_attrs[] = {
&split_deferred_attr.attr,
&nr_anon_attr.attr,
&nr_anon_partially_mapped_attr.attr,
+ &collapse_exceed_swap_pte_attr.attr,
+ &collapse_exceed_none_pte_attr.attr,
+ &collapse_exceed_shared_pte_attr.attr,
NULL,
};
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index d741af15e18c..053202141ea3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -592,7 +592,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
- count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ if (order == HPAGE_PMD_ORDER)
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
goto out;
}
}
@@ -622,10 +624,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* shared may cause a future higher order collapse on a
* rescan of the same range.
*/
- if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
- shared > khugepaged_max_ptes_shared)) {
+ if (order != HPAGE_PMD_ORDER) {
+ result = SCAN_EXCEED_SHARED_PTE;
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
+ goto out;
+ }
+
+ if (cc->is_khugepaged &&
+ shared > khugepaged_max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
goto out;
}
}
@@ -1073,6 +1082,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
* range.
*/
if (order != HPAGE_PMD_ORDER) {
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
pte_unmap(pte);
mmap_read_unlock(mm);
result = SCAN_EXCEED_SWAP_PTE;
--
2.51.0
^ permalink raw reply related [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 09/15] khugepaged: add per-order mTHP collapse failure statistics
2025-10-22 18:37 ` [PATCH v12 mm-new 09/15] khugepaged: add per-order mTHP collapse failure statistics Nico Pache
@ 2025-11-06 18:45 ` Lorenzo Stoakes
2025-11-07 17:14 ` Nico Pache
0 siblings, 1 reply; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-11-06 18:45 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Wed, Oct 22, 2025 at 12:37:11PM -0600, Nico Pache wrote:
> Add three new mTHP statistics to track collapse failures for different
> orders when encountering swap PTEs, excessive none PTEs, and shared PTEs:
>
> - collapse_exceed_swap_pte: Increment when mTHP collapse fails due to swap
> PTEs
>
> - collapse_exceed_none_pte: Counts when mTHP collapse fails due to
> exceeding the none PTE threshold for the given order
>
> - collapse_exceed_shared_pte: Counts when mTHP collapse fails due to shared
> PTEs
>
> These statistics complement the existing THP_SCAN_EXCEED_* events by
> providing per-order granularity for mTHP collapse attempts. The stats are
> exposed via sysfs under
> `/sys/kernel/mm/transparent_hugepage/hugepages-*/stats/` for each
> supported hugepage size.
>
> As we currently don't support collapsing mTHPs that contain a swap or
> shared entry, those statistics keep track of how often we are
> encountering failed mTHP collapses due to these restrictions.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> Documentation/admin-guide/mm/transhuge.rst | 23 ++++++++++++++++++++++
> include/linux/huge_mm.h | 3 +++
> mm/huge_memory.c | 7 +++++++
> mm/khugepaged.c | 16 ++++++++++++---
> 4 files changed, 46 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index 13269a0074d4..7c71cda8aea1 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -709,6 +709,29 @@ nr_anon_partially_mapped
> an anonymous THP as "partially mapped" and count it here, even though it
> is not actually partially mapped anymore.
>
> +collapse_exceed_none_pte
> + The number of anonymous mTHP pte ranges where the number of none PTEs
Ranges? Is the count per-mTHP folio? Or per PTE entry? Let's clarify.
> + exceeded the max_ptes_none threshold. For mTHP collapse, khugepaged
> + checks a PMD region and tracks which PTEs are present. It then tries
> + to collapse to the largest enabled mTHP size. The allowed number of empty
Well, and then tries to collapse to the next size and so on, right? So maybe worth
mentioning?
> + PTEs is the max_ptes_none threshold scaled by the collapse order. This
I think this needs clarification, scaled how? Also obviously with the proposed
new approach we will need to correct this to reflect the 511/0 situation.
> + counter records the number of times a collapse attempt was skipped for
> + this reason, and khugepaged moved on to try the next available mTHP size.
OK, you mention the moving on here, so for each attempted mTHP size which exceeds
max_ptes_none we increment this stat, correct? Probably worth clarifying that.
> +
> +collapse_exceed_swap_pte
> + The number of anonymous mTHP pte ranges which contain at least one swap
> + PTE. Currently khugepaged does not support collapsing mTHP regions
> + that contain a swap PTE. This counter can be used to monitor the
> + number of khugepaged mTHP collapses that failed due to the presence
> + of a swap PTE.
OK so as soon as we encounter a swap PTE we abort and this counts each instance
of that?
I guess worth spelling that out? Given we don't support it, surely the opening
description should be 'The number of anonymous mTHP PTE ranges which were unable
to be collapsed due to containing one or more swap PTEs'.
> +
> +collapse_exceed_shared_pte
> + The number of anonymous mTHP pte ranges which contain at least one shared
> + PTE. Currently khugepaged does not support collapsing mTHP pte ranges
> + that contain a shared PTE. This counter can be used to monitor the
> + number of khugepaged mTHP collapses that failed due to the presence
> + of a shared PTE.
Same comments as above.
> +
> As the system ages, allocating huge pages may be expensive as the
> system uses memory compaction to copy data around memory to free a
> huge page for use. There are some counters in ``/proc/vmstat`` to help
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 3d29624c4f3f..4b2773235041 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -144,6 +144,9 @@ enum mthp_stat_item {
> MTHP_STAT_SPLIT_DEFERRED,
> MTHP_STAT_NR_ANON,
> MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
> + MTHP_STAT_COLLAPSE_EXCEED_SWAP,
> + MTHP_STAT_COLLAPSE_EXCEED_NONE,
> + MTHP_STAT_COLLAPSE_EXCEED_SHARED,
> __MTHP_STAT_COUNT
> };
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 0063d1ba926e..7335b92969d6 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -638,6 +638,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
> DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
> DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
> DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> +
>
> static struct attribute *anon_stats_attrs[] = {
> &anon_fault_alloc_attr.attr,
> @@ -654,6 +658,9 @@ static struct attribute *anon_stats_attrs[] = {
> &split_deferred_attr.attr,
> &nr_anon_attr.attr,
> &nr_anon_partially_mapped_attr.attr,
> + &collapse_exceed_swap_pte_attr.attr,
> + &collapse_exceed_none_pte_attr.attr,
> + &collapse_exceed_shared_pte_attr.attr,
> NULL,
> };
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index d741af15e18c..053202141ea3 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -592,7 +592,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> continue;
> } else {
> result = SCAN_EXCEED_NONE_PTE;
> - count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> + if (order == HPAGE_PMD_ORDER)
> + count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> goto out;
> }
> }
> @@ -622,10 +624,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> * shared may cause a future higher order collapse on a
> * rescan of the same range.
> */
> - if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
> - shared > khugepaged_max_ptes_shared)) {
> + if (order != HPAGE_PMD_ORDER) {
A little nit/idea in general for the series - since we do this order !=
HPAGE_PMD_ORDER check all over, maybe have a predicate function like:
static bool is_mthp_order(unsigned int order)
{
return order != HPAGE_PMD_ORDER;
}
> + result = SCAN_EXCEED_SHARED_PTE;
> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> + goto out;
> + }
> +
> + if (cc->is_khugepaged &&
> + shared > khugepaged_max_ptes_shared) {
> result = SCAN_EXCEED_SHARED_PTE;
> count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
OK I _think_ I mentioned this in a previous revision so forgive me for being
repetitious but we also count PMD orders here?
But in the MTHP_STAT_COLLAPSE_EXCEED_NONE and MTP_STAT_COLLAPSE_EXCEED_SWAP
cases we don't? Why's that?
> goto out;
> }
> }
> @@ -1073,6 +1082,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
> * range.
> */
> if (order != HPAGE_PMD_ORDER) {
> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> pte_unmap(pte);
> mmap_read_unlock(mm);
> result = SCAN_EXCEED_SWAP_PTE;
> --
> 2.51.0
>
Thanks, Lorenzo
^ permalink raw reply [flat|nested] 77+ messages in thread* Re: [PATCH v12 mm-new 09/15] khugepaged: add per-order mTHP collapse failure statistics
2025-11-06 18:45 ` Lorenzo Stoakes
@ 2025-11-07 17:14 ` Nico Pache
0 siblings, 0 replies; 77+ messages in thread
From: Nico Pache @ 2025-11-07 17:14 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Thu, Nov 6, 2025 at 11:47 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Wed, Oct 22, 2025 at 12:37:11PM -0600, Nico Pache wrote:
> > Add three new mTHP statistics to track collapse failures for different
> > orders when encountering swap PTEs, excessive none PTEs, and shared PTEs:
> >
> > - collapse_exceed_swap_pte: Increment when mTHP collapse fails due to swap
> > PTEs
> >
> > - collapse_exceed_none_pte: Counts when mTHP collapse fails due to
> > exceeding the none PTE threshold for the given order
> >
> > - collapse_exceed_shared_pte: Counts when mTHP collapse fails due to shared
> > PTEs
> >
> > These statistics complement the existing THP_SCAN_EXCEED_* events by
> > providing per-order granularity for mTHP collapse attempts. The stats are
> > exposed via sysfs under
> > `/sys/kernel/mm/transparent_hugepage/hugepages-*/stats/` for each
> > supported hugepage size.
> >
> > As we currently don't support collapsing mTHPs that contain a swap or
> > shared entry, those statistics keep track of how often we are
> > encountering failed mTHP collapses due to these restrictions.
> >
> > Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> > Documentation/admin-guide/mm/transhuge.rst | 23 ++++++++++++++++++++++
> > include/linux/huge_mm.h | 3 +++
> > mm/huge_memory.c | 7 +++++++
> > mm/khugepaged.c | 16 ++++++++++++---
> > 4 files changed, 46 insertions(+), 3 deletions(-)
> >
> > diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> > index 13269a0074d4..7c71cda8aea1 100644
> > --- a/Documentation/admin-guide/mm/transhuge.rst
> > +++ b/Documentation/admin-guide/mm/transhuge.rst
> > @@ -709,6 +709,29 @@ nr_anon_partially_mapped
> > an anonymous THP as "partially mapped" and count it here, even though it
> > is not actually partially mapped anymore.
> >
> > +collapse_exceed_none_pte
> > + The number of anonymous mTHP pte ranges where the number of none PTEs
>
> Ranges? Is the count per-mTHP folio? Or per PTE entry? Let's clarify.
I don't know the proper terminology. But what we have here is a range
of PTEs that is being considered for mTHP folio collapse; however, it
is still not an mTHP folio, which is why I hesitated to call it that.
Given this counter is per mTHP size I think the proper way to say this would be:
The number of collapse attempts that failed due to exceeding the
max_ptes_none threshold.
>
> > + exceeded the max_ptes_none threshold. For mTHP collapse, khugepaged
> > + checks a PMD region and tracks which PTEs are present. It then tries
> > + to collapse to the largest enabled mTHP size. The allowed number of empty
>
> Well, and then tries to collapse to the next size and so on, right? So maybe worth
> mentioning?
>
> > + PTEs is the max_ptes_none threshold scaled by the collapse order. This
>
> I think this needs clarification, scaled how? Also obviously with the proposed
> new approach we will need to correct this to reflect the 511/0 situation.
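If I'm reading the series right, the scaling in question is roughly

	/* assumed scaling: the PMD-order knob shifted down per order */
	max_ptes_none >> (HPAGE_PMD_ORDER - order)

(to be spelled out in the doc text, and updated if the 511/0 proposal lands).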
>
> > + counter records the number of times a collapse attempt was skipped for
> > + this reason, and khugepaged moved on to try the next available mTHP size.
>
> OK, you mention the moving on here, so for each attempted mTHP size which exceeds
> max_ptes_none we increment this stat, correct? Probably worth clarifying that.
>
> > +
> > +collapse_exceed_swap_pte
> > + The number of anonymous mTHP pte ranges which contain at least one swap
> > + PTE. Currently khugepaged does not support collapsing mTHP regions
> > + that contain a swap PTE. This counter can be used to monitor the
> > + number of khugepaged mTHP collapses that failed due to the presence
> > + of a swap PTE.
>
> OK so as soon as we encounter a swap PTE we abort and this counts each instance
> of that?
>
> I guess worth spelling that out? Given we don't support it, surely the opening
> description should be 'The number of anonymous mTHP PTE ranges which were unable
> to be collapsed due to containing one or more swap PTEs'.
>
> > +
> > +collapse_exceed_shared_pte
> > + The number of anonymous mTHP pte ranges which contain at least one shared
> > + PTE. Currently khugepaged does not support collapsing mTHP pte ranges
> > + that contain a shared PTE. This counter can be used to monitor the
> > + number of khugepaged mTHP collapses that failed due to the presence
> > + of a shared PTE.
>
> Same comments as above.
>
> > +
> > As the system ages, allocating huge pages may be expensive as the
> > system uses memory compaction to copy data around memory to free a
> > huge page for use. There are some counters in ``/proc/vmstat`` to help
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 3d29624c4f3f..4b2773235041 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -144,6 +144,9 @@ enum mthp_stat_item {
> > MTHP_STAT_SPLIT_DEFERRED,
> > MTHP_STAT_NR_ANON,
> > MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
> > + MTHP_STAT_COLLAPSE_EXCEED_SWAP,
> > + MTHP_STAT_COLLAPSE_EXCEED_NONE,
> > + MTHP_STAT_COLLAPSE_EXCEED_SHARED,
> > __MTHP_STAT_COUNT
> > };
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 0063d1ba926e..7335b92969d6 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -638,6 +638,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
> > DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
> > DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
> > DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
> > +DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> > +DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> > +DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> > +
> >
> > static struct attribute *anon_stats_attrs[] = {
> > &anon_fault_alloc_attr.attr,
> > @@ -654,6 +658,9 @@ static struct attribute *anon_stats_attrs[] = {
> > &split_deferred_attr.attr,
> > &nr_anon_attr.attr,
> > &nr_anon_partially_mapped_attr.attr,
> > + &collapse_exceed_swap_pte_attr.attr,
> > + &collapse_exceed_none_pte_attr.attr,
> > + &collapse_exceed_shared_pte_attr.attr,
> > NULL,
> > };
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index d741af15e18c..053202141ea3 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -592,7 +592,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > continue;
> > } else {
> > result = SCAN_EXCEED_NONE_PTE;
> > - count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> > + if (order == HPAGE_PMD_ORDER)
> > + count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> > + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> > goto out;
> > }
> > }
> > @@ -622,10 +624,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > * shared may cause a future higher order collapse on a
> > * rescan of the same range.
> > */
> > - if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
> > - shared > khugepaged_max_ptes_shared)) {
> > + if (order != HPAGE_PMD_ORDER) {
>
Thanks for the review! I'll go clean these up for the next version.
> A little nit/idea in general for the series - since we do this order !=
> HPAGE_PMD_ORDER check all over, maybe have a predicate function like:
>
> static bool is_mthp_order(unsigned int order)
> {
> return order != HPAGE_PMD_ORDER;
> }
sure!
>
> > + result = SCAN_EXCEED_SHARED_PTE;
> > + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> > + goto out;
> > + }
> > +
> > + if (cc->is_khugepaged &&
> > + shared > khugepaged_max_ptes_shared) {
> > result = SCAN_EXCEED_SHARED_PTE;
> > count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> > + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
>
> OK I _think_ I mentioned this in a previous revision so forgive me for being
> repetitious but we also count PMD orders here?
>
> But in the MTHP_STAT_COLLAPSE_EXCEED_NONE and MTP_STAT_COLLAPSE_EXCEED_SWAP
> cases we don't? Why's that?
Hmm I could have sworn I fixed that... perhaps I reintroduced the
missing stat update when I had to rebase/undo the cleanup series by
Lance. I will fix this.
Cheers.
-- Nico
>
>
> > goto out;
> > }
> > }
> > @@ -1073,6 +1082,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
> > * range.
> > */
> > if (order != HPAGE_PMD_ORDER) {
> > + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> > pte_unmap(pte);
> > mmap_read_unlock(mm);
> > result = SCAN_EXCEED_SWAP_PTE;
> > --
> > 2.51.0
> >
>
> Thanks, Lorenzo
>
^ permalink raw reply [flat|nested] 77+ messages in thread
* [PATCH v12 mm-new 10/15] khugepaged: improve tracepoints for mTHP orders
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (8 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 09/15] khugepaged: add per-order mTHP collapse failure statistics Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-22 18:37 ` [PATCH v12 mm-new 11/15] khugepaged: introduce collapse_allowable_orders helper function Nico Pache
` (5 subsequent siblings)
15 siblings, 0 replies; 77+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato
Add the order to the mm_collapse_huge_page<_swapin,_isolate> tracepoints to
give better insight into what order is being operated at for.
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
include/trace/events/huge_memory.h | 34 +++++++++++++++++++-----------
mm/khugepaged.c | 9 ++++----
2 files changed, 27 insertions(+), 16 deletions(-)
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index dd94d14a2427..19d99b2549e6 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -88,40 +88,44 @@ TRACE_EVENT(mm_khugepaged_scan_pmd,
TRACE_EVENT(mm_collapse_huge_page,
- TP_PROTO(struct mm_struct *mm, int isolated, int status),
+ TP_PROTO(struct mm_struct *mm, int isolated, int status, unsigned int order),
- TP_ARGS(mm, isolated, status),
+ TP_ARGS(mm, isolated, status, order),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
__field(int, isolated)
__field(int, status)
+ __field(unsigned int, order)
),
TP_fast_assign(
__entry->mm = mm;
__entry->isolated = isolated;
__entry->status = status;
+ __entry->order = order;
),
- TP_printk("mm=%p, isolated=%d, status=%s",
+ TP_printk("mm=%p, isolated=%d, status=%s order=%u",
__entry->mm,
__entry->isolated,
- __print_symbolic(__entry->status, SCAN_STATUS))
+ __print_symbolic(__entry->status, SCAN_STATUS),
+ __entry->order)
);
TRACE_EVENT(mm_collapse_huge_page_isolate,
TP_PROTO(struct folio *folio, int none_or_zero,
- int referenced, int status),
+ int referenced, int status, unsigned int order),
- TP_ARGS(folio, none_or_zero, referenced, status),
+ TP_ARGS(folio, none_or_zero, referenced, status, order),
TP_STRUCT__entry(
__field(unsigned long, pfn)
__field(int, none_or_zero)
__field(int, referenced)
__field(int, status)
+ __field(unsigned int, order)
),
TP_fast_assign(
@@ -129,26 +133,30 @@ TRACE_EVENT(mm_collapse_huge_page_isolate,
__entry->none_or_zero = none_or_zero;
__entry->referenced = referenced;
__entry->status = status;
+ __entry->order = order;
),
- TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s",
+ TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s order=%u",
__entry->pfn,
__entry->none_or_zero,
__entry->referenced,
- __print_symbolic(__entry->status, SCAN_STATUS))
+ __print_symbolic(__entry->status, SCAN_STATUS),
+ __entry->order)
);
TRACE_EVENT(mm_collapse_huge_page_swapin,
- TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret),
+ TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret,
+ unsigned int order),
- TP_ARGS(mm, swapped_in, referenced, ret),
+ TP_ARGS(mm, swapped_in, referenced, ret, order),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
__field(int, swapped_in)
__field(int, referenced)
__field(int, ret)
+ __field(unsigned int, order)
),
TP_fast_assign(
@@ -156,13 +164,15 @@ TRACE_EVENT(mm_collapse_huge_page_swapin,
__entry->swapped_in = swapped_in;
__entry->referenced = referenced;
__entry->ret = ret;
+ __entry->order = order;
),
- TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d",
+ TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d, order=%u",
__entry->mm,
__entry->swapped_in,
__entry->referenced,
- __entry->ret)
+ __entry->ret,
+ __entry->order)
);
TRACE_EVENT(mm_khugepaged_scan_file,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 053202141ea3..0dbbe04c31fe 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -722,13 +722,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
} else {
result = SCAN_SUCCEED;
trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
- referenced, result);
+ referenced, result, order);
return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
- referenced, result);
+ referenced, result, order);
return result;
}
@@ -1123,7 +1123,8 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
result = SCAN_SUCCEED;
out:
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result,
+ order);
return result;
}
@@ -1343,7 +1344,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long pmd_address,
*mmap_locked = false;
if (folio)
folio_put(folio);
- trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
+ trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result, order);
return result;
}
--
2.51.0
^ permalink raw reply related [flat|nested] 77+ messages in thread
* [PATCH v12 mm-new 11/15] khugepaged: introduce collapse_allowable_orders helper function
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (9 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 10/15] khugepaged: improve tracepoints for mTHP orders Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-11-06 18:49 ` Lorenzo Stoakes
2025-10-22 18:37 ` [PATCH v12 mm-new 12/15] khugepaged: Introduce mTHP collapse support Nico Pache
` (4 subsequent siblings)
15 siblings, 1 reply; 77+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato
Add collapse_allowable_orders() to generalize THP order eligibility. The
function determines which THP orders are permitted based on collapse
context (khugepaged vs madv_collapse).
This consolidates collapse configuration logic and provides a clean
interface for future mTHP collapse support where the orders may be
different.
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0dbbe04c31fe..89a105124790 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -489,7 +489,16 @@ static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
return max_ptes_none >> (HPAGE_PMD_ORDER - order);
+}
+
+/* Check what orders are allowed based on the vma and collapse type */
+static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
+ vm_flags_t vm_flags, bool is_khugepaged)
+{
+ enum tva_type tva_flags = is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
+ unsigned long orders = BIT(HPAGE_PMD_ORDER);
+ return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
}
void khugepaged_enter_vma(struct vm_area_struct *vma,
@@ -497,7 +506,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
{
if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
hugepage_pmd_enabled()) {
- if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
+ if (collapse_allowable_orders(vma, vm_flags, true))
__khugepaged_enter(vma->vm_mm);
}
}
@@ -2567,7 +2576,7 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
progress++;
break;
}
- if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
+ if (!collapse_allowable_orders(vma, vma->vm_flags, true)) {
skip:
progress++;
continue;
@@ -2873,7 +2882,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
BUG_ON(vma->vm_start > start);
BUG_ON(vma->vm_end < end);
- if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
+ if (!collapse_allowable_orders(vma, vma->vm_flags, false))
return -EINVAL;
cc = kmalloc(sizeof(*cc), GFP_KERNEL);
--
2.51.0
^ permalink raw reply related [flat|nested] 77+ messages in thread* Re: [PATCH v12 mm-new 11/15] khugepaged: introduce collapse_allowable_orders helper function
2025-10-22 18:37 ` [PATCH v12 mm-new 11/15] khugepaged: introduce collapse_allowable_orders helper function Nico Pache
@ 2025-11-06 18:49 ` Lorenzo Stoakes
2025-11-07 18:01 ` Nico Pache
0 siblings, 1 reply; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-11-06 18:49 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Wed, Oct 22, 2025 at 12:37:13PM -0600, Nico Pache wrote:
> Add collapse_allowable_orders() to generalize THP order eligibility. The
> function determines which THP orders are permitted based on collapse
> context (khugepaged vs madv_collapse).
>
> This consolidates collapse configuration logic and provides a clean
> interface for future mTHP collapse support where the orders may be
> different.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
With nits below addressed, LGTM so:
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/khugepaged.c | 15 ++++++++++++---
> 1 file changed, 12 insertions(+), 3 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 0dbbe04c31fe..89a105124790 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -489,7 +489,16 @@ static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
>
> return max_ptes_none >> (HPAGE_PMD_ORDER - order);
> +}
> +
> +/* Check what orders are allowed based on the vma and collapse type */
> +static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
> + vm_flags_t vm_flags, bool is_khugepaged)
> +{
> + enum tva_type tva_flags = is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
> + unsigned long orders = BIT(HPAGE_PMD_ORDER);
Nit, but can const-ify.
>
> + return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
> }
>
> void khugepaged_enter_vma(struct vm_area_struct *vma,
> @@ -497,7 +506,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
> {
> if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
> hugepage_pmd_enabled()) {
> - if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
> + if (collapse_allowable_orders(vma, vm_flags, true))
If we have a 'mystery meat' boolean parameter can we always use the convention of:
collapse_allowable_orders(vma, vm_flags, /*is_khugepaged=*/true)
Please? Same goes for other invocations obviously.
> __khugepaged_enter(vma->vm_mm);
> }
> }
> @@ -2567,7 +2576,7 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
> progress++;
> break;
> }
> - if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
> + if (!collapse_allowable_orders(vma, vma->vm_flags, true)) {
> skip:
> progress++;
> continue;
> @@ -2873,7 +2882,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> BUG_ON(vma->vm_start > start);
> BUG_ON(vma->vm_end < end);
>
> - if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
> + if (!collapse_allowable_orders(vma, vma->vm_flags, false))
> return -EINVAL;
>
> cc = kmalloc(sizeof(*cc), GFP_KERNEL);
> --
> 2.51.0
>
^ permalink raw reply [flat|nested] 77+ messages in thread* Re: [PATCH v12 mm-new 11/15] khugepaged: introduce collapse_allowable_orders helper function
2025-11-06 18:49 ` Lorenzo Stoakes
@ 2025-11-07 18:01 ` Nico Pache
0 siblings, 0 replies; 77+ messages in thread
From: Nico Pache @ 2025-11-07 18:01 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On Thu, Nov 6, 2025 at 11:51 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Wed, Oct 22, 2025 at 12:37:13PM -0600, Nico Pache wrote:
> > Add collapse_allowable_orders() to generalize THP order eligibility. The
> > function determines which THP orders are permitted based on collapse
> > context (khugepaged vs madv_collapse).
> >
> > This consolidates collapse configuration logic and provides a clean
> > interface for future mTHP collapse support where the orders may be
> > different.
> >
> > Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> > Signed-off-by: Nico Pache <npache@redhat.com>
>
> With nits below adddressed, LGTM so:
>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Thank you!
> > ---
> > mm/khugepaged.c | 15 ++++++++++++---
> > 1 file changed, 12 insertions(+), 3 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 0dbbe04c31fe..89a105124790 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -489,7 +489,16 @@ static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> > max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
> >
> > return max_ptes_none >> (HPAGE_PMD_ORDER - order);
> > +}
> > +
> > +/* Check what orders are allowed based on the vma and collapse type */
> > +static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
> > + vm_flags_t vm_flags, bool is_khugepaged)
> > +{
> > + enum tva_type tva_flags = is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
> > + unsigned long orders = BIT(HPAGE_PMD_ORDER);
>
> Nit, but can const-ify.
It becomes a function of is_khugepaged in a later patch.
>
> >
> > + return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
> > }
> >
> > void khugepaged_enter_vma(struct vm_area_struct *vma,
> > @@ -497,7 +506,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
> > {
> > if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
> > hugepage_pmd_enabled()) {
> > - if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
> > + if (collapse_allowable_orders(vma, vm_flags, true))
>
> If we have a 'mystery meat' boolean parameter can we always use the convention of:
>
> collapse_allowable_orders(vma, vm_flags, /*is_khugepaged=*/true)
>
> Please? Same goes for other invocations obviously.
Sounds good! I'll fix those up.
Thanks,
-- Nico
>
>
> > __khugepaged_enter(vma->vm_mm);
> > }
> > }
> > @@ -2567,7 +2576,7 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
> > progress++;
> > break;
> > }
> > - if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
> > + if (!collapse_allowable_orders(vma, vma->vm_flags, true)) {
> > skip:
> > progress++;
> > continue;
> > @@ -2873,7 +2882,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> > BUG_ON(vma->vm_start > start);
> > BUG_ON(vma->vm_end < end);
> >
> > - if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
> > + if (!collapse_allowable_orders(vma, vma->vm_flags, false))
> > return -EINVAL;
> >
> > cc = kmalloc(sizeof(*cc), GFP_KERNEL);
> > --
> > 2.51.0
> >
>
^ permalink raw reply [flat|nested] 77+ messages in thread
* [PATCH v12 mm-new 12/15] khugepaged: Introduce mTHP collapse support
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (10 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 11/15] khugepaged: introduce collapse_allowable_orders helper function Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-27 6:28 ` Baolin Wang
2025-10-22 18:37 ` [PATCH v12 mm-new 13/15] khugepaged: avoid unnecessary mTHP collapse attempts Nico Pache
` (3 subsequent siblings)
15 siblings, 1 reply; 77+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato
During PMD range scanning, track occupied pages in a bitmap. If mTHPs are
enabled we remove the restriction of max_ptes_none during the scan phase
to avoid missing potential mTHP candidates.
Implement collapse_scan_bitmap() to perform binary recursion on the bitmap
and determine the best eligible order for the collapse. A stack struct is
used instead of traditional recursion. The algorithm splits the bitmap
into smaller chunks to find the best fit mTHP. max_ptes_none is scaled by
the attempted collapse order to determine how "full" an order must be
before being considered for collapse.
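As a worked example of that scaling (assuming x86-64 with
HPAGE_PMD_ORDER = 9 and the default max_ptes_none of 511, which
collapse_max_ptes_none() caps at HPAGE_PMD_NR/2 - 1 = 255; eligibility is
the bits_set > (1 << order) - max_none_ptes - 1 test from
collapse_scan_bitmap()):

	order 4 (16 PTEs): max_none_ptes = 255 >> (9 - 4) = 7,
	                   eligible if more than 16 - 7 - 1 = 8 bits set
	order 6 (64 PTEs): max_none_ptes = 255 >> (9 - 6) = 31,
	                   eligible if more than 64 - 31 - 1 = 32 bits set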
Once we determine what mTHP sizes fit best in that PMD range, a collapse
is attempted. A minimum collapse order of 2 is used as this is the lowest
order supported by anon memory.
mTHP collapses reject regions containing swapped out or shared pages.
This is because adding new entries can lead to new none pages, and these
may lead to constant promotion into a higher order (m)THP. A similar
issue can occur when "max_ptes_none > HPAGE_PMD_NR/2": a collapse
introduces at least 2x the number of present pages, so a future scan will
satisfy the promotion condition once again. This issue is prevented via
the collapse_allowable_orders() function.
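To illustrate the creep with small numbers (a sketch, not tied to any
particular tunable value): if 3 of 4 PTEs are present, an order-2
collapse fills the whole 4-PTE region; the surrounding order-3 region
then has 4 of 8 present, and if more than half of a region may be none
it qualifies again; collapsing fills all 8, the order-4 region now has
8 of 16 present, and so on up to PMD order.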
Currently madv_collapse is not supported and will only attempt PMD
collapse.
We can also remove the check for is_khugepaged inside the PMD scan as
the collapse_max_ptes_none() function handles this logic now.
Signed-off-by: Nico Pache <npache@redhat.com>
---
include/linux/khugepaged.h | 2 +
mm/khugepaged.c | 128 ++++++++++++++++++++++++++++++++++---
2 files changed, 122 insertions(+), 8 deletions(-)
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index eb1946a70cff..179ce716e769 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -1,6 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_KHUGEPAGED_H
#define _LINUX_KHUGEPAGED_H
+#define KHUGEPAGED_MIN_MTHP_ORDER 2
+#define MAX_MTHP_BITMAP_STACK (1UL << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
#include <linux/mm.h>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 89a105124790..e2319bfd0065 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -93,6 +93,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
static struct kmem_cache *mm_slot_cache __ro_after_init;
+struct scan_bit_state {
+ u8 order;
+ u16 offset;
+};
+
struct collapse_control {
bool is_khugepaged;
@@ -101,6 +106,13 @@ struct collapse_control {
/* nodemask for allocation fallback */
nodemask_t alloc_nmask;
+
+ /*
+ * bitmap used to collapse mTHP sizes.
+ */
+ DECLARE_BITMAP(mthp_bitmap, HPAGE_PMD_NR);
+ DECLARE_BITMAP(mthp_bitmap_mask, HPAGE_PMD_NR);
+ struct scan_bit_state mthp_bitmap_stack[MAX_MTHP_BITMAP_STACK];
};
/**
@@ -1357,6 +1369,85 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long pmd_address,
return result;
}
+static void push_mthp_bitmap_stack(struct collapse_control *cc, int *top,
+ u8 order, u16 offset)
+{
+ cc->mthp_bitmap_stack[++*top] = (struct scan_bit_state)
+ { order, offset };
+}
+
+/*
+ * collapse_scan_bitmap() consumes the bitmap that is generated during
+ * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
+ *
+ * Each bit in the bitmap represents a single occupied (!none/zero) page.
+ * A stack structure cc->mthp_bitmap_stack is used to check different regions
+ * of the bitmap for collapse eligibility. We start at the PMD order and
+ * check if it is eligible for collapse; if not, we add two entries to the
+ * stack at a lower order to represent the left and right halves of the region.
+ *
+ * For each region, we calculate the number of set bits and compare it
+ * against a threshold derived from collapse_max_ptes_none(). A region is
+ * eligible if the number of set bits exceeds this threshold.
+ */
+static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
+ int referenced, int unmapped, struct collapse_control *cc,
+ bool *mmap_locked, unsigned long enabled_orders)
+{
+ u8 order, next_order;
+ u16 offset, mid_offset;
+ int num_chunks;
+ int bits_set, threshold_bits;
+ int top = -1;
+ int collapsed = 0;
+ int ret;
+ struct scan_bit_state state;
+ unsigned int max_none_ptes;
+
+ push_mthp_bitmap_stack(cc, &top, HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER, 0);
+
+ while (top >= 0) {
+ state = cc->mthp_bitmap_stack[top--];
+ order = state.order + KHUGEPAGED_MIN_MTHP_ORDER;
+ offset = state.offset;
+ num_chunks = 1UL << order;
+
+ /* Skip mTHP orders that are not enabled */
+ if (!test_bit(order, &enabled_orders))
+ goto next_order;
+
+ max_none_ptes = collapse_max_ptes_none(order, !cc->is_khugepaged);
+
+ /* Calculate weight of the range */
+ bitmap_zero(cc->mthp_bitmap_mask, HPAGE_PMD_NR);
+ bitmap_set(cc->mthp_bitmap_mask, offset, num_chunks);
+ bits_set = bitmap_weight_and(cc->mthp_bitmap,
+ cc->mthp_bitmap_mask, HPAGE_PMD_NR);
+
+ threshold_bits = (1UL << order) - max_none_ptes - 1;
+
+ /* Check if the region is eligible based on the threshold */
+ if (bits_set > threshold_bits) {
+ ret = collapse_huge_page(mm, address, referenced,
+ unmapped, cc, mmap_locked,
+ order, offset);
+ if (ret == SCAN_SUCCEED) {
+ collapsed += 1UL << order;
+ continue;
+ }
+ }
+
+next_order:
+ if (state.order > 0) {
+ next_order = state.order - 1;
+ mid_offset = offset + (num_chunks / 2);
+ push_mthp_bitmap_stack(cc, &top, next_order, mid_offset);
+ push_mthp_bitmap_stack(cc, &top, next_order, offset);
+ }
+ }
+ return collapsed;
+}
+
static int collapse_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long start_addr, bool *mmap_locked,
@@ -1364,11 +1455,15 @@ static int collapse_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
+ int i;
int result = SCAN_FAIL, referenced = 0;
- int none_or_zero = 0, shared = 0;
+ int none_or_zero = 0, shared = 0, nr_collapsed = 0;
struct page *page = NULL;
+ unsigned int max_ptes_none;
struct folio *folio = NULL;
unsigned long addr;
+ unsigned long enabled_orders;
+ bool full_scan = true;
spinlock_t *ptl;
int node = NUMA_NO_NODE, unmapped = 0;
@@ -1378,16 +1473,29 @@ static int collapse_scan_pmd(struct mm_struct *mm,
if (result != SCAN_SUCCEED)
goto out;
+ bitmap_zero(cc->mthp_bitmap, HPAGE_PMD_NR);
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
+
+ enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, cc->is_khugepaged);
+
+ /*
+ * If PMD is the only enabled order, enforce max_ptes_none, otherwise
+ * scan all pages to populate the bitmap for mTHP collapse.
+ */
+ if (cc->is_khugepaged && enabled_orders == _BITUL(HPAGE_PMD_ORDER))
+ full_scan = false;
+ max_ptes_none = collapse_max_ptes_none(HPAGE_PMD_ORDER, full_scan);
+
pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
if (!pte) {
result = SCAN_PMD_NULL;
goto out;
}
- for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, addr += PAGE_SIZE) {
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ _pte = pte + i;
+ addr = start_addr + i * PAGE_SIZE;
pte_t pteval = ptep_get(_pte);
if (is_swap_pte(pteval)) {
++unmapped;
@@ -1412,8 +1520,7 @@ static int collapse_scan_pmd(struct mm_struct *mm,
if (pte_none_or_zero(pteval)) {
++none_or_zero;
if (!userfaultfd_armed(vma) &&
- (!cc->is_khugepaged ||
- none_or_zero <= khugepaged_max_ptes_none)) {
+ none_or_zero <= max_ptes_none) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
@@ -1461,6 +1568,8 @@ static int collapse_scan_pmd(struct mm_struct *mm,
}
}
+ /* Set bit for occupied pages */
+ bitmap_set(cc->mthp_bitmap, i, 1);
/*
* Record which node the original page is from and save this
* information to cc->node_load[].
@@ -1517,9 +1626,12 @@ static int collapse_scan_pmd(struct mm_struct *mm,
out_unmap:
pte_unmap_unlock(pte, ptl);
if (result == SCAN_SUCCEED) {
- result = collapse_huge_page(mm, start_addr, referenced,
- unmapped, cc, mmap_locked,
- HPAGE_PMD_ORDER, 0);
+ nr_collapsed = collapse_scan_bitmap(mm, start_addr, referenced, unmapped,
+ cc, mmap_locked, enabled_orders);
+ if (nr_collapsed > 0)
+ result = SCAN_SUCCEED;
+ else
+ result = SCAN_FAIL;
}
out:
trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
--
2.51.0
^ permalink raw reply related [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 12/15] khugepaged: Introduce mTHP collapse support
2025-10-22 18:37 ` [PATCH v12 mm-new 12/15] khugepaged: Introduce mTHP collapse support Nico Pache
@ 2025-10-27 6:28 ` Baolin Wang
0 siblings, 0 replies; 77+ messages in thread
From: Baolin Wang @ 2025-10-27 6:28 UTC (permalink / raw)
To: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, lorenzo.stoakes, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato
On 2025/10/23 02:37, Nico Pache wrote:
> During PMD range scanning, track occupied pages in a bitmap. If mTHPs are
> enabled we remove the restriction of max_ptes_none during the scan phase
> to avoid missing potential mTHP candidates.
>
> Implement collapse_scan_bitmap() to perform binary recursion on the bitmap
> and determine the best eligible order for the collapse. A stack struct is
> used instead of traditional recursion. The algorithm splits the bitmap
> into smaller chunks to find the best fit mTHP. max_ptes_none is scaled by
> the attempted collapse order to determine how "full" an order must be
> before being considered for collapse.
>
> Once we determine what mTHP sizes fits best in that PMD range a collapse
> is attempted. A minimum collapse order of 2 is used as this is the lowest
> order supported by anon memory.
>
> mTHP collapses reject regions containing swapped out or shared pages.
> This is because adding new entries can lead to new none pages, and these
> may lead to constant promotion into a higher order (m)THP. A similar
> issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
> introducing at least 2x the number of pages, and on a future scan will
> satisfy the promotion condition once again. This issue is prevented via
> the collapse_allowable_orders() function.
>
> Currently madv_collapse is not supported and will only attempt PMD
> collapse.
>
> We can also remove the check for is_khugepaged inside the PMD scan as
> the collapse_max_ptes_none() function handles this logic now.
>
> Signed-off-by: Nico Pache <npache@redhat.com>
I've tested this patch, and it works as expected. (Some nits are listed
below)
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> ---
> include/linux/khugepaged.h | 2 +
> mm/khugepaged.c | 128 ++++++++++++++++++++++++++++++++++---
> 2 files changed, 122 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> index eb1946a70cff..179ce716e769 100644
> --- a/include/linux/khugepaged.h
> +++ b/include/linux/khugepaged.h
> @@ -1,6 +1,8 @@
> /* SPDX-License-Identifier: GPL-2.0 */
> #ifndef _LINUX_KHUGEPAGED_H
> #define _LINUX_KHUGEPAGED_H
> +#define KHUGEPAGED_MIN_MTHP_ORDER 2
> +#define MAX_MTHP_BITMAP_STACK (1UL << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
>
> #include <linux/mm.h>
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 89a105124790..e2319bfd0065 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -93,6 +93,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
>
> static struct kmem_cache *mm_slot_cache __ro_after_init;
>
> +struct scan_bit_state {
> + u8 order;
> + u16 offset;
> +};
> +
> struct collapse_control {
> bool is_khugepaged;
>
> @@ -101,6 +106,13 @@ struct collapse_control {
>
> /* nodemask for allocation fallback */
> nodemask_t alloc_nmask;
> +
> + /*
> + * bitmap used to collapse mTHP sizes.
> + */
> + DECLARE_BITMAP(mthp_bitmap, HPAGE_PMD_NR);
> + DECLARE_BITMAP(mthp_bitmap_mask, HPAGE_PMD_NR);
Nit: please remove the extra spaces.
> + struct scan_bit_state mthp_bitmap_stack[MAX_MTHP_BITMAP_STACK];
> };
>
> /**
> @@ -1357,6 +1369,85 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long pmd_address,
> return result;
> }
>
> +static void push_mthp_bitmap_stack(struct collapse_control *cc, int *top,
> + u8 order, u16 offset)
> +{
> + cc->mthp_bitmap_stack[++*top] = (struct scan_bit_state)
> + { order, offset };
> +}
> +
> +/*
> + * collapse_scan_bitmap() consumes the bitmap that is generated during
> + * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
> + *
> + * Each bit in the bitmap represents a single occupied (!none/zero) page.
> + * A stack structure cc->mthp_bitmap_stack is used to check different regions
> + * of the bitmap for collapse eligibility. We start at the PMD order and
> + * check if it is eligible for collapse; if not, we add two entries to the
> + * stack at a lower order to represent the left and right halves of the region.
> + *
> + * For each region, we calculate the number of set bits and compare it
> + * against a threshold derived from collapse_max_ptes_none(). A region is
> + * eligible if the number of set bits exceeds this threshold.
> + */
> +static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
> + int referenced, int unmapped, struct collapse_control *cc,
> + bool *mmap_locked, unsigned long enabled_orders)
> +{
> + u8 order, next_order;
> + u16 offset, mid_offset;
> + int num_chunks;
> + int bits_set, threshold_bits;
> + int top = -1;
> + int collapsed = 0;
> + int ret;
> + struct scan_bit_state state;
> + unsigned int max_none_ptes;
Nit: could you rearrange the order of variable definitions? Like reverse
Christmas trees.
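For reference, 'reverse Christmas tree' here means ordering the local
declarations from longest line to shortest, i.e. roughly:

	int bits_set, threshold_bits;
	struct scan_bit_state state;
	unsigned int max_none_ptes;
	u16 offset, mid_offset;
	u8 order, next_order;
	int collapsed = 0;
	int num_chunks;
	int top = -1;
	int ret;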
> +
> + push_mthp_bitmap_stack(cc, &top, HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER, 0);
> +
> + while (top >= 0) {
> + state = cc->mthp_bitmap_stack[top--];
> + order = state.order + KHUGEPAGED_MIN_MTHP_ORDER;
> + offset = state.offset;
> + num_chunks = 1UL << order;
Nit: 'num_chunks' should be 'unsigned long'.
> +
> + /* Skip mTHP orders that are not enabled */
> + if (!test_bit(order, &enabled_orders))
> + goto next_order;
> +
> + max_none_ptes = collapse_max_ptes_none(order, !cc->is_khugepaged);
> +
> + /* Calculate weight of the range */
> + bitmap_zero(cc->mthp_bitmap_mask, HPAGE_PMD_NR);
> + bitmap_set(cc->mthp_bitmap_mask, offset, num_chunks);
> + bits_set = bitmap_weight_and(cc->mthp_bitmap,
> + cc->mthp_bitmap_mask, HPAGE_PMD_NR);
> +
> + threshold_bits = (1UL << order) - max_none_ptes - 1;
> +
> + /* Check if the region is eligible based on the threshold */
> + if (bits_set > threshold_bits) {
> + ret = collapse_huge_page(mm, address, referenced,
> + unmapped, cc, mmap_locked,
> + order, offset);
> + if (ret == SCAN_SUCCEED) {
> + collapsed += 1UL << order;
> + continue;
> + }
> + }
> +
> +next_order:
> + if (state.order > 0) {
> + next_order = state.order - 1;
> + mid_offset = offset + (num_chunks / 2);
> + push_mthp_bitmap_stack(cc, &top, next_order, mid_offset);
> + push_mthp_bitmap_stack(cc, &top, next_order, offset);
> + }
> + }
> + return collapsed;
> +}
> +
> static int collapse_scan_pmd(struct mm_struct *mm,
> struct vm_area_struct *vma,
> unsigned long start_addr, bool *mmap_locked,
> @@ -1364,11 +1455,15 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> {
> pmd_t *pmd;
> pte_t *pte, *_pte;
> + int i;
> int result = SCAN_FAIL, referenced = 0;
> - int none_or_zero = 0, shared = 0;
> + int none_or_zero = 0, shared = 0, nr_collapsed = 0;
> struct page *page = NULL;
> + unsigned int max_ptes_none;
> struct folio *folio = NULL;
> unsigned long addr;
> + unsigned long enabled_orders;
> + bool full_scan = true;
> spinlock_t *ptl;
> int node = NUMA_NO_NODE, unmapped = 0;
>
> @@ -1378,16 +1473,29 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> if (result != SCAN_SUCCEED)
> goto out;
>
> + bitmap_zero(cc->mthp_bitmap, HPAGE_PMD_NR);
> memset(cc->node_load, 0, sizeof(cc->node_load));
> nodes_clear(cc->alloc_nmask);
> +
> + enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, cc->is_khugepaged);
> +
> + /*
> + * If PMD is the only enabled order, enforce max_ptes_none, otherwise
> + * scan all pages to populate the bitmap for mTHP collapse.
> + */
> + if (cc->is_khugepaged && enabled_orders == _BITUL(HPAGE_PMD_ORDER))
> + full_scan = false;
> + max_ptes_none = collapse_max_ptes_none(HPAGE_PMD_ORDER, full_scan);
> +
> pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
> if (!pte) {
> result = SCAN_PMD_NULL;
> goto out;
> }
>
> - for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> - _pte++, addr += PAGE_SIZE) {
> + for (i = 0; i < HPAGE_PMD_NR; i++) {
> + _pte = pte + i;
> + addr = start_addr + i * PAGE_SIZE;
> pte_t pteval = ptep_get(_pte);
> if (is_swap_pte(pteval)) {
> ++unmapped;
> @@ -1412,8 +1520,7 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> if (pte_none_or_zero(pteval)) {
> ++none_or_zero;
> if (!userfaultfd_armed(vma) &&
> - (!cc->is_khugepaged ||
> - none_or_zero <= khugepaged_max_ptes_none)) {
> + none_or_zero <= max_ptes_none) {
> continue;
> } else {
> result = SCAN_EXCEED_NONE_PTE;
> @@ -1461,6 +1568,8 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> }
> }
>
> + /* Set bit for occupied pages */
> + bitmap_set(cc->mthp_bitmap, i, 1);
> /*
> * Record which node the original page is from and save this
> * information to cc->node_load[].
> @@ -1517,9 +1626,12 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> out_unmap:
> pte_unmap_unlock(pte, ptl);
> if (result == SCAN_SUCCEED) {
> - result = collapse_huge_page(mm, start_addr, referenced,
> - unmapped, cc, mmap_locked,
> - HPAGE_PMD_ORDER, 0);
> + nr_collapsed = collapse_scan_bitmap(mm, start_addr, referenced, unmapped,
> + cc, mmap_locked, enabled_orders);
> + if (nr_collapsed > 0)
> + result = SCAN_SUCCEED;
> + else
> + result = SCAN_FAIL;
> }
> out:
> trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
^ permalink raw reply [flat|nested] 77+ messages in thread
* [PATCH v12 mm-new 13/15] khugepaged: avoid unnecessary mTHP collapse attempts
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (11 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 12/15] khugepaged: Introduce mTHP collapse support Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-22 18:37 ` [PATCH v12 mm-new 14/15] khugepaged: run khugepaged for all orders Nico Pache
` (2 subsequent siblings)
15 siblings, 0 replies; 77+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato
There are cases where, if an attempted collapse fails, all subsequent
orders are guaranteed to also fail. Avoid these collapse attempts by
bailing out early. For example, an allocation failure at one order may
still succeed at a lower order, whereas a failure tied to the mm or VMA
itself will fail at every order.
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 31 ++++++++++++++++++++++++++++++-
1 file changed, 30 insertions(+), 1 deletion(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e2319bfd0065..54f5c7888e46 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1431,10 +1431,39 @@ static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
ret = collapse_huge_page(mm, address, referenced,
unmapped, cc, mmap_locked,
order, offset);
- if (ret == SCAN_SUCCEED) {
+
+ /*
+ * Analyze failure reason to determine next action:
+ * - goto next_order: try smaller orders in same region
+ * - continue: try other regions at same order
+ * - break: stop all attempts (system-wide failure)
+ */
+ switch (ret) {
+ /* Cases where we should continue to the next region */
+ case SCAN_SUCCEED:
collapsed += 1UL << order;
+ fallthrough;
+ case SCAN_PTE_MAPPED_HUGEPAGE:
continue;
+ /* Cases where lower orders might still succeed */
+ case SCAN_LACK_REFERENCED_PAGE:
+ case SCAN_EXCEED_NONE_PTE:
+ case SCAN_EXCEED_SWAP_PTE:
+ case SCAN_EXCEED_SHARED_PTE:
+ case SCAN_PAGE_LOCK:
+ case SCAN_PAGE_COUNT:
+ case SCAN_PAGE_LRU:
+ case SCAN_PAGE_NULL:
+ case SCAN_DEL_PAGE_LRU:
+ case SCAN_PTE_NON_PRESENT:
+ case SCAN_PTE_UFFD_WP:
+ case SCAN_ALLOC_HUGE_PAGE_FAIL:
+ goto next_order;
+ /* All other cases should stop collapse attempts */
+ default:
+ break;
}
+ break;
}
next_order:
--
2.51.0
^ permalink raw reply related [flat|nested] 77+ messages in thread
* [PATCH v12 mm-new 14/15] khugepaged: run khugepaged for all orders
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (12 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 13/15] khugepaged: avoid unnecessary mTHP collapse attempts Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-22 18:37 ` [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse Nico Pache
2025-10-22 20:13 ` [PATCH v12 mm-new 00/15] khugepaged: mTHP support Andrew Morton
15 siblings, 0 replies; 77+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato
From: Baolin Wang <baolin.wang@linux.alibaba.com>
If any (m)THP order is enabled, we should allow khugepaged to run and
attempt scanning and collapsing mTHPs. For khugepaged to operate when
only mTHP sizes are specified in sysfs, we must modify the predicate
function that determines whether it ought to run.
This function is currently called hugepage_pmd_enabled(); this patch
renames it to hugepage_enabled() and updates the logic to determine
whether any valid orders exist that would justify khugepaged running.
We must also update collapse_allowable_orders() to check all orders if
the vma is anonymous and the collapse is khugepaged.
After this patch khugepaged mTHP collapse is fully enabled.
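For example, on x86-64 with the per-size sysfs controls, the following is
now enough to start khugepaged even though PMD-sized THP stays disabled:

	echo never  > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
	echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled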
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 25 +++++++++++++------------
1 file changed, 13 insertions(+), 12 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 54f5c7888e46..8ed9f8e2d376 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -418,23 +418,23 @@ static inline int collapse_test_exit_or_disable(struct mm_struct *mm)
mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
}
-static bool hugepage_pmd_enabled(void)
+static bool hugepage_enabled(void)
{
/*
* We cover the anon, shmem and the file-backed case here; file-backed
* hugepages, when configured in, are determined by the global control.
- * Anon pmd-sized hugepages are determined by the pmd-size control.
+ * Anon hugepages are determined by their per-size mTHP controls.
* Shmem pmd-sized hugepages are also determined by its pmd-size control,
* except when the global shmem_huge is set to SHMEM_HUGE_DENY.
*/
if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
hugepage_global_enabled())
return true;
- if (test_bit(PMD_ORDER, &huge_anon_orders_always))
+ if (READ_ONCE(huge_anon_orders_always))
return true;
- if (test_bit(PMD_ORDER, &huge_anon_orders_madvise))
+ if (READ_ONCE(huge_anon_orders_madvise))
return true;
- if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
+ if (READ_ONCE(huge_anon_orders_inherit) &&
hugepage_global_enabled())
return true;
if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
@@ -508,7 +508,8 @@ static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags, bool is_khugepaged)
{
enum tva_type tva_flags = is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
- unsigned long orders = BIT(HPAGE_PMD_ORDER);
+ unsigned long orders = is_khugepaged && vma_is_anonymous(vma) ?
+ THP_ORDERS_ALL_ANON : BIT(HPAGE_PMD_ORDER);
return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
}
@@ -517,7 +518,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
- hugepage_pmd_enabled()) {
+ hugepage_enabled()) {
if (collapse_allowable_orders(vma, vm_flags, true))
__khugepaged_enter(vma->vm_mm);
}
@@ -2791,7 +2792,7 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
static int khugepaged_has_work(void)
{
- return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled();
+ return !list_empty(&khugepaged_scan.mm_head) && hugepage_enabled();
}
static int khugepaged_wait_event(void)
@@ -2864,7 +2865,7 @@ static void khugepaged_wait_work(void)
return;
}
- if (hugepage_pmd_enabled())
+ if (hugepage_enabled())
wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}
@@ -2895,7 +2896,7 @@ static void set_recommended_min_free_kbytes(void)
int nr_zones = 0;
unsigned long recommended_min;
- if (!hugepage_pmd_enabled()) {
+ if (!hugepage_enabled()) {
calculate_min_free_kbytes();
goto update_wmarks;
}
@@ -2945,7 +2946,7 @@ int start_stop_khugepaged(void)
int err = 0;
mutex_lock(&khugepaged_mutex);
- if (hugepage_pmd_enabled()) {
+ if (hugepage_enabled()) {
if (!khugepaged_thread)
khugepaged_thread = kthread_run(khugepaged, NULL,
"khugepaged");
@@ -2971,7 +2972,7 @@ int start_stop_khugepaged(void)
void khugepaged_min_free_kbytes_update(void)
{
mutex_lock(&khugepaged_mutex);
- if (hugepage_pmd_enabled() && khugepaged_thread)
+ if (hugepage_enabled() && khugepaged_thread)
set_recommended_min_free_kbytes();
mutex_unlock(&khugepaged_mutex);
}
--
2.51.0
^ permalink raw reply related [flat|nested] 77+ messages in thread
* [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (13 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 14/15] khugepaged: run khugepaged for all orders Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-22 19:52 ` Christoph Lameter (Ampere)
2025-10-22 20:13 ` [PATCH v12 mm-new 00/15] khugepaged: mTHP support Andrew Morton
15 siblings, 1 reply; 77+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato, Bagas Sanjaya
Now that we can collapse to mTHPs, let's update the admin guide to
reflect these changes and provide proper guidance on how to utilize it.
Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
Documentation/admin-guide/mm/transhuge.rst | 53 ++++++++++++----------
1 file changed, 30 insertions(+), 23 deletions(-)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 7c71cda8aea1..2569a92fd96c 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -63,7 +63,8 @@ often.
THP can be enabled system wide or restricted to certain tasks or even
memory ranges inside task's address space. Unless THP is completely
disabled, there is ``khugepaged`` daemon that scans memory and
-collapses sequences of basic pages into PMD-sized huge pages.
+collapses sequences of basic pages into huge pages of either PMD size
+or mTHP sizes, if the system is configured to do so.
The THP behaviour is controlled via :ref:`sysfs <thp_sysfs>`
interface and using madvise(2) and prctl(2) system calls.
@@ -212,17 +213,17 @@ PMD-mappable transparent hugepage::
All THPs at fault and collapse time will be added to _deferred_list,
and will therefore be split under memory pressure if they are considered
"underused". A THP is underused if the number of zero-filled pages in
-the THP is above max_ptes_none (see below). It is possible to disable
-this behaviour by writing 0 to shrink_underused, and enable it by writing
-1 to it::
+the THP is above max_ptes_none (see below) scaled by the THP order. It is
+possible to disable this behaviour by writing 0 to shrink_underused, and enable
+it by writing 1 to it::
echo 0 > /sys/kernel/mm/transparent_hugepage/shrink_underused
echo 1 > /sys/kernel/mm/transparent_hugepage/shrink_underused
-khugepaged will be automatically started when PMD-sized THP is enabled
+khugepaged will be automatically started when any THP size is enabled
(either of the per-size anon control or the top-level control are set
to "always" or "madvise"), and it'll be automatically shutdown when
-PMD-sized THP is disabled (when both the per-size anon control and the
+all THP sizes are disabled (when both the per-size anon control and the
top-level control are "never")
process THP controls
@@ -264,11 +265,6 @@ support the following arguments::
Khugepaged controls
-------------------
-.. note::
- khugepaged currently only searches for opportunities to collapse to
- PMD-sized THP and no attempt is made to collapse to other THP
- sizes.
-
khugepaged runs usually at low frequency so while one may not want to
invoke defrag algorithms synchronously during the page faults, it
should be worth invoking defrag at least in khugepaged. However it's
@@ -296,11 +292,11 @@ allocation failure to throttle the next allocation attempt::
The khugepaged progress can be seen in the number of pages collapsed (note
that this counter may not be an exact count of the number of pages
collapsed, since "collapsed" could mean multiple things: (1) A PTE mapping
-being replaced by a PMD mapping, or (2) All 4K physical pages replaced by
-one 2M hugepage. Each may happen independently, or together, depending on
-the type of memory and the failures that occur. As such, this value should
-be interpreted roughly as a sign of progress, and counters in /proc/vmstat
-consulted for more accurate accounting)::
+being replaced by a PMD mapping, or (2) physical pages replaced by one
+hugepage of various sizes (PMD-sized or mTHP). Each may happen independently,
+or together, depending on the type of memory and the failures that occur.
+As such, this value should be interpreted roughly as a sign of progress,
+and counters in /proc/vmstat consulted for more accurate accounting)::
/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
@@ -308,16 +304,18 @@ for each pass::
/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
-``max_ptes_none`` specifies how many extra small pages (that are
-not already mapped) can be allocated when collapsing a group
-of small pages into one large page::
+``max_ptes_none`` specifies how many empty (none/zero) pages are allowed
+when collapsing a group of small pages into one large page::
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
-A higher value leads to use additional memory for programs.
-A lower value leads to gain less thp performance. Value of
-max_ptes_none can waste cpu time very little, you can
-ignore it.
+For PMD-sized THP collapse, this directly limits the number of empty pages
+allowed in the 2MB region. For mTHP collapse, the kernel might use a more
+conservative value when determining eligibility.
+
+A higher value allows more empty pages, potentially leading to more memory
+usage but better THP performance. A lower value is more conservative and
+may result in fewer THP collapses.
``max_ptes_swap`` specifies how many pages can be brought in from
swap when collapsing a group of pages into a transparent huge page::
@@ -337,6 +335,15 @@ that THP is shared. Exceeding the number would block the collapse::
A higher value may increase memory footprint for some workloads.
+.. note::
+ For mTHP collapse, khugepaged does not support collapsing regions that
+ contain shared or swapped out pages, as this could lead to continuous
+ promotion to higher orders. The collapse will fail if any shared or
+ swapped PTEs are encountered during the scan.
+
+ Currently, madvise_collapse only supports collapsing to PMD-sized THPs
+ and does not attempt mTHP collapses.
+
Boot parameters
===============
--
2.51.0
^ permalink raw reply related [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse
2025-10-22 18:37 ` [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse Nico Pache
@ 2025-10-22 19:52 ` Christoph Lameter (Ampere)
2025-10-22 20:22 ` David Hildenbrand
0 siblings, 1 reply; 77+ messages in thread
From: Christoph Lameter (Ampere) @ 2025-10-22 19:52 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, jglisse, surenb, zokeefe, hannes, rientjes, mhocko, rdunlap,
hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato,
Bagas Sanjaya
On Wed, 22 Oct 2025, Nico Pache wrote:
> + Currently, madvise_collapse only supports collapsing to PMD-sized THPs
> + and does not attempt mTHP collapses.
madvise collapse is frequently used as far as I can tell from the THP
loads being tested. Could we support madvise collapse for mTHP?
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse
2025-10-22 19:52 ` Christoph Lameter (Ampere)
@ 2025-10-22 20:22 ` David Hildenbrand
2025-10-23 8:00 ` Lorenzo Stoakes
2025-10-23 23:41 ` Christoph Lameter (Ampere)
0 siblings, 2 replies; 77+ messages in thread
From: David Hildenbrand @ 2025-10-22 20:22 UTC (permalink / raw)
To: Christoph Lameter (Ampere), Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, ziy,
baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, jglisse, surenb, zokeefe, hannes, rientjes, mhocko, rdunlap,
hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato,
Bagas Sanjaya
On 22.10.25 21:52, Christoph Lameter (Ampere) wrote:
> On Wed, 22 Oct 2025, Nico Pache wrote:
>
>> + Currently, madvise_collapse only supports collapsing to PMD-sized THPs
>> + and does not attempt mTHP collapses.
>
> madvise collapse is frequently used as far as I can tell from the THP
> loads being tested. Could we support madvise collapse for mTHP?
The big question is still how user space can communicate the desired
order, and how we can not break existing users.
So I guess there will definitely be some support to trigger collapse to
mTHP in the future, the big question is through which interface. So it
will happen after this series.
Maybe through process_madvise() where we have an additional parameter, I
think that was what people discussed in the past.
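For reference, the syscall already carries a flags argument:

	ssize_t process_madvise(int pidfd, const struct iovec *iovec,
				size_t vlen, int advice, unsigned int flags);

so one conceivable (purely hypothetical) shape would be encoding the
requested order there, say a made-up MADV_F_COLLAPSE_ORDER(4) - nothing
like that exists today.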
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse
2025-10-22 20:22 ` David Hildenbrand
@ 2025-10-23 8:00 ` Lorenzo Stoakes
2025-10-23 8:44 ` Pedro Falcato
2025-10-23 23:41 ` Christoph Lameter (Ampere)
1 sibling, 1 reply; 77+ messages in thread
From: Lorenzo Stoakes @ 2025-10-23 8:00 UTC (permalink / raw)
To: David Hildenbrand
Cc: Christoph Lameter (Ampere), Nico Pache, linux-kernel,
linux-trace-kernel, linux-mm, linux-doc, ziy, baolin.wang,
Liam.Howlett, ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato, Bagas Sanjaya
On Wed, Oct 22, 2025 at 10:22:08PM +0200, David Hildenbrand wrote:
> On 22.10.25 21:52, Christoph Lameter (Ampere) wrote:
> > On Wed, 22 Oct 2025, Nico Pache wrote:
> >
> > > + Currently, madvise_collapse only supports collapsing to PMD-sized THPs
> > > + and does not attempt mTHP collapses.
> >
> > madvise collapse is frequently used as far as I can tell from the THP
> > loads being tested. Could we support madvise collapse for mTHP?
>
> The big question is still how user space can communicate the desired order,
> and how we can not break existing users.
Yes, and let's go one step at a time, this series still needs careful scrutiny
and we need to ensure the _fundamentals_ are in place for khugepaged before we
get into MADV_COLLAPSE :)
>
> So I guess there will definitely be some support to trigger collapse to mTHP
> in the future, the big question is through which interface. So it will
> happen after this series.
Yes.
>
> Maybe through process_madvise() where we have an additional parameter, I
> think that was what people discussed in the past.
I wouldn't absolutely love us doing that, given it is a general parameter so
would seem applicable to any madvise() option and could lead to confusion, also
process_madvise() was originally for cross-process madvise vector operations.
I expanded this to make it applicable to the current process (and introduced
PIDFD_SELF to make that more sane), and SJ has optimised it across vector
operations (thanks SJ! :), but in general - it seems very weird to have
madvise() provide an operation that process_madvise() provides another
version of with an extra parameter.
As usual we've painted ourselves into a corner with an API... :)
Perhaps we'll have to accept the process_madvise() compromise and add
MADV_COLLAPSE_MTHP that only works with it or something.
Of course adding a new syscall isn't impossible... madvise2() not very appealing
however...
TL;DR I guess we'll deal with that when we come to it :)
>
> --
> Cheers
>
> David / dhildenb
>
Cheers, Lorenzo
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse
2025-10-23 8:00 ` Lorenzo Stoakes
@ 2025-10-23 8:44 ` Pedro Falcato
2025-10-24 13:54 ` Zach O'Keefe
0 siblings, 1 reply; 77+ messages in thread
From: Pedro Falcato @ 2025-10-23 8:44 UTC (permalink / raw)
To: Lorenzo Stoakes, David Hildenbrand
Cc: Christoph Lameter (Ampere), Nico Pache, linux-kernel,
linux-trace-kernel, linux-mm, linux-doc, ziy, baolin.wang,
Liam.Howlett, ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas, tiwai,
will, dave.hansen, jack, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, Bagas Sanjaya
On Thu, Oct 23, 2025 at 09:00:10AM +0100, Lorenzo Stoakes wrote:
> On Wed, Oct 22, 2025 at 10:22:08PM +0200, David Hildenbrand wrote:
> > On 22.10.25 21:52, Christoph Lameter (Ampere) wrote:
> > > On Wed, 22 Oct 2025, Nico Pache wrote:
> > >
> > > > Currently, madvise_collapse only supports collapsing to PMD-sized THPs
> > > > and does not attempt mTHP collapses.
> > >
> > > madvise collapse is frequently used as far as I can tell from the THP
> > > loads being tested. Could we support madvise collapse for mTHP?
> >
> > The big question is still how user space can communicate the desired order,
> > and how we can avoid breaking existing users.
>
Do we want to let userspace communicate order? It seems like an extremely
specific thing to do. A simpler & saner semantic could be something like:
"MADV_COLLAPSE collapses a given [addr, addr+len] range into the highest
order THP it can/thinks it should." The implementation details of PMD or
contpte or <...> are lost by the time we get to userspace.
The man page itself is written pretty vaguely, which leaves us free to do
whatever we want. Letting userspace create arbitrary-order mTHPs sounds like
another Pandora's box we shouldn't open.
> Yes, and let's go one step at a time: this series still needs careful scrutiny,
> and we need to ensure the _fundamentals_ are in place for khugepaged before we
> get into MADV_COLLAPSE :)
>
> >
> > So I guess there will definitely be some support to trigger collapse to mTHP
> > in the future; the big question is through which interface. So it will
> > happen after this series.
>
> Yes.
>
> >
> > Maybe through process_madvise(), where we have an additional parameter; I
> > think that was what people discussed in the past.
>
> I wouldn't love us doing that: since it is a general parameter it would seem
> applicable to any madvise() option and could lead to confusion; also,
> process_madvise() was originally for cross-process madvise vector operations.
For what it's worth, it would probably not be too hard to devise a generic
separation there between "generic flags" and "behavior-specific flags",
and then stuff the desired THP order into MADV_COLLAPSE-specific flags.
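(As a sketch only of that separation; these macros are invented for
illustration, not proposed code:)

/* Low bits: generic flags, with the same meaning for every advice. */
#define MADV_F_GENERIC_MASK	0x0000ffffU
/* High bits: behavior-specific, interpreted per advice value. */
#define MADV_F_ADVICE_SHIFT	16
#define MADV_F_ADVICE_MASK	0xffff0000U

/* MADV_COLLAPSE-specific: encode the desired THP order in the high bits. */
#define MADV_F_COLLAPSE_ORDER(order) \
	(((unsigned int)(order) << MADV_F_ADVICE_SHIFT) & MADV_F_ADVICE_MASK)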
>
> I expanded this to make it applicable to the current process (and introduced
> PIDFD_SELF to make that more sane), and SJ has optimised it across vector
> operations (thanks SJ! :), but in general it seems very weird to have
> madvise() provide an operation that process_madvise() provides another
> version of, with an extra parameter.
>
> As usual we've painted ourselves into a corner with an API... :)
But yes, I agree it would feel weird.
>
> Perhaps we'll have to accept the process_madvise() compromise and add
> MADV_COLLAPSE_MTHP that only works with it, or something.
>
> Of course adding a new syscall isn't impossible... madvise2() not very appealing
> however...
It is my impression that process_madvise() is already madvise2(), but
poorly named.
>
> TL;DR I guess we'll deal with that when we come to it :)
Amen :)
--
Pedro
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse
2025-10-23 8:44 ` Pedro Falcato
@ 2025-10-24 13:54 ` Zach O'Keefe
0 siblings, 0 replies; 77+ messages in thread
From: Zach O'Keefe @ 2025-10-24 13:54 UTC (permalink / raw)
To: Pedro Falcato
Cc: Lorenzo Stoakes, David Hildenbrand, Christoph Lameter (Ampere),
Nico Pache, linux-kernel, linux-trace-kernel, linux-mm, linux-doc,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, jglisse, surenb,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
lance.yang, vbabka, rppt, jannh, Bagas Sanjaya
On Thu, Oct 23, 2025 at 1:44 AM Pedro Falcato <pfalcato@suse.de> wrote:
>
> On Thu, Oct 23, 2025 at 09:00:10AM +0100, Lorenzo Stoakes wrote:
> > On Wed, Oct 22, 2025 at 10:22:08PM +0200, David Hildenbrand wrote:
> > > On 22.10.25 21:52, Christoph Lameter (Ampere) wrote:
> > > > On Wed, 22 Oct 2025, Nico Pache wrote:
> > > >
> > > > > Currently, madvise_collapse only supports collapsing to PMD-sized THPs
> > > > > and does not attempt mTHP collapses.
> > > >
> > > > madvise collapse is frequently used as far as I can tell from the THP
> > > > loads being tested. Could we support madvise collapse for mTHP?
> > >
> > > The big question is still how user space can communicate the desired order,
> > > and how we can avoid breaking existing users.
> >
>
> Do we want to let userspace communicate order? It seems like an extremely
> specific thing to do. A simpler & saner semantic could be something like:
> "MADV_COLLAPSE collapses a given [addr, addr+len] range into the highest
> order THP it can/thinks it should." The implementation details of PMD or
> contpte or <...> are lost by the time we get to userspace.
>
> The man page itself is written pretty vaguely, which leaves us free to do
> whatever we want. Letting userspace create arbitrary-order mTHPs sounds like
> another Pandora's box we shouldn't open.
>
> > Yes, and let's go one step at a time: this series still needs careful scrutiny,
> > and we need to ensure the _fundamentals_ are in place for khugepaged before we
> > get into MADV_COLLAPSE :)
> >
> > >
> > > So I guess there will definitely be some support to trigger collapse to mTHP
> > > in the future; the big question is through which interface. So it will
> > > happen after this series.
> >
> > Yes.
> >
> > >
> > > Maybe through process_madvise(), where we have an additional parameter; I
> > > think that was what people discussed in the past.
> >
> > I wouldn't love us doing that: since it is a general parameter it would seem
> > applicable to any madvise() option and could lead to confusion; also,
> > process_madvise() was originally for cross-process madvise vector operations.
>
> For what it's worth, it would probably not be too hard to devise a generic
> separation there between "generic flags" and "behavior-specific flags",
> and then stuff the desired THP order into MADV_COLLAPSE-specific flags.
Yeah, this is how I envisioned the flags being leveraged: reserve some
number of bits for generic use, and overload the others for
advice-specific meanings. I suspect that once the seal is broken on this,
more advice-specific flags will promptly follow.
> >
> > I expanded this to make it applicable to the current process (and introduced
> > PIDFD_SELF to make that more sane), and SJ has optimised it across vector
> > operations (thanks SJ! :), but in general it seems very weird to have
> > madvise() provide an operation that process_madvise() provides another
> > version of, with an extra parameter.
> >
> > As usual we've painted ourselves into a corner with an API... :)
>
> But yes, I agree it would feel weird.
>
> >
> > Perhaps we'll have to accept the process_madvise() compromise and add
> > MADV_COLLAPSE_MTHP that only works with it, or something.
> >
> > Of course adding a new syscall isn't impossible... madvise2() not very appealing
> > however...
>
> It is my impression that process_madvise() is already madvise2(), but
> poorly named.
+1
> >
> > TL;DR I guess we'll deal with that when we come to it :)
>
> Amen :)
>
> --
> Pedro
^ permalink raw reply [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse
2025-10-22 20:22 ` David Hildenbrand
2025-10-23 8:00 ` Lorenzo Stoakes
@ 2025-10-23 23:41 ` Christoph Lameter (Ampere)
1 sibling, 0 replies; 77+ messages in thread
From: Christoph Lameter (Ampere) @ 2025-10-23 23:41 UTC (permalink / raw)
To: David Hildenbrand
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm, linux-doc,
ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, jglisse, surenb, zokeefe, hannes, rientjes, mhocko, rdunlap,
hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh, pfalcato,
Bagas Sanjaya
On Wed, 22 Oct 2025, David Hildenbrand wrote:
> The big question is still how user space can communicate the desired order,
> and how we can avoid breaking existing users.
>
> So I guess there will definitely be some support to trigger collapse to mTHP
> in the future; the big question is through which interface. So it will happen
> after this series.
Well, we already have the possibility of a memory policy for each VMA, and
we can set memory policies for arbitrary memory ranges, as well as per
process, through the existing APIs from user space.
Extending the memory policies with a parameter that allows setting a
preferred order would let us use these mechanisms.
Memory policies can already be used to control NUMA balancing and
migration; the ability to specify page sizes is similar, I think.
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 8fbbe613611a..429117bbd2f4 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -31,6 +31,7 @@ enum {
#define MPOL_F_STATIC_NODES (1 << 15)
#define MPOL_F_RELATIVE_NODES (1 << 14)
#define MPOL_F_NUMA_BALANCING (1 << 13) /* Optimize with NUMA balancing if possible */
+#define MPOL_F_PAGE_ORDER (1 << 12)
/*
* MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
@@ -56,6 +57,9 @@ enum {
MPOL_MF_MOVE | \
MPOL_MF_MOVE_ALL)
+#define MPOL_MF_PAGE_ORDER (1<<5) /* Set preferred page order */
+
+
/*
* Internal flags that share the struct mempolicy flags word with
* "mode flags". These flags are allocated from bit 0 up, as they
^ permalink raw reply related [flat|nested] 77+ messages in thread
* Re: [PATCH v12 mm-new 00/15] khugepaged: mTHP support
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (14 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse Nico Pache
@ 2025-10-22 20:13 ` Andrew Morton
15 siblings, 0 replies; 77+ messages in thread
From: Andrew Morton @ 2025-10-22 20:13 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david, ziy,
baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, baohua,
willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, 22 Oct 2025 12:37:02 -0600 Nico Pache <npache@redhat.com> wrote:
> The following series provides khugepaged with the capability to collapse
> anonymous memory regions to mTHPs.
I added this to mm.git's mm-new branch, thanks.
I suppressed the 500 added-to-mm emails.
^ permalink raw reply [flat|nested] 77+ messages in thread