linux-kernel.vger.kernel.org archive mirror
* [PATCH v10 12/13] khugepaged: add per-order mTHP khugepaged stats
@ 2025-08-19 13:48 Nico Pache
  2025-08-19 13:48 ` [PATCH v10 13/13] Documentation: mm: update the admin guide for mTHP collapse Nico Pache
  2025-08-19 14:23 ` [PATCH v10 12/13] khugepaged: add per-order mTHP khugepaged stats Nico Pache
  0 siblings, 2 replies; 6+ messages in thread
From: Nico Pache @ 2025-08-19 13:48 UTC (permalink / raw)
  To: linux-mm, linux-doc, linux-kernel, linux-trace-kernel
  Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
	ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
	mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
	usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
	kirill.shutemov, aarcange, raquini, anshuman.khandual,
	catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
	surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd

With mTHP support in place, let's add per-order mTHP stats for
exceeding NONE, SWAP, and SHARED.

Signed-off-by: Nico Pache <npache@redhat.com>
---
 Documentation/admin-guide/mm/transhuge.rst | 17 +++++++++++++++++
 include/linux/huge_mm.h                    |  3 +++
 mm/huge_memory.c                           |  7 +++++++
 mm/khugepaged.c                            | 16 +++++++++++++---
 4 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 7ccb93e22852..b85547ac4fe9 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -705,6 +705,23 @@ nr_anon_partially_mapped
        an anonymous THP as "partially mapped" and count it here, even though it
        is not actually partially mapped anymore.
 
+collapse_exceed_swap_pte
+       The number of anonymous THP which contain at least one swap PTE.
+       Currently khugepaged does not support collapsing mTHP regions that
+       contain a swap PTE.
+
+collapse_exceed_none_pte
+       The number of anonymous THP which have exceeded the none PTE threshold.
+       With mTHP collapse, a bitmap is used to gather the state of a PMD region
+       and is then recursively checked from largest to smallest order against
+       the scaled max_ptes_none count. This counter indicates that the next
+       enabled order will be checked.
+
+collapse_exceed_shared_pte
+       The number of anonymous THP which contain at least one shared PTE.
+       Currently khugepaged does not support collapsing mTHP regions that
+       contain a shared PTE.
+
 As the system ages, allocating huge pages may be expensive as the
 system uses memory compaction to copy data around memory to free a
 huge page for use. There are some counters in ``/proc/vmstat`` to help
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 4ada5d1f7297..6f1593d0b4b5 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -144,6 +144,9 @@ enum mthp_stat_item {
 	MTHP_STAT_SPLIT_DEFERRED,
 	MTHP_STAT_NR_ANON,
 	MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
+	MTHP_STAT_COLLAPSE_EXCEED_SWAP,
+	MTHP_STAT_COLLAPSE_EXCEED_NONE,
+	MTHP_STAT_COLLAPSE_EXCEED_SHARED,
 	__MTHP_STAT_COUNT
 };
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 20d005c2c61f..9f0470c3e983 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -639,6 +639,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
 DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
 DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
 DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
+
 
 static struct attribute *anon_stats_attrs[] = {
 	&anon_fault_alloc_attr.attr,
@@ -655,6 +659,9 @@ static struct attribute *anon_stats_attrs[] = {
 	&split_deferred_attr.attr,
 	&nr_anon_attr.attr,
 	&nr_anon_partially_mapped_attr.attr,
+	&collapse_exceed_swap_pte_attr.attr,
+	&collapse_exceed_none_pte_attr.attr,
+	&collapse_exceed_shared_pte_attr.attr,
 	NULL,
 };
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c13bc583a368..5a3386043f39 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -594,7 +594,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 				continue;
 			} else {
 				result = SCAN_EXCEED_NONE_PTE;
-				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+				if (order == HPAGE_PMD_ORDER)
+					count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
 				goto out;
 			}
 		}
@@ -633,10 +635,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			 * shared may cause a future higher order collapse on a
 			 * rescan of the same range.
 			 */
-			if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
-			    shared > khugepaged_max_ptes_shared)) {
+			if (order != HPAGE_PMD_ORDER) {
+				result = SCAN_EXCEED_SHARED_PTE;
+				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
+				goto out;
+			}
+
+			if (cc->is_khugepaged &&
+			    shared > khugepaged_max_ptes_shared) {
 				result = SCAN_EXCEED_SHARED_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
 				goto out;
 			}
 		}
@@ -1084,6 +1093,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
 		 * range.
 		 */
 		if (order != HPAGE_PMD_ORDER) {
+			count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
 			pte_unmap(pte);
 			mmap_read_unlock(mm);
 			result = SCAN_EXCEED_SWAP_PTE;
-- 
2.50.1
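
A note for anyone trying the new counters out: they are plain
DEFINE_MTHP_STAT_ATTR attributes, so they appear in the existing per-size
directories under /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats/.
The small userspace program below is only an illustration of reading them and
is not part of the series; the only things it assumes are that directory
layout and the counter name introduced by the patch.

/*
 * Illustration only: dump collapse_exceed_none_pte for every mTHP size,
 * assuming the standard per-size mTHP stats layout in sysfs.
 */
#include <glob.h>
#include <stdio.h>

int main(void)
{
	glob_t g;
	size_t i;

	if (glob("/sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/collapse_exceed_none_pte",
		 0, NULL, &g))
		return 1;

	for (i = 0; i < g.gl_pathc; i++) {
		FILE *f = fopen(g.gl_pathv[i], "r");
		unsigned long long val;

		if (!f)
			continue;
		if (fscanf(f, "%llu", &val) == 1)
			printf("%s: %llu\n", g.gl_pathv[i], val);
		fclose(f);
	}
	globfree(&g);
	return 0;
}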



* [PATCH v10 13/13] Documentation: mm: update the admin guide for mTHP collapse
  2025-08-19 13:48 [PATCH v10 12/13] khugepaged: add per-order mTHP khugepaged stats Nico Pache
@ 2025-08-19 13:48 ` Nico Pache
  2025-08-19 14:24   ` Nico Pache
  2025-08-19 14:23 ` [PATCH v10 12/13] khugepaged: add per-order mTHP khugepaged stats Nico Pache
  1 sibling, 1 reply; 6+ messages in thread
From: Nico Pache @ 2025-08-19 13:48 UTC (permalink / raw)
  To: linux-mm, linux-doc, linux-kernel, linux-trace-kernel
  Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
	ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
	mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
	usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
	kirill.shutemov, aarcange, raquini, anshuman.khandual,
	catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
	surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
	Bagas Sanjaya

Now that we can collapse to mTHPs, let's update the admin guide to
reflect these changes and provide proper guidance on how to utilize it.

Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
 Documentation/admin-guide/mm/transhuge.rst | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index b85547ac4fe9..1f9e6a32052c 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -63,7 +63,7 @@ often.
 THP can be enabled system wide or restricted to certain tasks or even
 memory ranges inside task's address space. Unless THP is completely
 disabled, there is ``khugepaged`` daemon that scans memory and
-collapses sequences of basic pages into PMD-sized huge pages.
+collapses sequences of basic pages into huge pages.
 
 The THP behaviour is controlled via :ref:`sysfs <thp_sysfs>`
 interface and using madvise(2) and prctl(2) system calls.
@@ -149,6 +149,18 @@ hugepage sizes have enabled="never". If enabling multiple hugepage
 sizes, the kernel will select the most appropriate enabled size for a
 given allocation.
 
+khugepaged uses max_ptes_none scaled to the order of the enabled mTHP size
+to determine collapses. When using mTHPs it's recommended to set
+max_ptes_none low-- ideally less than HPAGE_PMD_NR / 2 (255 on 4k page
+size). This will prevent undesired "creep" behavior that leads to
+continuously collapsing to the largest mTHP size; when we collapse, we are
+bringing in new non-zero pages that will, on a subsequent scan, cause the
+max_ptes_none check of the +1 order to always be satisfied. By limiting
+this to less than half the current order, we make sure we don't cause this
+feedback loop. max_ptes_shared and max_ptes_swap have no effect when
+collapsing to a mTHP, and mTHP collapse will fail on shared or swapped out
+pages.
+
 It's also possible to limit defrag efforts in the VM to generate
 anonymous hugepages in case they're not immediately free to madvise
 regions or to never try to defrag memory and simply fallback to regular
@@ -264,11 +276,6 @@ support the following arguments::
 Khugepaged controls
 -------------------
 
-.. note::
-   khugepaged currently only searches for opportunities to collapse to
-   PMD-sized THP and no attempt is made to collapse to other THP
-   sizes.
-
 khugepaged runs usually at low frequency so while one may not want to
 invoke defrag algorithms synchronously during the page faults, it
 should be worth invoking defrag at least in khugepaged. However it's
-- 
2.50.1
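
The "creep" recommendation above is easier to see with concrete numbers. The
sketch below is illustrative only: it assumes the threshold is scaled to
smaller orders by a right shift (the same allowed-empty fraction of each
window), which is one plausible reading of "max_ptes_none scaled to the
order"; the series itself defines the exact scaling.

/*
 * Illustration only, not from the patch.  After an order-k collapse the
 * containing order-(k+1) window holds at most 2^k none PTEs, so if the
 * scaled threshold at order k+1 is >= 2^k, which happens whenever
 * max_ptes_none >= HPAGE_PMD_NR / 2, the next scan always collapses
 * again: the feedback loop described above.  Values below
 * HPAGE_PMD_NR / 2 break the loop.
 */
#include <stdio.h>

#define HPAGE_PMD_ORDER	9			/* 4K base pages, 2M PMD */
#define HPAGE_PMD_NR	(1 << HPAGE_PMD_ORDER)

static void print_scaled(unsigned int max_ptes_none)
{
	int order;

	printf("max_ptes_none = %u\n", max_ptes_none);
	for (order = HPAGE_PMD_ORDER; order >= 2; order--) {
		unsigned int nr = 1u << order;
		unsigned int scaled = max_ptes_none >> (HPAGE_PMD_ORDER - order);

		printf("  order %d (%4u PTEs): up to %3u none PTEs (%2.0f%%)\n",
		       order, nr, scaled, 100.0 * scaled / nr);
	}
}

int main(void)
{
	print_scaled(511);	/* the default: nearly the whole window may be empty */
	print_scaled(255);	/* recommended: below HPAGE_PMD_NR / 2 */
	return 0;
}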



* [PATCH v10 12/13] khugepaged: add per-order mTHP khugepaged stats
  2025-08-19 13:41 [PATCH v10 00/13] khugepaged: mTHP support Nico Pache
@ 2025-08-19 14:16 ` Nico Pache
  2025-08-21 14:47   ` Lorenzo Stoakes
  0 siblings, 1 reply; 6+ messages in thread
From: Nico Pache @ 2025-08-19 14:16 UTC (permalink / raw)
  To: linux-mm, linux-doc, linux-kernel, linux-trace-kernel
  Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
	ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
	mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
	usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
	kirill.shutemov, aarcange, raquini, anshuman.khandual,
	catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
	surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd

With mTHP support in place, let's add per-order mTHP stats for
exceeding NONE, SWAP, and SHARED.

Signed-off-by: Nico Pache <npache@redhat.com>
---
 Documentation/admin-guide/mm/transhuge.rst | 17 +++++++++++++++++
 include/linux/huge_mm.h                    |  3 +++
 mm/huge_memory.c                           |  7 +++++++
 mm/khugepaged.c                            | 16 +++++++++++++---
 4 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 7ccb93e22852..b85547ac4fe9 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -705,6 +705,23 @@ nr_anon_partially_mapped
        an anonymous THP as "partially mapped" and count it here, even though it
        is not actually partially mapped anymore.
 
+collapse_exceed_swap_pte
+       The number of anonymous THP which contain at least one swap PTE.
+       Currently khugepaged does not support collapsing mTHP regions that
+       contain a swap PTE.
+
+collapse_exceed_none_pte
+       The number of anonymous THP which have exceeded the none PTE threshold.
+       With mTHP collapse, a bitmap is used to gather the state of a PMD region
+       and is then recursively checked from largest to smallest order against
+       the scaled max_ptes_none count. This counter indicates that the next
+       enabled order will be checked.
+
+collapse_exceed_shared_pte
+       The number of anonymous THP which contain at least one shared PTE.
+       Currently khugepaged does not support collapsing mTHP regions that
+       contain a shared PTE.
+
 As the system ages, allocating huge pages may be expensive as the
 system uses memory compaction to copy data around memory to free a
 huge page for use. There are some counters in ``/proc/vmstat`` to help
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 4ada5d1f7297..6f1593d0b4b5 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -144,6 +144,9 @@ enum mthp_stat_item {
 	MTHP_STAT_SPLIT_DEFERRED,
 	MTHP_STAT_NR_ANON,
 	MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
+	MTHP_STAT_COLLAPSE_EXCEED_SWAP,
+	MTHP_STAT_COLLAPSE_EXCEED_NONE,
+	MTHP_STAT_COLLAPSE_EXCEED_SHARED,
 	__MTHP_STAT_COUNT
 };
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 20d005c2c61f..9f0470c3e983 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -639,6 +639,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
 DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
 DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
 DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
+
 
 static struct attribute *anon_stats_attrs[] = {
 	&anon_fault_alloc_attr.attr,
@@ -655,6 +659,9 @@ static struct attribute *anon_stats_attrs[] = {
 	&split_deferred_attr.attr,
 	&nr_anon_attr.attr,
 	&nr_anon_partially_mapped_attr.attr,
+	&collapse_exceed_swap_pte_attr.attr,
+	&collapse_exceed_none_pte_attr.attr,
+	&collapse_exceed_shared_pte_attr.attr,
 	NULL,
 };
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c13bc583a368..5a3386043f39 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -594,7 +594,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 				continue;
 			} else {
 				result = SCAN_EXCEED_NONE_PTE;
-				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+				if (order == HPAGE_PMD_ORDER)
+					count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
 				goto out;
 			}
 		}
@@ -633,10 +635,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			 * shared may cause a future higher order collapse on a
 			 * rescan of the same range.
 			 */
-			if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
-			    shared > khugepaged_max_ptes_shared)) {
+			if (order != HPAGE_PMD_ORDER) {
+				result = SCAN_EXCEED_SHARED_PTE;
+				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
+				goto out;
+			}
+
+			if (cc->is_khugepaged &&
+			    shared > khugepaged_max_ptes_shared) {
 				result = SCAN_EXCEED_SHARED_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
 				goto out;
 			}
 		}
@@ -1084,6 +1093,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
 		 * range.
 		 */
 		if (order != HPAGE_PMD_ORDER) {
+			count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
 			pte_unmap(pte);
 			mmap_read_unlock(mm);
 			result = SCAN_EXCEED_SWAP_PTE;
-- 
2.50.1
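
For the collapse_exceed_none_pte description above, a toy model of the
largest-to-smallest order check may help. It is not the kernel
implementation: the bitmap is reduced to a simple present[] array and the
shift-based scaling is the same assumption as in the earlier sketch.

/*
 * Toy model: given the populated/none state of one PMD range, walk the
 * aligned windows around a PTE from the largest order down and return
 * the first order whose none-PTE count fits the threshold scaled to
 * that order.
 */
#include <stdbool.h>
#include <stdio.h>

#define PMD_ORDER	9
#define PMD_NR		(1 << PMD_ORDER)

static int best_collapse_order(const bool present[PMD_NR], unsigned int pte,
			       unsigned int max_ptes_none)
{
	int order;

	for (order = PMD_ORDER; order >= 2; order--) {
		unsigned int nr = 1u << order;
		unsigned int start = pte & ~(nr - 1);	/* window aligned to this order */
		unsigned int scaled = max_ptes_none >> (PMD_ORDER - order);
		unsigned int none = 0, i;

		for (i = start; i < start + nr; i++)
			none += !present[i];
		if (none <= scaled)
			return order;	/* this order passes the none check */
		/* otherwise "exceed none" at this order; try the next one down */
	}
	return -1;
}

int main(void)
{
	static bool present[PMD_NR];
	unsigned int i;

	for (i = 0; i < 96; i++)	/* pretend the first 96 PTEs are populated */
		present[i] = true;

	printf("order that passes with max_ptes_none=255: %d\n",
	       best_collapse_order(present, 0, 255));
	return 0;
}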



* Re: [PATCH v10 12/13] khugepaged: add per-order mTHP khugepaged stats
  2025-08-19 13:48 [PATCH v10 12/13] khugepaged: add per-order mTHP khugepaged stats Nico Pache
  2025-08-19 13:48 ` [PATCH v10 13/13] Documentation: mm: update the admin guide for mTHP collapse Nico Pache
@ 2025-08-19 14:23 ` Nico Pache
  1 sibling, 0 replies; 6+ messages in thread
From: Nico Pache @ 2025-08-19 14:23 UTC (permalink / raw)
  To: linux-mm, linux-doc, linux-kernel, linux-trace-kernel
  Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
	ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
	mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
	usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
	kirill.shutemov, aarcange, raquini, anshuman.khandual,
	catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
	surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd

On Tue, Aug 19, 2025 at 7:49 AM Nico Pache <npache@redhat.com> wrote:
>
> With mTHP support in place, let's add per-order mTHP stats for
> exceeding NONE, SWAP, and SHARED.
>
> Signed-off-by: Nico Pache <npache@redhat.com>

I had a git send-email error and had to resend this patch (12) and patch
13, but I forgot the in-reply-to.
Please ignore this one and reply to the correct version:

https://lore.kernel.org/lkml/20250819141610.626140-1-npache@redhat.com/
--in-reply-to=20250819141610.626140-1-npache@redhat.com


-- Nico

> ---
>  Documentation/admin-guide/mm/transhuge.rst | 17 +++++++++++++++++
>  include/linux/huge_mm.h                    |  3 +++
>  mm/huge_memory.c                           |  7 +++++++
>  mm/khugepaged.c                            | 16 +++++++++++++---
>  4 files changed, 40 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index 7ccb93e22852..b85547ac4fe9 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -705,6 +705,23 @@ nr_anon_partially_mapped
>         an anonymous THP as "partially mapped" and count it here, even though it
>         is not actually partially mapped anymore.
>
> +collapse_exceed_swap_pte
> +       The number of anonymous THP which contain at least one swap PTE.
> +       Currently khugepaged does not support collapsing mTHP regions that
> +       contain a swap PTE.
> +
> +collapse_exceed_none_pte
> +       The number of anonymous THP which have exceeded the none PTE threshold.
> +       With mTHP collapse, a bitmap is used to gather the state of a PMD region
> +       and is then recursively checked from largest to smallest order against
> +       the scaled max_ptes_none count. This counter indicates that the next
> +       enabled order will be checked.
> +
> +collapse_exceed_shared_pte
> +       The number of anonymous THP which contain at least one shared PTE.
> +       Currently khugepaged does not support collapsing mTHP regions that
> +       contain a shared PTE.
> +
>  As the system ages, allocating huge pages may be expensive as the
>  system uses memory compaction to copy data around memory to free a
>  huge page for use. There are some counters in ``/proc/vmstat`` to help
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 4ada5d1f7297..6f1593d0b4b5 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -144,6 +144,9 @@ enum mthp_stat_item {
>         MTHP_STAT_SPLIT_DEFERRED,
>         MTHP_STAT_NR_ANON,
>         MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
> +       MTHP_STAT_COLLAPSE_EXCEED_SWAP,
> +       MTHP_STAT_COLLAPSE_EXCEED_NONE,
> +       MTHP_STAT_COLLAPSE_EXCEED_SHARED,
>         __MTHP_STAT_COUNT
>  };
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 20d005c2c61f..9f0470c3e983 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -639,6 +639,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
>  DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
>  DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
>  DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> +
>
>  static struct attribute *anon_stats_attrs[] = {
>         &anon_fault_alloc_attr.attr,
> @@ -655,6 +659,9 @@ static struct attribute *anon_stats_attrs[] = {
>         &split_deferred_attr.attr,
>         &nr_anon_attr.attr,
>         &nr_anon_partially_mapped_attr.attr,
> +       &collapse_exceed_swap_pte_attr.attr,
> +       &collapse_exceed_none_pte_attr.attr,
> +       &collapse_exceed_shared_pte_attr.attr,
>         NULL,
>  };
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index c13bc583a368..5a3386043f39 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -594,7 +594,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                                 continue;
>                         } else {
>                                 result = SCAN_EXCEED_NONE_PTE;
> -                               count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> +                               if (order == HPAGE_PMD_ORDER)
> +                                       count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> +                               count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
>                                 goto out;
>                         }
>                 }
> @@ -633,10 +635,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                          * shared may cause a future higher order collapse on a
>                          * rescan of the same range.
>                          */
> -                       if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
> -                           shared > khugepaged_max_ptes_shared)) {
> +                       if (order != HPAGE_PMD_ORDER) {
> +                               result = SCAN_EXCEED_SHARED_PTE;
> +                               count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> +                               goto out;
> +                       }
> +
> +                       if (cc->is_khugepaged &&
> +                           shared > khugepaged_max_ptes_shared) {
>                                 result = SCAN_EXCEED_SHARED_PTE;
>                                 count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> +                               count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
>                                 goto out;
>                         }
>                 }
> @@ -1084,6 +1093,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
>                  * range.
>                  */
>                 if (order != HPAGE_PMD_ORDER) {
> +                       count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
>                         pte_unmap(pte);
>                         mmap_read_unlock(mm);
>                         result = SCAN_EXCEED_SWAP_PTE;
> --
> 2.50.1
>



* Re: [PATCH v10 13/13] Documentation: mm: update the admin guide for mTHP collapse
  2025-08-19 13:48 ` [PATCH v10 13/13] Documentation: mm: update the admin guide for mTHP collapse Nico Pache
@ 2025-08-19 14:24   ` Nico Pache
  0 siblings, 0 replies; 6+ messages in thread
From: Nico Pache @ 2025-08-19 14:24 UTC (permalink / raw)
  To: linux-mm, linux-doc, linux-kernel, linux-trace-kernel
  Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
	ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
	mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
	usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
	kirill.shutemov, aarcange, raquini, anshuman.khandual,
	catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
	surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
	Bagas Sanjaya

On Tue, Aug 19, 2025 at 7:49 AM Nico Pache <npache@redhat.com> wrote:
>
> Now that we can collapse to mTHPs, let's update the admin guide to
> reflect these changes and provide proper guidance on how to utilize it.
>
> Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
I had a git send-email error and had to resend this patch (13), but I
forgot the in-reply-to.
Please ignore this one and reply to the correct version:

https://lore.kernel.org/lkml/20250819141742.626517-1-npache@redhat.com/
--in-reply-to=20250819141742.626517-1-npache@redhat.com



-- Nico
> ---
>  Documentation/admin-guide/mm/transhuge.rst | 19 +++++++++++++------
>  1 file changed, 13 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index b85547ac4fe9..1f9e6a32052c 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -63,7 +63,7 @@ often.
>  THP can be enabled system wide or restricted to certain tasks or even
>  memory ranges inside task's address space. Unless THP is completely
>  disabled, there is ``khugepaged`` daemon that scans memory and
> -collapses sequences of basic pages into PMD-sized huge pages.
> +collapses sequences of basic pages into huge pages.
>
>  The THP behaviour is controlled via :ref:`sysfs <thp_sysfs>`
>  interface and using madvise(2) and prctl(2) system calls.
> @@ -149,6 +149,18 @@ hugepage sizes have enabled="never". If enabling multiple hugepage
>  sizes, the kernel will select the most appropriate enabled size for a
>  given allocation.
>
> +khugepaged uses max_ptes_none scaled to the order of the enabled mTHP size
> +to determine collapses. When using mTHPs it's recommended to set
> +max_ptes_none low-- ideally less than HPAGE_PMD_NR / 2 (255 on 4k page
> +size). This will prevent undesired "creep" behavior that leads to
> +continuously collapsing to the largest mTHP size; when we collapse, we are
> +bringing in new non-zero pages that will, on a subsequent scan, cause the
> +max_ptes_none check of the +1 order to always be satisfied. By limiting
> +this to less than half the current order, we make sure we don't cause this
> +feedback loop. max_ptes_shared and max_ptes_swap have no effect when
> +collapsing to a mTHP, and mTHP collapse will fail on shared or swapped out
> +pages.
> +
>  It's also possible to limit defrag efforts in the VM to generate
>  anonymous hugepages in case they're not immediately free to madvise
>  regions or to never try to defrag memory and simply fallback to regular
> @@ -264,11 +276,6 @@ support the following arguments::
>  Khugepaged controls
>  -------------------
>
> -.. note::
> -   khugepaged currently only searches for opportunities to collapse to
> -   PMD-sized THP and no attempt is made to collapse to other THP
> -   sizes.
> -
>  khugepaged runs usually at low frequency so while one may not want to
>  invoke defrag algorithms synchronously during the page faults, it
>  should be worth invoking defrag at least in khugepaged. However it's
> --
> 2.50.1
>



* Re: [PATCH v10 12/13] khugepaged: add per-order mTHP khugepaged stats
  2025-08-19 14:16 ` [PATCH v10 12/13] khugepaged: add per-order mTHP khugepaged stats Nico Pache
@ 2025-08-21 14:47   ` Lorenzo Stoakes
  0 siblings, 0 replies; 6+ messages in thread
From: Lorenzo Stoakes @ 2025-08-21 14:47 UTC (permalink / raw)
  To: Nico Pache
  Cc: linux-mm, linux-doc, linux-kernel, linux-trace-kernel, david, ziy,
	baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
	rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
	wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
	thomas.hellstrom, yang, kirill.shutemov, aarcange, raquini,
	anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
	jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
	rdunlap, hughd

On Tue, Aug 19, 2025 at 08:16:10AM -0600, Nico Pache wrote:
> With mTHP support in place, let's add per-order mTHP stats for
> exceeding NONE, SWAP, and SHARED.
>

This is really not enough of a commit message. Exceeding what, where, why,
how? What does 'exceeding' mean here, etc. etc. More words please :)

> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  Documentation/admin-guide/mm/transhuge.rst | 17 +++++++++++++++++
>  include/linux/huge_mm.h                    |  3 +++
>  mm/huge_memory.c                           |  7 +++++++
>  mm/khugepaged.c                            | 16 +++++++++++++---
>  4 files changed, 40 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index 7ccb93e22852..b85547ac4fe9 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -705,6 +705,23 @@ nr_anon_partially_mapped
>         an anonymous THP as "partially mapped" and count it here, even though it
>         is not actually partially mapped anymore.
>
> +collapse_exceed_swap_pte
> +       The number of anonymous THP which contain at least one swap PTE.

The number of anonymous THP what? Pages? Let's be specific.

> +       Currently khugepaged does not support collapsing mTHP regions that
> +       contain a swap PTE.

Wait what? So we have a counter for something that's unsupported? That
seems not so useful?

> +
> +collapse_exceed_none_pte
> +       The number of anonymous THP which have exceeded the none PTE threshold.

THP pages. What's the 'none PTE threshold'? Do you mean
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none ?

Let's spell that out please, this is far too vague.

> +       With mTHP collapse, a bitmap is used to gather the state of a PMD region
> +       and is then recursively checked from largest to smallest order against
> +       the scaled max_ptes_none count. This counter indicates that the next
> +       enabled order will be checked.

I think you really need to expand upon this as this is confusing and vague.

I also don't think saying 'recursive' here really benefits anything. Just
say that we try to collapse the largest mTHP size we can in each
instance, and then give a more 'words-y' explanation as to how
max_ptes_none is (in effect) converted to a ratio of a PMD, and how that
ratio is then applied to the mTHP sizes.

You can then go on to say that this counter measures the number of
occasions in which this occurred.

> +
> +collapse_exceed_shared_pte
> +       The number of anonymous THP which contain at least one shared PTE.

anonymous THP pages right? :)

> +       Currently khugepaged does not support collapsing mTHP regions that
> +       contain a shared PTE.

Again I don't really understand the purpose of creating a counter for
something we don't support.

Let's add it when we support it.

In this case and the exceed-swap case I also don't understand what you
mean by 'exceed' here; you need to spell this out clearly.

Perhaps the context missing here is that you _also_ count THP events in
these counters.

But again, given we have THP_... counters for the stats mTHP doesn't do
yet, I'd say adding these is pointless.

> +
>  As the system ages, allocating huge pages may be expensive as the
>  system uses memory compaction to copy data around memory to free a
>  huge page for use. There are some counters in ``/proc/vmstat`` to help
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 4ada5d1f7297..6f1593d0b4b5 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -144,6 +144,9 @@ enum mthp_stat_item {
>  	MTHP_STAT_SPLIT_DEFERRED,
>  	MTHP_STAT_NR_ANON,
>  	MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
> +	MTHP_STAT_COLLAPSE_EXCEED_SWAP,
> +	MTHP_STAT_COLLAPSE_EXCEED_NONE,
> +	MTHP_STAT_COLLAPSE_EXCEED_SHARED,

Why do we put 'collapse' here but not in the THP equivalents?

>  	__MTHP_STAT_COUNT
>  };
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 20d005c2c61f..9f0470c3e983 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -639,6 +639,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
>  DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
>  DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
>  DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> +
>
>  static struct attribute *anon_stats_attrs[] = {
>  	&anon_fault_alloc_attr.attr,
> @@ -655,6 +659,9 @@ static struct attribute *anon_stats_attrs[] = {
>  	&split_deferred_attr.attr,
>  	&nr_anon_attr.attr,
>  	&nr_anon_partially_mapped_attr.attr,
> +	&collapse_exceed_swap_pte_attr.attr,
> +	&collapse_exceed_none_pte_attr.attr,
> +	&collapse_exceed_shared_pte_attr.attr,
>  	NULL,
>  };
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index c13bc583a368..5a3386043f39 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -594,7 +594,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  				continue;
>  			} else {
>  				result = SCAN_EXCEED_NONE_PTE;
> -				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);

Hm, so wait, you were miscounting statistics in patch 10/13 when you
turned all this on? That's not good.

This should be in place _first_ before enabling the feature.

> +				if (order == HPAGE_PMD_ORDER)
> +					count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> +				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
>  				goto out;
>  			}
>  		}
> @@ -633,10 +635,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  			 * shared may cause a future higher order collapse on a
>  			 * rescan of the same range.
>  			 */
> -			if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
> -			    shared > khugepaged_max_ptes_shared)) {
> +			if (order != HPAGE_PMD_ORDER) {

Hm, wait, what? I don't understand what's going on here. You're no longer
actually doing any check except order != HPAGE_PMD_ORDER? ... am I missing
something?

Again, why we are bothering to maintain a counter that doesn't mean
anything, I don't know. I may be misinterpreting somehow, however.

> +				result = SCAN_EXCEED_SHARED_PTE;
> +				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> +				goto out;
> +			}
> +
> +			if (cc->is_khugepaged &&
> +			    shared > khugepaged_max_ptes_shared) {
>  				result = SCAN_EXCEED_SHARED_PTE;
>  				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> +				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
>  				goto out;
>  			}
>  		}
> @@ -1084,6 +1093,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
>  		 * range.
>  		 */
>  		if (order != HPAGE_PMD_ORDER) {
> +			count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);

Again, this surely does not seem to be testing for what it claims to be
tracking? I may again be missing context here.

>  			pte_unmap(pte);
>  			mmap_read_unlock(mm);
>  			result = SCAN_EXCEED_SWAP_PTE;
> --
> 2.50.1
>


end of thread

Thread overview: 6+ messages
2025-08-19 13:48 [PATCH v10 12/13] khugepaged: add per-order mTHP khugepaged stats Nico Pache
2025-08-19 13:48 ` [PATCH v10 13/13] Documentation: mm: update the admin guide for mTHP collapse Nico Pache
2025-08-19 14:24   ` Nico Pache
2025-08-19 14:23 ` [PATCH v10 12/13] khugepaged: add per-order mTHP khugepaged stats Nico Pache
  -- strict thread matches above, loose matches on Subject: below --
2025-08-19 13:41 [PATCH v10 00/13] khugepaged: mTHP support Nico Pache
2025-08-19 14:16 ` [PATCH v10 12/13] khugepaged: add per-order mTHP khugepaged stats Nico Pache
2025-08-21 14:47   ` Lorenzo Stoakes
