* [PATCH v5 1/4] mm: defer THP insertion to khugepaged
2025-04-28 18:29 [PATCH v5 0/4] mm: introduce THP deferred setting Nico Pache
@ 2025-04-28 18:29 ` Nico Pache
2025-04-29 13:49 ` Zi Yan
2025-04-28 18:29 ` [PATCH v5 2/4] mm: document (m)THP defer usage Nico Pache
` (2 subsequent siblings)
3 siblings, 1 reply; 12+ messages in thread
From: Nico Pache @ 2025-04-28 18:29 UTC (permalink / raw)
To: linux-mm, linux-doc, linux-kernel, linux-kselftest
Cc: akpm, corbet, rostedt, mhiramat, mathieu.desnoyers, david, baohua,
baolin.wang, ryan.roberts, willy, peterx, shuah, ziy,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kirill.shutemov, aarcange, raquini,
dev.jain, anshuman.khandual, catalin.marinas, tiwai, will,
dave.hansen, jack, cl, jglisse, surenb, zokeefe, Liam.Howlett,
lorenzo.stoakes, hannes, rientjes, mhocko, rdunlap
Setting /transparent_hugepage/enabled=always allows applications
to benefit from THPs without having to madvise. However, the page fault
handler weighs very few factors when deciding whether or not to use a
THP. This can lead to a lot of wasted memory. khugepaged only operates
on memory that was either allocated with enabled=always or MADV_HUGEPAGE.
Introduce the ability to set enabled=defer, which will prevent THPs from
being allocated by the page fault handler unless madvise is set,
leaving it up to khugepaged to decide which allocations will collapse to a
THP. This should allow applications to benefit from THPs, while curbing
some of the memory waste.
Co-developed-by: Rafael Aquini <raquini@redhat.com>
Signed-off-by: Rafael Aquini <raquini@redhat.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
include/linux/huge_mm.h | 15 +++++++++++++--
mm/huge_memory.c | 31 +++++++++++++++++++++++++++----
2 files changed, 40 insertions(+), 6 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e3d15c737008..57e6c962afb1 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -48,6 +48,7 @@ enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_UNSUPPORTED,
TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
@@ -186,6 +187,7 @@ static inline bool hugepage_global_enabled(void)
{
return transparent_hugepage_flags &
((1<<TRANSPARENT_HUGEPAGE_FLAG) |
+ (1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG) |
(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG));
}
@@ -195,6 +197,12 @@ static inline bool hugepage_global_always(void)
(1<<TRANSPARENT_HUGEPAGE_FLAG);
}
+static inline bool hugepage_global_defer(void)
+{
+ return transparent_hugepage_flags &
+ (1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG);
+}
+
static inline int highest_order(unsigned long orders)
{
return fls_long(orders) - 1;
@@ -291,13 +299,16 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
unsigned long tva_flags,
unsigned long orders)
{
+ if ((tva_flags & TVA_IN_PF) && hugepage_global_defer() &&
+ !(vm_flags & VM_HUGEPAGE))
+ return 0;
+
/* Optimization to check if required orders are enabled early. */
if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
unsigned long mask = READ_ONCE(huge_anon_orders_always);
-
if (vm_flags & VM_HUGEPAGE)
mask |= READ_ONCE(huge_anon_orders_madvise);
- if (hugepage_global_always() ||
+ if (hugepage_global_always() || hugepage_global_defer() ||
((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled()))
mask |= READ_ONCE(huge_anon_orders_inherit);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8af5caa0d9bc..17b66adef029 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -297,12 +297,15 @@ static ssize_t enabled_show(struct kobject *kobj,
const char *output;
if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
- output = "[always] madvise never";
+ output = "[always] madvise defer never";
else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
&transparent_hugepage_flags))
- output = "always [madvise] never";
+ output = "always [madvise] defer never";
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
+ &transparent_hugepage_flags))
+ output = "always madvise [defer] never";
else
- output = "always madvise [never]";
+ output = "always madvise defer [never]";
return sysfs_emit(buf, "%s\n", output);
}
@@ -315,13 +318,20 @@ static ssize_t enabled_store(struct kobject *kobj,
if (sysfs_streq(buf, "always")) {
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ } else if (sysfs_streq(buf, "defer")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG, &transparent_hugepage_flags);
} else if (sysfs_streq(buf, "madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
} else if (sysfs_streq(buf, "never")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG, &transparent_hugepage_flags);
} else
ret = -EINVAL;
@@ -954,18 +964,31 @@ static int __init setup_transparent_hugepage(char *str)
&transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
&transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
+ &transparent_hugepage_flags);
ret = 1;
+ } else if (!strcmp(str, "defer")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
+ &transparent_hugepage_flags);
} else if (!strcmp(str, "madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
&transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
+ &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
- &transparent_hugepage_flags);
+ &transparent_hugepage_flags);
ret = 1;
} else if (!strcmp(str, "never")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
&transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
&transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
+ &transparent_hugepage_flags);
ret = 1;
}
out:
--
2.48.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v5 1/4] mm: defer THP insertion to khugepaged
2025-04-28 18:29 ` [PATCH v5 1/4] mm: defer THP insertion to khugepaged Nico Pache
@ 2025-04-29 13:49 ` Zi Yan
2025-04-30 18:39 ` Nico Pache
0 siblings, 1 reply; 12+ messages in thread
From: Zi Yan @ 2025-04-29 13:49 UTC (permalink / raw)
To: Nico Pache
Cc: linux-mm, linux-doc, linux-kernel, linux-kselftest, akpm, corbet,
rostedt, mhiramat, mathieu.desnoyers, david, baohua, baolin.wang,
ryan.roberts, willy, peterx, shuah, wangkefeng.wang, usamaarif642,
sunnanyong, vishal.moola, thomas.hellstrom, yang, kirill.shutemov,
aarcange, raquini, dev.jain, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
Liam.Howlett, lorenzo.stoakes, hannes, rientjes, mhocko, rdunlap
On 28 Apr 2025, at 14:29, Nico Pache wrote:
> setting /transparent_hugepages/enabled=always allows applications
> to benefit from THPs without having to madvise. However, the pf handler
s/pf/page fault
> takes very few considerations to decide weather or not to actually use a
s/weather/whether
> THP. This can lead to a lot of wasted memory. khugepaged only operates
> on memory that was either allocated with enabled=always or MADV_HUGEPAGE.
>
> Introduce the ability to set enabled=defer, which will prevent THPs from
> being allocated by the page fault handler unless madvise is set,
> leaving it up to khugepaged to decide which allocations will collapse to a
> THP. This should allow applications to benefits from THPs, while curbing
> some of the memory waste.
>
> Co-developed-by: Rafael Aquini <raquini@redhat.com>
> Signed-off-by: Rafael Aquini <raquini@redhat.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> include/linux/huge_mm.h | 15 +++++++++++++--
> mm/huge_memory.c | 31 +++++++++++++++++++++++++++----
> 2 files changed, 40 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index e3d15c737008..57e6c962afb1 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -48,6 +48,7 @@ enum transparent_hugepage_flag {
> TRANSPARENT_HUGEPAGE_UNSUPPORTED,
> TRANSPARENT_HUGEPAGE_FLAG,
> TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
> + TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
What does INST mean here? Can you add one sentence on this new flag
in the commit log to explain what it is short for?
> TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
> TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
> TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
> @@ -186,6 +187,7 @@ static inline bool hugepage_global_enabled(void)
> {
> return transparent_hugepage_flags &
> ((1<<TRANSPARENT_HUGEPAGE_FLAG) |
> + (1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG) |
> (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG));
> }
>
> @@ -195,6 +197,12 @@ static inline bool hugepage_global_always(void)
> (1<<TRANSPARENT_HUGEPAGE_FLAG);
> }
>
> +static inline bool hugepage_global_defer(void)
> +{
> + return transparent_hugepage_flags &
> + (1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG);
> +}
> +
> static inline int highest_order(unsigned long orders)
> {
> return fls_long(orders) - 1;
> @@ -291,13 +299,16 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> unsigned long tva_flags,
> unsigned long orders)
> {
> + if ((tva_flags & TVA_IN_PF) && hugepage_global_defer() &&
> + !(vm_flags & VM_HUGEPAGE))
> + return 0;
> +
> /* Optimization to check if required orders are enabled early. */
> if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
> unsigned long mask = READ_ONCE(huge_anon_orders_always);
> -
This newline should stay, right?
The rest looks good to me. Thanks. Acked-by: Zi Yan <ziy@nvidia.com>
Best Regards,
Yan, Zi
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v5 1/4] mm: defer THP insertion to khugepaged
2025-04-29 13:49 ` Zi Yan
@ 2025-04-30 18:39 ` Nico Pache
0 siblings, 0 replies; 12+ messages in thread
From: Nico Pache @ 2025-04-30 18:39 UTC (permalink / raw)
To: Zi Yan
Cc: linux-mm, linux-doc, linux-kernel, linux-kselftest, akpm, corbet,
rostedt, mhiramat, mathieu.desnoyers, david, baohua, baolin.wang,
ryan.roberts, willy, peterx, shuah, wangkefeng.wang, usamaarif642,
sunnanyong, vishal.moola, thomas.hellstrom, yang, kirill.shutemov,
aarcange, raquini, dev.jain, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
Liam.Howlett, lorenzo.stoakes, hannes, rientjes, mhocko, rdunlap
On Tue, Apr 29, 2025 at 7:49 AM Zi Yan <ziy@nvidia.com> wrote:
>
> On 28 Apr 2025, at 14:29, Nico Pache wrote:
>
> > setting /transparent_hugepages/enabled=always allows applications
> > to benefit from THPs without having to madvise. However, the pf handler
>
> s/pf/page fault
>
> > takes very few considerations to decide weather or not to actually use a
>
> s/weather/whether
>
> > THP. This can lead to a lot of wasted memory. khugepaged only operates
> > on memory that was either allocated with enabled=always or MADV_HUGEPAGE.
> >
> > Introduce the ability to set enabled=defer, which will prevent THPs from
> > being allocated by the page fault handler unless madvise is set,
> > leaving it up to khugepaged to decide which allocations will collapse to a
> > THP. This should allow applications to benefits from THPs, while curbing
> > some of the memory waste.
> >
> > Co-developed-by: Rafael Aquini <raquini@redhat.com>
> > Signed-off-by: Rafael Aquini <raquini@redhat.com>
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> > include/linux/huge_mm.h | 15 +++++++++++++--
> > mm/huge_memory.c | 31 +++++++++++++++++++++++++++----
> > 2 files changed, 40 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index e3d15c737008..57e6c962afb1 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -48,6 +48,7 @@ enum transparent_hugepage_flag {
> > TRANSPARENT_HUGEPAGE_UNSUPPORTED,
> > TRANSPARENT_HUGEPAGE_FLAG,
> > TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
> > + TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
>
> What does INST mean here? Can you add one sentence on this new flag
> in the commit log to explain what it is short for?
"INSERT". Someone else commented on the length of this FLAG name. I
forgot to update it.
I can shorten it to something like ..DEFER_FLAG or DEFER_PF_FLAG
>
>
> > TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
> > TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
> > TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
> > @@ -186,6 +187,7 @@ static inline bool hugepage_global_enabled(void)
> > {
> > return transparent_hugepage_flags &
> > ((1<<TRANSPARENT_HUGEPAGE_FLAG) |
> > + (1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG) |
> > (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG));
> > }
> >
> > @@ -195,6 +197,12 @@ static inline bool hugepage_global_always(void)
> > (1<<TRANSPARENT_HUGEPAGE_FLAG);
> > }
> >
> > +static inline bool hugepage_global_defer(void)
> > +{
> > + return transparent_hugepage_flags &
> > + (1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG);
> > +}
> > +
> > static inline int highest_order(unsigned long orders)
> > {
> > return fls_long(orders) - 1;
> > @@ -291,13 +299,16 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> > unsigned long tva_flags,
> > unsigned long orders)
> > {
> > + if ((tva_flags & TVA_IN_PF) && hugepage_global_defer() &&
> > + !(vm_flags & VM_HUGEPAGE))
> > + return 0;
> > +
> > /* Optimization to check if required orders are enabled early. */
> > if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
> > unsigned long mask = READ_ONCE(huge_anon_orders_always);
> > -
>
> This newline should stay, right?
Yes, I can fix that.
>
> The rest looks good to me. Thanks. Acked-by: Zi Yan <ziy@nvidia.com>
Thank you!
-- Nico
>
> Best Regards,
> Yan, Zi
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v5 2/4] mm: document (m)THP defer usage
2025-04-28 18:29 [PATCH v5 0/4] mm: introduce THP deferred setting Nico Pache
2025-04-28 18:29 ` [PATCH v5 1/4] mm: defer THP insertion to khugepaged Nico Pache
@ 2025-04-28 18:29 ` Nico Pache
2025-04-30 20:15 ` Zi Yan
2025-04-28 18:29 ` [PATCH v5 3/4] khugepaged: add defer option to mTHP options Nico Pache
2025-04-28 18:29 ` [PATCH v5 4/4] selftests: mm: add defer to thp setting parser Nico Pache
3 siblings, 1 reply; 12+ messages in thread
From: Nico Pache @ 2025-04-28 18:29 UTC (permalink / raw)
To: linux-mm, linux-doc, linux-kernel, linux-kselftest
Cc: akpm, corbet, rostedt, mhiramat, mathieu.desnoyers, david, baohua,
baolin.wang, ryan.roberts, willy, peterx, shuah, ziy,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kirill.shutemov, aarcange, raquini,
dev.jain, anshuman.khandual, catalin.marinas, tiwai, will,
dave.hansen, jack, cl, jglisse, surenb, zokeefe, Liam.Howlett,
lorenzo.stoakes, hannes, rientjes, mhocko, rdunlap, Bagas Sanjaya
The new defer option for (m)THPs allows for a more conservative
approach to (m)THPs. Document its usage in the transhuge admin-guide.
Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
Documentation/admin-guide/mm/transhuge.rst | 31 ++++++++++++++++------
1 file changed, 23 insertions(+), 8 deletions(-)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 5c63fe51b3ad..c50253357793 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -88,8 +88,9 @@ In certain cases when hugepages are enabled system wide, application
may end up allocating more memory resources. An application may mmap a
large region but only touch 1 byte of it, in that case a 2M page might
be allocated instead of a 4k page for no good. This is why it's
-possible to disable hugepages system-wide and to only have them inside
-MADV_HUGEPAGE madvise regions.
+possible to disable hugepages system-wide, only have them inside
+MADV_HUGEPAGE madvise regions, or defer them away from the page fault
+handler to khugepaged.
Embedded systems should enable hugepages only inside madvise regions
to eliminate any risk of wasting any precious byte of memory and to
@@ -99,6 +100,15 @@ Applications that gets a lot of benefit from hugepages and that don't
risk to lose memory by using hugepages, should use
madvise(MADV_HUGEPAGE) on their critical mmapped regions.
+Applications that would like to benefit from THPs but would still like a
+more memory conservative approach can choose 'defer'. This avoids
+inserting THPs at the page fault handler unless they are MADV_HUGEPAGE.
+Khugepaged will then scan the mappings for potential collapses into (m)THP
+pages. Admins using the 'defer' setting should consider
+tweaking khugepaged/max_ptes_none. The current default of 511 may
+aggressively collapse your PTEs into PMDs. Lower this value to conserve
+more memory (e.g., max_ptes_none=64).
+
.. _thp_sysfs:
sysfs
@@ -109,11 +119,14 @@ Global THP controls
Transparent Hugepage Support for anonymous memory can be entirely disabled
(mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE
-regions (to avoid the risk of consuming more memory resources) or enabled
-system wide. This can be achieved per-supported-THP-size with one of::
+regions (to avoid the risk of consuming more memory resources), deferred to
+khugepaged, or enabled system wide.
+
+This can be achieved per-supported-THP-size with one of::
echo always >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled
echo madvise >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled
+ echo defer >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled
echo never >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled
where <size> is the hugepage size being addressed, the available sizes
@@ -136,6 +149,7 @@ The top-level setting (for use with "inherit") can be set by issuing
one of the following commands::
echo always >/sys/kernel/mm/transparent_hugepage/enabled
+ echo defer >/sys/kernel/mm/transparent_hugepage/enabled
echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
echo never >/sys/kernel/mm/transparent_hugepage/enabled
@@ -286,7 +300,8 @@ of small pages into one large page::
A higher value leads to use additional memory for programs.
A lower value leads to gain less thp performance. Value of
max_ptes_none can waste cpu time very little, you can
-ignore it.
+ignore it. Consider lowering this value when using
+``transparent_hugepage=defer``.
``max_ptes_swap`` specifies how many pages can be brought in from
swap when collapsing a group of pages into a transparent huge page::
@@ -311,14 +326,14 @@ Boot parameters
You can change the sysfs boot time default for the top-level "enabled"
control by passing the parameter ``transparent_hugepage=always`` or
-``transparent_hugepage=madvise`` or ``transparent_hugepage=never`` to the
-kernel command line.
+``transparent_hugepage=madvise`` or ``transparent_hugepage=defer`` or
+``transparent_hugepage=never`` to the kernel command line.
Alternatively, each supported anonymous THP size can be controlled by
passing ``thp_anon=<size>[KMG],<size>[KMG]:<state>;<size>[KMG]-<size>[KMG]:<state>``,
where ``<size>`` is the THP size (must be a power of 2 of PAGE_SIZE and
supported anonymous THP) and ``<state>`` is one of ``always``, ``madvise``,
-``never`` or ``inherit``.
+``defer``, ``never`` or ``inherit``.
For example, the following will set 16K, 32K, 64K THP to ``always``,
set 128K, 512K to ``inherit``, set 256K to ``madvise`` and 1M, 2M
--
2.48.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v5 2/4] mm: document (m)THP defer usage
2025-04-28 18:29 ` [PATCH v5 2/4] mm: document (m)THP defer usage Nico Pache
@ 2025-04-30 20:15 ` Zi Yan
2025-05-01 22:38 ` Nico Pache
0 siblings, 1 reply; 12+ messages in thread
From: Zi Yan @ 2025-04-30 20:15 UTC (permalink / raw)
To: Nico Pache
Cc: linux-mm, linux-doc, linux-kernel, linux-kselftest, akpm, corbet,
rostedt, mhiramat, mathieu.desnoyers, david, baohua, baolin.wang,
ryan.roberts, willy, peterx, shuah, wangkefeng.wang, usamaarif642,
sunnanyong, vishal.moola, thomas.hellstrom, yang, kirill.shutemov,
aarcange, raquini, dev.jain, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
Liam.Howlett, lorenzo.stoakes, hannes, rientjes, mhocko, rdunlap,
Bagas Sanjaya
On 28 Apr 2025, at 14:29, Nico Pache wrote:
> The new defer option for (m)THPs allows for a more conservative
> approach to (m)THPs. Document its usage in the transhuge admin-guide.
>
> Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> Documentation/admin-guide/mm/transhuge.rst | 31 ++++++++++++++++------
> 1 file changed, 23 insertions(+), 8 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index 5c63fe51b3ad..c50253357793 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -88,8 +88,9 @@ In certain cases when hugepages are enabled system wide, application
> may end up allocating more memory resources. An application may mmap a
> large region but only touch 1 byte of it, in that case a 2M page might
> be allocated instead of a 4k page for no good. This is why it's
> -possible to disable hugepages system-wide and to only have them inside
> -MADV_HUGEPAGE madvise regions.
> +possible to disable hugepages system-wide, only have them inside
> +MADV_HUGEPAGE madvise regions, or defer them away from the page fault
> +handler to khugepaged.
>
> Embedded systems should enable hugepages only inside madvise regions
> to eliminate any risk of wasting any precious byte of memory and to
> @@ -99,6 +100,15 @@ Applications that gets a lot of benefit from hugepages and that don't
> risk to lose memory by using hugepages, should use
> madvise(MADV_HUGEPAGE) on their critical mmapped regions.
>
> +Applications that would like to benefit from THPs but would still like a
> +more memory conservative approach can choose 'defer'. This avoids
> +inserting THPs at the page fault handler unless they are MADV_HUGEPAGE.
> +Khugepaged will then scan the mappings for potential collapses into (m)THP
How about the text below? It explicitly states khugepaged behavior.
Khugepaged will then scan all mappings, even those not explicitly marked
with MADV_HUGEPAGE, for potential collapses into (m)THPs.
> +pages. Admins using this the 'defer' setting should consider
> +tweaking khugepaged/max_ptes_none. The current default of 511 may
> +aggressively collapse your PTEs into PMDs. Lower this value to conserve
> +more memory (i.e., max_ptes_none=64).
> +
> .. _thp_sysfs:
>
> sysfs
> @@ -109,11 +119,14 @@ Global THP controls
>
> Transparent Hugepage Support for anonymous memory can be entirely disabled
> (mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE
> -regions (to avoid the risk of consuming more memory resources) or enabled
> -system wide. This can be achieved per-supported-THP-size with one of::
> +regions (to avoid the risk of consuming more memory resources), deferred to
> +khugepaged, or enabled system wide.
> +
> +This can be achieved per-supported-THP-size with one of::
>
> echo always >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled
> echo madvise >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled
> + echo defer >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled
> echo never >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled
>
> where <size> is the hugepage size being addressed, the available sizes
> @@ -136,6 +149,7 @@ The top-level setting (for use with "inherit") can be set by issuing
> one of the following commands::
>
> echo always >/sys/kernel/mm/transparent_hugepage/enabled
> + echo defer >/sys/kernel/mm/transparent_hugepage/enabled
> echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
> echo never >/sys/kernel/mm/transparent_hugepage/enabled
>
> @@ -286,7 +300,8 @@ of small pages into one large page::
> A higher value leads to use additional memory for programs.
> A lower value leads to gain less thp performance. Value of
> max_ptes_none can waste cpu time very little, you can
> -ignore it.
> +ignore it. Consider lowering this value when using
> +``transparent_hugepage=defer``
>
> ``max_ptes_swap`` specifies how many pages can be brought in from
> swap when collapsing a group of pages into a transparent huge page::
> @@ -311,14 +326,14 @@ Boot parameters
>
> You can change the sysfs boot time default for the top-level "enabled"
> control by passing the parameter ``transparent_hugepage=always`` or
> -``transparent_hugepage=madvise`` or ``transparent_hugepage=never`` to the
> -kernel command line.
> +``transparent_hugepage=madvise`` or ``transparent_hugepage=defer`` or
> +``transparent_hugepage=never`` to the kernel command line.
>
> Alternatively, each supported anonymous THP size can be controlled by
> passing ``thp_anon=<size>[KMG],<size>[KMG]:<state>;<size>[KMG]-<size>[KMG]:<state>``,
> where ``<size>`` is the THP size (must be a power of 2 of PAGE_SIZE and
> supported anonymous THP) and ``<state>`` is one of ``always``, ``madvise``,
> -``never`` or ``inherit``.
> +``defer``, ``never`` or ``inherit``.
>
> For example, the following will set 16K, 32K, 64K THP to ``always``,
> set 128K, 512K to ``inherit``, set 256K to ``madvise`` and 1M, 2M
Otherwise, LGTM. Thanks. Reviewed-by: Zi Yan <ziy@nvidia.com>
--
Best Regards,
Yan, Zi
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v5 2/4] mm: document (m)THP defer usage
2025-04-30 20:15 ` Zi Yan
@ 2025-05-01 22:38 ` Nico Pache
0 siblings, 0 replies; 12+ messages in thread
From: Nico Pache @ 2025-05-01 22:38 UTC (permalink / raw)
To: Zi Yan
Cc: linux-mm, linux-doc, linux-kernel, linux-kselftest, akpm, corbet,
rostedt, mhiramat, mathieu.desnoyers, david, baohua, baolin.wang,
ryan.roberts, willy, peterx, shuah, wangkefeng.wang, usamaarif642,
sunnanyong, vishal.moola, thomas.hellstrom, yang, kirill.shutemov,
aarcange, raquini, dev.jain, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
Liam.Howlett, lorenzo.stoakes, hannes, rientjes, mhocko, rdunlap,
Bagas Sanjaya
On Wed, Apr 30, 2025 at 2:15 PM Zi Yan <ziy@nvidia.com> wrote:
>
> On 28 Apr 2025, at 14:29, Nico Pache wrote:
>
> > The new defer option for (m)THPs allows for a more conservative
> > approach to (m)THPs. Document its usage in the transhuge admin-guide.
> >
> > Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> > Documentation/admin-guide/mm/transhuge.rst | 31 ++++++++++++++++------
> > 1 file changed, 23 insertions(+), 8 deletions(-)
> >
> > diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> > index 5c63fe51b3ad..c50253357793 100644
> > --- a/Documentation/admin-guide/mm/transhuge.rst
> > +++ b/Documentation/admin-guide/mm/transhuge.rst
> > @@ -88,8 +88,9 @@ In certain cases when hugepages are enabled system wide, application
> > may end up allocating more memory resources. An application may mmap a
> > large region but only touch 1 byte of it, in that case a 2M page might
> > be allocated instead of a 4k page for no good. This is why it's
> > -possible to disable hugepages system-wide and to only have them inside
> > -MADV_HUGEPAGE madvise regions.
> > +possible to disable hugepages system-wide, only have them inside
> > +MADV_HUGEPAGE madvise regions, or defer them away from the page fault
> > +handler to khugepaged.
> >
> > Embedded systems should enable hugepages only inside madvise regions
> > to eliminate any risk of wasting any precious byte of memory and to
> > @@ -99,6 +100,15 @@ Applications that gets a lot of benefit from hugepages and that don't
> > risk to lose memory by using hugepages, should use
> > madvise(MADV_HUGEPAGE) on their critical mmapped regions.
> >
> > +Applications that would like to benefit from THPs but would still like a
> > +more memory conservative approach can choose 'defer'. This avoids
> > +inserting THPs at the page fault handler unless they are MADV_HUGEPAGE.
> > +Khugepaged will then scan the mappings for potential collapses into (m)THP
>
> How about the text below? It explicitly states khugepaged behavior.
>
> Khugepaged will then scan all mappings, even those not explicitly marked
> with MADV_HUGEPAGE, for potential collapses into (m)THPs.
I agree, this reads better. I can modify it on the V6 :)
>
> > +pages. Admins using this the 'defer' setting should consider
> > +tweaking khugepaged/max_ptes_none. The current default of 511 may
> > +aggressively collapse your PTEs into PMDs. Lower this value to conserve
> > +more memory (i.e., max_ptes_none=64).
> > +
> > .. _thp_sysfs:
> >
> > sysfs
> > @@ -109,11 +119,14 @@ Global THP controls
> >
> > Transparent Hugepage Support for anonymous memory can be entirely disabled
> > (mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE
> > -regions (to avoid the risk of consuming more memory resources) or enabled
> > -system wide. This can be achieved per-supported-THP-size with one of::
> > +regions (to avoid the risk of consuming more memory resources), deferred to
> > +khugepaged, or enabled system wide.
> > +
> > +This can be achieved per-supported-THP-size with one of::
> >
> > echo always >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled
> > echo madvise >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled
> > + echo defer >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled
> > echo never >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled
> >
> > where <size> is the hugepage size being addressed, the available sizes
> > @@ -136,6 +149,7 @@ The top-level setting (for use with "inherit") can be set by issuing
> > one of the following commands::
> >
> > echo always >/sys/kernel/mm/transparent_hugepage/enabled
> > + echo defer >/sys/kernel/mm/transparent_hugepage/enabled
> > echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
> > echo never >/sys/kernel/mm/transparent_hugepage/enabled
> >
> > @@ -286,7 +300,8 @@ of small pages into one large page::
> > A higher value leads to use additional memory for programs.
> > A lower value leads to gain less thp performance. Value of
> > max_ptes_none can waste cpu time very little, you can
> > -ignore it.
> > +ignore it. Consider lowering this value when using
> > +``transparent_hugepage=defer``
> >
> > ``max_ptes_swap`` specifies how many pages can be brought in from
> > swap when collapsing a group of pages into a transparent huge page::
> > @@ -311,14 +326,14 @@ Boot parameters
> >
> > You can change the sysfs boot time default for the top-level "enabled"
> > control by passing the parameter ``transparent_hugepage=always`` or
> > -``transparent_hugepage=madvise`` or ``transparent_hugepage=never`` to the
> > -kernel command line.
> > +``transparent_hugepage=madvise`` or ``transparent_hugepage=defer`` or
> > +``transparent_hugepage=never`` to the kernel command line.
> >
> > Alternatively, each supported anonymous THP size can be controlled by
> > passing ``thp_anon=<size>[KMG],<size>[KMG]:<state>;<size>[KMG]-<size>[KMG]:<state>``,
> > where ``<size>`` is the THP size (must be a power of 2 of PAGE_SIZE and
> > supported anonymous THP) and ``<state>`` is one of ``always``, ``madvise``,
> > -``never`` or ``inherit``.
> > +``defer``, ``never`` or ``inherit``.
> >
> > For example, the following will set 16K, 32K, 64K THP to ``always``,
> > set 128K, 512K to ``inherit``, set 256K to ``madvise`` and 1M, 2M
>
> Otherwise, LGTM. Thanks. Reviewed-by: Zi Yan <ziy@nvidia.com>
>
> --
> Best Regards,
> Yan, Zi
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v5 3/4] khugepaged: add defer option to mTHP options
2025-04-28 18:29 [PATCH v5 0/4] mm: introduce THP deferred setting Nico Pache
2025-04-28 18:29 ` [PATCH v5 1/4] mm: defer THP insertion to khugepaged Nico Pache
2025-04-28 18:29 ` [PATCH v5 2/4] mm: document (m)THP defer usage Nico Pache
@ 2025-04-28 18:29 ` Nico Pache
2025-04-30 20:34 ` Zi Yan
2025-04-28 18:29 ` [PATCH v5 4/4] selftests: mm: add defer to thp setting parser Nico Pache
3 siblings, 1 reply; 12+ messages in thread
From: Nico Pache @ 2025-04-28 18:29 UTC (permalink / raw)
To: linux-mm, linux-doc, linux-kernel, linux-kselftest
Cc: akpm, corbet, rostedt, mhiramat, mathieu.desnoyers, david, baohua,
baolin.wang, ryan.roberts, willy, peterx, shuah, ziy,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kirill.shutemov, aarcange, raquini,
dev.jain, anshuman.khandual, catalin.marinas, tiwai, will,
dave.hansen, jack, cl, jglisse, surenb, zokeefe, Liam.Howlett,
lorenzo.stoakes, hannes, rientjes, mhocko, rdunlap
Now that we have defer to globally disable THPs at fault time, let's add
a defer setting to the mTHP options. This will allow khugepaged to
operate at that order, while avoiding it at PF time.
Signed-off-by: Nico Pache <npache@redhat.com>
---
include/linux/huge_mm.h | 5 +++++
mm/huge_memory.c | 38 +++++++++++++++++++++++++++++++++-----
mm/khugepaged.c | 8 ++++----
3 files changed, 42 insertions(+), 9 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 57e6c962afb1..a877c59bea67 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -96,6 +96,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
#define TVA_SMAPS (1 << 0) /* Will be used for procfs */
#define TVA_IN_PF (1 << 1) /* Page fault handler */
#define TVA_ENFORCE_SYSFS (1 << 2) /* Obey sysfs configuration */
+#define TVA_IN_KHUGEPAGE ((1 << 2) | (1 << 3)) /* Khugepaged defer support */
#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
(!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))
@@ -182,6 +183,7 @@ extern unsigned long transparent_hugepage_flags;
extern unsigned long huge_anon_orders_always;
extern unsigned long huge_anon_orders_madvise;
extern unsigned long huge_anon_orders_inherit;
+extern unsigned long huge_anon_orders_defer;
static inline bool hugepage_global_enabled(void)
{
@@ -306,6 +308,9 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
/* Optimization to check if required orders are enabled early. */
if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
unsigned long mask = READ_ONCE(huge_anon_orders_always);
+
+ if ((tva_flags & TVA_IN_KHUGEPAGE) == TVA_IN_KHUGEPAGE)
+ mask |= READ_ONCE(huge_anon_orders_defer);
if (vm_flags & VM_HUGEPAGE)
mask |= READ_ONCE(huge_anon_orders_madvise);
if (hugepage_global_always() || hugepage_global_defer() ||
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 17b66adef029..705467ea9265 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -81,6 +81,7 @@ unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
+unsigned long huge_anon_orders_defer __read_mostly;
static bool anon_orders_configured __initdata;
static inline bool file_thp_enabled(struct vm_area_struct *vma)
@@ -505,13 +506,15 @@ static ssize_t anon_enabled_show(struct kobject *kobj,
const char *output;
if (test_bit(order, &huge_anon_orders_always))
- output = "[always] inherit madvise never";
+ output = "[always] inherit madvise defer never";
else if (test_bit(order, &huge_anon_orders_inherit))
- output = "always [inherit] madvise never";
+ output = "always [inherit] madvise defer never";
else if (test_bit(order, &huge_anon_orders_madvise))
- output = "always inherit [madvise] never";
+ output = "always inherit [madvise] defer never";
+ else if (test_bit(order, &huge_anon_orders_defer))
+ output = "always inherit madvise [defer] never";
else
- output = "always inherit madvise [never]";
+ output = "always inherit madvise defer [never]";
return sysfs_emit(buf, "%s\n", output);
}
@@ -527,25 +530,36 @@ static ssize_t anon_enabled_store(struct kobject *kobj,
spin_lock(&huge_anon_orders_lock);
clear_bit(order, &huge_anon_orders_inherit);
clear_bit(order, &huge_anon_orders_madvise);
+ clear_bit(order, &huge_anon_orders_defer);
set_bit(order, &huge_anon_orders_always);
spin_unlock(&huge_anon_orders_lock);
} else if (sysfs_streq(buf, "inherit")) {
spin_lock(&huge_anon_orders_lock);
clear_bit(order, &huge_anon_orders_always);
clear_bit(order, &huge_anon_orders_madvise);
+ clear_bit(order, &huge_anon_orders_defer);
set_bit(order, &huge_anon_orders_inherit);
spin_unlock(&huge_anon_orders_lock);
} else if (sysfs_streq(buf, "madvise")) {
spin_lock(&huge_anon_orders_lock);
clear_bit(order, &huge_anon_orders_always);
clear_bit(order, &huge_anon_orders_inherit);
+ clear_bit(order, &huge_anon_orders_defer);
set_bit(order, &huge_anon_orders_madvise);
spin_unlock(&huge_anon_orders_lock);
+ } else if (sysfs_streq(buf, "defer")) {
+ spin_lock(&huge_anon_orders_lock);
+ clear_bit(order, &huge_anon_orders_always);
+ clear_bit(order, &huge_anon_orders_inherit);
+ clear_bit(order, &huge_anon_orders_madvise);
+ set_bit(order, &huge_anon_orders_defer);
+ spin_unlock(&huge_anon_orders_lock);
} else if (sysfs_streq(buf, "never")) {
spin_lock(&huge_anon_orders_lock);
clear_bit(order, &huge_anon_orders_always);
clear_bit(order, &huge_anon_orders_inherit);
clear_bit(order, &huge_anon_orders_madvise);
+ clear_bit(order, &huge_anon_orders_defer);
spin_unlock(&huge_anon_orders_lock);
} else
ret = -EINVAL;
@@ -1002,7 +1016,7 @@ static char str_dup[PAGE_SIZE] __initdata;
static int __init setup_thp_anon(char *str)
{
char *token, *range, *policy, *subtoken;
- unsigned long always, inherit, madvise;
+ unsigned long always, inherit, madvise, defer;
char *start_size, *end_size;
int start, end, nr;
char *p;
@@ -1014,6 +1028,8 @@ static int __init setup_thp_anon(char *str)
always = huge_anon_orders_always;
madvise = huge_anon_orders_madvise;
inherit = huge_anon_orders_inherit;
+ defer = huge_anon_orders_defer;
+
p = str_dup;
while ((token = strsep(&p, ";")) != NULL) {
range = strsep(&token, ":");
@@ -1053,18 +1069,28 @@ static int __init setup_thp_anon(char *str)
bitmap_set(&always, start, nr);
bitmap_clear(&inherit, start, nr);
bitmap_clear(&madvise, start, nr);
+ bitmap_clear(&defer, start, nr);
} else if (!strcmp(policy, "madvise")) {
bitmap_set(&madvise, start, nr);
bitmap_clear(&inherit, start, nr);
bitmap_clear(&always, start, nr);
+ bitmap_clear(&defer, start, nr);
} else if (!strcmp(policy, "inherit")) {
bitmap_set(&inherit, start, nr);
bitmap_clear(&madvise, start, nr);
bitmap_clear(&always, start, nr);
+ bitmap_clear(&defer, start, nr);
+ } else if (!strcmp(policy, "defer")) {
+ bitmap_set(&defer, start, nr);
+ bitmap_clear(&madvise, start, nr);
+ bitmap_clear(&always, start, nr);
+ bitmap_clear(&inherit, start, nr);
} else if (!strcmp(policy, "never")) {
bitmap_clear(&inherit, start, nr);
bitmap_clear(&madvise, start, nr);
bitmap_clear(&always, start, nr);
+ bitmap_clear(&defer, start, nr);
+
} else {
pr_err("invalid policy %s in thp_anon boot parameter\n", policy);
goto err;
@@ -1075,6 +1101,8 @@ static int __init setup_thp_anon(char *str)
huge_anon_orders_always = always;
huge_anon_orders_madvise = madvise;
huge_anon_orders_inherit = inherit;
+ huge_anon_orders_defer = defer;
+
anon_orders_configured = true;
return 1;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 812181354c46..738870331aed 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -491,7 +491,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
{
if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
hugepage_pmd_enabled()) {
- if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
+ if (thp_vma_allowable_order(vma, vm_flags, TVA_IN_KHUGEPAGE,
PMD_ORDER))
__khugepaged_enter(vma->vm_mm);
}
@@ -955,7 +955,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
struct collapse_control *cc, int order)
{
struct vm_area_struct *vma;
- unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0;
+ unsigned long tva_flags = cc->is_khugepaged ? TVA_IN_KHUGEPAGE : 0;
if (unlikely(khugepaged_test_exit_or_disable(mm)))
return SCAN_ANY_PROCESS;
@@ -1429,7 +1429,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
bool writable = false;
int chunk_none_count = 0;
int scaled_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER);
- unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0;
+ unsigned long tva_flags = cc->is_khugepaged ? TVA_IN_KHUGEPAGE : 0;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
result = find_pmd_or_thp_or_none(mm, address, &pmd);
@@ -2632,7 +2632,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
break;
}
if (!thp_vma_allowable_order(vma, vma->vm_flags,
- TVA_ENFORCE_SYSFS, PMD_ORDER)) {
+ TVA_IN_KHUGEPAGE, PMD_ORDER)) {
skip:
progress++;
continue;
--
2.48.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v5 3/4] khugepaged: add defer option to mTHP options
2025-04-28 18:29 ` [PATCH v5 3/4] khugepaged: add defer option to mTHP options Nico Pache
@ 2025-04-30 20:34 ` Zi Yan
2025-05-01 22:53 ` Nico Pache
0 siblings, 1 reply; 12+ messages in thread
From: Zi Yan @ 2025-04-30 20:34 UTC (permalink / raw)
To: Nico Pache
Cc: linux-mm, linux-doc, linux-kernel, linux-kselftest, akpm, corbet,
rostedt, mhiramat, mathieu.desnoyers, david, baohua, baolin.wang,
ryan.roberts, willy, peterx, shuah, wangkefeng.wang, usamaarif642,
sunnanyong, vishal.moola, thomas.hellstrom, yang, kirill.shutemov,
aarcange, raquini, dev.jain, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
Liam.Howlett, lorenzo.stoakes, hannes, rientjes, mhocko, rdunlap
On 28 Apr 2025, at 14:29, Nico Pache wrote:
> Now that we have defer to globally disable THPs at fault time, lets add
> a defer setting to the mTHP options. This will allow khugepaged to
> operate at that order, while avoiding it at PF time.
>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> include/linux/huge_mm.h | 5 +++++
> mm/huge_memory.c | 38 +++++++++++++++++++++++++++++++++-----
> mm/khugepaged.c | 8 ++++----
> 3 files changed, 42 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 57e6c962afb1..a877c59bea67 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -96,6 +96,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
> #define TVA_SMAPS (1 << 0) /* Will be used for procfs */
> #define TVA_IN_PF (1 << 1) /* Page fault handler */
> #define TVA_ENFORCE_SYSFS (1 << 2) /* Obey sysfs configuration */
> +#define TVA_IN_KHUGEPAGE ((1 << 2) | (1 << 3)) /* Khugepaged defer support */
Why is TVA_IN_KHUGEPAGE a superset of TVA_ENFORCE_SYSFS? Because khugepaged
also obeys sysfs configuration?
I wonder if explicitly coding the behavior is better. For example,
in __thp_vma_allowable_orders(), enforce_sysfs = tva_flags & (TVA_ENFORCE_SYSFS | TVA_IN_KHUGEPAGE).
>
> #define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
> (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))
> @@ -182,6 +183,7 @@ extern unsigned long transparent_hugepage_flags;
> extern unsigned long huge_anon_orders_always;
> extern unsigned long huge_anon_orders_madvise;
> extern unsigned long huge_anon_orders_inherit;
> +extern unsigned long huge_anon_orders_defer;
>
> static inline bool hugepage_global_enabled(void)
> {
> @@ -306,6 +308,9 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> /* Optimization to check if required orders are enabled early. */
> if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
And code here becomes tva_flags & (TVA_ENFORCE_SYSFS | TVA_IN_KHUGEPAGE).
Otherwise, LGTM. Reviewed-by: Zi Yan <ziy@nvidia.com>
--
Best Regards,
Yan, Zi
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v5 3/4] khugepaged: add defer option to mTHP options
2025-04-30 20:34 ` Zi Yan
@ 2025-05-01 22:53 ` Nico Pache
0 siblings, 0 replies; 12+ messages in thread
From: Nico Pache @ 2025-05-01 22:53 UTC (permalink / raw)
To: Zi Yan
Cc: linux-mm, linux-doc, linux-kernel, linux-kselftest, akpm, corbet,
rostedt, mhiramat, mathieu.desnoyers, david, baohua, baolin.wang,
ryan.roberts, willy, peterx, shuah, wangkefeng.wang, usamaarif642,
sunnanyong, vishal.moola, thomas.hellstrom, yang, kirill.shutemov,
aarcange, raquini, dev.jain, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
Liam.Howlett, lorenzo.stoakes, hannes, rientjes, mhocko, rdunlap
On Wed, Apr 30, 2025 at 2:34 PM Zi Yan <ziy@nvidia.com> wrote:
>
> On 28 Apr 2025, at 14:29, Nico Pache wrote:
>
> > Now that we have defer to globally disable THPs at fault time, lets add
> > a defer setting to the mTHP options. This will allow khugepaged to
> > operate at that order, while avoiding it at PF time.
> >
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> > include/linux/huge_mm.h | 5 +++++
> > mm/huge_memory.c | 38 +++++++++++++++++++++++++++++++++-----
> > mm/khugepaged.c | 8 ++++----
> > 3 files changed, 42 insertions(+), 9 deletions(-)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 57e6c962afb1..a877c59bea67 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -96,6 +96,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
> > #define TVA_SMAPS (1 << 0) /* Will be used for procfs */
> > #define TVA_IN_PF (1 << 1) /* Page fault handler */
> > #define TVA_ENFORCE_SYSFS (1 << 2) /* Obey sysfs configuration */
> > +#define TVA_IN_KHUGEPAGE ((1 << 2) | (1 << 3)) /* Khugepaged defer support */
>
> Why is TVA_IN_KHUGEPAGE a superset of TVA_ENFORCE_SYSFS? Because khugepaged
> also obeys sysfs configuration?
Correct! TVA_IN_KHUGEPAGE is needed to keep the "deferred" mTHP orders
from being "allowed" unless we are in khugepaged.
>
> I wonder if explicitly coding the behavior is better. For example,
> in __thp_vma_allowable_orders(), enforce_sysfs = tva_flags & (TVA_ENFORCE_SYSFS | TVA_IN_KHUGEPAGE).
I'm rather indifferent about either approach. If you (or any others)
have a strong preference for an explicit (non-superset) TVA flag I
can make the change.
>
> >
> > #define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
> > (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))
> > @@ -182,6 +183,7 @@ extern unsigned long transparent_hugepage_flags;
> > extern unsigned long huge_anon_orders_always;
> > extern unsigned long huge_anon_orders_madvise;
> > extern unsigned long huge_anon_orders_inherit;
> > +extern unsigned long huge_anon_orders_defer;
> >
> > static inline bool hugepage_global_enabled(void)
> > {
> > @@ -306,6 +308,9 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> > /* Optimization to check if required orders are enabled early. */
> > if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
>
> And code here becomes tva_flags & (TVA_ENFORCE_SYSFS | TVA_IN_KHUGEPAGE).
or just (enforce_sysfs & vma_is_anon) like you mentioned. Then we
check for TVA_IN_KHUGEPAGE before appending the defer bits.
>
> Otherwise, LGTM. Reviewed-by: Zi Yan <ziy@nvidia.com>
Thanks !
>
> --
> Best Regards,
> Yan, Zi
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v5 4/4] selftests: mm: add defer to thp setting parser
2025-04-28 18:29 [PATCH v5 0/4] mm: introduce THP deferred setting Nico Pache
` (2 preceding siblings ...)
2025-04-28 18:29 ` [PATCH v5 3/4] khugepaged: add defer option to mTHP options Nico Pache
@ 2025-04-28 18:29 ` Nico Pache
2025-04-30 20:40 ` Zi Yan
3 siblings, 1 reply; 12+ messages in thread
From: Nico Pache @ 2025-04-28 18:29 UTC (permalink / raw)
To: linux-mm, linux-doc, linux-kernel, linux-kselftest
Cc: akpm, corbet, rostedt, mhiramat, mathieu.desnoyers, david, baohua,
baolin.wang, ryan.roberts, willy, peterx, shuah, ziy,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kirill.shutemov, aarcange, raquini,
dev.jain, anshuman.khandual, catalin.marinas, tiwai, will,
dave.hansen, jack, cl, jglisse, surenb, zokeefe, Liam.Howlett,
lorenzo.stoakes, hannes, rientjes, mhocko, rdunlap
Add the defer setting to the selftests library for reading THP settings.
Signed-off-by: Nico Pache <npache@redhat.com>
---
tools/testing/selftests/mm/thp_settings.c | 1 +
tools/testing/selftests/mm/thp_settings.h | 1 +
2 files changed, 2 insertions(+)
diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/thp_settings.c
index ad872af1c81a..b2f9f62b302a 100644
--- a/tools/testing/selftests/mm/thp_settings.c
+++ b/tools/testing/selftests/mm/thp_settings.c
@@ -20,6 +20,7 @@ static const char * const thp_enabled_strings[] = {
"always",
"inherit",
"madvise",
+ "defer",
NULL
};
diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h
index fc131d23d593..0d52e6d4f754 100644
--- a/tools/testing/selftests/mm/thp_settings.h
+++ b/tools/testing/selftests/mm/thp_settings.h
@@ -11,6 +11,7 @@ enum thp_enabled {
THP_ALWAYS,
THP_INHERIT,
THP_MADVISE,
+ THP_DEFER,
};
enum thp_defrag {
--
2.48.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v5 4/4] selftests: mm: add defer to thp setting parser
2025-04-28 18:29 ` [PATCH v5 4/4] selftests: mm: add defer to thp setting parser Nico Pache
@ 2025-04-30 20:40 ` Zi Yan
0 siblings, 0 replies; 12+ messages in thread
From: Zi Yan @ 2025-04-30 20:40 UTC (permalink / raw)
To: Nico Pache
Cc: linux-mm, linux-doc, linux-kernel, linux-kselftest, akpm, corbet,
rostedt, mhiramat, mathieu.desnoyers, david, baohua, baolin.wang,
ryan.roberts, willy, peterx, shuah, wangkefeng.wang, usamaarif642,
sunnanyong, vishal.moola, thomas.hellstrom, yang, kirill.shutemov,
aarcange, raquini, dev.jain, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
Liam.Howlett, lorenzo.stoakes, hannes, rientjes, mhocko, rdunlap
On 28 Apr 2025, at 14:29, Nico Pache wrote:
> add the defer setting to the selftests library for reading thp settings.
>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> tools/testing/selftests/mm/thp_settings.c | 1 +
> tools/testing/selftests/mm/thp_settings.h | 1 +
> 2 files changed, 2 insertions(+)
>
Acked-by: Zi Yan <ziy@nvidia.com>
--
Best Regards,
Yan, Zi
^ permalink raw reply [flat|nested] 12+ messages in thread