Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* Re: [PATCH 06/15] cpufreq: Use trace_invoke_##name() at guarded tracepoint call sites
From: Rafael J. Wysocki @ 2026-03-12 18:58 UTC (permalink / raw)
  To: vineeth
  Cc: Steven Rostedt, Peter Zijlstra, Huang Rui, Gautham R. Shenoy,
	Mario Limonciello, Perry Yuan, Rafael J. Wysocki, Viresh Kumar,
	Srinivas Pandruvada, Len Brown, linux-pm, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260312150523.2054552-7-vineeth@bitbyteword.org>

On Thu, Mar 12, 2026 at 4:05 PM Vineeth Pillai (Google)
<vineeth@bitbyteword.org> wrote:
>
> Replace trace_foo() with the new trace_invoke_foo() at sites already
> guarded by trace_foo_enabled(), avoiding a redundant
> static_branch_unlikely() re-evaluation inside the tracepoint.
> trace_invoke_foo() calls the tracepoint callbacks directly without
> utilizing the static branch again.
>
> Suggested-by: Steven Rostedt <rostedt@goodmis.org>
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
> Assisted-by: Claude:claude-sonnet-4-6

Fine with me, so

Acked-by: Rafael J. Wysocki (Intel) <rafael@kernel.org> # cpufreq

> ---
>  drivers/cpufreq/amd-pstate.c   | 10 +++++-----
>  drivers/cpufreq/cpufreq.c      |  2 +-
>  drivers/cpufreq/intel_pstate.c |  2 +-
>  3 files changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
> index 5aa9fcd80cf51..3fa40a32ef6b5 100644
> --- a/drivers/cpufreq/amd-pstate.c
> +++ b/drivers/cpufreq/amd-pstate.c
> @@ -247,7 +247,7 @@ static int msr_update_perf(struct cpufreq_policy *policy, u8 min_perf,
>         if (trace_amd_pstate_epp_perf_enabled()) {
>                 union perf_cached perf = READ_ONCE(cpudata->perf);
>
> -               trace_amd_pstate_epp_perf(cpudata->cpu,
> +               trace_invoke_amd_pstate_epp_perf(cpudata->cpu,
>                                           perf.highest_perf,
>                                           epp,
>                                           min_perf,
> @@ -298,7 +298,7 @@ static int msr_set_epp(struct cpufreq_policy *policy, u8 epp)
>         if (trace_amd_pstate_epp_perf_enabled()) {
>                 union perf_cached perf = cpudata->perf;
>
> -               trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
> +               trace_invoke_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
>                                           epp,
>                                           FIELD_GET(AMD_CPPC_MIN_PERF_MASK,
>                                                     cpudata->cppc_req_cached),
> @@ -343,7 +343,7 @@ static int shmem_set_epp(struct cpufreq_policy *policy, u8 epp)
>         if (trace_amd_pstate_epp_perf_enabled()) {
>                 union perf_cached perf = cpudata->perf;
>
> -               trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
> +               trace_invoke_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
>                                           epp,
>                                           FIELD_GET(AMD_CPPC_MIN_PERF_MASK,
>                                                     cpudata->cppc_req_cached),
> @@ -507,7 +507,7 @@ static int shmem_update_perf(struct cpufreq_policy *policy, u8 min_perf,
>         if (trace_amd_pstate_epp_perf_enabled()) {
>                 union perf_cached perf = READ_ONCE(cpudata->perf);
>
> -               trace_amd_pstate_epp_perf(cpudata->cpu,
> +               trace_invoke_amd_pstate_epp_perf(cpudata->cpu,
>                                           perf.highest_perf,
>                                           epp,
>                                           min_perf,
> @@ -588,7 +588,7 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf,
>         }
>
>         if (trace_amd_pstate_perf_enabled() && amd_pstate_sample(cpudata)) {
> -               trace_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq,
> +               trace_invoke_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq,
>                         cpudata->cur.mperf, cpudata->cur.aperf, cpudata->cur.tsc,
>                                 cpudata->cpu, fast_switch);
>         }
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 277884d91913c..cf57aeb503790 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -2222,7 +2222,7 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
>
>         if (trace_cpu_frequency_enabled()) {
>                 for_each_cpu(cpu, policy->cpus)
> -                       trace_cpu_frequency(freq, cpu);
> +                       trace_invoke_cpu_frequency(freq, cpu);
>         }
>
>         return freq;
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index 11c58af419006..a0da9b31c4ffe 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -3132,7 +3132,7 @@ static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, in
>                 return;
>
>         sample = &cpu->sample;
> -       trace_pstate_sample(trace_type,
> +       trace_invoke_pstate_sample(trace_type,
>                 0,
>                 old_pstate,
>                 cpu->pstate.current_pstate,
> --
> 2.53.0
>

^ permalink raw reply

* [PATCH v2 0/3] lib/bootconfig: three bug fixes
From: Josh Law @ 2026-03-12 19:11 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: Josh Law, linux-kernel, linux-trace-kernel

Three fixes for lib/bootconfig.c:

1. Fix off-by-one in xbc_verify_tree() unclosed brace error reporting.
2. Check bounds before writing in __xbc_open_brace() to prevent
   potential out-of-bounds writes.
3. Fix snprintf truncation check in xbc_node_compose_key_after().

Changes since v1:
- Added explicit From: lines at top of changelog.

Josh Law (3):
  lib/bootconfig: fix off-by-one in xbc_verify_tree() unclosed brace
    error
  lib/bootconfig: check bounds before writing in __xbc_open_brace()
  lib/bootconfig: fix snprintf truncation check in
    xbc_node_compose_key_after()

 lib/bootconfig.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

-- 
2.34.1

^ permalink raw reply

* [PATCH v2 1/3] lib/bootconfig: fix off-by-one in xbc_verify_tree() unclosed brace error
From: Josh Law @ 2026-03-12 19:11 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: Josh Law, linux-kernel, linux-trace-kernel
In-Reply-To: <20260312191143.28719-1-objecting@objecting.org>

From: Josh Law <objecting@objecting.org>

__xbc_open_brace() pushes entries with post-increment
(open_brace[brace_index++]), so brace_index always points one past
the last valid entry.  xbc_verify_tree() reads open_brace[brace_index]
to report which brace is unclosed, but this is one past the last
pushed entry and contains stale/zero data, causing the error message
to reference the wrong node.

Use open_brace[brace_index - 1] to correctly identify the unclosed
brace.  brace_index is known to be > 0 here since we are inside the
if (brace_index) guard.

Signed-off-by: Josh Law <objecting@objecting.org>
---
 lib/bootconfig.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 2bcd5c2aa87e..a1e6a2e14b01 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -802,7 +802,7 @@ static int __init xbc_verify_tree(void)

 	/* Brace closing */
 	if (brace_index) {
-		n = &xbc_nodes[open_brace[brace_index]];
+		n = &xbc_nodes[open_brace[brace_index - 1]];
 		return xbc_parse_error("Brace is not closed",
 					xbc_node_get_data(n));
 	}
-- 
2.34.1

^ permalink raw reply related

* [PATCH v2 2/3] lib/bootconfig: check bounds before writing in __xbc_open_brace()
From: Josh Law @ 2026-03-12 19:11 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: Josh Law, linux-kernel, linux-trace-kernel
In-Reply-To: <20260312191143.28719-1-objecting@objecting.org>

From: Josh Law <objecting@objecting.org>

The bounds check for brace_index happens after the array write.
While the current call pattern prevents an actual out-of-bounds
access (the previous call would have returned an error), the
write-before-check pattern is fragile and would become a real
out-of-bounds write if the error return were ever not propagated.

Move the bounds check before the array write so the function is
self-contained and safe regardless of caller behavior.

Signed-off-by: Josh Law <objecting@objecting.org>
---
 lib/bootconfig.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index a1e6a2e14b01..62b4ed7a0ba6 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -532,9 +532,9 @@ static char *skip_spaces_until_newline(char *p)
 static int __init __xbc_open_brace(char *p)
 {
 	/* Push the last key as open brace */
-	open_brace[brace_index++] = xbc_node_index(last_parent);
 	if (brace_index >= XBC_DEPTH_MAX)
 		return xbc_parse_error("Exceed max depth of braces", p);
+	open_brace[brace_index++] = xbc_node_index(last_parent);
 
 	return 0;
 }
-- 
2.34.1


^ permalink raw reply related

* [PATCH v2 3/3] lib/bootconfig: fix snprintf truncation check in xbc_node_compose_key_after()
From: Josh Law @ 2026-03-12 19:11 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: Josh Law, linux-kernel, linux-trace-kernel
In-Reply-To: <20260312191143.28719-1-objecting@objecting.org>

From: Josh Law <objecting@objecting.org>

snprintf() returns the number of characters that would have been
written excluding the NUL terminator.  Output is truncated when the
return value is >= the buffer size, not just > the buffer size.

When ret == size, the current code takes the non-truncated path,
advancing buf by ret and reducing size to 0.  This is wrong because
the output was actually truncated (the last character was replaced by
NUL).  Fix by using >= so the truncation path is taken correctly.

Signed-off-by: Josh Law <objecting@objecting.org>
---
 lib/bootconfig.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 62b4ed7a0ba6..b0ef1e74e98a 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -316,7 +316,7 @@ int __init xbc_node_compose_key_after(struct xbc_node *root,
 			       depth ? "." : "");
 		if (ret < 0)
 			return ret;
-		if (ret > size) {
+		if (ret >= size) {
 			size = 0;
 		} else {
 			size -= ret;
-- 
2.34.1

^ permalink raw reply related

* Re: [PATCH 48/61] mtd: Prefer IS_ERR_OR_NULL over manual NULL check
From: Richard Weinberger @ 2026-03-12 19:33 UTC (permalink / raw)
  To: Philipp Hahn
  Cc: amd-gfx, apparmor, bpf, ceph-devel, cocci, dm-devel,
	DRI mailing list, gfs2, intel-gfx, intel-wired-lan, iommu, kvm,
	linux-arm-kernel, linux-block, linux-bluetooth, linux-btrfs,
	linux-cifs, linux-clk, linux-erofs, linux-ext4, linux-fsdevel,
	linux-gpio, linux-hyperv, linux-input, linux-kernel, linux-leds,
	linux-media, linux-mips, linux-mm, linux-modules, linux-mtd,
	linux-nfs, linux-omap, linux-phy, linux-pm, linux-rockchip,
	linux-s390, linux-scsi, linux-sctp, LSM, linux-sh, linux-sound,
	linux-stm32, linux-trace-kernel, linux-usb, linux-wireless,
	netdev, ntfs3, samba-technical, sched-ext, target-devel,
	tipc-discussion, v9fs, Miquel Raynal, Vignesh Raghavendra
In-Reply-To: <20260310-b4-is_err_or_null-v1-48-bd63b656022d@avm.de>

----- Ursprüngliche Mail -----
> Von: "Philipp Hahn" <phahn-oss@avm.de>
> -	if (gpiomtd->nwp && !IS_ERR(gpiomtd->nwp))
> +	if (!IS_ERR_OR_NULL(gpiomtd->nwp))

No, please don't.

This makes reading the code not easier.

Thanks,
//richard

^ permalink raw reply

* Re: [PATCH mm-unstable v15 01/13] mm/khugepaged: generalize hugepage_vma_revalidate for mTHP support
From: David Hildenbrand (Arm) @ 2026-03-12 20:00 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, lorenzo.stoakes,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
	pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang, rientjes,
	rostedt, rppt, ryan.roberts, shivankg, sunnanyong, surenb,
	thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
	wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260226032217.232353-1-npache@redhat.com>

On 2/26/26 04:22, Nico Pache wrote:
> For khugepaged to support different mTHP orders, we must generalize this
> to check if the PMD is not shared by another VMA and that the order is
> enabled.
> 
> No functional change in this patch. Also correct a comment about the
> functionality of the revalidation.
> 
> Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
> Reviewed-by: Lance Yang <lance.yang@linux.dev>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Reviewed-by: Zi Yan <ziy@nvidia.com>
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>

Acked-by: David Hildenbrand (Arm) <david@kernel.org>

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH mm-unstable v15 02/13] mm/khugepaged: generalize alloc_charge_folio()
From: David Hildenbrand (Arm) @ 2026-03-12 20:05 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, lorenzo.stoakes,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
	pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang, rientjes,
	rostedt, rppt, ryan.roberts, shivankg, sunnanyong, surenb,
	thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
	wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260226032326.232770-1-npache@redhat.com>

On 2/26/26 04:23, Nico Pache wrote:
> From: Dev Jain <dev.jain@arm.com>
> 
> Pass order to alloc_charge_folio() and update mTHP statistics.
> 
> Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
> Reviewed-by: Lance Yang <lance.yang@linux.dev>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Reviewed-by: Zi Yan <ziy@nvidia.com>
> Co-developed-by: Nico Pache <npache@redhat.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---

Acked-by: David Hildenbrand (Arm) <david@kernel.org>

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH 13/15] spi: Use trace_invoke_##name() at guarded tracepoint call sites
From: Mark Brown @ 2026-03-12 20:05 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Vineeth Pillai (Google), Peter Zijlstra, Michael Hennerich,
	Nuno Sá, David Lechner, linux-spi, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260312130022.669e7de4@gandalf.local.home>

[-- Attachment #1: Type: text/plain, Size: 631 bytes --]

On Thu, Mar 12, 2026 at 01:00:22PM -0400, Steven Rostedt wrote:
> Mark Brown <broonie@kernel.org> wrote:

> > Possibly an _unchecked or something?  Honestly the suggestion someone
> > had for _do seemed OK to me.  Part of it is that I wouldn't think of
> > tracepoints as being something that I'd call.

> The "__do_trace.." is an internal function I don't want to expose.

> I'm thinking of: call_trace_foo(), as that should be pretty obvious to what
> it is.

Yeah, like I day it was the concept of there being a call in a
tracepoint that was a bit of a surprise.  It was more a grumble than
anything else, I did ack the change.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH mm-unstable v15 03/13] mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
From: David Hildenbrand (Arm) @ 2026-03-12 20:32 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, lorenzo.stoakes,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
	pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang, rientjes,
	rostedt, rppt, ryan.roberts, shivankg, sunnanyong, surenb,
	thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
	wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260226032347.232939-1-npache@redhat.com>

On 2/26/26 04:23, Nico Pache wrote:
> generalize the order of the __collapse_huge_page_* functions
> to support future mTHP collapse.
> 
> mTHP collapse will not honor the khugepaged_max_ptes_shared or
> khugepaged_max_ptes_swap parameters, and will fail if it encounters a
> shared or swapped entry.
> 
> No functional changes in this patch.
> 
> Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
> Reviewed-by: Lance Yang <lance.yang@linux.dev>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 73 +++++++++++++++++++++++++++++++------------------
>  1 file changed, 47 insertions(+), 26 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index a9b645402b7f..ecdbbf6a01a6 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -535,7 +535,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
>  
>  static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  		unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
> -		struct list_head *compound_pagelist)
> +		unsigned int order, struct list_head *compound_pagelist)
>  {
>  	struct page *page = NULL;
>  	struct folio *folio = NULL;
> @@ -543,15 +543,17 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  	pte_t *_pte;
>  	int none_or_zero = 0, shared = 0, referenced = 0;
>  	enum scan_result result = SCAN_FAIL;
> +	const unsigned long nr_pages = 1UL << order;
> +	int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);

It might be a bit more readable to move "const unsigned long
nr_pages = 1UL << order;" all the way to the top.

Then, have here

	int max_ptes_none = 0;

and do at the beginning of the function:

	/* For MADV_COLLAPSE, we always collapse ... */
	if (!cc->is_khugepaged)
		max_ptes_none = HPAGE_PMD_NR;
	/*  ... except if userfaultf relies on MISSING faults. */
	if (!userfaultfd_armed(vma))
		max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);

(but see below regarding helper function)

then the code below becomes ...

>  
> -	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
> +	for (_pte = pte; _pte < pte + nr_pages;
>  	     _pte++, addr += PAGE_SIZE) {
>  		pte_t pteval = ptep_get(_pte);
>  		if (pte_none_or_zero(pteval)) {
>  			++none_or_zero;
>  			if (!userfaultfd_armed(vma) &&
>  			    (!cc->is_khugepaged ||
> -			     none_or_zero <= khugepaged_max_ptes_none)) {
> +			     none_or_zero <= max_ptes_none)) {

...

	if (none_or_zero <= max_ptes_none) {


I see that you do something like that (but slightly different) in the next
patch. You could easily extend the above by it.

Or go one step further and move all of that conditional into collapse_max_ptes_none(), whereby
you simply also pass the cc and the vma.

Then this all gets cleaned up and you'd end up above with

max_ptes_none = collapse_max_ptes_none(cc, vma, order);
if (max_ptes_none < 0)
	return result;

I'd do all that in this patch here, getting rid of #4.


>  				continue;
>  			} else {
>  				result = SCAN_EXCEED_NONE_PTE;
> @@ -585,8 +587,14 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  		/* See collapse_scan_pmd(). */
>  		if (folio_maybe_mapped_shared(folio)) {
>  			++shared;
> -			if (cc->is_khugepaged &&
> -			    shared > khugepaged_max_ptes_shared) {
> +			/*
> +			 * TODO: Support shared pages without leading to further
> +			 * mTHP collapses. Currently bringing in new pages via
> +			 * shared may cause a future higher order collapse on a
> +			 * rescan of the same range.
> +			 */
> +			if (!is_pmd_order(order) || (cc->is_khugepaged &&
> +			    shared > khugepaged_max_ptes_shared)) {

That's not how we indent within a nested ().

To make this easier to read, what about similarly having at the beginning
of the function:

int max_ptes_shared = 0;

/* For MADV_COLLAPSE, we always collapse. */
if (cc->is_khugepaged)
	max_ptes_none = HPAGE_PMD_NR;
/* TODO ... */
if (is_pmd_order(order))
	max_ptes_none = khugepaged_max_ptes_shared;

to turn this code into a

	if (shared > khugepaged_max_ptes_shared)

Also, here, might make sense to have a collapse_max_ptes_swap(cc, order)
to do that and clean it up.


>  				result = SCAN_EXCEED_SHARED_PTE;
>  				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
>  				goto out;
> @@ -679,18 +687,18 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  }
>  
>  static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> -						struct vm_area_struct *vma,
> -						unsigned long address,
> -						spinlock_t *ptl,
> -						struct list_head *compound_pagelist)
> +		struct vm_area_struct *vma, unsigned long address,
> +		spinlock_t *ptl, unsigned int order,
> +		struct list_head *compound_pagelist)
>  {
> -	unsigned long end = address + HPAGE_PMD_SIZE;
> +	unsigned long end = address + (PAGE_SIZE << order);
>  	struct folio *src, *tmp;
>  	pte_t pteval;
>  	pte_t *_pte;
>  	unsigned int nr_ptes;
> +	const unsigned long nr_pages = 1UL << order;

Move it further to the top.

>  
> -	for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
> +	for (_pte = pte; _pte < pte + nr_pages; _pte += nr_ptes,
>  	     address += nr_ptes * PAGE_SIZE) {
>  		nr_ptes = 1;
>  		pteval = ptep_get(_pte);
> @@ -743,13 +751,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
>  }
>  
>  static void __collapse_huge_page_copy_failed(pte_t *pte,
> -					     pmd_t *pmd,
> -					     pmd_t orig_pmd,
> -					     struct vm_area_struct *vma,
> -					     struct list_head *compound_pagelist)
> +		pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> +		unsigned int order, struct list_head *compound_pagelist)
>  {
>  	spinlock_t *pmd_ptl;
> -
> +	const unsigned long nr_pages = 1UL << order;
>  	/*
>  	 * Re-establish the PMD to point to the original page table
>  	 * entry. Restoring PMD needs to be done prior to releasing
> @@ -763,7 +769,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
>  	 * Release both raw and compound pages isolated
>  	 * in __collapse_huge_page_isolate.
>  	 */
> -	release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
> +	release_pte_pages(pte, pte + nr_pages, compound_pagelist);
>  }
>  
>  /*
> @@ -783,16 +789,16 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
>   */
>  static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
>  		pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> -		unsigned long address, spinlock_t *ptl,
> +		unsigned long address, spinlock_t *ptl, unsigned int order,
>  		struct list_head *compound_pagelist)
>  {
>  	unsigned int i;
>  	enum scan_result result = SCAN_SUCCEED;
> -
> +	const unsigned long nr_pages = 1UL << order;

Same here, all the way to the top.

>  	/*
>  	 * Copying pages' contents is subject to memory poison at any iteration.
>  	 */
> -	for (i = 0; i < HPAGE_PMD_NR; i++) {
> +	for (i = 0; i < nr_pages; i++) {
>  		pte_t pteval = ptep_get(pte + i);
>  		struct page *page = folio_page(folio, i);
>  		unsigned long src_addr = address + i * PAGE_SIZE;
> @@ -811,10 +817,10 @@ static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *foli
>  
>  	if (likely(result == SCAN_SUCCEED))
>  		__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
> -						    compound_pagelist);
> +						    order, compound_pagelist);
>  	else
>  		__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
> -						 compound_pagelist);
> +						 order, compound_pagelist);
>  
>  	return result;
>  }
> @@ -985,12 +991,12 @@ static enum scan_result check_pmd_still_valid(struct mm_struct *mm,
>   * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
>   */
>  static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
> -		struct vm_area_struct *vma, unsigned long start_addr, pmd_t *pmd,
> -		int referenced)
> +		struct vm_area_struct *vma, unsigned long start_addr,
> +		pmd_t *pmd, int referenced, unsigned int order)
>  {
>  	int swapped_in = 0;
>  	vm_fault_t ret = 0;
> -	unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
> +	unsigned long addr, end = start_addr + (PAGE_SIZE << order);
>  	enum scan_result result;
>  	pte_t *pte = NULL;
>  	spinlock_t *ptl;
> @@ -1022,6 +1028,19 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
>  		    pte_present(vmf.orig_pte))
>  			continue;
>  
> +		/*
> +		 * TODO: Support swapin without leading to further mTHP
> +		 * collapses. Currently bringing in new pages via swapin may
> +		 * cause a future higher order collapse on a rescan of the same
> +		 * range.
> +		 */
> +		if (!is_pmd_order(order)) {
> +			pte_unmap(pte);
> +			mmap_read_unlock(mm);
> +			result = SCAN_EXCEED_SWAP_PTE;
> +			goto out;
> +		}
> +

Interesting, we just swapin everything we find :)

But do we really need this check here? I mean, we just found it to be present.

In the rare event that there was a race, do we really care? It was just
present, now it's swapped. Bad luck. Just swap it in.

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH mm-unstable v15 03/13] mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
From: David Hildenbrand (Arm) @ 2026-03-12 20:36 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, lorenzo.stoakes,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
	pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang, rientjes,
	rostedt, rppt, ryan.roberts, shivankg, sunnanyong, surenb,
	thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
	wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <8a4568de-e0f9-471b-bc94-1062d4af3938@kernel.org>

On 3/12/26 21:32, David Hildenbrand (Arm) wrote:
> On 2/26/26 04:23, Nico Pache wrote:
>> generalize the order of the __collapse_huge_page_* functions
>> to support future mTHP collapse.
>>
>> mTHP collapse will not honor the khugepaged_max_ptes_shared or
>> khugepaged_max_ptes_swap parameters, and will fail if it encounters a
>> shared or swapped entry.
>>
>> No functional changes in this patch.
>>
>> Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
>> Reviewed-by: Lance Yang <lance.yang@linux.dev>
>> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>> Co-developed-by: Dev Jain <dev.jain@arm.com>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> Signed-off-by: Nico Pache <npache@redhat.com>
>> ---
>>  mm/khugepaged.c | 73 +++++++++++++++++++++++++++++++------------------
>>  1 file changed, 47 insertions(+), 26 deletions(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index a9b645402b7f..ecdbbf6a01a6 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -535,7 +535,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
>>  
>>  static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>  		unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
>> -		struct list_head *compound_pagelist)
>> +		unsigned int order, struct list_head *compound_pagelist)
>>  {
>>  	struct page *page = NULL;
>>  	struct folio *folio = NULL;
>> @@ -543,15 +543,17 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>  	pte_t *_pte;
>>  	int none_or_zero = 0, shared = 0, referenced = 0;
>>  	enum scan_result result = SCAN_FAIL;
>> +	const unsigned long nr_pages = 1UL << order;
>> +	int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
> 
> It might be a bit more readable to move "const unsigned long
> nr_pages = 1UL << order;" all the way to the top.
> 
> Then, have here
> 
> 	int max_ptes_none = 0;
> 
> and do at the beginning of the function:
> 
> 	/* For MADV_COLLAPSE, we always collapse ... */
> 	if (!cc->is_khugepaged)
> 		max_ptes_none = HPAGE_PMD_NR;
> 	/*  ... except if userfaultf relies on MISSING faults. */
> 	if (!userfaultfd_armed(vma))
> 		max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
> 
> (but see below regarding helper function)
> 
> then the code below becomes ...
> 
>>  
>> -	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
>> +	for (_pte = pte; _pte < pte + nr_pages;
>>  	     _pte++, addr += PAGE_SIZE) {
>>  		pte_t pteval = ptep_get(_pte);
>>  		if (pte_none_or_zero(pteval)) {
>>  			++none_or_zero;
>>  			if (!userfaultfd_armed(vma) &&
>>  			    (!cc->is_khugepaged ||
>> -			     none_or_zero <= khugepaged_max_ptes_none)) {
>> +			     none_or_zero <= max_ptes_none)) {
> 
> ...
> 
> 	if (none_or_zero <= max_ptes_none) {
> 
> 
> I see that you do something like that (but slightly different) in the next
> patch. You could easily extend the above by it.
> 
> Or go one step further and move all of that conditional into collapse_max_ptes_none(), whereby
> you simply also pass the cc and the vma.
> 
> Then this all gets cleaned up and you'd end up above with
> 
> max_ptes_none = collapse_max_ptes_none(cc, vma, order);
> if (max_ptes_none < 0)
> 	return result;
> 
> I'd do all that in this patch here, getting rid of #4.
> 
> 
>>  				continue;
>>  			} else {
>>  				result = SCAN_EXCEED_NONE_PTE;
>> @@ -585,8 +587,14 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>  		/* See collapse_scan_pmd(). */
>>  		if (folio_maybe_mapped_shared(folio)) {
>>  			++shared;
>> -			if (cc->is_khugepaged &&
>> -			    shared > khugepaged_max_ptes_shared) {
>> +			/*
>> +			 * TODO: Support shared pages without leading to further
>> +			 * mTHP collapses. Currently bringing in new pages via
>> +			 * shared may cause a future higher order collapse on a
>> +			 * rescan of the same range.
>> +			 */
>> +			if (!is_pmd_order(order) || (cc->is_khugepaged &&
>> +			    shared > khugepaged_max_ptes_shared)) {
> 
> That's not how we indent within a nested ().
> 
> To make this easier to read, what about similarly having at the beginning
> of the function:
> 
> int max_ptes_shared = 0;
> 
> /* For MADV_COLLAPSE, we always collapse. */
> if (cc->is_khugepaged)
> 	max_ptes_none = HPAGE_PMD_NR;
> /* TODO ... */
> if (is_pmd_order(order))
> 	max_ptes_none = khugepaged_max_ptes_shared;
> 
> to turn this code into a
> 
> 	if (shared > khugepaged_max_ptes_shared)
> 
> Also, here, might make sense to have a collapse_max_ptes_swap(cc, order)
> to do that and clean it up.
> 
> 
>>  				result = SCAN_EXCEED_SHARED_PTE;
>>  				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
>>  				goto out;
>> @@ -679,18 +687,18 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>  }
>>  
>>  static void __collapse_huge_page_copy_succeeded(pte_t *pte,
>> -						struct vm_area_struct *vma,
>> -						unsigned long address,
>> -						spinlock_t *ptl,
>> -						struct list_head *compound_pagelist)
>> +		struct vm_area_struct *vma, unsigned long address,
>> +		spinlock_t *ptl, unsigned int order,
>> +		struct list_head *compound_pagelist)
>>  {
>> -	unsigned long end = address + HPAGE_PMD_SIZE;
>> +	unsigned long end = address + (PAGE_SIZE << order);
>>  	struct folio *src, *tmp;
>>  	pte_t pteval;
>>  	pte_t *_pte;
>>  	unsigned int nr_ptes;
>> +	const unsigned long nr_pages = 1UL << order;
> 
> Move it further to the top.
> 
>>  
>> -	for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
>> +	for (_pte = pte; _pte < pte + nr_pages; _pte += nr_ptes,
>>  	     address += nr_ptes * PAGE_SIZE) {
>>  		nr_ptes = 1;
>>  		pteval = ptep_get(_pte);
>> @@ -743,13 +751,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
>>  }
>>  
>>  static void __collapse_huge_page_copy_failed(pte_t *pte,
>> -					     pmd_t *pmd,
>> -					     pmd_t orig_pmd,
>> -					     struct vm_area_struct *vma,
>> -					     struct list_head *compound_pagelist)
>> +		pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
>> +		unsigned int order, struct list_head *compound_pagelist)
>>  {
>>  	spinlock_t *pmd_ptl;
>> -
>> +	const unsigned long nr_pages = 1UL << order;
>>  	/*
>>  	 * Re-establish the PMD to point to the original page table
>>  	 * entry. Restoring PMD needs to be done prior to releasing
>> @@ -763,7 +769,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
>>  	 * Release both raw and compound pages isolated
>>  	 * in __collapse_huge_page_isolate.
>>  	 */
>> -	release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
>> +	release_pte_pages(pte, pte + nr_pages, compound_pagelist);
>>  }
>>  
>>  /*
>> @@ -783,16 +789,16 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
>>   */
>>  static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
>>  		pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
>> -		unsigned long address, spinlock_t *ptl,
>> +		unsigned long address, spinlock_t *ptl, unsigned int order,
>>  		struct list_head *compound_pagelist)
>>  {
>>  	unsigned int i;
>>  	enum scan_result result = SCAN_SUCCEED;
>> -
>> +	const unsigned long nr_pages = 1UL << order;
> 
> Same here, all the way to the top.
> 
>>  	/*
>>  	 * Copying pages' contents is subject to memory poison at any iteration.
>>  	 */
>> -	for (i = 0; i < HPAGE_PMD_NR; i++) {
>> +	for (i = 0; i < nr_pages; i++) {
>>  		pte_t pteval = ptep_get(pte + i);
>>  		struct page *page = folio_page(folio, i);
>>  		unsigned long src_addr = address + i * PAGE_SIZE;
>> @@ -811,10 +817,10 @@ static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *foli
>>  
>>  	if (likely(result == SCAN_SUCCEED))
>>  		__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
>> -						    compound_pagelist);
>> +						    order, compound_pagelist);
>>  	else
>>  		__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
>> -						 compound_pagelist);
>> +						 order, compound_pagelist);
>>  
>>  	return result;
>>  }
>> @@ -985,12 +991,12 @@ static enum scan_result check_pmd_still_valid(struct mm_struct *mm,
>>   * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
>>   */
>>  static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
>> -		struct vm_area_struct *vma, unsigned long start_addr, pmd_t *pmd,
>> -		int referenced)
>> +		struct vm_area_struct *vma, unsigned long start_addr,
>> +		pmd_t *pmd, int referenced, unsigned int order)
>>  {
>>  	int swapped_in = 0;
>>  	vm_fault_t ret = 0;
>> -	unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
>> +	unsigned long addr, end = start_addr + (PAGE_SIZE << order);
>>  	enum scan_result result;
>>  	pte_t *pte = NULL;
>>  	spinlock_t *ptl;
>> @@ -1022,6 +1028,19 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
>>  		    pte_present(vmf.orig_pte))
>>  			continue;
>>  
>> +		/*
>> +		 * TODO: Support swapin without leading to further mTHP
>> +		 * collapses. Currently bringing in new pages via swapin may
>> +		 * cause a future higher order collapse on a rescan of the same
>> +		 * range.
>> +		 */
>> +		if (!is_pmd_order(order)) {
>> +			pte_unmap(pte);
>> +			mmap_read_unlock(mm);
>> +			result = SCAN_EXCEED_SWAP_PTE;
>> +			goto out;
>> +		}
>> +
> 
> Interesting, we just swapin everything we find :)
> 
> But do we really need this check here? I mean, we just found it to be present.
> 
> In the rare event that there was a race, do we really care? It was just
> present, now it's swapped. Bad luck. Just swap it in.
> 

Okay, now I am confused. Why are you not taking care of
collapse_scan_pmd() in the same context?

Because if you make sure that we properly check against a max_ptes_swap
similar as in the style above, we'd rule out swapin right from the start?

Also, I would expect that all other parameters in there are similarly
handled?

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH v14 18/30] tracing: Check for undefined symbols in simple_ring_buffer
From: Nathan Chancellor @ 2026-03-12 20:40 UTC (permalink / raw)
  To: Vincent Donnefort
  Cc: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel, maz,
	oliver.upton, joey.gouly, suzuki.poulose, yuzenghui, kvmarm,
	linux-arm-kernel, jstultz, qperret, will, aneesh.kumar,
	kernel-team, linux-kernel
In-Reply-To: <abLIslqIcVscZRFg@google.com>

On Thu, Mar 12, 2026 at 02:07:46PM +0000, Vincent Donnefort wrote:
> In the end to unblock linux-next I have already sent an updated list of symbols.
> However feel free to send the logging bit, that is surely useful.

Thanks. I will review and test your "tracing: Generate undef symbols
allowlist for simple_ring_buffer" shortly (as I found more errors other
than the ones I described here) then I will base the logging patch on
top of that.

Cheers,
Nathan

^ permalink raw reply

* Re: [PATCH mm-unstable v15 03/13] mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
From: David Hildenbrand (Arm) @ 2026-03-12 20:56 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, lorenzo.stoakes,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
	pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang, rientjes,
	rostedt, rppt, ryan.roberts, shivankg, sunnanyong, surenb,
	thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
	wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <ee39e605-0d9f-433b-9dfa-f70fd92edfac@kernel.org>

On 3/12/26 21:36, David Hildenbrand (Arm) wrote:
> On 3/12/26 21:32, David Hildenbrand (Arm) wrote:
>> On 2/26/26 04:23, Nico Pache wrote:
>>> generalize the order of the __collapse_huge_page_* functions
>>> to support future mTHP collapse.
>>>
>>> mTHP collapse will not honor the khugepaged_max_ptes_shared or
>>> khugepaged_max_ptes_swap parameters, and will fail if it encounters a
>>> shared or swapped entry.
>>>
>>> No functional changes in this patch.
>>>
>>> Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
>>> Reviewed-by: Lance Yang <lance.yang@linux.dev>
>>> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>>> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>> Co-developed-by: Dev Jain <dev.jain@arm.com>
>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>> Signed-off-by: Nico Pache <npache@redhat.com>
>>> ---
>>>  mm/khugepaged.c | 73 +++++++++++++++++++++++++++++++------------------
>>>  1 file changed, 47 insertions(+), 26 deletions(-)
>>>
>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>> index a9b645402b7f..ecdbbf6a01a6 100644
>>> --- a/mm/khugepaged.c
>>> +++ b/mm/khugepaged.c
>>> @@ -535,7 +535,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
>>>  
>>>  static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>>  		unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
>>> -		struct list_head *compound_pagelist)
>>> +		unsigned int order, struct list_head *compound_pagelist)
>>>  {
>>>  	struct page *page = NULL;
>>>  	struct folio *folio = NULL;
>>> @@ -543,15 +543,17 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>>  	pte_t *_pte;
>>>  	int none_or_zero = 0, shared = 0, referenced = 0;
>>>  	enum scan_result result = SCAN_FAIL;
>>> +	const unsigned long nr_pages = 1UL << order;
>>> +	int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>>
>> It might be a bit more readable to move "const unsigned long
>> nr_pages = 1UL << order;" all the way to the top.
>>
>> Then, have here
>>
>> 	int max_ptes_none = 0;
>>
>> and do at the beginning of the function:
>>
>> 	/* For MADV_COLLAPSE, we always collapse ... */
>> 	if (!cc->is_khugepaged)
>> 		max_ptes_none = HPAGE_PMD_NR;
>> 	/*  ... except if userfaultf relies on MISSING faults. */
>> 	if (!userfaultfd_armed(vma))
>> 		max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>>
>> (but see below regarding helper function)
>>
>> then the code below becomes ...
>>
>>>  
>>> -	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
>>> +	for (_pte = pte; _pte < pte + nr_pages;
>>>  	     _pte++, addr += PAGE_SIZE) {
>>>  		pte_t pteval = ptep_get(_pte);
>>>  		if (pte_none_or_zero(pteval)) {
>>>  			++none_or_zero;
>>>  			if (!userfaultfd_armed(vma) &&
>>>  			    (!cc->is_khugepaged ||
>>> -			     none_or_zero <= khugepaged_max_ptes_none)) {
>>> +			     none_or_zero <= max_ptes_none)) {
>>
>> ...
>>
>> 	if (none_or_zero <= max_ptes_none) {
>>
>>
>> I see that you do something like that (but slightly different) in the next
>> patch. You could easily extend the above by it.
>>
>> Or go one step further and move all of that conditional into collapse_max_ptes_none(), whereby
>> you simply also pass the cc and the vma.
>>
>> Then this all gets cleaned up and you'd end up above with
>>
>> max_ptes_none = collapse_max_ptes_none(cc, vma, order);
>> if (max_ptes_none < 0)
>> 	return result;
>>
>> I'd do all that in this patch here, getting rid of #4.
>>
>>
>>>  				continue;
>>>  			} else {
>>>  				result = SCAN_EXCEED_NONE_PTE;
>>> @@ -585,8 +587,14 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>>  		/* See collapse_scan_pmd(). */
>>>  		if (folio_maybe_mapped_shared(folio)) {
>>>  			++shared;
>>> -			if (cc->is_khugepaged &&
>>> -			    shared > khugepaged_max_ptes_shared) {
>>> +			/*
>>> +			 * TODO: Support shared pages without leading to further
>>> +			 * mTHP collapses. Currently bringing in new pages via
>>> +			 * shared may cause a future higher order collapse on a
>>> +			 * rescan of the same range.
>>> +			 */
>>> +			if (!is_pmd_order(order) || (cc->is_khugepaged &&
>>> +			    shared > khugepaged_max_ptes_shared)) {
>>
>> That's not how we indent within a nested ().
>>
>> To make this easier to read, what about similarly having at the beginning
>> of the function:
>>
>> int max_ptes_shared = 0;
>>
>> /* For MADV_COLLAPSE, we always collapse. */
>> if (cc->is_khugepaged)
>> 	max_ptes_none = HPAGE_PMD_NR;
>> /* TODO ... */
>> if (is_pmd_order(order))
>> 	max_ptes_none = khugepaged_max_ptes_shared;
>>
>> to turn this code into a
>>
>> 	if (shared > khugepaged_max_ptes_shared)
>>
>> Also, here, might make sense to have a collapse_max_ptes_swap(cc, order)
>> to do that and clean it up.
>>
>>
>>>  				result = SCAN_EXCEED_SHARED_PTE;
>>>  				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
>>>  				goto out;
>>> @@ -679,18 +687,18 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>>  }
>>>  
>>>  static void __collapse_huge_page_copy_succeeded(pte_t *pte,
>>> -						struct vm_area_struct *vma,
>>> -						unsigned long address,
>>> -						spinlock_t *ptl,
>>> -						struct list_head *compound_pagelist)
>>> +		struct vm_area_struct *vma, unsigned long address,
>>> +		spinlock_t *ptl, unsigned int order,
>>> +		struct list_head *compound_pagelist)
>>>  {
>>> -	unsigned long end = address + HPAGE_PMD_SIZE;
>>> +	unsigned long end = address + (PAGE_SIZE << order);
>>>  	struct folio *src, *tmp;
>>>  	pte_t pteval;
>>>  	pte_t *_pte;
>>>  	unsigned int nr_ptes;
>>> +	const unsigned long nr_pages = 1UL << order;
>>
>> Move it further to the top.
>>
>>>  
>>> -	for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
>>> +	for (_pte = pte; _pte < pte + nr_pages; _pte += nr_ptes,
>>>  	     address += nr_ptes * PAGE_SIZE) {
>>>  		nr_ptes = 1;
>>>  		pteval = ptep_get(_pte);
>>> @@ -743,13 +751,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
>>>  }
>>>  
>>>  static void __collapse_huge_page_copy_failed(pte_t *pte,
>>> -					     pmd_t *pmd,
>>> -					     pmd_t orig_pmd,
>>> -					     struct vm_area_struct *vma,
>>> -					     struct list_head *compound_pagelist)
>>> +		pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
>>> +		unsigned int order, struct list_head *compound_pagelist)
>>>  {
>>>  	spinlock_t *pmd_ptl;
>>> -
>>> +	const unsigned long nr_pages = 1UL << order;
>>>  	/*
>>>  	 * Re-establish the PMD to point to the original page table
>>>  	 * entry. Restoring PMD needs to be done prior to releasing
>>> @@ -763,7 +769,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
>>>  	 * Release both raw and compound pages isolated
>>>  	 * in __collapse_huge_page_isolate.
>>>  	 */
>>> -	release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
>>> +	release_pte_pages(pte, pte + nr_pages, compound_pagelist);
>>>  }
>>>  
>>>  /*
>>> @@ -783,16 +789,16 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
>>>   */
>>>  static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
>>>  		pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
>>> -		unsigned long address, spinlock_t *ptl,
>>> +		unsigned long address, spinlock_t *ptl, unsigned int order,
>>>  		struct list_head *compound_pagelist)
>>>  {
>>>  	unsigned int i;
>>>  	enum scan_result result = SCAN_SUCCEED;
>>> -
>>> +	const unsigned long nr_pages = 1UL << order;
>>
>> Same here, all the way to the top.
>>
>>>  	/*
>>>  	 * Copying pages' contents is subject to memory poison at any iteration.
>>>  	 */
>>> -	for (i = 0; i < HPAGE_PMD_NR; i++) {
>>> +	for (i = 0; i < nr_pages; i++) {
>>>  		pte_t pteval = ptep_get(pte + i);
>>>  		struct page *page = folio_page(folio, i);
>>>  		unsigned long src_addr = address + i * PAGE_SIZE;
>>> @@ -811,10 +817,10 @@ static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *foli
>>>  
>>>  	if (likely(result == SCAN_SUCCEED))
>>>  		__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
>>> -						    compound_pagelist);
>>> +						    order, compound_pagelist);
>>>  	else
>>>  		__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
>>> -						 compound_pagelist);
>>> +						 order, compound_pagelist);
>>>  
>>>  	return result;
>>>  }
>>> @@ -985,12 +991,12 @@ static enum scan_result check_pmd_still_valid(struct mm_struct *mm,
>>>   * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
>>>   */
>>>  static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
>>> -		struct vm_area_struct *vma, unsigned long start_addr, pmd_t *pmd,
>>> -		int referenced)
>>> +		struct vm_area_struct *vma, unsigned long start_addr,
>>> +		pmd_t *pmd, int referenced, unsigned int order)
>>>  {
>>>  	int swapped_in = 0;
>>>  	vm_fault_t ret = 0;
>>> -	unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
>>> +	unsigned long addr, end = start_addr + (PAGE_SIZE << order);
>>>  	enum scan_result result;
>>>  	pte_t *pte = NULL;
>>>  	spinlock_t *ptl;
>>> @@ -1022,6 +1028,19 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
>>>  		    pte_present(vmf.orig_pte))
>>>  			continue;
>>>  
>>> +		/*
>>> +		 * TODO: Support swapin without leading to further mTHP
>>> +		 * collapses. Currently bringing in new pages via swapin may
>>> +		 * cause a future higher order collapse on a rescan of the same
>>> +		 * range.
>>> +		 */
>>> +		if (!is_pmd_order(order)) {
>>> +			pte_unmap(pte);
>>> +			mmap_read_unlock(mm);
>>> +			result = SCAN_EXCEED_SWAP_PTE;
>>> +			goto out;
>>> +		}
>>> +
>>
>> Interesting, we just swapin everything we find :)
>>
>> But do we really need this check here? I mean, we just found it to be present.
>>
>> In the rare event that there was a race, do we really care? It was just
>> present, now it's swapped. Bad luck. Just swap it in.
>>
> 
> Okay, now I am confused. Why are you not taking care of
> collapse_scan_pmd() in the same context?
> 
> Because if you make sure that we properly check against a max_ptes_swap
> similar as in the style above, we'd rule out swapin right from the start?
> 
> Also, I would expect that all other parameters in there are similarly
> handled?
> 

Okay, I think you should add the following:

From 17bce81ab93f3b16e044ac2f4f62be19aac38180 Mon Sep 17 00:00:00 2001
From: "David Hildenbrand (Arm)" <david@kernel.org>
Date: Thu, 12 Mar 2026 21:54:22 +0100
Subject: [PATCH] tmp

Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
---
 mm/khugepaged.c | 89 +++++++++++++++++++++++++++++--------------------
 1 file changed, 53 insertions(+), 36 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b7b4680d27ab..6a3773bfa0a2 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -318,6 +318,34 @@ static ssize_t max_ptes_shared_store(struct kobject *kobj,
 	return count;
 }
 
+static int collapse_max_ptes_none(struct collapse_control *cc,
+		struct vm_area_struct *vma)
+{
+	/* We don't mess with MISSING faults. */
+	if (vma && userfaultfd_armed(vma))
+		return 0;
+	/* MADV_COLLAPSE always collapses. */
+	if (!cc->is_khugepaged)
+		return HPAGE_PMD_NR;
+	return khugepaged_max_ptes_none;
+}
+
+static int collapse_max_ptes_shared(struct collapse_control *cc)
+{
+	/* MADV_COLLAPSE always collapses. */
+	if (!cc->is_khugepaged)
+		return HPAGE_PMD_NR;
+	return khugepaged_max_ptes_shared;
+}
+
+static int collapse_max_ptes_swap(struct collapse_control *cc)
+{
+	/* MADV_COLLAPSE always collapses. */
+	if (!cc->is_khugepaged)
+		return HPAGE_PMD_NR;
+	return khugepaged_max_ptes_swap;
+}
+
 static struct kobj_attribute khugepaged_max_ptes_shared_attr =
 	__ATTR_RW(max_ptes_shared);
 
@@ -539,6 +567,8 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
 		struct list_head *compound_pagelist)
 {
+	const int max_ptes_none = collapse_max_ptes_none(cc, vma);
+	const int max_ptes_shared = collapse_max_ptes_shared(cc);
 	struct page *page = NULL;
 	struct folio *folio = NULL;
 	unsigned long addr = start_addr;
@@ -550,16 +580,12 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
 	     _pte++, addr += PAGE_SIZE) {
 		pte_t pteval = ptep_get(_pte);
 		if (pte_none_or_zero(pteval)) {
-			++none_or_zero;
-			if (!userfaultfd_armed(vma) &&
-			    (!cc->is_khugepaged ||
-			     none_or_zero <= khugepaged_max_ptes_none)) {
-				continue;
-			} else {
+			if (++none_or_zero > max_ptes_none) {
 				result = SCAN_EXCEED_NONE_PTE;
 				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
 				goto out;
 			}
+			continue;
 		}
 		if (!pte_present(pteval)) {
 			result = SCAN_PTE_NON_PRESENT;
@@ -586,9 +612,7 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
 		/* See hpage_collapse_scan_pmd(). */
 		if (folio_maybe_mapped_shared(folio)) {
-			++shared;
-			if (cc->is_khugepaged &&
-			    shared > khugepaged_max_ptes_shared) {
+			if (++shared > max_ptes_shared) {
 				result = SCAN_EXCEED_SHARED_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
 				goto out;
@@ -1247,6 +1271,9 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long start_addr,
 		bool *mmap_locked, struct collapse_control *cc)
 {
+	const int max_ptes_none = collapse_max_ptes_none(cc, vma);
+	const int max_ptes_swap = collapse_max_ptes_swap(cc);
+	const int max_ptes_shared = collapse_max_ptes_shared(cc);
 	pmd_t *pmd;
 	pte_t *pte, *_pte;
 	int none_or_zero = 0, shared = 0, referenced = 0;
@@ -1280,36 +1307,28 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
 
 		pte_t pteval = ptep_get(_pte);
 		if (pte_none_or_zero(pteval)) {
-			++none_or_zero;
-			if (!userfaultfd_armed(vma) &&
-			    (!cc->is_khugepaged ||
-			     none_or_zero <= khugepaged_max_ptes_none)) {
-				continue;
-			} else {
+			if (++none_or_zero > max_ptes_none) {
 				result = SCAN_EXCEED_NONE_PTE;
 				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
 				goto out_unmap;
 			}
+			continue;
 		}
 		if (!pte_present(pteval)) {
-			++unmapped;
-			if (!cc->is_khugepaged ||
-			    unmapped <= khugepaged_max_ptes_swap) {
-				/*
-				 * Always be strict with uffd-wp
-				 * enabled swap entries.  Please see
-				 * comment below for pte_uffd_wp().
-				 */
-				if (pte_swp_uffd_wp_any(pteval)) {
-					result = SCAN_PTE_UFFD_WP;
-					goto out_unmap;
-				}
-				continue;
-			} else {
+			if (++unmapped > max_ptes_swap) {
 				result = SCAN_EXCEED_SWAP_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
 				goto out_unmap;
 			}
+			/*
+			 * Always be strict with uffd-wp enabled swap entries.
+			 * See the comment below for pte_uffd_wp().
+			 */
+			if (pte_swp_uffd_wp_any(pteval)) {
+				result = SCAN_PTE_UFFD_WP;
+				goto out_unmap;
+			}
+			continue;
 		}
 		if (pte_uffd_wp(pteval)) {
 			/*
@@ -1348,9 +1367,7 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
 		 * is shared.
 		 */
 		if (folio_maybe_mapped_shared(folio)) {
-			++shared;
-			if (cc->is_khugepaged &&
-			    shared > khugepaged_max_ptes_shared) {
+			if (++shared > max_ptes_shared) {
 				result = SCAN_EXCEED_SHARED_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
 				goto out_unmap;
@@ -2305,6 +2322,8 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm,
 		unsigned long addr, struct file *file, pgoff_t start,
 		struct collapse_control *cc)
 {
+	const int max_ptes_none = collapse_max_ptes_none(cc, NULL);
+	const int max_ptes_swap = collapse_max_ptes_swap(cc);
 	struct folio *folio = NULL;
 	struct address_space *mapping = file->f_mapping;
 	XA_STATE(xas, &mapping->i_pages, start);
@@ -2323,8 +2342,7 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm,
 
 		if (xa_is_value(folio)) {
 			swap += 1 << xas_get_order(&xas);
-			if (cc->is_khugepaged &&
-			    swap > khugepaged_max_ptes_swap) {
+			if (swap > max_ptes_swap) {
 				result = SCAN_EXCEED_SWAP_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
 				break;
@@ -2395,8 +2413,7 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm,
 		cc->progress += HPAGE_PMD_NR;
 
 	if (result == SCAN_SUCCEED) {
-		if (cc->is_khugepaged &&
-		    present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+		if (present < HPAGE_PMD_NR - max_ptes_none) {
 			result = SCAN_EXCEED_NONE_PTE;
 			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
 		} else {
-- 
2.43.0


Then extend it by passing an order + return value check in this patch here. You can
directly squash changes from patch #4 in here then.

-- 
Cheers,

David

^ permalink raw reply related

* Re: [PATCH mm-unstable v15 06/13] mm/khugepaged: skip collapsing mTHP to smaller orders
From: David Hildenbrand (Arm) @ 2026-03-12 21:00 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, lorenzo.stoakes,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
	pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang, rientjes,
	rostedt, rppt, ryan.roberts, shivankg, sunnanyong, surenb,
	thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
	wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260226032445.233437-1-npache@redhat.com>

On 2/26/26 04:24, Nico Pache wrote:
> khugepaged may try to collapse a mTHP to a smaller mTHP, resulting in
> some pages being unmapped. Skip these cases until we have a way to check
> if its ok to collapse to a smaller mTHP size (like in the case of a
> partially mapped folio).
> 
> This patch is inspired by Dev Jain's work on khugepaged mTHP support [1].
> 
> [1] https://lore.kernel.org/lkml/20241216165105.56185-11-dev.jain@arm.com/
> 
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index fb3ba8fe5a6c..c739f26dd61e 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -638,6 +638,14 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  				goto out;
>  			}
>  		}
> +		/*
> +		 * TODO: In some cases of partially-mapped folios, we'd actually
> +		 * want to collapse.
> +		 */
> +		if (!is_pmd_order(order) && folio_order(folio) >= order) {
> +			result = SCAN_PTE_MAPPED_HUGEPAGE;
> +			goto out;
> +		}
>  
>  		if (folio_test_large(folio)) {
>  			struct folio *f;

Why aren't we doing the same in hpage_collapse_scan_pmd() ?

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH v2 1/3] lib/bootconfig: fix off-by-one in xbc_verify_tree() unclosed brace error
From: Steven Rostedt @ 2026-03-12 21:03 UTC (permalink / raw)
  To: Josh Law
  Cc: Masami Hiramatsu, Andrew Morton, Josh Law, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260312191143.28719-2-objecting@objecting.org>

On Thu, 12 Mar 2026 19:11:41 +0000
Josh Law <hlcj1234567@gmail.com> wrote:

> From: Josh Law <objecting@objecting.org>
> 
> __xbc_open_brace() pushes entries with post-increment
> (open_brace[brace_index++]), so brace_index always points one past
> the last valid entry.  xbc_verify_tree() reads open_brace[brace_index]
> to report which brace is unclosed, but this is one past the last
> pushed entry and contains stale/zero data, causing the error message
> to reference the wrong node.
> 
> Use open_brace[brace_index - 1] to correctly identify the unclosed
> brace.  brace_index is known to be > 0 here since we are inside the
> if (brace_index) guard.
> 
> Signed-off-by: Josh Law <objecting@objecting.org>

Nice catch. May I ask how you found this.

Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>

-- Steve

> ---
>  lib/bootconfig.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/lib/bootconfig.c b/lib/bootconfig.c
> index 2bcd5c2aa87e..a1e6a2e14b01 100644
> --- a/lib/bootconfig.c
> +++ b/lib/bootconfig.c
> @@ -802,7 +802,7 @@ static int __init xbc_verify_tree(void)
>  
>  	/* Brace closing */
>  	if (brace_index) {
> -		n = &xbc_nodes[open_brace[brace_index]];
> +		n = &xbc_nodes[open_brace[brace_index - 1]];
>  		return xbc_parse_error("Brace is not closed",
>  					xbc_node_get_data(n));
>  	}


^ permalink raw reply

* Re: [PATCH mm-unstable v15 07/13] mm/khugepaged: add per-order mTHP collapse failure statistics
From: David Hildenbrand (Arm) @ 2026-03-12 21:03 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, lorenzo.stoakes,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
	pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang, rientjes,
	rostedt, rppt, ryan.roberts, shivankg, sunnanyong, surenb,
	thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
	wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260226032504.233594-1-npache@redhat.com>

On 2/26/26 04:25, Nico Pache wrote:
> Add three new mTHP statistics to track collapse failures for different
> orders when encountering swap PTEs, excessive none PTEs, and shared PTEs:
> 
> - collapse_exceed_swap_pte: Increment when mTHP collapse fails due to swap
> 	PTEs
> 
> - collapse_exceed_none_pte: Counts when mTHP collapse fails due to
>   	exceeding the none PTE threshold for the given order
> 
> - collapse_exceed_shared_pte: Counts when mTHP collapse fails due to shared
>   	PTEs
> 
> These statistics complement the existing THP_SCAN_EXCEED_* events by
> providing per-order granularity for mTHP collapse attempts. The stats are
> exposed via sysfs under
> `/sys/kernel/mm/transparent_hugepage/hugepages-*/stats/` for each
> supported hugepage size.
> 
> As we currently dont support collapsing mTHPs that contain a swap or
> shared entry, those statistics keep track of how often we are
> encountering failed mTHP collapses due to these restrictions.
> 
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  Documentation/admin-guide/mm/transhuge.rst | 24 ++++++++++++++++++++++
>  include/linux/huge_mm.h                    |  3 +++
>  mm/huge_memory.c                           |  7 +++++++
>  mm/khugepaged.c                            | 16 ++++++++++++---
>  4 files changed, 47 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index c51932e6275d..eebb1f6bbc6c 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -714,6 +714,30 @@ nr_anon_partially_mapped
>         an anonymous THP as "partially mapped" and count it here, even though it
>         is not actually partially mapped anymore.
>  
> +collapse_exceed_none_pte
> +       The number of collapse attempts that failed due to exceeding the
> +       max_ptes_none threshold. For mTHP collapse, Currently only max_ptes_none
> +       values of 0 and (HPAGE_PMD_NR - 1) are supported. Any other value will
> +       emit a warning and no mTHP collapse will be attempted. khugepaged will
> +       try to collapse to the largest enabled (m)THP size; if it fails, it will
> +       try the next lower enabled mTHP size. This counter records the number of
> +       times a collapse attempt was skipped for exceeding the max_ptes_none
> +       threshold, and khugepaged will move on to the next available mTHP size.
> +
> +collapse_exceed_swap_pte
> +       The number of anonymous mTHP PTE ranges which were unable to collapse due
> +       to containing at least one swap PTE. Currently khugepaged does not
> +       support collapsing mTHP regions that contain a swap PTE. This counter can
> +       be used to monitor the number of khugepaged mTHP collapses that failed
> +       due to the presence of a swap PTE.
> +
> +collapse_exceed_shared_pte
> +       The number of anonymous mTHP PTE ranges which were unable to collapse due
> +       to containing at least one shared PTE. Currently khugepaged does not
> +       support collapsing mTHP PTE ranges that contain a shared PTE. This
> +       counter can be used to monitor the number of khugepaged mTHP collapses
> +       that failed due to the presence of a shared PTE.
> +
>  As the system ages, allocating huge pages may be expensive as the
>  system uses memory compaction to copy data around memory to free a
>  huge page for use. There are some counters in ``/proc/vmstat`` to help
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 9941fc6d7bd8..e8777bb2347d 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -144,6 +144,9 @@ enum mthp_stat_item {
>  	MTHP_STAT_SPLIT_DEFERRED,
>  	MTHP_STAT_NR_ANON,
>  	MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
> +	MTHP_STAT_COLLAPSE_EXCEED_SWAP,
> +	MTHP_STAT_COLLAPSE_EXCEED_NONE,
> +	MTHP_STAT_COLLAPSE_EXCEED_SHARED,
>  	__MTHP_STAT_COUNT
>  };
>  
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 228f35e962b9..1049a207a257 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -642,6 +642,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
>  DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
>  DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
>  DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> +
>  
>  static struct attribute *anon_stats_attrs[] = {
>  	&anon_fault_alloc_attr.attr,
> @@ -658,6 +662,9 @@ static struct attribute *anon_stats_attrs[] = {
>  	&split_deferred_attr.attr,
>  	&nr_anon_attr.attr,
>  	&nr_anon_partially_mapped_attr.attr,
> +	&collapse_exceed_swap_pte_attr.attr,
> +	&collapse_exceed_none_pte_attr.attr,
> +	&collapse_exceed_shared_pte_attr.attr,
>  	NULL,
>  };
>  
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index c739f26dd61e..a6cf90e09e4a 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -595,7 +595,9 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  				continue;
>  			} else {
>  				result = SCAN_EXCEED_NONE_PTE;
> -				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> +				if (is_pmd_order(order))
> +					count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> +				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
>  				goto out;
>  			}
>  		}
> @@ -631,10 +633,17 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  			 * shared may cause a future higher order collapse on a
>  			 * rescan of the same range.
>  			 */
> -			if (!is_pmd_order(order) || (cc->is_khugepaged &&
> -			    shared > khugepaged_max_ptes_shared)) {
> +			if (!is_pmd_order(order)) {
> +				result = SCAN_EXCEED_SHARED_PTE;
> +				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> +				goto out;
> +			}
> +
> +			if (cc->is_khugepaged &&
> +			    shared > khugepaged_max_ptes_shared) {
>  				result = SCAN_EXCEED_SHARED_PTE;
>  				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> +				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
>  				goto out;

With the suggested earlier rework, this should hopefully become simply

if (++shared > max_ptes_shared) {
	result = SCAN_EXCEED_SHARED_PTE;
	if (is_pmd_order(order))
		count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
	count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
}

With that (no code duplication) LGTM.

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH v2 1/3] lib/bootconfig: fix off-by-one in xbc_verify_tree() unclosed brace error
From: Josh Law @ 2026-03-12 21:03 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Andrew Morton, Josh Law, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260312170303.61c1233c@gandalf.local.home>

12 Mar 2026 21:02:51 Steven Rostedt <rostedt@goodmis.org>:

> On Thu, 12 Mar 2026 19:11:41 +0000
> Josh Law <hlcj1234567@gmail.com> wrote:
>
>> From: Josh Law <objecting@objecting.org>
>>
>> __xbc_open_brace() pushes entries with post-increment
>> (open_brace[brace_index++]), so brace_index always points one past
>> the last valid entry.  xbc_verify_tree() reads open_brace[brace_index]
>> to report which brace is unclosed, but this is one past the last
>> pushed entry and contains stale/zero data, causing the error message
>> to reference the wrong node.
>>
>> Use open_brace[brace_index - 1] to correctly identify the unclosed
>> brace.  brace_index is known to be > 0 here since we are inside the
>> if (brace_index) guard.
>>
>> Signed-off-by: Josh Law <objecting@objecting.org>
>
> Nice catch. May I ask how you found this.
>
> Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
>
> -- Steve
>
>> ---
>> lib/bootconfig.c | 2 +-
>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/lib/bootconfig.c b/lib/bootconfig.c
>> index 2bcd5c2aa87e..a1e6a2e14b01 100644
>> --- a/lib/bootconfig.c
>> +++ b/lib/bootconfig.c
>> @@ -802,7 +802,7 @@ static int __init xbc_verify_tree(void)
>>
>>     /* Brace closing */
>>     if (brace_index) {
>> -       n = &xbc_nodes[open_brace[brace_index]];
>> +       n = &xbc_nodes[open_brace[brace_index - 1]];
>>         return xbc_parse_error("Brace is not closed",
>>                     xbc_node_get_data(n));
>>     }

Hi Steve,
Thanks for the review!
I found this while doing a manual audit of the bootconfig parser's error handling. I noticed that the post-increment in __xbc_open_brace() didn't seem to align with how xbc_verify_tree() was accessing the index. I verified it by intentionally passing a malformed config with an unclosed brace and saw it reporting a 'stale' or incorrect node location

^ permalink raw reply

* Re: [PATCH mm-unstable v15 08/13] mm/khugepaged: improve tracepoints for mTHP orders
From: David Hildenbrand (Arm) @ 2026-03-12 21:05 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, lorenzo.stoakes,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
	pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang, rientjes,
	rostedt, rppt, ryan.roberts, shivankg, sunnanyong, surenb,
	thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
	wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260226032522.233735-1-npache@redhat.com>

On 2/26/26 04:25, Nico Pache wrote:
> Add the order to the mm_collapse_huge_page<_swapin,_isolate> tracepoints to
> give better insight into what order is being operated at for.
> 
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---

Acked-by: David Hildenbrand (Arm) <david@kernel.org>

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH v2 2/3] lib/bootconfig: check bounds before writing in __xbc_open_brace()
From: Steven Rostedt @ 2026-03-12 21:06 UTC (permalink / raw)
  To: Josh Law
  Cc: Masami Hiramatsu, Andrew Morton, Josh Law, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260312191143.28719-3-objecting@objecting.org>

On Thu, 12 Mar 2026 19:11:42 +0000
Josh Law <hlcj1234567@gmail.com> wrote:

> From: Josh Law <objecting@objecting.org>
> 
> The bounds check for brace_index happens after the array write.
> While the current call pattern prevents an actual out-of-bounds
> access (the previous call would have returned an error), the
> write-before-check pattern is fragile and would become a real
> out-of-bounds write if the error return were ever not propagated.
> 
> Move the bounds check before the array write so the function is
> self-contained and safe regardless of caller behavior.

This is the only place that increments the index, and the check is >=,
which means even if there was just one space left, it would fail.

As there's no other place that updates brace_index, I don't believe this
patch is needed. It could even replace the >= with ==.

-- Steve


> 
> Signed-off-by: Josh Law <objecting@objecting.org>
> ---
>  lib/bootconfig.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/lib/bootconfig.c b/lib/bootconfig.c
> index a1e6a2e14b01..62b4ed7a0ba6 100644
> --- a/lib/bootconfig.c
> +++ b/lib/bootconfig.c
> @@ -532,9 +532,9 @@ static char *skip_spaces_until_newline(char *p)
>  static int __init __xbc_open_brace(char *p)
>  {
>  	/* Push the last key as open brace */
> -	open_brace[brace_index++] = xbc_node_index(last_parent);
>  	if (brace_index >= XBC_DEPTH_MAX)
>  		return xbc_parse_error("Exceed max depth of braces", p);
> +	open_brace[brace_index++] = xbc_node_index(last_parent);
>  
>  	return 0;
>  }


^ permalink raw reply

* Re: [PATCH v2 2/3] lib/bootconfig: check bounds before writing in __xbc_open_brace()
From: Josh Law @ 2026-03-12 21:08 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Andrew Morton, Josh Law, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260312170643.4b0f926b@gandalf.local.home>

12 Mar 2026 21:06:31 Steven Rostedt <rostedt@goodmis.org>:

> On Thu, 12 Mar 2026 19:11:42 +0000
> Josh Law <hlcj1234567@gmail.com> wrote:
>
>> From: Josh Law <objecting@objecting.org>
>>
>> The bounds check for brace_index happens after the array write.
>> While the current call pattern prevents an actual out-of-bounds
>> access (the previous call would have returned an error), the
>> write-before-check pattern is fragile and would become a real
>> out-of-bounds write if the error return were ever not propagated.
>>
>> Move the bounds check before the array write so the function is
>> self-contained and safe regardless of caller behavior.
>
> This is the only place that increments the index, and the check is >=,
> which means even if there was just one space left, it would fail.
>
> As there's no other place that updates brace_index, I don't believe this
> patch is needed. It could even replace the >= with ==.
>
> -- Steve
>
>
>>
>> Signed-off-by: Josh Law <objecting@objecting.org>
>> ---
>> lib/bootconfig.c | 2 +-
>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/lib/bootconfig.c b/lib/bootconfig.c
>> index a1e6a2e14b01..62b4ed7a0ba6 100644
>> --- a/lib/bootconfig.c
>> +++ b/lib/bootconfig.c
>> @@ -532,9 +532,9 @@ static char *skip_spaces_until_newline(char *p)
>> static int __init __xbc_open_brace(char *p)
>> {
>>     /* Push the last key as open brace */
>> -   open_brace[brace_index++] = xbc_node_index(last_parent);
>>     if (brace_index >= XBC_DEPTH_MAX)
>>         return xbc_parse_error("Exceed max depth of braces", p);
>> +   open_brace[brace_index++] = xbc_node_index(last_parent);
>>
>>     return 0;
>> }

That's a fair point, Steve. Given that brace_index isn't touched elsewhere and the current check effectively prevents the overflow, I agree this isn't strictly necessary. I'll drop this patch and stick with the fix for the off-by-one reporting error instead. Thanks for the feedback!

^ permalink raw reply

* Re: [PATCH v2 3/3] lib/bootconfig: fix snprintf truncation check in xbc_node_compose_key_after()
From: Steven Rostedt @ 2026-03-12 21:09 UTC (permalink / raw)
  To: Josh Law
  Cc: Masami Hiramatsu, Andrew Morton, Josh Law, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260312191143.28719-4-objecting@objecting.org>

On Thu, 12 Mar 2026 19:11:43 +0000
Josh Law <hlcj1234567@gmail.com> wrote:

> From: Josh Law <objecting@objecting.org>
> 
> snprintf() returns the number of characters that would have been
> written excluding the NUL terminator.  Output is truncated when the
> return value is >= the buffer size, not just > the buffer size.
> 
> When ret == size, the current code takes the non-truncated path,
> advancing buf by ret and reducing size to 0.  This is wrong because
> the output was actually truncated (the last character was replaced by
> NUL).  Fix by using >= so the truncation path is taken correctly.
> 
> Signed-off-by: Josh Law <objecting@objecting.org>

Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>

-- Steve

> ---
>  lib/bootconfig.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/lib/bootconfig.c b/lib/bootconfig.c
> index 62b4ed7a0ba6..b0ef1e74e98a 100644
> --- a/lib/bootconfig.c
> +++ b/lib/bootconfig.c
> @@ -316,7 +316,7 @@ int __init xbc_node_compose_key_after(struct xbc_node *root,
>  			       depth ? "." : "");
>  		if (ret < 0)
>  			return ret;
> -		if (ret > size) {
> +		if (ret >= size) {
>  			size = 0;
>  		} else {
>  			size -= ret;


^ permalink raw reply

* Re: [PATCH mm-unstable v15 09/13] mm/khugepaged: introduce collapse_allowable_orders helper function
From: David Hildenbrand (Arm) @ 2026-03-12 21:09 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, lorenzo.stoakes,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
	pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang, rientjes,
	rostedt, rppt, ryan.roberts, shivankg, sunnanyong, surenb,
	thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
	wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260226032542.233891-1-npache@redhat.com>

On 2/26/26 04:25, Nico Pache wrote:
> Add collapse_allowable_orders() to generalize THP order eligibility. The
> function determines which THP orders are permitted based on collapse
> context (khugepaged vs madv_collapse).
> 
> This consolidates collapse configuration logic and provides a clean
> interface for future mTHP collapse support where the orders may be
> different.
> 
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 16 +++++++++++++---
>  1 file changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 2e66d660ee8e..2fdfb6d42cf9 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -486,12 +486,22 @@ static unsigned int collapse_max_ptes_none(unsigned int order)
>  	return -EINVAL;
>  }
>  
> +/* Check what orders are allowed based on the vma and collapse type */
> +static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
> +			vm_flags_t vm_flags, bool is_khugepaged)

Nit: one tab to much

> +{
> +	enum tva_type tva_flags = is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
> +	unsigned long orders = BIT(HPAGE_PMD_ORDER);
> +
> +	return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
> +}
> +
>  void khugepaged_enter_vma(struct vm_area_struct *vma,
>  			  vm_flags_t vm_flags)
>  {
>  	if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
>  	    hugepage_pmd_enabled()) {
> -		if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
> +		if (collapse_allowable_orders(vma, vm_flags, /*is_khugepaged=*/true))
>  			__khugepaged_enter(vma->vm_mm);
>  	}
>  }
> @@ -2637,7 +2647,7 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, enum scan_result *
>  			progress++;
>  			break;
>  		}
> -		if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
> +		if (!collapse_allowable_orders(vma, vma->vm_flags, /*is_khugepaged=*/true)) {

I'm not sure if converting from a perfectly readable enum to a boolean
is an improvement?

I would just keep the TVA_KHUGEPAGED / TVA_FORCED_COLLAPSE here/

If you want to catch callers passing in something else, you could likely
use a BUILD_BUG_ON in there.


-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH v2 2/3] lib/bootconfig: check bounds before writing in __xbc_open_brace()
From: Josh Law @ 2026-03-12 21:09 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Andrew Morton, Josh Law, linux-kernel,
	linux-trace-kernel
In-Reply-To: <143ca1aa-d053-4947-9817-72462876c224@gmail.com>

12 Mar 2026 21:08:03 Josh Law <hlcj1234567@gmail.com>:

> 12 Mar 2026 21:06:31 Steven Rostedt <rostedt@goodmis.org>:
>
>> On Thu, 12 Mar 2026 19:11:42 +0000
>> Josh Law <hlcj1234567@gmail.com> wrote:
>>
>>> From: Josh Law <objecting@objecting.org>
>>>
>>> The bounds check for brace_index happens after the array write.
>>> While the current call pattern prevents an actual out-of-bounds
>>> access (the previous call would have returned an error), the
>>> write-before-check pattern is fragile and would become a real
>>> out-of-bounds write if the error return were ever not propagated.
>>>
>>> Move the bounds check before the array write so the function is
>>> self-contained and safe regardless of caller behavior.
>>
>> This is the only place that increments the index, and the check is >=,
>> which means even if there was just one space left, it would fail.
>>
>> As there's no other place that updates brace_index, I don't believe this
>> patch is needed. It could even replace the >= with ==.
>>
>> -- Steve
>>
>>
>>>
>>> Signed-off-by: Josh Law <objecting@objecting.org>
>>> ---
>>> lib/bootconfig.c | 2 +-
>>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/lib/bootconfig.c b/lib/bootconfig.c
>>> index a1e6a2e14b01..62b4ed7a0ba6 100644
>>> --- a/lib/bootconfig.c
>>> +++ b/lib/bootconfig.c
>>> @@ -532,9 +532,9 @@ static char *skip_spaces_until_newline(char *p)
>>> static int __init __xbc_open_brace(char *p)
>>> {
>>>     /* Push the last key as open brace */
>>> -   open_brace[brace_index++] = xbc_node_index(last_parent);
>>>     if (brace_index >= XBC_DEPTH_MAX)
>>>         return xbc_parse_error("Exceed max depth of braces", p);
>>> +   open_brace[brace_index++] = xbc_node_index(last_parent);
>>>
>>>     return 0;
>>> }
>
> That's a fair point, Steve. Given that brace_index isn't touched elsewhere and the current check effectively prevents the overflow, I agree this isn't strictly necessary. I'll drop this patch and stick with the fix for the off-by-one reporting error instead. Thanks for the feedback!

Wait Steve,
Thanks for the look. I see your point that it's currently redundant given the call patterns. It looks like Andrew has already merged this into the -mm tree, likely as a 'belt-and-suspenders' safety measure. I'll keep your feedback in mind for future cleanup, but I'm glad we got the other off-by-one fix in as well!

And in my opinion, merging it is a decent idea.

^ permalink raw reply

* Re: [PATCH mm-unstable v15 10/13] mm/khugepaged: Introduce mTHP collapse support
From: David Hildenbrand (Arm) @ 2026-03-12 21:16 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, lorenzo.stoakes,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
	pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang, rientjes,
	rostedt, rppt, ryan.roberts, shivankg, sunnanyong, surenb,
	thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
	wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260226032605.234046-1-npache@redhat.com>

On 2/26/26 04:26, Nico Pache wrote:
> Enable khugepaged to collapse to mTHP orders. This patch implements the
> main scanning logic using a bitmap to track occupied pages and a stack
> structure that allows us to find optimal collapse sizes.
> 
> Previous to this patch, PMD collapse had 3 main phases, a light weight
> scanning phase (mmap_read_lock) that determines a potential PMD
> collapse, a alloc phase (mmap unlocked), then finally heavier collapse
> phase (mmap_write_lock).
> 
> To enabled mTHP collapse we make the following changes:
> 
> During PMD scan phase, track occupied pages in a bitmap. When mTHP
> orders are enabled, we remove the restriction of max_ptes_none during the
> scan phase to avoid missing potential mTHP collapse candidates. Once we
> have scanned the full PMD range and updated the bitmap to track occupied
> pages, we use the bitmap to find the optimal mTHP size.
> 
> Implement collapse_scan_bitmap() to perform binary recursion on the bitmap
> and determine the best eligible order for the collapse. A stack structure
> is used instead of traditional recursion to manage the search. The
> algorithm recursively splits the bitmap into smaller chunks to find the
> highest order mTHPs that satisfy the collapse criteria. We start by
> attempting the PMD order, then moved on the consecutively lower orders
> (mTHP collapse). The stack maintains a pair of variables (offset, order),
> indicating the number of PTEs from the start of the PMD, and the order of
> the potential collapse candidate.
> 
> The algorithm for consuming the bitmap works as such:
>     1) push (0, HPAGE_PMD_ORDER) onto the stack
>     2) pop the stack
>     3) check if the number of set bits in that (offset,order) pair
>        statisfy the max_ptes_none threshold for that order
>     4) if yes, attempt collapse
>     5) if no (or collapse fails), push two new stack items representing
>        the left and right halves of the current bitmap range, at the
>        next lower order
>     6) repeat at step (2) until stack is empty.
> 
> Below is a diagram representing the algorithm and stack items:
> 
>                            offset       mid_offset
>                             |         |
>                             |         |
>                             v         v
>           ____________________________________
>          |          PTE Page Table            |
>          --------------------------------------
> 			    <-------><------->
>                              order-1  order-1
> 
> We currently only support mTHP collapse for max_ptes_none values of 0
> and HPAGE_PMD_NR - 1. resulting in the following behavior:
> 
>     - max_ptes_none=0: Never introduce new empty pages during collapse
>     - max_ptes_none=HPAGE_PMD_NR-1: Always try collapse to the highest
>       available mTHP order
> 
> Any other max_ptes_none value will emit a warning and skip mTHP collapse
> attempts. There should be no behavior change for PMD collapse.
> 
> Once we determine what mTHP sizes fits best in that PMD range a collapse
> is attempted. A minimum collapse order of 2 is used as this is the lowest
> order supported by anon memory as defined by THP_ORDERS_ALL_ANON.
> 
> mTHP collapses reject regions containing swapped out or shared pages.
> This is because adding new entries can lead to new none pages, and these
> may lead to constant promotion into a higher order (m)THP. A similar
> issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
> introducing at least 2x the number of pages, and on a future scan will
> satisfy the promotion condition once again. This issue is prevented via
> the collapse_max_ptes_none() function which imposes the max_ptes_none
> restrictions above.
> 
> Currently madv_collapse is not supported and will only attempt PMD
> collapse.
> 
> We can also remove the check for is_khugepaged inside the PMD scan as
> the collapse_max_ptes_none() function handles this logic now.
> 
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---


[...]


>  /**
> @@ -1361,17 +1392,138 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long s
>  	return result;
>  }
>  
> +static void mthp_stack_push(struct collapse_control *cc, int *stack_size,
> +				   u16 offset, u8 order)

Nit: indentation. Same for other functions.

Wondering if you'd want to call these functions

collapse_mthp_*

> +{
> +	const int size = *stack_size;
> +	struct mthp_range *stack = &cc->mthp_bitmap_stack[size];
> +
> +	VM_WARN_ON_ONCE(size >= MTHP_STACK_SIZE);
> +	stack->order = order;
> +	stack->offset = offset;
> +	(*stack_size)++;
> +}
> +
> +static struct mthp_range mthp_stack_pop(struct collapse_control *cc, int *stack_size)
> +{
> +	const int size = *stack_size;
> +
> +	VM_WARN_ON_ONCE(size <= 0);
> +	(*stack_size)--;
> +	return cc->mthp_bitmap_stack[size - 1];
> +}
> +
> +static unsigned int mthp_nr_occupied_pte_entries(struct collapse_control *cc,
> +						 u16 offset, unsigned long nr_pte_entries)

s/pte_entries/ptes/ ?

> +{
> +	bitmap_zero(cc->mthp_bitmap_mask, HPAGE_PMD_NR);
> +	bitmap_set(cc->mthp_bitmap_mask, offset, nr_pte_entries);
> +	return bitmap_weight_and(cc->mthp_bitmap, cc->mthp_bitmap_mask, HPAGE_PMD_NR);
> +}
> +
> +/*
> + * mthp_collapse() consumes the bitmap that is generated during
> + * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
> + *
> + * Each bit in cc->mthp_bitmap represents a single occupied (!none/zero) page.
> + * A stack structure cc->mthp_bitmap_stack is used to check different regions
> + * of the bitmap for collapse eligibility. The stack maintains a pair of
> + * variables (offset, order), indicating the number of PTEs from the start of
> + * the PMD, and the order of the potential collapse candidate respectively. We
> + * start at the PMD order and check if it is eligible for collapse; if not, we
> + * add two entries to the stack at a lower order to represent the left and right
> + * halves of the PTE page table we are examining.
> + *
> + *                         offset       mid_offset
> + *                         |         |
> + *                         |         |
> + *                         v         v
> + *      --------------------------------------
> + *      |          cc->mthp_bitmap            |
> + *      --------------------------------------
> + *                         <-------><------->
> + *                          order-1  order-1
> + *
> + * For each of these, we determine how many PTE entries are occupied in the
> + * range of PTE entries we propose to collapse, then we compare this to a
> + * threshold number of PTE entries which would need to be occupied for a
> + * collapse to be permitted at that order (accounting for max_ptes_none).
> +
> + * If a collapse is permitted, we attempt to collapse the PTE range into a
> + * mTHP.
> + */
> +static int mthp_collapse(struct mm_struct *mm, unsigned long address,
> +		int referenced, int unmapped, struct collapse_control *cc,
> +		bool *mmap_locked, unsigned long enabled_orders)
> +{
> +	unsigned int max_ptes_none, nr_occupied_ptes;
> +	struct mthp_range range;
> +	unsigned long collapse_address;
> +	int collapsed = 0, stack_size = 0;
> +	unsigned long nr_pte_entries;

"nr_ptes" ? Any reason for that to be an unsigned long?

> +	u16 offset;
> +	u8 order;
> +
> +	mthp_stack_push(cc, &stack_size, 0, HPAGE_PMD_ORDER);
> +
> +	while (stack_size > 0) {
> +		range = mthp_stack_pop(cc, &stack_size);
> +		order = range.order;
> +		offset = range.offset;
> +		nr_pte_entries = 1UL << order;
> +
> +		if (!test_bit(order, &enabled_orders))
> +			goto next_order;
> +
> +		if (cc->is_khugepaged)
> +			max_ptes_none = collapse_max_ptes_none(order);
> +		else
> +			max_ptes_none = COLLAPSE_MAX_PTES_LIMIT;
> +
> +		if (max_ptes_none == -EINVAL)
> +			return collapsed;

With the previous suggested rework, you could likely make this

	max_ptes_none = collapse_max_ptes_none(cc, NULL, order);
	if (max_ptes_none < 0)
		return collapsed;

> +
> +		nr_occupied_ptes = mthp_nr_occupied_pte_entries(cc, offset, nr_pte_entries);
> +
> +		if (nr_occupied_ptes >= nr_pte_entries - max_ptes_none) {
> +			int ret;
> +
> +			collapse_address = address + offset * PAGE_SIZE;
> +			ret = collapse_huge_page(mm, collapse_address, referenced,
> +						 unmapped, cc, mmap_locked,
> +						 order);
> +			if (ret == SCAN_SUCCEED) {
> +				collapsed += nr_pte_entries;
> +				continue;
> +			}
> +		}
> +
> +next_order:
> +		if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
> +			const u8 next_order = order - 1;
> +			const u16 mid_offset = offset + (nr_pte_entries / 2);
> +
> +			mthp_stack_push(cc, &stack_size, mid_offset, next_order);
> +			mthp_stack_push(cc, &stack_size, offset, next_order);
> +		}
> +	}
> +	return collapsed;
> +}
> +
>  static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  		struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked,
>  		unsigned int *cur_progress, struct collapse_control *cc)
>  {
>  	pmd_t *pmd;
>  	pte_t *pte, *_pte;
> -	int none_or_zero = 0, shared = 0, referenced = 0;
> +	int i;
> +	int none_or_zero = 0, shared = 0, nr_collapsed = 0, referenced = 0;
>  	enum scan_result result = SCAN_FAIL;
>  	struct page *page = NULL;
> +	unsigned int max_ptes_none;
>  	struct folio *folio = NULL;
>  	unsigned long addr;
> +	unsigned long enabled_orders;
>  	spinlock_t *ptl;
>  	int node = NUMA_NO_NODE, unmapped = 0;
>  
> @@ -1384,8 +1536,21 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  		goto out;
>  	}
>  
> +	bitmap_zero(cc->mthp_bitmap, HPAGE_PMD_NR);
>  	memset(cc->node_load, 0, sizeof(cc->node_load));
>  	nodes_clear(cc->alloc_nmask);
> +
> +	enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, cc->is_khugepaged);
> +
> +	/*
> +	 * If PMD is the only enabled order, enforce max_ptes_none, otherwise
> +	 * scan all pages to populate the bitmap for mTHP collapse.
> +	 */
> +	if (cc->is_khugepaged && enabled_orders == BIT(HPAGE_PMD_ORDER))
> +		max_ptes_none = collapse_max_ptes_none(HPAGE_PMD_ORDER);
> +	else
> +		max_ptes_none = COLLAPSE_MAX_PTES_LIMIT;
> +

I assume that code to change as well. If you need help figuring out how
to make it work, please shout.

[...]

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH mm-unstable v15 11/13] mm/khugepaged: avoid unnecessary mTHP collapse attempts
From: David Hildenbrand (Arm) @ 2026-03-12 21:19 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, lorenzo.stoakes,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
	pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang, rientjes,
	rostedt, rppt, ryan.roberts, shivankg, sunnanyong, surenb,
	thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
	wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260226032631.234234-1-npache@redhat.com>

On 2/26/26 04:26, Nico Pache wrote:
> There are cases where, if an attempted collapse fails, all subsequent
> orders are guaranteed to also fail. Avoid these collapse attempts by
> bailing out early.
> 
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
>  1 file changed, 34 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 1c3711ed4513..388d3f2537e2 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1492,9 +1492,42 @@ static int mthp_collapse(struct mm_struct *mm, unsigned long address,
>  			ret = collapse_huge_page(mm, collapse_address, referenced,
>  						 unmapped, cc, mmap_locked,
>  						 order);
> -			if (ret == SCAN_SUCCEED) {
> +
> +			switch (ret) {
> +			/* Cases were we continue to next collapse candidate */
> +			case SCAN_SUCCEED:
>  				collapsed += nr_pte_entries;
> +				fallthrough;
> +			case SCAN_PTE_MAPPED_HUGEPAGE:
>  				continue;
> +			/* Cases were lower orders might still succeed */
> +			case SCAN_LACK_REFERENCED_PAGE:
> +			case SCAN_EXCEED_NONE_PTE:
> +			case SCAN_EXCEED_SWAP_PTE:
> +			case SCAN_EXCEED_SHARED_PTE:
> +			case SCAN_PAGE_LOCK:
> +			case SCAN_PAGE_COUNT:
> +			case SCAN_PAGE_LRU:
> +			case SCAN_PAGE_NULL:
> +			case SCAN_DEL_PAGE_LRU:
> +			case SCAN_PTE_NON_PRESENT:
> +			case SCAN_PTE_UFFD_WP:
> +			case SCAN_ALLOC_HUGE_PAGE_FAIL:
> +				goto next_order;
> +			/* Cases were no further collapse is possible */
> +			case SCAN_CGROUP_CHARGE_FAIL:
> +			case SCAN_COPY_MC:
> +			case SCAN_ADDRESS_RANGE:
> +			case SCAN_NO_PTE_TABLE:
> +			case SCAN_ANY_PROCESS:
> +			case SCAN_VMA_NULL:
> +			case SCAN_VMA_CHECK:
> +			case SCAN_SCAN_ABORT:
> +			case SCAN_PAGE_ANON:
> +			case SCAN_PMD_MAPPED:
> +			case SCAN_FAIL:
> +			default:
> +				return collapsed;
>  			}
>  		}
>  

LGTM, but I do wonder, given that you have the "default" case, why spell
out the ones that fall into the "default" category? I'd strip those

/* For all other cases no futher collapse is possible */
default:
	return collapsed;


Acked-by: David Hildenbrand (Arm) <david@kernel.org>

-- 
Cheers,

David

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox