Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [PATCH v2 15/18] tracing/remotes: Add poll_ms tracefs file
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
  Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>

Add a tracefs file to configure the trace remote polling period. Keep
the default value at 100ms.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index cf99752e1cd5..f72fc862ae7f 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -1133,6 +1133,40 @@ static int dump_on_panic_show(struct seq_file *s, void *unused)
 }
 DEFINE_TRACE_REMOTE_ATTRIBUTE(dump_on_panic);
 
+static ssize_t poll_ms_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct seq_file *seq = filp->private_data;
+	struct trace_remote *remote = seq->private;
+	unsigned int val;
+	int ret;
+
+	ret = kstrtouint_from_user(ubuf, cnt, 10, &val);
+	if (ret)
+		return ret;
+
+	if (!val)
+		return -EINVAL;
+
+	guard(mutex)(&remote->lock);
+
+	if (val < remote->poll_ms && remote->poll_cnt)
+		mod_delayed_work(system_percpu_wq, &remote->poll_work, msecs_to_jiffies(val));
+
+	remote->poll_ms = val;
+
+	return cnt;
+}
+
+static int poll_ms_show(struct seq_file *s, void *unused)
+{
+	struct trace_remote *remote = s->private;
+
+	seq_printf(s, "%u\n", remote->poll_ms);
+
+	return 0;
+}
+DEFINE_TRACE_REMOTE_ATTRIBUTE(poll_ms);
+
 static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote)
 {
 	struct dentry *remote_d, *percpu_d, *d;
@@ -1165,6 +1199,10 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo
 	if (!d)
 		goto err;
 
+	d = trace_create_file("poll_ms", TRACEFS_MODE_WRITE, remote_d, remote, &poll_ms_fops);
+	if (!d)
+		goto err;
+
 	d = trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote,
 			      &buffer_size_kb_fops);
 	if (!d)
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related

* [PATCH v2 16/18] tracing/remotes: Add trace_remote cmdline options
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
  Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>

Following the same format as trace_instance, add a cmdline to configure
a trace remote on registration:

  trace_remote=<remote>^<opt1>^<opt2>,<evt1>,<evt2>

Enabling events automatically turns on tracing.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index f72fc862ae7f..a0a372c47562 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -13,6 +13,8 @@
 #include <linux/trace_seq.h>
 #include <linux/types.h>
 
+#include <asm/setup.h>
+
 #include "trace.h"
 
 #define TRACEFS_DIR		"remotes"
@@ -1167,6 +1169,132 @@ static int poll_ms_show(struct seq_file *s, void *unused)
 }
 DEFINE_TRACE_REMOTE_ATTRIBUTE(poll_ms);
 
+static char trace_remote_cmdline[COMMAND_LINE_SIZE];
+
+static int __init set_trace_remote_cmdline(char *str)
+{
+	static int idx;
+
+	if (!str)
+		return 0;
+
+	strscpy(trace_remote_cmdline + idx, str, COMMAND_LINE_SIZE - idx);
+	idx += strlen(str);
+	trace_remote_cmdline[idx++] = '\t';
+	return 1;
+}
+__setup("trace_remote=", set_trace_remote_cmdline);
+
+static void trace_remote_apply_cmdline_opts(struct trace_remote *remote, char *cmdline)
+{
+	bool dmesg_on = false;
+	char *opt;
+	int ret;
+
+	while ((opt = strsep(&cmdline, "^"))) {
+		if (!*opt)
+			continue;
+
+		if (!strcmp(opt, "dump_on_panic")) {
+			remote->panic_on = true;
+		} else if (!strcmp(opt, "dmesg")) {
+			dmesg_on = true;
+		} else if (!strncmp(opt, "buf_size=", 9)) {
+			/* buf_size can only be applied if the buffer is unloaded */
+			if (!WARN_ON(trace_remote_loaded(remote)))
+				remote->trace_buffer_size = memparse(opt + 9, NULL);
+		} else if (!strncmp(opt, "poll=", 5)) {
+			unsigned int poll_ms;
+
+			if (!kstrtouint(opt + 5, 10, &poll_ms) && poll_ms > 0)
+				remote->poll_ms = poll_ms;
+			else
+				pr_warn("Invalid trace remote poll '%s'\n", opt);
+		} else {
+			pr_warn("Unknown trace remote option '%s'\n", opt);
+		}
+	}
+
+	if (dmesg_on) {
+		ret = trace_remote_enable_dmesg(remote, true);
+		if (ret)
+			pr_warn("Failed to enable trace remote dmesg (%d)\n", ret);
+	}
+}
+
+static struct remote_event *
+trace_remote_find_event_by_name(struct trace_remote *remote, const char *name);
+
+static int
+trace_remote_enable_event(struct trace_remote *remote, struct remote_event *evt, bool enable);
+
+static void trace_remote_apply_cmdline_events(struct trace_remote *remote, char *cmdline)
+{
+	bool tracing_on = false;
+	char *token;
+	int ret;
+
+	while ((token = strsep(&cmdline, ","))) {
+		struct remote_event *evt;
+
+		if (!*token)
+			continue;
+
+		evt = trace_remote_find_event_by_name(remote, token);
+		if (!evt) {
+			pr_warn("trace remote event '%s' not found\n", token);
+			continue;
+		}
+
+		ret = trace_remote_enable_event(remote, evt, true);
+		if (ret)
+			pr_warn("Failed to enable trace remote event '%s' (%d)\n", token, ret);
+		else
+			tracing_on = true;
+	}
+
+	if (tracing_on) {
+		ret = trace_remote_enable_tracing(remote);
+		if (ret)
+			pr_warn("Failed to enable trace remote tracing (%d)\n", ret);
+	}
+}
+
+static void trace_remote_apply_cmdline(const char *name, struct trace_remote *remote)
+{
+	char *cmdline __free(kfree) = NULL;
+	char *events_cmdline = NULL;
+	char *opts_cmdline = NULL;
+	char *curr, *next;
+
+	if (!trace_remote_cmdline[0])
+		return;
+
+	cmdline = kstrdup(trace_remote_cmdline, GFP_KERNEL);
+	if (!cmdline)
+		return;
+
+	next = cmdline;
+	while ((curr = strsep(&next, "\t"))) {
+		char *token = strsep(&curr, ",");
+		char *rname = strsep(&token, "^");
+
+		if (strcmp(rname, name) == 0) {
+			opts_cmdline = token;
+			events_cmdline = curr;
+			break;
+		}
+	}
+
+	guard(mutex)(&remote->lock);
+
+	if (opts_cmdline)
+		trace_remote_apply_cmdline_opts(remote, opts_cmdline);
+	if (events_cmdline)
+		trace_remote_apply_cmdline_events(remote, events_cmdline);
+}
+
+
 static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote)
 {
 	struct dentry *remote_d, *percpu_d, *d;
@@ -1343,6 +1471,7 @@ int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs,
 	}
 
 	list_add(&remote->node, &trace_remotes);
+	trace_remote_apply_cmdline(name, remote);
 	retain_and_null_ptr(remote);
 
 	return 0;
@@ -1822,3 +1951,15 @@ static struct remote_event *trace_remote_find_event(struct trace_remote *remote,
 	return bsearch((const void *)(unsigned long)id, remote->events, remote->nr_events,
 		       sizeof(*remote->events), __cmp_events);
 }
+
+static struct remote_event *
+trace_remote_find_event_by_name(struct trace_remote *remote, const char *name)
+{
+	int i;
+
+	for (i = 0; i < remote->nr_events; i++) {
+		if (!strcmp(remote->events[i].name, name))
+			return &remote->events[i];
+	}
+	return NULL;
+}
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related

* [PATCH v2 17/18] Documentation: tracing/remotes: Add detailed tracefs layout
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
  Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>

Add a description for each tracefs file available in a trace remote
instance.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/Documentation/trace/remotes.rst b/Documentation/trace/remotes.rst
index 1f9d764f69aa..b02ebed4a03f 100644
--- a/Documentation/trace/remotes.rst
+++ b/Documentation/trace/remotes.rst
@@ -19,8 +19,8 @@ for which the host kernel can see and expose to user space.
 
 Register a remote
 =================
-A remote must provide a set of callbacks `struct trace_remote_callbacks` whom
-description can be found below. Those callbacks allows Tracefs to enable and
+A remote must provide a set of callbacks `struct trace_remote_callbacks` whose
+description can be found below. Those callbacks allow Tracefs to enable and
 disable tracing and events, to load and unload a tracing buffer (a set of
 ring-buffers) and to swap a reader page with the head page, which enables
 consuming reading.
@@ -28,8 +28,66 @@ consuming reading.
 .. kernel-doc:: include/linux/trace_remote.h
 
 Once registered, an instance will appear for this remote in the Tracefs
-directory **remotes/**. Buffers can then be read using the usual Tracefs files
-**trace_pipe** and **trace**.
+directory **remotes/**. The files within this directory allow configuring
+and reading the remote buffer (see `The File System` below).
+
+The File System
+===============
+A remote tracing instance is represented by a directory in Tracefs under
+**remotes/**. The layout and files within it are very similar to standard ftrace
+instances. Inside the remote directory, the following files and directories are
+available:
+
+  tracing_on
+	This file allows enabling or disabling the remote tracing.
+
+  buffer_size_kb
+	This file displays and allows changing the size of the per-CPU ring
+	buffers used by the remote. It also shows if the buffer is **loaded** or
+	**unloaded**. To change the size, the remote buffers must be unloaded
+	first. Remote buffers are automatically unloaded when **tracing_on** is
+	off, no one is reading the buffer (either by accessing **trace_pipe** or
+	when **dmesg** is on) and no events remain in the buffer.
+
+  trace
+	Display the human-readable content of the remote buffers. Reading this
+	file is non-consuming. Writing to this file clears the ring buffers.
+
+  trace_pipe
+	Similar to **trace** but reading it consumes the events from the ring
+	buffers (consuming read). It blocks if there are no new events.
+
+  dmesg
+	When enabled, all events from the remote are redirected to the kernel
+	dmesg. This is similar to the **tp_printk** option for in-kernel events.
+	It counts as a reader of the remote buffers and prevents unloading.
+
+  dump_on_panic
+	When enabled, the remote tracing buffer is dumped to the console when a
+	kernel panic occurs.
+
+  poll_ms
+	Displays and sets the remote polling period, in milliseconds.
+
+  per_cpu/
+	This directory contains subdirectories for each possible CPU (e.g.,
+	**cpu0/**, **cpu1/** ...)
+
+  per_cpu/cpuX/trace
+	This is similar to the **trace** file, but it will only display the data
+	specific for the CPU. If written to, it only clears the specific CPU
+	buffer.
+
+  per_cpu/cpuX/trace_pipe
+	This is similar to the **trace_pipe** file, and is a consuming read, but
+	it will only display (and consume) the data specific to the CPU.
+
+  events/
+	This directory contains remote events that can be enabled or disabled.
+
+  events/enable
+	Allows enabling or disabling all the remote events.
+
 
 Declare a remote event
 ======================
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related

* [PATCH v2 18/18] Documentation/kernel-parameters: Add trace_remote
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
  Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>

The trace_remote parameter allows configuring a trace remote on
registration. The syntax is similar to trace_instance.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 97007f4f69d4..d379fec5e81c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -7792,6 +7792,25 @@ Kernel parameters
 			See also Documentation/trace/ftrace.rst "trace options"
 			section.
 
+	trace_remote=[remote-info]
+			[FTRACE] Configure a trace remote instance at boot.
+			Format: <name>[^option1[^option2...]][,event1[,event2...]]
+
+			Supported options:
+
+			dump_on_panic   - Enable dumping the trace buffer on
+					  panic.
+			dmesg           - Redirect tracing output to dmesg.
+			buf_size=<size> - Set the trace buffer size (e.g. 2M).
+			poll=<ms>       - Set the trace remote polling interval
+					  in milliseconds.
+
+			Events are a comma-separated list of events to enable.
+			If events are specified, tracing is automatically enabled.
+
+			Multiple remotes can be configured by specifying this
+			parameter multiple times.
+
 	trace_trigger=[trigger-list]
 			[FTRACE] Add an event trigger on specific events.
 			Set a trigger on top of a specific event, with an optional
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related

* Re: [PATCH mm-unstable v19 10/14] mm/khugepaged: introduce collapse_possible_orders helper functions
From: Lorenzo Stoakes @ 2026-06-05 17:46 UTC (permalink / raw)
  To: Nico Pache
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, liam, mathieu.desnoyers, matthew.brost, mhiramat,
	mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260605161422.213817-11-npache@redhat.com>

On Fri, Jun 05, 2026 at 10:14:17AM -0600, Nico Pache wrote:
> Add collapse_possible_orders() to generalize THP order eligibility. The
> function determines which THP orders are permitted based on collapse
> context (khugepaged vs madv_collapse). We also add collapse_possible()
> as a thin wrapper around collapse_possible_orders() that returns a bool
> rather than the whole bitmap.
>
> This consolidates collapse configuration logic and provides a clean
> interface for future mTHP collapse support where the orders may be
> different.
>
> Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Signed-off-by: Nico Pache <npache@redhat.com>

LGTM, so:

Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>

> ---
>  mm/khugepaged.c | 24 +++++++++++++++++++++---
>  1 file changed, 21 insertions(+), 3 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 26c343a6fa3d..ec886a031952 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -554,12 +554,30 @@ void __khugepaged_enter(struct mm_struct *mm)
>  		wake_up_interruptible(&khugepaged_wait);
>  }
>
> +/*
> + * Check what orders are possible based on the vma and collapse type.
> + * This is used to determine if mTHP collapse is a viable option.
> + */
> +static unsigned long collapse_possible_orders(struct vm_area_struct *vma,
> +		vm_flags_t vm_flags, enum tva_type tva_flags)
> +{
> +	const unsigned long orders = BIT(HPAGE_PMD_ORDER);
> +
> +	return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
> +}
> +
> +static bool collapse_possible(struct vm_area_struct *vma,
> +		vm_flags_t vm_flags, enum tva_type tva_flags)
> +{
> +	return collapse_possible_orders(vma, vm_flags, tva_flags);
> +}
> +
>  void khugepaged_enter_vma(struct vm_area_struct *vma,
>  			  vm_flags_t vm_flags)
>  {
>  	if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
>  	    hugepage_pmd_enabled()) {
> -		if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
> +		if (collapse_possible(vma, vm_flags, TVA_KHUGEPAGED))
>  			__khugepaged_enter(vma->vm_mm);
>  	}
>  }
> @@ -2700,7 +2718,7 @@ static void collapse_scan_mm_slot(unsigned int progress_max,
>  			cc->progress++;
>  			break;
>  		}
> -		if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
> +		if (!collapse_possible(vma, vma->vm_flags, TVA_KHUGEPAGED)) {
>  			cc->progress++;
>  			continue;
>  		}
> @@ -3010,7 +3028,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
>  	BUG_ON(vma->vm_start > start);
>  	BUG_ON(vma->vm_end < end);
>
> -	if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
> +	if (!collapse_possible(vma, vma->vm_flags, TVA_FORCED_COLLAPSE))
>  		return -EINVAL;
>
>  	cc = kmalloc_obj(*cc);
> --
> 2.54.0
>

^ permalink raw reply

* Re: [PATCH mm-unstable v19 06/14] mm/khugepaged: generalize collapse_huge_page for mTHP collapse
From: David Hildenbrand (Arm) @ 2026-06-05 17:48 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260605161422.213817-7-npache@redhat.com>

On 6/5/26 18:14, Nico Pache wrote:
> Pass an order to collapse_huge_page to support collapsing anon memory to
> arbitrary orders within a PMD. order indicates what mTHP size we are
> attempting to collapse to.
> 
> For non-PMD collapse we must leave the anon VMA write locked until after
> we collapse the mTHP-- in the PMD case all the pages are isolated, but in
> the mTHP case this is not true, and we must keep the lock to prevent
> access/changes to the page tables. This can happen if the rmap walkers hit
> a pmd_none while the PMD entry is currently unavailable due to being
> temporarily removed during the collapse phase.
> 
> To properly establish the page table hierarchy without violating any
> expectations from certain architectures (e.g. MIPS), we must make sure to
> have the PMD reinstalled before the PTEs, and hold both PTE/PMD locks
> before calling update_mmu_cache_range() (if they are distinct locks).
> 
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---

[...]

>  	 */
>  	__folio_mark_uptodate(folio);
> -	pgtable = pmd_pgtable(_pmd);
> -
>  	spin_lock(pmd_ptl);
> -	BUG_ON(!pmd_none(*pmd));
> -	pgtable_trans_huge_deposit(mm, pmd, pgtable);
> -	map_anon_folio_pmd_nopf(folio, pmd, vma, address);
> +	VM_WARN_ON_ONCE(!pmd_none(*pmd));
> +	if (is_pmd_order(order)) {
> +		pgtable = pmd_pgtable(_pmd);
> +		pgtable_trans_huge_deposit(mm, pmd, pgtable);
> +		map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
> +	} else {
> +		/*
> +		 * Some architectures (e.g. MIPS) walk the live page table in
> +		 * their implementation. update_mmu_cache_range() must be called
> +		 * with a valid page table hierarchy and the PTE lock held.
> +		 * Acquire it nested inside pmd_ptl when they are distinct locks.
> +		 */
> +		if (pte_ptl != pmd_ptl)
> +			spin_lock_nested(pte_ptl, SINGLE_DEPTH_NESTING);
> +		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> +		map_anon_folio_pte_nopf(folio, pte, vma, start_addr,
> +					  /*uffd_wp=*/ false);
> +		if (pte_ptl != pmd_ptl)
> +			spin_unlock(pte_ptl);
> +	}
>  	spin_unlock(pmd_ptl);
>  
>  	folio = NULL;
>  
>  	result = SCAN_SUCCEED;
>  out_up_write:
> +	if (anon_vma_locked)
> +		anon_vma_unlock_write(vma->anon_vma);
> +	if (pte)
> +		pte_unmap(pte);

We re-enable some page table walkers before we unmap the PTE.

We still hold the mmap lock in write mode, so nothing would currently try
reclaiming the page table concurrently.

So I guess this works right now, but we should likely rework that code later to
either reverse the order of both statements, or maybe we can simply unmap like
we did, and simply remap before we call map_anon_folio_pte_nopf()? Remapping
should not fail.

Alternatively to an unmap+remap, I think we could also unmap earlier for PMD

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6de935e76ceb..ba2a2508dda6 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1378,6 +1378,8 @@ static enum scan_result collapse_huge_page(struct
mm_struct *mm, unsigned long s
        if (is_pmd_order(order)) {
                anon_vma_unlock_write(vma->anon_vma);
                anon_vma_locked = false;
+               pte_unmap(pte);
+               pte = NULL;
        }

        result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,

But this can also be handled later.

We now hold an anon_vma lock a bit longer for !pmd-collapse. But there is also
less to copy. If that bites us, we can try optimizing later.


So after another skim, I think this patch is ready for primetime. We can address
the things mentioned above later ... and any fallout can be fixed later, if any.

Acked-by: David Hildenbrand (Arm) <david@kernel.org>


-- 
Cheers,

David

^ permalink raw reply related

* Re: [PATCH mm-unstable v19 12/14] mm/khugepaged: avoid unnecessary mTHP collapse attempts
From: David Hildenbrand (Arm) @ 2026-06-05 17:49 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe, Usama Arif
In-Reply-To: <20260605161422.213817-13-npache@redhat.com>

On 6/5/26 18:14, Nico Pache wrote:
> There are cases where, if an attempted collapse fails, all subsequent
> orders are guaranteed to also fail. Avoid these collapse attempts by
> bailing out early.
> 
> Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
> Acked-by: Usama Arif <usama.arif@linux.dev>
> Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 13 +++++++++++++
>  1 file changed, 13 insertions(+)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 430047316f43..7de92b28dd30 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1499,6 +1499,7 @@ static enum scan_result mthp_collapse(struct mm_struct *mm,
>  			collapse_address = address + offset * PAGE_SIZE;
>  			ret = collapse_huge_page(mm, collapse_address, referenced,
>  						 unmapped, cc, order);
> +

Unrelated to this patch, but not the end of the world :)

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH mm-unstable v19 14/14] Documentation: mm: update the admin guide for mTHP collapse
From: David Hildenbrand (Arm) @ 2026-06-05 17:52 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe, Bagas Sanjaya
In-Reply-To: <20260605161422.213817-15-npache@redhat.com>

On 6/5/26 18:14, Nico Pache wrote:
> Now that we can collapse to mTHPs let's update the admin guide to
> reflect these changes and provide proper guidance on how to utilize it.
> 
> Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
> Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---

Acked-by: David Hildenbrand (Arm) <david@kernel.org>

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH mm-unstable v19 11/14] mm/khugepaged: Introduce mTHP collapse support
From: David Hildenbrand (Arm) @ 2026-06-05 18:03 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260605161422.213817-12-npache@redhat.com>

On 6/5/26 18:14, Nico Pache wrote:
> Enable khugepaged to collapse to mTHP orders. This patch implements the
> main scanning logic using a bitmap to track occupied pages and the
> algorithm to find optimal collapse sizes.
> 
> Previous to this patch, PMD collapse had 3 main phases, a light weight
> scanning phase (mmap_read_lock) that determines a potential PMD
> collapse, an alloc phase (mmap unlocked), then finally heavier collapse
> phase (mmap_write_lock).
> 
> To enable mTHP collapse we make the following changes:
> 
> During PMD scan phase, track occupied pages in a bitmap. When mTHP
> orders are enabled, we remove the restriction of max_ptes_none during the
> scan phase to avoid missing potential mTHP collapse candidates. Once we
> have scanned the full PMD range and updated the bitmap to track occupied
> pages, we use the bitmap to find the optimal mTHP size.
> 
> Implement mthp_collapse() to walk forward through the bitmap and
> determine the best eligible order for each naturally-aligned region. The
> algorithm starts at the beginning of the PMD range and, for each offset,
> tries the highest order that fits the alignment. If the number of
> occupied PTEs in that region satisfies the max_ptes_none threshold for
> that order, a collapse is attempted. On failure, the order is
> decremented and the same offset is retried at the next smaller size. Once
> the smallest enabled order is exhausted (or a collapse succeeds), the
> offset advances past the region just processed, and the next attempt
> starts at the highest order permitted by the new offset's natural
> alignment.
> 
> The algorithm works as follows:
>     1) set offset=0 and order=HPAGE_PMD_ORDER
>     2) if the order is not enabled, go to step (5)
>     3) count occupied PTEs in the (offset, order) range using
>        bitmap_weight_from()
>     4) if the count satisfies the max_ptes_none threshold, attempt
>        collapse; on success, advance to step (6)
>     5) if a smaller enabled order exists, decrement order and retry
>        from step (2) at the same offset
>     6) advance offset past the current region and compute the next
>        order from the new offset's natural alignment via __ffs(offset),
>        capped at HPAGE_PMD_ORDER
>     7) repeat from step (2) until the full PMD range is covered
> 
> mTHP collapses reject regions containing swapped out or shared pages.
> This is because adding new entries can lead to new none pages, and these
> may lead to constant promotion into a higher order mTHP. A similar
> issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
> introducing at least 2x the number of pages, and on a future scan will
> satisfy the promotion condition once again. This issue is prevented via
> the collapse_max_ptes_none() function which imposes the max_ptes_none
> restrictions above.
> 
> We currently only support mTHP collapse for max_ptes_none values of 0
> and HPAGE_PMD_NR - 1, resulting in the following behavior:
> 
>     - max_ptes_none=0: Never introduce new empty pages during collapse
>     - max_ptes_none=HPAGE_PMD_NR-1: Always try collapse to the highest
>       available mTHP order
> 
> Any other max_ptes_none value will emit a warning and default mTHP
> collapse to max_ptes_none=0. There should be no behavior change for PMD
> collapse.
> 
> Once we determine which mTHP size fits best in that PMD range, a collapse
> is attempted. A minimum collapse order of 2 is used as this is the lowest
> order supported by anon memory as defined by THP_ORDERS_ALL_ANON.
> 
> Currently madv_collapse is not supported and will only attempt PMD
> collapse.
> 
> We can also remove the check for is_khugepaged inside the PMD scan as
> the collapse_max_ptes_none() function handles this logic now.
> 
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---

Yeah, overall much simpler and much easier to get. As discussed, we can optimize
this later to traverse enabled orders more efficiently.

> +	bitmap_zero(cc->mthp_present_ptes, MAX_PTRS_PER_PTE);
>  	memset(cc->node_load, 0, sizeof(cc->node_load));
>  	nodes_clear(cc->alloc_nmask);
> +
> +	enabled_orders = collapse_possible_orders(vma, vma->vm_flags, tva_flags);
> +
> +	/*
> +	 * If PMD is the only enabled order, enforce max_ptes_none, otherwise
> +	 * scan all pages to populate the bitmap for mTHP collapse.
> +	 */

I think it would have been good to mention where the check is performed for mTHP
collapse. Can be added later.


Acked-by: David Hildenbrand (Arm) <david@kernel.org>

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH mm-unstable v19 00/14] khugepaged: add mTHP collapse support
From: David Hildenbrand (Arm) @ 2026-06-05 18:07 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260605161422.213817-1-npache@redhat.com>

On 6/5/26 18:14, Nico Pache wrote:
> The following series provides khugepaged with the capability to collapse
> anonymous memory regions to mTHPs.
> 
> To achieve this we generalize the khugepaged functions to no longer depend
> on PMD_ORDER. Then during the PMD scan, we use a bitmap to track individual
> pages that are occupied (!none/zero). After the PMD scan is done, we use
> the bitmap to find the optimal mTHP sizes for the PMD range. The
> restriction on max_ptes_none is removed during the scan, to make sure we
> account for the whole PMD range in the bitmap. When no mTHP size is
> enabled, the legacy behavior of khugepaged is maintained.
> 
> We currently only support max_ptes_none values of 0 or HPAGE_PMD_NR - 1
> (ie 511). If any other value is specified, the kernel will emit a warning
> and mTHP collapse will default to max_ptes_none=0. If a mTHP collapse is
> attempted, but contains swapped out, or shared pages, we don't perform
> the collapse.
> It is now also possible to collapse to mTHPs without requiring the PMD THP
> size to be enabled. These limitations are to prevent collapse "creep"
> behavior. This prevents constantly promoting mTHPs to the next available
> size, which would occur because a collapse introduces more non-zero pages
> that would satisfy the promotion condition on subsequent scans.
> 
> Patch 1-2:   Generalize hugepage_vma_revalidate and alloc_charge_folio
>              for arbitrary orders.
> Patch 3:     Rework max_ptes_* handling into helper functions
> Patch 4:     Generalize __collapse_huge_page_* for mTHP support
> Patch 5:     Require collapse_huge_page to enter/exit with the lock dropped
> Patch 6:     Generalize collapse_huge_page for mTHP collapse
> Patch 7:     Skip collapsing mTHP to smaller orders
> Patch 8-9:   Add per-order mTHP statistics and tracepoints
> Patch 10:    Introduce collapse_possible_orders helper functions
> Patch 11-13: Introduce bitmap and mTHP collapse support, fully enabled
> Patch 14:    Documentation
> 

Went through it and didn't find any blockers. Let's wait for Lorenzo's assessment.

If he also doesn't find anything major, I think we can move forward with merging
it and handle smaller things as follow-ups.

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH mm-unstable v19 06/14] mm/khugepaged: generalize collapse_huge_page for mTHP collapse
From: Lorenzo Stoakes @ 2026-06-05 18:15 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, liam, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <95390529-3a80-473c-9433-958db7a2dc6c@kernel.org>

On Fri, Jun 05, 2026 at 07:48:17PM +0200, David Hildenbrand (Arm) wrote:
> On 6/5/26 18:14, Nico Pache wrote:
> > Pass an order to collapse_huge_page to support collapsing anon memory to
> > arbitrary orders within a PMD. order indicates what mTHP size we are
> > attempting to collapse to.
> >
> > For non-PMD collapse we must leave the anon VMA write locked until after
> > we collapse the mTHP-- in the PMD case all the pages are isolated, but in
> > the mTHP case this is not true, and we must keep the lock to prevent
> > access/changes to the page tables. This can happen if the rmap walkers hit
> > a pmd_none while the PMD entry is currently unavailable due to being
> > temporarily removed during the collapse phase.
> >
> > To properly establish the page table hierarchy without violating any
> > expectations from certain architectures (e.g. MIPS), we must make sure to
> > have the PMD reinstalled before the PTEs, and hold both PTE/PMD locks
> > before calling update_mmu_cache_range() (if they are distinct locks).
> >
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
>
> [...]
>
> >  	 */
> >  	__folio_mark_uptodate(folio);
> > -	pgtable = pmd_pgtable(_pmd);
> > -
> >  	spin_lock(pmd_ptl);
> > -	BUG_ON(!pmd_none(*pmd));
> > -	pgtable_trans_huge_deposit(mm, pmd, pgtable);
> > -	map_anon_folio_pmd_nopf(folio, pmd, vma, address);
> > +	VM_WARN_ON_ONCE(!pmd_none(*pmd));
> > +	if (is_pmd_order(order)) {
> > +		pgtable = pmd_pgtable(_pmd);
> > +		pgtable_trans_huge_deposit(mm, pmd, pgtable);
> > +		map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
> > +	} else {
> > +		/*
> > +		 * Some architectures (e.g. MIPS) walk the live page table in
> > +		 * their implementation. update_mmu_cache_range() must be called
> > +		 * with a valid page table hierarchy and the PTE lock held.
> > +		 * Acquire it nested inside pmd_ptl when they are distinct locks.
> > +		 */
> > +		if (pte_ptl != pmd_ptl)
> > +			spin_lock_nested(pte_ptl, SINGLE_DEPTH_NESTING);
> > +		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> > +		map_anon_folio_pte_nopf(folio, pte, vma, start_addr,
> > +					  /*uffd_wp=*/ false);
> > +		if (pte_ptl != pmd_ptl)
> > +			spin_unlock(pte_ptl);
> > +	}
> >  	spin_unlock(pmd_ptl);
> >
> >  	folio = NULL;
> >
> >  	result = SCAN_SUCCEED;
> >  out_up_write:
> > +	if (anon_vma_locked)
> > +		anon_vma_unlock_write(vma->anon_vma);
> > +	if (pte)
> > +		pte_unmap(pte);
>
> We re-enable some page table walkers before we unmap the PTE.
>
> We still hold the mmap lock in write mode, so nothing would currently try
> reclaiming the page table concurrently.

Reclaim uses rmap walkers though?

Oh you mean as in page table teardown, we're safe from higher level page table
teardown, but we're not safe from zap PTE page table teardown, as
CONFIG_PT_RECLAIM makes this possible on zap now.

That is RCU safe, so the unmap would keep us safe here, but now we could lose
the PTE page table.

But, only MADV_DONTNEED sets reclaim_pt = true, and that holds the VMA read lock
so we're safe.

And anyway:

MADV_DONTNEED - VMA read lock (we hold VMA write lock)
zap_vma_for_reaping() - mmap read lock
process teardown, munmap - mmap read lock
fault - vma/mmap read lock

So the vma/mmap locks save us from those.

So rmap-wise, only the i_mmap walkers remain (truncate, hole punch, et al. and
also hugetlbfs truncate/hole-punch which does its own nonsense too), but none of
those allow for reclaim_pt to happen in any case.

So yeah we're safe but we should what, reorder these 2 statements?

But yes I agree that can be a follow-up, nothing's broken AFAICT.

>
> So I guess this works right now, but we should likely rework that code later to
> either revert both statements. Or maybe we can simply unmap like we did, and
> simply remap before we call map_anon_folio_pte_nopf()? Remapping should not fail.
>
> Alternatively to an unmap+remap, I think we could also unmap earlier for PMD
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 6de935e76ceb..ba2a2508dda6 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1378,6 +1378,8 @@ static enum scan_result collapse_huge_page(struct
> mm_struct *mm, unsigned long s
>         if (is_pmd_order(order)) {
>                 anon_vma_unlock_write(vma->anon_vma);
>                 anon_vma_locked = false;
> +               pte_unmap(pte);
> +               pte = NULL;
>         }
>
>         result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
>
> But this can also be handled later.

Yup I mean it'd be nicer to do it in one place if we can (+ impact of holding
RCU lock longer not an issue), but all this code needs rework anyway.

>
> We now hold an anon_vma lock a bit longer for !pmd-collapse. But there is also
> less to copy. If that bites us, we can try optimizing later.

Yeah I do worry about holding these locks longer. But we'll see.

>
>
> So after another skim, I think this patch is ready for primetime. We can address
> the things mentioned above later ... and any fallout can be fixed later, if any.
>
> Acked-by: David Hildenbrand (Arm) <david@kernel.org>

Yes, also from my side - after a git range-diff and looking into above, LGTM,
so:

Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>

Now to go look at the core algo patch :)

>
>
> --
> Cheers,
>
> David

Cheers, Lorenzo

^ permalink raw reply

* Re: [PATCH mm-unstable v19 12/14] mm/khugepaged: avoid unnecessary mTHP collapse attempts
From: Lorenzo Stoakes @ 2026-06-05 18:16 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, liam, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe, Usama Arif
In-Reply-To: <44b12ede-1e16-47f5-9051-27fa4ea34236@kernel.org>

On Fri, Jun 05, 2026 at 07:49:34PM +0200, David Hildenbrand (Arm) wrote:
> On 6/5/26 18:14, Nico Pache wrote:
> > There are cases where, if an attempted collapse fails, all subsequent
> > orders are guaranteed to also fail. Avoid these collapse attempts by
> > bailing out early.
> >
> > Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
> > Acked-by: Usama Arif <usama.arif@linux.dev>
> > Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> >  mm/khugepaged.c | 13 +++++++++++++
> >  1 file changed, 13 insertions(+)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 430047316f43..7de92b28dd30 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -1499,6 +1499,7 @@ static enum scan_result mthp_collapse(struct mm_struct *mm,
> >  			collapse_address = address + offset * PAGE_SIZE;
> >  			ret = collapse_huge_page(mm, collapse_address, referenced,
> >  						 unmapped, cc, order);
> > +
>
> Unrelated to this patch, but not the end of the world :)

NA....h that's fine ;)

>
> --
> Cheers,
>
> David

Cheers, Lorenzo

^ permalink raw reply

* Re: [PATCH mm-unstable v19 06/14] mm/khugepaged: generalize collapse_huge_page for mTHP collapse
From: Lorenzo Stoakes @ 2026-06-05 18:18 UTC (permalink / raw)
  To: Nico Pache
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, liam, mathieu.desnoyers, matthew.brost, mhiramat,
	mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260605161422.213817-7-npache@redhat.com>

On Fri, Jun 05, 2026 at 10:14:13AM -0600, Nico Pache wrote:
> Pass an order to collapse_huge_page to support collapsing anon memory to
> arbitrary orders within a PMD. order indicates what mTHP size we are
> attempting to collapse to.
>
> For non-PMD collapse we must leave the anon VMA write locked until after
> we collapse the mTHP-- in the PMD case all the pages are isolated, but in
> the mTHP case this is not true, and we must keep the lock to prevent
> access/changes to the page tables. This can happen if the rmap walkers hit
> a pmd_none while the PMD entry is currently unavailable due to being
> temporarily removed during the collapse phase.
>
> To properly establish the page table hierarchy without violating any
> expectations from certain architectures (e.g. MIPS), we must make sure to
> have the PMD reinstalled before the PTEs, and hold both PTE/PMD locks
> before calling update_mmu_cache_range() (if they are distinct locks).
>
> Signed-off-by: Nico Pache <npache@redhat.com>

In case Andrew missed my tag in the lengthy reply in [0]:

Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>

:)

(Though probably he has a script that does this for him!)

[0]:https://lore.kernel.org/linux-mm/aiMNT7fBUkZS1EJK@lucifer/

Cheers, Lorenzo

> ---
>  mm/khugepaged.c | 105 ++++++++++++++++++++++++++++++------------------
>  1 file changed, 67 insertions(+), 38 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index e4b2ca77ecf6..c2769d82a719 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1228,34 +1228,36 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
>   * while allocating a THP, as that could trigger direct reclaim/compaction.
>   * Note that the VMA must be rechecked after grabbing the mmap_lock again.
>   */
> -static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
> -		int referenced, int unmapped, struct collapse_control *cc)
> +static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
> +		int referenced, int unmapped, struct collapse_control *cc,
> +		unsigned int order)
>  {
> +	const unsigned long pmd_addr = start_addr & HPAGE_PMD_MASK;
> +	const unsigned long end_addr = start_addr + (PAGE_SIZE << order);
>  	LIST_HEAD(compound_pagelist);
>  	pmd_t *pmd, _pmd;
> -	pte_t *pte;
> +	pte_t *pte = NULL;
>  	pgtable_t pgtable;
>  	struct folio *folio;
>  	spinlock_t *pmd_ptl, *pte_ptl;
>  	enum scan_result result = SCAN_FAIL;
>  	struct vm_area_struct *vma;
>  	struct mmu_notifier_range range;
> +	bool anon_vma_locked = false;
>
> -	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> -
> -	result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> +	result = alloc_charge_folio(&folio, mm, cc, order);
>  	if (result != SCAN_SUCCEED)
>  		goto out_nolock;
>
>  	mmap_read_lock(mm);
> -	result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> -					 HPAGE_PMD_ORDER);
> +	result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
> +					 &vma, cc, order);
>  	if (result != SCAN_SUCCEED) {
>  		mmap_read_unlock(mm);
>  		goto out_nolock;
>  	}
>
> -	result = find_pmd_or_thp_or_none(mm, address, &pmd);
> +	result = find_pmd_or_thp_or_none(mm, pmd_addr, &pmd);
>  	if (result != SCAN_SUCCEED) {
>  		mmap_read_unlock(mm);
>  		goto out_nolock;
> @@ -1267,8 +1269,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  		 * released when it fails. So we jump out_nolock directly in
>  		 * that case.  Continuing to collapse causes inconsistency.
>  		 */
> -		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> -						     referenced, HPAGE_PMD_ORDER);
> +		result = __collapse_huge_page_swapin(mm, vma, start_addr, pmd,
> +						     referenced, order);
>  		if (result != SCAN_SUCCEED)
>  			goto out_nolock;
>  	}
> @@ -1283,20 +1285,28 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  	 * mmap_lock.
>  	 */
>  	mmap_write_lock(mm);
> -	result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> -					 HPAGE_PMD_ORDER);
> +	result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
> +					 &vma, cc, order);
>  	if (result != SCAN_SUCCEED)
>  		goto out_up_write;
>  	/* check if the pmd is still valid */
>  	vma_start_write(vma);
> -	result = check_pmd_still_valid(mm, address, pmd);
> +	result = check_pmd_still_valid(mm, pmd_addr, pmd);
>  	if (result != SCAN_SUCCEED)
>  		goto out_up_write;
>
>  	anon_vma_lock_write(vma->anon_vma);
> +	anon_vma_locked = true;
>
> -	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
> -				address + HPAGE_PMD_SIZE);
> +	/*
> +	 * Only notify about the PTE range we will actually modify. While we
> +	 * temporary unmap the whole PTE table for mTHP collapse, we'll remap
> +	 * it later, leaving other PTEs effectively unmodified. The locks we
> +	 * hold prevent anybody from stumbling over such temporarily unmapped
> +	 * PTE tables.
> +	 */
> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start_addr,
> +				end_addr);
>  	mmu_notifier_invalidate_range_start(&range);
>
>  	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
> @@ -1308,26 +1318,23 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  	 * Parallel GUP-fast is fine since GUP-fast will back off when
>  	 * it detects PMD is changed.
>  	 */
> -	_pmd = pmdp_collapse_flush(vma, address, pmd);
> +	_pmd = pmdp_collapse_flush(vma, pmd_addr, pmd);
>  	spin_unlock(pmd_ptl);
>  	mmu_notifier_invalidate_range_end(&range);
>  	tlb_remove_table_sync_one();
>
> -	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> +	pte = pte_offset_map_lock(mm, &_pmd, start_addr, &pte_ptl);
>  	if (pte) {
> -		result = __collapse_huge_page_isolate(vma, address, pte, cc,
> -						      HPAGE_PMD_ORDER,
> -						      &compound_pagelist);
> +		result = __collapse_huge_page_isolate(vma, start_addr, pte, cc,
> +						      order, &compound_pagelist);
>  		spin_unlock(pte_ptl);
>  	} else {
>  		result = SCAN_NO_PTE_TABLE;
>  	}
>
>  	if (unlikely(result != SCAN_SUCCEED)) {
> -		if (pte)
> -			pte_unmap(pte);
>  		spin_lock(pmd_ptl);
> -		BUG_ON(!pmd_none(*pmd));
> +		VM_WARN_ON_ONCE(!pmd_none(*pmd));
>  		/*
>  		 * We can only use set_pmd_at when establishing
>  		 * hugepmds and never for establishing regular pmds that
> @@ -1335,21 +1342,24 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  		 */
>  		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
>  		spin_unlock(pmd_ptl);
> -		anon_vma_unlock_write(vma->anon_vma);
>  		goto out_up_write;
>  	}
>
>  	/*
> -	 * All pages are isolated and locked so anon_vma rmap
> -	 * can't run anymore.
> +	 * For PMD collapse all pages are isolated and locked so anon_vma
> +	 * rmap can't run anymore. For mTHP collapse the PMD entry has been
> +	 * removed and not all pages are isolated and locked, so we must hold
> +	 * the lock to prevent neighboring folios from attempting to access
> +	 * this PMD until its reinstalled.
>  	 */
> -	anon_vma_unlock_write(vma->anon_vma);
> +	if (is_pmd_order(order)) {
> +		anon_vma_unlock_write(vma->anon_vma);
> +		anon_vma_locked = false;
> +	}
>
>  	result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> -					   vma, address, pte_ptl,
> -					   HPAGE_PMD_ORDER,
> -					   &compound_pagelist);
> -	pte_unmap(pte);
> +					   vma, start_addr, pte_ptl,
> +					   order, &compound_pagelist);
>  	if (unlikely(result != SCAN_SUCCEED))
>  		goto out_up_write;
>
> @@ -1359,18 +1369,37 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  	 * write.
>  	 */
>  	__folio_mark_uptodate(folio);
> -	pgtable = pmd_pgtable(_pmd);
> -
>  	spin_lock(pmd_ptl);
> -	BUG_ON(!pmd_none(*pmd));
> -	pgtable_trans_huge_deposit(mm, pmd, pgtable);
> -	map_anon_folio_pmd_nopf(folio, pmd, vma, address);
> +	VM_WARN_ON_ONCE(!pmd_none(*pmd));
> +	if (is_pmd_order(order)) {
> +		pgtable = pmd_pgtable(_pmd);
> +		pgtable_trans_huge_deposit(mm, pmd, pgtable);
> +		map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
> +	} else {
> +		/*
> +		 * Some architectures (e.g. MIPS) walk the live page table in
> +		 * their implementation. update_mmu_cache_range() must be called
> +		 * with a valid page table hierarchy and the PTE lock held.
> +		 * Acquire it nested inside pmd_ptl when they are distinct locks.
> +		 */
> +		if (pte_ptl != pmd_ptl)
> +			spin_lock_nested(pte_ptl, SINGLE_DEPTH_NESTING);
> +		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> +		map_anon_folio_pte_nopf(folio, pte, vma, start_addr,
> +					  /*uffd_wp=*/ false);
> +		if (pte_ptl != pmd_ptl)
> +			spin_unlock(pte_ptl);
> +	}
>  	spin_unlock(pmd_ptl);
>
>  	folio = NULL;
>
>  	result = SCAN_SUCCEED;
>  out_up_write:
> +	if (anon_vma_locked)
> +		anon_vma_unlock_write(vma->anon_vma);
> +	if (pte)
> +		pte_unmap(pte);
>  	mmap_write_unlock(mm);
>  out_nolock:
>  	if (folio)
> @@ -1550,7 +1579,7 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  		/* collapse_huge_page expects the lock to be dropped before calling */
>  		mmap_read_unlock(mm);
>  		result = collapse_huge_page(mm, start_addr, referenced,
> -					    unmapped, cc);
> +					    unmapped, cc, HPAGE_PMD_ORDER);
>  		/* collapse_huge_page will return with the mmap_lock released */
>  		*lock_dropped = true;
>  	}
> --
> 2.54.0
>

^ permalink raw reply

* Re: [PATCH mm-unstable v19 14/14] Documentation: mm: update the admin guide for mTHP collapse
From: Lorenzo Stoakes @ 2026-06-05 18:20 UTC (permalink / raw)
  To: Nico Pache
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, liam, mathieu.desnoyers, matthew.brost, mhiramat,
	mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe, Bagas Sanjaya
In-Reply-To: <20260605161422.213817-15-npache@redhat.com>

On Fri, Jun 05, 2026 at 10:14:21AM -0600, Nico Pache wrote:
> Now that we can collapse to mTHPs let's update the admin guide to
> reflect these changes and provide proper guidance on how to utilize it.
>
> Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
> Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
> Signed-off-by: Nico Pache <npache@redhat.com>

This is completely fine, and no blockers, but just a couple tiny things
below Claude brought up for a possible trivial follow up.

> ---
>  Documentation/admin-guide/mm/transhuge.rst | 49 ++++++++++++++--------
>  1 file changed, 32 insertions(+), 17 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index b98e18c80185..23f8d13c2629 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -63,7 +63,8 @@ often.
>  THP can be enabled system wide or restricted to certain tasks or even
>  memory ranges inside task's address space. Unless THP is completely
>  disabled, there is ``khugepaged`` daemon that scans memory and
> -collapses sequences of basic pages into PMD-sized huge pages.
> +collapses sequences of basic pages into huge pages of either PMD size
> +or mTHP sizes, if the system is configured to do so.
>
>  The THP behaviour is controlled via :ref:`sysfs <thp_sysfs>`
>  interface and using madvise(2) and prctl(2) system calls.
> @@ -219,10 +220,10 @@ this behaviour by writing 0 to shrink_underused, and enable it by writing
>  	echo 0 > /sys/kernel/mm/transparent_hugepage/shrink_underused
>  	echo 1 > /sys/kernel/mm/transparent_hugepage/shrink_underused
>
> -khugepaged will be automatically started when PMD-sized THP is enabled
> +khugepaged will be automatically started when any THP size is enabled
>  (either of the per-size anon control or the top-level control are set
>  to "always" or "madvise"), and it'll be automatically shutdown when
> -PMD-sized THP is disabled (when both the per-size anon control and the
> +all THP sizes are disabled (when both the per-size anon control and the
>  top-level control are "never")

Claude was very pedantic and said we need a full stop here :P

This is not a blocker, obviously...!

>
>  process THP controls
> @@ -265,8 +266,8 @@ Khugepaged controls
>  -------------------
>
>  .. note::
> -   khugepaged currently only searches for opportunities to collapse to
> -   PMD-sized THP and no attempt is made to collapse to other THP
> +   khugepaged currently only searches for opportunities to collapse file/shmem
> +   to PMD-sized THP. Only anonymous memory will attempt to collapse to other THP
>     sizes.
>
>  khugepaged runs usually at low frequency so while one may not want to
> @@ -296,11 +297,11 @@ allocation failure to throttle the next allocation attempt::
>  The khugepaged progress can be seen in the number of pages collapsed (note
>  that this counter may not be an exact count of the number of pages
>  collapsed, since "collapsed" could mean multiple things: (1) A PTE mapping
> -being replaced by a PMD mapping, or (2) All 4K physical pages replaced by
> -one 2M hugepage. Each may happen independently, or together, depending on
> -the type of memory and the failures that occur. As such, this value should
> -be interpreted roughly as a sign of progress, and counters in /proc/vmstat
> -consulted for more accurate accounting)::
> +being replaced by a PMD mapping, or (2) physical pages replaced by one
> +hugepage of various sizes (PMD-sized or mTHP). Each may happen independently,
> +or together, depending on the type of memory and the failures that occur.
> +As such, this value should be interpreted roughly as a sign of progress,
> +and counters in /proc/vmstat consulted for more accurate accounting)::

So Claude said maybe it's worth mentioning that the per-mTHP counters are only
actually exposed through
/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats/ and maybe worth
mentioning here too?

>
>  	/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
>
> @@ -308,16 +309,21 @@ for each pass::
>
>  	/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
>
> -``max_ptes_none`` specifies how many extra small pages (that are
> -not already mapped) can be allocated when collapsing a group
> -of small pages into one large page::
> +``max_ptes_none`` specifies how many empty (none/zero) pages are allowed
> +when collapsing a group of small pages into one large page::
>
>  	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
>
> -A higher value leads to use additional memory for programs.
> -A lower value leads to gain less thp performance. Value of
> -max_ptes_none can waste cpu time very little, you can
> -ignore it.
> +For PMD-sized THP collapse, this directly limits the number of empty pages
> +allowed in the 2MB region.
> +
> +For mTHP collapse, only 0 or (HPAGE_PMD_NR - 1) are supported. At
> +HPAGE_PMD_NR - 1, we collapse to the highest possible order. Any intermediate
> +value will emit a warning and mTHP collapse will default to max_ptes_none=0.
> +
> +A higher value allows more empty pages, potentially leading to more memory
> +usage but better THP performance. A lower value is more conservative and
> +may result in fewer THP collapses.
>
>  ``max_ptes_swap`` specifies how many pages can be brought in from
>  swap when collapsing a group of pages into a transparent huge page::
> @@ -337,6 +343,15 @@ that THP is shared. Exceeding the number would block the collapse::
>
>  A higher value may increase memory footprint for some workloads.
>
> +.. note::
> +   For mTHP collapse, khugepaged does not support collapsing regions that
> +   contain shared or swapped out pages, as this could lead to continuous
> +   promotion to higher orders. The collapse will fail if any shared or
> +   swapped PTEs are encountered during the scan.
> +
> +   Currently, madvise_collapse only supports collapsing to PMD-sized THPs
> +   and does not attempt mTHP collapses.
> +
>  Boot parameters
>  ===============
>
> --
> 2.54.0
>

Cheers, Lorenzo

^ permalink raw reply

* Re: [PATCH v7 00/42] guest_memfd: In-place conversion support
From: Sean Christopherson @ 2026-06-05 18:27 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: Ackerley Tng via B4 Relay, aik, andrew.jones, binbin.wu, brauner,
	chao.p.peng, david, ira.weiny, jmattson, jthoughton, michael.roth,
	oupton, pankaj.gupta, qperret, rick.p.edgecombe, rientjes,
	shivankg, steven.price, tabba, willy, wyihan, yan.y.zhao,
	forkloop, pratyush, suzuki.poulose, aneesh.kumar, liam,
	Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <CAEvNRgHz5GDjq0GqRmpQdHc-X45gCNr39VYWZH-T7XhPEtN5CQ@mail.gmail.com>

On Thu, Jun 04, 2026, Ackerley Tng wrote:
> Sean Christopherson <seanjc@google.com> writes:
> >> + KVM: selftests: Test conversion with elevated page refcount
> >>     + Askar pointed out that soon vmsplice may not pin pages. Should I
> >>       pin pages through CONFIG_GUP_TEST like in [2]? I prefer not to
> >>       take a dependency on CONFIG_GUP_TEST.
> >
> > I'm not exactly excited about taking a dependency on CONFIG_GUP_TEST either, but
> > it probably is the least awful choice.  E.g. KVM also pins pages in certain flows,
> > but we're _also_ actively working to remove the need to pin.
> >
> > Hmm, maybe IORING_REGISTER_PBUF_RING?  AFAICT, it's almost literally a "pin user
> > memory" syscall.
> >
> 
> Hmm that takes a dependency on io_uring, which isn't always compiled
> in. Between CONFIG_IO_URING and CONFIG_GUP_TEST, I'd rather
> CONFIG_GUP_TEST.

Or try both?  If it's not a ridiculous amount of work.

^ permalink raw reply

* Re: [PATCH mm-unstable v19 11/14] mm/khugepaged: Introduce mTHP collapse support
From: Lorenzo Stoakes @ 2026-06-05 18:38 UTC (permalink / raw)
  To: Nico Pache
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, liam, mathieu.desnoyers, matthew.brost, mhiramat,
	mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260605161422.213817-12-npache@redhat.com>

On Fri, Jun 05, 2026 at 10:14:18AM -0600, Nico Pache wrote:
> Enable khugepaged to collapse to mTHP orders. This patch implements the
> main scanning logic using a bitmap to track occupied pages and the
> algorithm to find optimal collapse sizes.
>
> Prior to this patch, PMD collapse had 3 main phases: a lightweight
> scanning phase (mmap_read_lock) that determines a potential PMD
> collapse, an alloc phase (mmap unlocked), and finally a heavier collapse
> phase (mmap_write_lock).
>
> To enable mTHP collapse we make the following changes:
>
> During PMD scan phase, track occupied pages in a bitmap. When mTHP
> orders are enabled, we remove the restriction of max_ptes_none during the
> scan phase to avoid missing potential mTHP collapse candidates. Once we
> have scanned the full PMD range and updated the bitmap to track occupied
> pages, we use the bitmap to find the optimal mTHP size.
>
> Implement mthp_collapse() to walk forward through the bitmap and
> determine the best eligible order for each naturally-aligned region. The
> algorithm starts at the beginning of the PMD range and, for each offset,
> tries the highest order that fits the alignment. If the number of
> occupied PTEs in that region satisfies the max_ptes_none threshold for
> that order, a collapse is attempted. On failure, the order is
> decremented and the same offset is retried at the next smaller size. Once
> the smallest enabled order is exhausted (or a collapse succeeds), the
> offset advances past the region just processed, and the next attempt
> starts at the highest order permitted by the new offset's natural
> alignment.

I still think it might have been nice to discuss why we are not
e.g. greedily trying to find the biggest possible mTHP size (if we did, we
would try the highest offset first), but we can save that for adding some
documentation somewhere later tbh.

This commit message is long enough as it is :>)

>
> The algorithm works as follows:
>     1) set offset=0 and order=HPAGE_PMD_ORDER
>     2) if the order is not enabled, go to step (5)
>     3) count occupied PTEs in the (offset, order) range using
>        bitmap_weight_from()
>     4) if the count satisfies the max_ptes_none threshold, attempt
>        collapse; on success, advance to step (6)
>     5) if a smaller enabled order exists, decrement order and retry
>        from step (2) at the same offset
>     6) advance offset past the current region and compute the next
>        order from the new offset's natural alignment via __ffs(offset),
>        capped at HPAGE_PMD_ORDER
>     7) repeat from step (2) until the full PMD range is covered
>
> mTHP collapses reject regions containing swapped out or shared pages.
> This is because adding new entries can lead to new none pages, and these
> may lead to constant promotion into a higher order mTHP. A similar
> issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
> introducing at least 2x the number of pages, and on a future scan will
> satisfy the promotion condition once again. This issue is prevented via
> the collapse_max_ptes_none() function which imposes the max_ptes_none
> restrictions above.
>
> We currently only support mTHP collapse for max_ptes_none values of 0
> and HPAGE_PMD_NR - 1, resulting in the following behavior:
>
>     - max_ptes_none=0: Never introduce new empty pages during collapse
>     - max_ptes_none=HPAGE_PMD_NR-1: Always try collapse to the highest
>       available mTHP order
>
> Any other max_ptes_none value will emit a warning and default mTHP
> collapse to max_ptes_none=0. There should be no behavior change for PMD
> collapse.
>
> Once we determine which mTHP sizes fit best in that PMD range, a collapse
> is attempted. A minimum collapse order of 2 is used as this is the lowest
> order supported by anon memory as defined by THP_ORDERS_ALL_ANON.
>
> Currently madv_collapse is not supported and will only attempt PMD
> collapse.
>
> We can also remove the check for is_khugepaged inside the PMD scan as
> the collapse_max_ptes_none() function handles this logic now.

It'd be nice to have kept the ASCII diagram here too :'( but this is fine.

>
> Signed-off-by: Nico Pache <npache@redhat.com>

This all LGTM, and we can fix up any issues that arise later if anything
does break. So:

Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>

> ---
>  mm/khugepaged.c | 146 +++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 138 insertions(+), 8 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index ec886a031952..430047316f43 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -99,6 +99,8 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
>
>  static struct kmem_cache *mm_slot_cache __ro_after_init;
>
> +#define KHUGEPAGED_MIN_MTHP_ORDER	2
> +
>  struct collapse_control {
>  	bool is_khugepaged;
>
> @@ -110,6 +112,9 @@ struct collapse_control {
>
>  	/* nodemask for allocation fallback */
>  	nodemask_t alloc_nmask;
> +
> +	/* Each bit represents a single occupied (!none/zero) page. */
> +	DECLARE_BITMAP(mthp_present_ptes, MAX_PTRS_PER_PTE);
>  };
>
>  /**
> @@ -1440,20 +1445,130 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long s
>  	return result;
>  }
>
> +/* Return the highest naturally aligned order that fits at @offset within a PMD. */
> +static unsigned int max_order_from_offset(unsigned int offset)
> +{
> +	if (offset == 0)
> +		return HPAGE_PMD_ORDER;
> +
> +	return min_t(unsigned int, __ffs(offset), HPAGE_PMD_ORDER);
> +}

Thanks this is better! I wonder if we can ever actually see an
__ffs(offset) that's > HPAGE_PMD_ORDER but probably better safe than sorry
here with the min_t.

> +
> +/*
> + * mthp_collapse() consumes the bitmap that is generated during
> + * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
> + *
> + * Each bit in cc->mthp_present_ptes represents a single occupied (!none/zero)
> + * page. We start at the PMD order and check if it is eligible for collapse;
> + * if not, we check the left and right halves of the PTE page table we are
> + * examining at a lower order.
> + *
> + * For each of these, we determine how many PTE entries are occupied in the
> + * range of PTE entries we propose to collapse, then we compare this to a
> + * threshold number of PTE entries which would need to be occupied for a
> + * collapse to be permitted at that order (accounting for max_ptes_none).
> + *
> + * If a collapse is permitted, we attempt to collapse the PTE range into a
> + * mTHP.
> + */
> +static enum scan_result mthp_collapse(struct mm_struct *mm,
> +		unsigned long address, int referenced, int unmapped,
> +		struct collapse_control *cc, unsigned long enabled_orders)
> +{
> +	unsigned int nr_occupied_ptes, nr_ptes, max_ptes_none;
> +	enum scan_result last_result = SCAN_FAIL;
> +	int collapsed = 0;
> +	bool alloc_failed = false;
> +	unsigned long collapse_address;
> +	unsigned int offset = 0;
> +	unsigned int order = HPAGE_PMD_ORDER;
> +
> +	while (offset < HPAGE_PMD_NR) {
> +		nr_ptes = 1UL << order;
> +
> +		if (!test_bit(order, &enabled_orders))
> +			goto next_order;
> +
> +		max_ptes_none = collapse_max_ptes_none(cc, NULL, order);
> +		nr_occupied_ptes = bitmap_weight_from(cc->mthp_present_ptes, offset,
> +						      offset + nr_ptes);
> +
> +		if (nr_occupied_ptes >= nr_ptes - max_ptes_none) {
> +			enum scan_result ret;
> +
> +			collapse_address = address + offset * PAGE_SIZE;
> +			ret = collapse_huge_page(mm, collapse_address, referenced,
> +						 unmapped, cc, order);
> +			switch (ret) {
> +			/* Cases where we continue to next collapse candidate */
> +			case SCAN_SUCCEED:
> +				collapsed += nr_ptes;
> +				fallthrough;
> +			case SCAN_PTE_MAPPED_HUGEPAGE:
> +				goto next_offset;
> +			/* Cases where lower orders might still succeed */
> +			case SCAN_ALLOC_HUGE_PAGE_FAIL:
> +				alloc_failed = true;
> +				last_result = ret;
> +				goto next_order;
> +			/* Cases where no further collapse is possible */
> +			case SCAN_PMD_MAPPED:
> +				fallthrough;
> +			default:
> +				last_result = ret;
> +				goto done;
> +			}
> +		}
> +
> +next_order:
> +		/*
> +		 * Continue with the next smaller order if there is still
> +		 * any smaller order enabled. When at the smallest order
> +		 * we must always move to the next offset.
> +		 */
> +		if (order > KHUGEPAGED_MIN_MTHP_ORDER &&
> +			(enabled_orders & GENMASK(order - 1, 0))) {

Honestly wasn't aware of GENMASK() before :)

> +			order--;
> +			continue;
> +		}
> +next_offset:
> +		/*
> +		 * Advance past the region we just processed and determine the
> +		 * highest order we can attempt next. Since huge pages must be
> +		 * naturally aligned, the max order we can attempt next is
> +		 * limited by the alignment of the new offset.
> +		 * E.g. if we collapsed a order-2 mTHP at offset 0, offset
> +		 * becomes 4 and __ffs(4) == 2, so the next attempt starts at
> +		 * order 2.
> +		 */

Great comment thanks!

> +		offset += nr_ptes;
> +		order = max_order_from_offset(offset);
> +	}
> +done:
> +	if (collapsed)
> +		return SCAN_SUCCEED;
> +	if (alloc_failed)
> +		return SCAN_ALLOC_HUGE_PAGE_FAIL;
> +	return last_result;
> +}
> +
>  static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  		struct vm_area_struct *vma, unsigned long start_addr,
>  		bool *lock_dropped, struct collapse_control *cc)
>  {
> -	const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
>  	const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, HPAGE_PMD_ORDER);
>  	const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
> +	unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
> +	enum tva_type tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
>  	pmd_t *pmd;
> -	pte_t *pte, *_pte;
> +	pte_t *pte, *_pte, pteval;
> +	int i;
>  	int none_or_zero = 0, shared = 0, referenced = 0;
>  	enum scan_result result = SCAN_FAIL;
>  	struct page *page = NULL;
>  	struct folio *folio = NULL;
>  	unsigned long addr;
> +	unsigned long enabled_orders;
>  	spinlock_t *ptl;
>  	int node = NUMA_NO_NODE, unmapped = 0;
>
> @@ -1465,8 +1580,19 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  		goto out;
>  	}
>
> +	bitmap_zero(cc->mthp_present_ptes, MAX_PTRS_PER_PTE);
>  	memset(cc->node_load, 0, sizeof(cc->node_load));
>  	nodes_clear(cc->alloc_nmask);
> +
> +	enabled_orders = collapse_possible_orders(vma, vma->vm_flags, tva_flags);
> +
> +	/*
> +	 * If PMD is the only enabled order, enforce max_ptes_none, otherwise
> +	 * scan all pages to populate the bitmap for mTHP collapse.
> +	 */
> +	if (enabled_orders != BIT(HPAGE_PMD_ORDER))
> +		max_ptes_none = KHUGEPAGED_MAX_PTES_LIMIT;
> +
>  	pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
>  	if (!pte) {
>  		cc->progress++;
> @@ -1474,11 +1600,13 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  		goto out;
>  	}
>
> -	for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> -	     _pte++, addr += PAGE_SIZE) {
> +	for (i = 0; i < HPAGE_PMD_NR; i++) {
> +		_pte = pte + i;
> +		addr = start_addr + i * PAGE_SIZE;
> +		pteval = ptep_get(_pte);
> +
>  		cc->progress++;
>
> -		pte_t pteval = ptep_get(_pte);
>  		if (pte_none_or_zero(pteval)) {
>  			if (++none_or_zero > max_ptes_none) {
>  				result = SCAN_EXCEED_NONE_PTE;
> @@ -1558,6 +1686,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  			}
>  		}
>
> +		/* Set bit for occupied pages */
> +		__set_bit(i, cc->mthp_present_ptes);
>  		/*
>  		 * Record which node the original page is from and save this
>  		 * information to cc->node_load[].
> @@ -1616,9 +1746,9 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  	if (result == SCAN_SUCCEED) {
>  		/* collapse_huge_page expects the lock to be dropped before calling */
>  		mmap_read_unlock(mm);
> -		result = collapse_huge_page(mm, start_addr, referenced,
> -					    unmapped, cc, HPAGE_PMD_ORDER);
> -		/* collapse_huge_page will return with the mmap_lock released */
> +		result = mthp_collapse(mm, start_addr, referenced,
> +				       unmapped, cc, enabled_orders);
> +		/* mmap_lock was released above, set lock_dropped */
>  		*lock_dropped = true;
>  	}
>  out:
> --
> 2.54.0
>

Cheers, Lorenzo

^ permalink raw reply

* Re: [PATCH mm-unstable v19 00/14] khugepaged: add mTHP collapse support
From: Lorenzo Stoakes @ 2026-06-05 18:39 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, liam, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <177704e1-03b3-4791-9a69-9b83b72d61d5@kernel.org>

On Fri, Jun 05, 2026 at 08:07:02PM +0200, David Hildenbrand (Arm) wrote:
> On 6/5/26 18:14, Nico Pache wrote:
> > The following series provides khugepaged with the capability to collapse
> > anonymous memory regions to mTHPs.
> >
> > To achieve this we generalize the khugepaged functions to no longer depend
> > on PMD_ORDER. Then during the PMD scan, we use a bitmap to track individual
> > pages that are occupied (!none/zero). After the PMD scan is done, we use
> > the bitmap to find the optimal mTHP sizes for the PMD range. The
> > restriction on max_ptes_none is removed during the scan, to make sure we
> > account for the whole PMD range in the bitmap. When no mTHP size is
> > enabled, the legacy behavior of khugepaged is maintained.
> >
> > We currently only support max_ptes_none values of 0 or HPAGE_PMD_NR - 1
> > (ie 511). If any other value is specified, the kernel will emit a warning
> > and mTHP collapse will default to max_ptes_none=0. If a mTHP collapse is
> > attempted, but contains swapped out, or shared pages, we don't perform
> > the collapse.
> > It is now also possible to collapse to mTHPs without requiring the PMD THP
> > size to be enabled. These limitations are to prevent collapse "creep"
> > behavior. This prevents constantly promoting mTHPs to the next available
> > size, which would occur because a collapse introduces more non-zero pages
> > that would satisfy the promotion condition on subsequent scans.
> >
> > Patch 1-2:   Generalize hugepage_vma_revalidate and alloc_charge_folio
> >              for arbitrary orders.
> > Patch 3:     Rework max_ptes_* handling into helper functions
> > Patch 4:     Generalize __collapse_huge_page_* for mTHP support
> > Patch 5:     Require collapse_huge_page to enter/exit with the lock dropped
> > Patch 6:     Generalize collapse_huge_page for mTHP collapse
> > Patch 7:     Skip collapsing mTHP to smaller orders
> > Patch 8-9:   Add per-order mTHP statistics and tracepoints
> > Patch 10:    Introduce collapse_possible_orders helper functions
> > Patch 11-13: Introduce bitmap and mTHP collapse support, fully enabled
> > Patch 14:    Documentation
> >
>
> Went through it and didn't find any blockers. Let's wait for Lorenzo's assessment.
>
> If he also doesn't find anything major, I think we can move forward with merging
> it and handle smaller things as follow-ups.

All LGTM :)

Unless sashiko tells us about something utterly broken (trivial things meh,
existing things meh), or we see some massive breakage in testing, I
think... *drum roll*

We're good to take this this cycle :)

Thanks Nico for your patience during this process and obviously David and Lance
and all the other reviewers for taking part.

We got there in the end :)

>
> --
> Cheers,
>
> David

Cheers, Lorenzo

^ permalink raw reply

* Re: [PATCH mm-unstable v19 04/14] mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
From: Zi Yan @ 2026-06-05 19:03 UTC (permalink / raw)
  To: Nico Pache
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, liam, ljs, mathieu.desnoyers, matthew.brost, mhiramat,
	mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang,
	zokeefe
In-Reply-To: <20260605161422.213817-5-npache@redhat.com>

On 5 Jun 2026, at 12:14, Nico Pache wrote:

> Generalize the order of the __collapse_huge_page_* and collapse_max_*
> functions to support future mTHP collapse.
>
> The current mechanism for determining collapse with the
> khugepaged_max_ptes_none value is not designed with mTHP in mind. This
> raises a key design issue: if we support user defined max_ptes_none values
> (even those scaled by order), a collapse of a lower order can introduce
> a feedback loop, or "creep", when max_ptes_none is set to a value greater
> than HPAGE_PMD_NR / 2. [1]
>
> With this configuration, a successful collapse to order N will populate
> enough pages to satisfy the collapse condition on order N+1 on the next
> scan. This leads to unnecessary work and memory churn.
>
> To fix this issue introduce a helper function that will limit mTHP
> collapse support to two max_ptes_none values, 0 and HPAGE_PMD_NR - 1.
> This effectively supports two modes: [2]
>
> - max_ptes_none=0: never collapses if it encounters an empty PTE or a PTE
>   that maps the shared zeropage. Consequently, no memory bloat.
> - max_ptes_none=511 (on 4k pagesz): Always collapse to the highest
>   available mTHP order.
>
> This removes the possibility of "creep", and a warning will be emitted if
> any non-supported max_ptes_none value is configured with mTHP enabled.
> Any intermediate value will default mTHP collapse to max_ptes_none=0.
>
> mTHP collapse will not honor the khugepaged_max_ptes_shared or
> khugepaged_max_ptes_swap parameters, and will fail if it encounters a
> shared or swapped entry.
>
> No functional changes in this patch; however it defines future behavior
> for mTHP collapse.
>
> [1] - https://lore.kernel.org/all/e46ab3ab-a3d7-4fb7-9970-d0704bd5d05a@arm.com
> [2] - https://lore.kernel.org/all/37375ace-5601-4d6c-9dac-d1c8268698e9@redhat.com
>
> Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
> Acked-by: David Hildenbrand (arm) <david@kernel.org>
> Reviewed-by: Lance Yang <lance.yang@linux.dev>
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 126 +++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 93 insertions(+), 33 deletions(-)
>
LGTM.

Reviewed-by: Zi Yan <ziy@nvidia.com>

Best Regards,
Yan, Zi

^ permalink raw reply

* Re: [PATCH mm-unstable v19 05/14] mm/khugepaged: require collapse_huge_page to enter/exit with the lock dropped
From: Zi Yan @ 2026-06-05 20:07 UTC (permalink / raw)
  To: Nico Pache
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, liam, ljs, mathieu.desnoyers, matthew.brost, mhiramat,
	mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang,
	zokeefe
In-Reply-To: <20260605161422.213817-6-npache@redhat.com>

On 5 Jun 2026, at 12:14, Nico Pache wrote:

> Currently the collapse_huge_page function requires the mmap_read_lock to
> be held on entry, and returns with it dropped. This patch moves the
> unlock into its parent caller, and changes this semantic to requiring
> the function to always enter and exit with the lock dropped.
>
> In future patches, we need this expectation, as in mTHP collapse we
> may have already dropped the lock, and do not want to conditionally
> check for this by passing through the lock_dropped variable.
>
> No functional change is expected as one of the first things the
> collapse_huge_page function does is drop this lock before allocating the
> hugepage.
>
> Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
> Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 16 ++++++++--------
>  1 file changed, 8 insertions(+), 8 deletions(-)
>
LGTM.

Reviewed-by: Zi Yan <ziy@nvidia.com>

Best Regards,
Yan, Zi

^ permalink raw reply

* Re: [PATCH mm-unstable v19 00/14] khugepaged: add mTHP collapse support
From: Andrew Morton @ 2026-06-06  0:38 UTC (permalink / raw)
  To: Nico Pache
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, liam, ljs, mathieu.desnoyers, matthew.brost, mhiramat,
	mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260605161422.213817-1-npache@redhat.com>

On Fri,  5 Jun 2026 10:14:07 -0600 Nico Pache <npache@redhat.com> wrote:

> The following series provides khugepaged with the capability to collapse
> anonymous memory regions to mTHPs.

Thanks, I updated mm.git's mm-unstable branch to this version.

Sashiko said things:
	https://sashiko.dev/#/patchset/20260605161422.213817-1-npache@redhat.com

> V19 Changes:
> - Rebased onto mm-unstable (base: 9282f9bdbdf2, where v18 was based)
> - Added RBs/Acks
> - [patch 04] Make max_ptes_none const; guard pr_warn_once with
>   check so we only warn for non-zero intermediate values and use a single
>   "return 0" path (David)
> - [patch 06] Add comment explaining the mmu_notifier_range covers only
>   the modified PTE range (David); change BUG_ON to VM_WARN_ON_ONCE
>   (David); fix update_mmu_cache_range() arch safety issue: reinstall
>   PMD via pmd_populate() before calling map_anon_folio_pte_nopf() and
>   hold the PTE lock nested inside pmd_ptl during the operation (David,
>   Lance); drop the now-unnecessary smp_wmb() as __folio_mark_uptodate()
>   provides the required ordering (David, Lance)
> - [patch 07] Clarify commit message: "mTHP to a folio of equal or
>   smaller size, possibly resulting in a partially mapped source folio"
>   (David)
> - [patch 08] Add Lorenzo's RB and David's Ack; move TODO comment about
>   shared pages to patch 4 (David)
> - [patch 10] Rename collapse_allowable_orders() to
>   collapse_possible_orders() and add collapse_possible() boolean wrapper
>   for callers that only need a yes/no answer (David)
> - [patch 11] Major rework: replace the DFS stack-based algorithm with a
>   simpler linear forward-walking approach using offset + order (David);
>   remove mthp_range struct, mthp_bitmap_stack[], stack push/pop
>   functions, and MTHP_STACK_SIZE; add max_order_from_offset() helper
>   using __ffs(offset) for natural alignment; rename mthp_bitmap to
>   mthp_present_ptes (David); remove temporary mthp_bitmap_mask and use
>   bitmap_weight_from() directly (David); fix result propagation so
>   mthp_collapse() returns enum scan_result and properly propagates
>   SCAN_ALLOC_HUGE_PAGE_FAIL and SCAN_PTE_MAPPED_HUGEPAGE to callers
>   (Lance); fold in v18 fixup for potential use-after-free of vma in
>   mthp_collapse() by passing NULL to collapse_max_ptes_none()
> - [patch 12] Rework to match new linear algorithm; expand the set of
>   failures that allow retrying at a lower order
> - [patch 14] Re-add note about file/shmem still only collapsing to
>   PMD-sized THPs (David)

Here's how v19 altered mm.git:


 mm/khugepaged.c |  222 ++++++++++++++++++++--------------------------
 1 file changed, 100 insertions(+), 122 deletions(-)

--- a/mm/khugepaged.c~b
+++ a/mm/khugepaged.c
@@ -100,28 +100,6 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_s
 static struct kmem_cache *mm_slot_cache __ro_after_init;
 
 #define KHUGEPAGED_MIN_MTHP_ORDER	2
-/*
- * mthp_collapse() does an iterative DFS over a binary tree, from
- * HPAGE_PMD_ORDER down to KHUGEPAGED_MIN_MTHP_ORDER. The max stack
- * size needed for a DFS on a binary tree is height + 1, where
- * height = HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER.
- *
- * ilog2 is used in place of HPAGE_PMD_ORDER because some architectures
- * (e.g. ppc64le) do not define HPAGE_PMD_ORDER until after build time.
- */
-#define MTHP_STACK_SIZE	(ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER + 1)
-
-/*
- * Defines a range of PTE entries in a PTE page table which are being
- * considered for mTHP collapse.
- *
- * @offset: the offset of the first PTE entry in a PMD range.
- * @order: the order of the PTE entries being considered for collapse.
- */
-struct mthp_range {
-	u16 offset;
-	u8 order;
-};
 
 struct collapse_control {
 	bool is_khugepaged;
@@ -136,10 +114,7 @@ struct collapse_control {
 	nodemask_t alloc_nmask;
 
 	/* Each bit represents a single occupied (!none/zero) page. */
-	DECLARE_BITMAP(mthp_bitmap, MAX_PTRS_PER_PTE);
-	/* A mask of the current range being considered for mTHP collapse. */
-	DECLARE_BITMAP(mthp_bitmap_mask, MAX_PTRS_PER_PTE);
-	struct mthp_range mthp_bitmap_stack[MTHP_STACK_SIZE];
+	DECLARE_BITMAP(mthp_present_ptes, MAX_PTRS_PER_PTE);
 };
 
 /**
@@ -584,8 +559,11 @@ void __khugepaged_enter(struct mm_struct
 		wake_up_interruptible(&khugepaged_wait);
 }
 
-/* Check what orders are allowed based on the vma and collapse type */
-static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
+/*
+ * Check what orders are possible based on the vma and collapse type.
+ * This is used to determine if mTHP collapse is a viable option.
+ */
+static unsigned long collapse_possible_orders(struct vm_area_struct *vma,
 		vm_flags_t vm_flags, enum tva_type tva_flags)
 {
 	unsigned long orders;
@@ -599,11 +577,17 @@ static unsigned long collapse_allowable_
 	return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
 }
 
+static bool collapse_possible(struct vm_area_struct *vma,
+		vm_flags_t vm_flags, enum tva_type tva_flags)
+{
+	return collapse_possible_orders(vma, vm_flags, tva_flags);
+}
+
 void khugepaged_enter_vma(struct vm_area_struct *vma,
 			  vm_flags_t vm_flags)
 {
 	if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_enabled()
-	    && collapse_allowable_orders(vma, vm_flags, TVA_KHUGEPAGED))
+	    && collapse_possible(vma, vm_flags, TVA_KHUGEPAGED))
 		__khugepaged_enter(vma->vm_mm);
 }
 
@@ -844,7 +828,7 @@ static void __collapse_huge_page_copy_su
 		struct list_head *compound_pagelist)
 {
 	const unsigned long nr_pages = 1UL << order;
-	unsigned long end = address + (PAGE_SIZE << order);
+	unsigned long end = address + (PAGE_SIZE * nr_pages);
 	struct folio *src, *tmp;
 	pte_t pteval;
 	pte_t *_pte;
@@ -1074,7 +1058,13 @@ static enum scan_result hugepage_vma_rev
 	if (!vma)
 		return SCAN_VMA_NULL;
 
-	/* Always check the PMD order to ensure its not shared by another VMA */
+	/*
+	 * We cannot collapse VMA regions that do not span the full PMD. This is
+	 * due to the potential of the PMD being shared by another VMA leaving
+	 * us vulnerable to a race condition. Always check the PMD order here to
+	 * ensure its not shared by another VMA. We'd need to lock all VMAs in
+	 * the PMD range to support this.
+	 */
 	if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
 		return SCAN_ADDRESS_RANGE;
 	if (!thp_vma_allowable_orders(vma, vma->vm_flags, type, BIT(order)))
@@ -1390,7 +1380,7 @@ static enum scan_result collapse_huge_pa
 
 	if (unlikely(result != SCAN_SUCCEED)) {
 		spin_lock(pmd_ptl);
-		WARN_ON_ONCE(!pmd_none(*pmd));
+		VM_WARN_ON_ONCE(!pmd_none(*pmd));
 		/*
 		 * We can only use set_pmd_at when establishing
 		 * hugepmds and never for establishing regular pmds that
@@ -1433,13 +1423,18 @@ static enum scan_result collapse_huge_pa
 		map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
 	} else {
 		/*
-		 * set_ptes is called in map_anon_folio_pte_nopf with the
-		 * pmd_ptl lock still held; this is safe as the PMD is expected
-		 * to be none. The pmd entry is then repopulated below.
+		 * Some architectures (e.g. MIPS) walk the live page table in
+		 * their implementation. update_mmu_cache_range() must be called
+		 * with a valid page table hierarchy and the PTE lock held.
+		 * Acquire it nested inside pmd_ptl when they are distinct locks.
 		 */
-		map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /*uffd_wp=*/ false);
-		smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
+		if (pte_ptl != pmd_ptl)
+			spin_lock_nested(pte_ptl, SINGLE_DEPTH_NESTING);
 		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
+		map_anon_folio_pte_nopf(folio, pte, vma, start_addr,
+					  /*uffd_wp=*/ false);
+		if (pte_ptl != pmd_ptl)
+			spin_unlock(pte_ptl);
 	}
 	spin_unlock(pmd_ptl);
 
@@ -1459,58 +1454,23 @@ out_nolock:
 	return result;
 }
 
-static void collapse_mthp_stack_push(struct collapse_control *cc, int *stack_size,
-				     u16 offset, u8 order)
-{
-	const int size = *stack_size;
-	struct mthp_range *stack = &cc->mthp_bitmap_stack[size];
-
-	VM_WARN_ON_ONCE(size >= MTHP_STACK_SIZE);
-	stack->order = order;
-	stack->offset = offset;
-	(*stack_size)++;
-}
-
-static struct mthp_range collapse_mthp_stack_pop(struct collapse_control *cc,
-						 int *stack_size)
+/* Return the highest naturally aligned order that fits at @offset within a PMD. */
+static unsigned int max_order_from_offset(unsigned int offset)
 {
-	const int size = *stack_size;
+	if (offset == 0)
+		return HPAGE_PMD_ORDER;
 
-	VM_WARN_ON_ONCE(size <= 0);
-	(*stack_size)--;
-	return cc->mthp_bitmap_stack[size - 1];
-}
-
-static unsigned int collapse_mthp_count_present(struct collapse_control *cc,
-						u16 offset, unsigned int nr_ptes)
-{
-	bitmap_zero(cc->mthp_bitmap_mask, MAX_PTRS_PER_PTE);
-	bitmap_set(cc->mthp_bitmap_mask, offset, nr_ptes);
-	return bitmap_weight_and(cc->mthp_bitmap, cc->mthp_bitmap_mask, MAX_PTRS_PER_PTE);
+	return min_t(unsigned int, __ffs(offset), HPAGE_PMD_ORDER);
 }
 
 /*
  * mthp_collapse() consumes the bitmap that is generated during
  * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
  *
- * Each bit in cc->mthp_bitmap represents a single occupied (!none/zero) page.
- * A stack structure cc->mthp_bitmap_stack is used to check different regions
- * of the bitmap for collapse eligibility. The stack maintains a pair of
- * variables (offset, order), indicating the number of PTEs from the start of
- * the PMD, and the order of the potential collapse candidate respectively. We
- * start at the PMD order and check if it is eligible for collapse; if not, we
- * add two entries to the stack at a lower order to represent the left and right
- * halves of the PTE page table we are examining.
- *
- *                         offset       mid_offset
- *                         |         |
- *                         |         |
- *                         v         v
- *      --------------------------------------
- *      |          cc->mthp_bitmap            |
- *      --------------------------------------
- *                         <-------><------->
- *                          order-1  order-1
+ * Each bit in cc->mthp_present_ptes represents a single occupied (!none/zero)
+ * page. We start at the PMD order and check if it is eligible for collapse;
+ * if not, we check the left and right halves of the PTE page table we are
+ * examining at a lower order.
  *
  * For each of these, we determine how many PTE entries are occupied in the
  * range of PTE entries we propose to collapse, then we compare this to a
@@ -1520,35 +1480,30 @@ static unsigned int collapse_mthp_count_
  * If a collapse is permitted, we attempt to collapse the PTE range into a
  * mTHP.
  */
-static int mthp_collapse(struct mm_struct *mm, unsigned long address,
-		int referenced, int unmapped, struct collapse_control *cc,
-		unsigned long enabled_orders)
+static enum scan_result mthp_collapse(struct mm_struct *mm,
+		unsigned long address, int referenced, int unmapped,
+		struct collapse_control *cc, unsigned long enabled_orders)
 {
 	unsigned int nr_occupied_ptes, nr_ptes, max_ptes_none;
-	int collapsed = 0, stack_size = 0;
+	enum scan_result last_result = SCAN_FAIL;
+	int collapsed = 0;
+	bool alloc_failed = false;
 	unsigned long collapse_address;
-	struct mthp_range range;
-	u16 offset;
-	u8 order;
-
-	collapse_mthp_stack_push(cc, &stack_size, 0, HPAGE_PMD_ORDER);
-
-	while (stack_size) {
-		range = collapse_mthp_stack_pop(cc, &stack_size);
-		order = range.order;
-		offset = range.offset;
+	unsigned int offset = 0;
+	unsigned int order = HPAGE_PMD_ORDER;
+
+	while (offset < HPAGE_PMD_NR) {
 		nr_ptes = 1UL << order;
 
 		if (!test_bit(order, &enabled_orders))
 			goto next_order;
 
 		max_ptes_none = collapse_max_ptes_none(cc, NULL, order);
-
-		nr_occupied_ptes = collapse_mthp_count_present(cc, offset,
-							       nr_ptes);
+		nr_occupied_ptes = bitmap_weight_from(cc->mthp_present_ptes, offset,
+						      offset + nr_ptes);
 
 		if (nr_occupied_ptes >= nr_ptes - max_ptes_none) {
-			int ret;
+			enum scan_result ret;
 
 			collapse_address = address + offset * PAGE_SIZE;
 			ret = collapse_huge_page(mm, collapse_address, referenced,
@@ -1560,8 +1515,11 @@ static int mthp_collapse(struct mm_struc
 				collapsed += nr_ptes;
 				fallthrough;
 			case SCAN_PTE_MAPPED_HUGEPAGE:
-				continue;
+				goto next_offset;
 			/* Cases where lower orders might still succeed */
+			case SCAN_ALLOC_HUGE_PAGE_FAIL:
+				alloc_failed = true;
+				fallthrough;
 			case SCAN_LACK_REFERENCED_PAGE:
 			case SCAN_EXCEED_NONE_PTE:
 			case SCAN_EXCEED_SWAP_PTE:
@@ -1572,27 +1530,48 @@ static int mthp_collapse(struct mm_struc
 			case SCAN_DEL_PAGE_LRU:
 			case SCAN_PTE_NON_PRESENT:
 			case SCAN_PTE_UFFD_WP:
-			case SCAN_ALLOC_HUGE_PAGE_FAIL:
 			case SCAN_PAGE_LAZYFREE:
+				last_result = ret;
 				goto next_order;
 			/* Cases where no further collapse is possible */
+			case SCAN_PMD_MAPPED:
+				fallthrough;
 			default:
-				return collapsed;
+				last_result = ret;
+				goto done;
 			}
 		}
 
 next_order:
-		if ((BIT(order) - 1) & enabled_orders) {
-			const u8 next_order = order - 1;
-			const u16 mid_offset = offset + (nr_ptes / 2);
-
-			collapse_mthp_stack_push(cc, &stack_size, mid_offset,
-						 next_order);
-			collapse_mthp_stack_push(cc, &stack_size, offset,
-						 next_order);
-		}
-	}
-	return collapsed;
+		/*
+		 * Continue with the next smaller order if there is still
+		 * any smaller order enabled. When at the smallest order
+		 * we must always move to the next offset.
+		 */
+		if (order > KHUGEPAGED_MIN_MTHP_ORDER &&
+			(enabled_orders & GENMASK(order - 1, 0))) {
+			order--;
+			continue;
+		}
+next_offset:
+		/*
+		 * Advance past the region we just processed and determine the
+		 * highest order we can attempt next. Since huge pages must be
+		 * naturally aligned, the max order we can attempt next is
+		 * limited by the alignment of the new offset.
+		 * E.g. if we collapsed a order-2 mTHP at offset 0, offset
+		 * becomes 4 and __ffs(4) == 2, so the next attempt starts at
+		 * order 2.
+		 */
+		offset += nr_ptes;
+		order = max_order_from_offset(offset);
+	}
+done:
+	if (collapsed)
+		return SCAN_SUCCEED;
+	if (alloc_failed)
+		return SCAN_ALLOC_HUGE_PAGE_FAIL;
+	return last_result;
 }
 
 static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
@@ -1606,7 +1585,7 @@ static enum scan_result collapse_scan_pm
 	pmd_t *pmd;
 	pte_t *pte, *_pte, pteval;
 	int i;
-	int none_or_zero = 0, shared = 0, nr_collapsed = 0, referenced = 0;
+	int none_or_zero = 0, shared = 0, referenced = 0;
 	enum scan_result result = SCAN_FAIL;
 	struct page *page = NULL;
 	struct folio *folio = NULL;
@@ -1623,11 +1602,11 @@ static enum scan_result collapse_scan_pm
 		goto out;
 	}
 
-	bitmap_zero(cc->mthp_bitmap, MAX_PTRS_PER_PTE);
+	bitmap_zero(cc->mthp_present_ptes, MAX_PTRS_PER_PTE);
 	memset(cc->node_load, 0, sizeof(cc->node_load));
 	nodes_clear(cc->alloc_nmask);
 
-	enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, tva_flags);
+	enabled_orders = collapse_possible_orders(vma, vma->vm_flags, tva_flags);
 
 	/*
 	 * If PMD is the only enabled order, enforce max_ptes_none, otherwise
@@ -1730,7 +1709,7 @@ static enum scan_result collapse_scan_pm
 		}
 
 		/* Set bit for occupied pages */
-		__set_bit(i, cc->mthp_bitmap);
+		__set_bit(i, cc->mthp_present_ptes);
 		/*
 		 * Record which node the original page is from and save this
 		 * information to cc->node_load[].
@@ -1789,11 +1768,10 @@ out_unmap:
 	if (result == SCAN_SUCCEED) {
 		/* collapse_huge_page expects the lock to be dropped before calling */
 		mmap_read_unlock(mm);
-		nr_collapsed = mthp_collapse(mm, start_addr, referenced,
-					     unmapped, cc, enabled_orders);
+		result = mthp_collapse(mm, start_addr, referenced,
+				       unmapped, cc, enabled_orders);
 		/* mmap_lock was released above, set lock_dropped */
 		*lock_dropped = true;
-		result = nr_collapsed ? SCAN_SUCCEED : SCAN_FAIL;
 	}
 out:
 	trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
@@ -2892,7 +2870,7 @@ static void collapse_scan_mm_slot(unsign
 			cc->progress++;
 			break;
 		}
-		if (!collapse_allowable_orders(vma, vma->vm_flags, TVA_KHUGEPAGED)) {
+		if (!collapse_possible(vma, vma->vm_flags, TVA_KHUGEPAGED)) {
 			cc->progress++;
 			continue;
 		}
@@ -3202,7 +3180,7 @@ int madvise_collapse(struct vm_area_stru
 	BUG_ON(vma->vm_start > start);
 	BUG_ON(vma->vm_end < end);
 
-	if (!collapse_allowable_orders(vma, vma->vm_flags, TVA_FORCED_COLLAPSE))
+	if (!collapse_possible(vma, vma->vm_flags, TVA_FORCED_COLLAPSE))
 		return -EINVAL;
 
 	cc = kmalloc_obj(*cc);
_


^ permalink raw reply

* Re: [PATCH mm-unstable v19 11/14] mm/khugepaged: Introduce mTHP collapse support
From: Lance Yang @ 2026-06-06 10:28 UTC (permalink / raw)
  To: npache
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, liam, ljs, mathieu.desnoyers, matthew.brost, mhiramat,
	mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260605161422.213817-12-npache@redhat.com>


On Fri, Jun 05, 2026 at 10:14:18AM -0600, Nico Pache wrote:
>Enable khugepaged to collapse to mTHP orders. This patch implements the
>main scanning logic using a bitmap to track occupied pages and the
>algorithm to find optimal collapse sizes.
>
>Prior to this patch, PMD collapse had 3 main phases: a lightweight
>scanning phase (mmap_read_lock) that determines a potential PMD
>collapse, an alloc phase (mmap unlocked), and finally a heavier collapse
>phase (mmap_write_lock).
>
>To enable mTHP collapse we make the following changes:
>
>During PMD scan phase, track occupied pages in a bitmap. When mTHP
>orders are enabled, we remove the restriction of max_ptes_none during the
>scan phase to avoid missing potential mTHP collapse candidates. Once we
>have scanned the full PMD range and updated the bitmap to track occupied
>pages, we use the bitmap to find the optimal mTHP size.
>
>Implement mthp_collapse() to walk forward through the bitmap and
>determine the best eligible order for each naturally-aligned region. The
>algorithm starts at the beginning of the PMD range and, for each offset,
>tries the highest order that fits the alignment. If the number of
>occupied PTEs in that region satisfies the max_ptes_none threshold for
>that order, a collapse is attempted. On failure, the order is
>decremented and the same offset is retried at the next smaller size. Once
>the smallest enabled order is exhausted (or a collapse succeeds), the
>offset advances past the region just processed, and the next attempt
>starts at the highest order permitted by the new offset's natural
>alignment.
>
>The algorithm works as follows:
>    1) set offset=0 and order=HPAGE_PMD_ORDER
>    2) if the order is not enabled, go to step (5)
>    3) count occupied PTEs in the (offset, order) range using
>       bitmap_weight_from()
>    4) if the count satisfies the max_ptes_none threshold, attempt
>       collapse; on success, advance to step (6)
>    5) if a smaller enabled order exists, decrement order and retry
>       from step (2) at the same offset
>    6) advance offset past the current region and compute the next
>       order from the new offset's natural alignment via __ffs(offset),
>       capped at HPAGE_PMD_ORDER
>    7) repeat from step (2) until the full PMD range is covered
>
>mTHP collapses reject regions containing swapped out or shared pages.
>This is because adding new entries can lead to new none pages, and these
>may lead to constant promotion into a higher order mTHP. A similar
>issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
>introducing at least 2x the number of pages, and on a future scan will
>satisfy the promotion condition once again. This issue is prevented via
>the collapse_max_ptes_none() function which imposes the max_ptes_none
>restrictions above.
>
>We currently only support mTHP collapse for max_ptes_none values of 0
>and HPAGE_PMD_NR - 1, resulting in the following behavior:
>
>    - max_ptes_none=0: Never introduce new empty pages during collapse
>    - max_ptes_none=HPAGE_PMD_NR-1: Always try collapse to the highest
>      available mTHP order
>
>Any other max_ptes_none value will emit a warning and default mTHP
>collapse to max_ptes_none=0. There should be no behavior change for PMD
>collapse.
>
>Once we determine which mTHP sizes fit best in that PMD range, a collapse
>is attempted. A minimum collapse order of 2 is used as this is the lowest
>order supported by anon memory as defined by THP_ORDERS_ALL_ANON.
>
>Currently madv_collapse is not supported and will only attempt PMD
>collapse.
>
>We can also remove the check for is_khugepaged inside the PMD scan as
>the collapse_max_ptes_none() function handles this logic now.
>
>Signed-off-by: Nico Pache <npache@redhat.com>
>---
> mm/khugepaged.c | 146 +++++++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 138 insertions(+), 8 deletions(-)
>
>diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>index ec886a031952..430047316f43 100644
>--- a/mm/khugepaged.c
>+++ b/mm/khugepaged.c
>@@ -99,6 +99,8 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
> 
> static struct kmem_cache *mm_slot_cache __ro_after_init;
> 
>+#define KHUGEPAGED_MIN_MTHP_ORDER	2
>+
> struct collapse_control {
> 	bool is_khugepaged;
> 
>@@ -110,6 +112,9 @@ struct collapse_control {
> 
> 	/* nodemask for allocation fallback */
> 	nodemask_t alloc_nmask;
>+
>+	/* Each bit represents a single occupied (!none/zero) page. */
>+	DECLARE_BITMAP(mthp_present_ptes, MAX_PTRS_PER_PTE);
> };
> 
> /**
>@@ -1440,20 +1445,130 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long s
> 	return result;
> }
> 
>+/* Return the highest naturally aligned order that fits at @offset within a PMD. */
>+static unsigned int max_order_from_offset(unsigned int offset)
>+{
>+	if (offset == 0)
>+		return HPAGE_PMD_ORDER;
>+
>+	return min_t(unsigned int, __ffs(offset), HPAGE_PMD_ORDER);
>+}
>+
>+/*
>+ * mthp_collapse() consumes the bitmap that is generated during
>+ * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
>+ *
>+ * Each bit in cc->mthp_present_ptes represents a single occupied (!none/zero)
>+ * page. We start at the PMD order and check if it is eligible for collapse;
>+ * if not, we check the left and right halves of the PTE page table we are
>+ * examining at a lower order.
>+ *
>+ * For each of these, we determine how many PTE entries are occupied in the
>+ * range of PTE entries we propose to collapse, then we compare this to a
>+ * threshold number of PTE entries which would need to be occupied for a
>+ * collapse to be permitted at that order (accounting for max_ptes_none).
>+ *
>+ * If a collapse is permitted, we attempt to collapse the PTE range into a
>+ * mTHP.
>+ */
>+static enum scan_result mthp_collapse(struct mm_struct *mm,
>+		unsigned long address, int referenced, int unmapped,
>+		struct collapse_control *cc, unsigned long enabled_orders)
>+{
>+	unsigned int nr_occupied_ptes, nr_ptes, max_ptes_none;
>+	enum scan_result last_result = SCAN_FAIL;
>+	int collapsed = 0;
>+	bool alloc_failed = false;
>+	unsigned long collapse_address;
>+	unsigned int offset = 0;
>+	unsigned int order = HPAGE_PMD_ORDER;
>+
>+	while (offset < HPAGE_PMD_NR) {
>+		nr_ptes = 1UL << order;
>+
>+		if (!test_bit(order, &enabled_orders))
>+			goto next_order;
>+
>+		max_ptes_none = collapse_max_ptes_none(cc, NULL, order);
>+		nr_occupied_ptes = bitmap_weight_from(cc->mthp_present_ptes, offset,
>+						      offset + nr_ptes);
>+
>+		if (nr_occupied_ptes >= nr_ptes - max_ptes_none) {

Looks broken for swap PTEs in PMD collapse ...

collapse_scan_pmd() allows them up to max_ptes_swap and records them in
unmapped, but they don't get a bit in mthp_present_ptes. And then
mthp_collapse() does the check above:

nr_occupied_ptes >= nr_ptes - max_ptes_none

So max_ptes_none=0 + 511 present PTEs + one allowed swap PTE won't even
call collapse_huge_page() for PMD order.

Shouldn't we account for them in the PMD-order check? Something like:

if (is_pmd_order(order))
	nr_occupied_ptes += unmapped;

Cheers, Lance
 
>+			enum scan_result ret;
>+
>+			collapse_address = address + offset * PAGE_SIZE;
>+			ret = collapse_huge_page(mm, collapse_address, referenced,
>+						 unmapped, cc, order);
>+			switch (ret) {
>+			/* Cases where we continue to next collapse candidate */
>+			case SCAN_SUCCEED:
>+				collapsed += nr_ptes;
>+				fallthrough;
>+			case SCAN_PTE_MAPPED_HUGEPAGE:
>+				goto next_offset;
>+			/* Cases where lower orders might still succeed */
>+			case SCAN_ALLOC_HUGE_PAGE_FAIL:
>+				alloc_failed = true;
>+				last_result = ret;
>+				goto next_order;
>+			/* Cases where no further collapse is possible */
>+			case SCAN_PMD_MAPPED:
>+				fallthrough;
>+			default:
>+				last_result = ret;
>+				goto done;
>+			}
>+		}
>+
>+next_order:
>+		/*
>+		 * Continue with the next smaller order if there is still
>+		 * any smaller order enabled. When at the smallest order
>+		 * we must always move to the next offset.
>+		 */
>+		if (order > KHUGEPAGED_MIN_MTHP_ORDER &&
>+			(enabled_orders & GENMASK(order - 1, 0))) {
>+			order--;
>+			continue;
>+		}
>+next_offset:
>+		/*
>+		 * Advance past the region we just processed and determine the
>+		 * highest order we can attempt next. Since huge pages must be
>+		 * naturally aligned, the max order we can attempt next is
>+		 * limited by the alignment of the new offset.
>+		 * E.g. if we collapsed a order-2 mTHP at offset 0, offset
>+		 * becomes 4 and __ffs(4) == 2, so the next attempt starts at
>+		 * order 2.
>+		 */
>+		offset += nr_ptes;
>+		order = max_order_from_offset(offset);
>+	}
>+done:
>+	if (collapsed)
>+		return SCAN_SUCCEED;
>+	if (alloc_failed)
>+		return SCAN_ALLOC_HUGE_PAGE_FAIL;
>+	return last_result;
>+}
>+
> static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
> 		struct vm_area_struct *vma, unsigned long start_addr,
> 		bool *lock_dropped, struct collapse_control *cc)
> {
>-	const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
> 	const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, HPAGE_PMD_ORDER);
> 	const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
>+	unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
>+	enum tva_type tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
> 	pmd_t *pmd;
>-	pte_t *pte, *_pte;
>+	pte_t *pte, *_pte, pteval;
>+	int i;
> 	int none_or_zero = 0, shared = 0, referenced = 0;
> 	enum scan_result result = SCAN_FAIL;
> 	struct page *page = NULL;
> 	struct folio *folio = NULL;
> 	unsigned long addr;
>+	unsigned long enabled_orders;
> 	spinlock_t *ptl;
> 	int node = NUMA_NO_NODE, unmapped = 0;
> 
>@@ -1465,8 +1580,19 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
> 		goto out;
> 	}
> 
>+	bitmap_zero(cc->mthp_present_ptes, MAX_PTRS_PER_PTE);
> 	memset(cc->node_load, 0, sizeof(cc->node_load));
> 	nodes_clear(cc->alloc_nmask);
>+
>+	enabled_orders = collapse_possible_orders(vma, vma->vm_flags, tva_flags);
>+
>+	/*
>+	 * If PMD is the only enabled order, enforce max_ptes_none, otherwise
>+	 * scan all pages to populate the bitmap for mTHP collapse.
>+	 */
>+	if (enabled_orders != BIT(HPAGE_PMD_ORDER))
>+		max_ptes_none = KHUGEPAGED_MAX_PTES_LIMIT;
>+
> 	pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
> 	if (!pte) {
> 		cc->progress++;
>@@ -1474,11 +1600,13 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
> 		goto out;
> 	}
> 
>-	for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
>-	     _pte++, addr += PAGE_SIZE) {
>+	for (i = 0; i < HPAGE_PMD_NR; i++) {
>+		_pte = pte + i;
>+		addr = start_addr + i * PAGE_SIZE;
>+		pteval = ptep_get(_pte);
>+
> 		cc->progress++;
> 
>-		pte_t pteval = ptep_get(_pte);
> 		if (pte_none_or_zero(pteval)) {
> 			if (++none_or_zero > max_ptes_none) {
> 				result = SCAN_EXCEED_NONE_PTE;
>@@ -1558,6 +1686,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
> 			}
> 		}
> 
>+		/* Set bit for occupied pages */
>+		__set_bit(i, cc->mthp_present_ptes);
> 		/*
> 		 * Record which node the original page is from and save this
> 		 * information to cc->node_load[].
>@@ -1616,9 +1746,9 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
> 	if (result == SCAN_SUCCEED) {
> 		/* collapse_huge_page expects the lock to be dropped before calling */
> 		mmap_read_unlock(mm);
>-		result = collapse_huge_page(mm, start_addr, referenced,
>-					    unmapped, cc, HPAGE_PMD_ORDER);
>-		/* collapse_huge_page will return with the mmap_lock released */
>+		result = mthp_collapse(mm, start_addr, referenced,
>+				       unmapped, cc, enabled_orders);
>+		/* mmap_lock was released above, set lock_dropped */
> 		*lock_dropped = true;
> 	}
> out:
>-- 
>2.54.0
>
>

^ permalink raw reply

* [PATCHv8 bpf-next 00/29] bpf: tracing_multi link
From: Jiri Olsa @ 2026-06-06 12:39 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: Hengqi Chen, bpf, linux-trace-kernel, Martin KaFai Lau,
	Eduard Zingerman, Song Liu, Yonghong Song, Menglong Dong,
	Steven Rostedt

hi,
adding tracing_multi link support that allows fast attachment
of tracing program to many functions.

RFC: https://lore.kernel.org/bpf/20260203093819.2105105-1-jolsa@kernel.org/
v1: https://lore.kernel.org/bpf/20260220100649.628307-1-jolsa@kernel.org/
v2: https://lore.kernel.org/bpf/20260304222141.497203-1-jolsa@kernel.org/
v3: https://lore.kernel.org/bpf/20260316075138.465430-1-jolsa@kernel.org/
v4: https://lore.kernel.org/bpf/20260324081846.2334094-1-jolsa@kernel.org/
v5: https://lore.kernel.org/bpf/20260417192502.194548-1-jolsa@kernel.org/
v6: https://lore.kernel.org/bpf/20260527113951.46265-1-jolsa@kernel.org/
v7: https://lore.kernel.org/bpf/20260603110554.29590-1-jolsa@kernel.org/

v8 changes:
- add back the btf_is_union check to btf_get_type_size [sashiko]

v7 changes:
- added ftrace_hash_count stub for !CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS case [sashiko]
- selftests fixes [sashiko]
- use hash_ptr in select_trampoline_lock [sashiko]
- changed the check duplicate logic in check_dup_ids [sashiko]
- use sort_r_nonatomic in check_dup_ids [sashiko]
- added BPF_TRACE_FSESSION_MULTI to can_be_sleepable,
  plus added testcase for sleepable fsession
- make bpf_tracing_multi_opts pointer fields as const
- add ___migrate_enable to trace_blacklist

v6 changes:
- move ftrace_hash_count declaration under CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS [sashiko]
- fix ftrace_hash_remove check/deref [sashiko]
- disable context access for multi programs by using stub function with no arguments 
  for verification [sashiko]
- add __used for bpf_multi_func, and removed arguments, we do not allow direct access [sashiko]
- rebased on latest loongarch changes, fix ppc build
- guard update_ftrace_direct_del with ftrace_hash_count on rollback [sashiko]
- fix noreturn attachment condition in bpf_check_attach_btf_id_multi [sashiko]
- fail early on multiple same IDs provided by user [sashiko]
- fix selftests error paths [sashiko]
- add MAX_RESOLVE_DEPTH check to btf_get_type_size [sashiko]
- use btf__pointer_size [sashiko]
- fixed compilation on powerpc [sashiko]
- added verifier fails selftest
- after discussing with Song, it was determined that cleaning up FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER
  is not strictly necessary — keeping the trampoline in the ipmodify_enabled state is acceptable.
  The race condition this introduces remains unlikely, so the concern raised in [1] will not be
  addressed at this time.
  [1] https://lore.kernel.org/bpf/aec7bAbGlnEo3R1g@krava/

v5 changes:
- add dedicated hashes used for detach, so there's no need to allocate
  them on detach [sashiko]
- safely release old trampoline images [sashiko]
- add cond_resched() to couple of loops [sashiko]
- validate attr->link_create.target_fd [sashiko]
- allow only bpf_get_func_ret() for return value retrieval [sashiko]
- do not allow attachment of fexit/fsession_multi for noreturn functions [sashiko]
- fixed double free/close in libbpf btf cleanup, in separate patch [sashiko]
- make btf_type_is_traceable_func closer to btf_distill_func_proto [sashiko]
- add prog->attach_btf_obj_fd check to collect_func_ids_by_glob,
  to check we don't load module programs for kernel [sashiko]
- make sure program is loaded in bpf_program__attach_tracing_multi [sashiko]
- several selftests fixes [sashiko]
- add attach_type to fdinfo output [Leon Hwang]
- selftests cleanup fixes [Leon Hwang]

v4 changes:
- unlink rollback fix (added ftrace_hash_count) [bot]
- use const for some bpf_link_create_opts tracing_multi members [bot]
- adding missing comment for lockdep keys [bot]
- selftest error path fixes (leaks) and other assorted test fixes [Leon Hwang]
- several compile fixes wrt CONFIG_BPF_SYSCALL and CONFIG_BPF_JIT [kernel test robot]
- make ftrace_hash_clear global, because it's needed in rollback

v3 changes:
- fix module parsing [Leon Hwang]
- use function traceable check from libbpf [Leon Hwang]
- use ptr_to_u64 and fix/updated few comments [ci]
- display cookies as decimal numbers [ci]
- added link_create.flags check [ci]
- fix error path in bpf_trampoline_multi_detach [ci]
- make fentry/fexit.multi not extendable [ci]
- add missing OPTS_VALID to bpf_program__attach_tracing_multi [ci]

v2 changes:
- allocate data.unreg in bpf_trampoline_multi_attach for rollback path [ci]
  and fixed link count setup in rollback path [ci]
- several small assorted fixes [ci]
- added loongarch and powerpc changes for struct bpf_tramp_node change
- added support to attach functions from modules
- added tests for sleepable programs
- added rollback tests

v1 changes:
- added ftrace_hash_count as wrapper for hash_count [Steven]
- added trampoline mutex pool [Andrii]
- reworked 'struct bpf_tramp_node' separation [Andrii]
  - the 'struct bpf_tramp_node' now holds pointer to bpf_link,
    which is similar to what we do for uprobe_multi;
    I understand it's not a fundamental change compared to previous
    version which used bpf_prog pointer instead, but I don't see better
    way of doing this.. I'm happy to discuss this further if there's
    better idea
- reworked 'struct bpf_fsession_link' based on bpf_tramp_node
- made btf__find_by_glob_kind function internal helper [Andrii]
- many small assorted fixes [Andrii,CI]
- added session support [Leon Hwang]
- added cookies support
- added more tests


Note I plan to send linkinfo support separately, the patchset is big enough.

thanks,
jirka


Cc: Hengqi Chen <hengqi.chen@gmail.com>
---
Jiri Olsa (29):
      ftrace: Add ftrace_hash_count function
      ftrace: Add ftrace_hash_remove function
      ftrace: Add add_ftrace_hash_entry function
      bpf: Use mutex lock pool for bpf trampolines
      bpf: Add struct bpf_trampoline_ops object
      bpf: Move trampoline image setup into bpf_trampoline_ops callbacks
      bpf: Add bpf_trampoline_add/remove_prog functions
      bpf: Add struct bpf_tramp_node object
      bpf: Factor fsession link to use struct bpf_tramp_node
      bpf: Add multi tracing attach types
      bpf: Move sleepable verification code to btf_id_allow_sleepable
      bpf: Add bpf_trampoline_multi_attach/detach functions
      bpf: Add support for tracing multi link
      bpf: Add support for tracing_multi link cookies
      bpf: Add support for tracing_multi link session
      bpf: Add support for tracing_multi link fdinfo
      libbpf: Add bpf_object_cleanup_btf function
      libbpf: Add bpf_link_create support for tracing_multi link
      libbpf: Add btf_type_is_traceable_func function
      libbpf: Add support to create tracing multi link
      selftests/bpf: Add tracing multi skel/pattern/ids attach tests
      selftests/bpf: Add tracing multi skel/pattern/ids module attach tests
      selftests/bpf: Add tracing multi intersect tests
      selftests/bpf: Add tracing multi cookies test
      selftests/bpf: Add tracing multi session test
      selftests/bpf: Add tracing multi attach fails test
      selftests/bpf: Add tracing multi verifier fails test
      selftests/bpf: Add tracing multi attach benchmark test
      selftests/bpf: Add tracing multi attach rollback tests

 arch/arm64/net/bpf_jit_comp.c                                      |  58 ++--
 arch/loongarch/net/bpf_jit.c                                       |  52 ++--
 arch/powerpc/net/bpf_jit_comp.c                                    |  54 ++--
 arch/riscv/net/bpf_jit_comp64.c                                    |  52 ++--
 arch/s390/net/bpf_jit_comp.c                                       |  44 +--
 arch/x86/net/bpf_jit_comp.c                                        |  54 ++--
 include/linux/bpf.h                                                | 117 ++++++--
 include/linux/bpf_types.h                                          |   1 +
 include/linux/bpf_verifier.h                                       |   4 +
 include/linux/btf_ids.h                                            |   1 +
 include/linux/ftrace.h                                             |   9 +
 include/linux/trace_events.h                                       |   6 +
 include/uapi/linux/bpf.h                                           |   9 +
 kernel/bpf/bpf_struct_ops.c                                        |  27 +-
 kernel/bpf/fixups.c                                                |   2 +
 kernel/bpf/syscall.c                                               |  83 +++---
 kernel/bpf/trampoline.c                                            | 670 +++++++++++++++++++++++++++++++++----------
 kernel/bpf/verifier.c                                              | 183 ++++++++++--
 kernel/trace/bpf_trace.c                                           | 204 +++++++++++++-
 kernel/trace/ftrace.c                                              |  35 ++-
 net/bpf/bpf_dummy_struct_ops.c                                     |  14 +-
 net/bpf/test_run.c                                                 |   3 +
 tools/include/uapi/linux/bpf.h                                     |  10 +
 tools/lib/bpf/bpf.c                                                |   9 +
 tools/lib/bpf/bpf.h                                                |   5 +
 tools/lib/bpf/libbpf.c                                             | 375 ++++++++++++++++++++++++-
 tools/lib/bpf/libbpf.h                                             |  15 +
 tools/lib/bpf/libbpf.map                                           |   1 +
 tools/lib/bpf/libbpf_internal.h                                    |   1 +
 tools/testing/selftests/bpf/Makefile                               |   9 +-
 tools/testing/selftests/bpf/prog_tests/tracing_multi.c             | 936 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/progs/tracing_multi_attach.c           |  39 +++
 tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c    |  25 ++
 tools/testing/selftests/bpf/progs/tracing_multi_bench.c            |  12 +
 tools/testing/selftests/bpf/progs/tracing_multi_check.c            | 214 ++++++++++++++
 tools/testing/selftests/bpf/progs/tracing_multi_fail.c             |  18 ++
 tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c |  41 +++
 tools/testing/selftests/bpf/progs/tracing_multi_rollback.c         |  43 +++
 tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c   |  65 +++++
 tools/testing/selftests/bpf/progs/tracing_multi_verifier.c         |  31 ++
 tools/testing/selftests/bpf/trace_helpers.c                        |   7 +-
 tools/testing/selftests/bpf/trace_helpers.h                        |   1 +
 42 files changed, 3110 insertions(+), 429 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/tracing_multi.c
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_attach.c
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_bench.c
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_check.c
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_fail.c
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_rollback.c
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_verifier.c

^ permalink raw reply

* [PATCHv8 bpf-next 01/29] ftrace: Add ftrace_hash_count function
From: Jiri Olsa @ 2026-06-06 12:39 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260606123955.345967-1-jolsa@kernel.org>

Adding external ftrace_hash_count function so we could get hash
count outside of ftrace object.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 include/linux/ftrace.h | 7 +++++++
 kernel/trace/ftrace.c  | 7 ++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 28b30c6f1031..02c24bf766ce 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -551,6 +551,8 @@ int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, b
 
 void ftrace_stub_direct_tramp(void);
 
+unsigned long ftrace_hash_count(struct ftrace_hash *hash);
+
 #else
 struct ftrace_ops;
 static inline unsigned long ftrace_find_rec_direct(unsigned long ip)
@@ -590,6 +592,11 @@ static inline int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace
 	return -ENODEV;
 }
 
+static inline unsigned long ftrace_hash_count(struct ftrace_hash *hash)
+{
+	return 0;
+}
+
 /*
  * This must be implemented by the architecture.
  * It is the way the ftrace direct_ops helper, when called
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b2611de3f594..57ab01fd00bd 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6288,11 +6288,16 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(modify_ftrace_direct);
 
-static unsigned long hash_count(struct ftrace_hash *hash)
+static inline unsigned long hash_count(struct ftrace_hash *hash)
 {
 	return hash ? hash->count : 0;
 }
 
+unsigned long ftrace_hash_count(struct ftrace_hash *hash)
+{
+	return hash_count(hash);
+}
+
 /**
  * hash_add - adds two struct ftrace_hash and returns the result
  * @a: struct ftrace_hash object
-- 
2.54.0


^ permalink raw reply related

* [PATCHv8 bpf-next 02/29] ftrace: Add ftrace_hash_remove function
From: Jiri Olsa @ 2026-06-06 12:39 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260606123955.345967-1-jolsa@kernel.org>

Adding ftrace_hash_remove function that removes all entries
from struct ftrace_hash object without freeing them.

It will be used in following changes where entries are allocated
as part of another structure and are freed separately.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 include/linux/ftrace.h |  1 +
 kernel/trace/ftrace.c  | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 02c24bf766ce..b55ec9b25bb3 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -415,6 +415,7 @@ struct ftrace_hash *alloc_ftrace_hash(int size_bits);
 void free_ftrace_hash(struct ftrace_hash *hash);
 struct ftrace_func_entry *add_ftrace_hash_entry_direct(struct ftrace_hash *hash,
 						       unsigned long ip, unsigned long direct);
+void ftrace_hash_remove(struct ftrace_hash *hash);
 
 /* The hash used to know what functions callbacks trace */
 struct ftrace_ops_hash {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 57ab01fd00bd..45548b0200eb 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1249,6 +1249,25 @@ remove_hash_entry(struct ftrace_hash *hash,
 	hash->count--;
 }
 
+void ftrace_hash_remove(struct ftrace_hash *hash)
+{
+	struct ftrace_func_entry *entry;
+	struct hlist_head *hhd;
+	struct hlist_node *tn;
+	int size;
+	int i;
+
+	if (!hash || !hash->count)
+		return;
+	size = 1 << hash->size_bits;
+	for (i = 0; i < size; i++) {
+		hhd = &hash->buckets[i];
+		hlist_for_each_entry_safe(entry, tn, hhd, hlist)
+			remove_hash_entry(hash, entry);
+	}
+	FTRACE_WARN_ON(hash->count);
+}
+
 static void ftrace_hash_clear(struct ftrace_hash *hash)
 {
 	struct hlist_head *hhd;
-- 
2.54.0


^ permalink raw reply related

* [PATCHv8 bpf-next 03/29] ftrace: Add add_ftrace_hash_entry function
From: Jiri Olsa @ 2026-06-06 12:39 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260606123955.345967-1-jolsa@kernel.org>

Rename __add_hash_entry() to add_ftrace_hash_entry() and make it global;
it will be used outside of ftrace.c in the following changes.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 include/linux/ftrace.h | 1 +
 kernel/trace/ftrace.c  | 9 ++++-----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index b55ec9b25bb3..02bc5027523a 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -415,6 +415,7 @@ struct ftrace_hash *alloc_ftrace_hash(int size_bits);
 void free_ftrace_hash(struct ftrace_hash *hash);
 struct ftrace_func_entry *add_ftrace_hash_entry_direct(struct ftrace_hash *hash,
 						       unsigned long ip, unsigned long direct);
+void add_ftrace_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry);
 void ftrace_hash_remove(struct ftrace_hash *hash);
 
 /* The hash used to know what functions callbacks trace */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 45548b0200eb..f93e34dd2328 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1198,8 +1198,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
 	return __ftrace_lookup_ip(hash, ip);
 }
 
-static void __add_hash_entry(struct ftrace_hash *hash,
-			     struct ftrace_func_entry *entry)
+void add_ftrace_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry)
 {
 	struct hlist_head *hhd;
 	unsigned long key;
@@ -1221,7 +1220,7 @@ add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigne
 
 	entry->ip = ip;
 	entry->direct = direct;
-	__add_hash_entry(hash, entry);
+	add_ftrace_hash_entry(hash, entry);
 
 	return entry;
 }
@@ -1477,7 +1476,7 @@ static struct ftrace_hash *__move_hash(struct ftrace_hash *src, int size)
 		hhd = &src->buckets[i];
 		hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
 			remove_hash_entry(src, entry);
-			__add_hash_entry(new_hash, entry);
+			add_ftrace_hash_entry(new_hash, entry);
 		}
 	}
 	return new_hash;
@@ -5360,7 +5359,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
 	map->entry.ip = ip;
 	map->data = data;
 
-	__add_hash_entry(&mapper->hash, &map->entry);
+	add_ftrace_hash_entry(&mapper->hash, &map->entry);
 
 	return 0;
 }
-- 
2.54.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox