Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [PATCH 7.2 v16 08/13] mm/khugepaged: improve tracepoints for mTHP orders
From: Nico Pache @ 2026-04-19 18:57 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, npache,
	peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-1-npache@redhat.com>

Add the order to the mm_collapse_huge_page<_swapin,_isolate> tracepoints to
give better insight into what order is being operated at for.

Reviewed-by: Lorenzo Stoakes <ljs@kernel.org> 
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Nico Pache <npache@redhat.com>
---
 include/trace/events/huge_memory.h | 34 +++++++++++++++++++-----------
 mm/khugepaged.c                    |  9 ++++----
 2 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index bcdc57eea270..291fae364c62 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -89,40 +89,44 @@ TRACE_EVENT(mm_khugepaged_scan_pmd,
 
 TRACE_EVENT(mm_collapse_huge_page,
 
-	TP_PROTO(struct mm_struct *mm, int isolated, int status),
+	TP_PROTO(struct mm_struct *mm, int isolated, int status, unsigned int order),
 
-	TP_ARGS(mm, isolated, status),
+	TP_ARGS(mm, isolated, status, order),
 
 	TP_STRUCT__entry(
 		__field(struct mm_struct *, mm)
 		__field(int, isolated)
 		__field(int, status)
+		__field(unsigned int, order)
 	),
 
 	TP_fast_assign(
 		__entry->mm = mm;
 		__entry->isolated = isolated;
 		__entry->status = status;
+		__entry->order = order;
 	),
 
-	TP_printk("mm=%p, isolated=%d, status=%s",
+	TP_printk("mm=%p, isolated=%d, status=%s, order=%u",
 		__entry->mm,
 		__entry->isolated,
-		__print_symbolic(__entry->status, SCAN_STATUS))
+		__print_symbolic(__entry->status, SCAN_STATUS),
+		__entry->order)
 );
 
 TRACE_EVENT(mm_collapse_huge_page_isolate,
 
 	TP_PROTO(struct folio *folio, int none_or_zero,
-		 int referenced, int status),
+		 int referenced, int status, unsigned int order),
 
-	TP_ARGS(folio, none_or_zero, referenced, status),
+	TP_ARGS(folio, none_or_zero, referenced, status, order),
 
 	TP_STRUCT__entry(
 		__field(unsigned long, pfn)
 		__field(int, none_or_zero)
 		__field(int, referenced)
 		__field(int, status)
+		__field(unsigned int, order)
 	),
 
 	TP_fast_assign(
@@ -130,26 +134,30 @@ TRACE_EVENT(mm_collapse_huge_page_isolate,
 		__entry->none_or_zero = none_or_zero;
 		__entry->referenced = referenced;
 		__entry->status = status;
+		__entry->order = order;
 	),
 
-	TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s",
+	TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s, order=%u",
 		__entry->pfn,
 		__entry->none_or_zero,
 		__entry->referenced,
-		__print_symbolic(__entry->status, SCAN_STATUS))
+		__print_symbolic(__entry->status, SCAN_STATUS),
+		__entry->order)
 );
 
 TRACE_EVENT(mm_collapse_huge_page_swapin,
 
-	TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret),
+	TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret,
+		 unsigned int order),
 
-	TP_ARGS(mm, swapped_in, referenced, ret),
+	TP_ARGS(mm, swapped_in, referenced, ret, order),
 
 	TP_STRUCT__entry(
 		__field(struct mm_struct *, mm)
 		__field(int, swapped_in)
 		__field(int, referenced)
 		__field(int, ret)
+		__field(unsigned int, order)
 	),
 
 	TP_fast_assign(
@@ -157,13 +165,15 @@ TRACE_EVENT(mm_collapse_huge_page_swapin,
 		__entry->swapped_in = swapped_in;
 		__entry->referenced = referenced;
 		__entry->ret = ret;
+		__entry->order = order;
 	),
 
-	TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d",
+	TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d, order=%u",
 		__entry->mm,
 		__entry->swapped_in,
 		__entry->referenced,
-		__entry->ret)
+		__entry->ret,
+		__entry->order)
 );
 
 TRACE_EVENT(mm_khugepaged_scan_file,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0a1c7cc20c0e..a4f1c570b69b 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -780,13 +780,13 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
 	} else {
 		result = SCAN_SUCCEED;
 		trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
-						    referenced, result);
+						    referenced, result, order);
 		return result;
 	}
 out:
 	release_pte_pages(pte, _pte, compound_pagelist);
 	trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
-					    referenced, result);
+					    referenced, result, order);
 	return result;
 }
 
@@ -1180,7 +1180,8 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
 
 	result = SCAN_SUCCEED;
 out:
-	trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
+	trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result,
+					   order);
 	return result;
 }
 
@@ -1376,7 +1377,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long s
 out_nolock:
 	if (folio)
 		folio_put(folio);
-	trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
+	trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result, order);
 	return result;
 }
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH 7.2 v16 07/13] mm/khugepaged: add per-order mTHP collapse failure statistics
From: Nico Pache @ 2026-04-19 18:57 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, npache,
	peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-1-npache@redhat.com>

Add three new mTHP statistics to track collapse failures for different
orders when encountering swap PTEs, excessive none PTEs, and shared PTEs:

- collapse_exceed_swap_pte: Increment when mTHP collapse fails due to swap
	PTEs

- collapse_exceed_none_pte: Counts when mTHP collapse fails due to
  	exceeding the none PTE threshold for the given order

- collapse_exceed_shared_pte: Counts when mTHP collapse fails due to shared
  	PTEs

These statistics complement the existing THP_SCAN_EXCEED_* events by
providing per-order granularity for mTHP collapse attempts. The stats are
exposed via sysfs under
`/sys/kernel/mm/transparent_hugepage/hugepages-*/stats/` for each
supported hugepage size.

As we currently dont support collapsing mTHPs that contain a swap or
shared entry, those statistics keep track of how often we are
encountering failed mTHP collapses due to these restrictions.

Now that we plan to support mTHP collapse for anon pages, lets also track
when this happens at the PMD level within the per-mTHP stats.

Signed-off-by: Nico Pache <npache@redhat.com>
---
 Documentation/admin-guide/mm/transhuge.rst | 24 ++++++++++++++++++++++
 include/linux/huge_mm.h                    |  3 +++
 mm/huge_memory.c                           |  7 +++++++
 mm/khugepaged.c                            | 21 +++++++++++++++++--
 4 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index c51932e6275d..eebb1f6bbc6c 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -714,6 +714,30 @@ nr_anon_partially_mapped
        an anonymous THP as "partially mapped" and count it here, even though it
        is not actually partially mapped anymore.
 
+collapse_exceed_none_pte
+       The number of collapse attempts that failed due to exceeding the
+       max_ptes_none threshold. For mTHP collapse, Currently only max_ptes_none
+       values of 0 and (HPAGE_PMD_NR - 1) are supported. Any other value will
+       emit a warning and no mTHP collapse will be attempted. khugepaged will
+       try to collapse to the largest enabled (m)THP size; if it fails, it will
+       try the next lower enabled mTHP size. This counter records the number of
+       times a collapse attempt was skipped for exceeding the max_ptes_none
+       threshold, and khugepaged will move on to the next available mTHP size.
+
+collapse_exceed_swap_pte
+       The number of anonymous mTHP PTE ranges which were unable to collapse due
+       to containing at least one swap PTE. Currently khugepaged does not
+       support collapsing mTHP regions that contain a swap PTE. This counter can
+       be used to monitor the number of khugepaged mTHP collapses that failed
+       due to the presence of a swap PTE.
+
+collapse_exceed_shared_pte
+       The number of anonymous mTHP PTE ranges which were unable to collapse due
+       to containing at least one shared PTE. Currently khugepaged does not
+       support collapsing mTHP PTE ranges that contain a shared PTE. This
+       counter can be used to monitor the number of khugepaged mTHP collapses
+       that failed due to the presence of a shared PTE.
+
 As the system ages, allocating huge pages may be expensive as the
 system uses memory compaction to copy data around memory to free a
 huge page for use. There are some counters in ``/proc/vmstat`` to help
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ba7ae6808544..48496f09909b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -144,6 +144,9 @@ enum mthp_stat_item {
 	MTHP_STAT_SPLIT_DEFERRED,
 	MTHP_STAT_NR_ANON,
 	MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
+	MTHP_STAT_COLLAPSE_EXCEED_SWAP,
+	MTHP_STAT_COLLAPSE_EXCEED_NONE,
+	MTHP_STAT_COLLAPSE_EXCEED_SHARED,
 	__MTHP_STAT_COUNT
 };
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 345c54133c83..5c128cdec810 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -703,6 +703,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
 DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
 DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
 DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
+
 
 static struct attribute *anon_stats_attrs[] = {
 	&anon_fault_alloc_attr.attr,
@@ -719,6 +723,9 @@ static struct attribute *anon_stats_attrs[] = {
 	&split_deferred_attr.attr,
 	&nr_anon_attr.attr,
 	&nr_anon_partially_mapped_attr.attr,
+	&collapse_exceed_swap_pte_attr.attr,
+	&collapse_exceed_none_pte_attr.attr,
+	&collapse_exceed_shared_pte_attr.attr,
 	NULL,
 };
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 8740d379882e..0a1c7cc20c0e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -646,7 +646,9 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		if (pte_none_or_zero(pteval)) {
 			if (++none_or_zero > max_ptes_none) {
 				result = SCAN_EXCEED_NONE_PTE;
-				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+				if (is_pmd_order(order))
+					count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
 				goto out;
 			}
 			continue;
@@ -680,9 +682,17 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
 		/* See collapse_scan_pmd(). */
 		if (folio_maybe_mapped_shared(folio)) {
+			/*
+			 * TODO: Support shared pages without leading to further
+			 * mTHP collapses. Currently bringing in new pages via
+			 * shared may cause a future higher order collapse on a
+			 * rescan of the same range.
+			 */
 			if (++shared > max_ptes_shared) {
 				result = SCAN_EXCEED_SHARED_PTE;
-				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+				if (is_pmd_order(order))
+					count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
 				goto out;
 			}
 		}
@@ -1129,6 +1139,7 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
 		 * range.
 		 */
 		if (!is_pmd_order(order)) {
+			count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
 			pte_unmap(pte);
 			mmap_read_unlock(mm);
 			result = SCAN_EXCEED_SWAP_PTE;
@@ -1412,6 +1423,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
 			if (++none_or_zero > max_ptes_none) {
 				result = SCAN_EXCEED_NONE_PTE;
 				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+				count_mthp_stat(HPAGE_PMD_ORDER,
+						MTHP_STAT_COLLAPSE_EXCEED_NONE);
 				goto out_unmap;
 			}
 			continue;
@@ -1420,6 +1433,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
 			if (++unmapped > max_ptes_swap) {
 				result = SCAN_EXCEED_SWAP_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
+				count_mthp_stat(HPAGE_PMD_ORDER,
+						MTHP_STAT_COLLAPSE_EXCEED_SWAP);
 				goto out_unmap;
 			}
 			/*
@@ -1477,6 +1492,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
 			if (++shared > max_ptes_shared) {
 				result = SCAN_EXCEED_SHARED_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+				count_mthp_stat(HPAGE_PMD_ORDER,
+						MTHP_STAT_COLLAPSE_EXCEED_SHARED);
 				goto out_unmap;
 			}
 		}
-- 
2.53.0


^ permalink raw reply related

* [PATCH 7.2 v16 06/13] mm/khugepaged: skip collapsing mTHP to smaller orders
From: Nico Pache @ 2026-04-19 18:57 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, npache,
	peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-1-npache@redhat.com>

khugepaged may try to collapse a mTHP to a smaller mTHP, resulting in
some pages being unmapped. Skip these cases until we have a way to check
if its ok to collapse to a smaller mTHP size (like in the case of a
partially mapped folio). This check is also not done during the scan phase
as the current collapse order is unknown at that time.

This patch is inspired by Dev Jain's work on khugepaged mTHP support [1].

[1] https://lore.kernel.org/lkml/20241216165105.56185-11-dev.jain@arm.com/

Reviewed-by: Lorenzo Stoakes <ljs@kernel.org> 
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Co-developed-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
 mm/khugepaged.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ff6f9f1883ed..8740d379882e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -686,6 +686,14 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
 				goto out;
 			}
 		}
+		/*
+		 * TODO: In some cases of partially-mapped folios, we'd actually
+		 * want to collapse.
+		 */
+		if (!is_pmd_order(order) && folio_order(folio) >= order) {
+			result = SCAN_PTE_MAPPED_HUGEPAGE;
+			goto out;
+		}
 
 		if (folio_test_large(folio)) {
 			struct folio *f;
-- 
2.53.0


^ permalink raw reply related

* [PATCH 7.2 v16 05/13] mm/khugepaged: generalize collapse_huge_page for mTHP collapse
From: Nico Pache @ 2026-04-19 18:57 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, npache,
	peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-1-npache@redhat.com>

Pass an order and offset to collapse_huge_page to support collapsing anon
memory to arbitrary orders within a PMD. order indicates what mTHP size we
are attempting to collapse to, and offset indicates were in the PMD to
start the collapse attempt.

For non-PMD collapse we must leave the anon VMA write locked until after
we collapse the mTHP-- in the PMD case all the pages are isolated, but in
the mTHP case this is not true, and we must keep the lock to prevent
access/changes to the page tables. This can happen if the rmap walkers hit
a pmd_none while the PMD entry is currently unavailable due to being
temporarily removed during the collapse phase.

Signed-off-by: Nico Pache <npache@redhat.com>
---
 mm/khugepaged.c | 103 +++++++++++++++++++++++++++---------------------
 1 file changed, 57 insertions(+), 46 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 283bb63854a5..ff6f9f1883ed 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1198,42 +1198,36 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
 	return SCAN_SUCCEED;
 }
 
-static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
-		int referenced, int unmapped, struct collapse_control *cc)
+static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
+		int referenced, int unmapped, struct collapse_control *cc,
+		unsigned int order)
 {
 	LIST_HEAD(compound_pagelist);
 	pmd_t *pmd, _pmd;
-	pte_t *pte;
+	pte_t *pte = NULL;
 	pgtable_t pgtable;
 	struct folio *folio;
 	spinlock_t *pmd_ptl, *pte_ptl;
 	enum scan_result result = SCAN_FAIL;
 	struct vm_area_struct *vma;
 	struct mmu_notifier_range range;
+	bool anon_vma_locked = false;
+	const unsigned long pmd_addr = start_addr & HPAGE_PMD_MASK;
+	const unsigned long end_addr = start_addr + (PAGE_SIZE << order);
 
-	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
-	/*
-	 * Before allocating the hugepage, release the mmap_lock read lock.
-	 * The allocation can take potentially a long time if it involves
-	 * sync compaction, and we do not need to hold the mmap_lock during
-	 * that. We will recheck the vma after taking it again in write mode.
-	 */
-	mmap_read_unlock(mm);
-
-	result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
+	result = alloc_charge_folio(&folio, mm, cc, order);
 	if (result != SCAN_SUCCEED)
 		goto out_nolock;
 
 	mmap_read_lock(mm);
-	result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
-					 HPAGE_PMD_ORDER);
+	result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
+					 &vma, cc, order);
 	if (result != SCAN_SUCCEED) {
 		mmap_read_unlock(mm);
 		goto out_nolock;
 	}
 
-	result = find_pmd_or_thp_or_none(mm, address, &pmd);
+	result = find_pmd_or_thp_or_none(mm, pmd_addr, &pmd);
 	if (result != SCAN_SUCCEED) {
 		mmap_read_unlock(mm);
 		goto out_nolock;
@@ -1245,8 +1239,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 		 * released when it fails. So we jump out_nolock directly in
 		 * that case.  Continuing to collapse causes inconsistency.
 		 */
-		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
-						     referenced, HPAGE_PMD_ORDER);
+		result = __collapse_huge_page_swapin(mm, vma, start_addr, pmd,
+						     referenced, order);
 		if (result != SCAN_SUCCEED)
 			goto out_nolock;
 	}
@@ -1261,20 +1255,21 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 	 * mmap_lock.
 	 */
 	mmap_write_lock(mm);
-	result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
-					 HPAGE_PMD_ORDER);
+	result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
+					 &vma, cc, order);
 	if (result != SCAN_SUCCEED)
 		goto out_up_write;
 	/* check if the pmd is still valid */
 	vma_start_write(vma);
-	result = check_pmd_still_valid(mm, address, pmd);
+	result = check_pmd_still_valid(mm, pmd_addr, pmd);
 	if (result != SCAN_SUCCEED)
 		goto out_up_write;
 
 	anon_vma_lock_write(vma->anon_vma);
+	anon_vma_locked = true;
 
-	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
-				address + HPAGE_PMD_SIZE);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start_addr,
+				end_addr);
 	mmu_notifier_invalidate_range_start(&range);
 
 	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
@@ -1286,26 +1281,23 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 	 * Parallel GUP-fast is fine since GUP-fast will back off when
 	 * it detects PMD is changed.
 	 */
-	_pmd = pmdp_collapse_flush(vma, address, pmd);
+	_pmd = pmdp_collapse_flush(vma, pmd_addr, pmd);
 	spin_unlock(pmd_ptl);
 	mmu_notifier_invalidate_range_end(&range);
 	tlb_remove_table_sync_one();
 
-	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
+	pte = pte_offset_map_lock(mm, &_pmd, start_addr, &pte_ptl);
 	if (pte) {
-		result = __collapse_huge_page_isolate(vma, address, pte, cc,
-						      HPAGE_PMD_ORDER,
-						      &compound_pagelist);
+		result = __collapse_huge_page_isolate(vma, start_addr, pte, cc,
+						      order, &compound_pagelist);
 		spin_unlock(pte_ptl);
 	} else {
 		result = SCAN_NO_PTE_TABLE;
 	}
 
 	if (unlikely(result != SCAN_SUCCEED)) {
-		if (pte)
-			pte_unmap(pte);
 		spin_lock(pmd_ptl);
-		BUG_ON(!pmd_none(*pmd));
+		WARN_ON_ONCE(!pmd_none(*pmd));
 		/*
 		 * We can only use set_pmd_at when establishing
 		 * hugepmds and never for establishing regular pmds that
@@ -1313,21 +1305,24 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 		 */
 		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
 		spin_unlock(pmd_ptl);
-		anon_vma_unlock_write(vma->anon_vma);
 		goto out_up_write;
 	}
 
 	/*
-	 * All pages are isolated and locked so anon_vma rmap
-	 * can't run anymore.
+	 * For PMD collapse all pages are isolated and locked so anon_vma
+	 * rmap can't run anymore. For mTHP collapse the PMD entry has been
+	 * removed and not all pages are isolated and locked, so we must hold
+	 * the lock to prevent neighboring folios from attempting to access
+	 * this PMD until its reinstalled.
 	 */
-	anon_vma_unlock_write(vma->anon_vma);
+	if (is_pmd_order(order)) {
+		anon_vma_unlock_write(vma->anon_vma);
+		anon_vma_locked = false;
+	}
 
 	result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
-					   vma, address, pte_ptl,
-					   HPAGE_PMD_ORDER,
-					   &compound_pagelist);
-	pte_unmap(pte);
+					   vma, start_addr, pte_ptl,
+					   order, &compound_pagelist);
 	if (unlikely(result != SCAN_SUCCEED))
 		goto out_up_write;
 
@@ -1337,18 +1332,27 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 	 * write.
 	 */
 	__folio_mark_uptodate(folio);
-	pgtable = pmd_pgtable(_pmd);
-
 	spin_lock(pmd_ptl);
-	BUG_ON(!pmd_none(*pmd));
-	pgtable_trans_huge_deposit(mm, pmd, pgtable);
-	map_anon_folio_pmd_nopf(folio, pmd, vma, address);
+	WARN_ON_ONCE(!pmd_none(*pmd));
+	if (is_pmd_order(order)) { /* PMD collapse */
+		pgtable = pmd_pgtable(_pmd);
+		pgtable_trans_huge_deposit(mm, pmd, pgtable);
+		map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
+	} else { /* mTHP collapse */
+		map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /*uffd_wp=*/ false);
+		smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
+		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
+	}
 	spin_unlock(pmd_ptl);
 
 	folio = NULL;
 
 	result = SCAN_SUCCEED;
 out_up_write:
+	if (anon_vma_locked)
+		anon_vma_unlock_write(vma->anon_vma);
+	if (pte)
+		pte_unmap(pte);
 	mmap_write_unlock(mm);
 out_nolock:
 	if (folio)
@@ -1525,8 +1529,15 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
 	if (result == SCAN_SUCCEED) {
+		/*
+		 * Before allocating the hugepage, release the mmap_lock read lock.
+		 * The allocation can take potentially a long time if it involves
+		 * sync compaction, and we do not need to hold the mmap_lock during
+		 * that. We will recheck the vma after taking it again in write mode.
+		 */
+		mmap_read_unlock(mm);
 		result = collapse_huge_page(mm, start_addr, referenced,
-					    unmapped, cc);
+					    unmapped, cc, HPAGE_PMD_ORDER);
 		/* collapse_huge_page will return with the mmap_lock released */
 		*lock_dropped = true;
 	}
-- 
2.53.0


^ permalink raw reply related

* [PATCH 7.2 v16 04/13] mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
From: Nico Pache @ 2026-04-19 18:57 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, npache,
	peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-1-npache@redhat.com>

generalize the order of the __collapse_huge_page_* and collapse_max_*
functions to support future mTHP collapse.

The current mechanism for determining collapse with the
khugepaged_max_ptes_none value is not designed with mTHP in mind. This
raises a key design issue: if we support user defined max_pte_none values
(even those scaled by order), a collapse of a lower order can introduces
an feedback loop, or "creep", when max_ptes_none is set to a value greater
than HPAGE_PMD_NR / 2.

With this configuration, a successful collapse to order N will populate
enough pages to satisfy the collapse condition on order N+1 on the next
scan. This leads to unnecessary work and memory churn.

To fix this issue introduce a helper function that will limit mTHP
collapse support to two max_ptes_none values, 0 and HPAGE_PMD_NR - 1.
This effectively supports two modes:

- max_ptes_none=0: never introduce new none-pages for mTHP collapse.
- max_ptes_none=511 (on 4k pagesz): Always collapse to the highest
  available mTHP order.

This removes the possiblilty of "creep", while not modifying any uAPI
expectations. A warning will be emitted if any non-supported
max_ptes_none value is configured with mTHP enabled.

mTHP collapse will not honor the khugepaged_max_ptes_shared or
khugepaged_max_ptes_swap parameters, and will fail if it encounters a
shared or swapped entry.

No functional changes in this patch; however it defines future behavior
for mTHP collapse.

Co-developed-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
 mm/khugepaged.c | 124 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 88 insertions(+), 36 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f42b55421191..283bb63854a5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -352,51 +352,86 @@ static bool pte_none_or_zero(pte_t pte)
  * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
  * @cc: The collapse control struct
  * @vma: The vma to check for userfaultfd
+ * @order: The folio order being collapsed to
  *
  * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
- * empty page.
+ * empty page. For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the
+ * configured khugepaged_max_ptes_none value.
+ *
+ * For mTHP collapses, we currently only support khugepaged_max_pte_none values
+ * of 0 or (KHUGEPAGED_MAX_PTES_LIMIT). Any other value will emit a warning and
+ * no mTHP collapse will be attempted
  *
  * Return: Maximum number of empty PTEs allowed for the collapse operation
  */
-static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
-		struct vm_area_struct *vma)
+static int collapse_max_ptes_none(struct collapse_control *cc,
+		struct vm_area_struct *vma, unsigned int order)
 {
 	if (vma && userfaultfd_armed(vma))
 		return 0;
 	if (!cc->is_khugepaged)
 		return HPAGE_PMD_NR;
-	return khugepaged_max_ptes_none;
+	if (is_pmd_order(order))
+		return khugepaged_max_ptes_none;
+	/* Zero/non-present collapse disabled. */
+	if (!khugepaged_max_ptes_none)
+		return 0;
+	if (khugepaged_max_ptes_none == KHUGEPAGED_MAX_PTES_LIMIT)
+		return (1 << order) - 1;
+
+	pr_warn_once("mTHP collapse only supports max_ptes_none values of 0 or %u\n",
+		      KHUGEPAGED_MAX_PTES_LIMIT);
+	return -EINVAL;
 }
 
 /**
  * collapse_max_ptes_shared - Calculate maximum allowed shared PTEs for collapse
  * @cc: The collapse control struct
+ * @order: The folio order being collapsed to
  *
  * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
  * shared page.
  *
+ * For mTHP collapses, we currently dont support collapsing memory with
+ * shared memory.
+ *
  * Return: Maximum number of shared PTEs allowed for the collapse operation
  */
-static unsigned int collapse_max_ptes_shared(struct collapse_control *cc)
+static unsigned int collapse_max_ptes_shared(struct collapse_control *cc,
+		unsigned int order)
 {
 	if (!cc->is_khugepaged)
 		return HPAGE_PMD_NR;
+	if (!is_pmd_order(order))
+		return 0;
+
 	return khugepaged_max_ptes_shared;
 }
 
 /**
  * collapse_max_ptes_swap - Calculate maximum allowed swap PTEs for collapse
  * @cc: The collapse control struct
+ * @order: The folio order being collapsed to
  *
  * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
  * swap page.
  *
+ * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
+ * khugepaged_max_ptes_swap value.
+ *
+ * For mTHP collapses, we currently dont support collapsing memory with
+ * swapped out memory.
+ *
  * Return: Maximum number of swap PTEs allowed for the collapse operation
  */
-static unsigned int collapse_max_ptes_swap(struct collapse_control *cc)
+static unsigned int collapse_max_ptes_swap(struct collapse_control *cc,
+		unsigned int order)
 {
 	if (!cc->is_khugepaged)
 		return HPAGE_PMD_NR;
+	if (!is_pmd_order(order))
+		return 0;
+
 	return khugepaged_max_ptes_swap;
 }
 
@@ -590,18 +625,22 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
 
 static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
-		struct list_head *compound_pagelist)
+		unsigned int order, struct list_head *compound_pagelist)
 {
+	const unsigned long nr_pages = 1UL << order;
 	struct page *page = NULL;
 	struct folio *folio = NULL;
 	unsigned long addr = start_addr;
 	pte_t *_pte;
 	int none_or_zero = 0, shared = 0, referenced = 0;
 	enum scan_result result = SCAN_FAIL;
-	unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
-	unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
+	int max_ptes_none = collapse_max_ptes_none(cc, vma, order);
+	unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, order);
+
+	if (max_ptes_none < 0)
+		return result;
 
-	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+	for (_pte = pte; _pte < pte + nr_pages;
 	     _pte++, addr += PAGE_SIZE) {
 		pte_t pteval = ptep_get(_pte);
 		if (pte_none_or_zero(pteval)) {
@@ -734,18 +773,18 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
 }
 
 static void __collapse_huge_page_copy_succeeded(pte_t *pte,
-						struct vm_area_struct *vma,
-						unsigned long address,
-						spinlock_t *ptl,
-						struct list_head *compound_pagelist)
+		struct vm_area_struct *vma, unsigned long address,
+		spinlock_t *ptl, unsigned int order,
+		struct list_head *compound_pagelist)
 {
-	unsigned long end = address + HPAGE_PMD_SIZE;
+	const unsigned long nr_pages = 1UL << order;
+	unsigned long end = address + (PAGE_SIZE << order);
 	struct folio *src, *tmp;
 	pte_t pteval;
 	pte_t *_pte;
 	unsigned int nr_ptes;
 
-	for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
+	for (_pte = pte; _pte < pte + nr_pages; _pte += nr_ptes,
 	     address += nr_ptes * PAGE_SIZE) {
 		nr_ptes = 1;
 		pteval = ptep_get(_pte);
@@ -798,13 +837,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
 }
 
 static void __collapse_huge_page_copy_failed(pte_t *pte,
-					     pmd_t *pmd,
-					     pmd_t orig_pmd,
-					     struct vm_area_struct *vma,
-					     struct list_head *compound_pagelist)
+		pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
+		unsigned int order, struct list_head *compound_pagelist)
 {
+	const unsigned long nr_pages = 1UL << order;
 	spinlock_t *pmd_ptl;
-
 	/*
 	 * Re-establish the PMD to point to the original page table
 	 * entry. Restoring PMD needs to be done prior to releasing
@@ -818,7 +855,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
 	 * Release both raw and compound pages isolated
 	 * in __collapse_huge_page_isolate.
 	 */
-	release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
+	release_pte_pages(pte, pte + nr_pages, compound_pagelist);
 }
 
 /*
@@ -838,16 +875,16 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
  */
 static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
 		pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
-		unsigned long address, spinlock_t *ptl,
+		unsigned long address, spinlock_t *ptl, unsigned int order,
 		struct list_head *compound_pagelist)
 {
+	const unsigned long nr_pages = 1UL << order;
 	unsigned int i;
 	enum scan_result result = SCAN_SUCCEED;
-
 	/*
 	 * Copying pages' contents is subject to memory poison at any iteration.
 	 */
-	for (i = 0; i < HPAGE_PMD_NR; i++) {
+	for (i = 0; i < nr_pages; i++) {
 		pte_t pteval = ptep_get(pte + i);
 		struct page *page = folio_page(folio, i);
 		unsigned long src_addr = address + i * PAGE_SIZE;
@@ -866,10 +903,10 @@ static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *foli
 
 	if (likely(result == SCAN_SUCCEED))
 		__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
-						    compound_pagelist);
+						    order, compound_pagelist);
 	else
 		__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
-						 compound_pagelist);
+						 order, compound_pagelist);
 
 	return result;
 }
@@ -1040,12 +1077,12 @@ static enum scan_result check_pmd_still_valid(struct mm_struct *mm,
  * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
  */
 static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
-		struct vm_area_struct *vma, unsigned long start_addr, pmd_t *pmd,
-		int referenced)
+		struct vm_area_struct *vma, unsigned long start_addr,
+		pmd_t *pmd, int referenced, unsigned int order)
 {
 	int swapped_in = 0;
 	vm_fault_t ret = 0;
-	unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
+	unsigned long addr, end = start_addr + (PAGE_SIZE << order);
 	enum scan_result result;
 	pte_t *pte = NULL;
 	spinlock_t *ptl;
@@ -1077,6 +1114,19 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
 		    pte_present(vmf.orig_pte))
 			continue;
 
+		/*
+		 * TODO: Support swapin without leading to further mTHP
+		 * collapses. Currently bringing in new pages via swapin may
+		 * cause a future higher order collapse on a rescan of the same
+		 * range.
+		 */
+		if (!is_pmd_order(order)) {
+			pte_unmap(pte);
+			mmap_read_unlock(mm);
+			result = SCAN_EXCEED_SWAP_PTE;
+			goto out;
+		}
+
 		vmf.pte = pte;
 		vmf.ptl = ptl;
 		ret = do_swap_page(&vmf);
@@ -1196,7 +1246,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 		 * that case.  Continuing to collapse causes inconsistency.
 		 */
 		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
-						     referenced);
+						     referenced, HPAGE_PMD_ORDER);
 		if (result != SCAN_SUCCEED)
 			goto out_nolock;
 	}
@@ -1244,6 +1294,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
 	if (pte) {
 		result = __collapse_huge_page_isolate(vma, address, pte, cc,
+						      HPAGE_PMD_ORDER,
 						      &compound_pagelist);
 		spin_unlock(pte_ptl);
 	} else {
@@ -1274,6 +1325,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 
 	result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
 					   vma, address, pte_ptl,
+					   HPAGE_PMD_ORDER,
 					   &compound_pagelist);
 	pte_unmap(pte);
 	if (unlikely(result != SCAN_SUCCEED))
@@ -1318,9 +1370,9 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
 	unsigned long addr;
 	spinlock_t *ptl;
 	int node = NUMA_NO_NODE, unmapped = 0;
-	unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
-	unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
-	unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
+	int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
+	unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, HPAGE_PMD_ORDER);
+	unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
 
 	VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
 
@@ -2371,8 +2423,8 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
 	int present, swap;
 	int node = NUMA_NO_NODE;
 	enum scan_result result = SCAN_SUCCEED;
-	unsigned int max_ptes_none = collapse_max_ptes_none(cc, NULL);
-	unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
+	int max_ptes_none = collapse_max_ptes_none(cc, NULL, HPAGE_PMD_ORDER);
+	unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
 
 	present = 0;
 	swap = 0;
-- 
2.53.0


^ permalink raw reply related

* [PATCH 7.2 v16 03/13] mm/khugepaged: rework max_ptes_* handling with helper functions
From: Nico Pache @ 2026-04-19 18:57 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, npache,
	peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-1-npache@redhat.com>

The following cleanup reworks all the max_ptes_* handling into helper
functions. This increases the code readability and will later be used to
implement the mTHP handling of these variables.

With these changes we abstract all the madvise_collapse() special casing
(dont respect the sysctls) away from the functions that utilize them. And
will later in this series to cleanly restrict mTHP collapses behaviors.

Suggested-by: David Hildenbrand <david@kernel.org>
Signed-off-by: Nico Pache <npache@redhat.com>
---
 mm/khugepaged.c | 114 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 78 insertions(+), 36 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index afac6bc4e76d..f42b55421191 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -348,6 +348,58 @@ static bool pte_none_or_zero(pte_t pte)
 	return pte_present(pte) && is_zero_pfn(pte_pfn(pte));
 }
 
+/**
+ * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
+ * @cc: The collapse control struct
+ * @vma: The vma to check for userfaultfd
+ *
+ * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
+ * empty page.
+ *
+ * Return: Maximum number of empty PTEs allowed for the collapse operation
+ */
+static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
+		struct vm_area_struct *vma)
+{
+	if (vma && userfaultfd_armed(vma))
+		return 0;
+	if (!cc->is_khugepaged)
+		return HPAGE_PMD_NR;
+	return khugepaged_max_ptes_none;
+}
+
+/**
+ * collapse_max_ptes_shared - Calculate maximum allowed shared PTEs for collapse
+ * @cc: The collapse control struct
+ *
+ * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
+ * shared page.
+ *
+ * Return: Maximum number of shared PTEs allowed for the collapse operation
+ */
+static unsigned int collapse_max_ptes_shared(struct collapse_control *cc)
+{
+	if (!cc->is_khugepaged)
+		return HPAGE_PMD_NR;
+	return khugepaged_max_ptes_shared;
+}
+
+/**
+ * collapse_max_ptes_swap - Calculate maximum allowed swap PTEs for collapse
+ * @cc: The collapse control struct
+ *
+ * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
+ * swap page.
+ *
+ * Return: Maximum number of swap PTEs allowed for the collapse operation
+ */
+static unsigned int collapse_max_ptes_swap(struct collapse_control *cc)
+{
+	if (!cc->is_khugepaged)
+		return HPAGE_PMD_NR;
+	return khugepaged_max_ptes_swap;
+}
+
 int hugepage_madvise(struct vm_area_struct *vma,
 		     vm_flags_t *vm_flags, int advice)
 {
@@ -546,21 +598,19 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
 	pte_t *_pte;
 	int none_or_zero = 0, shared = 0, referenced = 0;
 	enum scan_result result = SCAN_FAIL;
+	unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
+	unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
 
 	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
 	     _pte++, addr += PAGE_SIZE) {
 		pte_t pteval = ptep_get(_pte);
 		if (pte_none_or_zero(pteval)) {
-			++none_or_zero;
-			if (!userfaultfd_armed(vma) &&
-			    (!cc->is_khugepaged ||
-			     none_or_zero <= khugepaged_max_ptes_none)) {
-				continue;
-			} else {
+			if (++none_or_zero > max_ptes_none) {
 				result = SCAN_EXCEED_NONE_PTE;
 				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
 				goto out;
 			}
+			continue;
 		}
 		if (!pte_present(pteval)) {
 			result = SCAN_PTE_NON_PRESENT;
@@ -591,9 +641,7 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
 		/* See collapse_scan_pmd(). */
 		if (folio_maybe_mapped_shared(folio)) {
-			++shared;
-			if (cc->is_khugepaged &&
-			    shared > khugepaged_max_ptes_shared) {
+			if (++shared > max_ptes_shared) {
 				result = SCAN_EXCEED_SHARED_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
 				goto out;
@@ -1270,6 +1318,9 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
 	unsigned long addr;
 	spinlock_t *ptl;
 	int node = NUMA_NO_NODE, unmapped = 0;
+	unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
+	unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
+	unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
 
 	VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
 
@@ -1294,36 +1345,29 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
 
 		pte_t pteval = ptep_get(_pte);
 		if (pte_none_or_zero(pteval)) {
-			++none_or_zero;
-			if (!userfaultfd_armed(vma) &&
-			    (!cc->is_khugepaged ||
-			     none_or_zero <= khugepaged_max_ptes_none)) {
-				continue;
-			} else {
+			if (++none_or_zero > max_ptes_none) {
 				result = SCAN_EXCEED_NONE_PTE;
 				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
 				goto out_unmap;
 			}
+			continue;
 		}
 		if (!pte_present(pteval)) {
-			++unmapped;
-			if (!cc->is_khugepaged ||
-			    unmapped <= khugepaged_max_ptes_swap) {
-				/*
-				 * Always be strict with uffd-wp
-				 * enabled swap entries.  Please see
-				 * comment below for pte_uffd_wp().
-				 */
-				if (pte_swp_uffd_wp_any(pteval)) {
-					result = SCAN_PTE_UFFD_WP;
-					goto out_unmap;
-				}
-				continue;
-			} else {
+			if (++unmapped > max_ptes_swap) {
 				result = SCAN_EXCEED_SWAP_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
 				goto out_unmap;
 			}
+			/*
+			 * Always be strict with uffd-wp
+			 * enabled swap entries.  Please see
+			 * comment below for pte_uffd_wp().
+			 */
+			if (pte_swp_uffd_wp_any(pteval)) {
+				result = SCAN_PTE_UFFD_WP;
+				goto out_unmap;
+			}
+			continue;
 		}
 		if (pte_uffd_wp(pteval)) {
 			/*
@@ -1366,9 +1410,7 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
 		 * is shared.
 		 */
 		if (folio_maybe_mapped_shared(folio)) {
-			++shared;
-			if (cc->is_khugepaged &&
-			    shared > khugepaged_max_ptes_shared) {
+			if (++shared > max_ptes_shared) {
 				result = SCAN_EXCEED_SHARED_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
 				goto out_unmap;
@@ -2329,6 +2371,8 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
 	int present, swap;
 	int node = NUMA_NO_NODE;
 	enum scan_result result = SCAN_SUCCEED;
+	unsigned int max_ptes_none = collapse_max_ptes_none(cc, NULL);
+	unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
 
 	present = 0;
 	swap = 0;
@@ -2341,8 +2385,7 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
 
 		if (xa_is_value(folio)) {
 			swap += 1 << xas_get_order(&xas);
-			if (cc->is_khugepaged &&
-			    swap > khugepaged_max_ptes_swap) {
+			if (swap > max_ptes_swap) {
 				result = SCAN_EXCEED_SWAP_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
 				break;
@@ -2413,8 +2456,7 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
 		cc->progress += HPAGE_PMD_NR;
 
 	if (result == SCAN_SUCCEED) {
-		if (cc->is_khugepaged &&
-		    present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+		if (present < HPAGE_PMD_NR - max_ptes_none) {
 			result = SCAN_EXCEED_NONE_PTE;
 			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
 		} else {
-- 
2.53.0


^ permalink raw reply related

* [PATCH 7.2 v16 02/13] mm/khugepaged: generalize alloc_charge_folio()
From: Nico Pache @ 2026-04-19 18:57 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, npache,
	peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-1-npache@redhat.com>

From: Dev Jain <dev.jain@arm.com>

Pass order to alloc_charge_folio() and update mTHP statistics.

Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Co-developed-by: Nico Pache <npache@redhat.com>
Signed-off-by: Nico Pache <npache@redhat.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 Documentation/admin-guide/mm/transhuge.rst |  8 ++++++++
 include/linux/huge_mm.h                    |  2 ++
 mm/huge_memory.c                           |  4 ++++
 mm/khugepaged.c                            | 17 +++++++++++------
 4 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 5fbc3d89bb07..c51932e6275d 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -639,6 +639,14 @@ anon_fault_fallback_charge
 	instead falls back to using huge pages with lower orders or
 	small pages even though the allocation was successful.
 
+collapse_alloc
+	is incremented every time a huge page is successfully allocated for a
+	khugepaged collapse.
+
+collapse_alloc_failed
+	is incremented every time a huge page allocation fails during a
+	khugepaged collapse.
+
 zswpout
 	is incremented every time a huge page is swapped out to zswap in one
 	piece without splitting.
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2949e5acff35..ba7ae6808544 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -128,6 +128,8 @@ enum mthp_stat_item {
 	MTHP_STAT_ANON_FAULT_ALLOC,
 	MTHP_STAT_ANON_FAULT_FALLBACK,
 	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
+	MTHP_STAT_COLLAPSE_ALLOC,
+	MTHP_STAT_COLLAPSE_ALLOC_FAILED,
 	MTHP_STAT_ZSWPOUT,
 	MTHP_STAT_SWPIN,
 	MTHP_STAT_SWPIN_FALLBACK,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 970e077019b7..345c54133c83 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -685,6 +685,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
 DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+DEFINE_MTHP_STAT_ATTR(collapse_alloc, MTHP_STAT_COLLAPSE_ALLOC);
+DEFINE_MTHP_STAT_ATTR(collapse_alloc_failed, MTHP_STAT_COLLAPSE_ALLOC_FAILED);
 DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
 DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
 DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
@@ -750,6 +752,8 @@ static struct attribute *any_stats_attrs[] = {
 #endif
 	&split_attr.attr,
 	&split_failed_attr.attr,
+	&collapse_alloc_attr.attr,
+	&collapse_alloc_failed_attr.attr,
 	NULL,
 };
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 53e7e4be172d..afac6bc4e76d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1068,21 +1068,26 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
 }
 
 static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
-		struct collapse_control *cc)
+		struct collapse_control *cc, unsigned int order)
 {
 	gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
 		     GFP_TRANSHUGE);
 	int node = collapse_find_target_node(cc);
 	struct folio *folio;
 
-	folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
+	folio = __folio_alloc(gfp, order, node, &cc->alloc_nmask);
 	if (!folio) {
 		*foliop = NULL;
-		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+		if (is_pmd_order(order))
+			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+		count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC_FAILED);
 		return SCAN_ALLOC_HUGE_PAGE_FAIL;
 	}
 
-	count_vm_event(THP_COLLAPSE_ALLOC);
+	if (is_pmd_order(order))
+		count_vm_event(THP_COLLAPSE_ALLOC);
+	count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC);
+
 	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
 		folio_put(folio);
 		*foliop = NULL;
@@ -1118,7 +1123,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 	 */
 	mmap_read_unlock(mm);
 
-	result = alloc_charge_folio(&folio, mm, cc);
+	result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
 	if (result != SCAN_SUCCEED)
 		goto out_nolock;
 
@@ -1899,7 +1904,7 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
 	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
 	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
 
-	result = alloc_charge_folio(&new_folio, mm, cc);
+	result = alloc_charge_folio(&new_folio, mm, cc, HPAGE_PMD_ORDER);
 	if (result != SCAN_SUCCEED)
 		goto out;
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH 7.2 v16 01/13] mm/khugepaged: generalize hugepage_vma_revalidate for mTHP support
From: Nico Pache @ 2026-04-19 18:57 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, npache,
	peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-1-npache@redhat.com>

For khugepaged to support different mTHP orders, we must generalize this
to check if the PMD is not shared by another VMA and that the order is
enabled.

No functional change in this patch. Also correct a comment about the
functionality of the revalidation and fix a double space issues.

Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Co-developed-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
 mm/khugepaged.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b8452dbdb043..53e7e4be172d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -902,12 +902,13 @@ static int collapse_find_target_node(struct collapse_control *cc)
 
 /*
  * If mmap_lock temporarily dropped, revalidate vma
- * before taking mmap_lock.
+ * after taking the mmap_lock again.
  * Returns enum scan_result value.
  */
 
 static enum scan_result hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
-		bool expect_anon, struct vm_area_struct **vmap, struct collapse_control *cc)
+		bool expect_anon, struct vm_area_struct **vmap,
+		struct collapse_control *cc, unsigned int order)
 {
 	struct vm_area_struct *vma;
 	enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
@@ -920,15 +921,16 @@ static enum scan_result hugepage_vma_revalidate(struct mm_struct *mm, unsigned l
 	if (!vma)
 		return SCAN_VMA_NULL;
 
+	/* Always check the PMD order to ensure its not shared by another VMA */
 	if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
 		return SCAN_ADDRESS_RANGE;
-	if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER))
+	if (!thp_vma_allowable_orders(vma, vma->vm_flags, type, BIT(order)))
 		return SCAN_VMA_CHECK;
 	/*
 	 * Anon VMA expected, the address may be unmapped then
 	 * remapped to file after khugepaged reaquired the mmap_lock.
 	 *
-	 * thp_vma_allowable_order may return true for qualified file
+	 * thp_vma_allowable_orders may return true for qualified file
 	 * vmas.
 	 */
 	if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
@@ -1121,7 +1123,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 		goto out_nolock;
 
 	mmap_read_lock(mm);
-	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+	result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
+					 HPAGE_PMD_ORDER);
 	if (result != SCAN_SUCCEED) {
 		mmap_read_unlock(mm);
 		goto out_nolock;
@@ -1155,7 +1158,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 	 * mmap_lock.
 	 */
 	mmap_write_lock(mm);
-	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+	result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
+					 HPAGE_PMD_ORDER);
 	if (result != SCAN_SUCCEED)
 		goto out_up_write;
 	/* check if the pmd is still valid */
@@ -2857,8 +2861,8 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
 			mmap_unlocked = false;
 			*lock_dropped = true;
 			result = hugepage_vma_revalidate(mm, addr, false, &vma,
-							 cc);
-			if (result  != SCAN_SUCCEED) {
+							 cc, HPAGE_PMD_ORDER);
+			if (result != SCAN_SUCCEED) {
 				last_fail = result;
 				goto out_nolock;
 			}
-- 
2.53.0


^ permalink raw reply related

* [PATCH 7.2 v16 00/13] khugepaged: mTHP support
From: Nico Pache @ 2026-04-19 18:57 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, npache,
	peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe

The following series provides khugepaged with the capability to collapse
anonymous memory regions to mTHPs.

To achieve this we generalize the khugepaged functions to no longer depend
on PMD_ORDER. Then during the PMD scan, we use a bitmap to track individual
pages that are occupied (!none/zero). After the PMD scan is done, we use
the bitmap to find the optimal mTHP sizes for the PMD range. The
restriction on max_ptes_none is removed during the scan, to make sure we
account for the whole PMD range in the bitmap. When no mTHP size is
enabled, the legacy behavior of khugepaged is maintained.

We currently only support max_ptes_none values of 0 or HPAGE_PMD_NR - 1
(ie 511). If any other value is specified, the kernel will emit a warning
and no mTHP collapse will be attempted. If a mTHP collapse is attempted,
but contains swapped out, or shared pages, we don't perform the collapse.
It is now also possible to collapse to mTHPs without requiring the PMD THP
size to be enabled. These limitations are to prevent collapse "creep"
behavior. This prevents constantly promoting mTHPs to the next available
size, which would occur because a collapse introduces more non-zero pages
that would satisfy the promotion condition on subsequent scans.


Patch 1-2:   Generalize hugepage_vma_revalidate and alloc_charge_folio
	     for arbitrary orders.
Patch 3:     Rework max_ptes_* handling into helper functions
Patch 4-5:   Generalize __collapse_huge_page_* and collapse_huge_page
Patch 6:     Skip collapsing mTHP to smaller orders
Patch 7-8:   Add per-order mTHP statistics and tracepoints
Patch 9:     Introduce collapse_allowable_orders helper function
Patch 10-12: Introduce bitmap and mTHP collapse support, fully enabled
Patch 13:    Documentation

Testing:
- Built for x86_64, aarch64, ppc64le, and s390x
- ran all arches on test suites provided by the kernel-tests project
- internal testing suites: functional testing and performance testing
- selftests mm
- I created a test script that I used to push khugepaged to its limits
   while monitoring a number of stats and tracepoints. The code is
   available here[1] (Run in legacy mode for these changes and set mthp
   sizes to inherit)
   The summary from my testings was that there was no significant
   regression noticed through this test. In some cases my changes had
   better collapse latencies, and was able to scan more pages in the same
   amount of time/work, but for the most part the results were consistent.
- redis testing. I did some testing with these changes along with my defer
  changes (see followup [2] post for more details). We've decided to get
  the mTHP changes merged first before attempting the defer series.
- some basic testing on 64k page size.
- lots of general use.

V16 Changes:
- This was quite the respin, so hopefully everything is ok! I dropped
  some acks/rb on patches that changed significantly. Major changes
  are related to the locking cleanup in collapse_huge_page().
  sorry if i missed any feedback there was quite a bit of minor requests
- New cleanup patch: introduce collapse_max_ptes_none/shared/swap()
  helpers, absorbing madvise_collapse special-casing (David)
- Merge "introduce collapse_max_ptes_none helper" into the generalize
  __collapse_huge_page_* patch, dropping the separate patch (David)
- Simplify collapse_huge_page() locking: move mmap_read_unlock() to
  caller, remove mmap_locked tracking parameter entirely (Lorenzo)
- Use enum tva_type instead of bool is_khugepaged in
  collapse_allowable_orders() (David, Lorenzo)
- collapse_allowable_orders() takes (vma, tva_flags), accesses
  vma->vm_flags internally (Lorenzo)
- Rename mthp_* functions to collapse_mthp_* prefix (David, Lorenzo)
- Use MAX_PTRS_PER_PTE consistently for bitmap ops (Lorenzo)
- Downgrade BUG_ON() to WARN_ON_ONCE() in collapse_huge_page() (Lorenzo)
- Add /*expect_anon=*/true annotation to revalidate calls (Lorenzo)
- Pull common spin_lock + WARN_ON_ONCE out of PMD/mTHP branch (Lorenzo)
- Fix pteval declaration style; use end_addr variable (Lorenzo)
- Use ternary for result assignment in collapse_scan_pmd() (Lorenzo)
- Improve anon_vma locking comment for mTHP case (Lorenzo)
- Document why skip-smaller-order isn't done in scan phase (David)
- Make documentation of max_ptes_none mTHP behavior (only 0 and
  HPAGE_PMD_NR-1 supported) in admin guide more apparent. (David, Lorenzo)
- Collect new Acks/RB tags from v15 review

V15: https://lore.kernel.org/all/20260226031741.230674-1-npache@redhat.com
V14: https://lore.kernel.org/all/20260122192841.128719-1-npache@redhat.com
V13: https://lore.kernel.org/all/20251201174627.23295-1-npache@redhat.com
V12: https://lore.kernel.org/all/20251022183717.70829-1-npache@redhat.com
V11: https://lore.kernel.org/all/20250912032810.197475-1-npache@redhat.com
V10: https://lore.kernel.org/all/20250819134205.622806-1-npache@redhat.com
V9 : https://lore.kernel.org/all/20250714003207.113275-1-npache@redhat.com
V8 : https://lore.kernel.org/all/20250702055742.102808-1-npache@redhat.com
V7 : https://lore.kernel.org/all/20250515032226.128900-1-npache@redhat.com
V6 : https://lore.kernel.org/all/20250515030312.125567-1-npache@redhat.com
V5 : https://lore.kernel.org/all/20250428181218.85925-1-npache@redhat.com
V4 : https://lore.kernel.org/all/20250417000238.74567-1-npache@redhat.com
V3 : https://lore.kernel.org/all/20250414220557.35388-1-npache@redhat.com
V2 : https://lore.kernel.org/all/20250211003028.213461-1-npache@redhat.com
V1 : https://lore.kernel.org/all/20250108233128.14484-1-npache@redhat.com

A big thanks to everyone that has reviewed, tested, and participated in
the development process. Its been a great experience working with all of
you on this endeavor.

[1] - https://gitlab.com/npache/khugepaged_mthp_test
[2] - https://lore.kernel.org/lkml/20250515033857.132535-1-npache@redhat.com/

Baolin Wang (1):
  mm/khugepaged: run khugepaged for all orders

Dev Jain (1):
  mm/khugepaged: generalize alloc_charge_folio()

Nico Pache (11):
  mm/khugepaged: generalize hugepage_vma_revalidate for mTHP support
  mm/khugepaged: rework max_ptes_* handling with helper functions
  mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
  mm/khugepaged: generalize collapse_huge_page for mTHP collapse
  mm/khugepaged: skip collapsing mTHP to smaller orders
  mm/khugepaged: add per-order mTHP collapse failure statistics
  mm/khugepaged: improve tracepoints for mTHP orders
  mm/khugepaged: introduce collapse_allowable_orders helper function
  mm/khugepaged: Introduce mTHP collapse support
  mm/khugepaged: avoid unnecessary mTHP collapse attempts
  Documentation: mm: update the admin guide for mTHP collapse

 Documentation/admin-guide/mm/transhuge.rst |  81 ++-
 include/linux/huge_mm.h                    |   5 +
 include/linux/khugepaged.h                 |   6 +-
 include/trace/events/huge_memory.h         |  34 +-
 mm/huge_memory.c                           |  13 +-
 mm/khugepaged.c                            | 623 ++++++++++++++++-----
 mm/vma.c                                   |   6 +-
 tools/testing/vma/include/stubs.h          |   3 +-
 8 files changed, 589 insertions(+), 182 deletions(-)


base-commit: 4569c01e84d440f1f6c79df1e13e9c86a9fb80ca
--
2.53.0


^ permalink raw reply

* Re: [PATCH v2 00/19] tracepoint: Avoid double static_branch evaluation at guarded call sites
From: Vineeth Remanan Pillai @ 2026-04-19 13:14 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, Dmitry Ilvokhin, Masami Hiramatsu,
	Mathieu Desnoyers, Ingo Molnar, Jens Axboe, io-uring,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Alexei Starovoitov, Daniel Borkmann, Marcelo Ricardo Leitner,
	Xin Long, Jon Maloy, Aaron Conole, Eelco Chaudron, Ilya Maximets,
	netdev, bpf, linux-sctp, tipc-discussion, dev, Jiri Pirko,
	Oded Gabbay, Koby Elbaz, dri-devel, Rafael J. Wysocki,
	Viresh Kumar, Gautham R. Shenoy, Huang Rui, Mario Limonciello,
	Len Brown, Srinivas Pandruvada, linux-pm, MyungJoo Ham,
	Kyungmin Park, Chanwoo Choi, Christian König, Sumit Semwal,
	linaro-mm-sig, Eddie James, Andrew Jeffery, Joel Stanley,
	linux-fsi, David Airlie, Simona Vetter, Alex Deucher,
	Danilo Krummrich, Matthew Brost, Philipp Stanner, Harry Wentland,
	Leo Li, amd-gfx, Jiri Kosina, Benjamin Tissoires, linux-input,
	Wolfram Sang, linux-i2c, Mark Brown, Michael Hennerich,
	Nuno Sá, linux-spi, James E.J. Bottomley, Martin K. Petersen,
	linux-scsi, Chris Mason, David Sterba, linux-btrfs,
	Thomas Gleixner, Andrew Morton, SeongJae Park, linux-mm,
	Borislav Petkov, Dave Hansen, x86, linux-trace-kernel,
	linux-kernel
In-Reply-To: <20260418190456.631df6f3@fedora>

On Sat, Apr 18, 2026 at 7:05 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Mon, 23 Mar 2026 12:00:19 -0400
> "Vineeth Pillai (Google)" <vineeth@bitbyteword.org> wrote:
>
> >   if (trace_foo_enabled() && cond)
> >       trace_call__foo(args);   /* calls __do_trace_foo() directly */
>
> Hi Vineeth,
>
> Could you rebase this series on top of 7.1-rc1 when it comes out?
> Several of these patches were accepted already. Obviously drop those.
> They were the patches that added the feature, and any where the
> maintainer acked the patch.
>
> Now that the feature has been accepted, if you post the patch series
> again after 7.1-rc1 with all the patches that haven't been accepted
> yet, then the maintainers can simply take them directly. As the feature
> is now accepted, there's no dependency on it, and they don't need to go
> through the tracing tree.
>
Sure, will do. Thanks for merging this feature.

Thanks,
Vineeth

^ permalink raw reply

* [PATCH v4 2/2] blk-mq: expose tag starvation counts via debugfs
From: Aaron Tomlin @ 2026-04-19  2:30 UTC (permalink / raw)
  To: axboe, rostedt, mhiramat, mathieu.desnoyers
  Cc: bvanassche, johannes.thumshirn, kch, dlemoal, ritesh.list,
	loberman, neelx, sean, mproche, chjohnst, linux-block,
	linux-kernel, linux-trace-kernel
In-Reply-To: <20260419023036.1419514-1-atomlin@atomlin.com>

In high-performance storage environments, particularly when utilising
RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe
latency spikes can occur when fast devices are starved of available
tags.

This patch introduces two new debugfs attributes for each block
hardware queue:
  - /sys/kernel/debug/block/[device]/hctxN/wait_on_hw_tag
  - /sys/kernel/debug/block/[device]/hctxN/wait_on_sched_tag

These files expose atomic counters that increment each time a submitting
context is forced into an uninterruptible sleep via io_schedule() due to
the complete exhaustion of physical driver tags or software scheduler
tags, respectively.

To ensure negligible performance overhead even in production
environments where CONFIG_BLK_DEBUG_FS is actively enabled, this
tracking logic utilises dynamically allocated per-CPU counters. When
this configuration is disabled, the tracking logic compiles down to a
safe no-op.

Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>
---
 block/blk-mq-debugfs.c | 84 ++++++++++++++++++++++++++++++++++++++++++
 block/blk-mq-debugfs.h |  7 ++++
 block/blk-mq-tag.c     |  4 ++
 include/linux/blk-mq.h | 12 ++++++
 4 files changed, 107 insertions(+)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 047ec887456b..a3effed55d90 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -7,6 +7,7 @@
 #include <linux/blkdev.h>
 #include <linux/build_bug.h>
 #include <linux/debugfs.h>
+#include <linux/percpu.h>
 
 #include "blk.h"
 #include "blk-mq.h"
@@ -484,6 +485,54 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
 	return 0;
 }
 
+/**
+ * hctx_wait_on_hw_tag_show - display hardware tag starvation count
+ * @data: generic pointer to the associated hardware context (hctx)
+ * @m: seq_file pointer for debugfs output formatting
+ *
+ * Prints the cumulative number of times a submitting context was forced
+ * to block due to the exhaustion of physical hardware driver tags.
+ *
+ * Return: 0 on success.
+ */
+static int hctx_wait_on_hw_tag_show(void *data, struct seq_file *m)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+	unsigned long count = 0;
+	int cpu;
+
+	if (hctx->wait_on_hw_tag) {
+		for_each_possible_cpu(cpu)
+			count += *per_cpu_ptr(hctx->wait_on_hw_tag, cpu);
+	}
+	seq_printf(m, "%lu\n", count);
+	return 0;
+}
+
+/**
+ * hctx_wait_on_sched_tag_show - display scheduler tag starvation count
+ * @data: generic pointer to the associated hardware context (hctx)
+ * @m: seq_file pointer for debugfs output formatting
+ *
+ * Prints the cumulative number of times a submitting context was forced
+ * to block due to the exhaustion of software scheduler tags.
+ *
+ * Return: 0 on success.
+ */
+static int hctx_wait_on_sched_tag_show(void *data, struct seq_file *m)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+	unsigned long count = 0;
+	int cpu;
+
+	if (hctx->wait_on_sched_tag) {
+		for_each_possible_cpu(cpu)
+			count += *per_cpu_ptr(hctx->wait_on_sched_tag, cpu);
+	}
+	seq_printf(m, "%lu\n", count);
+	return 0;
+}
+
 #define CTX_RQ_SEQ_OPS(name, type)					\
 static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \
 	__acquires(&ctx->lock)						\
@@ -599,6 +648,8 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
 	{"active", 0400, hctx_active_show},
 	{"dispatch_busy", 0400, hctx_dispatch_busy_show},
 	{"type", 0400, hctx_type_show},
+	{"wait_on_hw_tag", 0400, hctx_wait_on_hw_tag_show},
+	{"wait_on_sched_tag", 0400, hctx_wait_on_sched_tag_show},
 	{},
 };
 
@@ -670,6 +721,11 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
 	snprintf(name, sizeof(name), "hctx%u", hctx->queue_num);
 	hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir);
 
+	if (!hctx->wait_on_hw_tag)
+		hctx->wait_on_hw_tag = alloc_percpu(unsigned long);
+	if (!hctx->wait_on_sched_tag)
+		hctx->wait_on_sched_tag = alloc_percpu(unsigned long);
+
 	debugfs_create_files(q, hctx->debugfs_dir, hctx,
 			     blk_mq_debugfs_hctx_attrs);
 
@@ -684,6 +740,11 @@ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx)
 	debugfs_remove_recursive(hctx->debugfs_dir);
 	hctx->sched_debugfs_dir = NULL;
 	hctx->debugfs_dir = NULL;
+
+	free_percpu(hctx->wait_on_hw_tag);
+	hctx->wait_on_hw_tag = NULL;
+	free_percpu(hctx->wait_on_sched_tag);
+	hctx->wait_on_sched_tag = NULL;
 }
 
 void blk_mq_debugfs_register_hctxs(struct request_queue *q)
@@ -815,3 +876,26 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
 	debugfs_remove_recursive(hctx->sched_debugfs_dir);
 	hctx->sched_debugfs_dir = NULL;
 }
+
+/**
+ * blk_mq_debugfs_inc_wait_tags - increment the tag starvation counters
+ * @hctx: hardware context associated with the tag allocation
+ * @is_sched: true if the starved pool is the software scheduler
+ *
+ * Evaluates the exhausted tag pool and safely increments the appropriate
+ * per-cpu debugfs starvation counter.
+ *
+ * Note: A race window exists during rapid device probe or CPU hotplug
+ * where I/O might be submitted before blk_mq_debugfs_register_hctx() has
+ * completed allocating the per-CPU counters. Therefore, the pointer is
+ * explicitly checked to prevent a NULL pointer dereference.
+ */
+void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
+				  bool is_sched)
+{
+	unsigned long __percpu *tags = is_sched ? hctx->wait_on_sched_tag :
+						  hctx->wait_on_hw_tag;
+
+	if (likely(tags))
+		this_cpu_inc(*tags);
+}
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index 49bb1aaa83dc..a0094d004d08 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -17,6 +17,8 @@ struct blk_mq_debugfs_attr {
 	const struct seq_operations *seq_ops;
 };
 
+void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
+				  bool is_sched);
 int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq);
 int blk_mq_debugfs_rq_show(struct seq_file *m, void *v);
 
@@ -35,6 +37,11 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
 #else
+static inline void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
+						bool is_sched)
+{
+}
+
 static inline void blk_mq_debugfs_register(struct request_queue *q)
 {
 }
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 66138dd043d4..3cc6a97a87a0 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -17,6 +17,7 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
+#include "blk-mq-debugfs.h"
 
 /*
  * Recalculate wakeup batch when tag is shared by hctx.
@@ -191,6 +192,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		trace_block_rq_tag_wait(data->q, data->hctx,
 					data->rq_flags & RQF_SCHED_TAGS);
 
+		blk_mq_debugfs_inc_wait_tags(data->hctx,
+					     data->rq_flags & RQF_SCHED_TAGS);
+
 		bt_prev = bt;
 		io_schedule();
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ebc45557aee8..17cd6221bb93 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -453,6 +453,18 @@ struct blk_mq_hw_ctx {
 	struct dentry		*debugfs_dir;
 	/** @sched_debugfs_dir:	debugfs directory for the scheduler. */
 	struct dentry		*sched_debugfs_dir;
+	/**
+	 * @wait_on_hw_tag: Cumulative per-cpu counter incremented each
+	 * time a submitting context is forced to block due to physical
+	 * hardware tag exhaustion.
+	 */
+	unsigned long __percpu	*wait_on_hw_tag;
+	/**
+	 * @wait_on_sched_tag: Cumulative per-cpu counter incremented each
+	 * time a submitting context is forced to block due to software
+	 * scheduler tag exhaustion.
+	 */
+	unsigned long __percpu	*wait_on_sched_tag;
 #endif
 
 	/**
-- 
2.51.0


^ permalink raw reply related

* [PATCH v4 1/2] blk-mq: add tracepoint block_rq_tag_wait
From: Aaron Tomlin @ 2026-04-19  2:30 UTC (permalink / raw)
  To: axboe, rostedt, mhiramat, mathieu.desnoyers
  Cc: bvanassche, johannes.thumshirn, kch, dlemoal, ritesh.list,
	loberman, neelx, sean, mproche, chjohnst, linux-block,
	linux-kernel, linux-trace-kernel
In-Reply-To: <20260419023036.1419514-1-atomlin@atomlin.com>

In high-performance storage environments, particularly when utilising
RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe
latency spikes can occur when fast devices (SSDs) are starved of hardware
tags when sharing the same blk_mq_tag_set.

Currently, diagnosing this specific hardware queue contention is
difficult. When a CPU thread exhausts the tag pool, blk_mq_get_tag()
forces the current thread to block uninterruptible via io_schedule().
While this can be inferred via sched:sched_switch or dynamically
traced by attaching a kprobe to blk_mq_mark_tag_wait(), there is no
dedicated, out-of-the-box observability for this event.

This patch introduces the block_rq_tag_wait trace point in the tag
allocation slow-path. It triggers immediately before the thread yields
the CPU, exposing the exact hardware context (hctx) that is starved, the
specific pool experiencing starvation (hardware or software scheduler),
and the total pool depth.

This provides storage engineers and performance monitoring agents
with a zero-configuration, low-overhead mechanism to definitively
identify shared-tag bottlenecks and tune I/O schedulers or cgroup
throttling accordingly.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Laurence Oberman <loberman@redhat.com>
Tested-by: Laurence Oberman <loberman@redhat.com>
Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>
---
 block/blk-mq-tag.c           |  4 ++++
 include/trace/events/block.h | 43 ++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 33946cdb5716..66138dd043d4 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -13,6 +13,7 @@
 #include <linux/kmemleak.h>
 
 #include <linux/delay.h>
+#include <trace/events/block.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
@@ -187,6 +188,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != BLK_MQ_NO_TAG)
 			break;
 
+		trace_block_rq_tag_wait(data->q, data->hctx,
+					data->rq_flags & RQF_SCHED_TAGS);
+
 		bt_prev = bt;
 		io_schedule();
 
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 6aa79e2d799c..71554b94e4d0 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -226,6 +226,49 @@ DECLARE_EVENT_CLASS(block_rq,
 		  IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->comm)
 );
 
+/**
+ * block_rq_tag_wait - triggered when a request is starved of a tag
+ * @q: request queue of the target device
+ * @hctx: hardware context of the request experiencing starvation
+ * @is_sched_tag: indicates whether the starved pool is the software scheduler
+ *
+ * Called immediately before the submitting context is forced to block due
+ * to the exhaustion of available tags (i.e., physical hardware driver tags
+ * or software scheduler tags). This trace point indicates that the context
+ * will be placed into an uninterruptible state via io_schedule() until an
+ * active request completes and relinquishes its assigned tag.
+ */
+TRACE_EVENT(block_rq_tag_wait,
+
+	TP_PROTO(struct request_queue *q, struct blk_mq_hw_ctx *hctx, bool is_sched_tag),
+
+	TP_ARGS(q, hctx, is_sched_tag),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( u32,		hctx_id			)
+		__field( u32,		nr_tags			)
+		__field( bool,		is_sched_tag		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= disk_devt(q->disk);
+		__entry->hctx_id	= hctx->queue_num;
+		__entry->is_sched_tag	= is_sched_tag;
+
+		if (is_sched_tag)
+			__entry->nr_tags = hctx->sched_tags->nr_tags;
+		else
+			__entry->nr_tags = hctx->tags->nr_tags;
+	),
+
+	TP_printk("%d,%d hctx=%u starved on %s tags (depth=%u)",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->hctx_id,
+		  __entry->is_sched_tag ? "scheduler" : "hardware",
+		  __entry->nr_tags)
+);
+
 /**
  * block_rq_insert - insert block operation request into queue
  * @rq: block IO operation request
-- 
2.51.0


^ permalink raw reply related

* [PATCH v4 0/2] blk-mq: introduce tag starvation observability
From: Aaron Tomlin @ 2026-04-19  2:30 UTC (permalink / raw)
  To: axboe, rostedt, mhiramat, mathieu.desnoyers
  Cc: bvanassche, johannes.thumshirn, kch, dlemoal, ritesh.list,
	loberman, neelx, sean, mproche, chjohnst, linux-block,
	linux-kernel, linux-trace-kernel

Hi Jens, Steve, Masami,

In high-performance storage environments, particularly when utilising RAID
controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe latency
spikes can occur when fast devices are starved of available tags.
Currently, diagnosing this specific queue contention requires deploying
dynamic kprobes or inferring sleep states, which lacks a simple,
out-of-the-box diagnostic path.

This short series introduces dedicated, low-overhead observability for tag
exhaustion events in the block layer:

  - Patch 1 introduces the "block_rq_tag_wait" tracepoint in the tag
    allocation slow-path to capture precise, event-based starvation.

  - Patch 2 complements this by exposing "wait_on_hw_tag" and
    "wait_on_sched_tag" per-CPU counters via debugfs for quick,
    point-in-time cumulative polling.

Together, these provide storage engineers with zero-configuration
mechanisms to definitively identify shared-tag bottlenecks.

Please let me know your thoughts.


Changes since v3 [1]:
 - Transitioned tracking architecture from shared atomic_t variables to
   dynamically allocated per-CPU counters to resolve cache line bouncing
   (Bart Van Assche)

Changes since v2 [2]:
 - Added "Reviewed-by:" and "Tested-by:" tags for patch 1
 - Evaluate is_sched_tag directly within TP_fast_assign (Steven Rostedt)
 - Introduced atomic counters via debugfs 

Changes since v1 [3]:
 - Improved the description of the trace point (Damien Le Moal)
 - Removed the redundant "active requests" (Laurence Oberman)
 - Introduced pool-specific starvation tracking

[1]: https://lore.kernel.org/lkml/20260319221956.332770-1-atomlin@atomlin.com/
[2]: https://lore.kernel.org/lkml/20260319015300.287653-1-atomlin@atomlin.com/
[3]: https://lore.kernel.org/lkml/20260317182835.258183-1-atomlin@atomlin.com/

Aaron Tomlin (2):
  blk-mq: add tracepoint block_rq_tag_wait
  blk-mq: expose tag starvation counts via debugfs

 block/blk-mq-debugfs.c       | 84 ++++++++++++++++++++++++++++++++++++
 block/blk-mq-debugfs.h       |  7 +++
 block/blk-mq-tag.c           |  8 ++++
 include/linux/blk-mq.h       | 12 ++++++
 include/trace/events/block.h | 43 ++++++++++++++++++
 5 files changed, 154 insertions(+)

-- 
2.51.0


^ permalink raw reply

* Re: [RFC PATCH 1/2] kernel/notifier: replace single-linked list with double-linked list for reverse traversal
From: Song Chen @ 2026-04-19  0:21 UTC (permalink / raw)
  To: David Laight
  Cc: rafael, lenb, mturquette, sboyd, viresh.kumar, agk, snitzer,
	mpatocka, bmarzins, song, yukuai, linan122, jason.wessel, danielt,
	dianders, horms, davem, edumazet, kuba, pabeni, paulmck, frederic,
	mcgrof, petr.pavlu, da.gomez, samitolvanen, atomlin, jpoimboe,
	jikos, mbenes, pmladek, joe.lawrence, rostedt, mhiramat,
	mark.rutland, mathieu.desnoyers, linux-modules, linux-kernel,
	linux-trace-kernel, linux-acpi, linux-clk, linux-pm,
	live-patching, dm-devel, linux-raid, kgdb-bugreport, netdev
In-Reply-To: <20260416133004.07bd2886@pumpkin>

Hi,

On 4/16/26 20:30, David Laight wrote:
> On Wed, 15 Apr 2026 15:01:37 +0800
> chensong_2000@189.cn wrote:
> 
>> From: Song Chen <chensong_2000@189.cn>
>>
>> The current notifier chain implementation uses a single-linked list
>> (struct notifier_block *next), which only supports forward traversal
>> in priority order. This makes it difficult to handle cleanup/teardown
>> scenarios that require notifiers to be called in reverse priority order.
> 
> If it is only cleanup/teardown then the list can be order-reversed
> as part of that process at the same time as the list is deleted.
> 
> 	David
> 
> 
> 

Sorry, i don't follow, the notifiers in the list are deleted when 
calling notifier_chain_unregister, other than that, they are traversed 
forward and backward.

Song


^ permalink raw reply

* Re: [RFC PATCH 1/2] kernel/notifier: replace single-linked list with double-linked list for reverse traversal
From: Song Chen @ 2026-04-19  0:07 UTC (permalink / raw)
  To: Petr Mladek
  Cc: rafael, lenb, mturquette, sboyd, viresh.kumar, agk, snitzer,
	mpatocka, bmarzins, song, yukuai, linan122, jason.wessel, danielt,
	dianders, horms, davem, edumazet, kuba, pabeni, paulmck, frederic,
	mcgrof, petr.pavlu, da.gomez, samitolvanen, atomlin, jpoimboe,
	jikos, mbenes, joe.lawrence, rostedt, mhiramat, mark.rutland,
	mathieu.desnoyers, linux-modules, linux-kernel,
	linux-trace-kernel, linux-acpi, linux-clk, linux-pm,
	live-patching, dm-devel, linux-raid, kgdb-bugreport, netdev
In-Reply-To: <aeC7ByGA5MHBcGQR@pathway.suse.cz>

Hi,


On 4/16/26 18:33, Petr Mladek wrote:
> On Wed 2026-04-15 15:01:37, chensong_2000@189.cn wrote:
>> From: Song Chen <chensong_2000@189.cn>
>>
>> The current notifier chain implementation uses a single-linked list
>> (struct notifier_block *next), which only supports forward traversal
>> in priority order. This makes it difficult to handle cleanup/teardown
>> scenarios that require notifiers to be called in reverse priority order.
>>
>> A concrete example is the ordering dependency between ftrace and
>> livepatch during module load/unload. see the detail here [1].
>>
>> This patch replaces the single-linked list in struct notifier_block
>> with a struct list_head, converting the notifier chain into a
>> doubly-linked list sorted in descending priority order. Based on
>> this, a new function notifier_call_chain_reverse() is introduced,
>> which traverses the chain in reverse (ascending priority order).
>> The corresponding blocking_notifier_call_chain_reverse() is also
>> added as the locking wrapper for blocking notifier chains.
>>
>> The internal notifier_call_chain_robust() is updated to use
>> notifier_call_chain_reverse() for rollback: on error, it records
>> the failing notifier (last_nb) and the count of successfully called
>> notifiers (nr), then rolls back exactly those nr-1 notifiers in
>> reverse order starting from last_nb's predecessor, without needing
>> to know the total length of the chain.
>>
>> With this change, subsystems with symmetric setup/teardown ordering
>> requirements can register a single notifier_block with one priority
>> value, and rely on blocking_notifier_call_chain() for forward
>> traversal and blocking_notifier_call_chain_reverse() for reverse
>> traversal, without needing hard-coded call sequences or separate
>> notifier registrations for each direction.
>>
>> [1]:https://lore.kernel.org/all
>> 	/alpine.LNX.2.00.1602172216491.22700@cbobk.fhfr.pm/
>>
>> --- a/include/linux/notifier.h
>> +++ b/include/linux/notifier.h
>> @@ -53,41 +53,41 @@ typedef	int (*notifier_fn_t)(struct notifier_block *nb,
> [...]
>>   struct notifier_block {
>>   	notifier_fn_t notifier_call;
>> -	struct notifier_block __rcu *next;
>> +	struct list_head __rcu entry;
>>   	int priority;
>>   };
> [...]
>>   #define ATOMIC_INIT_NOTIFIER_HEAD(name) do {	\
>>   		spin_lock_init(&(name)->lock);	\
>> -		(name)->head = NULL;		\
>> +		INIT_LIST_HEAD(&(name)->head);		\
> 
> I would expect the RCU variant here, aka INIT_LIST_HEAD_RCU().

I'm not familiar with list rcu, but i will look into it and give it a try.
> 
>> --- a/kernel/notifier.c
>> +++ b/kernel/notifier.c
>> @@ -14,39 +14,47 @@
>>    *	are layered on top of these, with appropriate locking added.
>>    */
>>   
>> -static int notifier_chain_register(struct notifier_block **nl,
>> +static int notifier_chain_register(struct list_head *nl,
>>   				   struct notifier_block *n,
>>   				   bool unique_priority)
>>   {
>> -	while ((*nl) != NULL) {
>> -		if (unlikely((*nl) == n)) {
>> +	struct notifier_block *cur;
>> +
>> +	list_for_each_entry(cur, nl, entry) {
>> +		if (unlikely(cur == n)) {
>>   			WARN(1, "notifier callback %ps already registered",
>>   			     n->notifier_call);
>>   			return -EEXIST;
>>   		}
>> -		if (n->priority > (*nl)->priority)
>> -			break;
>> -		if (n->priority == (*nl)->priority && unique_priority)
>> +
>> +		if (n->priority == cur->priority && unique_priority)
>>   			return -EBUSY;
>> -		nl = &((*nl)->next);
>> +
>> +		if (n->priority > cur->priority) {
>> +			list_add_tail(&n->entry, &cur->entry);
>> +			goto out;
>> +		}
>>   	}
>> -	n->next = *nl;
>> -	rcu_assign_pointer(*nl, n);
>> +
>> +	list_add_tail(&n->entry, nl);
> 
> I would expect list_add_tail_rcu() here.
> 
>> @@ -59,25 +67,25 @@ static int notifier_chain_unregister(struct notifier_block **nl,
>>    *			value of this parameter is -1.
>>    *	@nr_calls:	Records the number of notifications sent. Don't care
>>    *			value of this field is NULL.
>> + *	@last_nb:  Records the last called notifier block for rolling back
>>    *	Return:		notifier_call_chain returns the value returned by the
>>    *			last notifier function called.
>>    */
>> -static int notifier_call_chain(struct notifier_block **nl,
>> +static int notifier_call_chain(struct list_head *nl,
>>   			       unsigned long val, void *v,
>> -			       int nr_to_call, int *nr_calls)
>> +			       int nr_to_call, int *nr_calls,
>> +				   struct notifier_block **last_nb)
>>   {
>>   	int ret = NOTIFY_DONE;
>> -	struct notifier_block *nb, *next_nb;
>> -
>> -	nb = rcu_dereference_raw(*nl);
>> +	struct notifier_block *nb;
>>   
>> -	while (nb && nr_to_call) {
>> -		next_nb = rcu_dereference_raw(nb->next);
>> +	if (!nr_to_call)
>> +		return ret;
>>   
>> +	list_for_each_entry(nb, nl, entry) {
> 
> I would expect the RCU variant here, aka list_for_each_rcu()
> 
> These are just two random examples which I found by a quick look.
> 
> I guess that the notifier API is very old and it does not use all
> the RCU API features which allow to track safety when
> CONFIG_PROVE_RCU and CONFIG_PROVE_RCU_LIST are enabled.
> 
> It actually might be worth to audit the code and make it right.
> 
>>   #ifdef CONFIG_DEBUG_NOTIFIERS
>>   		if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
>>   			WARN(1, "Invalid notifier called!");
>> -			nb = next_nb;
>>   			continue;
>>   		}
>>   #endif
> 
> That said, I am not sure if the ftrace/livepatching handlers are
> the right motivation for this. Especially when I see the
> complexity of the 2nd patch [*]
> 
> After thinking more about it. I am not even sure that the ftrace and
> livepatching callbacks are good candidates for generic notifiers.
> They are too special. It is not only about ordering them against
> each other. But it is also about ordering them against other
> notifiers. The ftrace/livepatching callbacks must be the first/last
> during module load/release.
> 
> [*] The 2nd patch is not archived by lore for some reason.
>      I have found only a review but it gives a good picture, see
>      https://lore.kernel.org/all/1191caf5-6a61-4622-a15e-854d3701f4fc@suse.com/
> 
> Best Regards,
> Petr
> 

Thanks.

BR

Song

^ permalink raw reply

* Re: [PATCH v2 00/19] tracepoint: Avoid double static_branch evaluation at guarded call sites
From: Steven Rostedt @ 2026-04-18 23:04 UTC (permalink / raw)
  To: Vineeth Pillai (Google)
  Cc: Peter Zijlstra, Dmitry Ilvokhin, Masami Hiramatsu,
	Mathieu Desnoyers, Ingo Molnar, Jens Axboe, io-uring,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Alexei Starovoitov, Daniel Borkmann, Marcelo Ricardo Leitner,
	Xin Long, Jon Maloy, Aaron Conole, Eelco Chaudron, Ilya Maximets,
	netdev, bpf, linux-sctp, tipc-discussion, dev, Jiri Pirko,
	Oded Gabbay, Koby Elbaz, dri-devel, Rafael J. Wysocki,
	Viresh Kumar, Gautham R. Shenoy, Huang Rui, Mario Limonciello,
	Len Brown, Srinivas Pandruvada, linux-pm, MyungJoo Ham,
	Kyungmin Park, Chanwoo Choi, Christian König, Sumit Semwal,
	linaro-mm-sig, Eddie James, Andrew Jeffery, Joel Stanley,
	linux-fsi, David Airlie, Simona Vetter, Alex Deucher,
	Danilo Krummrich, Matthew Brost, Philipp Stanner, Harry Wentland,
	Leo Li, amd-gfx, Jiri Kosina, Benjamin Tissoires, linux-input,
	Wolfram Sang, linux-i2c, Mark Brown, Michael Hennerich,
	Nuno Sá, linux-spi, James E.J. Bottomley, Martin K. Petersen,
	linux-scsi, Chris Mason, David Sterba, linux-btrfs,
	Thomas Gleixner, Andrew Morton, SeongJae Park, linux-mm,
	Borislav Petkov, Dave Hansen, x86, linux-trace-kernel,
	linux-kernel
In-Reply-To: <20260323160052.17528-1-vineeth@bitbyteword.org>

On Mon, 23 Mar 2026 12:00:19 -0400
"Vineeth Pillai (Google)" <vineeth@bitbyteword.org> wrote:

>   if (trace_foo_enabled() && cond)
>       trace_call__foo(args);   /* calls __do_trace_foo() directly */

Hi Vineeth,

Could you rebase this series on top of 7.1-rc1 when it comes out?
Several of these patches were accepted already. Obviously drop those.
They were the patches that added the feature, and any where the
maintainer acked the patch.

Now that the feature has been accepted, if you post the patch series
again after 7.1-rc1 with all the patches that haven't been accepted
yet, then the maintainers can simply take them directly. As the feature
is now accepted, there's no dependency on it, and they don't need to go
through the tracing tree.

Thanks,

-- Steve

^ permalink raw reply

* Re: [PATCH] trace: propagate registration failure from tracing_start_*_record()
From: Steven Rostedt @ 2026-04-18 19:52 UTC (permalink / raw)
  To: Yash Suthar
  Cc: mhiramat, mathieu.desnoyers, linux-kernel, linux-trace-kernel,
	skhan, me, syzbot+a1d25e53cd4a10f7f2d3
In-Reply-To: <CAPfzD4kQg94qA0oRd+=JOju=+a76872WCsBSo8=aNG5QduNQbg@mail.gmail.com>

On Sat, 18 Apr 2026 11:08:42 +0530
Yash Suthar <yashsuthar983@gmail.com> wrote:

> Hello Steven,
> Thank you for taking a look and really sorry.
> 
> I did use ai assistance for commit message, but I reviewed, modified
> and tested(with syzbot locally) the code myself. I should have
> disclosed really sorry.
> 
> One thing I want to know (or I am still missing something):
> sched_cmdline_ref is incremented before tracing_sched_register() and
> register fails,  but sched_cmdline_ref stays at 1 and on disable
> tracepoint_remove_func() sees NULL and return error (as syzbot
> reported and reproduce also locally).
> your suggestion WARN_ONCE correctly flags the upstream failure, but
> the secondary WARN at  tracepoint.c:358 will still fire on the next
> disable, since the refcount desync isn't addressed. Was that
> intentional ?

Yes.

This is why I'm not too thrilled about syzbot injecting kmalloc
failures. These injections are for one page or less, in which case the
system is in pretty much a failed state anyway.

I don't care about warnings being triggering due to kmalloc failures.
I'll fix UAF or NULL pointer dereferences, but warnings? No!

If you fail to alloc a single page, expect a lot of warnings to happen.
That is intentional. syzbot should not flag an error for a warning that
was triggered due to a single page memory failure, unless that memory
was allocated by in atomic or something where it can't do reclaim.

-- Steve

^ permalink raw reply

* [PATCH] eventfs: Hold eventfs_mutex and SRCU when remount walks events
From: David Carlier @ 2026-04-18 19:17 UTC (permalink / raw)
  To: rostedt, mhiramat
  Cc: mathieu.desnoyers, linux-trace-kernel, linux-kernel,
	David Carlier, stable

Commit 340f0c7067a9 ("eventfs: Update all the eventfs_inodes from the
events descriptor") had eventfs_set_attrs() recurse through ei->children
on remount.  The walk only holds the rcu_read_lock() taken by
tracefs_apply_options() over tracefs_inodes, which is wrong:

  - list_for_each_entry over ei->children races with the list_del_rcu()
    in eventfs_remove_rec() -- LIST_POISON1 deref, same shape as
    d2603279c7d6.
  - eventfs_inodes are freed via call_srcu(&eventfs_srcu, ...).
    rcu_read_lock() does not extend an SRCU grace period, so ti->private
    can be reclaimed under the walk.
  - The writes to ei->attr race with eventfs_set_attr(), which holds
    eventfs_mutex.

Reproducer:

  while :; do mount -o remount,uid=$((RANDOM%1000)) /sys/kernel/tracing; done &
  while :; do
      echo "p:kp submit_bio" > /sys/kernel/tracing/kprobe_events
      echo > /sys/kernel/tracing/kprobe_events
  done

Wrap the events portion of tracefs_apply_options() in
eventfs_remount_lock()/_unlock() that take eventfs_mutex and
srcu_read_lock(&eventfs_srcu).  eventfs_set_attrs() doesn't sleep so the
nested rcu_read_lock() is fine; lockdep_assert_held() pins the contract.

Comment in tracefs_drop_inode() said "RCU cycle" -- it is SRCU.

Fixes: 340f0c7067a9 ("eventfs: Update all the eventfs_inodes from the events descriptor")
Cc: stable@vger.kernel.org
Signed-off-by: David Carlier <devnexen@gmail.com>
---
 fs/tracefs/event_inode.c | 14 ++++++++++++++
 fs/tracefs/inode.c       |  5 ++++-
 fs/tracefs/internal.h    |  3 +++
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 81df94038f2e..79193021c6b0 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -244,6 +244,8 @@ static void eventfs_set_attrs(struct eventfs_inode *ei, bool update_uid, kuid_t
 {
 	struct eventfs_inode *ei_child;
 
+	lockdep_assert_held(&eventfs_mutex);
+
 	/* Update events/<system>/<event> */
 	if (WARN_ON_ONCE(level > 3))
 		return;
@@ -886,3 +888,15 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei)
 	d_invalidate(dentry);
 	d_make_discardable(dentry);
 }
+
+int eventfs_remount_lock(void)
+{
+	mutex_lock(&eventfs_mutex);
+	return srcu_read_lock(&eventfs_srcu);
+}
+
+void eventfs_remount_unlock(int srcu_idx)
+{
+	srcu_read_unlock(&eventfs_srcu, srcu_idx);
+	mutex_unlock(&eventfs_mutex);
+}
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 03f768536fd5..f3d6188a3b7b 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -313,6 +313,7 @@ static int tracefs_apply_options(struct super_block *sb, bool remount)
 	struct inode *inode = d_inode(sb->s_root);
 	struct tracefs_inode *ti;
 	bool update_uid, update_gid;
+	int srcu_idx;
 	umode_t tmp_mode;
 
 	/*
@@ -337,6 +338,7 @@ static int tracefs_apply_options(struct super_block *sb, bool remount)
 		update_uid = fsi->opts & BIT(Opt_uid);
 		update_gid = fsi->opts & BIT(Opt_gid);
 
+		srcu_idx = eventfs_remount_lock();
 		rcu_read_lock();
 		list_for_each_entry_rcu(ti, &tracefs_inodes, list) {
 			if (update_uid) {
@@ -358,6 +360,7 @@ static int tracefs_apply_options(struct super_block *sb, bool remount)
 				eventfs_remount(ti, update_uid, update_gid);
 		}
 		rcu_read_unlock();
+		eventfs_remount_unlock(srcu_idx);
 	}
 
 	return 0;
@@ -403,7 +406,7 @@ static int tracefs_drop_inode(struct inode *inode)
 	 * This inode is being freed and cannot be used for
 	 * eventfs. Clear the flag so that it doesn't call into
 	 * eventfs during the remount flag updates. The eventfs_inode
-	 * gets freed after an RCU cycle, so the content will still
+	 * gets freed after an SRCU cycle, so the content will still
 	 * be safe if the iteration is going on now.
 	 */
 	ti->flags &= ~TRACEFS_EVENT_INODE;
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index d83c2a25f288..a4a7f8431aff 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -76,4 +76,7 @@ struct inode *tracefs_get_inode(struct super_block *sb);
 void eventfs_remount(struct tracefs_inode *ti, bool update_uid, bool update_gid);
 void eventfs_d_release(struct dentry *dentry);
 
+int eventfs_remount_lock(void);
+void eventfs_remount_unlock(int srcu_idx);
+
 #endif /* _TRACEFS_INTERNAL_H */
-- 
2.53.0


^ permalink raw reply related

* [PATCH] eventfs: Use list_add_tail_rcu() for SRCU-protected children list
From: David Carlier @ 2026-04-18 15:22 UTC (permalink / raw)
  To: rostedt, mhiramat
  Cc: mathieu.desnoyers, linux-trace-kernel, linux-kernel,
	David Carlier, stable

Commit d2603279c7d6 ("eventfs: Use list_del_rcu() for SRCU protected
list variable") converted the removal side to pair with the
list_for_each_entry_srcu() walker in eventfs_iterate(). The insertion
in eventfs_create_dir() was left as a plain list_add_tail(), which on
weakly-ordered architectures can expose a new entry to the SRCU reader
before its list pointers and fields are observable.

Use list_add_tail_rcu() so the publication pairs with the existing
list_del_rcu() and list_for_each_entry_srcu().

Fixes: 43aa6f97c2d0 ("eventfs: Get rid of dentry pointers without refcounts")
Cc: stable@vger.kernel.org
Signed-off-by: David Carlier <devnexen@gmail.com>
---
 fs/tracefs/event_inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 81df94038f2e..8dd554508828 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -706,7 +706,7 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode

 	scoped_guard(mutex, &eventfs_mutex) {
 		if (!parent->is_freed)
-			list_add_tail(&ei->list, &parent->children);
+			list_add_tail_rcu(&ei->list, &parent->children);
 	}
 	/* Was the parent freed? */
 	if (list_empty(&ei->list)) {
-- 
2.53.0

^ permalink raw reply related

* Re: [PATCHv5 bpf-next 10/28] bpf: Add multi tracing attach types
From: bot+bpf-ci @ 2026-04-18  6:49 UTC (permalink / raw)
  To: jolsa, ast, daniel, andrii
  Cc: bpf, linux-trace-kernel, kafai, eddyz87, songliubraving, yhs,
	menglong8.dong, rostedt, ast, andrii, daniel, martin.lau, eddyz87,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260417192502.194548-11-jolsa@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 2812 bytes --]

> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1154,6 +1154,8 @@ enum bpf_attach_type {
>  	BPF_TRACE_KPROBE_SESSION,
>  	BPF_TRACE_UPROBE_SESSION,
>  	BPF_TRACE_FSESSION,
> +	BPF_TRACE_FENTRY_MULTI,
> +	BPF_TRACE_FEXIT_MULTI,
>  	__MAX_BPF_ATTACH_TYPE
>  };

In RFC v2 (Feb 4, 2026), Leon Hwang asked whether BPF_TRACE_FSESSION_MULTI
should be added alongside BPF_TRACE_FENTRY_MULTI and BPF_TRACE_FEXIT_MULTI
for API consistency, since BPF_TRACE_FSESSION exists and is handled
similarly to FENTRY/FEXIT. You acknowledged this with 'good catch, will add
it' but BPF_TRACE_FSESSION_MULTI was never added in any subsequent version
(v1, v2, v3, v4, or v5).

Is this attach type intentionally omitted, or should it be included to
complete the multi tracing API?

Reference: https://lore.kernel.org/bpf/3015991d-2e9b-4bf4-9c17-26e00b5d5926@linux.dev/

[ ... ]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c

[ ... ]

> @@ -19368,7 +19379,17 @@
>  		if (ret < 0)
>  			return ret;
>
> -		if (tgt_prog) {
> +		/* *.multi programs don't need an address during program
> +		 * verification, we just take the module ref if needed.
> +		 */
> +		if (is_tracing_multi_id(prog, btf_id)) {
> +			if (btf_is_module(btf)) {
> +				mod = btf_try_get_module(btf);
> +				if (!mod)
> +					return -ENOENT;
> +			}
> +			addr = 0;
> +		} else if (tgt_prog) {

This isn't a bug, but the new multi-line comment here uses the old kernel
comment style with text on the same line as the opening '/*'. The BPF
subsystem guide mandates the modern multi-line style with '/*' on its own
line for new code under kernel/bpf/, even if surrounding code still uses
the old style.

Should the comment be reformatted as:

	/*
	 * *.multi programs don't need an address during program
	 * verification, we just take the module ref if needed.
	 */

> @@ -19396,6 +19417,12 @@
>  			ret = -EINVAL;
>  			switch (prog->type) {
>  			case BPF_PROG_TYPE_TRACING:
> +				/* *.multi sleepable programs will pass initial sleepable check,
> +				 * the actual attached btf ids are checked later during the link
> +				 * attachment.
> +				 */
> +				if (is_tracing_multi_id(prog, btf_id))
> +					ret = 0;

The same comment style issue appears here. Should this also use the modern
style with '/*' on its own line?

Note that another comment you added in check_attach_btf_id uses the correct
modern style, making this inconsistency clearly fixable.

---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/24598000047

^ permalink raw reply

* Re: [PATCHv5 bpf-next 04/28] bpf: Use mutex lock pool for bpf trampolines
From: bot+bpf-ci @ 2026-04-18  6:49 UTC (permalink / raw)
  To: jolsa, ast, daniel, andrii
  Cc: bpf, linux-trace-kernel, kafai, eddyz87, songliubraving, yhs,
	menglong8.dong, rostedt, ast, andrii, daniel, martin.lau, eddyz87,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260417192502.194548-5-jolsa@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 2035 bytes --]

> diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
> index f02254a21585..eb4ea78ff77f 100644
> --- a/kernel/bpf/trampoline.c
> +++ b/kernel/bpf/trampoline.c
> @@ -30,6 +30,34 @@ static struct hlist_head trampoline_ip_table[TRAMPOLINE_TABLE_SIZE];
>  /* serializes access to trampoline tables */
>  static DEFINE_MUTEX(trampoline_mutex);
>
> +/*
> + * We keep 32 trampoline locks (5 bits) in the pool, because there is
> + * 48 (MAX_LOCK_DEPTH) locks limit allowed to be simultaneously held
> + * by task. Each lock has its own lockdep key to keep it simple.
> + */

The comment explains why the pool has 32 locks (MAX_LOCK_DEPTH limit), but
could it clarify why each lock needs its own lock_class_key?

Without this detail, it's not obvious that distinct keys are needed to avoid
lockdep "recursive locking" warnings when trampoline_lock_all() acquires all
32 pool mutexes simultaneously (presumably in following changes).

This was raised by bot+bpf-ci@kernel.org in v3 review:

  "Comment explains 32 count but not why per-lock class keys needed; should
   mention avoiding lockdep recursive locking warnings"
  https://lore.kernel.org/bpf/31ae46274a3157f2b9840e1a09b2698d1ec0cfd461737ff460c2d3349a9f0054@mail.kernel.org/

The concern was acknowledged with "will add", but the v5 comment still says
only "to keep it simple" without addressing the lockdep recursive locking
concern.

The same issue was raised again in v5:

  https://lore.kernel.org/bpf/53b3e6cb10e12555d3b1b321eeb4e04a447c707c387568d1e9ef759fb6459ef4@mail.kernel.org/

> +#define TRAMPOLINE_LOCKS_BITS 5
> +#define TRAMPOLINE_LOCKS_TABLE_SIZE (1 << TRAMPOLINE_LOCKS_BITS)
> +
> +static struct {
> +	struct mutex mutex;
> +	struct lock_class_key key;
> +} trampoline_locks[TRAMPOLINE_LOCKS_TABLE_SIZE];

[ ... ]


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/24598000047

^ permalink raw reply

* Re: [PATCHv5 bpf-next 20/28] libbpf: Add support to create tracing multi link
From: bot+bpf-ci @ 2026-04-18  6:10 UTC (permalink / raw)
  To: jolsa, ast, daniel, andrii
  Cc: bpf, linux-trace-kernel, kafai, eddyz87, songliubraving, yhs,
	menglong8.dong, rostedt, ast, andrii, daniel, martin.lau, eddyz87,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260417192502.194548-21-jolsa@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 2170 bytes --]

> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index 49368ee161b1..9bf7e8754c7b 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c

[ ... ]

> @@ -12510,6 +12592,197 @@ bool btf_type_is_traceable_func(const struct btf *btf, const struct btf_type *t
>  	return true;
>  }
>
> +static int collect_func_ids_by_glob(const struct bpf_program *prog, const char *pattern, __u32 **ids)
> +{
> +	struct bpf_object *obj = prog->obj;
> +	const struct module_btf *mod;
> +	struct btf *btf = NULL;
> +	const char *sep;
> +	int err;
> +
> +	err = bpf_object__load_vmlinux_btf(obj, true);
> +	if (err)
> +		return err;
> +
> +	/* In case we have module specified, we will find its btf and use that. */
> +	sep = strchr(pattern, ':');
> +	if (sep) {
> +		mod = find_attach_module(obj, pattern);
> +		if (!mod) {
> +			err = -EINVAL;
> +			goto cleanup;
> +		}
> +		btf = mod->btf;
> +		pattern = sep + 1;
> +	} else {
> +		/* Program is loaded for kernel module. */
> +		if (prog->attach_btf_obj_fd)
> +			return -EINVAL;
> +		btf = obj->btf_vmlinux;
> +	}
> +
> +	err = collect_btf_func_ids_by_glob(btf, pattern, ids);
> +
> +cleanup:
> +	bpf_object_cleanup_btf(obj);
> +	return err;
> +}

Does this function leak the vmlinux BTF when prog->attach_btf_obj_fd is
set? The early return bypasses the cleanup label that calls
bpf_object_cleanup_btf(obj). Every other error path goes through
cleanup, which suggests the BTF loaded by bpf_object__load_vmlinux_btf()
needs to be cleaned up before returning.

The design pattern here is that bpf_object__load_vmlinux_btf(obj, true)
is called specifically at attach-time (after the object has already been
post-load-cleaned), so obj->btf_vmlinux is freshly loaded. Skipping the
cleanup leaves that freshly-loaded btf_vmlinux attached to the bpf_object
until bpf_object__close().

Should this be 'err = -EINVAL; goto cleanup;' to match the other error
paths?

[ ... ]


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/24598000047

^ permalink raw reply

* Re: [PATCHv5 bpf-next 12/28] bpf: Add bpf_trampoline_multi_attach/detach functions
From: bot+bpf-ci @ 2026-04-18  6:10 UTC (permalink / raw)
  To: jolsa, ast, daniel, andrii
  Cc: bpf, linux-trace-kernel, kafai, eddyz87, songliubraving, yhs,
	menglong8.dong, rostedt, ast, andrii, daniel, martin.lau, eddyz87,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260417192502.194548-13-jolsa@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 14299 bytes --]

> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h

[ ... ]

> diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
> --- a/kernel/bpf/trampoline.c
> +++ b/kernel/bpf/trampoline.c
> @@ -1447,6 +1447,276 @@ int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
>  	return -ENOTSUPP;
>  }
>
> +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \
> +    defined(CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS) && \
> +    defined(CONFIG_BPF_SYSCALL)
> +
> +static void trampoline_lock_all(void)
> +{
> +	int i;
> +
> +	for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++)
> +		mutex_lock(&trampoline_locks[i].mutex);
> +}
> +
> +static void trampoline_unlock_all(void)
> +{
> +	int i;
> +
> +	for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++)
> +		mutex_unlock(&trampoline_locks[i].mutex);
> +}
> +
> +static void remove_tracing_multi_data(struct bpf_tracing_multi_data *data)
> +{
> +	ftrace_hash_remove(data->reg);
> +	ftrace_hash_remove(data->unreg);
> +	ftrace_hash_remove(data->modify);
> +}
> +
> +static void clear_tracing_multi_data(struct bpf_tracing_multi_data *data)
> +{
> +	remove_tracing_multi_data(data);
> +
> +	free_ftrace_hash(data->reg);
> +	free_ftrace_hash(data->unreg);
> +	free_ftrace_hash(data->modify);
> +}
> +
> +static int init_tracing_multi_data(struct bpf_tracing_multi_data *data)
> +{
> +	data->reg    = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
> +	data->unreg  = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
> +	data->modify = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
> +
> +	if (!data->reg || !data->unreg || !data->modify) {
> +		clear_tracing_multi_data(data);
> +		return -ENOMEM;
> +	}
> +	return 0;
> +}
> +
> +static void ftrace_hash_add(struct ftrace_hash *hash, struct ftrace_func_entry *entry,
> +			    unsigned long ip, unsigned long direct)
> +{
> +	entry->ip = ip;
> +	entry->direct = direct;
> +	add_ftrace_hash_entry(hash, entry);
> +}
> +
> +static int register_fentry_multi(struct bpf_trampoline *tr, struct bpf_tramp_image *im, void *ptr)
> +{
> +	unsigned long addr = (unsigned long) im->image;
> +	unsigned long ip = ftrace_location(tr->ip);
> +	struct bpf_tracing_multi_data *data = ptr;
> +
> +	if (bpf_trampoline_use_jmp(tr->flags))
> +		addr = ftrace_jmp_set(addr);
> +
> +	ftrace_hash_add(data->reg, data->entry, ip, addr);
> +	tr->cur_image = im;
> +	return 0;
> +}
> +
> +static int unregister_fentry_multi(struct bpf_trampoline *tr, u32 orig_flags, void *ptr)
> +{
> +	unsigned long addr = (unsigned long) tr->cur_image->image;
> +	unsigned long ip = ftrace_location(tr->ip);
> +	struct bpf_tracing_multi_data *data = ptr;
> +
> +	if (bpf_trampoline_use_jmp(tr->flags))
> +		addr = ftrace_jmp_set(addr);
> +
> +	ftrace_hash_add(data->unreg, data->entry, ip, addr);
> +	tr->cur_image = NULL;
> +	return 0;
> +}
> +
> +static int modify_fentry_multi(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im,
> +			       bool lock_direct_mutex, void *ptr)
> +{
> +	unsigned long addr = (unsigned long) im->image;
> +	unsigned long ip = ftrace_location(tr->ip);
> +	struct bpf_tracing_multi_data *data = ptr;
> +
> +	if (bpf_trampoline_use_jmp(tr->flags))
> +		addr = ftrace_jmp_set(addr);
> +
> +	ftrace_hash_add(data->modify, data->entry, ip, addr);
> +	tr->cur_image = im;
> +	return 0;
> +}
> +
> +static const struct bpf_trampoline_ops trampoline_multi_ops = {
> +	.register_fentry   = register_fentry_multi,
> +	.unregister_fentry = unregister_fentry_multi,
> +	.modify_fentry     = modify_fentry_multi,
> +};
> +
> +static void bpf_trampoline_multi_attach_init(struct bpf_trampoline *tr)
> +{
> +	tr->multi_attach.old_image = tr->cur_image;
> +	tr->multi_attach.old_flags = tr->flags;
> +}
> +
> +static void bpf_trampoline_multi_attach_free(struct bpf_trampoline *tr)
> +{
> +	if (tr->multi_attach.old_image)
> +		bpf_tramp_image_put(tr->multi_attach.old_image);
> +
> +	tr->multi_attach.old_image = NULL;
> +	tr->multi_attach.old_flags = 0;
> +}
> +
> +static void bpf_trampoline_multi_attach_rollback(struct bpf_trampoline *tr)
> +{
> +	if (tr->cur_image)
> +		bpf_tramp_image_put(tr->cur_image);
> +	tr->cur_image = tr->multi_attach.old_image;
> +	tr->flags = tr->multi_attach.old_flags;
> +
> +	tr->multi_attach.old_image = NULL;
> +	tr->multi_attach.old_flags = 0;
> +}
> +
> +#define for_each_mnode_cnt(mnode, link, cnt) \
> +	for (i = 0, mnode = &link->nodes[i]; i < cnt; i++, mnode = &link->nodes[i])
> +
> +#define for_each_mnode(mnode, link) \
> +	for_each_mnode_cnt(mnode, link, link->nodes_cnt)
> +
> +int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids,
> +				struct bpf_tracing_multi_link *link)
> +{
> +	struct bpf_tracing_multi_data *data = &link->data;
> +	struct bpf_attach_target_info tgt_info = {};
> +	struct btf *btf = prog->aux->attach_btf;
> +	struct bpf_tracing_multi_node *mnode;
> +	struct bpf_trampoline *tr;
> +	int i, err, rollback_cnt;
> +	u64 key;
> +
> +	for_each_mnode(mnode, link) {
> +		rollback_cnt = i;
> +
> +		err = bpf_check_attach_btf_id_multi(btf, prog, ids[i], &tgt_info);
> +		if (err)
> +			goto rollback_put;
> +
> +		key = bpf_trampoline_compute_key(NULL, btf, ids[i]);
> +
> +		tr = bpf_trampoline_get(key, &tgt_info);
> +		if (!tr) {
> +			err = -ENOMEM;
> +			goto rollback_put;
> +		}
> +
> +		mnode->trampoline = tr;
> +		mnode->node.link = &link->link;
> +
> +		cond_resched();
> +	}
> +
> +	err = init_tracing_multi_data(data);
> +	if (err) {
> +		rollback_cnt = link->nodes_cnt;
> +		goto rollback_put;
> +	}
> +
> +	trampoline_lock_all();
> +
> +	for_each_mnode(mnode, link) {
> +		bpf_trampoline_multi_attach_init(mnode->trampoline);
> +
> +		data->entry = &mnode->entry;
> +		err = __bpf_trampoline_link_prog(&mnode->node, mnode->trampoline, NULL,
> +						 &trampoline_multi_ops, data);
> +		if (err) {
> +			rollback_cnt = i;
> +			goto rollback_unlink;
> +		}
> +	}

When user-provided ids[] contains duplicate BTF IDs (or distinct IDs
that resolve to the same trampoline key), multiple nodes point to the
same struct bpf_trampoline. The link loop above then calls
bpf_trampoline_multi_attach_init() more than once on that trampoline,
overwriting the saved old_image with the newly assigned cur_image from
the previous iteration.

Scenario with ids[0] == ids[1] and trampoline X starting with OLD_X:

  i=0: attach_init(X) saves old_image=OLD_X
       __bpf_trampoline_link_prog() -> modify_fentry_multi() sets
       X->cur_image=NEW_X (OLD_X refcount not dropped, intent is for
       multi_attach_free() to release it later).

  i=1: attach_init(X) re-runs on the same trampoline and overwrites
       old_image=NEW_X (the only saved reference to OLD_X is lost).
       __bpf_trampoline_link_prog() returns -EBUSY (duplicate prog).
       rollback_cnt=i=1; goto rollback_unlink.

  rollback_unlink calls bpf_trampoline_multi_attach_rollback(X) once:

    static void bpf_trampoline_multi_attach_rollback(struct bpf_trampoline *tr)
    {
        if (tr->cur_image)
            bpf_tramp_image_put(tr->cur_image);    /* puts NEW_X */
        tr->cur_image = tr->multi_attach.old_image;  /* = NEW_X (stale!) */
        ...
    }

Result: OLD_X leaks (its refcount is never decremented). cur_image
points at NEW_X which was just released via bpf_tramp_image_put().

The rollback_put loop calls bpf_trampoline_put() which frees the
trampoline (prog was removed, progs_hlist is empty), so the dangling
NEW_X pointer is cleaned up. However OLD_X remains allocated via
module_alloc forever.

Reachability: commit dbf2afe2f603 ("bpf: Add support for tracing multi
link") copies ids[] from userspace without deduplication. The series'
selftests (commit 411fb40d4b2a "selftests/bpf: Add tracing multi attach
fails test", 'fail#7 (kernel) attach with duplicate id') exercise this
path explicitly, expecting -EBUSY.

The trigger requires the shared trampoline to already have cur_image !=
NULL when the first iteration runs (i.e., another program attached to
the same function), so modify_fentry_multi() is used and there is a
real OLD_X to leak. The first-time-attach case (register_fentry_multi,
OLD_X=NULL) only leaves cur_image dangling briefly until the trampoline
is freed.

Can the code save old_image/old_flags once per trampoline instead of
once per node, or store them per-mnode to avoid overwriting the saved
state?

> +
> +	rollback_cnt = link->nodes_cnt;
> +	if (ftrace_hash_count(data->reg)) {
> +		err = update_ftrace_direct_add(&direct_ops, data->reg);
> +		if (err)
> +			goto rollback_unlink;
> +	}
> +
> +	if (ftrace_hash_count(data->modify)) {
> +		err = update_ftrace_direct_mod(&direct_ops, data->modify, true);
> +		if (err) {
> +			WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->reg));
> +			                                                       ^^^^^^^^^^
> +			goto rollback_unlink;
> +		}
> +	}

The recovery path unconditionally calls update_ftrace_direct_del with
data->reg, but update_ftrace_direct_add is only invoked when data->reg
is non-empty:

    if (ftrace_hash_count(data->reg)) {
        err = update_ftrace_direct_add(&direct_ops, data->reg);
        ...
    }

When update_ftrace_direct_mod fails and all trampolines already had
cur_image != NULL (so nothing was added to data->reg),
update_ftrace_direct_del is called with an empty hash.
update_ftrace_direct_del at kernel/trace/ftrace.c:6511 checks:

    if (!hash_count(hash))
        return -EINVAL;

so it returns -EINVAL and the WARN_ON_ONCE fires spuriously. Should
this guard match the add path?

    if (ftrace_hash_count(data->reg))
        WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->reg));

> +
> +	for_each_mnode(mnode, link)
> +		bpf_trampoline_multi_attach_free(mnode->trampoline);
> +
> +	trampoline_unlock_all();
> +
> +	remove_tracing_multi_data(data);
> +	return 0;
> +
> +rollback_unlink:
> +	for_each_mnode_cnt(mnode, link, rollback_cnt) {
> +		bpf_trampoline_remove_prog(mnode->trampoline, &mnode->node);
> +		bpf_trampoline_multi_attach_rollback(mnode->trampoline);
> +	}
> +
> +	trampoline_unlock_all();
> +
> +	clear_tracing_multi_data(data);
> +	rollback_cnt = link->nodes_cnt;
> +
> +rollback_put:
> +	for_each_mnode_cnt(mnode, link, rollback_cnt)
> +		bpf_trampoline_put(mnode->trampoline);
> +
> +	return err;
> +}

[ ... ]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -19658,6 +19658,58 @@ struct bpf_attach_target_info tgt_info = {};
>  	return 0;
>  }
>
> +int bpf_check_attach_btf_id_multi(struct btf *btf, struct bpf_prog *prog, u32 btf_id,
> +				  struct bpf_attach_target_info *tgt_info)
> +{
> +	const struct btf_type *t;
> +	unsigned long addr;
> +	const char *tname;
> +	int err;
> +
> +	if (!btf_id || !btf)
> +		return -EINVAL;
> +
> +	/* Check noreturn attachment. */
> +	if (prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI ||
> +	     btf_id_set_contains(&noreturn_deny, btf_id))
> +		return -EINVAL;

Operator precedence issue in the noreturn check. The two subexpressions
are joined by ||, so the function returns -EINVAL whenever EITHER:

  (a) the program's expected_attach_type is BPF_TRACE_FEXIT_MULTI
      (regardless of whether btf_id targets a noreturn function), OR

  (b) the btf_id is in the noreturn_deny set (regardless of program
      attach type, rejecting FENTRY_MULTI attachments to noreturn
      functions as well).

The intent is to reject only FEXIT-like multi attachments to noreturn
functions. The non-multi path in check_attach_btf_id() correctly uses
&&:

    } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
               prog->expected_attach_type == BPF_TRACE_FSESSION ||
               prog->expected_attach_type == BPF_MODIFY_RETURN) &&
               btf_id_set_contains(&noreturn_deny, btf_id)) {
        ...
        return -EINVAL;
    }

Consequences once bpf_trampoline_multi_attach() is wired up from
user-space (commit dbf2afe2f603 "bpf: Add support for tracing multi
link"):

  1. Every BPF_TRACE_FEXIT_MULTI program attach request is rejected
     with -EINVAL, even when the target function returns normally. This
     disables BPF_TRACE_FEXIT_MULTI entirely.

  2. Any tracing_multi program (including BPF_TRACE_FENTRY_MULTI) is
     rejected when its btf_id is in noreturn_deny (do_exit,
     __x64_sys_exit, make_task_dead, etc.). Rejecting FENTRY_MULTI on
     noreturn functions is an unintended behavioral restriction.

Forward search in the git range shows commit bfcad202ce99 ("bpf: Add
support for tracing_multi link session") rewrites this hunk to:

    if ((prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI ||
         prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) &&
         btf_id_set_contains(&noreturn_deny, btf_id))
        return -EINVAL;

The parenthesization + && in the later commit confirms this is a
precedence bug. Should this use && instead?

> +
> +	/* Check and get function target data. */
> +	t = btf_type_by_id(btf, btf_id);
> +	if (!t)
> +		return -EINVAL;
> +	tname = btf_name_by_offset(btf, t->name_off);
> +	if (!tname)
> +		return -EINVAL;
> +	if (!btf_type_is_func(t))
> +		return -EINVAL;
> +	t = btf_type_by_id(btf, t->type);
> +	if (!btf_type_is_func_proto(t))
> +		return -EINVAL;
> +	err = btf_distill_func_proto(NULL, btf, t, tname, &tgt_info->fmodel);
> +	if (err < 0)
> +		return err;
> +	if (btf_is_module(btf)) {
> +		/* The bpf program already holds reference to module. */
> +		if (WARN_ON_ONCE(!prog->aux->mod))
> +			return -EINVAL;
> +		addr = find_kallsyms_symbol_value(prog->aux->mod, tname);
> +	} else {
> +		addr = kallsyms_lookup_name(tname);
> +	}
> +	if (!addr || !ftrace_location(addr))
> +		return -ENOENT;
> +
> +	/* Check sleepable program attachment. */
> +	if (prog->sleepable) {
> +		err = btf_id_allow_sleepable(btf_id, addr, prog, btf);
> +		if (err)
> +			return err;
> +	}
> +	tgt_info->tgt_addr = addr;
> +	return 0;
> +}
> +
>  struct btf *bpf_get_btf_vmlinux(void)
>  {
>  	if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/24598000047

^ permalink raw reply

* Re: [PATCHv5 bpf-next 08/28] bpf: Add struct bpf_tramp_node object
From: bot+bpf-ci @ 2026-04-18  6:10 UTC (permalink / raw)
  To: jolsa, ast, daniel, andrii
  Cc: hengqi.chen, bpf, linux-trace-kernel, kafai, eddyz87,
	songliubraving, yhs, menglong8.dong, rostedt, ast, andrii, daniel,
	martin.lau, eddyz87, yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260417192502.194548-9-jolsa@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 3917 bytes --]

> diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
> index 50103b3794fb..b97d9f68fc40 100644
> --- a/arch/powerpc/net/bpf_jit_comp.c
> +++ b/arch/powerpc/net/bpf_jit_comp.c

[ ... ]

> @@ -827,14 +827,14 @@ static void bpf_trampoline_restore_args_stack(u32 *image, struct codegen_context
>  static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_image,
>  					 void *rw_image_end, void *ro_image,
>  					 const struct btf_func_model *m, u32 flags,
> -					 struct bpf_tramp_links *tlinks,
> +					 struct bpf_tramp_nodes *tnodes,
>  					 void *func_addr)
>  {
>  	int regs_off, func_meta_off, ip_off, run_ctx_off, retval_off;
>  	int nvr_off, alt_lr_off, r4_off = 0;
> -	struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
> -	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
> -	struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
> +	struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN];
> +	struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY];
> +	struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT];
>  	int i, ret, nr_regs, retaddr_off, bpf_frame_size = 0;
>  	struct codegen_context codegen_ctx, *ctx;
>  	int cookie_off, cookie_cnt, cookie_ctx_off;
> -	int fsession_cnt = bpf_fsession_cnt(tlinks);
> +	int fsession_cnt = bpf_fsession_cnt(tnodes);
                                            ^^^^^

This line wasn't actually updated in the diff. The parameter was renamed from
tlinks to tnodes, but this call site still references the old name. Does this
compile?

The same issue appears later in the function:

>  	cookie_off = bpf_frame_size;
> -	cookie_cnt = bpf_fsession_cookie_cnt(tlinks);
> +	cookie_cnt = bpf_fsession_cookie_cnt(tnodes);
                                              ^^^^^

This line also wasn't updated and still references tlinks.

> @@ -1050,21 +1050,21 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
>
>  	cookie_ctx_off = (regs_off - cookie_off) / 8;
>
> -	for (i = 0; i < fentry->nr_links; i++) {
> -		if (bpf_prog_calls_session_cookie(fentry->links[i])) {
> +	for (i = 0; i < fentry->nr_nodes; i++)
> +		if (bpf_prog_calls_session_cookie(fentry->nodes[i])) {
>  			u64 meta = func_meta | (cookie_ctx_off << BPF_TRAMP_COOKIE_INDEX_SHIFT);
>
>  			store_func_meta(image, ctx, meta, func_meta_off);
>  			cookie_ctx_off--;
>  		}
>
> -		if (invoke_bpf_prog(image, ro_image, ctx, fentry->links[i], regs_off, retval_off,
> +		if (invoke_bpf_prog(image, ro_image, ctx, fentry->nodes[i], regs_off, retval_off,
>  				    run_ctx_off, flags & BPF_TRAMP_F_RET_FENTRY_RET))
>  			return -EINVAL;
>  	}

The opening brace after the for-loop was removed but the closing brace at the
end remains. This creates two problems:

First, the for-loop now only controls the inner if statement checking for
session cookies. The invoke_bpf_prog call is no longer inside the loop and
will only execute once (with whatever value i holds after the loop exits),
instead of being called for each fentry node.

Second, the unmatched closing brace at the end creates a syntax error.

The code structure should be:

    for (i = 0; i < fentry->nr_nodes; i++) {
        if (bpf_prog_calls_session_cookie(fentry->nodes[i])) {
            u64 meta = func_meta | (cookie_ctx_off << BPF_TRAMP_COOKIE_INDEX_SHIFT);

            store_func_meta(image, ctx, meta, func_meta_off);
            cookie_ctx_off--;
        }

        if (invoke_bpf_prog(image, ro_image, ctx, fentry->nodes[i], regs_off, retval_off,
                    run_ctx_off, flags & BPF_TRAMP_F_RET_FENTRY_RET))
            return -EINVAL;
    }

[ ... ]

---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/24598000047

^ permalink raw reply

* Re: [PATCHv5 bpf-next 01/28] ftrace: Add ftrace_hash_count function
From: bot+bpf-ci @ 2026-04-18  6:10 UTC (permalink / raw)
  To: jolsa, ast, daniel, andrii
  Cc: bpf, linux-trace-kernel, kafai, eddyz87, songliubraving, yhs,
	menglong8.dong, rostedt, ast, andrii, daniel, martin.lau, eddyz87,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260417192502.194548-2-jolsa@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 2302 bytes --]

> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> index c242fe49af4c..401f8dfd05d3 100644
> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -415,6 +415,7 @@ struct ftrace_hash *alloc_ftrace_hash(int size_bits);
>  void free_ftrace_hash(struct ftrace_hash *hash);
>  struct ftrace_func_entry *add_ftrace_hash_entry_direct(struct ftrace_hash *hash,
>  						       unsigned long ip, unsigned long direct);
> +unsigned long ftrace_hash_count(struct ftrace_hash *hash);
>
>  /* The hash used to know what functions callbacks trace */
>  struct ftrace_ops_hash {

[ ... ]

> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
> index 413310912609..68a071e80f32 100644
> --- a/kernel/trace/ftrace.c
> +++ b/kernel/trace/ftrace.c
> @@ -6288,11 +6288,16 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
>  }
>  EXPORT_SYMBOL_GPL(modify_ftrace_direct);
>
> -static unsigned long hash_count(struct ftrace_hash *hash)
> +static inline unsigned long hash_count(struct ftrace_hash *hash)
>  {
>  	return hash ? hash->count : 0;
>  }
>
> +unsigned long ftrace_hash_count(struct ftrace_hash *hash)
> +{
> +	return hash_count(hash);
> +}
                     ^^^^

Will this cause a linker error on some kernel configurations?

The function ftrace_hash_count() is declared in include/linux/ftrace.h under
CONFIG_DYNAMIC_FTRACE, but this definition appears in kernel/trace/ftrace.c
inside a section guarded by CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS (which
extends from around modify_ftrace_direct() to several hundred lines below).

On architectures where CONFIG_DYNAMIC_FTRACE=y but
CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=n, code that calls
ftrace_hash_count() would compile successfully but fail to link since no
definition would be present in the built kernel.

This was raised by sashiko-bot@kernel.org on the v5 review:
https://lore.kernel.org/all/20260418005821.4F156C19425@smtp.kernel.org/

> +
>  /**
>   * hash_add - adds two struct ftrace_hash and returns the result
>   * @a: struct ftrace_hash object


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/24598000047

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox