Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [PATCH 7.2 v16 08/13] mm/khugepaged: improve tracepoints for mTHP orders
From: Nico Pache @ 2026-04-19 18:57 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, npache,
	peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-1-npache@redhat.com>

Add the order to the mm_collapse_huge_page<_swapin,_isolate> tracepoints to
give better insight into which order is being operated on.

Reviewed-by: Lorenzo Stoakes <ljs@kernel.org> 
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Nico Pache <npache@redhat.com>
---
 include/trace/events/huge_memory.h | 34 +++++++++++++++++++-----------
 mm/khugepaged.c                    |  9 ++++----
 2 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index bcdc57eea270..291fae364c62 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -89,40 +89,44 @@ TRACE_EVENT(mm_khugepaged_scan_pmd,
 
 TRACE_EVENT(mm_collapse_huge_page,
 
-	TP_PROTO(struct mm_struct *mm, int isolated, int status),
+	TP_PROTO(struct mm_struct *mm, int isolated, int status, unsigned int order),
 
-	TP_ARGS(mm, isolated, status),
+	TP_ARGS(mm, isolated, status, order),
 
 	TP_STRUCT__entry(
 		__field(struct mm_struct *, mm)
 		__field(int, isolated)
 		__field(int, status)
+		__field(unsigned int, order)
 	),
 
 	TP_fast_assign(
 		__entry->mm = mm;
 		__entry->isolated = isolated;
 		__entry->status = status;
+		__entry->order = order;
 	),
 
-	TP_printk("mm=%p, isolated=%d, status=%s",
+	TP_printk("mm=%p, isolated=%d, status=%s, order=%u",
 		__entry->mm,
 		__entry->isolated,
-		__print_symbolic(__entry->status, SCAN_STATUS))
+		__print_symbolic(__entry->status, SCAN_STATUS),
+		__entry->order)
 );
 
 TRACE_EVENT(mm_collapse_huge_page_isolate,
 
 	TP_PROTO(struct folio *folio, int none_or_zero,
-		 int referenced, int status),
+		 int referenced, int status, unsigned int order),
 
-	TP_ARGS(folio, none_or_zero, referenced, status),
+	TP_ARGS(folio, none_or_zero, referenced, status, order),
 
 	TP_STRUCT__entry(
 		__field(unsigned long, pfn)
 		__field(int, none_or_zero)
 		__field(int, referenced)
 		__field(int, status)
+		__field(unsigned int, order)
 	),
 
 	TP_fast_assign(
@@ -130,26 +134,30 @@ TRACE_EVENT(mm_collapse_huge_page_isolate,
 		__entry->none_or_zero = none_or_zero;
 		__entry->referenced = referenced;
 		__entry->status = status;
+		__entry->order = order;
 	),
 
-	TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s",
+	TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s, order=%u",
 		__entry->pfn,
 		__entry->none_or_zero,
 		__entry->referenced,
-		__print_symbolic(__entry->status, SCAN_STATUS))
+		__print_symbolic(__entry->status, SCAN_STATUS),
+		__entry->order)
 );
 
 TRACE_EVENT(mm_collapse_huge_page_swapin,
 
-	TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret),
+	TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret,
+		 unsigned int order),
 
-	TP_ARGS(mm, swapped_in, referenced, ret),
+	TP_ARGS(mm, swapped_in, referenced, ret, order),
 
 	TP_STRUCT__entry(
 		__field(struct mm_struct *, mm)
 		__field(int, swapped_in)
 		__field(int, referenced)
 		__field(int, ret)
+		__field(unsigned int, order)
 	),
 
 	TP_fast_assign(
@@ -157,13 +165,15 @@ TRACE_EVENT(mm_collapse_huge_page_swapin,
 		__entry->swapped_in = swapped_in;
 		__entry->referenced = referenced;
 		__entry->ret = ret;
+		__entry->order = order;
 	),
 
-	TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d",
+	TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d, order=%u",
 		__entry->mm,
 		__entry->swapped_in,
 		__entry->referenced,
-		__entry->ret)
+		__entry->ret,
+		__entry->order)
 );
 
 TRACE_EVENT(mm_khugepaged_scan_file,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0a1c7cc20c0e..a4f1c570b69b 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -780,13 +780,13 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
 	} else {
 		result = SCAN_SUCCEED;
 		trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
-						    referenced, result);
+						    referenced, result, order);
 		return result;
 	}
 out:
 	release_pte_pages(pte, _pte, compound_pagelist);
 	trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
-					    referenced, result);
+					    referenced, result, order);
 	return result;
 }
 
@@ -1180,7 +1180,8 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
 
 	result = SCAN_SUCCEED;
 out:
-	trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
+	trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result,
+					   order);
 	return result;
 }
 
@@ -1376,7 +1377,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long s
 out_nolock:
 	if (folio)
 		folio_put(folio);
-	trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
+	trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result, order);
 	return result;
 }
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH 7.2 v16 09/13] mm/khugepaged: introduce collapse_allowable_orders helper function
From: Nico Pache @ 2026-04-19 18:57 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, npache,
	peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-1-npache@redhat.com>

Add collapse_allowable_orders() to generalize THP order eligibility. The
function determines which THP orders are permitted based on collapse
context (khugepaged vs madv_collapse).

This consolidates collapse configuration logic and provides a clean
interface for future mTHP collapse support where the orders may be
different.

Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
 include/linux/khugepaged.h        |  6 ++----
 mm/huge_memory.c                  |  2 +-
 mm/khugepaged.c                   | 20 ++++++++++++++------
 mm/vma.c                          |  6 +++---
 tools/testing/vma/include/stubs.h |  3 +--
 5 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index d7a9053ff4fe..e87df2fa6931 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -13,8 +13,7 @@ extern void khugepaged_destroy(void);
 extern int start_stop_khugepaged(void);
 extern void __khugepaged_enter(struct mm_struct *mm);
 extern void __khugepaged_exit(struct mm_struct *mm);
-extern void khugepaged_enter_vma(struct vm_area_struct *vma,
-				 vm_flags_t vm_flags);
+extern void khugepaged_enter_vma(struct vm_area_struct *vma);
 extern void khugepaged_min_free_kbytes_update(void);
 extern bool current_is_khugepaged(void);
 void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
@@ -38,8 +37,7 @@ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm
 static inline void khugepaged_exit(struct mm_struct *mm)
 {
 }
-static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
-					vm_flags_t vm_flags)
+static inline void khugepaged_enter_vma(struct vm_area_struct *vma)
 {
 }
 static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5c128cdec810..1023698a8b96 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1557,7 +1557,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 	ret = vmf_anon_prepare(vmf);
 	if (ret)
 		return ret;
-	khugepaged_enter_vma(vma, vma->vm_flags);
+	khugepaged_enter_vma(vma);
 
 	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
 			!mm_forbids_zeropage(vma->vm_mm) &&
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a4f1c570b69b..fdbdc1a1cdd9 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -447,7 +447,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
 		 * register it here without waiting a page fault that
 		 * may not happen any time soon.
 		 */
-		khugepaged_enter_vma(vma, *vm_flags);
+		khugepaged_enter_vma(vma);
 		break;
 	case MADV_NOHUGEPAGE:
 		*vm_flags &= ~VM_HUGEPAGE;
@@ -546,12 +546,20 @@ void __khugepaged_enter(struct mm_struct *mm)
 		wake_up_interruptible(&khugepaged_wait);
 }
 
-void khugepaged_enter_vma(struct vm_area_struct *vma,
-			  vm_flags_t vm_flags)
+/* Check what orders are allowed based on the vma and collapse type */
+static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
+		enum tva_type tva_flags)
+{
+	unsigned long orders = BIT(HPAGE_PMD_ORDER);
+
+	return thp_vma_allowable_orders(vma, vma->vm_flags, tva_flags, orders);
+}
+
+void khugepaged_enter_vma(struct vm_area_struct *vma)
 {
 	if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
 	    hugepage_pmd_enabled()) {
-		if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
+		if (collapse_allowable_orders(vma, TVA_KHUGEPAGED))
 			__khugepaged_enter(vma->vm_mm);
 	}
 }
@@ -2664,7 +2672,7 @@ static void collapse_scan_mm_slot(unsigned int progress_max,
 			cc->progress++;
 			break;
 		}
-		if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
+		if (!collapse_allowable_orders(vma, TVA_KHUGEPAGED)) {
 			cc->progress++;
 			continue;
 		}
@@ -2973,7 +2981,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
 	BUG_ON(vma->vm_start > start);
 	BUG_ON(vma->vm_end < end);
 
-	if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
+	if (!collapse_allowable_orders(vma, TVA_FORCED_COLLAPSE))
 		return -EINVAL;
 
 	cc = kmalloc_obj(*cc);
diff --git a/mm/vma.c b/mm/vma.c
index 377321b48734..c0398fb597b3 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -989,7 +989,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
 		goto abort;
 
 	vma_set_flags_mask(vmg->target, sticky_flags);
-	khugepaged_enter_vma(vmg->target, vmg->vm_flags);
+	khugepaged_enter_vma(vmg->target);
 	vmg->state = VMA_MERGE_SUCCESS;
 	return vmg->target;
 
@@ -1110,7 +1110,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
 	 * following VMA if we have VMAs on both sides.
 	 */
 	if (vmg->target && !vma_expand(vmg)) {
-		khugepaged_enter_vma(vmg->target, vmg->vm_flags);
+		khugepaged_enter_vma(vmg->target);
 		vmg->state = VMA_MERGE_SUCCESS;
 		return vmg->target;
 	}
@@ -2589,7 +2589,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap,
 	 * call covers the non-merge case.
 	 */
 	if (!vma_is_anonymous(vma))
-		khugepaged_enter_vma(vma, map->vm_flags);
+		khugepaged_enter_vma(vma);
 	*vmap = vma;
 	return 0;
 
diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h
index a30b8bc84955..3d9a2daa2712 100644
--- a/tools/testing/vma/include/stubs.h
+++ b/tools/testing/vma/include/stubs.h
@@ -182,8 +182,7 @@ static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	return true;
 }
 
-static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
-			  vm_flags_t vm_flags)
+static inline void khugepaged_enter_vma(struct vm_area_struct *vma)
 {
 }
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH 7.2 v16 10/13] mm/khugepaged: Introduce mTHP collapse support
From: Nico Pache @ 2026-04-19 18:57 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, npache,
	peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-1-npache@redhat.com>

Enable khugepaged to collapse to mTHP orders. This patch implements the
main scanning logic using a bitmap to track occupied pages and a stack
structure that allows us to find optimal collapse sizes.

Prior to this patch, PMD collapse had 3 main phases: a lightweight
scanning phase (mmap_read_lock) that determines a potential PMD
collapse, an alloc phase (mmap unlocked), and finally a heavier collapse
phase (mmap_write_lock).

To enable mTHP collapse, we make the following changes:

During PMD scan phase, track occupied pages in a bitmap. When mTHP
orders are enabled, we remove the restriction of max_ptes_none during the
scan phase to avoid missing potential mTHP collapse candidates. Once we
have scanned the full PMD range and updated the bitmap to track occupied
pages, we use the bitmap to find the optimal mTHP size.

Implement mthp_collapse() to perform binary recursion on the bitmap
and determine the best eligible order for the collapse. An explicit stack
structure is used instead of traditional recursion to manage the search,
which avoids recursive function calls given that kernel stack space is
limited. The algorithm repeatedly splits the bitmap into smaller chunks to
find the highest order mTHPs that satisfy the collapse criteria. We start
by attempting the PMD order, then move on to consecutively lower orders
(mTHP collapse). Each stack entry holds a pair of variables (offset, order),
indicating the number of PTEs from the start of the PMD and the order of
the potential collapse candidate, respectively.

The algorithm for consuming the bitmap works as such:
    1) push (0, HPAGE_PMD_ORDER) onto the stack
    2) pop the stack
    3) check if the number of set bits in that (offset,order) pair
       satisfies the max_ptes_none threshold for that order
    4) if yes, attempt collapse
    5) if no (or collapse fails), push two new stack items representing
       the left and right halves of the current bitmap range, at the
       next lower order
    6) repeat at step (2) until stack is empty.

Below is a diagram representing the algorithm and stack items:

                            offset   mid_offset
                            |        |
                            |        |
                            v        v
          ____________________________________
         |          PTE Page Table            |
         --------------------------------------
			    <-------><------->
                             order-1  order-1

mTHP collapses reject regions containing swapped out or shared pages.
This is because adding new entries can lead to new none pages, and these
may lead to constant promotion into a higher order mTHP. A similar
issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
introducing at least 2x the number of pages, and on a future scan will
satisfy the promotion condition once again. This issue is prevented via
the collapse_max_ptes_none() function which imposes the max_ptes_none
restrictions above.

We currently only support mTHP collapse for max_ptes_none values of 0
and HPAGE_PMD_NR - 1, resulting in the following behavior:

    - max_ptes_none=0: Never introduce new empty pages during collapse
    - max_ptes_none=HPAGE_PMD_NR-1: Always try collapse to the highest
      available mTHP order

Any other max_ptes_none value will emit a warning and skip mTHP collapse
attempts. There should be no behavior change for PMD collapse.

Once we determine which mTHP sizes fit best in that PMD range, a collapse
is attempted. A minimum collapse order of 2 is used as this is the lowest
order supported by anon memory as defined by THP_ORDERS_ALL_ANON.

Currently madv_collapse is not supported and will only attempt PMD
collapse.

We can also remove the check for is_khugepaged inside the PMD scan as
the collapse_max_ptes_none() function handles this logic now.

Signed-off-by: Nico Pache <npache@redhat.com>
---
 mm/khugepaged.c | 181 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 174 insertions(+), 7 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index fdbdc1a1cdd9..81ea7cbc54b2 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -99,6 +99,31 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
 
 static struct kmem_cache *mm_slot_cache __ro_after_init;
 
+#define KHUGEPAGED_MIN_MTHP_ORDER	2
+/*
+ * The maximum number of mTHP ranges that can be stored on the stack.
+ * This is calculated based on the number of PTE entries in a PTE page table
+ * and the minimum mTHP order.
+ *
+ * ilog2 is needed in place of HPAGE_PMD_ORDER due to some architectures
+ * (ie ppc64le) not defining HPAGE_PMD_ORDER until after build time.
+ *
+ * At most there will be 1 << (PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER) mTHP ranges
+ */
+#define MTHP_STACK_SIZE	(1UL << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
+
+/*
+ * Defines a range of PTE entries in a PTE page table which are being
+ * considered for mTHP collapse.
+ *
+ * @offset: the offset of the first PTE entry in a PMD range.
+ * @order: the order of the PTE entries being considered for collapse.
+ */
+struct mthp_range {
+	u16 offset;
+	u8 order;
+};
+
 struct collapse_control {
 	bool is_khugepaged;
 
@@ -110,6 +135,12 @@ struct collapse_control {
 
 	/* nodemask for allocation fallback */
 	nodemask_t alloc_nmask;
+
+	/* Each bit represents a single occupied (!none/zero) page. */
+	DECLARE_BITMAP(mthp_bitmap, MAX_PTRS_PER_PTE);
+	/* A mask of the current range being considered for mTHP collapse. */
+	DECLARE_BITMAP(mthp_bitmap_mask, MAX_PTRS_PER_PTE);
+	struct mthp_range mthp_bitmap_stack[MTHP_STACK_SIZE];
 };
 
 /**
@@ -1389,22 +1420,142 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long s
 	return result;
 }
 
+static void collapse_mthp_stack_push(struct collapse_control *cc, int *stack_size,
+				     u16 offset, u8 order)
+{
+	const int size = *stack_size;
+	struct mthp_range *stack = &cc->mthp_bitmap_stack[size];
+
+	VM_WARN_ON_ONCE(size >= MTHP_STACK_SIZE);
+	stack->order = order;
+	stack->offset = offset;
+	(*stack_size)++;
+}
+
+static struct mthp_range collapse_mthp_stack_pop(struct collapse_control *cc,
+						 int *stack_size)
+{
+	const int size = *stack_size;
+
+	VM_WARN_ON_ONCE(size <= 0);
+	(*stack_size)--;
+	return cc->mthp_bitmap_stack[size - 1];
+}
+
+static unsigned int collapse_mthp_count_present(struct collapse_control *cc,
+						u16 offset, unsigned int nr_ptes)
+{
+	bitmap_zero(cc->mthp_bitmap_mask, MAX_PTRS_PER_PTE);
+	bitmap_set(cc->mthp_bitmap_mask, offset, nr_ptes);
+	return bitmap_weight_and(cc->mthp_bitmap, cc->mthp_bitmap_mask, MAX_PTRS_PER_PTE);
+}
+
+/*
+ * mthp_collapse() consumes the bitmap that is generated during
+ * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
+ *
+ * Each bit in cc->mthp_bitmap represents a single occupied (!none/zero) page.
+ * A stack structure cc->mthp_bitmap_stack is used to check different regions
+ * of the bitmap for collapse eligibility. The stack maintains a pair of
+ * variables (offset, order), indicating the number of PTEs from the start of
+ * the PMD, and the order of the potential collapse candidate respectively. We
+ * start at the PMD order and check if it is eligible for collapse; if not, we
+ * add two entries to the stack at a lower order to represent the left and right
+ * halves of the PTE page table we are examining.
+ *
+ *                         offset       mid_offset
+ *                         |         |
+ *                         |         |
+ *                         v         v
+ *      --------------------------------------
+ *      |          cc->mthp_bitmap            |
+ *      --------------------------------------
+ *                         <-------><------->
+ *                          order-1  order-1
+ *
+ * For each of these, we determine how many PTE entries are occupied in the
+ * range of PTE entries we propose to collapse, then we compare this to a
+ * threshold number of PTE entries which would need to be occupied for a
+ * collapse to be permitted at that order (accounting for max_ptes_none).
+ *
+ * If a collapse is permitted, we attempt to collapse the PTE range into a
+ * mTHP.
+ */
+static int mthp_collapse(struct mm_struct *mm, unsigned long address,
+		int referenced, int unmapped, struct collapse_control *cc,
+		unsigned long enabled_orders)
+{
+	unsigned int nr_occupied_ptes, nr_ptes;
+	int max_ptes_none, collapsed = 0, stack_size = 0;
+	unsigned long collapse_address;
+	struct mthp_range range;
+	u16 offset;
+	u8 order;
+
+	collapse_mthp_stack_push(cc, &stack_size, 0, HPAGE_PMD_ORDER);
+
+	while (stack_size) {
+		range = collapse_mthp_stack_pop(cc, &stack_size);
+		order = range.order;
+		offset = range.offset;
+		nr_ptes = 1UL << order;
+
+		if (!test_bit(order, &enabled_orders))
+			goto next_order;
+
+		max_ptes_none = collapse_max_ptes_none(cc, NULL, order);
+
+		if (max_ptes_none < 0)
+			return collapsed;
+
+		nr_occupied_ptes = collapse_mthp_count_present(cc, offset,
+							       nr_ptes);
+
+		if (nr_occupied_ptes >= nr_ptes - max_ptes_none) {
+			int ret;
+
+			collapse_address = address + offset * PAGE_SIZE;
+			ret = collapse_huge_page(mm, collapse_address, referenced,
+						 unmapped, cc, order);
+			if (ret == SCAN_SUCCEED) {
+				collapsed += nr_ptes;
+				continue;
+			}
+		}
+
+next_order:
+		if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
+			const u8 next_order = order - 1;
+			const u16 mid_offset = offset + (nr_ptes / 2);
+
+			collapse_mthp_stack_push(cc, &stack_size, mid_offset,
+						 next_order);
+			collapse_mthp_stack_push(cc, &stack_size, offset,
+						 next_order);
+		}
+	}
+	return collapsed;
+}
+
 static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long start_addr,
 		bool *lock_dropped, struct collapse_control *cc)
 {
 	pmd_t *pmd;
-	pte_t *pte, *_pte;
-	int none_or_zero = 0, shared = 0, referenced = 0;
+	pte_t *pte, *_pte, pteval;
+	int i;
+	int none_or_zero = 0, shared = 0, nr_collapsed = 0, referenced = 0;
 	enum scan_result result = SCAN_FAIL;
 	struct page *page = NULL;
 	struct folio *folio = NULL;
 	unsigned long addr;
+	unsigned long enabled_orders;
 	spinlock_t *ptl;
 	int node = NUMA_NO_NODE, unmapped = 0;
 	int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
 	unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, HPAGE_PMD_ORDER);
 	unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
+	enum tva_type tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
 
 	VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
 
@@ -1414,8 +1565,19 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
 		goto out;
 	}
 
+	bitmap_zero(cc->mthp_bitmap, MAX_PTRS_PER_PTE);
 	memset(cc->node_load, 0, sizeof(cc->node_load));
 	nodes_clear(cc->alloc_nmask);
+
+	enabled_orders = collapse_allowable_orders(vma, tva_flags);
+
+	/*
+	 * If PMD is the only enabled order, enforce max_ptes_none, otherwise
+	 * scan all pages to populate the bitmap for mTHP collapse.
+	 */
+	if (enabled_orders != BIT(HPAGE_PMD_ORDER))
+		max_ptes_none = KHUGEPAGED_MAX_PTES_LIMIT;
+
 	pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
 	if (!pte) {
 		cc->progress++;
@@ -1423,11 +1585,13 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
 		goto out;
 	}
 
-	for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
-	     _pte++, addr += PAGE_SIZE) {
+	for (i = 0; i < HPAGE_PMD_NR; i++) {
+		_pte = pte + i;
+		addr = start_addr + i * PAGE_SIZE;
+		pteval = ptep_get(_pte);
+
 		cc->progress++;
 
-		pte_t pteval = ptep_get(_pte);
 		if (pte_none_or_zero(pteval)) {
 			if (++none_or_zero > max_ptes_none) {
 				result = SCAN_EXCEED_NONE_PTE;
@@ -1507,6 +1671,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
 			}
 		}
 
+		/* Set bit for occupied pages */
+		__set_bit(i, cc->mthp_bitmap);
 		/*
 		 * Record which node the original page is from and save this
 		 * information to cc->node_load[].
@@ -1570,10 +1736,11 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
 		 * that. We will recheck the vma after taking it again in write mode.
 		 */
 		mmap_read_unlock(mm);
-		result = collapse_huge_page(mm, start_addr, referenced,
-					    unmapped, cc, HPAGE_PMD_ORDER);
+		nr_collapsed = mthp_collapse(mm, start_addr, referenced, unmapped,
+					      cc, enabled_orders);
 		/* collapse_huge_page will return with the mmap_lock released */
 		*lock_dropped = true;
+		result = nr_collapsed ? SCAN_SUCCEED : SCAN_FAIL;
 	}
 out:
 	trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
-- 
2.53.0


^ permalink raw reply related

* [PATCH 7.2 v16 11/13] mm/khugepaged: avoid unnecessary mTHP collapse attempts
From: Nico Pache @ 2026-04-19 18:57 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, npache,
	peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe, Usama Arif
In-Reply-To: <20260419185750.260784-1-npache@redhat.com>

There are cases where, if an attempted collapse fails, all subsequent
orders are guaranteed to also fail. Avoid these collapse attempts by
bailing out early.

Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Acked-by: Usama Arif <usama.arif@linux.dev>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Nico Pache <npache@redhat.com>
---
 mm/khugepaged.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 81ea7cbc54b2..13b05bbb08e7 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1517,9 +1517,31 @@ static int mthp_collapse(struct mm_struct *mm, unsigned long address,
 			collapse_address = address + offset * PAGE_SIZE;
 			ret = collapse_huge_page(mm, collapse_address, referenced,
 						 unmapped, cc, order);
-			if (ret == SCAN_SUCCEED) {
+
+			switch (ret) {
+			/* Cases where we continue to next collapse candidate */
+			case SCAN_SUCCEED:
 				collapsed += nr_ptes;
+				fallthrough;
+			case SCAN_PTE_MAPPED_HUGEPAGE:
 				continue;
+			/* Cases where lower orders might still succeed */
+			case SCAN_LACK_REFERENCED_PAGE:
+			case SCAN_EXCEED_NONE_PTE:
+			case SCAN_EXCEED_SWAP_PTE:
+			case SCAN_EXCEED_SHARED_PTE:
+			case SCAN_PAGE_LOCK:
+			case SCAN_PAGE_COUNT:
+			case SCAN_PAGE_LRU:
+			case SCAN_PAGE_NULL:
+			case SCAN_DEL_PAGE_LRU:
+			case SCAN_PTE_NON_PRESENT:
+			case SCAN_PTE_UFFD_WP:
+			case SCAN_ALLOC_HUGE_PAGE_FAIL:
+				goto next_order;
+			/* Cases where no further collapse is possible */
+			default:
+				return collapsed;
 			}
 		}
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH 7.2 v16 12/13] mm/khugepaged: run khugepaged for all orders
From: Nico Pache @ 2026-04-19 18:57 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, npache,
	peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe, Usama Arif
In-Reply-To: <20260419185750.260784-1-npache@redhat.com>

From: Baolin Wang <baolin.wang@linux.alibaba.com>

If any order (m)THP is enabled we should allow running khugepaged to
attempt scanning and collapsing mTHPs. In order for khugepaged to operate
when only mTHP sizes are specified in sysfs, we must modify the predicate
function that determines whether it ought to run to do so.

This function is currently called hugepage_pmd_enabled(). This patch
renames it to hugepage_enabled() and updates the logic to determine
whether any valid orders exist which would justify khugepaged
running.

We must also update collapse_allowable_orders() to check all orders if
the vma is anonymous and the collapse is khugepaged.

After this patch khugepaged mTHP collapse is fully enabled.

Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Acked-by: Usama Arif <usama.arif@linux.dev>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
 mm/khugepaged.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 13b05bbb08e7..7d48d4fbd5f3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -524,23 +524,23 @@ static inline int collapse_test_exit_or_disable(struct mm_struct *mm)
 		mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
 }
 
-static bool hugepage_pmd_enabled(void)
+static bool hugepage_enabled(void)
 {
 	/*
 	 * We cover the anon, shmem and the file-backed case here; file-backed
 	 * hugepages, when configured in, are determined by the global control.
-	 * Anon pmd-sized hugepages are determined by the pmd-size control.
+	 * Anon hugepages are determined by its per-size mTHP control.
 	 * Shmem pmd-sized hugepages are also determined by its pmd-size control,
 	 * except when the global shmem_huge is set to SHMEM_HUGE_DENY.
 	 */
 	if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
 	    hugepage_global_enabled())
 		return true;
-	if (test_bit(PMD_ORDER, &huge_anon_orders_always))
+	if (READ_ONCE(huge_anon_orders_always))
 		return true;
-	if (test_bit(PMD_ORDER, &huge_anon_orders_madvise))
+	if (READ_ONCE(huge_anon_orders_madvise))
 		return true;
-	if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
+	if (READ_ONCE(huge_anon_orders_inherit) &&
 	    hugepage_global_enabled())
 		return true;
 	if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
@@ -581,7 +581,13 @@ void __khugepaged_enter(struct mm_struct *mm)
 static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
 		enum tva_type tva_flags)
 {
-	unsigned long orders = BIT(HPAGE_PMD_ORDER);
+	unsigned long orders;
+
+	/* If khugepaged is scanning an anonymous vma, allow mTHP collapse */
+	if ((tva_flags & TVA_KHUGEPAGED) && vma_is_anonymous(vma))
+		orders = THP_ORDERS_ALL_ANON;
+	else
+		orders = BIT(HPAGE_PMD_ORDER);
 
 	return thp_vma_allowable_orders(vma, vma->vm_flags, tva_flags, orders);
 }
@@ -589,7 +595,7 @@ static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
 void khugepaged_enter_vma(struct vm_area_struct *vma)
 {
 	if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
-	    hugepage_pmd_enabled()) {
+	    hugepage_enabled()) {
 		if (collapse_allowable_orders(vma, TVA_KHUGEPAGED))
 			__khugepaged_enter(vma->vm_mm);
 	}
@@ -2936,7 +2942,7 @@ static void collapse_scan_mm_slot(unsigned int progress_max,
 
 static int khugepaged_has_work(void)
 {
-	return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled();
+	return !list_empty(&khugepaged_scan.mm_head) && hugepage_enabled();
 }
 
 static int khugepaged_wait_event(void)
@@ -3009,7 +3015,7 @@ static void khugepaged_wait_work(void)
 		return;
 	}
 
-	if (hugepage_pmd_enabled())
+	if (hugepage_enabled())
 		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
 }
 
@@ -3040,7 +3046,7 @@ void set_recommended_min_free_kbytes(void)
 	int nr_zones = 0;
 	unsigned long recommended_min;
 
-	if (!hugepage_pmd_enabled()) {
+	if (!hugepage_enabled()) {
 		calculate_min_free_kbytes();
 		goto update_wmarks;
 	}
@@ -3090,7 +3096,7 @@ int start_stop_khugepaged(void)
 	int err = 0;
 
 	mutex_lock(&khugepaged_mutex);
-	if (hugepage_pmd_enabled()) {
+	if (hugepage_enabled()) {
 		if (!khugepaged_thread)
 			khugepaged_thread = kthread_run(khugepaged, NULL,
 							"khugepaged");
@@ -3116,7 +3122,7 @@ int start_stop_khugepaged(void)
 void khugepaged_min_free_kbytes_update(void)
 {
 	mutex_lock(&khugepaged_mutex);
-	if (hugepage_pmd_enabled() && khugepaged_thread)
+	if (hugepage_enabled() && khugepaged_thread)
 		set_recommended_min_free_kbytes();
 	mutex_unlock(&khugepaged_mutex);
 }
-- 
2.53.0


^ permalink raw reply related

* [PATCH 7.2 v16 13/13] Documentation: mm: update the admin guide for mTHP collapse
From: Nico Pache @ 2026-04-19 18:57 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, npache,
	peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe, Bagas Sanjaya
In-Reply-To: <20260419185750.260784-1-npache@redhat.com>

Now that we can collapse to mTHPs, let's update the admin guide to
reflect these changes and provide proper guidance on how to utilize it.

Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
 Documentation/admin-guide/mm/transhuge.rst | 49 +++++++++++++---------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index eebb1f6bbc6c..0ef13c451ac8 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -63,7 +63,8 @@ often.
 THP can be enabled system wide or restricted to certain tasks or even
 memory ranges inside task's address space. Unless THP is completely
 disabled, there is ``khugepaged`` daemon that scans memory and
-collapses sequences of basic pages into PMD-sized huge pages.
+collapses sequences of basic pages into huge pages of either PMD size
+or mTHP sizes, if the system is configured to do so.
 
 The THP behaviour is controlled via :ref:`sysfs <thp_sysfs>`
 interface and using madvise(2) and prctl(2) system calls.
@@ -219,10 +220,10 @@ this behaviour by writing 0 to shrink_underused, and enable it by writing
 	echo 0 > /sys/kernel/mm/transparent_hugepage/shrink_underused
 	echo 1 > /sys/kernel/mm/transparent_hugepage/shrink_underused
 
-khugepaged will be automatically started when PMD-sized THP is enabled
+khugepaged will be automatically started when any THP size is enabled
 (either of the per-size anon control or the top-level control are set
 to "always" or "madvise"), and it'll be automatically shutdown when
-PMD-sized THP is disabled (when both the per-size anon control and the
+all THP sizes are disabled (when both the per-size anon control and the
 top-level control are "never")
 
 process THP controls
@@ -264,11 +265,6 @@ support the following arguments::
 Khugepaged controls
 -------------------
 
-.. note::
-   khugepaged currently only searches for opportunities to collapse to
-   PMD-sized THP and no attempt is made to collapse to other THP
-   sizes.
-
 khugepaged runs usually at low frequency so while one may not want to
 invoke defrag algorithms synchronously during the page faults, it
 should be worth invoking defrag at least in khugepaged. However it's
@@ -296,11 +292,11 @@ allocation failure to throttle the next allocation attempt::
 The khugepaged progress can be seen in the number of pages collapsed (note
 that this counter may not be an exact count of the number of pages
 collapsed, since "collapsed" could mean multiple things: (1) A PTE mapping
-being replaced by a PMD mapping, or (2) All 4K physical pages replaced by
-one 2M hugepage. Each may happen independently, or together, depending on
-the type of memory and the failures that occur. As such, this value should
-be interpreted roughly as a sign of progress, and counters in /proc/vmstat
-consulted for more accurate accounting)::
+being replaced by a PMD mapping, or (2) physical pages replaced by one
+hugepage of various sizes (PMD-sized or mTHP). Each may happen independently,
+or together, depending on the type of memory and the failures that occur.
+As such, this value should be interpreted roughly as a sign of progress,
+and counters in /proc/vmstat consulted for more accurate accounting)::
 
 	/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
 
@@ -308,16 +304,20 @@ for each pass::
 
 	/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
 
-``max_ptes_none`` specifies how many extra small pages (that are
-not already mapped) can be allocated when collapsing a group
-of small pages into one large page::
+``max_ptes_none`` specifies how many empty (none/zero) pages are allowed
+when collapsing a group of small pages into one large page::
 
 	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
 
-A higher value leads to use additional memory for programs.
-A lower value leads to gain less thp performance. Value of
-max_ptes_none can waste cpu time very little, you can
-ignore it.
+For PMD-sized THP collapse, this directly limits the number of empty pages
+allowed in the 2MB region.
+
+For mTHP collapse, only 0 or (HPAGE_PMD_NR - 1) are supported. Any other value
+will emit a warning and no mTHP collapse will be attempted.
+
+A higher value allows more empty pages, potentially leading to more memory
+usage but better THP performance. A lower value is more conservative and
+may result in fewer THP collapses.
 
 ``max_ptes_swap`` specifies how many pages can be brought in from
 swap when collapsing a group of pages into a transparent huge page::
@@ -337,6 +337,15 @@ that THP is shared. Exceeding the number would block the collapse::
 
 A higher value may increase memory footprint for some workloads.
 
+.. note::
+   For mTHP collapse, khugepaged does not support collapsing regions that
+   contain shared or swapped out pages, as this could lead to continuous
+   promotion to higher orders. The collapse will fail if any shared or
+   swapped PTEs are encountered during the scan.
+
+   Currently, madvise_collapse only supports collapsing to PMD-sized THPs
+   and does not attempt mTHP collapses.
+
 Boot parameters
 ===============
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH] Documentation: fix spelling mistake "stucture" -> "structure"
From: Ninad Naik @ 2026-04-19 18:45 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, corbet, skhan
  Cc: linux-trace-kernel, linux-doc, linux-kernel, me,
	linux-kernel-mentees, Ninad Naik

Fixing a spelling mistake in Documentation/trace/histogram-design.rst.

Signed-off-by: Ninad Naik <ninadnaik07@gmail.com>
---
 Documentation/trace/histogram-design.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/trace/histogram-design.rst b/Documentation/trace/histogram-design.rst
index e92f56ebd0b5..41a726cd3536 100644
--- a/Documentation/trace/histogram-design.rst
+++ b/Documentation/trace/histogram-design.rst
@@ -247,7 +247,7 @@ field's size and offset, is used to grab that subkey's data from the
 current trace record.
 
 Note, the hist field function use to be a function pointer in the
-hist_field stucture. Due to spectre mitigation, it was converted into
+hist_field structure. Due to spectre mitigation, it was converted into
 a fn_num and hist_fn_call() is used to call the associated hist field
 function that corresponds to the fn_num of the hist_field structure.
 
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH v9 7/8] selftests/ftrace: Add a testcase for fprobe events on module
From: Masami Hiramatsu @ 2026-04-20  1:42 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: Steven Rostedt, Menglong Dong, Mathieu Desnoyers, jiang.biao,
	linux-kernel, linux-trace-kernel
In-Reply-To: <177644271967.584467.9751522686479464647.stgit@mhiramat.tok.corp.google.com>

On Sat, 18 Apr 2026 01:18:39 +0900
"Masami Hiramatsu (Google)" <mhiramat@kernel.org> wrote:

> From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> 
> Add a testcase for fprobe events on module, which unloads a kernel
> module on which fprobe events are probing and ensure the ftrace
> hash map is cleared correctly.
> 
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> ---
>  Changes in v9:
>  - Use "trace-events-sample" instead of "trace_events_sample"
>  - Add checking unload module and remove core-kernel event case.
>  - Check test module exists when unloading it in EXIT.
>  Changes in v8:
>  - Newly added.
> ---
>  .../test.d/dynevent/add_remove_fprobe_module.tc    |   87 ++++++++++++++++++++
>  1 file changed, 87 insertions(+)
>  create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_module.tc
> 
> diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_module.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_module.tc
> new file mode 100644
> index 000000000000..c358c5071f15
> --- /dev/null
> +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_module.tc
> @@ -0,0 +1,87 @@
> +#!/bin/sh
> +# SPDX-License-Identifier: GPL-2.0
> +# description: Generic dynamic event - add/remove fprobe events on module
> +# requires: dynamic_events "f[:[<group>/][<event>]] <func-name>[%return] [<args>]":README enabled_functions
> +
> +rmmod trace-events-sample ||:
> +if ! modprobe trace-events-sample ; then
> +  echo "No trace-events sample module - please make CONFIG_SAMPLE_TRACE_EVENTS=m"
> +  exit_unresolved;
> +fi
> +trap "lsmod | grep -q trace-event-sample && rmmod trace-events-sample" EXIT

Oops, we need to check "trace_events_sample".

Thanks,

> +
> +echo 0 > events/enable
> +echo > dynamic_events
> +
> +FUNC1='foo_bar*'
> +FUNC2='vfs_read'
> +
> +:;: "Add an event on the test module" ;:
> +echo "f:test1 $FUNC1" >> dynamic_events
> +echo 1 > events/fprobes/test1/enable
> +
> +:;: "Ensure it is enabled" ;:
> +funcs=`cat enabled_functions | wc -l`
> +test $funcs -ne 0
> +
> +:;: "Check the enabled_functions is cleared on unloading" ;:
> +rmmod trace-events-sample
> +funcs=`cat enabled_functions | wc -l`
> +test $funcs -eq 0
> +
> +:;: "Check it is kept clean" ;:
> +modprobe trace-events-sample
> +echo 1 > events/fprobes/test1/enable || echo "OK"
> +funcs=`cat enabled_functions | wc -l`
> +test $funcs -eq 0
> +
> +:;: "Add another event not on the test module" ;:
> +echo "f:test2 $FUNC2" >> dynamic_events
> +echo 1 > events/fprobes/test2/enable
> +
> +:;: "Ensure it is enabled" ;:
> +ofuncs=`cat enabled_functions | wc -l`
> +test $ofuncs -ne 0
> +
> +:;: "Disable and remove the first event"
> +echo 0 > events/fprobes/test1/enable
> +echo "-:fprobes/test1" >> dynamic_events
> +funcs=`cat enabled_functions | wc -l`
> +test $ofuncs -eq $funcs
> +
> +:;: "Disable and remove other events" ;:
> +echo 0 > events/fprobes/enable
> +echo > dynamic_events
> +funcs=`cat enabled_functions | wc -l`
> +test $funcs -eq 0
> +
> +rmmod trace-events-sample
> +
> +:;: "Add events on kernel and test module" ;:
> +modprobe trace-events-sample
> +echo "f:test1 $FUNC1" >> dynamic_events
> +echo 1 > events/fprobes/test1/enable
> +echo "f:test2 $FUNC2" >> dynamic_events
> +echo 1 > events/fprobes/test2/enable
> +ofuncs=`cat enabled_functions | wc -l`
> +test $ofuncs -ne 0
> +
> +:;: "Unload module (ftrace entry should be removed)" ;:
> +rmmod trace-events-sample
> +funcs=`cat enabled_functions | wc -l`
> +test $funcs -ne 0
> +test $ofuncs -ne $funcs
> +
> +:;: "Disable and remove core-kernel fprobe event" ;:
> +echo 0 > events/fprobes/test2/enable
> +echo "-:fprobes/test2" >> dynamic_events
> +
> +:;: "Ensure ftrace is disabled." ;:
> +funcs=`cat enabled_functions | wc -l`
> +test $funcs -eq 0
> +
> +echo 0 > events/fprobes/enable
> +echo > dynamic_events
> +
> +trap "" EXIT
> +clear_trace
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH 0/3] mm: split the file's i_mmap tree for NUMA
From: Huang Shijie @ 2026-04-20  2:10 UTC (permalink / raw)
  To: Mateusz Guzik
  Cc: akpm, viro, brauner, linux-mm, linux-kernel, linux-arm-kernel,
	linux-fsdevel, muchun.song, osalvador, linux-trace-kernel,
	linux-perf-users, linux-parisc, nvdimm, zhongyuan, fangbaoshun,
	yingzhiwei
In-Reply-To: <76pfiwabdgsej6q2yxfh3efuqvsyg7mt7rvl5itzzjyhdrto5r@53viaxsackzv>

On Mon, Apr 13, 2026 at 05:33:21PM +0200, Mateusz Guzik wrote:
> On Mon, Apr 13, 2026 at 02:20:39PM +0800, Huang Shijie wrote:
> >   In NUMA, there are maybe many NUMA nodes and many CPUs.
> > For example, a Hygon's server has 12 NUMA nodes, and 384 CPUs.
> > In the UnixBench tests, there is a test "execl" which tests
> > the execve system call.
> > 
> >   When we test our server with "./Run -c 384 execl",
> > the test result is not good enough. The i_mmap locks contended heavily on
> > "libc.so" and "ld.so". For example, the i_mmap tree for "libc.so" can have 
> > over 6000 VMAs, all the VMAs can be in different NUMA nodes.
> > The insert/remove operations do not run quickly enough.
> > 
> > patch 1 & patch 2 try to hide the direct access of i_mmap.
> > patch 3 splits the i_mmap into sibling trees, and we can get better 
> > performance with this patch set:
> >     we can get 77% performance improvement(10 times average)
> > 
> 
> To my reading you kept the lock as-is and only distributed the protected
> state.
> 
> While I don't doubt the improvement, I'm confident should you take a
> look at the profile you are going to find this still does not scale with
> rwsem being one of the problems (there are other global locks, some of
> which have experimental patches for).
> 
> Apart from that this does nothing to help high core systems which are
> all one node, which imo puts another question mark on this specific
> proposal.
> 
> Of course one may question whether a RB tree is the right choice here,
> it may be the lock-protected cost can go way down with merely a better
> data structure.
> 
> Regardless of that, for actual scalability, there will be no way around
> decentralizing locking around this and partitioning per some core count
> (not just by numa awareness).
> 
> Decentralizing locking is definitely possible, but I have not looked
> into specifics of how problematic it is. Best case scenario it will
> merely work with separate locks. Worst case scenario something needs a fully
> stabilized state for traversal, in that case another rw lock can be
> slapped around this, creating locking order read lock -> per-subset
> write lock -- this will suffer scalability due to the read locking, but
> it will still scale drastically better as apart from that there will be
> no serialization. In this setting the problematic consumer will write
> lock the new thing to stabilize the state.
> 
I thought over again.
I can change this patch set to support the non-NUMA case by:
  1.) Still use one rw lock.
  2.) For NUMA, keep the patch set as it is.
  3.) For non-NUMA case, split the i_mmap tree to several subtrees.
      For example, if a machine has 192 CPUs, split the 32 CPUs as a tree.

So extend the patch set to support both the NUMA and non-NUMA machines.

Thanks
Huang Shijie


^ permalink raw reply

* Re: [RFC PATCH 2/2] kernel/module: Decouple klp and ftrace from load_module
From: Masami Hiramatsu @ 2026-04-20  2:27 UTC (permalink / raw)
  To: Petr Mladek
  Cc: Petr Pavlu, Song Chen, rafael, lenb, mturquette, sboyd,
	viresh.kumar, agk, snitzer, mpatocka, bmarzins, song, yukuai,
	linan122, jason.wessel, danielt, dianders, horms, davem, edumazet,
	kuba, pabeni, paulmck, frederic, mcgrof, da.gomez, samitolvanen,
	atomlin, jpoimboe, jikos, mbenes, joe.lawrence, rostedt, mhiramat,
	mark.rutland, mathieu.desnoyers, linux-modules, linux-kernel,
	linux-trace-kernel, linux-acpi, linux-clk, linux-pm,
	live-patching, dm-devel, linux-raid, kgdb-bugreport, netdev
In-Reply-To: <aeD2_FrFL6E3dbAC@pathway.suse.cz>

On Thu, 16 Apr 2026 16:49:32 +0200
Petr Mladek <pmladek@suse.com> wrote:

> On Thu 2026-04-16 13:18:30, Petr Pavlu wrote:
> > On 4/15/26 8:43 AM, Song Chen wrote:
> > > On 4/14/26 22:33, Petr Pavlu wrote:
> > >> On 4/13/26 10:07 AM, chensong_2000@189.cn wrote:
> > >>> diff --git a/include/linux/module.h b/include/linux/module.h
> > >>> index 14f391b186c6..0bdd56f9defd 100644
> > >>> --- a/include/linux/module.h
> > >>> +++ b/include/linux/module.h
> > >>> @@ -308,6 +308,14 @@ enum module_state {
> > >>>       MODULE_STATE_COMING,    /* Full formed, running module_init. */
> > >>>       MODULE_STATE_GOING,    /* Going away. */
> > >>>       MODULE_STATE_UNFORMED,    /* Still setting it up. */
> > >>> +    MODULE_STATE_FORMED,
> > >>
> > >> I don't see a reason to add a new module state. Why is it necessary and
> > >> how does it fit with the existing states?
> > >>
> > > because once notifier fails in state MODULE_STATE_UNFORMED (now only ftrace has something to do in this state), notifier chain will roll back by calling blocking_notifier_call_chain_robust, i'm afraid MODULE_STATE_GOING is going to jeopardise the notifiers which don't handle it appropriately, like:
> > > 
> > > case MODULE_STATE_COMING:
> > >      kmalloc();
> > > case MODULE_STATE_GOING:
> > >      kfree();
> > 
> > My understanding is that the current module "state machine" operates as
> > follows. Transitions marked with an asterisk (*) are announced via the
> > module notifier.
> > 
> > ---> UNFORMED --*> COMING --*> LIVE --*> GOING -.
> >         ^            |                     ^    |
> >         |            '---------------------*    |
> >         '---------------------------------------'
> > 
> > The new code aims to replace the current ftrace_module_init() call in
> > load_module(). To achieve this, it adds a notification for the UNFORMED
> > state (only when loading a module) and introduces a new FORMED state for
> > rollback. FORMED is purely a fake state because it never appears in
> > module::state. The new structure is as follows:
> > 
> >         ,--*> (FORMED)
> >         |
> > --*> UNFORMED --*> COMING --*> LIVE --*> GOING -.
> >         ^            |                     ^    |
> >         |            '---------------------*    |
> >         '---------------------------------------'
> > 
> > I'm afraid this is quite complex and inconsistent. Unless it can be kept
> > simple, we would be just replacing one special handling with a different
> > complexity, which is not worth it.
> 
> > >>
> > >>> +    if (err)
> > >>> +        goto ddebug_cleanup;
> > >>>         /* Finally it's fully formed, ready to start executing. */
> > >>>       err = complete_formation(mod, info);
> > >>> -    if (err)
> > >>> +    if (err) {
> > >>> +        blocking_notifier_call_chain_reverse(&module_notify_list,
> > >>> +                MODULE_STATE_FORMED, mod);
> > >>>           goto ddebug_cleanup;
> > >>> +    }
> > >>>   -    err = prepare_coming_module(mod);
> > >>> +    err = prepare_module_state_transaction(mod,
> > >>> +                MODULE_STATE_COMING, MODULE_STATE_GOING);
> > >>>       if (err)
> > >>>           goto bug_cleanup;
> > >>>   @@ -3522,7 +3519,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
> > >>>       destroy_params(mod->kp, mod->num_kp);
> > >>>       blocking_notifier_call_chain(&module_notify_list,
> > >>>                        MODULE_STATE_GOING, mod);
> > >>
> > >> My understanding is that all notifier chains for MODULE_STATE_GOING
> > >> should be reversed.
> > > yes, all, from lowest priority notifier to highest.
> > > I will resend patch 1 which failed due to my proxy setting.
> > 
> > What I meant here is that the call:
> > 
> > blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod);
> > 
> > should be replaced with:
> > 
> > blocking_notifier_call_chain_reverse(&module_notify_list, MODULE_STATE_GOING, mod);
> > 
> > > 
> > >>
> > >>> -    klp_module_going(mod);
> > >>>    bug_cleanup:
> > >>>       mod->state = MODULE_STATE_GOING;
> > >>>       /* module_bug_cleanup needs module_mutex protection */
> > >>
> > >> The patch removes the klp_module_going() cleanup call in load_module().
> > >> Similarly, the ftrace_release_mod() call under the ddebug_cleanup label
> > >> should be removed and appropriately replaced with a cleanup via
> > >> a notifier.
> > >>
> > >     err = prepare_module_state_transaction(mod,
> > >                 MODULE_STATE_UNFORMED, MODULE_STATE_FORMED);
> > >     if (err)
> > >         goto ddebug_cleanup;
> > > 
> > > ftrace will be cleanup in blocking_notifier_call_chain_robust rolling back.
> > > 
> > >     err = prepare_module_state_transaction(mod,
> > >                 MODULE_STATE_COMING, MODULE_STATE_GOING);
> > > 
> > > each notifier including ftrace and klp will be cleanup in blocking_notifier_call_chain_robust rolling back.
> > > 
> > > if all notifiers are successful in MODULE_STATE_COMING, they all will be clean up in
> > >  coming_cleanup:
> > >     mod->state = MODULE_STATE_GOING;
> > >     destroy_params(mod->kp, mod->num_kp);
> > >     blocking_notifier_call_chain(&module_notify_list,
> > >                      MODULE_STATE_GOING, mod);
> > > 
> > > if  something wrong underneath.
> > 
> > My point is that the patch leaves a call to ftrace_release_mod() in
> > load_module(), which I expected to be handled via a notifier.
> 
> I think that I have got it. The ftrace code needs two notifiers when
> the module is being loaded and two when it is going.
> 
> This is why Song added the new state. But I think that we would
> need two new states to call:
> 
>     + ftrace_module_init() in MODULE_STATE_UNFORMED
>     + ftrace_module_enable() in MODULE_STATE_FORMED
> 
> and
> 
>     + ftrace_free_mem() in MODULE_STATE_PRE_GOING
>     + ftrace_free_mem() in MODULE_STATE_GOING
> 
> 
> By using the ascii art:
> 
>  -*> UNFORMED -*> FORMED -> COMING -*> LIVE -*> PRE_GOING -*> GOING -.
>               |          |         |                ^           ^    ^
>               |          |         '----------------'           |    |
>               |          '--------------------------------------'    |
>               '------------------------------------------------------'
> 
> 
> But I think that this is not worth it.

Agree.

If this needs to be ordered so strictly, why would we use a "single"
module notifier chain for this complex situation?

I think the notifier call chain is just for notifying a single signal,
instead of sending several different signals, especially if there is
any dependency among the callbacks.

If notification callbacks need to be ordered, they are currently
sorted by representing priority numerically, but this is quite
fragile for updating. It has to look up other registered priorities
and adjust the order among dependencies each time. For this reason,
this mechanism is not suitable for global ordering. (It's like line
numbers in BASIC.)
It is probably only useful for representing dependencies between
two components maintained by the same maintainer.

I'm against a general-purpose system that makes everything modular.
It unnecessarily complicates things. If there are processes that
require strict ordering, especially processes that must be performed
before each stage as part of the framework, they should be called
directly from the framework, not via notification callbacks.

This makes it simpler and more robust to maintain.

Only the framework's end users should utilize notification callbacks.

Thank you,


> 
> Best Regards,
> Petr
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [LSF/MM/BPF TOPIC][RFC PATCH v4 00/27] Private Memory Nodes (w/ Compressed RAM)
From: Gregory Price @ 2026-04-20  2:56 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: lsf-pc, linux-kernel, linux-cxl, cgroups, linux-mm,
	linux-trace-kernel, damon, kernel-team, gregkh, rafael, dakr,
	dave, jonathan.cameron, dave.jiang, alison.schofield,
	vishal.l.verma, ira.weiny, dan.j.williams, longman, akpm,
	lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko,
	osalvador, ziy, matthew.brost, joshua.hahnjy, rakie.kim,
	byungchul, ying.huang, apopple, axelrasmussen, yuanchu, weixugc,
	yury.norov, linux, mhiramat, mathieu.desnoyers, tj, hannes,
	mkoutny, jackmanb, sj, baolin.wang, npache, ryan.roberts,
	dev.jain, baohua, lance.yang, muchun.song, xu.xin16,
	chengming.zhou, jannh, linmiaohe, nao.horiguchi, pfalcato,
	rientjes, shakeel.butt, riel, harry.yoo, cl, roman.gushchin,
	chrisl, kasong, shikemeng, nphamcs, bhe, zhengqi.arch,
	terry.bowman
In-Reply-To: <46837cea-5d90-49d8-be67-7306e0e89aa3@kernel.org>

On Fri, Apr 17, 2026 at 11:37:36AM +0200, David Hildenbrand (Arm) wrote:
> On 4/15/26 17:17, Gregory Price wrote:
> 
> >> Needs a second thought regarding fallback logic I raised above.
> >>
> >> What I think would have to be audited is the usage of __GFP_THISNODE by
> >> kernel allocations, where we would not actually want to allocate from
> >> this private node.
> >>
> > 
> > This is fair, and a re-visit is absolutely warranted.
> > 
> > Re-examining the quick audit from my last response suggests - I should
> > never have seen leakage in those cases, but the fallbacks are needed.
> > 
> > So yes, this all requires a second look (and a third, and a ninth).
> > 
> > I'm not married to __GFP_PRIVATE, but it has been reliable for me.
> 
> Yes, we should carefully describe which semantics we want to achieve, to
> then figure out how we could achieve them.
> 

Ah, I finally dug up my notes on this.

If we overload __GFP_THISNODE - then we have to audit all gfp_mask's
with THISNODE against the use of any of the following *forever*:

#define node_online_map         node_states[N_ONLINE]
#define node_possible_map       node_states[N_POSSIBLE]
#define for_each_node(node)        for_each_node_state(node, N_POSSIBLE)
#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)

  or

cgroup.cpuset.mems_allowed / mems_effective


Anyone that attempts to do:

    for_each_online_node(node):
        buf = alloc_pages_node(node, __GFP_THISNODE, NULL)

*will* get incidental access to private node memory, and it won't be
obvious to existing tooling that this should be considered a bug.


rate of occurrence in the current code:
-----------------
node_online_map       -  21 instances
node_possible_map     -  25 instances
for_each_node         -  346 instances
for_each_online_node  -  67 instances
GFP_THISNODE          -  58 instances
(notes don't have mems_allowed/mems_effective instances)


But it's not always going to be obvious - since nodemasks and gfp_masks
get passed around as variables all throughout the kernel.

I ultimately determined that auditing this in-tree is already a fool's
errand - and suggesting we try to validate this never occurs for all
future code moving forward is just not realistic in any sense.

I could not come up with a way to remove private nodes from
node_online/possible_map - and private nodes must be added to
cpuset.mems_allowed to allow cpuset control (otherwise all userland
access is blanket denied).

So I moved back to __GFP_PRIVATE.

=== TL;DR:

The core premise of private nodes is isolation first.

So we want this code:

   for node in cpuset.mems_allowed / online_map
       buf = alloc_pages_node(node, __GFP_THISNODE, NULL)

To explicitly fail - so that the caller knows they can't use these
masks this way anymore (it was already potentially a bug, but could
have been masked if all online nodes had memory).

~Gregory

^ permalink raw reply

* Re: [RFC PATCH 1/2] kernel/notifier: replace single-linked list with double-linked list for reverse traversal
From: Masami Hiramatsu @ 2026-04-20  5:44 UTC (permalink / raw)
  To: chensong_2000
  Cc: rafael, lenb, mturquette, sboyd, viresh.kumar, agk, snitzer,
	mpatocka, bmarzins, song, yukuai, linan122, jason.wessel, danielt,
	dianders, horms, davem, edumazet, kuba, pabeni, paulmck, frederic,
	mcgrof, petr.pavlu, da.gomez, samitolvanen, atomlin, jpoimboe,
	jikos, mbenes, pmladek, joe.lawrence, rostedt, mark.rutland,
	mathieu.desnoyers, linux-modules, linux-kernel,
	linux-trace-kernel, linux-acpi, linux-clk, linux-pm,
	live-patching, dm-devel, linux-raid, kgdb-bugreport, netdev
In-Reply-To: <20260415070137.17860-1-chensong_2000@189.cn>

Hi Song,

On Wed, 15 Apr 2026 15:01:37 +0800
chensong_2000@189.cn wrote:

> From: Song Chen <chensong_2000@189.cn>
> 
> The current notifier chain implementation uses a single-linked list
> (struct notifier_block *next), which only supports forward traversal
> in priority order. This makes it difficult to handle cleanup/teardown
> scenarios that require notifiers to be called in reverse priority order.

What about introducing a new notification callback API that allows you
to describe dependencies between callback functions?

For example, when registering a callback, you could register a string
as an ID and specify whether to call it before or after that ID,
or you could register a comparison function that is called when adding
to a list. (I prefer @name and @depends fields so that it can be easily
maintained.)

This would allow for better dependency building when adding to the list.

> 
> A concrete example is the ordering dependency between ftrace and
> livepatch during module load/unload. see the detail here [1].

If this only concerns notification callback issues with the ftrace
and livepatch modules, it's far more robust to simply call the
necessary processing directly when the modules load and unload,
rather than registering notification callbacks externally.

There are fprobe, kprobe and their trace-events, all of which use
ftrace as their foundation layer. In this case, I always need to
consider callback order when a module is unloaded.

If ftrace is working as a part of module callbacks, it will conflict
with fprobe/kprobe module callback. Of course we can reorder it with
modifying its priority. But this is ugly, because when we introduce
a new other feature which depends on another layer, we need to
reorder the callback's priority number on the list.

Based on the above, I don't think this can be resolved simply by
changing the list of notification callbacks to a bidirectional list.

Thank you,

-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* [PATCH v2] tracing: export live module tracepoint strings in printk_formats
From: Cao Ruichuang @ 2026-04-20  6:19 UTC (permalink / raw)
  To: rostedt; +Cc: petr.pavlu, linux-trace-kernel, linux-kernel, Cao Ruichuang
In-Reply-To: <20260413123359.32517-1-create0818@163.com>

tracepoint_string() documents that its strings are exported through
printk_formats so that user space can decode pointer fields recorded in
trace buffers.

That already works for built-in __tracepoint_str entries, but module
__tracepoint_str sections are not collected or exported today. As a
result, module tracepoint_string() users still show raw pointer values
in printk_formats consumers such as trace.dat decoders.

Record module __tracepoint_str sections when modules are loaded, expose
their live ranges through printk_formats, and teach
trace_is_tracepoint_string() to accept those live module strings too.

Keep the lifetime semantics tied to the module itself. This does not
copy strings into tracing-owned storage and does not preserve the
mappings after module unload.

On MODULE_STATE_GOING, the live module string ranges are removed again.
This relies on the existing tracing module notifier ordering: trace
event teardown runs first and resets module event buffers before these
auxiliary string mappings are dropped.

If the small auxiliary registry allocation fails, warn and continue
loading the module. printk_formats exposure is degraded in that case,
but tracing should not fail module load for missing debug metadata.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=217196
Assisted-by: Codex:GPT-5.4
Signed-off-by: Cao Ruichuang <create0818@163.com>
---
v2:
- replace the previous copied-string approach with live module section ranges
- record module __tracepoint_str ranges in struct module
- export only live module tracepoint strings in printk_formats
- remove module mappings on MODULE_STATE_GOING
- keep auxiliary registry allocation failure non-fatal and warn instead
- add explicit notifier priority and document the teardown ordering dependency

Tested in QEMU:
- basic repro showing module tracepoint_string() entries in printk_formats
- load/unload validation confirming mappings are removed after rmmod
- failed module init after MODULE_STATE_COMING with no stale mapping left
- targeted failslab injection on the notifier-time auxiliary allocation,
  confirming module load still succeeds, a warning is emitted, and the
  module mapping is not exported

 include/linux/module.h      |   2 +
 kernel/module/main.c        |   4 +
 kernel/trace/trace_printk.c | 153 ++++++++++++++++++++++++++++++++++--
 3 files changed, 152 insertions(+), 7 deletions(-)

diff --git a/include/linux/module.h b/include/linux/module.h
index 14f391b186c..e475466a785 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -515,6 +515,8 @@ struct module {
 #ifdef CONFIG_TRACING
 	unsigned int num_trace_bprintk_fmt;
 	const char **trace_bprintk_fmt_start;
+	unsigned int num_tracepoint_strings;
+	const char **tracepoint_strings_start;
 #endif
 #ifdef CONFIG_EVENT_TRACING
 	struct trace_event_call **trace_events;
diff --git a/kernel/module/main.c b/kernel/module/main.c
index c3ce106c70a..d7d890138ac 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2672,6 +2672,10 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 	mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
 					 sizeof(*mod->trace_bprintk_fmt_start),
 					 &mod->num_trace_bprintk_fmt);
+	mod->tracepoint_strings_start =
+		section_objs(info, "__tracepoint_str",
+			     sizeof(*mod->tracepoint_strings_start),
+			     &mod->num_tracepoint_strings);
 #endif
 #ifdef CONFIG_DYNAMIC_FTRACE
 	/* sechdrs[0].sh_size is always zero */
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 5ea5e0d76f0..2d41b0a63b3 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -13,6 +13,7 @@
 #include <linux/string.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/rcupdate.h>
 #include <linux/ctype.h>
 #include <linux/list.h>
 #include <linux/slab.h>
@@ -24,10 +25,15 @@
 /*
  * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt
  * which are queued on trace_bprintk_fmt_list.
+ *
+ * modules tracepoint_string() entries are kept as ranges into the owning
+ * module's __tracepoint_str section and are removed again when the module
+ * goes away.
  */
 static LIST_HEAD(trace_bprintk_fmt_list);
+static LIST_HEAD(tracepoint_str_list);
 
-/* serialize accesses to trace_bprintk_fmt_list */
+/* serialize accesses to module trace printk and tracepoint string lists */
 static DEFINE_MUTEX(btrace_mutex);
 
 struct trace_bprintk_fmt {
@@ -35,6 +41,13 @@ struct trace_bprintk_fmt {
 	const char *fmt;
 };
 
+struct tracepoint_mod_str {
+	struct list_head list;
+	struct module *mod;
+	const char **start;
+	unsigned int num;
+};
+
 static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
 {
 	struct trace_bprintk_fmt *pos;
@@ -85,16 +98,70 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
 	mutex_unlock(&btrace_mutex);
 }
 
+static void hold_module_tracepoint_strings(struct module *mod)
+{
+	struct tracepoint_mod_str *tp_str;
+
+	if (!mod->num_tracepoint_strings)
+		return;
+
+	tp_str = kmalloc_obj(*tp_str);
+	if (!tp_str) {
+		pr_warn("tracing: Failed to expose module tracepoint strings for %s\n",
+			mod->name);
+		return;
+	}
+
+	tp_str->mod = mod;
+	tp_str->start = mod->tracepoint_strings_start;
+	tp_str->num = mod->num_tracepoint_strings;
+
+	mutex_lock(&btrace_mutex);
+	list_add_tail_rcu(&tp_str->list, &tracepoint_str_list);
+	mutex_unlock(&btrace_mutex);
+}
+
+static void release_module_tracepoint_strings(struct module *mod)
+{
+	struct tracepoint_mod_str *tp_str, *next;
+	struct tracepoint_mod_str *found = NULL;
+
+	mutex_lock(&btrace_mutex);
+	list_for_each_entry_safe(tp_str, next, &tracepoint_str_list, list) {
+		if (tp_str->mod != mod)
+			continue;
+
+		list_del_rcu(&tp_str->list);
+		found = tp_str;
+		break;
+	}
+	mutex_unlock(&btrace_mutex);
+
+	if (found) {
+		synchronize_rcu();
+		kfree(found);
+	}
+}
+
 static int module_trace_bprintk_format_notify(struct notifier_block *self,
 		unsigned long val, void *data)
 {
 	struct module *mod = data;
-	if (mod->num_trace_bprintk_fmt) {
-		const char **start = mod->trace_bprintk_fmt_start;
-		const char **end = start + mod->num_trace_bprintk_fmt;
 
-		if (val == MODULE_STATE_COMING)
+	switch (val) {
+	case MODULE_STATE_COMING:
+		if (mod->num_trace_bprintk_fmt) {
+			const char **start = mod->trace_bprintk_fmt_start;
+			const char **end = start + mod->num_trace_bprintk_fmt;
+
 			hold_module_trace_bprintk_format(start, end);
+		}
+		hold_module_tracepoint_strings(mod);
+		break;
+	case MODULE_STATE_GOING:
+		/* trace event teardown runs first and clears module event buffers. */
+		release_module_tracepoint_strings(mod);
+		break;
 	}
 	return NOTIFY_OK;
 }
@@ -159,6 +226,55 @@ find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
 	return &mod_fmt->fmt;
 }
 
+static int count_mod_formats(void)
+{
+	struct trace_bprintk_fmt *p;
+	int count = 0;
+
+	list_for_each_entry(p, &trace_bprintk_fmt_list, list)
+		count++;
+
+	return count;
+}
+
+static const char **
+find_next_mod_tracepoint_str(int start_index, loff_t *pos)
+{
+	struct tracepoint_mod_str *tp_str;
+	int index = start_index;
+	unsigned int i;
+
+	list_for_each_entry(tp_str, &tracepoint_str_list, list) {
+		for (i = 0; i < tp_str->num; i++) {
+			if (index == *pos)
+				return tp_str->start + i;
+			index++;
+		}
+	}
+
+	return NULL;
+}
+
+static bool is_module_tracepoint_string(const char *str)
+{
+	struct tracepoint_mod_str *tp_str;
+	unsigned int i;
+	bool found = false;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tp_str, &tracepoint_str_list, list) {
+		for (i = 0; i < tp_str->num; i++) {
+			if (str == tp_str->start[i]) {
+				found = true;
+				goto out;
+			}
+		}
+	}
+out:
+	rcu_read_unlock();
+	return found;
+}
+
 static void format_mod_start(void)
 {
 	mutex_lock(&btrace_mutex);
@@ -181,6 +297,22 @@ find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
 {
 	return NULL;
 }
+
+static inline int count_mod_formats(void)
+{
+	return 0;
+}
+
+static inline const char **
+find_next_mod_tracepoint_str(int start_index, loff_t *pos)
+{
+	return NULL;
+}
+
+static inline bool is_module_tracepoint_string(const char *str)
+{
+	return false;
+}
 static inline void format_mod_start(void) { }
 static inline void format_mod_stop(void) { }
 #endif /* CONFIG_MODULES */
@@ -195,6 +327,7 @@ void trace_printk_control(bool enabled)
 __initdata_or_module static
 struct notifier_block module_trace_bprintk_format_nb = {
 	.notifier_call = module_trace_bprintk_format_notify,
+	.priority = 0,
 };
 
 int __trace_bprintk(unsigned long ip, const char *fmt, ...)
@@ -259,12 +392,13 @@ bool trace_is_tracepoint_string(const char *str)
 		if (str == *ptr)
 			return true;
 	}
-	return false;
+	return is_module_tracepoint_string(str);
 }
 
 static const char **find_next(void *v, loff_t *pos)
 {
 	const char **fmt = v;
+	int mod_formats;
 	int start_index;
 	int last_index;
 
@@ -292,7 +426,12 @@ static const char **find_next(void *v, loff_t *pos)
 		return __start___tracepoint_str + (*pos - last_index);
 
 	start_index += last_index;
-	return find_next_mod_format(start_index, v, fmt, pos);
+	mod_formats = count_mod_formats();
+	if (*pos < start_index + mod_formats)
+		return find_next_mod_format(start_index, v, fmt, pos);
+
+	start_index += mod_formats;
+	return find_next_mod_tracepoint_str(start_index, pos);
 }
 
 static void *
-- 
2.39.5 (Apple Git-154)


^ permalink raw reply related

* Re: [RFC v4 0/7] ext4: fast commit: snapshot inode state for FC log
From: Li Chen @ 2026-04-20  9:37 UTC (permalink / raw)
  To: Theodore Tso
  Cc: Zhang Yi, Andreas Dilger, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, linux-ext4, linux-trace-kernel, linux-kernel
In-Reply-To: <20260413131244.GB20496@macsyma-wired.lan>

Hi Theodore,

 ---- On Mon, 13 Apr 2026 21:12:44 +0800  Theodore Tso <tytso@mit.edu> wrote --- 
 > On Mon, Apr 13, 2026 at 09:01:28PM +0800, Li Chen wrote:
 > > Absolutely! It's great to learn about the Sashiko development site.
 > > I will address the real issues in the next version.
 > 
 > Note that Sashiko will sometimes report a pre-existing issue as if it
 > were a problem with the commit.  If that happens, feel free to ignore
 > its complaint; what I consider best practice is to either (a) fix it
 > in a subsequent patch or patch series, or (b) leave a TODO in the
 > code.
 > 
 > I've asked the Sashiko folks to add a way to get a URI for each issue
 > that is identified by Sashiko, so we can put a URL in the TODO comment
 > for someone who wants to fix it later, and to make it easier for Sashiko
 > to identify pre-existing issues so it doesn't comment on the same
 > issue across multiple commit reviews (and perhaps save on some of the
 > LLM token budget :-).
 > 
 > In the next few days, for patches sent to linux-ext4, Sashiko will
 > start e-mailing its reviews to the patch submitter and to me as the
 > maintainer.  Once we can reduce the false positive rate, I'll ask that
 > the reviews be cc'ed to the linux-ext4 mailing list.  But it seems
 > good enough to send e-mails to the patch submitter and the
 > maintainer --- but that's a decision that each subsystem maintainer
 > will be making on their own.

Got it, thanks. I'll treat Sashiko as a review aid, fix the real issues in the next version, 
and leave unrelated pre-existing issues for follow-up or a TODO.

Regards,
Li


^ permalink raw reply

* [PATCH] trace: remove the dead IS_ERR() check in trace_pipe_open()
From: Yash Suthar @ 2026-04-20 10:12 UTC (permalink / raw)
  To: rostedt, mhiramat
  Cc: mathieu.desnoyers, linux-kernel, linux-trace-kernel, Yash Suthar

trace_pipe_open() already checks IS_ERR(iter) and returns early on
error, so iter is guaranteed to be valid afterwards and it is safe to
simply return 0 at the end.

Signed-off-by: Yash Suthar <yashsuthar983@gmail.com>
---
 kernel/trace/trace_remote.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index d6c3f94d67cd..2a6cc000ec98 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -602,7 +602,7 @@ static int trace_pipe_open(struct inode *inode, struct file *filp)
 
 	filp->private_data = iter;
 
-	return IS_ERR(iter) ? PTR_ERR(iter) : 0;
+	return 0;
 }
 
 static int trace_pipe_release(struct inode *inode, struct file *filp)
-- 
2.43.0


^ permalink raw reply related

* [PATCH net v1] net: validate skb->napi_id in RX tracepoints
From: Kohei Enju @ 2026-04-20 10:54 UTC (permalink / raw)
  To: netdev, linux-trace-kernel
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Kohei Enju

Since commit 2bd82484bb4c ("xps: fix xps for stacked devices"),
skb->napi_id shares storage with sender_cpu. RX tracepoints using
net_dev_rx_verbose_template read skb->napi_id directly and can therefore
report sender_cpu values as if they were NAPI IDs.

For example, on the loopback path this can report 1 as napi_id, where 1
comes from raw_smp_processor_id() + 1 in the XPS path:

  # bpftrace -e 'tracepoint:net:netif_rx_entry{ print(args->napi_id); }'
  # taskset -c 0 ping -c 1 ::1

Report only valid NAPI IDs in these tracepoints and use 0 otherwise.

Fixes: 2bd82484bb4c ("xps: fix xps for stacked devices")
Signed-off-by: Kohei Enju <kohei@enjuk.jp>
---
 include/trace/events/net.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index fdd9ad474ce3..dbc2c5598e35 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -10,6 +10,7 @@
 #include <linux/if_vlan.h>
 #include <linux/ip.h>
 #include <linux/tracepoint.h>
+#include <net/busy_poll.h>
 
 TRACE_EVENT(net_dev_start_xmit,
 
@@ -208,7 +209,8 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
 	TP_fast_assign(
 		__assign_str(name);
 #ifdef CONFIG_NET_RX_BUSY_POLL
-		__entry->napi_id = skb->napi_id;
+		__entry->napi_id = napi_id_valid(skb->napi_id) ?
+				   skb->napi_id : 0;
 #else
 		__entry->napi_id = 0;
 #endif
-- 
2.51.0


^ permalink raw reply related

* Re: [PATCH net v1] net: validate skb->napi_id in RX tracepoints
From: Jiayuan Chen @ 2026-04-20 11:27 UTC (permalink / raw)
  To: Kohei Enju, netdev, linux-trace-kernel
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers
In-Reply-To: <20260420105427.162816-1-kohei@enjuk.jp>


On 4/20/26 6:54 PM, Kohei Enju wrote:
> Since commit 2bd82484bb4c ("xps: fix xps for stacked devices"),
> skb->napi_id shares storage with sender_cpu. RX tracepoints using
> net_dev_rx_verbose_template read skb->napi_id directly and can therefore
> report sender_cpu values as if they were NAPI IDs.
>
> For example, on the loopback path this can report 1 as napi_id, where 1
So I think veth_forward_skb->__netif_rx could be affected as well?
> comes from raw_smp_processor_id() + 1 in the XPS path:
>
>    # bpftrace -e 'tracepoint:net:netif_rx_entry{ print(args->napi_id); }'
>    # taskset -c 0 ping -c 1 ::1
>
> Report only valid NAPI IDs in these tracepoints and use 0 otherwise.
>
> Fixes: 2bd82484bb4c ("xps: fix xps for stacked devices")
> Signed-off-by: Kohei Enju <kohei@enjuk.jp>
> ---
>   include/trace/events/net.h | 4 +++-
>   1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/include/trace/events/net.h b/include/trace/events/net.h
> index fdd9ad474ce3..dbc2c5598e35 100644
> --- a/include/trace/events/net.h
> +++ b/include/trace/events/net.h
> @@ -10,6 +10,7 @@
>   #include <linux/if_vlan.h>
>   #include <linux/ip.h>
>   #include <linux/tracepoint.h>
> +#include <net/busy_poll.h>
>   
>   TRACE_EVENT(net_dev_start_xmit,
>   
> @@ -208,7 +209,8 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
>   	TP_fast_assign(
>   		__assign_str(name);
>   #ifdef CONFIG_NET_RX_BUSY_POLL
> -		__entry->napi_id = skb->napi_id;
> +		__entry->napi_id = napi_id_valid(skb->napi_id) ?
> +				   skb->napi_id : 0;
>   #else
>   		__entry->napi_id = 0;
>   #endif

^ permalink raw reply

* Re: [PATCH net v1] net: validate skb->napi_id in RX tracepoints
From: Kohei Enju @ 2026-04-20 11:54 UTC (permalink / raw)
  To: Jiayuan Chen
  Cc: netdev, linux-trace-kernel, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers
In-Reply-To: <b943ec1e-417c-4157-ab19-b34aa6d63688@linux.dev>

On 04/20 19:27, Jiayuan Chen wrote:
> 
> On 4/20/26 6:54 PM, Kohei Enju wrote:
> > Since commit 2bd82484bb4c ("xps: fix xps for stacked devices"),
> > skb->napi_id shares storage with sender_cpu. RX tracepoints using
> > net_dev_rx_verbose_template read skb->napi_id directly and can therefore
> > report sender_cpu values as if they were NAPI IDs.
> > 
> > For example, on the loopback path this can report 1 as napi_id, where 1
> So I think veth_forward_skb->__netif_rx could be affected as well?

Yes. Just in case, I've confirmed the same behavior in the veth path.
The mentioned loopback path is just a single example of possibly
affected paths.

Thanks,
Kohei

> > comes from raw_smp_processor_id() + 1 in the XPS path:
> > 
> >    # bpftrace -e 'tracepoint:net:netif_rx_entry{ print(args->napi_id); }'
> >    # taskset -c 0 ping -c 1 ::1
> > 
> > Report only valid NAPI IDs in these tracepoints and use 0 otherwise.
> > 
> > Fixes: 2bd82484bb4c ("xps: fix xps for stacked devices")
> > Signed-off-by: Kohei Enju <kohei@enjuk.jp>
> > ---
> >   include/trace/events/net.h | 4 +++-
> >   1 file changed, 3 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/trace/events/net.h b/include/trace/events/net.h
> > index fdd9ad474ce3..dbc2c5598e35 100644
> > --- a/include/trace/events/net.h
> > +++ b/include/trace/events/net.h
> > @@ -10,6 +10,7 @@
> >   #include <linux/if_vlan.h>
> >   #include <linux/ip.h>
> >   #include <linux/tracepoint.h>
> > +#include <net/busy_poll.h>
> >   TRACE_EVENT(net_dev_start_xmit,
> > @@ -208,7 +209,8 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
> >   	TP_fast_assign(
> >   		__assign_str(name);
> >   #ifdef CONFIG_NET_RX_BUSY_POLL
> > -		__entry->napi_id = skb->napi_id;
> > +		__entry->napi_id = napi_id_valid(skb->napi_id) ?
> > +				   skb->napi_id : 0;
> >   #else
> >   		__entry->napi_id = 0;
> >   #endif

^ permalink raw reply

* Re: [PATCH v13 17/18] unwind_user/sframe/x86: Enable sframe unwinding on x86
From: Jens Remus @ 2026-04-20 12:35 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel, bpf, x86, linux-mm,
	Steven Rostedt
  Cc: Josh Poimboeuf, Masami Hiramatsu, Mathieu Desnoyers,
	Peter Zijlstra, Ingo Molnar, Jiri Olsa, Arnaldo Carvalho de Melo,
	Namhyung Kim, Thomas Gleixner, Andrii Nakryiko, Indu Bhagat,
	Jose E. Marchesi, Beau Belgrave, Linus Torvalds, Andrew Morton,
	Florian Weimer, Kees Cook, Carlos O'Donell, Sam James,
	Dylan Hatch, Borislav Petkov, Dave Hansen, David Hildenbrand,
	H. Peter Anvin, Liam R. Howlett, Lorenzo Stoakes, Michal Hocko,
	Mike Rapoport, Suren Baghdasaryan, Vlastimil Babka,
	Heiko Carstens, Vasily Gorbik, Steven Rostedt (Google)
In-Reply-To: <20260127150554.2760964-18-jremus@linux.ibm.com>

On 1/27/2026 4:05 PM, Jens Remus wrote:

> diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h

> @@ -15,6 +15,40 @@ static inline int unwind_user_word_size(struct pt_regs *regs)
>  	return user_64bit_mode(regs) ? 8 : 4;
>  }
>  
> +static inline int unwind_user_get_reg(unsigned long *val, unsigned int regnum)
> +{
> +#ifdef CONFIG_X86_64
> +	const struct pt_regs *regs = task_pt_regs(current);
> +
> +	switch (regnum) {
> +	/* DWARF register numbers 0..15 */
> +	case  0: *val = regs->ax; break;
> +	case  1: *val = regs->dx; break;
> +	case  2: *val = regs->cx; break;
> +	case  3: *val = regs->bx; break;
> +	case  4: *val = regs->si; break;
> +	case  5: *val = regs->di; break;
> +	case  6: *val = regs->bp; break;
> +	case  7: *val = regs->sp; break;
> +	case  8: *val = regs->r8; break;
> +	case  9: *val = regs->r9; break;
> +	case 10: *val = regs->r10; break;
> +	case 11: *val = regs->r11; break;
> +	case 12: *val = regs->r12; break;
> +	case 13: *val = regs->r13; break;
> +	case 14: *val = regs->r14; break;
> +	case 15: *val = regs->r15; break;
> +	default:
> +		return -EINVAL;
> +	}
> +	return 0;
> +#else /* !CONFIG_X86_64 */
> +	return -EINVAL;
> +#endif /* !CONFIG_X86_64 */
> +

Nit: Superfluous empty line.

> +}
> +#define unwind_user_get_reg unwind_user_get_reg
> +
>  #endif /* CONFIG_UNWIND_USER */
>  
>  #ifdef CONFIG_HAVE_UNWIND_USER_FP
Regards,
Jens
-- 
Jens Remus
Linux on Z Development (D3303)
jremus@de.ibm.com / jremus@linux.ibm.com

IBM Deutschland Research & Development GmbH; Vorsitzender des Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der Gesellschaft: Ehningen; Registergericht: Amtsgericht Stuttgart, HRB 243294
IBM Data Privacy Statement: https://www.ibm.com/privacy/


^ permalink raw reply

* Re: [PATCH 7.2 v16 01/13] mm/khugepaged: generalize hugepage_vma_revalidate for mTHP support
From: Usama Arif @ 2026-04-20 12:59 UTC (permalink / raw)
  To: Nico Pache
  Cc: Usama Arif, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, ljs, mathieu.desnoyers, matthew.brost,
	mhiramat, mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-2-npache@redhat.com>

On Sun, 19 Apr 2026 12:57:38 -0600 Nico Pache <npache@redhat.com> wrote:

> For khugepaged to support different mTHP orders, we must generalize this
> to check if the PMD is not shared by another VMA and that the order is
> enabled.
> 
> No functional change in this patch. Also correct a comment about the
> functionality of the revalidation and fix a double-space issue.
> 
> Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
> Reviewed-by: Lance Yang <lance.yang@linux.dev>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
> Reviewed-by: Zi Yan <ziy@nvidia.com>
> Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 20 ++++++++++++--------
>  1 file changed, 12 insertions(+), 8 deletions(-)
> 

Acked-by: Usama Arif <usama.arif@linux.dev>

^ permalink raw reply

* Re: [PATCH 7.2 v16 02/13] mm/khugepaged: generalize alloc_charge_folio()
From: Usama Arif @ 2026-04-20 13:05 UTC (permalink / raw)
  To: Nico Pache
  Cc: Usama Arif, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, ljs, mathieu.desnoyers, matthew.brost,
	mhiramat, mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-3-npache@redhat.com>

On Sun, 19 Apr 2026 12:57:39 -0600 Nico Pache <npache@redhat.com> wrote:

> From: Dev Jain <dev.jain@arm.com>
> 
> Pass order to alloc_charge_folio() and update mTHP statistics.
> 
> Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
> Reviewed-by: Lance Yang <lance.yang@linux.dev>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
> Reviewed-by: Zi Yan <ziy@nvidia.com>
> Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> Co-developed-by: Nico Pache <npache@redhat.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  Documentation/admin-guide/mm/transhuge.rst |  8 ++++++++
>  include/linux/huge_mm.h                    |  2 ++
>  mm/huge_memory.c                           |  4 ++++
>  mm/khugepaged.c                            | 17 +++++++++++------
>  4 files changed, 25 insertions(+), 6 deletions(-)
> 

Acked-by: Usama Arif <usama.arif@linux.dev>

^ permalink raw reply

* [PATCH AUTOSEL 7.0-6.12] ring-buffer: Enforce read ordering of trace_buffer cpumask and buffers
From: Sasha Levin @ 2026-04-20 13:07 UTC (permalink / raw)
  To: patches, stable
  Cc: Vincent Donnefort, Steven Rostedt (Google), Sasha Levin, mhiramat,
	linux-kernel, linux-trace-kernel
In-Reply-To: <20260420131539.986432-1-sashal@kernel.org>

From: Vincent Donnefort <vdonnefort@google.com>

[ Upstream commit 20ad8b0888be392eb2c4c3654805eb8594952373 ]

On CPU hotplug, if it is the first time a trace_buffer sees a CPU, a
ring_buffer_per_cpu will be allocated and its corresponding bit toggled
in the cpumask. Many readers check this cpumask to know if they can
safely read the ring_buffer_per_cpu but they are doing so without memory
ordering and may observe the cpumask bit set while having NULL buffer
pointer.

Enforce the memory read ordering by sending an IPI to all online CPUs.
The hotplug path is a slow-path anyway and it saves us from adding read
barriers in numerous call sites.

Link: https://patch.msgid.link/20260401053659.3458961-1-vdonnefort@google.com
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---

LLM Generated explanations, may be completely bogus:

Based on my comprehensive analysis, I have sufficient information to
make a decision. Let me now lay out the full analysis following the
9-phase workflow.

---

## PHASE 1: COMMIT MESSAGE FORENSICS

**Step 1.1 - Subject Line Parse:**
Record: Subsystem=`ring-buffer`, action verb=`Enforce`, summary=`Enforce
read ordering of trace_buffer cpumask and buffers`. "Enforce" implies
fixing a missing ordering guarantee = bug fix.

**Step 1.2 - Tags:**
Record:
- Link: https://patch.msgid.link/20260401053659.3458961-1-
  vdonnefort@google.com (lore discussion)
- Signed-off-by: Vincent Donnefort (author)
- Suggested-by: Steven Rostedt (Google) - tracing maintainer suggested
  the fix approach
- Signed-off-by: Steven Rostedt - applied via maintainer tree
- No Fixes: tag
- No Cc: stable tag
- No Reported-by tag

**Step 1.3 - Body Analysis:**
Record: Bug described is a memory ordering issue. On CPU hotplug,
`trace_rb_cpu_prepare()` allocates a `ring_buffer_per_cpu` and sets the
corresponding cpumask bit. Readers on other CPUs check the cpumask
first, then access `buffer->buffers[cpu]`. Without read ordering, on
weakly-ordered architectures a reader may observe the cpumask bit set
while still seeing NULL for `buffer->buffers[cpu]`, causing NULL pointer
dereference. Fix uses IPI + barrier trick to force ordering across all
CPUs. Language "may observe" indicates defensive/analytical fix rather
than reported crash.

**Step 1.4 - Hidden Fix Detection:**
Record: This IS clearly a bug fix (memory ordering/race fix), though
without explicit "fix" in subject. The word "Enforce" indicates adding a
missing ordering guarantee - a classic race fix pattern.

---

## PHASE 2: DIFF ANALYSIS

**Step 2.1 - Inventory:**
Record: Single file `kernel/trace/ring_buffer.c`, +18 -1 lines. New
function `rb_cpu_sync()` (6 lines). Modified `trace_rb_cpu_prepare()`
(replace single `smp_wmb()` with conditional IPI + wmb). Surgical fix.

**Step 2.2 - Code Flow:**
Record:
- Before: `rb_allocate_cpu_buffer()` -> `smp_wmb()` ->
  `cpumask_set_cpu()`. Writer-side ordering only.
- After: `rb_allocate_cpu_buffer()` -> `on_each_cpu(rb_cpu_sync, ...)`
  (IPI to all online CPUs, each executes `smp_rmb()`) -> `smp_wmb()` ->
  `cpumask_set_cpu()`. Forces synchronization on reader CPUs.
- `if (unlikely(system_state == SYSTEM_RUNNING))` guard avoids IPI
  during early boot when IPI infrastructure may not be ready.

**Step 2.3 - Bug Mechanism:**
Record: Category (b) Synchronization/race condition fix. Specifically:
data race between writer (trace_rb_cpu_prepare) and many readers/writers
on other CPUs. On weakly-ordered CPUs (ARM, ARM64, PowerPC), readers can
observe the cpumask bit set before seeing the newly-allocated buffer
pointer, leading to NULL dereference. The IPI mechanism acts as a cross-
CPU barrier: after `on_each_cpu()` returns synchronously, every online
CPU has executed `smp_rmb()` via the IPI handler. Any subsequent
cpumask_set_cpu write becomes visible only after that sync, ensuring any
reader observing the bit also observes the buffer pointer.

**Step 2.4 - Fix Quality:**
Record: Fix is conceptually elegant and correct. Uses a well-known
kernel pattern (IPI-as-barrier). Only affects the slow CPU hotplug path.
Low regression risk - the IPI is guarded against early-boot execution,
and CPU hotplug is inherently rare. The alternative (adding `smp_rmb()`
to 30+ callsites) would be much more invasive and error-prone.

---

## PHASE 3: GIT HISTORY

**Step 3.1 - Blame:**
Record: The `smp_wmb()` + `cpumask_set_cpu` pattern was introduced by
commit `b32614c03413f8` "tracing/rb: Convert to hotplug state machine"
by Sebastian Andrzej Siewior, dated Nov 27, 2016 (v4.10). This code has
been present in all stable trees since v4.10.

**Step 3.2 - Fixes: Tag Follow-up:**
Record: No Fixes: tag. The buggy code (the missing read barrier on
reader side) has been present since 2016. The root cause is systemic -
many reader callsites were added over the years without matching
smp_rmb().

**Step 3.3 - File History:**
Record: `kernel/trace/ring_buffer.c` is actively maintained (578
commits, recent activity). No prerequisite patches needed. Standalone
fix (v1 only).

**Step 3.4 - Author Context:**
Record: Vincent Donnefort is a regular ring-buffer contributor (6+
commits to ring_buffer.c in 2024). Steven Rostedt (tracing maintainer)
suggested the approach. Both are highly credentialed.

**Step 3.5 - Dependencies:**
Record: None. Self-contained fix. `on_each_cpu`, `smp_rmb`, `smp_wmb`,
`system_state`/`SYSTEM_RUNNING` are all long-standing kernel primitives
available in all stable trees.

---

## PHASE 4: MAILING LIST

**Step 4.1 - Patch Discussion:**
Record: `b4 dig -c 20ad8b0888be3` returned lore URL https://lore.kernel.
org/all/20260401053659.3458961-1-vdonnefort@google.com/. Thread contains
only the single patch submission (v1). No review comments, no NAKs, no
"Cc: stable" suggestions. Thread mbox has just the submission message.

**Step 4.2 - Reviewers:**
Record: To: rostedt@goodmis.org, mhiramat@kernel.org,
mathieu.desnoyers@efficios.com, linux-trace-kernel@vger.kernel.org. Cc:
kernel-team@android.com, linux-kernel@vger.kernel.org. All relevant
maintainers CC'd.

**Step 4.3 - Bug Report:**
Record: No bug report linked. No Reported-by. This is a defensive fix
based on code analysis. Web fetch of lore was blocked by Anubis bot
protection.

**Step 4.4 - Related Patches:**
Record: `b4 dig -c -a` confirmed only v1 exists. Standalone single
patch, not part of a series.

**Step 4.5 - Stable Discussion:**
Record: Not searched separately; no Cc: stable in the submission
indicates the author/maintainer did not explicitly nominate it for
stable.

---

## PHASE 5: CODE SEMANTIC ANALYSIS

**Step 5.1 - Key Functions:**
Record: `trace_rb_cpu_prepare()` (modified), `rb_cpu_sync()` (new).

**Step 5.2 - Callers of `trace_rb_cpu_prepare`:**
Record: Registered as CPU hotplug callback via
`cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, ...)`. Called on CPU
online transition for each `trace_buffer` registered. Runs in kernel
context on the control CPU (not the target CPU).

**Step 5.3 - Readers Affected (impact surface):**
Record: `grep cpumask_test_cpu.*buffer->cpumask` in ring_buffer.c shows
30+ callsites that use the pattern `if (!cpumask_test_cpu(cpu,
buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu];`. Key hot
paths include:
- `ring_buffer_lock_reserve()` line 4716 - the main WRITE path, called
  from every trace event
- `ring_buffer_peek()` line 5874 (reader)
- `ring_buffer_record_disable_cpu/enable_cpu` - userspace-reachable via
  tracefs
- `rb_wait()` via userspace read of per-CPU trace_pipe_raw

These are user-reachable from tracefs and syscall paths.

**Step 5.4 - Bug Reachability:**
Record: User-reachable. CPU hotplug is triggered by:
- Boot (initial CPU online)
- Suspend/resume cycle (all CPUs brought down/up)
- `/sys/devices/system/cpu/cpuX/online` toggling by root
- VM CPU hotplug on cloud instances

Concurrent readers on other CPUs during hotplug could race with the
buffer allocation. On weakly-ordered archs (ARM64 very common), the race
is realistic.

**Step 5.5 - Similar Patterns:**
Record: The IPI-as-barrier pattern is used elsewhere in the kernel
(e.g., `kernel/time/hrtimer.c:958` `on_each_cpu(retrigger_next_event,
NULL, 1)`). It's a well-established idiom.

---

## PHASE 6: STABLE TREE ANALYSIS

**Step 6.1 - Buggy Code in Stable:**
Record: Confirmed by inspecting archived versions - the exact
`smp_wmb(); cpumask_set_cpu(cpu, buffer->cpumask);` pattern exists in
v6.1, v6.6, v6.12, v6.18 (all active stable trees) and has been there
since v4.10. Many readers lacking `smp_rmb()` are also present across
all those versions.

**Step 6.2 - Backport Complications:**
Record: The patch context is simple and stable across versions. The line
numbers differ but the exact 2-line context
(`smp_wmb();\n\tcpumask_set_cpu(cpu, buffer->cpumask);`) is unchanged
across all examined stable trees. Should apply cleanly with fuzz.

**Step 6.3 - Related Fixes in Stable:**
Record: No prior related fix found. Searched for memory ordering/barrier
commits in kernel/trace/ - no matches targeting this specific ordering
issue.

---

## PHASE 7: SUBSYSTEM CONTEXT

**Step 7.1 - Subsystem/Criticality:**
Record: `kernel/trace/ring_buffer.c` is the core ring buffer backing
tracefs/ftrace/perf/BPF. Criticality = IMPORTANT (affects all users of
tracing - kernel developers, security/observability tools, distros using
ftrace).

**Step 7.2 - Subsystem Activity:**
Record: Very active - ~578 commits to ring_buffer.c, recent activity in
2024-2026. Well-maintained with careful attention to correctness.

---

## PHASE 8: IMPACT AND RISK

**Step 8.1 - Affected Population:**
Record: Users of tracing (ftrace, perf, BPF) on systems with CPU
hotplug, especially on weakly-ordered architectures (ARM64 - ubiquitous
in cloud, mobile, embedded; PowerPC - servers). Any Linux system where
tracing is active during suspend/resume is potentially affected.

**Step 8.2 - Trigger Conditions:**
Record: Requires:
1. CPU hotplug event bringing online a CPU not previously seen by a
   trace_buffer
2. Concurrent tracing activity on other CPUs
3. Weakly-ordered CPU architecture (or unlucky timing on x86)

Trigger is realistic (suspend/resume of any laptop, VM CPU hotplug) but
timing-sensitive. Unprivileged users cannot directly trigger CPU
hotplug, but tracing (if accessible) can be used to exercise the hot
path. `system_state` transitions occur normally at boot.

**Step 8.3 - Failure Severity:**
Record: NULL pointer dereference -> kernel oops/panic. CRITICAL severity
(system crash). No data corruption but immediate loss of system
availability.

**Step 8.4 - Risk-Benefit:**
Record:
- Benefit: MEDIUM-HIGH. Prevents a real kernel crash in a widely-used
  subsystem on common architectures.
- Risk: LOW. 18-line change. IPI only on slow-path (CPU hotplug).
  SYSTEM_RUNNING guard prevents early-boot issues. Standard kernel
  idiom. Suggested by subsystem maintainer.
- Ratio: Favorable for backport.

---

## PHASE 9: SYNTHESIS

**Step 9.1 - Evidence Summary:**

FOR backport:
- Real memory-ordering race that can cause NULL dereference (kernel
  crash)
- Small, surgical fix (18 lines, single file)
- Buggy pattern present in all active stable trees since v4.10
- Suggested by the tracing maintainer (Steven Rostedt); authored by a
  regular ring_buffer.c contributor (Vincent Donnefort)
- Applies cleanly (context unchanged across stable versions)
- Affects commonly-used architecture (ARM64)
- Impact path includes hot tracing code called from many paths

AGAINST backport:
- No Fixes: tag
- No Cc: stable tag
- No Reported-by - defensive fix based on code analysis, not observed
  crash
- Bug was latent for 10 years without mainstream report
- Commit message uses conditional "may observe"

**Step 9.2 - Stable Rules Checklist:**
1. Obviously correct and tested? Yes - uses standard IPI-as-barrier
   idiom, accepted by maintainer
2. Fixes a real bug? Yes - verifiable memory-ordering race
3. Important (crash)? Yes - NULL pointer dereference
4. Small and contained? Yes - 18 lines, single file
5. No new features? Correct - pure ordering fix
6. Can apply to stable? Yes - simple context, unchanged code structure

**Step 9.3 - Exception Category:**
Not a device-ID/quirk/build/doc/DT fix. Standard bug fix category.

**Step 9.4 - Decision:**

This is a borderline-positive case. The fix addresses a genuine memory-
ordering bug that can cause a kernel crash (NULL dereference) on weakly-
ordered architectures. The fix is small, well-reasoned, suggested by the
subsystem maintainer, and very low regression risk. While the absence of
Fixes:/Cc:stable tags and concrete reports suggests lower urgency,
stable rules explicitly include theoretical races with severe
consequences. The tracing subsystem is widely used, and ARM64 (where
this bug most readily manifests) is prevalent. The fix applies cleanly
to stable trees.

---

## Verification

- [Phase 1] Parsed commit message: confirmed no Fixes:, no Cc: stable,
  no Reported-by; Suggested-by: Steven Rostedt (tracing maintainer)
- [Phase 1] Language analysis: "may observe" and "Enforce" indicate
  defensive race fix based on code analysis
- [Phase 2] `git show 20ad8b0888be3 --stat`: verified 18 insertions, 1
  deletion, single file
- [Phase 2] Diff inspection: verified replacement of `smp_wmb()` with
  conditional `on_each_cpu + smp_wmb` block; new `rb_cpu_sync` helper
  with `smp_rmb()`
- [Phase 3] `git blame -L 7508,7512 kernel/trace/ring_buffer.c`:
  confirmed `smp_wmb();/cpumask_set_cpu` pattern introduced by
  b32614c03413f8 in v4.10 (Nov 2016)
- [Phase 3] `git show b32614c03413f8`: confirmed original commit by
  Sebastian Andrzej Siewior, "tracing/rb: Convert to hotplug state
  machine"
- [Phase 3] `git log --author="Vincent Donnefort" --
  kernel/trace/ring_buffer.c`: author has 6+ ring_buffer.c commits,
  regular contributor
- [Phase 4] `b4 dig -c 20ad8b0888be3`: resolved to lore URL, confirmed
  submission thread
- [Phase 4] `b4 dig -c -a`: confirmed only v1 exists, no revisions
- [Phase 4] `/tmp/rb_sync_thread.mbox` read: thread has only the single
  patch submission, no review replies, no stable nomination in
  discussion
- [Phase 5] `grep cpumask_test_cpu.*buffer->cpumask`: confirmed 30+
  reader callsites in ring_buffer.c
- [Phase 5] Verified `ring_buffer_lock_reserve` (line 4716) uses the
  pattern - hot write path
- [Phase 5] Verified `on_each_cpu(x, NULL, 1)` idiom used elsewhere
  (kernel/time/hrtimer.c:958)
- [Phase 6] `git show v6.6/v6.12/v6.18:kernel/trace/ring_buffer.c`:
  confirmed identical 2-line context `smp_wmb();\ncpumask_set_cpu(cpu,
  buffer->cpumask);` present in all major stable trees -> patch will
  apply cleanly
- [Phase 6] `git show v4.10:kernel/trace/ring_buffer.c`: confirmed
  pattern present at trace_rb_cpu_prepare since v4.10
- [Phase 7] `git log --oneline --since=2024 --
  kernel/trace/ring_buffer.c`: confirmed active subsystem with many
  recent commits
- [Phase 8] Confirmed reachability: cpumask checks precede
  `buffer->buffers[cpu]` dereference in hot write path
  (`ring_buffer_lock_reserve`) and reader paths - NULL deref is possible
  if race occurs
- UNVERIFIED: Cannot confirm whether this race has actually been
  observed in production (no Reported-by, no Link to bug tracker, Lore
  WebFetch blocked by bot protection). Assessment is based on code
  analysis and consequences of the race, which are severe when it does
  trigger.

**YES**

 kernel/trace/ring_buffer.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 170170bd83bd9..10d2d0404434d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -7468,6 +7468,12 @@ int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
 	return 0;
 }

+static void rb_cpu_sync(void *data)
+{
+	/* Not really needed, but documents what is happening */
+	smp_rmb();
+}
+
 /*
  * We only allocate new buffers, never free them if the CPU goes down.
  * If we were to free the buffer, then the user would lose any trace that was in
@@ -7506,7 +7512,18 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
 		     cpu);
 		return -ENOMEM;
 	}
-	smp_wmb();
+
+	/*
+	 * Ensure trace_buffer readers observe the newly allocated
+	 * ring_buffer_per_cpu before they check the cpumask. Instead of using a
+	 * read barrier for all readers, send an IPI.
+	 */
+	if (unlikely(system_state == SYSTEM_RUNNING)) {
+		on_each_cpu(rb_cpu_sync, NULL, 1);
+		/* Not really needed, but documents what is happening */
+		smp_wmb();
+	}
+
 	cpumask_set_cpu(cpu, buffer->cpumask);
 	return 0;
 }
-- 
2.53.0

^ permalink raw reply related

* [PATCH AUTOSEL 6.18] tracing/probe: reject non-closed empty immediate strings
From: Sasha Levin @ 2026-04-20 13:08 UTC (permalink / raw)
  To: patches, stable
  Cc: Pengpeng Hou, Steven Rostedt (Google), Masami Hiramatsu (Google),
	Sasha Levin, linux-kernel, linux-trace-kernel
In-Reply-To: <20260420131539.986432-1-sashal@kernel.org>

From: Pengpeng Hou <pengpeng@iscas.ac.cn>

[ Upstream commit 4346be6577aaa04586167402ae87bbdbe32484a4 ]

parse_probe_arg() accepts quoted immediate strings and passes the body
after the opening quote to __parse_imm_string(). That helper currently
computes strlen(str) and immediately dereferences str[len - 1], which
underflows when the body is empty and not closed with a double quote.

Reject empty non-closed immediate strings before checking for the closing quote.

Link: https://lore.kernel.org/all/20260401160315.88518-1-pengpeng@iscas.ac.cn/

Fixes: a42e3c4de964 ("tracing/probe: Add immediate string parameter support")
Signed-off-by: Pengpeng Hou <pengpeng@iscas.ac.cn>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---

LLM Generated explanations, may be completely bogus:

Error: Failed to generate final synthesis

 kernel/trace/trace_probe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 5cbdc423afebc..d7adbf1536c8b 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1068,7 +1068,7 @@ static int __parse_imm_string(char *str, char **pbuf, int offs)
 {
 	size_t len = strlen(str);
 
-	if (str[len - 1] != '"') {
+	if (!len || str[len - 1] != '"') {
 		trace_probe_log_err(offs + len, IMMSTR_NO_CLOSE);
 		return -EINVAL;
 	}
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH 7.2 v16 03/13] mm/khugepaged: rework max_ptes_* handling with helper functions
From: Usama Arif @ 2026-04-20 13:15 UTC (permalink / raw)
  To: Nico Pache
  Cc: Usama Arif, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, ljs, mathieu.desnoyers, matthew.brost,
	mhiramat, mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-4-npache@redhat.com>

On Sun, 19 Apr 2026 12:57:40 -0600 Nico Pache <npache@redhat.com> wrote:

> The following cleanup reworks all the max_ptes_* handling into helper
> functions. This increases the code readability and will later be used to
> implement the mTHP handling of these variables.
> 
> With these changes we abstract all the madvise_collapse() special casing
> (don't respect the sysctls) away from the functions that utilize them. This
> will be used later in this series to cleanly restrict mTHP collapse behaviors.
> 
> Suggested-by: David Hildenbrand <david@kernel.org>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 114 +++++++++++++++++++++++++++++++++---------------
>  1 file changed, 78 insertions(+), 36 deletions(-)
> 

The old code re-read khugepaged_max_ptes_* on every loop iteration; the new
code snapshots them once per scan call. If userspace writes the sysctl
mid-scan, old behavior reacted within the scan, new behavior uses the value
sampled at entry. This is completely ok IMO, but might be good to call out.

Also, it might be good to write "no functional change intended apart from
the above" in the commit message?

Acked-by: Usama Arif <usama.arif@linux.dev>


^ permalink raw reply

* [PATCH AUTOSEL 6.18] btrfs: tracepoints: get correct superblock from dentry in event btrfs_sync_file()
From: Sasha Levin @ 2026-04-20 13:08 UTC (permalink / raw)
  To: patches, stable
  Cc: Goldwyn Rodrigues, Boris Burkov, Goldwyn Rodrigues, David Sterba,
	Sasha Levin, clm, rostedt, mhiramat, linux-btrfs, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260420131539.986432-1-sashal@kernel.org>

From: Goldwyn Rodrigues <rgoldwyn@suse.de>

[ Upstream commit a85b46db143fda5869e7d8df8f258ccef5fa1719 ]

If overlay is used on top of btrfs, dentry->d_sb translates to overlay's
super block and fsid assignment will lead to a crash.

Use file_inode(file)->i_sb to always get btrfs_sb.

Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---

LLM Generated explanations, may be completely bogus:

Error: Failed to generate final synthesis

 include/trace/events/btrfs.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 125bdc166bfed..0864700f76e0a 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -769,12 +769,15 @@ TRACE_EVENT(btrfs_sync_file,
 	),
 
 	TP_fast_assign(
-		const struct dentry *dentry = file->f_path.dentry;
-		const struct inode *inode = d_inode(dentry);
+		struct dentry *dentry = file_dentry(file);
+		struct inode *inode = file_inode(file);
+		struct dentry *parent = dget_parent(dentry);
+		struct inode *parent_inode = d_inode(parent);
 
-		TP_fast_assign_fsid(btrfs_sb(file->f_path.dentry->d_sb));
+		dput(parent);
+		TP_fast_assign_fsid(btrfs_sb(inode->i_sb));
 		__entry->ino		= btrfs_ino(BTRFS_I(inode));
-		__entry->parent		= btrfs_ino(BTRFS_I(d_inode(dentry->d_parent)));
+		__entry->parent		= btrfs_ino(BTRFS_I(parent_inode));
 		__entry->datasync	= datasync;
 		__entry->root_objectid	= btrfs_root_id(BTRFS_I(inode)->root);
 	),
-- 
2.53.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox