Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [PATCH v10 7/8] selftests/ftrace: Add a testcase for fprobe events on module
From: Masami Hiramatsu (Google) @ 2026-04-20 14:01 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177669363667.132053.12454670015890859277.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Add a testcase for fprobe events on module, which unloads a kernel
module on which fprobe events are probing and ensure the ftrace
hash map is cleared correctly.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v10:
 - Fix module name typo in error case trap.
 Changes in v9:
 - Use "trace-events-sample" instead of "trace_events_sample"
 - Add checking unload module and remove core-kernel event case.
 - Check test module exists when unloading it in EXIT.
 Changes in v8:
 - Newly added.
---
 .../test.d/dynevent/add_remove_fprobe_module.tc    |   87 ++++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_module.tc

diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_module.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_module.tc
new file mode 100644
index 000000000000..2915206777b6
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_module.tc
@@ -0,0 +1,87 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Generic dynamic event - add/remove fprobe events on module
+# requires: dynamic_events "f[:[<group>/][<event>]] <func-name>[%return] [<args>]":README enabled_functions
+
+rmmod trace-events-sample ||:
+if ! modprobe trace-events-sample ; then
+  echo "No trace-events sample module - please make CONFIG_SAMPLE_TRACE_EVENTS=m"
+  exit_unresolved;
+fi
+trap "lsmod | grep -q trace_events_sample && rmmod trace-events-sample" EXIT
+
+echo 0 > events/enable
+echo > dynamic_events
+
+FUNC1='foo_bar*'
+FUNC2='vfs_read'
+
+:;: "Add an event on the test module" ;:
+echo "f:test1 $FUNC1" >> dynamic_events
+echo 1 > events/fprobes/test1/enable
+
+:;: "Ensure it is enabled" ;:
+funcs=`cat enabled_functions | wc -l`
+test $funcs -ne 0
+
+:;: "Check the enabled_functions is cleared on unloading" ;:
+rmmod trace-events-sample
+funcs=`cat enabled_functions | wc -l`
+test $funcs -eq 0
+
+:;: "Check it is kept clean" ;:
+modprobe trace-events-sample
+echo 1 > events/fprobes/test1/enable || echo "OK"
+funcs=`cat enabled_functions | wc -l`
+test $funcs -eq 0
+
+:;: "Add another event not on the test module" ;:
+echo "f:test2 $FUNC2" >> dynamic_events
+echo 1 > events/fprobes/test2/enable
+
+:;: "Ensure it is enabled" ;:
+ofuncs=`cat enabled_functions | wc -l`
+test $ofuncs -ne 0
+
+:;: "Disable and remove the first event"
+echo 0 > events/fprobes/test1/enable
+echo "-:fprobes/test1" >> dynamic_events
+funcs=`cat enabled_functions | wc -l`
+test $ofuncs -eq $funcs
+
+:;: "Disable and remove other events" ;:
+echo 0 > events/fprobes/enable
+echo > dynamic_events
+funcs=`cat enabled_functions | wc -l`
+test $funcs -eq 0
+
+rmmod trace-events-sample
+
+:;: "Add events on kernel and test module" ;:
+modprobe trace-events-sample
+echo "f:test1 $FUNC1" >> dynamic_events
+echo 1 > events/fprobes/test1/enable
+echo "f:test2 $FUNC2" >> dynamic_events
+echo 1 > events/fprobes/test2/enable
+ofuncs=`cat enabled_functions | wc -l`
+test $ofuncs -ne 0
+
+:;: "Unload module (ftrace entry should be removed)" ;:
+rmmod trace-events-sample
+funcs=`cat enabled_functions | wc -l`
+test $funcs -ne 0
+test $ofuncs -ne $funcs
+
+:;: "Disable and remove core-kernel fprobe event" ;:
+echo 0 > events/fprobes/test2/enable
+echo "-:fprobes/test2" >> dynamic_events
+
+:;: "Ensure ftrace is disabled." ;:
+funcs=`cat enabled_functions | wc -l`
+test $funcs -eq 0
+
+echo 0 > events/fprobes/enable
+echo > dynamic_events
+
+trap "" EXIT
+clear_trace


^ permalink raw reply related

* [PATCH v10 8/8] selftests/ftrace: Add a testcase for multiple fprobe events
From: Masami Hiramatsu (Google) @ 2026-04-20 14:01 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177669363667.132053.12454670015890859277.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Add a testcase for multiple fprobe events on the same function
so that it clears ftrace hash map correctly when removing the
events.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 .../test.d/dynevent/add_remove_multiple_fprobe.tc  |   69 ++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/add_remove_multiple_fprobe.tc

diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_multiple_fprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_multiple_fprobe.tc
new file mode 100644
index 000000000000..f2cbf2ffd29b
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_multiple_fprobe.tc
@@ -0,0 +1,69 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Generic dynamic event - add/remove multiple fprobe events on the same function
+# requires: dynamic_events "f[:[<group>/][<event>]] <func-name>[%return] [<args>]":README enabled_functions
+
+echo 0 > events/enable
+echo > dynamic_events
+
+PLACE=vfs_read
+PLACE2=vfs_open
+
+:;: 'Ensure no other ftrace user' ;:
+test `cat enabled_functions | wc -l` -eq 0 || exit_unresolved
+
+:;: 'Test case 1: leave entry event' ;:
+:;: 'Add entry and exit events on the same place' ;:
+echo "f:event1 ${PLACE}" >> dynamic_events
+echo "f:event2 ${PLACE}%return" >> dynamic_events
+
+:;: 'Enable both of them' ;:
+echo 1 > events/fprobes/enable
+test `cat enabled_functions | wc -l` -eq 1
+
+:;: 'Disable and remove exit event' ;:
+echo 0 > events/fprobes/event2/enable
+echo -:event2 >> dynamic_events
+
+:;: 'Disable and remove all events' ;:
+echo 0 > events/fprobes/enable
+echo > dynamic_events
+
+:;: 'Add another event' ;:
+echo "f:event3 ${PLACE2}%return" > dynamic_events
+echo 1 > events/fprobes/enable
+test `cat enabled_functions | wc -l` -eq 1
+
+:;: 'No other ftrace user' ;:
+echo 0 > events/fprobes/enable
+echo > dynamic_events
+test `cat enabled_functions | wc -l` -eq 0
+
+:;: 'Test case 2: leave exit event' ;:
+:;: 'Add entry and exit events on the same place' ;:
+echo "f:event1 ${PLACE}" >> dynamic_events
+echo "f:event2 ${PLACE}%return" >> dynamic_events
+
+:;: 'Enable both of them' ;:
+echo 1 > events/fprobes/enable
+test `cat enabled_functions | wc -l` -eq 1
+
+:;: 'Disable and remove entry event' ;:
+echo 0 > events/fprobes/event1/enable
+echo -:event1 >> dynamic_events
+
+:;: 'Disable and remove all events' ;:
+echo 0 > events/fprobes/enable
+echo > dynamic_events
+
+:;: 'Add another event' ;:
+echo "f:event3 ${PLACE2}" > dynamic_events
+echo 1 > events/fprobes/enable
+test `cat enabled_functions | wc -l` -eq 1
+
+:;: 'No other ftrace user' ;:
+echo 0 > events/fprobes/enable
+echo > dynamic_events
+test `cat enabled_functions | wc -l` -eq 0
+
+clear_trace


^ permalink raw reply related

* Re: [PATCH 7.2 v16 05/13] mm/khugepaged: generalize collapse_huge_page for mTHP collapse
From: Usama Arif @ 2026-04-20 14:20 UTC (permalink / raw)
  To: Nico Pache
  Cc: Usama Arif, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, ljs, mathieu.desnoyers, matthew.brost,
	mhiramat, mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-6-npache@redhat.com>

On Sun, 19 Apr 2026 12:57:42 -0600 Nico Pache <npache@redhat.com> wrote:

> Pass an order and offset to collapse_huge_page to support collapsing anon
> memory to arbitrary orders within a PMD. order indicates what mTHP size we
> are attempting to collapse to, and offset indicates were in the PMD to
> start the collapse attempt.
> 
> For non-PMD collapse we must leave the anon VMA write locked until after
> we collapse the mTHP-- in the PMD case all the pages are isolated, but in
> the mTHP case this is not true, and we must keep the lock to prevent
> access/changes to the page tables. This can happen if the rmap walkers hit
> a pmd_none while the PMD entry is currently unavailable due to being
> temporarily removed during the collapse phase.
> 
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 103 +++++++++++++++++++++++++++---------------------
>  1 file changed, 57 insertions(+), 46 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 283bb63854a5..ff6f9f1883ed 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1198,42 +1198,36 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
>  	return SCAN_SUCCEED;
>  }
>  
> -static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
> -		int referenced, int unmapped, struct collapse_control *cc)
> +static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
> +		int referenced, int unmapped, struct collapse_control *cc,
> +		unsigned int order)
>  {
>  	LIST_HEAD(compound_pagelist);
>  	pmd_t *pmd, _pmd;
> -	pte_t *pte;
> +	pte_t *pte = NULL;
>  	pgtable_t pgtable;
>  	struct folio *folio;
>  	spinlock_t *pmd_ptl, *pte_ptl;
>  	enum scan_result result = SCAN_FAIL;
>  	struct vm_area_struct *vma;
>  	struct mmu_notifier_range range;
> +	bool anon_vma_locked = false;
> +	const unsigned long pmd_addr = start_addr & HPAGE_PMD_MASK;
> +	const unsigned long end_addr = start_addr + (PAGE_SIZE << order);
>  
> -	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> -
> -	/*
> -	 * Before allocating the hugepage, release the mmap_lock read lock.
> -	 * The allocation can take potentially a long time if it involves
> -	 * sync compaction, and we do not need to hold the mmap_lock during
> -	 * that. We will recheck the vma after taking it again in write mode.
> -	 */
> -	mmap_read_unlock(mm);
> -

My understanding now is that the caller will need to drop the mmap_read lock?

This needs explicit documentation at the start of the function IMO. I think
there are callers in later patches and its very easy to miss this.


> -	result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> +	result = alloc_charge_folio(&folio, mm, cc, order);
>  	if (result != SCAN_SUCCEED)
>  		goto out_nolock;
>  
>  	mmap_read_lock(mm);
> -	result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> -					 HPAGE_PMD_ORDER);
> +	result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
> +					 &vma, cc, order);
>  	if (result != SCAN_SUCCEED) {
>  		mmap_read_unlock(mm);
>  		goto out_nolock;
>  	}
>  
> -	result = find_pmd_or_thp_or_none(mm, address, &pmd);
> +	result = find_pmd_or_thp_or_none(mm, pmd_addr, &pmd);
>  	if (result != SCAN_SUCCEED) {
>  		mmap_read_unlock(mm);
>  		goto out_nolock;
> @@ -1245,8 +1239,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  		 * released when it fails. So we jump out_nolock directly in
>  		 * that case.  Continuing to collapse causes inconsistency.
>  		 */
> -		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> -						     referenced, HPAGE_PMD_ORDER);
> +		result = __collapse_huge_page_swapin(mm, vma, start_addr, pmd,
> +						     referenced, order);
>  		if (result != SCAN_SUCCEED)
>  			goto out_nolock;
>  	}
> @@ -1261,20 +1255,21 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  	 * mmap_lock.
>  	 */
>  	mmap_write_lock(mm);
> -	result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> -					 HPAGE_PMD_ORDER);
> +	result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
> +					 &vma, cc, order);
>  	if (result != SCAN_SUCCEED)
>  		goto out_up_write;
>  	/* check if the pmd is still valid */
>  	vma_start_write(vma);
> -	result = check_pmd_still_valid(mm, address, pmd);
> +	result = check_pmd_still_valid(mm, pmd_addr, pmd);
>  	if (result != SCAN_SUCCEED)
>  		goto out_up_write;
>  
>  	anon_vma_lock_write(vma->anon_vma);
> +	anon_vma_locked = true;
>  
> -	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
> -				address + HPAGE_PMD_SIZE);
> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start_addr,
> +				end_addr);
>  	mmu_notifier_invalidate_range_start(&range);
>  
>  	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
> @@ -1286,26 +1281,23 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  	 * Parallel GUP-fast is fine since GUP-fast will back off when
>  	 * it detects PMD is changed.
>  	 */
> -	_pmd = pmdp_collapse_flush(vma, address, pmd);
> +	_pmd = pmdp_collapse_flush(vma, pmd_addr, pmd);


For an mTHP collapse covering, say, 64KiB of a 2MiB PMD, the patch still
flushes the entire PMD via pmdp_collapse_flush and tlb_remove_table_sync_one.
That triggers cross-CPU TLB shootdowns for ~1.94MiB of unrelated mappings on
every successful sub-PMD collapse. Probably acceptable as a first cut?


>  	spin_unlock(pmd_ptl);
>  	mmu_notifier_invalidate_range_end(&range);
>  	tlb_remove_table_sync_one();
>  
> -	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> +	pte = pte_offset_map_lock(mm, &_pmd, start_addr, &pte_ptl);
>  	if (pte) {
> -		result = __collapse_huge_page_isolate(vma, address, pte, cc,
> -						      HPAGE_PMD_ORDER,
> -						      &compound_pagelist);
> +		result = __collapse_huge_page_isolate(vma, start_addr, pte, cc,
> +						      order, &compound_pagelist);
>  		spin_unlock(pte_ptl);
>  	} else {
>  		result = SCAN_NO_PTE_TABLE;
>  	}
>  
>  	if (unlikely(result != SCAN_SUCCEED)) {
> -		if (pte)
> -			pte_unmap(pte);
>  		spin_lock(pmd_ptl);
> -		BUG_ON(!pmd_none(*pmd));
> +		WARN_ON_ONCE(!pmd_none(*pmd));

Why was this turned into WARN_ON_ONCE? Would be good to add to commit message
what the reason is if it has been discussed earlier.

The next line writes a PMD entry over an existing one — that leaks the
previous page table or PMD-mapped folio and can corrupt VA mappings. BUG_ON
failed loudly and safely; WARN_ON_ONCE continues into the corruption and falls
silent after the first hit.


>  		/*
>  		 * We can only use set_pmd_at when establishing
>  		 * hugepmds and never for establishing regular pmds that
> @@ -1313,21 +1305,24 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  		 */
>  		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
>  		spin_unlock(pmd_ptl);
> -		anon_vma_unlock_write(vma->anon_vma);
>  		goto out_up_write;
>  	}
>  
>  	/*
> -	 * All pages are isolated and locked so anon_vma rmap
> -	 * can't run anymore.
> +	 * For PMD collapse all pages are isolated and locked so anon_vma
> +	 * rmap can't run anymore. For mTHP collapse the PMD entry has been
> +	 * removed and not all pages are isolated and locked, so we must hold
> +	 * the lock to prevent neighboring folios from attempting to access
> +	 * this PMD until its reinstalled.
>  	 */
> -	anon_vma_unlock_write(vma->anon_vma);
> +	if (is_pmd_order(order)) {
> +		anon_vma_unlock_write(vma->anon_vma);
> +		anon_vma_locked = false;
> +	}
>  
>  	result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> -					   vma, address, pte_ptl,
> -					   HPAGE_PMD_ORDER,
> -					   &compound_pagelist);
> -	pte_unmap(pte);
> +					   vma, start_addr, pte_ptl,
> +					   order, &compound_pagelist);
>  	if (unlikely(result != SCAN_SUCCEED))
>  		goto out_up_write;
>  
> @@ -1337,18 +1332,27 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  	 * write.
>  	 */
>  	__folio_mark_uptodate(folio);
> -	pgtable = pmd_pgtable(_pmd);
> -
>  	spin_lock(pmd_ptl);
> -	BUG_ON(!pmd_none(*pmd));
> -	pgtable_trans_huge_deposit(mm, pmd, pgtable);
> -	map_anon_folio_pmd_nopf(folio, pmd, vma, address);
> +	WARN_ON_ONCE(!pmd_none(*pmd));
> +	if (is_pmd_order(order)) { /* PMD collapse */
> +		pgtable = pmd_pgtable(_pmd);
> +		pgtable_trans_huge_deposit(mm, pmd, pgtable);
> +		map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
> +	} else { /* mTHP collapse */
> +		map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /*uffd_wp=*/ false);

map_anon_folio_pte_nopf calls set_ptes and modifies pagetable, while holding
pmd_ptl only. It should be safe as we expect pmd_none. But I think you should
put a comment about this?


> +		smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
> +		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> +	}
>  	spin_unlock(pmd_ptl);
>  
>  	folio = NULL;
>  
>  	result = SCAN_SUCCEED;
>  out_up_write:
> +	if (anon_vma_locked)
> +		anon_vma_unlock_write(vma->anon_vma);
> +	if (pte)
> +		pte_unmap(pte);
>  	mmap_write_unlock(mm);
>  out_nolock:
>  	if (folio)
> @@ -1525,8 +1529,15 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  out_unmap:
>  	pte_unmap_unlock(pte, ptl);
>  	if (result == SCAN_SUCCEED) {
> +		/*
> +		 * Before allocating the hugepage, release the mmap_lock read lock.
> +		 * The allocation can take potentially a long time if it involves
> +		 * sync compaction, and we do not need to hold the mmap_lock during
> +		 * that. We will recheck the vma after taking it again in write mode.
> +		 */
> +		mmap_read_unlock(mm);
>  		result = collapse_huge_page(mm, start_addr, referenced,
> -					    unmapped, cc);
> +					    unmapped, cc, HPAGE_PMD_ORDER);
>  		/* collapse_huge_page will return with the mmap_lock released */
>  		*lock_dropped = true;
>  	}
> -- 
> 2.53.0
> 
> 

^ permalink raw reply

* Re: [PATCH 7.2 v16 06/13] mm/khugepaged: skip collapsing mTHP to smaller orders
From: Usama Arif @ 2026-04-20 15:36 UTC (permalink / raw)
  To: Nico Pache
  Cc: Usama Arif, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, ljs, mathieu.desnoyers, matthew.brost,
	mhiramat, mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-7-npache@redhat.com>

On Sun, 19 Apr 2026 12:57:43 -0600 Nico Pache <npache@redhat.com> wrote:

> khugepaged may try to collapse a mTHP to a smaller mTHP, resulting in
> some pages being unmapped. Skip these cases until we have a way to check
> if its ok to collapse to a smaller mTHP size (like in the case of a
> partially mapped folio). This check is also not done during the scan phase
> as the current collapse order is unknown at that time.
> 
> This patch is inspired by Dev Jain's work on khugepaged mTHP support [1].
> 
> [1] https://lore.kernel.org/lkml/20241216165105.56185-11-dev.jain@arm.com/
> 
> Reviewed-by: Lorenzo Stoakes <ljs@kernel.org> 
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
> 

Acked-by: Usama Arif <usama.arif@linux.dev>
 

^ permalink raw reply

* Re: [PATCH] tracing: branch: Fix inverted check on stat tracer registration
From: Masami Hiramatsu @ 2026-04-20 23:57 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Steven Rostedt, Mathieu Desnoyers, Ingo Molnar,
	Frederic Weisbecker, Steven Rostedt, linux-kernel,
	linux-trace-kernel, paulmck, kernel-team
In-Reply-To: <20260420-tracing-v1-1-d8f4cd0d6af1@debian.org>

On Mon, 20 Apr 2026 06:25:09 -0700
Breno Leitao <leitao@debian.org> wrote:

> init_annotated_branch_stats() and all_annotated_branch_stats() check the
> return value of register_stat_tracer() with "if (!ret)", but
> register_stat_tracer() returns 0 on success and a negative errno on
> failure. The inverted check causes the warning to be printed on every
> successful registration, e.g.:
> 
>   Warning: could not register annotated branches stats
> 
> while leaving real failures silent. The initcall also returned a
> hard-coded 1 instead of the actual error.
> 
> Invert the check and propagate ret so that the warning fires on real
> errors and the initcall reports the correct status.
> 

Good catch!

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thanks,

> Fixes: 002bb86d8d42 ("tracing/ftrace: separate events tracing and stats tracing engine")
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>  kernel/trace/trace_branch.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
> index 6809b370e991d..d1564db95a8f5 100644
> --- a/kernel/trace/trace_branch.c
> +++ b/kernel/trace/trace_branch.c
> @@ -373,10 +373,10 @@ __init static int init_annotated_branch_stats(void)
>  	int ret;
>  
>  	ret = register_stat_tracer(&annotated_branch_stats);
> -	if (!ret) {
> +	if (ret) {
>  		printk(KERN_WARNING "Warning: could not register "
>  				    "annotated branches stats\n");
> -		return 1;
> +		return ret;
>  	}
>  	return 0;
>  }
> @@ -438,10 +438,10 @@ __init static int all_annotated_branch_stats(void)
>  	int ret;
>  
>  	ret = register_stat_tracer(&all_branch_stats);
> -	if (!ret) {
> +	if (ret) {
>  		printk(KERN_WARNING "Warning: could not register "
>  				    "all branches stats\n");
> -		return 1;
> +		return ret;
>  	}
>  	return 0;
>  }
> 
> ---
> base-commit: c7275b05bc428c7373d97aa2da02d3a7fa6b9f66
> change-id: 20260420-tracing-3f1367ee4b93
> 
> Best regards,
> --  
> Breno Leitao <leitao@debian.org>
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH 0/3] mm: split the file's i_mmap tree for NUMA
From: Huang Shijie @ 2026-04-21  3:06 UTC (permalink / raw)
  To: Pedro Falcato
  Cc: Mateusz Guzik, akpm, viro, brauner, linux-mm, linux-kernel,
	linux-arm-kernel, linux-fsdevel, muchun.song, osalvador,
	linux-trace-kernel, linux-perf-users, linux-parisc, nvdimm,
	zhongyuan, fangbaoshun, yingzhiwei
In-Reply-To: <hshxzebq5y4gavo7mbrgn7qitz5j5wyun73wy7ooiiehzzpcui@hlknbp34sgja>

On Mon, Apr 20, 2026 at 02:48:49PM +0100, Pedro Falcato wrote:
> BTW you're missing _a lot_ of CC's here, including the whole of mm/rmap.c
> maintainership.

Thanks, my fault.

> 
> On Mon, Apr 20, 2026 at 10:10:19AM +0800, Huang Shijie wrote:
> > On Mon, Apr 13, 2026 at 05:33:21PM +0200, Mateusz Guzik wrote:
> > > On Mon, Apr 13, 2026 at 02:20:39PM +0800, Huang Shijie wrote:
> > > >   In NUMA, there are maybe many NUMA nodes and many CPUs.
> > > > For example, a Hygon's server has 12 NUMA nodes, and 384 CPUs.
> > > > In the UnixBench tests, there is a test "execl" which tests
> > > > the execve system call.
> > > > 
> > > >   When we test our server with "./Run -c 384 execl",
> > > > the test result is not good enough. The i_mmap locks contended heavily on
> > > > "libc.so" and "ld.so". For example, the i_mmap tree for "libc.so" can have 
> > > > over 6000 VMAs, all the VMAs can be in different NUMA mode.
> > > > The insert/remove operations do not run quickly enough.
> > > > 
> > > > patch 1 & patch 2 are try to hide the direct access of i_mmap.
> > > > patch 3 splits the i_mmap into sibling trees, and we can get better 
> > > > performance with this patch set:
> > > >     we can get 77% performance improvement(10 times average)
> > > > 
> > > 
> > > To my reading you kept the lock as-is and only distributed the protected
> > > state.
> > > 
> > > While I don't doubt the improvement, I'm confident should you take a
> > > look at the profile you are going to find this still does not scale with
> > > rwsem being one of the problems (there are other global locks, some of
> > > which have experimental patches for).
> > > 
> > > Apart from that this does nothing to help high core systems which are
> > > all one node, which imo puts another question mark on this specific
> > > proposal.
> > > 
> > > Of course one may question whether a RB tree is the right choice here,
> > > it may be the lock-protected cost can go way down with merely a better
> > > data structure.
> > > 
> > > Regardless of that, for actual scalability, there will be no way around
> > > decentralazing locking around this and partitioning per some core count
> > > (not just by numa awareness).
> > > 
> > > Decentralizing locking is definitely possible, but I have not looked
> > > into specifics of how problematic it is. Best case scenario it will
> > > merely with separate locks. Worst case scenario something needs a fully
> > > stabilized state for traversal, in that case another rw lock can be
> > > slapped around this, creating locking order read lock -> per-subset
> > > write lock -- this will suffer scalability due to the read locking, but
> > > it will still scale drastically better as apart from that there will be
> > > no serialization. In this setting the problematic consumer will write
> > > lock the new thing to stabilize the state.
> > > 
> > I thought over again.
> > I can change this patch set to support the non-NUMA case by:
> >   1.) Still use one rw lock.
> 
> No. This doesn't help anything.
> 
> >   2.) For NUMA, keep the patch set as it is.
> 
> Please no. No NUMA vs non-NUMA case.
> 
> >   3.) For non-NUMA case, split the i_mmap tree to several subtrees.
> >       For example, if a machine has 192 CPUs, split the 32 CPUs as a tree.
> 
> If lock contention is the problem, I don't see how splitting the tree helps,
> unless it helps reduce lock hold time in a way that randomly helps your workload.
> But that's entirely random.
We actually face two issues:
   1.) the lock contention
   2.) the lock hold time.

IMHO, if we can reduce the lock hold time, we can ease the lock contention too.
So this patch set is to reduce the lock hold time, which is much helpful in our
NUMA server in UnixBench test.

If we split the lock into small locks, we can also benefit from it. 
If you or Mateusz create the patch in future, I can test it on our server.
I wonder if it can give us better performance then current patch set.

Thanks
Huang Shijie


^ permalink raw reply

* Re: [PATCH v13 04/18] unwind_user/sframe: Add support for reading .sframe contents
From: Jens Remus @ 2026-04-21  8:27 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel, bpf, x86, linux-mm,
	Steven Rostedt
  Cc: Josh Poimboeuf, Masami Hiramatsu, Mathieu Desnoyers,
	Peter Zijlstra, Ingo Molnar, Jiri Olsa, Arnaldo Carvalho de Melo,
	Namhyung Kim, Thomas Gleixner, Andrii Nakryiko, Jose E. Marchesi,
	Beau Belgrave, Linus Torvalds, Andrew Morton, Florian Weimer,
	Kees Cook, Carlos O'Donell, Sam James, Dylan Hatch,
	Borislav Petkov, Dave Hansen, David Hildenbrand, H. Peter Anvin,
	Liam R. Howlett, Lorenzo Stoakes, Michal Hocko, Mike Rapoport,
	Suren Baghdasaryan, Vlastimil Babka, Heiko Carstens,
	Vasily Gorbik, Steven Rostedt (Google), Indu Bhagat
In-Reply-To: <20260127150554.2760964-5-jremus@linux.ibm.com>

On 1/27/2026 4:05 PM, Jens Remus wrote:
> From: Josh Poimboeuf <jpoimboe@kernel.org>
> 
> In preparation for using sframe to unwind user space stacks, add an
> sframe_find() interface for finding the sframe information associated
> with a given text address.
> 
> For performance, use user_read_access_begin() and the corresponding
> unsafe_*() accessors.  Note that use of pr_debug() in uaccess-enabled
> regions would break noinstr validation, so there aren't any debug
> messages yet.  That will be added in a subsequent commit.
> 
> Link: https://lore.kernel.org/all/77c0d1ec143bf2a53d66c4ecb190e7e0a576fbfd.1737511963.git.jpoimboe@kernel.org/
> Link: https://lore.kernel.org/all/b35ca3a3-8de5-4d32-8d30-d4e562f6b0de@linux.ibm.com/
> 
> [ Jens Remus: Add initial support for SFrame V3 (limited to regular
> FDEs).  Add support for PC-relative FDE function start offset.  Simplify
> logic by using an internal FDE representation.  Rename struct sframe_fre
> to sframe_fre_internal to align with struct sframe_fde_internal.
> Cleanup includes.  Fix checkpatch errors "spaces required around that
> ':'". ]

> diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c

> +static __always_inline int __read_fde(struct sframe_section *sec,
> +				      unsigned int fde_num,
> +				      struct sframe_fde_internal *fde)
> +{
> +	unsigned long fde_addr, fda_addr, func_addr;
> +	struct sframe_fde_v3 _fde;
> +	struct sframe_fda_v3 _fda;
> +
> +	fde_addr = sec->fdes_start + (fde_num * sizeof(struct sframe_fde_v3));
> +	unsafe_copy_from_user(&_fde, (void __user *)fde_addr,
> +			      sizeof(struct sframe_fde_v3), Efault);
> +
> +	func_addr = fde_addr + _fde.func_start_off;
> +	if (func_addr < sec->text_start || func_addr > sec->text_end)

sec->text_end points after the last byte of the .text section.
Therefore the check for whether FDE function start address is within the
section must be:

	if (func_addr < sec->text_start || func_addr >= sec->text_end)

> +		return -EINVAL;

Regards,
Jens
-- 
Jens Remus
Linux on Z Development (D3303)
jremus@de.ibm.com / jremus@linux.ibm.com

IBM Deutschland Research & Development GmbH; Vorsitzender des Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der Gesellschaft: Ehningen; Registergericht: Amtsgericht Stuttgart, HRB 243294
IBM Data Privacy Statement: https://www.ibm.com/privacy/


^ permalink raw reply

* Re: [PATCHv5 bpf-next 21/28] selftests/bpf: Add tracing multi skel/pattern/ids attach tests
From: Jiri Olsa @ 2026-04-21  8:54 UTC (permalink / raw)
  To: bot+bpf-ci
  Cc: ast, daniel, andrii, bpf, linux-trace-kernel, kafai, eddyz87,
	songliubraving, yhs, menglong8.dong, rostedt, martin.lau,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <1ab81cf709ce7d536a898695ffad30c15c8a6a7bf48dcb7271239952dd270315@mail.kernel.org>

On Fri, Apr 17, 2026 at 08:10:22PM +0000, bot+bpf-ci@kernel.org wrote:
> > diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
> 
> [ ... ]
> 
> > @@ -0,0 +1,256 @@
> > +static __u32 *get_ids(const char * const funcs[], int funcs_cnt, const char *mod)
> > +{
> > +	struct btf *btf, *vmlinux_btf = NULL;
> > +	__u32 nr, type_id, cnt = 0;
> > +	void *root = NULL;
> > +	__u32 *ids = NULL;
> > +	int i, err = 0;
> > +
> > +	btf = btf__load_vmlinux_btf();
> > +	if (!ASSERT_OK_PTR(btf, "btf__load_vmlinux_btf"))
> > +		return NULL;
> > +
> > +	if (mod) {
> > +		vmlinux_btf = btf;
> > +		btf = btf__load_module_btf(mod, vmlinux_btf);
> > +		if (!ASSERT_OK_PTR(btf, "btf__load_module_btf")) {
> > +			btf__free(vmlinux_btf);
> > +			goto out;
> > +		}
> > +	}
> 
> [ ... ]
> 
> > +out:
> > +	tdestroy(root, tdestroy_free_nop);
> > +	btf__free(vmlinux_btf);
> > +	btf__free(btf);
> > +	return ids;
> > +}
> 
> Does this code double-free vmlinux_btf when btf__load_module_btf() fails?
> If btf__load_module_btf(mod, vmlinux_btf) fails, the error path at
> ASSERT_OK_PTR calls btf__free(vmlinux_btf) and then jumps to the out:
> label, which calls btf__free(vmlinux_btf) again on the same pointer.
> 
> The first free happens here:
> 
> 	if (mod) {
> 		vmlinux_btf = btf;
> 		btf = btf__load_module_btf(mod, vmlinux_btf);
> 		if (!ASSERT_OK_PTR(btf, "btf__load_module_btf")) {
> 			btf__free(vmlinux_btf);  <-- FIRST FREE
> 			goto out;
> 		}
> 	}
> 
> Then at the out: label:
> 
> out:
> 	tdestroy(root, tdestroy_free_nop);
> 	btf__free(vmlinux_btf);  <-- SECOND FREE
> 	btf__free(btf);
> 
> The btf__free() implementation in tools/lib/bpf/btf.c checks only
> IS_ERR_OR_NULL(btf) and does not guard against already-freed memory,
> so this would trigger a double-free that glibc would detect and abort.
> 
> In the current commit, get_ids() is only called with mod=NULL from
> test_link_api_ids(), so this path is latent. However, a subsequent commit
> (4dda98b9e4d0, "selftests/bpf: Add tracing multi skel/pattern/ids module
> attach tests") adds a caller get_ids(bpf_testmod_fentry_test, cnt,
> "bpf_testmod") that activates the buggy path.

yep, double free, will fix

jirka

^ permalink raw reply

* Re: [PATCHv5 bpf-next 02/28] ftrace: Add ftrace_hash_remove function
From: Jiri Olsa @ 2026-04-21  8:54 UTC (permalink / raw)
  To: bot+bpf-ci
  Cc: ast, daniel, andrii, bpf, linux-trace-kernel, kafai, eddyz87,
	songliubraving, yhs, menglong8.dong, rostedt, martin.lau,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <14d32b2f6aacddfce59d73dc5e1c0ea423f84e373ca130f6830c3e26154e70b3@mail.kernel.org>

On Sat, Apr 18, 2026 at 06:10:52AM +0000, bot+bpf-ci@kernel.org wrote:
> > diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> > --- a/include/linux/ftrace.h
> > +++ b/include/linux/ftrace.h
> > @@ -416,6 +416,7 @@ void free_ftrace_hash(struct ftrace_hash *hash);
> >  struct ftrace_func_entry *add_ftrace_hash_entry_direct(struct ftrace_hash *hash,
> >  						       unsigned long ip, unsigned long direct);
> >  unsigned long ftrace_hash_count(struct ftrace_hash *hash);
> > +void ftrace_hash_remove(struct ftrace_hash *hash);
> >
> >  /* The hash used to know what functions callbacks trace */
> >  struct ftrace_ops_hash {
> > diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
> > --- a/kernel/trace/ftrace.c
> > +++ b/kernel/trace/ftrace.c
> > @@ -1249,6 +1249,25 @@ remove_hash_entry(struct ftrace_hash *hash,
> >  	hash->count--;
> >  }
> >
> > +void ftrace_hash_remove(struct ftrace_hash *hash)
> > +{
> > +	struct hlist_head *hhd;
> > +	struct hlist_node *tn;
> > +	struct ftrace_func_entry *entry;
> > +	int size = 1 << hash->size_bits;
>                           ^^^^^^^^^^^^^^^^
> 
> Can this dereference a NULL pointer? The code computes the size using
> hash->size_bits before checking if hash is NULL:
> 
>     int size = 1 << hash->size_bits;
>     ...
>     if (!hash || !hash->count)
>         return;

yep, we should deref after the check, will fix

jirka

^ permalink raw reply

* Re: [PATCHv5 bpf-next 04/28] bpf: Use mutex lock pool for bpf trampolines
From: Jiri Olsa @ 2026-04-21  8:54 UTC (permalink / raw)
  To: bot+bpf-ci
  Cc: ast, daniel, andrii, bpf, linux-trace-kernel, kafai, eddyz87,
	songliubraving, yhs, menglong8.dong, rostedt, martin.lau,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <53b3e6cb10e12555d3b1b321eeb4e04a447c707c387568d1e9ef759fb6459ef4@mail.kernel.org>

On Fri, Apr 17, 2026 at 08:10:24PM +0000, bot+bpf-ci@kernel.org wrote:
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> 
> [ ... ]
> 
> > diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
> > --- a/kernel/bpf/trampoline.c
> > +++ b/kernel/bpf/trampoline.c
> > @@ -30,6 +30,34 @@ #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)
> >  /* serializes access to trampoline tables */
> >  static DEFINE_MUTEX(trampoline_mutex);
> >
> > +/*
> > + * We keep 32 trampoline locks (5 bits) in the pool, because there is
> > + * 48 (MAX_LOCK_DEPTH) locks limit allowed to be simultaneously held
> > + * by task. Each lock has its own lockdep key to keep it simple.
> > + */
> 
> The comment explains the 32 count (MAX_LOCK_DEPTH limit), but should it
> also explain why each lock has its own lock_class_key?
> 
> Without that explanation, it is not obvious that distinct keys are required
> to avoid lockdep "recursive locking" warnings when trampoline_lock_all()
> acquires all 32 pool mutexes simultaneously.
> 
> This was raised by bot+bpf-ci@kernel.org in v3 review:
> https://lore.kernel.org/bpf/31ae46274a3157f2b9840e1a09b2698d1ec0cfd461737ff460c2d3349a9f0054@mail.kernel.org/
> 
> The author acknowledged it with "will add", but the current comment only
> says "to keep it simple" without addressing the recursive locking concern.

will add more details

jirka

> 
> > +#define TRAMPOLINE_LOCKS_BITS 5
> > +#define TRAMPOLINE_LOCKS_TABLE_SIZE (1 << TRAMPOLINE_LOCKS_BITS)
> > +
> > +static struct {
> > +	struct mutex mutex;
> > +	struct lock_class_key key;
> > +} trampoline_locks[TRAMPOLINE_LOCKS_TABLE_SIZE];
> 
> [ ... ]
> 
> 
> ---
> AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
> See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
> 
> CI run summary: https://github.com/kernel-patches/bpf/actions/runs/24583317711


^ permalink raw reply

* Re: [PATCHv5 bpf-next 06/28] bpf: Move trampoline image setup into bpf_trampoline_ops callbacks
From: Jiri Olsa @ 2026-04-21  8:55 UTC (permalink / raw)
  To: bot+bpf-ci
  Cc: ast, daniel, andrii, bpf, linux-trace-kernel, kafai, eddyz87,
	songliubraving, yhs, menglong8.dong, rostedt, martin.lau,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <8a315de2e20505875006c4e32af33c0838000a79484f7a2e44b1081187a01379@mail.kernel.org>

On Fri, Apr 17, 2026 at 08:10:25PM +0000, bot+bpf-ci@kernel.org wrote:
> > There's slight functional change for the unregister path, where we
> > currently free the image unconditionally even if the detach fails.
> > The new code keeps the image in place, possibly preventing the crash.
> 
> This appears to be fixing a bug where the image was freed even when detach
> failed. The bug was introduced in commit e21aa341785c ("bpf: Fix fexit
> trampoline.") which unconditionally freed the trampoline image.
> 
> Should this commit include a Fixes tag? The suggested tag would be:
> 
> Fixes: e21aa341785c ("bpf: Fix fexit trampoline.")

right, I think I'll send separate fix for current code,
to have this fixed earlier

jirka

> 
> 
> ---
> AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
> See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
> 
> CI run summary: https://github.com/kernel-patches/bpf/actions/runs/24583317711


^ permalink raw reply

* Re: [PATCHv5 bpf-next 08/28] bpf: Add struct bpf_tramp_node object
From: Jiri Olsa @ 2026-04-21  8:55 UTC (permalink / raw)
  To: bot+bpf-ci
  Cc: ast, daniel, andrii, hengqi.chen, bpf, linux-trace-kernel, kafai,
	eddyz87, songliubraving, yhs, menglong8.dong, rostedt, martin.lau,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <fb38d0f3976f2225ae9781d2292bb79f4d787183360f050c08de3b4490790974@mail.kernel.org>

On Sat, Apr 18, 2026 at 06:10:55AM +0000, bot+bpf-ci@kernel.org wrote:
> > diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
> > index 50103b3794fb..b97d9f68fc40 100644
> > --- a/arch/powerpc/net/bpf_jit_comp.c
> > +++ b/arch/powerpc/net/bpf_jit_comp.c
> 
> [ ... ]
> 
> > @@ -827,14 +827,14 @@ static void bpf_trampoline_restore_args_stack(u32 *image, struct codegen_context
> >  static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_image,
> >  					 void *rw_image_end, void *ro_image,
> >  					 const struct btf_func_model *m, u32 flags,
> > -					 struct bpf_tramp_links *tlinks,
> > +					 struct bpf_tramp_nodes *tnodes,
> >  					 void *func_addr)
> >  {
> >  	int regs_off, func_meta_off, ip_off, run_ctx_off, retval_off;
> >  	int nvr_off, alt_lr_off, r4_off = 0;
> > -	struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
> > -	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
> > -	struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
> > +	struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN];
> > +	struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY];
> > +	struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT];
> >  	int i, ret, nr_regs, retaddr_off, bpf_frame_size = 0;
> >  	struct codegen_context codegen_ctx, *ctx;
> >  	int cookie_off, cookie_cnt, cookie_ctx_off;
> > -	int fsession_cnt = bpf_fsession_cnt(tlinks);
> > +	int fsession_cnt = bpf_fsession_cnt(tnodes);
>                                             ^^^^^
> 
> This line wasn't actually updated in the diff. The parameter was renamed from
> tlinks to tnodes, but this call site still references the old name. Does this
> compile?
> 
> The same issue appears later in the function:
> 
> >  	cookie_off = bpf_frame_size;
> > -	cookie_cnt = bpf_fsession_cookie_cnt(tlinks);
> > +	cookie_cnt = bpf_fsession_cookie_cnt(tnodes);

ugh, it clashed with latest update for fsession,
and I skip the build test for some reason, will fix

jirka

^ permalink raw reply

* Re: [PATCHv5 bpf-next 10/28] bpf: Add multi tracing attach types
From: Jiri Olsa @ 2026-04-21  8:55 UTC (permalink / raw)
  To: bot+bpf-ci
  Cc: ast, daniel, andrii, bpf, linux-trace-kernel, kafai, eddyz87,
	songliubraving, yhs, menglong8.dong, rostedt, martin.lau,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <1ef3a598cb11de0fac8dfe1ac94287d3588ca36257f27b0a2407e85f5037667a@mail.kernel.org>

On Fri, Apr 17, 2026 at 08:22:39PM +0000, bot+bpf-ci@kernel.org wrote:
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 552bc5d9afbd..5f814e7101cf 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -1154,6 +1154,8 @@ enum bpf_attach_type {
> >  	BPF_TRACE_KPROBE_SESSION,
> >  	BPF_TRACE_UPROBE_SESSION,
> >  	BPF_TRACE_FSESSION,
> > +	BPF_TRACE_FENTRY_MULTI,
> > +	BPF_TRACE_FEXIT_MULTI,
> >  	__MAX_BPF_ATTACH_TYPE
> >  };
> 
> In the RFC version posted on February 4, 2026, Leon Hwang asked whether
> BPF_TRACE_FSESSION_MULTI should be added alongside BPF_TRACE_FENTRY_MULTI
> and BPF_TRACE_FEXIT_MULTI for consistency, since BPF_TRACE_FSESSION exists
> and is handled similarly to FENTRY/FEXIT.
> 
> Reference:
> https://lore.kernel.org/bpf/3015991d-2e9b-4bf4-9c17-26e00b5d5926@linux.dev/
> 
> You acknowledged this with "good catch, will add it" in the same thread:
> https://lore.kernel.org/bpf/aYM-jeV50WaVik1b@krava/
> 
> However, BPF_TRACE_FSESSION_MULTI was not added in any subsequent version
> through v5. Should BPF_TRACE_FSESSION_MULTI be included here for API
> consistency?
>

I think the comment was to introduce the support for fsession,
not to add it to this specific patch

and BPF_TRACE_FSESSION_MULTI is introduced in:
  bpf: Add support for tracing_multi link session

jirka

> 
> ---
> AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
> See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
> 
> CI run summary: https://github.com/kernel-patches/bpf/actions/runs/24583317711


^ permalink raw reply

* Re: [PATCHv5 bpf-next 10/28] bpf: Add multi tracing attach types
From: Jiri Olsa @ 2026-04-21  8:56 UTC (permalink / raw)
  To: bot+bpf-ci
  Cc: ast, daniel, andrii, bpf, linux-trace-kernel, kafai, eddyz87,
	songliubraving, yhs, menglong8.dong, rostedt, martin.lau,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <23dc9c8f0305da291027f64f583beda22a65d77849bc00c08eb88f32e4651d95@mail.kernel.org>

On Sat, Apr 18, 2026 at 06:49:09AM +0000, bot+bpf-ci@kernel.org wrote:
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -1154,6 +1154,8 @@ enum bpf_attach_type {
> >  	BPF_TRACE_KPROBE_SESSION,
> >  	BPF_TRACE_UPROBE_SESSION,
> >  	BPF_TRACE_FSESSION,
> > +	BPF_TRACE_FENTRY_MULTI,
> > +	BPF_TRACE_FEXIT_MULTI,
> >  	__MAX_BPF_ATTACH_TYPE
> >  };
> 
> In RFC v2 (Feb 4, 2026), Leon Hwang asked whether BPF_TRACE_FSESSION_MULTI
> should be added alongside BPF_TRACE_FENTRY_MULTI and BPF_TRACE_FEXIT_MULTI
> for API consistency, since BPF_TRACE_FSESSION exists and is handled
> similarly to FENTRY/FEXIT. You acknowledged this with 'good catch, will add
> it' but BPF_TRACE_FSESSION_MULTI was never added in any subsequent version
> (v1, v2, v3, v4, or v5).
> 
> Is this attach type intentionally omitted, or should it be included to
> complete the multi tracing API?

ansrewed in the other reply

> 
> Reference: https://lore.kernel.org/bpf/3015991d-2e9b-4bf4-9c17-26e00b5d5926@linux.dev/
> 
> [ ... ]
> 
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> 
> [ ... ]
> 
> > @@ -19368,7 +19379,17 @@
> >  		if (ret < 0)
> >  			return ret;
> >
> > -		if (tgt_prog) {
> > +		/* *.multi programs don't need an address during program
> > +		 * verification, we just take the module ref if needed.
> > +		 */
> > +		if (is_tracing_multi_id(prog, btf_id)) {
> > +			if (btf_is_module(btf)) {
> > +				mod = btf_try_get_module(btf);
> > +				if (!mod)
> > +					return -ENOENT;
> > +			}
> > +			addr = 0;
> > +		} else if (tgt_prog) {
> 
> This isn't a bug, but the new multi-line comment here uses the old kernel
> comment style with text on the same line as the opening '/*'. The BPF
> subsystem guide mandates the modern multi-line style with '/*' on its own
> line for new code under kernel/bpf/, even if surrounding code still uses
> the old style.
> 
> Should the comment be reformatted as:
> 
> 	/*
> 	 * *.multi programs don't need an address during program
> 	 * verification, we just take the module ref if needed.
> 	 */

sure

jirka

> 
> > @@ -19396,6 +19417,12 @@
> >  			ret = -EINVAL;
> >  			switch (prog->type) {
> >  			case BPF_PROG_TYPE_TRACING:
> > +				/* *.multi sleepable programs will pass initial sleepable check,
> > +				 * the actual attached btf ids are checked later during the link
> > +				 * attachment.
> > +				 */
> > +				if (is_tracing_multi_id(prog, btf_id))
> > +					ret = 0;
> 
> The same comment style issue appears here. Should this also use the modern
> style with '/*' on its own line?
> 
> Note that another comment you added in check_attach_btf_id uses the correct
> modern style, making this inconsistency clearly fixable.
> 
> 
> ---
> AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
> See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
> 
> CI run summary: https://github.com/kernel-patches/bpf/actions/runs/24598000047


^ permalink raw reply

* Re: [PATCHv5 bpf-next 12/28] bpf: Add bpf_trampoline_multi_attach/detach functions
From: Jiri Olsa @ 2026-04-21  8:56 UTC (permalink / raw)
  To: bot+bpf-ci
  Cc: ast, daniel, andrii, bpf, linux-trace-kernel, kafai, eddyz87,
	songliubraving, yhs, menglong8.dong, rostedt, martin.lau,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <11459a06ee7d070bdbdc3699d59adcdca12d5178eaa6ebd15286e0f51cc27141@mail.kernel.org>

On Fri, Apr 17, 2026 at 08:22:36PM +0000, bot+bpf-ci@kernel.org wrote:

SNIP

> > +	if (ftrace_hash_count(data->modify)) {
> > +		err = update_ftrace_direct_mod(&direct_ops, data->modify, true);
> > +		if (err) {
> > +			WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->reg));
> > +			goto rollback_unlink;
> > +		}
> > +	}
> 
> Does update_ftrace_direct_del need a guard here?
> 
> When update_ftrace_direct_mod fails, the recovery code unconditionally
> calls update_ftrace_direct_del(&direct_ops, data->reg). However,
> update_ftrace_direct_add is itself only invoked when data->reg is
> non-empty:
> 
>     if (ftrace_hash_count(data->reg)) {
>         err = update_ftrace_direct_add(&direct_ops, data->reg);
>         ...
>     }
> 
> If all trampolines being multi-attached already had cur_image != NULL,
> nothing is ever added to data->reg, and the add call is skipped. Yet
> the recovery path still calls update_ftrace_direct_del with the empty
> data->reg.
> 
> update_ftrace_direct_del() at kernel/trace/ftrace.c:6511 does:
> 
>     if (!hash_count(hash))
>         return -EINVAL;
> 
> so it returns -EINVAL, and WARN_ON_ONCE fires. No further damage is
> done since the rollback_unlink path still unwinds correctly, but this
> is a spurious warning that becomes visible whenever
> update_ftrace_direct_mod fails and data->reg happens to be empty.
> 
> The undo should match the guard used for the add:
> 
>     if (err) {
>         if (ftrace_hash_count(data->reg))
>             WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->reg));
>         goto rollback_unlink;
>     }

right, that's the fix

> 
> > +
> > +	for_each_mnode(mnode, link)
> > +		bpf_trampoline_multi_attach_free(mnode->trampoline);
> > +
> > +	trampoline_unlock_all();
> > +
> > +	remove_tracing_multi_data(data);
> > +	return 0;
> > +
> > +rollback_unlink:
> > +	for_each_mnode_cnt(mnode, link, rollback_cnt) {
> > +		bpf_trampoline_remove_prog(mnode->trampoline, &mnode->node);
> > +		bpf_trampoline_multi_attach_rollback(mnode->trampoline);
> > +	}
> > +
> > +	trampoline_unlock_all();
> > +
> > +	clear_tracing_multi_data(data);
> > +	rollback_cnt = link->nodes_cnt;
> > +
> > +rollback_put:
> > +	for_each_mnode_cnt(mnode, link, rollback_cnt)
> > +		bpf_trampoline_put(mnode->trampoline);
> > +
> > +	return err;
> > +}

SNIP

> > +	/* Check noreturn attachment. */
> > +	if (prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI ||
> > +	     btf_id_set_contains(&noreturn_deny, btf_id))
> > +		return -EINVAL;
> 
> Should this be && instead of ||?
> 
> This condition rejects all BPF_TRACE_FEXIT_MULTI programs regardless
> of whether the target is on the noreturn_deny list. The C operator
> precedence makes this `A || B`, so if the program's expected_attach_type
> is BPF_TRACE_FEXIT_MULTI, the entire condition is TRUE and the
> function returns -EINVAL.
> 
> This breaks FEXIT_MULTI attachment support. The caller
> bpf_trampoline_multi_attach is the multi-attach path that supports both
> BPF_TRACE_FENTRY_MULTI and BPF_TRACE_FEXIT_MULTI (see is_tracing_multi()
> in include/linux/bpf.h).
> 
> The equivalent check in the single-target path, check_attach_btf_id,
> correctly uses &&:
> 
>     } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
>                prog->expected_attach_type == BPF_TRACE_FSESSION ||
>                prog->expected_attach_type == BPF_MODIFY_RETURN) &&
>                btf_id_set_contains(&noreturn_deny, btf_id)) {
>             ...
>     }
> 
> The intended semantics are to reject FEXIT_MULTI attachments only when
> the target is on noreturn_deny. The correct operator is &&:
> 
>     if (prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI &&
>         btf_id_set_contains(&noreturn_deny, btf_id))
>         return -EINVAL;
> 
> Additional evidence:
> 1. The comment says "Check noreturn attachment" but the current code
>    rejects all FEXIT_MULTI regardless of noreturn status.
> 2. The indentation of the second line matches the alignment pattern
>    used when lines are joined by && rather than ||.
> 3. bpf_check_attach_target handles case BPF_TRACE_FEXIT_MULTI as a
>    valid attach type in its switch.
> 
> Note: In this commit, bpf_trampoline_multi_attach has no callers yet;
> it is infrastructure wired up by a later commit. Once the link creation
> path invokes bpf_trampoline_multi_attach for a FEXIT_MULTI program,
> every such attach will fail with -EINVAL.

guilty as charged, should be &&, will fix

jirka

^ permalink raw reply

* Re: [PATCHv5 bpf-next 12/28] bpf: Add bpf_trampoline_multi_attach/detach functions
From: Jiri Olsa @ 2026-04-21  8:56 UTC (permalink / raw)
  To: bot+bpf-ci
  Cc: ast, daniel, andrii, bpf, linux-trace-kernel, kafai, eddyz87,
	songliubraving, yhs, menglong8.dong, rostedt, martin.lau,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <0fefc9ca058787dddef9c125163a0182c6ddd1f47d2754f2ddbb5a82bfc9126a@mail.kernel.org>

On Sat, Apr 18, 2026 at 06:10:57AM +0000, bot+bpf-ci@kernel.org wrote:

SNIP

> > +		err = __bpf_trampoline_link_prog(&mnode->node, mnode->trampoline, NULL,
> > +						 &trampoline_multi_ops, data);
> > +		if (err) {
> > +			rollback_cnt = i;
> > +			goto rollback_unlink;
> > +		}
> > +	}
> 
> When user-provided ids[] contains duplicate BTF IDs (or distinct IDs
> that resolve to the same trampoline key), multiple nodes point to the
> same struct bpf_trampoline. The link loop above then calls
> bpf_trampoline_multi_attach_init() more than once on that trampoline,
> overwriting the saved old_image with the newly assigned cur_image from
> the previous iteration.
> 
> Scenario with ids[0] == ids[1] and trampoline X starting with OLD_X:
> 
>   i=0: attach_init(X) saves old_image=OLD_X
>        __bpf_trampoline_link_prog() -> modify_fentry_multi() sets
>        X->cur_image=NEW_X (OLD_X refcount not dropped, intent is for
>        multi_attach_free() to release it later).
> 
>   i=1: attach_init(X) re-runs on the same trampoline and overwrites
>        old_image=NEW_X (the only saved reference to OLD_X is lost).
>        __bpf_trampoline_link_prog() returns -EBUSY (duplicate prog).
>        rollback_cnt=i=1; goto rollback_unlink.
> 
>   rollback_unlink calls bpf_trampoline_multi_attach_rollback(X) once:
> 
>     static void bpf_trampoline_multi_attach_rollback(struct bpf_trampoline *tr)
>     {
>         if (tr->cur_image)
>             bpf_tramp_image_put(tr->cur_image);    /* puts NEW_X */
>         tr->cur_image = tr->multi_attach.old_image;  /* = NEW_X (stale!) */
>         ...
>     }
> 
> Result: OLD_X leaks (its refcount is never decremented). cur_image
> points at NEW_X which was just released via bpf_tramp_image_put().
> 
> The rollback_put loop calls bpf_trampoline_put() which frees the
> trampoline (prog was removed, progs_hlist is empty), so the dangling
> NEW_X pointer is cleaned up. However OLD_X remains allocated via
> module_alloc forever.
> 
> Reachability: commit dbf2afe2f603 ("bpf: Add support for tracing multi
> link") copies ids[] from userspace without deduplication. The series'
> selftests (commit 411fb40d4b2a "selftests/bpf: Add tracing multi attach
> fails test", 'fail#7 (kernel) attach with duplicate id') exercise this
> path explicitly, expecting -EBUSY.
> 
> The trigger requires the shared trampoline to already have cur_image !=
> NULL when the first iteration runs (i.e., another program attached to
> the same function), so modify_fentry_multi() is used and there is a
> real OLD_X to leak. The first-time-attach case (register_fentry_multi,
> OLD_X=NULL) only leaves cur_image dangling briefly until the trampoline
> is freed.
> 
> Can the code save old_image/old_flags once per trampoline instead of
> once per node, or store them per-mnode to avoid overwriting the saved
> state?

I think it's better to ensure the btf ids array does not contain
duplicate entries.. wil check

> 
> > +
> > +	rollback_cnt = link->nodes_cnt;
> > +	if (ftrace_hash_count(data->reg)) {
> > +		err = update_ftrace_direct_add(&direct_ops, data->reg);
> > +		if (err)
> > +			goto rollback_unlink;
> > +	}
> > +
> > +	if (ftrace_hash_count(data->modify)) {
> > +		err = update_ftrace_direct_mod(&direct_ops, data->modify, true);
> > +		if (err) {
> > +			WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->reg));
> > +			                                                       ^^^^^^^^^^
> > +			goto rollback_unlink;
> > +		}
> > +	}
> 
> The recovery path unconditionally calls update_ftrace_direct_del with
> data->reg, but update_ftrace_direct_add is only invoked when data->reg
> is non-empty:
> 
>     if (ftrace_hash_count(data->reg)) {
>         err = update_ftrace_direct_add(&direct_ops, data->reg);
>         ...
>     }
> 
> When update_ftrace_direct_mod fails and all trampolines already had
> cur_image != NULL (so nothing was added to data->reg),
> update_ftrace_direct_del is called with an empty hash.
> update_ftrace_direct_del at kernel/trace/ftrace.c:6511 checks:
> 
>     if (!hash_count(hash))
>         return -EINVAL;
> 
> so it returns -EINVAL and the WARN_ON_ONCE fires spuriously. Should
> this guard match the add path?
> 
>     if (ftrace_hash_count(data->reg))
>         WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->reg));

yes, replied in the other email

SNIP

> > +int bpf_check_attach_btf_id_multi(struct btf *btf, struct bpf_prog *prog, u32 btf_id,
> > +				  struct bpf_attach_target_info *tgt_info)
> > +{
> > +	const struct btf_type *t;
> > +	unsigned long addr;
> > +	const char *tname;
> > +	int err;
> > +
> > +	if (!btf_id || !btf)
> > +		return -EINVAL;
> > +
> > +	/* Check noreturn attachment. */
> > +	if (prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI ||
> > +	     btf_id_set_contains(&noreturn_deny, btf_id))
> > +		return -EINVAL;
> 
> Operator precedence issue in the noreturn check. The two subexpressions
> are joined by ||, so the function returns -EINVAL whenever EITHER:
> 
>   (a) the program's expected_attach_type is BPF_TRACE_FEXIT_MULTI
>       (regardless of whether btf_id targets a noreturn function), OR
> 
>   (b) the btf_id is in the noreturn_deny set (regardless of program
>       attach type, rejecting FENTRY_MULTI attachments to noreturn
>       functions as well).
> 
> The intent is to reject only FEXIT-like multi attachments to noreturn
> functions. The non-multi path in check_attach_btf_id() correctly uses
> &&:
> 
>     } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
>                prog->expected_attach_type == BPF_TRACE_FSESSION ||
>                prog->expected_attach_type == BPF_MODIFY_RETURN) &&
>                btf_id_set_contains(&noreturn_deny, btf_id)) {
>         ...
>         return -EINVAL;
>     }
> 
> Consequences once bpf_trampoline_multi_attach() is wired up from
> user-space (commit dbf2afe2f603 "bpf: Add support for tracing multi
> link"):
> 
>   1. Every BPF_TRACE_FEXIT_MULTI program attach request is rejected
>      with -EINVAL, even when the target function returns normally. This
>      disables BPF_TRACE_FEXIT_MULTI entirely.
> 
>   2. Any tracing_multi program (including BPF_TRACE_FENTRY_MULTI) is
>      rejected when its btf_id is in noreturn_deny (do_exit,
>      __x64_sys_exit, make_task_dead, etc.). Rejecting FENTRY_MULTI on
>      noreturn functions is an unintended behavioral restriction.
> 
> Forward search in the git range shows commit bfcad202ce99 ("bpf: Add
> support for tracing_multi link session") rewrites this hunk to:
> 
>     if ((prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI ||
>          prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) &&
>          btf_id_set_contains(&noreturn_deny, btf_id))
>         return -EINVAL;
> 
> The parenthesization + && in the later commit confirms this is a
> precedence bug. Should this use && instead?

yes, replied in the other email

jirka

^ permalink raw reply

* Re: [PATCHv5 bpf-next 20/28] libbpf: Add support to create tracing multi link
From: Jiri Olsa @ 2026-04-21  8:57 UTC (permalink / raw)
  To: bot+bpf-ci
  Cc: ast, daniel, andrii, bpf, linux-trace-kernel, kafai, eddyz87,
	songliubraving, yhs, menglong8.dong, rostedt, martin.lau,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <658609fdf8d0b9ee59a5a516fd7dd2460ef7cb52b9fca779094dfb761e43aa82@mail.kernel.org>

On Sat, Apr 18, 2026 at 06:10:58AM +0000, bot+bpf-ci@kernel.org wrote:
> > diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> > index 49368ee161b1..9bf7e8754c7b 100644
> > --- a/tools/lib/bpf/libbpf.c
> > +++ b/tools/lib/bpf/libbpf.c
> 
> [ ... ]
> 
> > @@ -12510,6 +12592,197 @@ bool btf_type_is_traceable_func(const struct btf *btf, const struct btf_type *t
> >  	return true;
> >  }
> >
> > +static int collect_func_ids_by_glob(const struct bpf_program *prog, const char *pattern, __u32 **ids)
> > +{
> > +	struct bpf_object *obj = prog->obj;
> > +	const struct module_btf *mod;
> > +	struct btf *btf = NULL;
> > +	const char *sep;
> > +	int err;
> > +
> > +	err = bpf_object__load_vmlinux_btf(obj, true);
> > +	if (err)
> > +		return err;
> > +
> > +	/* In case we have module specified, we will find its btf and use that. */
> > +	sep = strchr(pattern, ':');
> > +	if (sep) {
> > +		mod = find_attach_module(obj, pattern);
> > +		if (!mod) {
> > +			err = -EINVAL;
> > +			goto cleanup;
> > +		}
> > +		btf = mod->btf;
> > +		pattern = sep + 1;
> > +	} else {
> > +		/* Program is loaded for kernel module. */
> > +		if (prog->attach_btf_obj_fd)
> > +			return -EINVAL;
> > +		btf = obj->btf_vmlinux;
> > +	}
> > +
> > +	err = collect_btf_func_ids_by_glob(btf, pattern, ids);
> > +
> > +cleanup:
> > +	bpf_object_cleanup_btf(obj);
> > +	return err;
> > +}
> 
> Does this function leak the vmlinux BTF when prog->attach_btf_obj_fd is
> set? The early return bypasses the cleanup label that calls
> bpf_object_cleanup_btf(obj). Every other error path goes through
> cleanup, which suggests the BTF loaded by bpf_object__load_vmlinux_btf()
> needs to be cleaned up before returning.
> 
> The design pattern here is that bpf_object__load_vmlinux_btf(obj, true)
> is called specifically at attach-time (after the object has already been
> post-load-cleaned), so obj->btf_vmlinux is freshly loaded. Skipping the
> cleanup leaves that freshly-loaded btf_vmlinux attached to the bpf_object
> until bpf_object__close().
> 
> Should this be 'err = -EINVAL; goto cleanup;' to match the other error
> paths?

yea, should be 'goto cleanup', will fix

jirka

^ permalink raw reply

* Re: [RFC PATCH 1/2] kernel/notifier: replace single-linked list with double-linked list for reverse traversal
From: Petr Mladek @ 2026-04-21  9:05 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: chensong_2000, rafael, lenb, mturquette, sboyd, viresh.kumar, agk,
	snitzer, mpatocka, bmarzins, song, yukuai, linan122, jason.wessel,
	danielt, dianders, horms, davem, edumazet, kuba, pabeni, paulmck,
	frederic, mcgrof, petr.pavlu, da.gomez, samitolvanen, atomlin,
	jpoimboe, jikos, mbenes, joe.lawrence, rostedt, mark.rutland,
	mathieu.desnoyers, linux-modules, linux-kernel,
	linux-trace-kernel, linux-acpi, linux-clk, linux-pm,
	live-patching, dm-devel, linux-raid, kgdb-bugreport, netdev
In-Reply-To: <20260420144429.57b45f2beece690bceea96ec@kernel.org>

On Mon 2026-04-20 14:44:29, Masami Hiramatsu wrote:
> Hi Song,
> 
> On Wed, 15 Apr 2026 15:01:37 +0800
> chensong_2000@189.cn wrote:
> 
> > From: Song Chen <chensong_2000@189.cn>
> > 
> > The current notifier chain implementation uses a single-linked list
> > (struct notifier_block *next), which only supports forward traversal
> > in priority order. This makes it difficult to handle cleanup/teardown
> > scenarios that require notifiers to be called in reverse priority order.
> 
> What about introducing a new notification callback API that allows you
> to describe dependencies between callback functions?
> 
> For example, when registering a callback, you could register a string
> as an ID and specify whether to call it before or after that ID,
> or you could register a comparison function that is called when adding
> to a list. (I prefer @name and @depends fields so that it can be easily
> maintained.)

This looks too complex. It would make sense only
when this API has more users.

Also this won't be enough for the ftrace/livepatch callbacks.
They need to be ordered against against each other. But they
also need to be called before/after all other callbacks.
For example, when the module is loaded:

   + 1st frace
   + 2nd livepatch
   + then other notifiers

See the commit c1bf08ac26e92122 ("ftrace: Be first to run code
modification on modules").

> This would allow for better dependency building when adding to the list.
 
> > 
> > A concrete example is the ordering dependency between ftrace and
> > livepatch during module load/unload. see the detail here [1].
> 
> If this only concerns notification callback issues with the ftrace
> and livepatch modules, it's far more robust to simply call the
> necessary processing directly when the modules load and unload,
> rather than registering notification callbacks externally.
> 
> There are fprobe, kprobe and its trace-events, all of them are using
> ftrace as its fundation layer. In this case, I always needs to
> consider callback order when a module is unloaded.
> 
> If ftrace is working as a part of module callbacks, it will conflict
> with fprobe/kprobe module callback. Of course we can reorder it with
> modifying its priority. But this is ugly, because when we introduce
> a new other feature which depends on another layer, we need to
> reorder the callback's priority number on the list.
> 
> Based on the above, I don't think this can be resolved simply by
> changing the list of notification callbacks to a bidirectional list.

I agree. I would keep it as is (hardcoded).

Best Regards,
Petr

^ permalink raw reply

* [PATCH] kernel: trace: do not generate undefsyms_base.c
From: Paolo Bonzini @ 2026-04-21 10:04 UTC (permalink / raw)
  To: linux-kernel, kvm
  Cc: torvalds, rostedt, Marc Zyngier, Arnd Bergmann, Nathan Chancellor,
	linux-trace-kernel

The code to autogenerate undefsyms_base.c in the Makefile is larger
than the file itself.

Remove the "echo" indirection that creates the file, which keeps
the build system sane and makes it much easier to edit it if/when
new situations arrive.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 kernel/trace/.gitignore       |  1 -
 kernel/trace/Makefile         | 35 ++++-------------------------------
 kernel/trace/undefsyms_base.c | 28 ++++++++++++++++++++++++++++
 3 files changed, 32 insertions(+), 32 deletions(-)
 delete mode 100644 kernel/trace/.gitignore
 create mode 100644 kernel/trace/undefsyms_base.c

diff --git a/kernel/trace/.gitignore b/kernel/trace/.gitignore
deleted file mode 100644
index 6adbb09d6deb..000000000000
--- a/kernel/trace/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-/undefsyms_base.c
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 4d4229e5eec4..1decdce8cbef 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -133,41 +133,14 @@ obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o
 obj-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o
 obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o
 
-#
 # simple_ring_buffer is used by the pKVM hypervisor which does not have access
 # to all kernel symbols. Fail the build if forbidden symbols are found.
-#
-# undefsyms_base generates a set of compiler and tooling-generated symbols that can
-# safely be ignored for simple_ring_buffer.
-#
-filechk_undefsyms_base = \
-	echo '$(pound)include <linux/atomic.h>'; \
-	echo '$(pound)include <linux/string.h>'; \
-	echo '$(pound)include <asm/page.h>'; \
-	echo 'static char page[PAGE_SIZE] __aligned(PAGE_SIZE);'; \
-	echo 'void undefsyms_base(void *p, int n);'; \
-	echo 'void undefsyms_base(void *p, int n) {'; \
-	echo '	char buffer[256] = { 0 };'; \
-	echo '	u32 u = 0;'; \
-	echo '	memset((char * volatile)page, 8, PAGE_SIZE);'; \
-	echo '	memset((char * volatile)buffer, 8, sizeof(buffer));'; \
-	echo '	memcpy((void * volatile)p, buffer, sizeof(buffer));'; \
-	echo '	cmpxchg((u32 * volatile)&u, 0, 8);'; \
-	echo '	WARN_ON(n == 0xdeadbeef);'; \
-	echo '}'
-
-$(obj)/undefsyms_base.c: FORCE
-	$(call filechk,undefsyms_base)
-
-clean-files += undefsyms_base.c
-
-$(obj)/undefsyms_base.o: $(obj)/undefsyms_base.c
 
+# Basic compiler and tooling-generated symbols that can safely be left
+# undefined. Ensure KASAN is enabled to avoid logic that may disable
+# FORTIFY_SOURCE when KASAN is not enabled. undefsyms_base.o does not
+# automatically get KASAN flags because it is not linked into vmlinux.
 targets += undefsyms_base.o
-
-# Ensure KASAN is enabled to avoid logic that may disable FORTIFY_SOURCE when
-# KASAN is not enabled. undefsyms_base.o does not automatically get KASAN flags
-# because it is not linked into vmlinux.
 KASAN_SANITIZE_undefsyms_base.o := y
 
 UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \
diff --git a/kernel/trace/undefsyms_base.c b/kernel/trace/undefsyms_base.c
new file mode 100644
index 000000000000..e65baf58e6ff
--- /dev/null
+++ b/kernel/trace/undefsyms_base.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * simple_ring_buffer is used by the pKVM hypervisor which does not have access
+ * to all kernel symbols.  Whatever is undefined when compiling this file is
+ * compiler and tooling-generated symbols that can safely be ignored for
+ * simple_ring_buffer.
+ */
+
+#include <linux/atomic.h>
+#include <linux/string.h>
+#include <asm/page.h>
+
+void undefsyms_base(void *p, int n);
+
+static char page[PAGE_SIZE] __aligned(PAGE_SIZE);
+
+void undefsyms_base(void *p, int n)
+{
+	char buffer[256] = { 0 };
+
+	u32 u = 0;
+	memset((char * volatile)page, 8, PAGE_SIZE);
+	memset((char * volatile)buffer, 8, sizeof(buffer));
+	memcpy((void * volatile)p, buffer, sizeof(buffer));
+	cmpxchg((u32 * volatile)&u, 0, 8);
+	WARN_ON(n == 0xdeadbeef);
+}
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH] kernel: trace: do not generate undefsyms_base.c
From: Steven Rostedt @ 2026-04-21 14:02 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: linux-kernel, kvm, torvalds, Marc Zyngier, Arnd Bergmann,
	Nathan Chancellor, linux-trace-kernel
In-Reply-To: <20260421100455.324333-1-pbonzini@redhat.com>

On Tue, 21 Apr 2026 11:04:55 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

> The code to autogenerate undefsyms_base.c in the Makefile is larger
> than the file itself.
> 
> Remove the "echo" indirection that creates the file, which keeps
> the build system sane and makes it much easier to edit it if/when
> new situations arrive.
> 
> Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Marc beat you to it by a few minutes ;-)

  https://lore.kernel.org/all/20260421095446.2951646-1-maz@kernel.org/

Although he didn't Cc linux-trace-kernel@vger.kernel.org :-p

But anyway, it ended up in my internal Patchwork. I'll take his as they are
both pretty much the same patch.

Thanks,

-- Steve

^ permalink raw reply

* Re: [PATCH] kernel: trace: do not generate undefsyms_base.c
From: Marc Zyngier @ 2026-04-21 14:16 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Paolo Bonzini, linux-kernel, kvm, torvalds, Arnd Bergmann,
	Nathan Chancellor, linux-trace-kernel
In-Reply-To: <20260421100209.7535e26f@gandalf.local.home>

On Tue, 21 Apr 2026 15:02:09 +0100,
Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> On Tue, 21 Apr 2026 11:04:55 +0100
> Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> > The code to autogenerate undefsyms_base.c in the Makefile is larger
> > than the file itself.
> > 
> > Remove the "echo" indirection that creates the file, which keeps
> > the build system sane and makes it much easier to edit it if/when
> > new situations arrive.
> > 
> > Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> 
> Marc beat you to it by a few minutes ;-)
> 
>   https://lore.kernel.org/all/20260421095446.2951646-1-maz@kernel.org/
> 
> Although he didn't Cc linux-trace-kernel@vger.kernel.org :-p
> 
> But anyway, it ended up in my internal Patchwork. I'll take his as they are
> both pretty much the same patch.

On the other hand, Paolo's patch has the SPDX tag on the new file,
which I forgot to add. Your call, anyway.

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.

^ permalink raw reply

* Re: [PATCH] kernel: trace: do not generate undefsyms_base.c
From: Paolo Bonzini @ 2026-04-21 14:21 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: Steven Rostedt, linux-kernel, kvm, torvalds, Arnd Bergmann,
	Nathan Chancellor, linux-trace-kernel
In-Reply-To: <86ik9k1jqs.wl-maz@kernel.org>

On Tue, Apr 21, 2026 at 3:18 PM Marc Zyngier <maz@kernel.org> wrote:
> On Tue, 21 Apr 2026 15:02:09 +0100, Steven Rostedt <rostedt@goodmis.org> wrote:
> > Marc beat you to it by a few minutes ;-)
> >
> >   https://lore.kernel.org/all/20260421095446.2951646-1-maz@kernel.org/
> >
> > Although he didn't Cc linux-trace-kernel@vger.kernel.org :-p
> >
> > But anyway, it ended up in my internal Patchwork. I'll take his as they are
> > both pretty much the same patch.
>
> On the other hand, Paolo's patch has the SPDX tag on the new file,
> which I forgot to add. Your call, anyway.

Yeah if you want to get the best of both worlds, take the commit
message from Marc's and the code from mine; he has more complete
trailers whereas I have slightly cleaner formatting and more comments.
But otherwise it's not a big deal.

Paolo


^ permalink raw reply

* Re: [PATCH net v1] net: validate skb->napi_id in RX tracepoints
From: Simon Horman @ 2026-04-21 16:33 UTC (permalink / raw)
  To: Kohei Enju
  Cc: netdev, linux-trace-kernel, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers
In-Reply-To: <20260420105427.162816-1-kohei@enjuk.jp>

On Mon, Apr 20, 2026 at 10:54:23AM +0000, Kohei Enju wrote:
> Since commit 2bd82484bb4c ("xps: fix xps for stacked devices"),
> skb->napi_id shares storage with sender_cpu. RX tracepoints using
> net_dev_rx_verbose_template read skb->napi_id directly and can therefore
> report sender_cpu values as if they were NAPI IDs.
> 
> For example, on the loopback path this can report 1 as napi_id, where 1
> comes from raw_smp_processor_id() + 1 in the XPS path:
> 
>   # bpftrace -e 'tracepoint:net:netif_rx_entry{ print(args->napi_id); }'
>   # taskset -c 0 ping -c 1 ::1
> 
> Report only valid NAPI IDs in these tracepoints and use 0 otherwise.
> 
> Fixes: 2bd82484bb4c ("xps: fix xps for stacked devices")
> Signed-off-by: Kohei Enju <kohei@enjuk.jp>

Reviewed-by: Simon Horman <horms@kernel.org>

> ---
>  include/trace/events/net.h | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/include/trace/events/net.h b/include/trace/events/net.h
> index fdd9ad474ce3..dbc2c5598e35 100644
> --- a/include/trace/events/net.h
> +++ b/include/trace/events/net.h
> @@ -10,6 +10,7 @@
>  #include <linux/if_vlan.h>
>  #include <linux/ip.h>
>  #include <linux/tracepoint.h>
> +#include <net/busy_poll.h>
>  
>  TRACE_EVENT(net_dev_start_xmit,
>  
> @@ -208,7 +209,8 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
>  	TP_fast_assign(
>  		__assign_str(name);
>  #ifdef CONFIG_NET_RX_BUSY_POLL
> -		__entry->napi_id = skb->napi_id;
> +		__entry->napi_id = napi_id_valid(skb->napi_id) ?
> +				   skb->napi_id : 0;

Note to self: they key is that if the storage at napi_id is
being used as a sender_cpu then napi_id_valid because
the valid values for a sender_cpu are disjoint from those
of a valid napi_id. This can be seen clearly in the
implementation of napi_id_valid() and the comment above it.

>  #else
>  		__entry->napi_id = 0;
>  #endif
> -- 
> 2.51.0
> 

^ permalink raw reply

* Re: [PATCH] kernel: trace: do not generate undefsyms_base.c
From: Steven Rostedt @ 2026-04-21 19:56 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Marc Zyngier, linux-kernel, kvm, torvalds, Arnd Bergmann,
	Nathan Chancellor, linux-trace-kernel
In-Reply-To: <CABgObfam470W_Cq-jFHXc7H=xkr4CRMOGrbuLd-HtkepvwA3AQ@mail.gmail.com>

On Tue, 21 Apr 2026 15:21:19 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

> On Tue, Apr 21, 2026 at 3:18 PM Marc Zyngier <maz@kernel.org> wrote:
> > On Tue, 21 Apr 2026 15:02:09 +0100, Steven Rostedt <rostedt@goodmis.org> wrote:  
> > > Marc beat you to it by a few minutes ;-)
> > >
> > >   https://lore.kernel.org/all/20260421095446.2951646-1-maz@kernel.org/
> > >
> > > Although he didn't Cc linux-trace-kernel@vger.kernel.org :-p
> > >
> > > But anyway, it ended up in my internal Patchwork. I'll take his as they are
> > > both pretty much the same patch.  
> >
> > On the other hand, Paolo's patch has the SPDX tag on the new file,
> > which I forgot to add. Your call, anyway.  

Ah, OK. I'll take Paolo's.

> 
> Yeah if you want to get the best of both worlds, take the commit
> message from Marc's and the code from mine; he has more complete
> trailers whereas I have slightly cleaner formatting and more comments.
> But otherwise it's not a big deal.

I'll update the subject and change log slightly, but otherwise, it's your
patch.

-- Steve


^ permalink raw reply

* Re: [PATCH] kernel: trace: do not generate undefsyms_base.c
From: Nathan Chancellor @ 2026-04-21 21:51 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: linux-kernel, kvm, torvalds, rostedt, Marc Zyngier, Arnd Bergmann,
	linux-trace-kernel
In-Reply-To: <20260421100455.324333-1-pbonzini@redhat.com>

On Tue, Apr 21, 2026 at 11:04:55AM +0100, Paolo Bonzini wrote:
> The code to autogenerate undefsyms_base.c in the Makefile is larger
> than the file itself.
> 
> Remove the "echo" indirection that creates the file, which keeps
> the build system sane and makes it much easier to edit it if/when
> new situations arrive.
> 
> Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Yeah, I don't really know how I did not see this originally :/ tunnel
vision is real I suppose.

Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>

> ---
>  kernel/trace/.gitignore       |  1 -
>  kernel/trace/Makefile         | 35 ++++-------------------------------
>  kernel/trace/undefsyms_base.c | 28 ++++++++++++++++++++++++++++
>  3 files changed, 32 insertions(+), 32 deletions(-)
>  delete mode 100644 kernel/trace/.gitignore
>  create mode 100644 kernel/trace/undefsyms_base.c
> 
> diff --git a/kernel/trace/.gitignore b/kernel/trace/.gitignore
> deleted file mode 100644
> index 6adbb09d6deb..000000000000
> --- a/kernel/trace/.gitignore
> +++ /dev/null
> @@ -1 +0,0 @@
> -/undefsyms_base.c
> diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
> index 4d4229e5eec4..1decdce8cbef 100644
> --- a/kernel/trace/Makefile
> +++ b/kernel/trace/Makefile
> @@ -133,41 +133,14 @@ obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o
>  obj-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o
>  obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o
>  
> -#
>  # simple_ring_buffer is used by the pKVM hypervisor which does not have access
>  # to all kernel symbols. Fail the build if forbidden symbols are found.
> -#
> -# undefsyms_base generates a set of compiler and tooling-generated symbols that can
> -# safely be ignored for simple_ring_buffer.
> -#
> -filechk_undefsyms_base = \
> -	echo '$(pound)include <linux/atomic.h>'; \
> -	echo '$(pound)include <linux/string.h>'; \
> -	echo '$(pound)include <asm/page.h>'; \
> -	echo 'static char page[PAGE_SIZE] __aligned(PAGE_SIZE);'; \
> -	echo 'void undefsyms_base(void *p, int n);'; \
> -	echo 'void undefsyms_base(void *p, int n) {'; \
> -	echo '	char buffer[256] = { 0 };'; \
> -	echo '	u32 u = 0;'; \
> -	echo '	memset((char * volatile)page, 8, PAGE_SIZE);'; \
> -	echo '	memset((char * volatile)buffer, 8, sizeof(buffer));'; \
> -	echo '	memcpy((void * volatile)p, buffer, sizeof(buffer));'; \
> -	echo '	cmpxchg((u32 * volatile)&u, 0, 8);'; \
> -	echo '	WARN_ON(n == 0xdeadbeef);'; \
> -	echo '}'
> -
> -$(obj)/undefsyms_base.c: FORCE
> -	$(call filechk,undefsyms_base)
> -
> -clean-files += undefsyms_base.c
> -
> -$(obj)/undefsyms_base.o: $(obj)/undefsyms_base.c
>  
> +# Basic compiler and tooling-generated symbols that can safely be left
> +# undefined. Ensure KASAN is enabled to avoid logic that may disable
> +# FORTIFY_SOURCE when KASAN is not enabled. undefsyms_base.o does not
> +# automatically get KASAN flags because it is not linked into vmlinux.
>  targets += undefsyms_base.o
> -
> -# Ensure KASAN is enabled to avoid logic that may disable FORTIFY_SOURCE when
> -# KASAN is not enabled. undefsyms_base.o does not automatically get KASAN flags
> -# because it is not linked into vmlinux.
>  KASAN_SANITIZE_undefsyms_base.o := y
>  
>  UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \
> diff --git a/kernel/trace/undefsyms_base.c b/kernel/trace/undefsyms_base.c
> new file mode 100644
> index 000000000000..e65baf58e6ff
> --- /dev/null
> +++ b/kernel/trace/undefsyms_base.c
> @@ -0,0 +1,28 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * simple_ring_buffer is used by the pKVM hypervisor which does not have access
> + * to all kernel symbols.  Whatever is undefined when compiling this file is
> + * compiler and tooling-generated symbols that can safely be ignored for
> + * simple_ring_buffer.
> + */
> +
> +#include <linux/atomic.h>
> +#include <linux/string.h>
> +#include <asm/page.h>
> +
> +void undefsyms_base(void *p, int n);
> +
> +static char page[PAGE_SIZE] __aligned(PAGE_SIZE);
> +
> +void undefsyms_base(void *p, int n)
> +{
> +	char buffer[256] = { 0 };
> +
> +	u32 u = 0;
> +	memset((char * volatile)page, 8, PAGE_SIZE);
> +	memset((char * volatile)buffer, 8, sizeof(buffer));
> +	memcpy((void * volatile)p, buffer, sizeof(buffer));
> +	cmpxchg((u32 * volatile)&u, 0, 8);
> +	WARN_ON(n == 0xdeadbeef);
> +}
> -- 
> 2.53.0
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox