All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC][PATCH] perf: Fix perf_addr_filters_afjust()
@ 2026-06-22 12:32 Peter Zijlstra
  2026-06-22 12:53 ` sashiko-bot
  0 siblings, 1 reply; 2+ messages in thread
From: Peter Zijlstra @ 2026-06-22 12:32 UTC (permalink / raw)
  To: mingo
  Cc: acme, namhyung, mark.rutland, alexander.shishkin, jolsa, irogers,
	adrian.hunter, james.clark, linux-perf-users, linux-kernel,
	suzuki.poulose, mike.leach, leo.yan



 - adjust lock order; since perf_event_addr_filters_apply() takes mmap_lock
   inside child_mutex.

 - note that perf_event_mmap() is called by the thread doing the
   mmap()/mprotect()/.. operation, but not the other threads in the address
   space that might have inherited the same event.

 - have perf_addr_filters_adjust() iterate the child_list to find all other
   events for tasks that have the same mm.

 - since this can now adjust addr_filter_ranges[] for remote events, make sure
   to adjust things while holding ctx->lock; local_irq_disable() as implied by
   the ifh->lock is no longer suffient to serialize against event scheduling.

 - per the previous locking order; perf_event_mmap(), which is called holding
   mmap_lock, cannot take child_mutex, as such, make child_list RCU protected
   and iterate under RCU.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/events/core.c |   69 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 49 insertions(+), 20 deletions(-)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1454,10 +1454,11 @@ static void put_ctx(struct perf_event_co
  *	  perf_event_context::mutex
  *	    perf_event::child_mutex;
  *	      perf_event_context::lock
- *	    mmap_lock
- *	      perf_event::mmap_mutex
- *	        perf_buffer::aux_mutex
- *	      perf_addr_filters_head::lock
+ *	        perf_addr_filters_head::lock
+ *	      mmap_lock
+ *	        perf_event::mmap_mutex
+ *	          perf_buffer::aux_mutex
+ *	        perf_addr_filters_head::lock
  *
  *    cpu_hotplug_lock
  *      pmus_lock
@@ -2429,7 +2430,7 @@ static void perf_child_detach(struct per
 	lockdep_assert_held(&parent_event->child_mutex);
 	 */
 
-	list_del_init(&event->child_list);
+	list_del_rcu(&event->child_list);
 }
 
 static bool is_orphaned_event(struct perf_event *event)
@@ -9902,35 +9903,64 @@ static bool perf_addr_filter_vma_adjust(
 	return true;
 }
 
-static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
+static int perf_child_addr_filters_adjust(struct perf_event *event,
+					  struct vm_area_struct *vma)
 {
 	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
-	struct vm_area_struct *vma = data;
-	struct perf_addr_filter *filter;
 	unsigned int restart = 0, count = 0;
-	unsigned long flags;
-
-	if (!has_addr_filter(event))
-		return;
+	struct perf_addr_filter *filter;
 
-	if (!vma->vm_file)
-		return;
+	guard(raw_spinlock)(&ifh->lock);
 
-	raw_spin_lock_irqsave(&ifh->lock, flags);
 	list_for_each_entry(filter, &ifh->list, entry) {
 		if (perf_addr_filter_vma_adjust(filter, vma,
 						&event->addr_filter_ranges[count]))
 			restart++;
 
 		count++;
+
 	}
 
 	if (restart)
 		event->addr_filters_gen++;
-	raw_spin_unlock_irqrestore(&ifh->lock, flags);
 
-	if (restart)
-		perf_event_stop(event, 1);
+	return restart;
+}
+
+static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
+{
+	struct perf_event *child, *parent = event->parent ?: event;
+	struct vm_area_struct *vma = data;
+
+	if (!has_addr_filter(event))
+		return;
+
+	if (!vma->vm_file)
+		return;
+
+	list_for_each_entry_rcu(child, &parent->child_list, child_list) {
+		struct perf_event_context *ctx;
+		struct task_struct *task;
+		int restart = 0;
+
+again:
+		ctx = READ_ONCE(child->ctx);
+		scoped_guard (raw_spinlock_irqsave, &ctx->lock) {
+			if (child->ctx != ctx)
+				goto again;
+
+			task = ctx->task;
+			if (!task || task == TASK_TOMBSTONE)
+				continue;
+
+			if (vma->vm_mm != task->mm)
+				continue;
+
+			restart = perf_child_addr_filters_adjust(child, vma);
+		}
+		if (restart)
+			perf_event_stop(child, 1);
+	}
 }
 
 /*
@@ -9947,11 +9977,10 @@ static void perf_addr_filters_adjust(str
 	if (!(vma->vm_flags & VM_EXEC))
 		return;
 
-	rcu_read_lock();
+	guard(rcu)();
 	ctx = rcu_dereference(current->perf_event_ctxp);
 	if (ctx)
 		perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
-	rcu_read_unlock();
 }
 
 void perf_event_mmap(struct vm_area_struct *vma)

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [RFC][PATCH] perf: Fix perf_addr_filters_afjust()
  2026-06-22 12:32 [RFC][PATCH] perf: Fix perf_addr_filters_afjust() Peter Zijlstra
@ 2026-06-22 12:53 ` sashiko-bot
  0 siblings, 0 replies; 2+ messages in thread
From: sashiko-bot @ 2026-06-22 12:53 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-perf-users

Thank you for your contribution! Sashiko AI review found 3 potential issue(s) to consider:

New issues:
- [High] Incomplete RCU conversion for `child_list`: missing `list_add_tail_rcu` in `inherit_event` leads to a data race and potential crashes for concurrent RCU readers.
- [High] The parent event is completely skipped during address filter adjustments, breaking hardware address filtering for the primary task.

Pre-existing issues:
- [Critical] The `event->addr_filter_ranges` array is freed synchronously, leading to a Use-After-Free when concurrent RCU readers access it during the RCU grace period.
--

commit a5419e8444b35216c0ba361188f91ecd86875fd1
Author: Peter Zijlstra <peterz@infradead.org>

perf: Fix perf_addr_filters_afjust()

This patch adjusts the locking order for perf_event_addr_filters_apply()
and converts child_list to be RCU-protected to avoid taking child_mutex
under mmap_lock. It updates perf_addr_filters_adjust() to iterate through
child events under RCU.

> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 00e6dcb931d96..af3c123010407 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c

[ ... ]

> @@ -9902,35 +9903,64 @@ static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
>  	return true;
>  }
>  
> -static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
> +static int perf_child_addr_filters_adjust(struct perf_event *event,
> +					  struct vm_area_struct *vma)
>  {
>  	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
> -	struct vm_area_struct *vma = data;
> -	struct perf_addr_filter *filter;
>  	unsigned int restart = 0, count = 0;
> -	unsigned long flags;
> -
> -	if (!has_addr_filter(event))
> -		return;
> +	struct perf_addr_filter *filter;
>  
> -	if (!vma->vm_file)
> -		return;
> +	guard(raw_spinlock)(&ifh->lock);
>  
> -	raw_spin_lock_irqsave(&ifh->lock, flags);
>  	list_for_each_entry(filter, &ifh->list, entry) {
>  		if (perf_addr_filter_vma_adjust(filter, vma,
>  						&event->addr_filter_ranges[count]))

[Severity: Critical]
This is a pre-existing issue, but could a concurrent RCU reader access an
already freed addr_filter_ranges array here?

When an event's refcount drops to zero, __free_event() frees the array
synchronously:

kernel/events/core.c:__free_event() {
    ...
    kfree(event->addr_filter_ranges);
    ...
}

If a concurrent mmap() traverses the child_list under RCU, could it find the
dying event during the grace period and write to this freed memory?

>  			restart++;
>  
>  		count++;
> +
>  	}
>  
>  	if (restart)
>  		event->addr_filters_gen++;
> -	raw_spin_unlock_irqrestore(&ifh->lock, flags);
>  
> -	if (restart)
> -		perf_event_stop(event, 1);
> +	return restart;
> +}
> +
> +static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
> +{
> +	struct perf_event *child, *parent = event->parent ?: event;
> +	struct vm_area_struct *vma = data;
> +
> +	if (!has_addr_filter(event))
> +		return;
> +
> +	if (!vma->vm_file)
> +		return;
> +
> +	list_for_each_entry_rcu(child, &parent->child_list, child_list) {

[Severity: High]
Are the parent event's address filters skipped during adjustments?

Because list_for_each_entry_rcu() only iterates over the list nodes, the
parent event hosting the child_list head won't be included in the iteration.

If a standalone process creates an event and mmaps it, its child_list is
empty. Will this cause the parent's filter ranges to remain unadjusted,
breaking hardware address filtering for the primary task?


[Severity: High]
Since child_list is now RCU-protected, should the addition of child events
use RCU primitives to prevent data races?

In inherit_event(), child events are still added using the non-RCU
list_add_tail():

kernel/events/core.c:inherit_event() {
    ...
    /*
     * Link this into the parent event's child list
     */
    list_add_tail(&child_event->child_list, &parent_event->child_list);
    mutex_unlock(&parent_event->child_mutex);
    ...
}

Because list_add_tail() lacks the memory release barrier provided by
list_add_tail_rcu(), could concurrent lockless RCU readers observe the newly
linked list node before its initialization is visible to the reading CPU?

> +		struct perf_event_context *ctx;
> +		struct task_struct *task;
> +		int restart = 0;
> +
> +again:
> +		ctx = READ_ONCE(child->ctx);
> +		scoped_guard (raw_spinlock_irqsave, &ctx->lock) {
> +			if (child->ctx != ctx)
> +				goto again;
> +
> +			task = ctx->task;

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260622123245.GS48970@noisy.programming.kicks-ass.net?part=1

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2026-06-22 12:53 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-22 12:32 [RFC][PATCH] perf: Fix perf_addr_filters_afjust() Peter Zijlstra
2026-06-22 12:53 ` sashiko-bot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.