public inbox for linux-perf-users@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3] perf/core: Fix missing read event generation on task exit
@ 2025-10-24 17:05 Thaumy Cheng
  2026-02-06 11:21 ` James Clark
  0 siblings, 1 reply; 10+ messages in thread
From: Thaumy Cheng @ 2025-10-24 17:05 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Ian Rogers, Adrian Hunter, Kan Liang
  Cc: linux-perf-users, linux-kernel, Thaumy Cheng

For events with inherit_stat enabled, a "read" event will be generated
to collect per task event counts on task exit.

The call chain is as follows:

do_exit
  -> perf_event_exit_task
    -> perf_event_exit_task_context
      -> perf_event_exit_event
        -> perf_remove_from_context
          -> perf_child_detach
            -> sync_child_event
              -> perf_event_read_event

However, the child event context detaches the task too early in
perf_event_exit_task_context, which causes sync_child_event to never
generate the read event in this case, since child_event->ctx->task is
always set to TASK_TOMBSTONE. Fix that by moving context lock section
backward to ensure ctx->task is not set to TASK_TOMBSTONE before
generating the read event.

Because perf_event_free_task calls perf_event_exit_task_context with
exit = false to tear down all child events from the context, and the
task never lived, accessing the task PID can lead to a use-after-free.

To fix that, let sync_child_event read task from argument and move the
call to the only place it should be triggered to avoid the effect of
setting ctx->task to TASK_TOMBSTONE, and add a task parameter to
perf_event_exit_event to trigger the sync_child_event properly when
needed.

This bug can be reproduced by running "perf record -s" and attaching to
any program that generates perf events in its child tasks. If we check
the result with "perf report -T", the last line of the report will leave
an empty table like "# PID  TID", which is expected to contain the
per-task event counts by design.

Fixes: ef54c1a476ae ("perf: Rework perf_event_exit_event()")
Signed-off-by: Thaumy Cheng <thaumy.love@gmail.com>
---
Changes in v3:
- Fix the bug in a more direct way by moving the call to
  sync_child_event and bring back the task param to
  perf_event_exit_event.
  This approach avoids the event unscheduling issue in v2.

Changes in v2:
- Only trigger read event on task exit.
- Rename perf_event_exit_event to perf_event_detach_event.
- Link to v2: https://lore.kernel.org/all/20250817132742.85154-1-thaumy.love@gmail.com/

Changes in v1:
- Set TASK_TOMBSTONE after the read event is triggered.
- Link to v1: https://lore.kernel.org/all/20250720000424.12572-1-thaumy.love@gmail.com/

 kernel/events/core.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 177e57c1a362..618e7947c358 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2316,7 +2316,8 @@ static void perf_group_detach(struct perf_event *event)
 	perf_event__header_size(leader);
 }

-static void sync_child_event(struct perf_event *child_event);
+static void sync_child_event(struct perf_event *child_event,
+			     struct task_struct *task);

 static void perf_child_detach(struct perf_event *event)
 {
@@ -2336,7 +2337,6 @@ static void perf_child_detach(struct perf_event *event)
 	lockdep_assert_held(&parent_event->child_mutex);
 	 */

-	sync_child_event(event);
 	list_del_init(&event->child_list);
 }

@@ -4587,6 +4587,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 static void perf_remove_from_owner(struct perf_event *event);
 static void perf_event_exit_event(struct perf_event *event,
 				  struct perf_event_context *ctx,
+				  struct task_struct *task,
 				  bool revoke);

 /*
@@ -4614,7 +4615,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx)

 		modified = true;

-		perf_event_exit_event(event, ctx, false);
+		perf_event_exit_event(event, ctx, ctx->task, false);
 	}

 	raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -12437,7 +12438,7 @@ static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
 	/*
 	 * De-schedule the event and mark it REVOKED.
 	 */
-	perf_event_exit_event(event, ctx, true);
+	perf_event_exit_event(event, ctx, ctx->task, true);

 	/*
 	 * All _free_event() bits that rely on event->pmu:
@@ -13994,14 +13995,13 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
 }
 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);

-static void sync_child_event(struct perf_event *child_event)
+static void sync_child_event(struct perf_event *child_event,
+			     struct task_struct *task)
 {
 	struct perf_event *parent_event = child_event->parent;
 	u64 child_val;

 	if (child_event->attr.inherit_stat) {
-		struct task_struct *task = child_event->ctx->task;
-
 		if (task && task != TASK_TOMBSTONE)
 			perf_event_read_event(child_event, task);
 	}
@@ -14020,7 +14020,9 @@ static void sync_child_event(struct perf_event *child_event)

 static void
 perf_event_exit_event(struct perf_event *event,
-		      struct perf_event_context *ctx, bool revoke)
+		      struct perf_event_context *ctx,
+		      struct task_struct *task,
+		      bool revoke)
 {
 	struct perf_event *parent_event = event->parent;
 	unsigned long detach_flags = DETACH_EXIT;
@@ -14043,6 +14045,9 @@ perf_event_exit_event(struct perf_event *event,
 		mutex_lock(&parent_event->child_mutex);
 		/* PERF_ATTACH_ITRACE might be set concurrently */
 		attach_state = READ_ONCE(event->attach_state);
+
+		if (attach_state & PERF_ATTACH_CHILD)
+			sync_child_event(event, task);
 	}

 	if (revoke)
@@ -14134,7 +14139,7 @@ static void perf_event_exit_task_context(struct task_struct *task, bool exit)
 		perf_event_task(task, ctx, 0);

 	list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry)
-		perf_event_exit_event(child_event, ctx, false);
+		perf_event_exit_event(child_event, ctx, exit ? task : NULL, false);

 	mutex_unlock(&ctx->mutex);

--
2.51.0

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] perf/core: Fix missing read event generation on task exit
  2025-10-24 17:05 [PATCH v3] perf/core: Fix missing read event generation on task exit Thaumy Cheng
@ 2026-02-06 11:21 ` James Clark
  2026-02-06 15:29   ` Peter Zijlstra
  0 siblings, 1 reply; 10+ messages in thread
From: James Clark @ 2026-02-06 11:21 UTC (permalink / raw)
  To: Thaumy Cheng, Peter Zijlstra
  Cc: linux-perf-users, linux-kernel, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Ian Rogers, Adrian Hunter,
	Kan Liang, Suzuki K Poulose, Leo Yan, Mike Leach



On 24/10/2025 6:05 pm, Thaumy Cheng wrote:
> For events with inherit_stat enabled, a "read" event will be generated
> to collect per task event counts on task exit.
> 
> The call chain is as follows:
> 
> do_exit
>    -> perf_event_exit_task
>      -> perf_event_exit_task_context
>        -> perf_event_exit_event
>          -> perf_remove_from_context
>            -> perf_child_detach
>              -> sync_child_event
>                -> perf_event_read_event
> 
> However, the child event context detaches the task too early in
> perf_event_exit_task_context, which causes sync_child_event to never
> generate the read event in this case, since child_event->ctx->task is
> always set to TASK_TOMBSTONE. Fix that by moving context lock section
> backward to ensure ctx->task is not set to TASK_TOMBSTONE before
> generating the read event.
> 
> Because perf_event_free_task calls perf_event_exit_task_context with
> exit = false to tear down all child events from the context, and the
> task never lived, accessing the task PID can lead to a use-after-free.
> 
> To fix that, let sync_child_event read task from argument and move the
> call to the only place it should be triggered to avoid the effect of
> setting ctx->task to TASK_TOMBSTONE, and add a task parameter to
> perf_event_exit_event to trigger the sync_child_event properly when
> needed.
> 
> This bug can be reproduced by running "perf record -s" and attaching to
> any program that generates perf events in its child tasks. If we check
> the result with "perf report -T", the last line of the report will leave
> an empty table like "# PID  TID", which is expected to contain the
> per-task event counts by design.
> 
> Fixes: ef54c1a476ae ("perf: Rework perf_event_exit_event()")
> Signed-off-by: Thaumy Cheng <thaumy.love@gmail.com>
> ---
> Changes in v3:
> - Fix the bug in a more direct way by moving the call to
>    sync_child_event and bring back the task param to
>    perf_event_exit_event.
>    This approach avoids the event unscheduling issue in v2.
> 
> Changes in v2:
> - Only trigger read event on task exit.
> - Rename perf_event_exit_event to perf_event_detach_event.
> - Link to v2: https://lore.kernel.org/all/20250817132742.85154-1-thaumy.love@gmail.com/
> 
> Changes in v1:
> - Set TASK_TOMBSTONE after the read event is triggered.
> - Link to v1: https://lore.kernel.org/all/20250720000424.12572-1-thaumy.love@gmail.com/
> 
>   kernel/events/core.c | 23 ++++++++++++++---------
>   1 file changed, 14 insertions(+), 9 deletions(-)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 177e57c1a362..618e7947c358 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2316,7 +2316,8 @@ static void perf_group_detach(struct perf_event *event)
>   	perf_event__header_size(leader);
>   }
> 
> -static void sync_child_event(struct perf_event *child_event);
> +static void sync_child_event(struct perf_event *child_event,
> +			     struct task_struct *task);
> 
>   static void perf_child_detach(struct perf_event *event)
>   {
> @@ -2336,7 +2337,6 @@ static void perf_child_detach(struct perf_event *event)
>   	lockdep_assert_held(&parent_event->child_mutex);
>   	 */
> 
> -	sync_child_event(event);
>   	list_del_init(&event->child_list);
>   }
> 
> @@ -4587,6 +4587,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
>   static void perf_remove_from_owner(struct perf_event *event);
>   static void perf_event_exit_event(struct perf_event *event,
>   				  struct perf_event_context *ctx,
> +				  struct task_struct *task,
>   				  bool revoke);
> 
>   /*
> @@ -4614,7 +4615,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
> 
>   		modified = true;
> 
> -		perf_event_exit_event(event, ctx, false);
> +		perf_event_exit_event(event, ctx, ctx->task, false);
>   	}
> 
>   	raw_spin_lock_irqsave(&ctx->lock, flags);
> @@ -12437,7 +12438,7 @@ static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
>   	/*
>   	 * De-schedule the event and mark it REVOKED.
>   	 */
> -	perf_event_exit_event(event, ctx, true);
> +	perf_event_exit_event(event, ctx, ctx->task, true);
> 
>   	/*
>   	 * All _free_event() bits that rely on event->pmu:
> @@ -13994,14 +13995,13 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
>   }
>   EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
> 
> -static void sync_child_event(struct perf_event *child_event)
> +static void sync_child_event(struct perf_event *child_event,
> +			     struct task_struct *task)
>   {
>   	struct perf_event *parent_event = child_event->parent;
>   	u64 child_val;
> 
>   	if (child_event->attr.inherit_stat) {
> -		struct task_struct *task = child_event->ctx->task;
> -
>   		if (task && task != TASK_TOMBSTONE)
>   			perf_event_read_event(child_event, task);
>   	}
> @@ -14020,7 +14020,9 @@ static void sync_child_event(struct perf_event *child_event)
> 
>   static void
>   perf_event_exit_event(struct perf_event *event,
> -		      struct perf_event_context *ctx, bool revoke)
> +		      struct perf_event_context *ctx,
> +		      struct task_struct *task,
> +		      bool revoke)
>   {
>   	struct perf_event *parent_event = event->parent;
>   	unsigned long detach_flags = DETACH_EXIT;
> @@ -14043,6 +14045,9 @@ perf_event_exit_event(struct perf_event *event,
>   		mutex_lock(&parent_event->child_mutex);
>   		/* PERF_ATTACH_ITRACE might be set concurrently */
>   		attach_state = READ_ONCE(event->attach_state);
> +
> +		if (attach_state & PERF_ATTACH_CHILD)
> +			sync_child_event(event, task);

Hi Thaumy and Peter,

I've been looking into a regression caused by this commit and didn't 
manage to come up with a fix. But shouldn't this be something more like:

   if (attach_state & PERF_ATTACH_CHILD && event_filter_match(event))
       sync_child_event(event, task);

As in, you only want to call sync_child_event() and write stuff to the 
ring buffer for the CPU that is currently running this exit handler? 
Although this change affects the 'total_time_enabled' tracking as well, 
but I'm not 100% sure if we're not double counting it anyway.

 From perf_event_exit_task_context(), perf_event_exit_event() is called 
on all events, which includes events on other CPUs:

   list_for_each_entry_safe(child_event, next, &ctx->event_list, ...)
     perf_event_exit_event(child_event, ctx, exit ? task : NULL, false);

Then we write into those other CPU's ring buffers, which don't support 
concurrency.

The reason I found this is because we have a tracing test that spawns 
some threads and then looks for PERF_RECORD_AUX events. When there are 
concurrent writes into the ring buffers, rb->nest tracking gets messed 
up leaving the count positive even after all nested writers have 
finished. Then all future writes don't copy the data_head pointer to the 
user page (because it thinks someone else is writing), so Perf doesn't 
copy out any data anymore leaving records missing.

An easy reproducer is to put a warning that the ring buffer being 
written to is the correct one:

   @@ -41,10 +41,11 @@ static void perf_output_get_handle(struct
   perf_output_handle *handle)
   {
  	struct perf_buffer *rb = handle->rb;

  	preempt_disable();

   +	WARN_ON(handle->event->cpu != smp_processor_id());


And then record:

   perf record -s -- stress -c 8 -t 1

Which results in:

   perf_output_begin+0x320/0x480 (P)
   perf_event_exit_event+0x178/0x2c0
   perf_event_exit_task_context+0x214/0x2f0
   perf_event_exit_task+0xb0/0x3b0
   do_exit+0x1bc/0x808
   __arm64_sys_exit+0x28/0x30
   invoke_syscall+0x4c/0xe8
   el0_svc_common+0x9c/0xf0
   do_el0_svc+0x28/0x40
   el0_svc+0x50/0x240
   el0t_64_sync_handler+0x78/0x130
   el0t_64_sync+0x198/0x1a0

I suppose there is a chance that this is only an issue when also doing 
perf_aux_output_begin()/perf_aux_output_end() from start/stop because 
that's where I saw the real race? Maybe without that, accessing the rb 
from another CPU is ok because there is some locking, but I think this 
might be a more general issue.

Thanks
James


>   	}
> 
>   	if (revoke)
> @@ -14134,7 +14139,7 @@ static void perf_event_exit_task_context(struct task_struct *task, bool exit)
>   		perf_event_task(task, ctx, 0);
> 
>   	list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry)
> -		perf_event_exit_event(child_event, ctx, false);
> +		perf_event_exit_event(child_event, ctx, exit ? task : NULL, false);
> 
>   	mutex_unlock(&ctx->mutex);
> 
> --
> 2.51.0
> 


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] perf/core: Fix missing read event generation on task exit
  2026-02-06 11:21 ` James Clark
@ 2026-02-06 15:29   ` Peter Zijlstra
  2026-02-09 16:12     ` James Clark
  0 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2026-02-06 15:29 UTC (permalink / raw)
  To: James Clark
  Cc: Thaumy Cheng, linux-perf-users, linux-kernel, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Ian Rogers, Adrian Hunter,
	Kan Liang, Suzuki K Poulose, Leo Yan, Mike Leach

On Fri, Feb 06, 2026 at 11:21:19AM +0000, James Clark wrote:

> I've been looking into a regression caused by this commit and didn't manage
> to come up with a fix. But shouldn't this be something more like:
> 
>   if (attach_state & PERF_ATTACH_CHILD && event_filter_match(event))
>       sync_child_event(event, task);
> 
> As in, you only want to call sync_child_event() and write stuff to the ring
> buffer for the CPU that is currently running this exit handler? Although
> this change affects the 'total_time_enabled' tracking as well, but I'm not
> 100% sure if we're not double counting it anyway.
> 
> From perf_event_exit_task_context(), perf_event_exit_event() is called on
> all events, which includes events on other CPUs:
> 
>   list_for_each_entry_safe(child_event, next, &ctx->event_list, ...)
>     perf_event_exit_event(child_event, ctx, exit ? task : NULL, false);
> 
> Then we write into those other CPU's ring buffers, which don't support
> concurrency.
> 
> The reason I found this is because we have a tracing test that spawns some
> threads and then looks for PERF_RECORD_AUX events. When there are concurrent
> writes into the ring buffers, rb->nest tracking gets messed up leaving the
> count positive even after all nested writers have finished. Then all future
> writes don't copy the data_head pointer to the user page (because it thinks
> someone else is writing), so Perf doesn't copy out any data anymore leaving
> records missing.
> 
> An easy reproducer is to put a warning that the ring buffer being written to
> is the correct one:
> 
>   @@ -41,10 +41,11 @@ static void perf_output_get_handle(struct
>   perf_output_handle *handle)
>   {
>  	struct perf_buffer *rb = handle->rb;
> 
>  	preempt_disable();
> 
>   +	WARN_ON(handle->event->cpu != smp_processor_id());
> 
> 
> And then record:
> 
>   perf record -s -- stress -c 8 -t 1
> 
> Which results in:
> 
>   perf_output_begin+0x320/0x480 (P)
>   perf_event_exit_event+0x178/0x2c0
>   perf_event_exit_task_context+0x214/0x2f0
>   perf_event_exit_task+0xb0/0x3b0
>   do_exit+0x1bc/0x808
>   __arm64_sys_exit+0x28/0x30
>   invoke_syscall+0x4c/0xe8
>   el0_svc_common+0x9c/0xf0
>   do_el0_svc+0x28/0x40
>   el0_svc+0x50/0x240
>   el0t_64_sync_handler+0x78/0x130
>   el0t_64_sync+0x198/0x1a0
> 
> I suppose there is a chance that this is only an issue when also doing
> perf_aux_output_begin()/perf_aux_output_end() from start/stop because that's
> where I saw the real race? Maybe without that, accessing the rb from another
> CPU is ok because there is some locking, but I think this might be a more
> general issue.

I *think* something like so.

Before the patch in question this would never happen, because of calling
things too late and always hitting that TASK_TOMBSTONE.

But irrespective of emitting that event, we do want to propagate the
count and runtime numbers.


---
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5b5cb620499e..f566ad55b4fb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -14086,7 +14086,7 @@ static void sync_child_event(struct perf_event *child_event,
 	u64 child_val;
 
 	if (child_event->attr.inherit_stat) {
-		if (task && task != TASK_TOMBSTONE)
+		if (task && task != TASK_TOMBSTONE && event_filter_match(child_event))
 			perf_event_read_event(child_event, task);
 	}
 

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] perf/core: Fix missing read event generation on task exit
  2026-02-06 15:29   ` Peter Zijlstra
@ 2026-02-09 16:12     ` James Clark
  2026-03-03 14:29       ` Peter Zijlstra
  0 siblings, 1 reply; 10+ messages in thread
From: James Clark @ 2026-02-09 16:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thaumy Cheng, linux-perf-users, linux-kernel, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Ian Rogers, Adrian Hunter,
	Kan Liang, Suzuki K Poulose, Leo Yan, Mike Leach



On 06/02/2026 3:29 pm, Peter Zijlstra wrote:
> On Fri, Feb 06, 2026 at 11:21:19AM +0000, James Clark wrote:
> 
>> I've been looking into a regression caused by this commit and didn't manage
>> to come up with a fix. But shouldn't this be something more like:
>>
>>    if (attach_state & PERF_ATTACH_CHILD && event_filter_match(event))
>>        sync_child_event(event, task);
>>
>> As in, you only want to call sync_child_event() and write stuff to the ring
>> buffer for the CPU that is currently running this exit handler? Although
>> this change affects the 'total_time_enabled' tracking as well, but I'm not
>> 100% sure if we're not double counting it anyway.
>>
>>  From perf_event_exit_task_context(), perf_event_exit_event() is called on
>> all events, which includes events on other CPUs:
>>
>>    list_for_each_entry_safe(child_event, next, &ctx->event_list, ...)
>>      perf_event_exit_event(child_event, ctx, exit ? task : NULL, false);
>>
>> Then we write into those other CPU's ring buffers, which don't support
>> concurrency.
>>
>> The reason I found this is because we have a tracing test that spawns some
>> threads and then looks for PERF_RECORD_AUX events. When there are concurrent
>> writes into the ring buffers, rb->nest tracking gets messed up leaving the
>> count positive even after all nested writers have finished. Then all future
>> writes don't copy the data_head pointer to the user page (because it thinks
>> someone else is writing), so Perf doesn't copy out any data anymore leaving
>> records missing.
>>
>> An easy reproducer is to put a warning that the ring buffer being written to
>> is the correct one:
>>
>>    @@ -41,10 +41,11 @@ static void perf_output_get_handle(struct
>>    perf_output_handle *handle)
>>    {
>>   	struct perf_buffer *rb = handle->rb;
>>
>>   	preempt_disable();
>>
>>    +	WARN_ON(handle->event->cpu != smp_processor_id());
>>
>>
>> And then record:
>>
>>    perf record -s -- stress -c 8 -t 1
>>
>> Which results in:
>>
>>    perf_output_begin+0x320/0x480 (P)
>>    perf_event_exit_event+0x178/0x2c0
>>    perf_event_exit_task_context+0x214/0x2f0
>>    perf_event_exit_task+0xb0/0x3b0
>>    do_exit+0x1bc/0x808
>>    __arm64_sys_exit+0x28/0x30
>>    invoke_syscall+0x4c/0xe8
>>    el0_svc_common+0x9c/0xf0
>>    do_el0_svc+0x28/0x40
>>    el0_svc+0x50/0x240
>>    el0t_64_sync_handler+0x78/0x130
>>    el0t_64_sync+0x198/0x1a0
>>
>> I suppose there is a chance that this is only an issue when also doing
>> perf_aux_output_begin()/perf_aux_output_end() from start/stop because that's
>> where I saw the real race? Maybe without that, accessing the rb from another
>> CPU is ok because there is some locking, but I think this might be a more
>> general issue.
> 
> I *think* something like so.
> 
> Before the patch in question this would never happen, because of calling
> things too late and always hitting that TASK_TOMBSTONE.
> 
> But irrespective of emitting that event, we do want to propagate the
> count and runtime numbers.
> 
> 
> ---
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 5b5cb620499e..f566ad55b4fb 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -14086,7 +14086,7 @@ static void sync_child_event(struct perf_event *child_event,
>   	u64 child_val;
>   
>   	if (child_event->attr.inherit_stat) {
> -		if (task && task != TASK_TOMBSTONE)
> +		if (task && task != TASK_TOMBSTONE && event_filter_match(child_event))
>   			perf_event_read_event(child_event, task);
>   	}
>   

Turns out I tested this before with "child_event->cpu == 
raw_smp_processor_id()" rather than using event_filter_match() so I 
missed that the loop over all the events needs to be wrapped with 
preempt_disable(). But that can't be done because 
perf_event_exit_event() takes a mutex.

I don't think the preempt_disable() can be on any smaller region than 
outside the entire loop otherwise you can get rescheduled between 
event_filter_match() checks and end up failing them all and not writing 
any event out at all.

While debugging I also noticed another issue with these per-thread count 
records. perf_event_exit_event() only does anything if the event has a 
parent. But the context switch optimization means that sometimes threads 
re-use the original event which has no parent. So randomly you get 
threads that are missing from the output.

There is a comment that mentions this under the parent check:

   if (parent_event) {
     /*
      * Do not destroy the 'original' grouping; because of the
      * context switch optimization the original events could've
      * ended up in a random child task.

But I'm not sure if that was supposed to be worked around some other way 
and it's now broken, or it was a known limitation of the implementation 
from the beginning? But right now it randomly misses one of the threads 
and includes the main thread counts, or includes all the threads and 
doesn't include the main thread counts if no context switch optimisation 
was done.

The perf record docs don't say anything that you wouldn't expect all 
threads to be there:

     -s, --stat            per thread counts


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] perf/core: Fix missing read event generation on task exit
  2026-02-09 16:12     ` James Clark
@ 2026-03-03 14:29       ` Peter Zijlstra
  2026-03-04 11:26         ` James Clark
                           ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Peter Zijlstra @ 2026-03-03 14:29 UTC (permalink / raw)
  To: James Clark
  Cc: Thaumy Cheng, linux-perf-users, linux-kernel, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Ian Rogers, Adrian Hunter,
	Kan Liang, Suzuki K Poulose, Leo Yan, Mike Leach


Sorry, things got lost again :-(

On Mon, Feb 09, 2026 at 04:12:39PM +0000, James Clark wrote:

> > >    @@ -41,10 +41,11 @@ static void perf_output_get_handle(struct
> > >    perf_output_handle *handle)
> > >    {
> > >   	struct perf_buffer *rb = handle->rb;
> > > 
> > >   	preempt_disable();
> > > 
> > >    +	WARN_ON(handle->event->cpu != smp_processor_id());
> > > 

> > ---
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 5b5cb620499e..f566ad55b4fb 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -14086,7 +14086,7 @@ static void sync_child_event(struct perf_event *child_event,
> >   	u64 child_val;
> >   	if (child_event->attr.inherit_stat) {
> > -		if (task && task != TASK_TOMBSTONE)
> > +		if (task && task != TASK_TOMBSTONE && event_filter_match(child_event))
> >   			perf_event_read_event(child_event, task);
> >   	}
> 
> Turns out I tested this before with "child_event->cpu ==
> raw_smp_processor_id()" rather than using event_filter_match() so I missed
> that the loop over all the events needs to be wrapped with
> preempt_disable(). But that can't be done because perf_event_exit_event()
> takes a mutex.
> 
> I don't think the preempt_disable() can be on any smaller region than
> outside the entire loop otherwise you can get rescheduled between
> event_filter_match() checks and end up failing them all and not writing any
> event out at all.

Ooh, cute. That's annoying. This is specific to using per-task-per-cpu
buffers afaict, nevertheless those ought to work.

Also, I suspect your WARN would trigger before c418d8b4d7a4 as well, if
it wouldn't have been for that TOMBSTONE thing.

How to fix this mess... perhaps something like this, except now I worry
about the revoke and remove_on_exec cases; do they want something?


---
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 03ced7aad309..d0aae76cabf4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4727,7 +4727,6 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 static void perf_remove_from_owner(struct perf_event *event);
 static void perf_event_exit_event(struct perf_event *event,
 				  struct perf_event_context *ctx,
-				  struct task_struct *task,
 				  bool revoke);
 
 /*
@@ -4755,7 +4754,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
 
 		modified = true;
 
-		perf_event_exit_event(event, ctx, ctx->task, false);
+		perf_event_exit_event(event, ctx, false);
 	}
 
 	raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -12863,7 +12862,7 @@ static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
 	/*
 	 * De-schedule the event and mark it REVOKED.
 	 */
-	perf_event_exit_event(event, ctx, ctx->task, true);
+	perf_event_exit_event(event, ctx, true);
 
 	/*
 	 * All _free_event() bits that rely on event->pmu:
@@ -14424,17 +14423,11 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
 }
 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
 
-static void sync_child_event(struct perf_event *child_event,
-			     struct task_struct *task)
+static void sync_child_event(struct perf_event *child_event)
 {
 	struct perf_event *parent_event = child_event->parent;
 	u64 child_val;
 
-	if (child_event->attr.inherit_stat) {
-		if (task && task != TASK_TOMBSTONE)
-			perf_event_read_event(child_event, task);
-	}
-
 	child_val = perf_event_count(child_event, false);
 
 	/*
@@ -14450,7 +14443,6 @@ static void sync_child_event(struct perf_event *child_event,
 static void
 perf_event_exit_event(struct perf_event *event,
 		      struct perf_event_context *ctx,
-		      struct task_struct *task,
 		      bool revoke)
 {
 	struct perf_event *parent_event = event->parent;
@@ -14476,7 +14468,7 @@ perf_event_exit_event(struct perf_event *event,
 		attach_state = READ_ONCE(event->attach_state);
 
 		if (attach_state & PERF_ATTACH_CHILD)
-			sync_child_event(event, task);
+			sync_child_event(event);
 	}
 
 	if (revoke)
@@ -14517,7 +14509,7 @@ perf_event_exit_event(struct perf_event *event,
 static void perf_event_exit_task_context(struct task_struct *task, bool exit)
 {
 	struct perf_event_context *ctx, *clone_ctx = NULL;
-	struct perf_event *child_event, *next;
+	struct perf_event *event, *next;
 
 	ctx = perf_pin_task_context(task);
 	if (!ctx)
@@ -14564,11 +14556,21 @@ static void perf_event_exit_task_context(struct task_struct *task, bool exit)
 	 * won't get any samples after PERF_RECORD_EXIT. We can however still
 	 * get a few PERF_RECORD_READ events.
 	 */
-	if (exit)
+	if (exit) {
 		perf_event_task(task, ctx, 0);
 
-	list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry)
-		perf_event_exit_event(child_event, ctx, exit ? task : NULL, false);
+		guard(raw_spinlock_irq)(&ctx->lock);
+		list_for_each_entry(event, &ctx->event_list, event_entry) {
+			if (event->attr.inherit_stat) {
+				if (task && task != TASK_TOMBSTONE &&
+				    event_filter_match(event))
+					perf_event_read_event(event, task);
+			}
+		}
+	}
+
+	list_for_each_entry_safe(event, next, &ctx->event_list, event_entry)
+		perf_event_exit_event(event, ctx, false);
 
 	mutex_unlock(&ctx->mutex);
 

> While debugging I also noticed another issue with these per-thread count
> records. perf_event_exit_event() only does anything if the event has a
> parent. But the context switch optimization means that sometimes threads
> re-use the original event which has no parent. So randomly you get threads
> that are missing from the output.
> 
> There is a comment that mentions this under the parent check:
> 
>   if (parent_event) {
>     /*
>      * Do not destroy the 'original' grouping; because of the
>      * context switch optimization the original events could've
>      * ended up in a random child task.
> 
> But I'm not sure if that was supposed to be worked around some other way and
> it's now broken, or it was a known limitation of the implementation from the
> beginning? But right now it randomly misses one of the threads and includes
> the main thread counts, or includes all the threads and doesn't include the
> main thread counts if no context switch optimisation was done.
> 
> The perf record docs don't say anything that you wouldn't expect all threads
> to be there:
> 
>     -s, --stat            per thread counts

Ha! So commit bfbd3381e63a ("perf_counter: Implement more accurate per
task statistics") actually mentions that issue and tries to mitigate,
but I think I indeed missed a case.

I suppose the easiest fix is this, but urgh...

---
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 03ced7aad309..c38acf1e2f43 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3664,6 +3664,9 @@ static int context_equiv(struct perf_event_context *ctx1,
 	if (ctx1->pin_count || ctx2->pin_count)
 		return 0;
 
+	if (ctx1->nr_stat || ctx2->nr_stat)
+		return 0;
+
 	/* If ctx1 is the parent of ctx2 */
 	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
 		return 1;
@@ -3677,76 +3680,13 @@ static int context_equiv(struct perf_event_context *ctx1,
 	 * hierarchy, see perf_event_init_context().
 	 */
 	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
-			ctx1->parent_gen == ctx2->parent_gen)
+	    ctx1->parent_gen == ctx2->parent_gen)
 		return 1;
 
 	/* Unmatched */
 	return 0;
 }
 
-static void __perf_event_sync_stat(struct perf_event *event,
-				     struct perf_event *next_event)
-{
-	u64 value;
-
-	if (!event->attr.inherit_stat)
-		return;
-
-	/*
-	 * Update the event value, we cannot use perf_event_read()
-	 * because we're in the middle of a context switch and have IRQs
-	 * disabled, which upsets smp_call_function_single(), however
-	 * we know the event must be on the current CPU, therefore we
-	 * don't need to use it.
-	 */
-	perf_pmu_read(event);
-
-	perf_event_update_time(event);
-
-	/*
-	 * In order to keep per-task stats reliable we need to flip the event
-	 * values when we flip the contexts.
-	 */
-	value = local64_read(&next_event->count);
-	value = local64_xchg(&event->count, value);
-	local64_set(&next_event->count, value);
-
-	swap(event->total_time_enabled, next_event->total_time_enabled);
-	swap(event->total_time_running, next_event->total_time_running);
-
-	/*
-	 * Since we swizzled the values, update the user visible data too.
-	 */
-	perf_event_update_userpage(event);
-	perf_event_update_userpage(next_event);
-}
-
-static void perf_event_sync_stat(struct perf_event_context *ctx,
-				   struct perf_event_context *next_ctx)
-{
-	struct perf_event *event, *next_event;
-
-	if (!ctx->nr_stat)
-		return;
-
-	update_context_time(ctx);
-
-	event = list_first_entry(&ctx->event_list,
-				   struct perf_event, event_entry);
-
-	next_event = list_first_entry(&next_ctx->event_list,
-					struct perf_event, event_entry);
-
-	while (&event->event_entry != &ctx->event_list &&
-	       &next_event->event_entry != &next_ctx->event_list) {
-
-		__perf_event_sync_stat(event, next_event);
-
-		event = list_next_entry(event, event_entry);
-		next_event = list_next_entry(next_event, event_entry);
-	}
-}
-
 static void perf_ctx_sched_task_cb(struct perf_event_context *ctx,
 				   struct task_struct *task, bool sched_in)
 {
@@ -3835,8 +3775,6 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
 			RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
 
 			do_switch = 0;
-
-			perf_event_sync_stat(ctx, next_ctx);
 		}
 		raw_spin_unlock(&next_ctx->lock);
 		raw_spin_unlock(&ctx->lock);

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] perf/core: Fix missing read event generation on task exit
  2026-03-03 14:29       ` Peter Zijlstra
@ 2026-03-04 11:26         ` James Clark
  2026-03-04 15:23           ` Peter Zijlstra
  2026-03-10 10:59         ` James Clark
  2026-03-10 18:25         ` Ian Rogers
  2 siblings, 1 reply; 10+ messages in thread
From: James Clark @ 2026-03-04 11:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thaumy Cheng, linux-perf-users, linux-kernel, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Ian Rogers, Adrian Hunter,
	Kan Liang, Suzuki K Poulose, Leo Yan, Mike Leach



On 03/03/2026 2:29 pm, Peter Zijlstra wrote:
> 
> Sorry, things got lost again :-(
> 
> On Mon, Feb 09, 2026 at 04:12:39PM +0000, James Clark wrote:
> 
>>>>     @@ -41,10 +41,11 @@ static void perf_output_get_handle(struct
>>>>     perf_output_handle *handle)
>>>>     {
>>>>    	struct perf_buffer *rb = handle->rb;
>>>>
>>>>    	preempt_disable();
>>>>
>>>>     +	WARN_ON(handle->event->cpu != smp_processor_id());
>>>>
> 
>>> ---
>>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>>> index 5b5cb620499e..f566ad55b4fb 100644
>>> --- a/kernel/events/core.c
>>> +++ b/kernel/events/core.c
>>> @@ -14086,7 +14086,7 @@ static void sync_child_event(struct perf_event *child_event,
>>>    	u64 child_val;
>>>    	if (child_event->attr.inherit_stat) {
>>> -		if (task && task != TASK_TOMBSTONE)
>>> +		if (task && task != TASK_TOMBSTONE && event_filter_match(child_event))
>>>    			perf_event_read_event(child_event, task);
>>>    	}
>>
>> Turns out I tested this before with "child_event->cpu ==
>> raw_smp_processor_id()" rather than using event_filter_match() so I missed
>> that the loop over all the events needs to be wrapped with
>> preempt_disable(). But that can't be done because perf_event_exit_event()
>> takes a mutex.
>>
>> I don't think the preempt_disable() can be on any smaller region than
>> outside the entire loop otherwise you can get rescheduled between
>> event_filter_match() checks and end up failing them all and not writing any
>> event out at all.
> 
> Ooh, cute. That's annoying. This is specific to using per-task-per-cpu
> buffers afaict, nevertheless those ought to work.
> 
> Also, I suspect your WARN would trigger before c418d8b4d7a4 as well, if
> it wouldn't have been for that TOMBSTONE thing.
> 
> How to fix this mess... perhaps something like this, except now I worry
> about the revoke and remove_on_exec cases; do they want something?
> 
> 
> ---
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 03ced7aad309..d0aae76cabf4 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -4727,7 +4727,6 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
>   static void perf_remove_from_owner(struct perf_event *event);
>   static void perf_event_exit_event(struct perf_event *event,
>   				  struct perf_event_context *ctx,
> -				  struct task_struct *task,
>   				  bool revoke);
>   
>   /*
> @@ -4755,7 +4754,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
>   
>   		modified = true;
>   
> -		perf_event_exit_event(event, ctx, ctx->task, false);
> +		perf_event_exit_event(event, ctx, false);
>   	}
>   
>   	raw_spin_lock_irqsave(&ctx->lock, flags);
> @@ -12863,7 +12862,7 @@ static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
>   	/*
>   	 * De-schedule the event and mark it REVOKED.
>   	 */
> -	perf_event_exit_event(event, ctx, ctx->task, true);
> +	perf_event_exit_event(event, ctx, true);
>   
>   	/*
>   	 * All _free_event() bits that rely on event->pmu:
> @@ -14424,17 +14423,11 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
>   }
>   EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
>   
> -static void sync_child_event(struct perf_event *child_event,
> -			     struct task_struct *task)
> +static void sync_child_event(struct perf_event *child_event)
>   {
>   	struct perf_event *parent_event = child_event->parent;
>   	u64 child_val;
>   
> -	if (child_event->attr.inherit_stat) {
> -		if (task && task != TASK_TOMBSTONE)
> -			perf_event_read_event(child_event, task);
> -	}
> -
>   	child_val = perf_event_count(child_event, false);
>   
>   	/*
> @@ -14450,7 +14443,6 @@ static void sync_child_event(struct perf_event *child_event,
>   static void
>   perf_event_exit_event(struct perf_event *event,
>   		      struct perf_event_context *ctx,
> -		      struct task_struct *task,
>   		      bool revoke)
>   {
>   	struct perf_event *parent_event = event->parent;
> @@ -14476,7 +14468,7 @@ perf_event_exit_event(struct perf_event *event,
>   		attach_state = READ_ONCE(event->attach_state);
>   
>   		if (attach_state & PERF_ATTACH_CHILD)
> -			sync_child_event(event, task);
> +			sync_child_event(event);
>   	}
>   
>   	if (revoke)
> @@ -14517,7 +14509,7 @@ perf_event_exit_event(struct perf_event *event,
>   static void perf_event_exit_task_context(struct task_struct *task, bool exit)
>   {
>   	struct perf_event_context *ctx, *clone_ctx = NULL;
> -	struct perf_event *child_event, *next;
> +	struct perf_event *event, *next;
>   
>   	ctx = perf_pin_task_context(task);
>   	if (!ctx)
> @@ -14564,11 +14556,21 @@ static void perf_event_exit_task_context(struct task_struct *task, bool exit)
>   	 * won't get any samples after PERF_RECORD_EXIT. We can however still
>   	 * get a few PERF_RECORD_READ events.
>   	 */
> -	if (exit)
> +	if (exit) {
>   		perf_event_task(task, ctx, 0);
>   
> -	list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry)
> -		perf_event_exit_event(child_event, ctx, exit ? task : NULL, false);
> +		guard(raw_spinlock_irq)(&ctx->lock);
> +		list_for_each_entry(event, &ctx->event_list, event_entry) {
> +			if (event->attr.inherit_stat) {
> +				if (task && task != TASK_TOMBSTONE &&
> +				    event_filter_match(event))
> +					perf_event_read_event(event, task);

Doesn't this only work if you happened to have an event opened on every 
CPU? Otherwise you could get rescheduled onto a core without an event 
and the event_filter_match() will fail and you'd drop the sample read 
for that exit.

> +			}
> +		}
> +	}
> +
> +	list_for_each_entry_safe(event, next, &ctx->event_list, event_entry)
> +		perf_event_exit_event(event, ctx, false);
>   
>   	mutex_unlock(&ctx->mutex);
>   
> 
>> While debugging I also noticed another issue with these per-thread count
>> records. perf_event_exit_event() only does anything if the event has a
>> parent. But the context switch optimization means that sometimes threads
>> re-use the original event which has no parent. So randomly you get threads
>> that are missing from the output.
>>
>> There is a comment that mentions this under the parent check:
>>
>>    if (parent_event) {
>>      /*
>>       * Do not destroy the 'original' grouping; because of the
>>       * context switch optimization the original events could've
>>       * ended up in a random child task.
>>
>> But I'm not sure if that was supposed to be worked around some other way and
>> it's now broken, or it was a known limitation of the implementation from the
>> beginning? But right now it randomly misses one of the threads and includes
>> the main thread counts, or includes all the threads and doesn't include the
>> main thread counts if no context switch optimisation was done.
>>
>> The perf record docs don't say anything that you wouldn't expect all threads
>> to be there:
>>
>>      -s, --stat            per thread counts
> 
> Ha! So commit bfbd3381e63a ("perf_counter: Implement more accurate per
> task statistics") actually mentions that issue and tries to mitigate,
> but I think I indeed missed a case.
> 
> I suppose the easiest fix is this, but urgh...
> 
> ---
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 03ced7aad309..c38acf1e2f43 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -3664,6 +3664,9 @@ static int context_equiv(struct perf_event_context *ctx1,
>   	if (ctx1->pin_count || ctx2->pin_count)
>   		return 0;
>   
> +	if (ctx1->nr_stat || ctx2->nr_stat)
> +		return 0;
> +
>   	/* If ctx1 is the parent of ctx2 */
>   	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
>   		return 1;
> @@ -3677,76 +3680,13 @@ static int context_equiv(struct perf_event_context *ctx1,
>   	 * hierarchy, see perf_event_init_context().
>   	 */
>   	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
> -			ctx1->parent_gen == ctx2->parent_gen)
> +	    ctx1->parent_gen == ctx2->parent_gen)
>   		return 1;
>   
>   	/* Unmatched */
>   	return 0;
>   }
>   
> -static void __perf_event_sync_stat(struct perf_event *event,
> -				     struct perf_event *next_event)
> -{
> -	u64 value;
> -
> -	if (!event->attr.inherit_stat)
> -		return;
> -
> -	/*
> -	 * Update the event value, we cannot use perf_event_read()
> -	 * because we're in the middle of a context switch and have IRQs
> -	 * disabled, which upsets smp_call_function_single(), however
> -	 * we know the event must be on the current CPU, therefore we
> -	 * don't need to use it.
> -	 */
> -	perf_pmu_read(event);
> -
> -	perf_event_update_time(event);
> -
> -	/*
> -	 * In order to keep per-task stats reliable we need to flip the event
> -	 * values when we flip the contexts.
> -	 */
> -	value = local64_read(&next_event->count);
> -	value = local64_xchg(&event->count, value);
> -	local64_set(&next_event->count, value);
> -
> -	swap(event->total_time_enabled, next_event->total_time_enabled);
> -	swap(event->total_time_running, next_event->total_time_running);
> -
> -	/*
> -	 * Since we swizzled the values, update the user visible data too.
> -	 */
> -	perf_event_update_userpage(event);
> -	perf_event_update_userpage(next_event);
> -}
> -
> -static void perf_event_sync_stat(struct perf_event_context *ctx,
> -				   struct perf_event_context *next_ctx)
> -{
> -	struct perf_event *event, *next_event;
> -
> -	if (!ctx->nr_stat)
> -		return;
> -
> -	update_context_time(ctx);
> -
> -	event = list_first_entry(&ctx->event_list,
> -				   struct perf_event, event_entry);
> -
> -	next_event = list_first_entry(&next_ctx->event_list,
> -					struct perf_event, event_entry);
> -
> -	while (&event->event_entry != &ctx->event_list &&
> -	       &next_event->event_entry != &next_ctx->event_list) {
> -
> -		__perf_event_sync_stat(event, next_event);
> -
> -		event = list_next_entry(event, event_entry);
> -		next_event = list_next_entry(next_event, event_entry);
> -	}
> -}
> -
>   static void perf_ctx_sched_task_cb(struct perf_event_context *ctx,
>   				   struct task_struct *task, bool sched_in)
>   {
> @@ -3835,8 +3775,6 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
>   			RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
>   
>   			do_switch = 0;
> -
> -			perf_event_sync_stat(ctx, next_ctx);
>   		}
>   		raw_spin_unlock(&next_ctx->lock);
>   		raw_spin_unlock(&ctx->lock);


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] perf/core: Fix missing read event generation on task exit
  2026-03-04 11:26         ` James Clark
@ 2026-03-04 15:23           ` Peter Zijlstra
  0 siblings, 0 replies; 10+ messages in thread
From: Peter Zijlstra @ 2026-03-04 15:23 UTC (permalink / raw)
  To: James Clark
  Cc: Thaumy Cheng, linux-perf-users, linux-kernel, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Ian Rogers, Adrian Hunter,
	Kan Liang, Suzuki K Poulose, Leo Yan, Mike Leach

On Wed, Mar 04, 2026 at 11:26:10AM +0000, James Clark wrote:

> > @@ -14564,11 +14556,21 @@ static void perf_event_exit_task_context(struct task_struct *task, bool exit)
> >   	 * won't get any samples after PERF_RECORD_EXIT. We can however still
> >   	 * get a few PERF_RECORD_READ events.
> >   	 */
> > -	if (exit)
> > +	if (exit) {
> >   		perf_event_task(task, ctx, 0);
> > -	list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry)
> > -		perf_event_exit_event(child_event, ctx, exit ? task : NULL, false);
> > +		guard(raw_spinlock_irq)(&ctx->lock);
> > +		list_for_each_entry(event, &ctx->event_list, event_entry) {
> > +			if (event->attr.inherit_stat) {
> > +				if (task && task != TASK_TOMBSTONE &&
> > +				    event_filter_match(event))
> > +					perf_event_read_event(event, task);
> 
> Doesn't this only work if you happened to have an event opened on every CPU?
> Otherwise you could get rescheduled onto a core without an event and the
> event_filter_match() will fail and you'd drop the sample read for that exit.

Yes, absolutely. But that is more or less what you get.

So perf has per-task buffers and per-cpu buffers. For a simple per-task
profile, the per-task buffer works fine. However, if you want inherited
tasks, they can't all emit to the same buffer -- this would be a
scalability nightmake. So in that scenario you have to have
per-task-per-cpu events and make sure everybody emits to the local
buffer.

If you 'forget' to create an event/buffer for a particular CPU the
task(s) can run on, you loose events the moment they end up there.

There just isn't much you can do about that.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] perf/core: Fix missing read event generation on task exit
  2026-03-03 14:29       ` Peter Zijlstra
  2026-03-04 11:26         ` James Clark
@ 2026-03-10 10:59         ` James Clark
  2026-03-10 18:25         ` Ian Rogers
  2 siblings, 0 replies; 10+ messages in thread
From: James Clark @ 2026-03-10 10:59 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thaumy Cheng, linux-perf-users, linux-kernel, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Ian Rogers, Adrian Hunter,
	Kan Liang, Suzuki K Poulose, Leo Yan, Mike Leach



On 03/03/2026 2:29 pm, Peter Zijlstra wrote:
> 
> Sorry, things got lost again :-(
> 
> On Mon, Feb 09, 2026 at 04:12:39PM +0000, James Clark wrote:
> 
>>>>     @@ -41,10 +41,11 @@ static void perf_output_get_handle(struct
>>>>     perf_output_handle *handle)
>>>>     {
>>>>    	struct perf_buffer *rb = handle->rb;
>>>>
>>>>    	preempt_disable();
>>>>
>>>>     +	WARN_ON(handle->event->cpu != smp_processor_id());
>>>>
> 
>>> ---
>>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>>> index 5b5cb620499e..f566ad55b4fb 100644
>>> --- a/kernel/events/core.c
>>> +++ b/kernel/events/core.c
>>> @@ -14086,7 +14086,7 @@ static void sync_child_event(struct perf_event *child_event,
>>>    	u64 child_val;
>>>    	if (child_event->attr.inherit_stat) {
>>> -		if (task && task != TASK_TOMBSTONE)
>>> +		if (task && task != TASK_TOMBSTONE && event_filter_match(child_event))
>>>    			perf_event_read_event(child_event, task);
>>>    	}
>>
>> Turns out I tested this before with "child_event->cpu ==
>> raw_smp_processor_id()" rather than using event_filter_match() so I missed
>> that the loop over all the events needs to be wrapped with
>> preempt_disable(). But that can't be done because perf_event_exit_event()
>> takes a mutex.
>>
>> I don't think the preempt_disable() can be on any smaller region than
>> outside the entire loop otherwise you can get rescheduled between
>> event_filter_match() checks and end up failing them all and not writing any
>> event out at all.
> 
> Ooh, cute. That's annoying. This is specific to using per-task-per-cpu
> buffers afaict, nevertheless those ought to work.
> 
> Also, I suspect your WARN would trigger before c418d8b4d7a4 as well, if
> it wouldn't have been for that TOMBSTONE thing.
> 
> How to fix this mess... perhaps something like this, except now I worry
> about the revoke and remove_on_exec cases; do they want something?
> 
> 

So the fixes here seem to fix the two problems for me, didn't see any 
issues in the Perf tests either. Not sure about remove_on_exec though, I 
didn't test that.

> ---
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 03ced7aad309..d0aae76cabf4 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -4727,7 +4727,6 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
>   static void perf_remove_from_owner(struct perf_event *event);
>   static void perf_event_exit_event(struct perf_event *event,
>   				  struct perf_event_context *ctx,
> -				  struct task_struct *task,
>   				  bool revoke);
>   
>   /*
> @@ -4755,7 +4754,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
>   
>   		modified = true;
>   
> -		perf_event_exit_event(event, ctx, ctx->task, false);
> +		perf_event_exit_event(event, ctx, false);
>   	}
>   
>   	raw_spin_lock_irqsave(&ctx->lock, flags);
> @@ -12863,7 +12862,7 @@ static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
>   	/*
>   	 * De-schedule the event and mark it REVOKED.
>   	 */
> -	perf_event_exit_event(event, ctx, ctx->task, true);
> +	perf_event_exit_event(event, ctx, true);
>   
>   	/*
>   	 * All _free_event() bits that rely on event->pmu:
> @@ -14424,17 +14423,11 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
>   }
>   EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
>   
> -static void sync_child_event(struct perf_event *child_event,
> -			     struct task_struct *task)
> +static void sync_child_event(struct perf_event *child_event)
>   {
>   	struct perf_event *parent_event = child_event->parent;
>   	u64 child_val;
>   
> -	if (child_event->attr.inherit_stat) {
> -		if (task && task != TASK_TOMBSTONE)
> -			perf_event_read_event(child_event, task);
> -	}
> -
>   	child_val = perf_event_count(child_event, false);
>   
>   	/*
> @@ -14450,7 +14443,6 @@ static void sync_child_event(struct perf_event *child_event,
>   static void
>   perf_event_exit_event(struct perf_event *event,
>   		      struct perf_event_context *ctx,
> -		      struct task_struct *task,
>   		      bool revoke)
>   {
>   	struct perf_event *parent_event = event->parent;
> @@ -14476,7 +14468,7 @@ perf_event_exit_event(struct perf_event *event,
>   		attach_state = READ_ONCE(event->attach_state);
>   
>   		if (attach_state & PERF_ATTACH_CHILD)
> -			sync_child_event(event, task);
> +			sync_child_event(event);
>   	}
>   
>   	if (revoke)
> @@ -14517,7 +14509,7 @@ perf_event_exit_event(struct perf_event *event,
>   static void perf_event_exit_task_context(struct task_struct *task, bool exit)
>   {
>   	struct perf_event_context *ctx, *clone_ctx = NULL;
> -	struct perf_event *child_event, *next;
> +	struct perf_event *event, *next;
>   
>   	ctx = perf_pin_task_context(task);
>   	if (!ctx)
> @@ -14564,11 +14556,21 @@ static void perf_event_exit_task_context(struct task_struct *task, bool exit)
>   	 * won't get any samples after PERF_RECORD_EXIT. We can however still
>   	 * get a few PERF_RECORD_READ events.
>   	 */
> -	if (exit)
> +	if (exit) {
>   		perf_event_task(task, ctx, 0);
>   
> -	list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry)
> -		perf_event_exit_event(child_event, ctx, exit ? task : NULL, false);
> +		guard(raw_spinlock_irq)(&ctx->lock);
> +		list_for_each_entry(event, &ctx->event_list, event_entry) {
> +			if (event->attr.inherit_stat) {
> +				if (task && task != TASK_TOMBSTONE &&
> +				    event_filter_match(event))
> +					perf_event_read_event(event, task);
> +			}
> +		}
> +	}
> +
> +	list_for_each_entry_safe(event, next, &ctx->event_list, event_entry)
> +		perf_event_exit_event(event, ctx, false);
>   
>   	mutex_unlock(&ctx->mutex);
>   
> 
>> While debugging I also noticed another issue with these per-thread count
>> records. perf_event_exit_event() only does anything if the event has a
>> parent. But the context switch optimization means that sometimes threads
>> re-use the original event which has no parent. So randomly you get threads
>> that are missing from the output.
>>
>> There is a comment that mentions this under the parent check:
>>
>>    if (parent_event) {
>>      /*
>>       * Do not destroy the 'original' grouping; because of the
>>       * context switch optimization the original events could've
>>       * ended up in a random child task.
>>
>> But I'm not sure if that was supposed to be worked around some other way and
>> it's now broken, or it was a known limitation of the implementation from the
>> beginning? But right now it randomly misses one of the threads and includes
>> the main thread counts, or includes all the threads and doesn't include the
>> main thread counts if no context switch optimisation was done.
>>
>> The perf record docs don't say anything that you wouldn't expect all threads
>> to be there:
>>
>>      -s, --stat            per thread counts
> 
> Ha! So commit bfbd3381e63a ("perf_counter: Implement more accurate per
> task statistics") actually mentions that issue and tries to mitigate,
> but I think I indeed missed a case.
> 
> I suppose the easiest fix is this, but urgh...
> 
> ---
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 03ced7aad309..c38acf1e2f43 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -3664,6 +3664,9 @@ static int context_equiv(struct perf_event_context *ctx1,
>   	if (ctx1->pin_count || ctx2->pin_count)
>   		return 0;
>   
> +	if (ctx1->nr_stat || ctx2->nr_stat)
> +		return 0;
> +
>   	/* If ctx1 is the parent of ctx2 */
>   	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
>   		return 1;
> @@ -3677,76 +3680,13 @@ static int context_equiv(struct perf_event_context *ctx1,
>   	 * hierarchy, see perf_event_init_context().
>   	 */
>   	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
> -			ctx1->parent_gen == ctx2->parent_gen)
> +	    ctx1->parent_gen == ctx2->parent_gen)
>   		return 1;
>   
>   	/* Unmatched */
>   	return 0;
>   }
>   
> -static void __perf_event_sync_stat(struct perf_event *event,
> -				     struct perf_event *next_event)
> -{
> -	u64 value;
> -
> -	if (!event->attr.inherit_stat)
> -		return;
> -
> -	/*
> -	 * Update the event value, we cannot use perf_event_read()
> -	 * because we're in the middle of a context switch and have IRQs
> -	 * disabled, which upsets smp_call_function_single(), however
> -	 * we know the event must be on the current CPU, therefore we
> -	 * don't need to use it.
> -	 */
> -	perf_pmu_read(event);
> -
> -	perf_event_update_time(event);
> -
> -	/*
> -	 * In order to keep per-task stats reliable we need to flip the event
> -	 * values when we flip the contexts.
> -	 */
> -	value = local64_read(&next_event->count);
> -	value = local64_xchg(&event->count, value);
> -	local64_set(&next_event->count, value);
> -
> -	swap(event->total_time_enabled, next_event->total_time_enabled);
> -	swap(event->total_time_running, next_event->total_time_running);
> -
> -	/*
> -	 * Since we swizzled the values, update the user visible data too.
> -	 */
> -	perf_event_update_userpage(event);
> -	perf_event_update_userpage(next_event);
> -}
> -
> -static void perf_event_sync_stat(struct perf_event_context *ctx,
> -				   struct perf_event_context *next_ctx)
> -{
> -	struct perf_event *event, *next_event;
> -
> -	if (!ctx->nr_stat)
> -		return;
> -
> -	update_context_time(ctx);
> -
> -	event = list_first_entry(&ctx->event_list,
> -				   struct perf_event, event_entry);
> -
> -	next_event = list_first_entry(&next_ctx->event_list,
> -					struct perf_event, event_entry);
> -
> -	while (&event->event_entry != &ctx->event_list &&
> -	       &next_event->event_entry != &next_ctx->event_list) {
> -
> -		__perf_event_sync_stat(event, next_event);
> -
> -		event = list_next_entry(event, event_entry);
> -		next_event = list_next_entry(next_event, event_entry);
> -	}
> -}
> -
>   static void perf_ctx_sched_task_cb(struct perf_event_context *ctx,
>   				   struct task_struct *task, bool sched_in)
>   {
> @@ -3835,8 +3775,6 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
>   			RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
>   
>   			do_switch = 0;
> -
> -			perf_event_sync_stat(ctx, next_ctx);
>   		}
>   		raw_spin_unlock(&next_ctx->lock);
>   		raw_spin_unlock(&ctx->lock);


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] perf/core: Fix missing read event generation on task exit
  2026-03-03 14:29       ` Peter Zijlstra
  2026-03-04 11:26         ` James Clark
  2026-03-10 10:59         ` James Clark
@ 2026-03-10 18:25         ` Ian Rogers
  2026-03-11 10:16           ` James Clark
  2 siblings, 1 reply; 10+ messages in thread
From: Ian Rogers @ 2026-03-10 18:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: James Clark, Thaumy Cheng, linux-perf-users, linux-kernel,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Adrian Hunter, Kan Liang,
	Suzuki K Poulose, Leo Yan, Mike Leach

On Tue, Mar 3, 2026 at 6:29 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> I suppose the easiest fix is this, but urgh...
>
> ---
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 03ced7aad309..c38acf1e2f43 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -3664,6 +3664,9 @@ static int context_equiv(struct perf_event_context *ctx1,
>         if (ctx1->pin_count || ctx2->pin_count)
>                 return 0;
>
> +       if (ctx1->nr_stat || ctx2->nr_stat)
> +               return 0;
> +

So, just for fun, I ran the original patch, James's bug description
and this proposed fix through an AI using Chris Mason's kernel review
prompts. I'm sharing the output as the do_exit comments seem
legitimate given the stack trace in James' report.

Does this introduce a significant performance regression for multithreaded
workloads? By forcing a full sched_out and sched_in of the PMU on every
thread switch instead of just swapping context pointers, it removes the
performance benefits of context_equiv() for inherited stats.

Furthermore, does this patch actually address the concurrent ring buffer
corruption reported in the bug?

The corruption occurs because perf_event_exit_event() (called during do_exit)
invokes perf_output_begin() on the parent's ring buffer from the exiting child's
CPU. Because ring buffers use local_t operations which are not SMP-safe,
concurrent child exits corrupt the rb->nest tracking.

Since context_equiv() is only used during context switch
(perf_event_context_sched_out) and not during task exit, modifying it does
not prevent the concurrent ring buffer writes during do_exit.

Thanks,
Ian

>         /* If ctx1 is the parent of ctx2 */
>         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
>                 return 1;
> @@ -3677,76 +3680,13 @@ static int context_equiv(struct perf_event_context *ctx1,
>          * hierarchy, see perf_event_init_context().
>          */
>         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
> -                       ctx1->parent_gen == ctx2->parent_gen)
> +           ctx1->parent_gen == ctx2->parent_gen)
>                 return 1;
>
>         /* Unmatched */
>         return 0;
>  }
>
> -static void __perf_event_sync_stat(struct perf_event *event,
> -                                    struct perf_event *next_event)
> -{
> -       u64 value;
> -
> -       if (!event->attr.inherit_stat)
> -               return;
> -
> -       /*
> -        * Update the event value, we cannot use perf_event_read()
> -        * because we're in the middle of a context switch and have IRQs
> -        * disabled, which upsets smp_call_function_single(), however
> -        * we know the event must be on the current CPU, therefore we
> -        * don't need to use it.
> -        */
> -       perf_pmu_read(event);
> -
> -       perf_event_update_time(event);
> -
> -       /*
> -        * In order to keep per-task stats reliable we need to flip the event
> -        * values when we flip the contexts.
> -        */
> -       value = local64_read(&next_event->count);
> -       value = local64_xchg(&event->count, value);
> -       local64_set(&next_event->count, value);
> -
> -       swap(event->total_time_enabled, next_event->total_time_enabled);
> -       swap(event->total_time_running, next_event->total_time_running);
> -
> -       /*
> -        * Since we swizzled the values, update the user visible data too.
> -        */
> -       perf_event_update_userpage(event);
> -       perf_event_update_userpage(next_event);
> -}
> -
> -static void perf_event_sync_stat(struct perf_event_context *ctx,
> -                                  struct perf_event_context *next_ctx)
> -{
> -       struct perf_event *event, *next_event;
> -
> -       if (!ctx->nr_stat)
> -               return;
> -
> -       update_context_time(ctx);
> -
> -       event = list_first_entry(&ctx->event_list,
> -                                  struct perf_event, event_entry);
> -
> -       next_event = list_first_entry(&next_ctx->event_list,
> -                                       struct perf_event, event_entry);
> -
> -       while (&event->event_entry != &ctx->event_list &&
> -              &next_event->event_entry != &next_ctx->event_list) {
> -
> -               __perf_event_sync_stat(event, next_event);
> -
> -               event = list_next_entry(event, event_entry);
> -               next_event = list_next_entry(next_event, event_entry);
> -       }
> -}
> -
>  static void perf_ctx_sched_task_cb(struct perf_event_context *ctx,
>                                    struct task_struct *task, bool sched_in)
>  {
> @@ -3835,8 +3775,6 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
>                         RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
>
>                         do_switch = 0;
> -
> -                       perf_event_sync_stat(ctx, next_ctx);
>                 }
>                 raw_spin_unlock(&next_ctx->lock);
>                 raw_spin_unlock(&ctx->lock);

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] perf/core: Fix missing read event generation on task exit
  2026-03-10 18:25         ` Ian Rogers
@ 2026-03-11 10:16           ` James Clark
  0 siblings, 0 replies; 10+ messages in thread
From: James Clark @ 2026-03-11 10:16 UTC (permalink / raw)
  To: Ian Rogers, Peter Zijlstra
  Cc: Thaumy Cheng, linux-perf-users, linux-kernel, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Adrian Hunter, Kan Liang,
	Suzuki K Poulose, Leo Yan, Mike Leach



On 10/03/2026 6:25 pm, Ian Rogers wrote:
> On Tue, Mar 3, 2026 at 6:29 AM Peter Zijlstra <peterz@infradead.org> wrote:
>>
>> I suppose the easiest fix is this, but urgh...
>>
>> ---
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 03ced7aad309..c38acf1e2f43 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -3664,6 +3664,9 @@ static int context_equiv(struct perf_event_context *ctx1,
>>          if (ctx1->pin_count || ctx2->pin_count)
>>                  return 0;
>>
>> +       if (ctx1->nr_stat || ctx2->nr_stat)
>> +               return 0;
>> +
> 
> So, just for fun, I ran the original patch, James's bug description
> and this proposed fix through an AI using Chris Mason's kernel review
> prompts. I'm sharing the output as the do_exit comments seem
> legitimate given the stack trace in James' report.
> 
> Does this introduce a significant performance regression for multithreaded
> workloads? By forcing a full sched_out and sched_in of the PMU on every
> thread switch instead of just swapping context pointers, it removes the
> performance benefits of context_equiv() for inherited stats.

inherit_stat isn't default though. I suppose if you ask for more stuff 
it costs more, and I assume complete data but slightly slower is better 
than incomplete but faster.

> 
> Furthermore, does this patch actually address the concurrent ring buffer
> corruption reported in the bug?

No, the top patch fixes that issue, not the bottom one.

> 
> The corruption occurs because perf_event_exit_event() (called during do_exit)
> invokes perf_output_begin() on the parent's ring buffer from the exiting child's
> CPU. Because ring buffers use local_t operations which are not SMP-safe,
> concurrent child exits corrupt the rb->nest tracking.
> 
> Since context_equiv() is only used during context switch
> (perf_event_context_sched_out) and not during task exit, modifying it does
> not prevent the concurrent ring buffer writes during do_exit.
> 
> Thanks,
> Ian
> 
>>          /* If ctx1 is the parent of ctx2 */
>>          if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
>>                  return 1;
>> @@ -3677,76 +3680,13 @@ static int context_equiv(struct perf_event_context *ctx1,
>>           * hierarchy, see perf_event_init_context().
>>           */
>>          if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
>> -                       ctx1->parent_gen == ctx2->parent_gen)
>> +           ctx1->parent_gen == ctx2->parent_gen)
>>                  return 1;
>>
>>          /* Unmatched */
>>          return 0;
>>   }
>>
>> -static void __perf_event_sync_stat(struct perf_event *event,
>> -                                    struct perf_event *next_event)
>> -{
>> -       u64 value;
>> -
>> -       if (!event->attr.inherit_stat)
>> -               return;
>> -
>> -       /*
>> -        * Update the event value, we cannot use perf_event_read()
>> -        * because we're in the middle of a context switch and have IRQs
>> -        * disabled, which upsets smp_call_function_single(), however
>> -        * we know the event must be on the current CPU, therefore we
>> -        * don't need to use it.
>> -        */
>> -       perf_pmu_read(event);
>> -
>> -       perf_event_update_time(event);
>> -
>> -       /*
>> -        * In order to keep per-task stats reliable we need to flip the event
>> -        * values when we flip the contexts.
>> -        */
>> -       value = local64_read(&next_event->count);
>> -       value = local64_xchg(&event->count, value);
>> -       local64_set(&next_event->count, value);
>> -
>> -       swap(event->total_time_enabled, next_event->total_time_enabled);
>> -       swap(event->total_time_running, next_event->total_time_running);
>> -
>> -       /*
>> -        * Since we swizzled the values, update the user visible data too.
>> -        */
>> -       perf_event_update_userpage(event);
>> -       perf_event_update_userpage(next_event);
>> -}
>> -
>> -static void perf_event_sync_stat(struct perf_event_context *ctx,
>> -                                  struct perf_event_context *next_ctx)
>> -{
>> -       struct perf_event *event, *next_event;
>> -
>> -       if (!ctx->nr_stat)
>> -               return;
>> -
>> -       update_context_time(ctx);
>> -
>> -       event = list_first_entry(&ctx->event_list,
>> -                                  struct perf_event, event_entry);
>> -
>> -       next_event = list_first_entry(&next_ctx->event_list,
>> -                                       struct perf_event, event_entry);
>> -
>> -       while (&event->event_entry != &ctx->event_list &&
>> -              &next_event->event_entry != &next_ctx->event_list) {
>> -
>> -               __perf_event_sync_stat(event, next_event);
>> -
>> -               event = list_next_entry(event, event_entry);
>> -               next_event = list_next_entry(next_event, event_entry);
>> -       }
>> -}
>> -
>>   static void perf_ctx_sched_task_cb(struct perf_event_context *ctx,
>>                                     struct task_struct *task, bool sched_in)
>>   {
>> @@ -3835,8 +3775,6 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
>>                          RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
>>
>>                          do_switch = 0;
>> -
>> -                       perf_event_sync_stat(ctx, next_ctx);
>>                  }
>>                  raw_spin_unlock(&next_ctx->lock);
>>                  raw_spin_unlock(&ctx->lock);


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2026-03-11 10:16 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-24 17:05 [PATCH v3] perf/core: Fix missing read event generation on task exit Thaumy Cheng
2026-02-06 11:21 ` James Clark
2026-02-06 15:29   ` Peter Zijlstra
2026-02-09 16:12     ` James Clark
2026-03-03 14:29       ` Peter Zijlstra
2026-03-04 11:26         ` James Clark
2026-03-04 15:23           ` Peter Zijlstra
2026-03-10 10:59         ` James Clark
2026-03-10 18:25         ` Ian Rogers
2026-03-11 10:16           ` James Clark

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox