* [PATCH v3] trace/pid_list: optimize pid_list->lock contention
@ 2025-11-13  0:02 Yongliang Gao
  2025-11-13  0:21 ` Steven Rostedt
  2025-11-13  7:34 ` Sebastian Andrzej Siewior
  0 siblings, 2 replies; 13+ messages in thread
From: Yongliang Gao @ 2025-11-13  0:02 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, bigeasy
  Cc: linux-kernel, linux-trace-kernel, frankjpliu, Yongliang Gao,
	Huang Cun

From: Yongliang Gao <leonylgao@tencent.com>

On systems with many cores and frequent task switching, setting
set_ftrace_pid can cause heavy pid_list->lock contention and high
system (sys%) CPU usage.

For example, in a 288-core VM environment, we observed 267 CPUs
experiencing contention on pid_list->lock, with stack traces showing:

 #4 [ffffa6226fb4bc70] native_queued_spin_lock_slowpath at ffffffff99cd4b7e
 #5 [ffffa6226fb4bc90] _raw_spin_lock_irqsave at ffffffff99cd3e36
 #6 [ffffa6226fb4bca0] trace_pid_list_is_set at ffffffff99267554
 #7 [ffffa6226fb4bcc0] trace_ignore_this_task at ffffffff9925c288
 #8 [ffffa6226fb4bcd8] ftrace_filter_pid_sched_switch_probe at ffffffff99246efe
 #9 [ffffa6226fb4bcf0] __schedule at ffffffff99ccd161

Add a seqcount alongside the existing raw spinlock so that
trace_pid_list_is_set() can read the pid list locklessly and
concurrently, while writers keep exclusive access under the spinlock.

---
Changes from v2:
- Keep trace_pid_list_next() using raw_spin_lock for simplicity. [Steven Rostedt]
- Link to v2: https://lore.kernel.org/all/20251112181456.473864-1-leonylgao@gmail.com
Changes from v1:
- Fixed sleep-in-atomic issues under PREEMPT_RT. [Steven Rostedt]
- Link to v1: https://lore.kernel.org/all/20251015114952.4014352-1-leonylgao@gmail.com
---

Signed-off-by: Yongliang Gao <leonylgao@tencent.com>
Reviewed-by: Huang Cun <cunhuang@tencent.com>
---
 kernel/trace/pid_list.c | 30 +++++++++++++++++++++---------
 kernel/trace/pid_list.h |  1 +
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 090bb5ea4a19..dbee72d69d0a 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -3,6 +3,7 @@
  * Copyright (C) 2021 VMware Inc, Steven Rostedt <rostedt@goodmis.org>
  */
 #include <linux/spinlock.h>
+#include <linux/seqlock.h>
 #include <linux/irq_work.h>
 #include <linux/slab.h>
 #include "trace.h"
@@ -126,7 +127,7 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
 {
 	union upper_chunk *upper_chunk;
 	union lower_chunk *lower_chunk;
-	unsigned long flags;
+	unsigned int seq;
 	unsigned int upper1;
 	unsigned int upper2;
 	unsigned int lower;
@@ -138,14 +139,16 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
 	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
 		return false;
 
-	raw_spin_lock_irqsave(&pid_list->lock, flags);
-	upper_chunk = pid_list->upper[upper1];
-	if (upper_chunk) {
-		lower_chunk = upper_chunk->data[upper2];
-		if (lower_chunk)
-			ret = test_bit(lower, lower_chunk->data);
-	}
-	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+	do {
+		seq = read_seqcount_begin(&pid_list->seqcount);
+		ret = false;
+		upper_chunk = pid_list->upper[upper1];
+		if (upper_chunk) {
+			lower_chunk = upper_chunk->data[upper2];
+			if (lower_chunk)
+				ret = test_bit(lower, lower_chunk->data);
+		}
+	} while (read_seqcount_retry(&pid_list->seqcount, seq));
 
 	return ret;
 }
@@ -178,6 +181,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
 		return -EINVAL;
 
 	raw_spin_lock_irqsave(&pid_list->lock, flags);
+	write_seqcount_begin(&pid_list->seqcount);
 	upper_chunk = pid_list->upper[upper1];
 	if (!upper_chunk) {
 		upper_chunk = get_upper_chunk(pid_list);
@@ -199,6 +203,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
 	set_bit(lower, lower_chunk->data);
 	ret = 0;
  out:
+	write_seqcount_end(&pid_list->seqcount);
 	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
 	return ret;
 }
@@ -230,6 +235,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
 		return -EINVAL;
 
 	raw_spin_lock_irqsave(&pid_list->lock, flags);
+	write_seqcount_begin(&pid_list->seqcount);
 	upper_chunk = pid_list->upper[upper1];
 	if (!upper_chunk)
 		goto out;
@@ -250,6 +256,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
 		}
 	}
  out:
+	write_seqcount_end(&pid_list->seqcount);
 	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
 	return 0;
 }
@@ -340,8 +347,10 @@ static void pid_list_refill_irq(struct irq_work *iwork)
 
  again:
 	raw_spin_lock(&pid_list->lock);
+	write_seqcount_begin(&pid_list->seqcount);
 	upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks;
 	lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks;
+	write_seqcount_end(&pid_list->seqcount);
 	raw_spin_unlock(&pid_list->lock);
 
 	if (upper_count <= 0 && lower_count <= 0)
@@ -370,6 +379,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
 	}
 
 	raw_spin_lock(&pid_list->lock);
+	write_seqcount_begin(&pid_list->seqcount);
 	if (upper) {
 		*upper_next = pid_list->upper_list;
 		pid_list->upper_list = upper;
@@ -380,6 +390,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
 		pid_list->lower_list = lower;
 		pid_list->free_lower_chunks += lcnt;
 	}
+	write_seqcount_end(&pid_list->seqcount);
 	raw_spin_unlock(&pid_list->lock);
 
 	/*
@@ -419,6 +430,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
 	init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq);
 
 	raw_spin_lock_init(&pid_list->lock);
+	seqcount_raw_spinlock_init(&pid_list->seqcount, &pid_list->lock);
 
 	for (i = 0; i < CHUNK_ALLOC; i++) {
 		union upper_chunk *chunk;
diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h
index 62e73f1ac85f..0b45fb0eadb9 100644
--- a/kernel/trace/pid_list.h
+++ b/kernel/trace/pid_list.h
@@ -76,6 +76,7 @@ union upper_chunk {
 };
 
 struct trace_pid_list {
+	seqcount_raw_spinlock_t		seqcount;
 	raw_spinlock_t			lock;
 	struct irq_work			refill_irqwork;
 	union upper_chunk		*upper[UPPER1_SIZE]; // 1 or 2K in size
-- 
2.43.5



* Re: [PATCH v3] trace/pid_list: optimize pid_list->lock contention
  2025-11-13  0:02 [PATCH v3] trace/pid_list: optimize pid_list->lock contention Yongliang Gao
@ 2025-11-13  0:21 ` Steven Rostedt
  2025-11-13  7:34 ` Sebastian Andrzej Siewior
  1 sibling, 0 replies; 13+ messages in thread
From: Steven Rostedt @ 2025-11-13  0:21 UTC (permalink / raw)
  To: Yongliang Gao
  Cc: mhiramat, mathieu.desnoyers, bigeasy, linux-kernel,
	linux-trace-kernel, frankjpliu, Yongliang Gao, Huang Cun

On Thu, 13 Nov 2025 08:02:52 +0800
Yongliang Gao <leonylgao@gmail.com> wrote:

> From: Yongliang Gao <leonylgao@tencent.com>
> 
> When the system has many cores and task switching is frequent,
> setting set_ftrace_pid can cause frequent pid_list->lock contention
> and high system sys usage.
> 
> For example, in a 288-core VM environment, we observed 267 CPUs
> experiencing contention on pid_list->lock, with stack traces showing:
> 
>  #4 [ffffa6226fb4bc70] native_queued_spin_lock_slowpath at ffffffff99cd4b7e
>  #5 [ffffa6226fb4bc90] _raw_spin_lock_irqsave at ffffffff99cd3e36
>  #6 [ffffa6226fb4bca0] trace_pid_list_is_set at ffffffff99267554
>  #7 [ffffa6226fb4bcc0] trace_ignore_this_task at ffffffff9925c288
>  #8 [ffffa6226fb4bcd8] ftrace_filter_pid_sched_switch_probe at ffffffff99246efe
>  #9 [ffffa6226fb4bcf0] __schedule at ffffffff99ccd161
> 
> Replaces the existing spinlock with a seqlock to allow concurrent readers,
> while maintaining write exclusivity.
> 
> ---
> Changes from v2:
> - Keep trace_pid_list_next() using raw_spin_lock for simplicity. [Steven Rostedt]
> - Link to v2: https://lore.kernel.org/all/20251112181456.473864-1-leonylgao@gmail.com
> Changes from v1:
> - Fixed sleep-in-atomic issues under PREEMPT_RT. [Steven Rostedt]
> - Link to v1: https://lore.kernel.org/all/20251015114952.4014352-1-leonylgao@gmail.com
> ---

You don't need to resend, but the "Changes from" needs to go below the
'---' after the tags. Otherwise, git am removes everything from the above
'---', including your tags below.

-- Steve


> 
> Signed-off-by: Yongliang Gao <leonylgao@tencent.com>
> Reviewed-by: Huang Cun <cunhuang@tencent.com>
> ---
>  kernel/trace/pid_list.c | 30 +++++++++++++++++++++---------
>  kernel/trace/pid_list.h |  1 +
>  2 files changed, 22 insertions(+), 9 deletions(-)
> 


* Re: [PATCH v3] trace/pid_list: optimize pid_list->lock contention
  2025-11-13  0:02 [PATCH v3] trace/pid_list: optimize pid_list->lock contention Yongliang Gao
  2025-11-13  0:21 ` Steven Rostedt
@ 2025-11-13  7:34 ` Sebastian Andrzej Siewior
  2025-11-13 11:13   ` Yongliang Gao
  2025-11-13 15:35   ` Steven Rostedt
  1 sibling, 2 replies; 13+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-11-13  7:34 UTC (permalink / raw)
  To: Yongliang Gao
  Cc: rostedt, mhiramat, mathieu.desnoyers, linux-kernel,
	linux-trace-kernel, frankjpliu, Yongliang Gao, Huang Cun

On 2025-11-13 08:02:52 [+0800], Yongliang Gao wrote:
> --- a/kernel/trace/pid_list.c
> +++ b/kernel/trace/pid_list.c
> @@ -138,14 +139,16 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
>  	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
>  		return false;
>  
> -	raw_spin_lock_irqsave(&pid_list->lock, flags);
> -	upper_chunk = pid_list->upper[upper1];
> -	if (upper_chunk) {
> -		lower_chunk = upper_chunk->data[upper2];
> -		if (lower_chunk)
> -			ret = test_bit(lower, lower_chunk->data);
> -	}
> -	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
> +	do {
> +		seq = read_seqcount_begin(&pid_list->seqcount);
> +		ret = false;
> +		upper_chunk = pid_list->upper[upper1];
> +		if (upper_chunk) {
> +			lower_chunk = upper_chunk->data[upper2];
> +			if (lower_chunk)
> +				ret = test_bit(lower, lower_chunk->data);
> +		}
> +	} while (read_seqcount_retry(&pid_list->seqcount, seq));

How is this better? Any numbers?
If the write side is busy and the lock is handed over from one CPU to
another then it is possible that the reader spins here and does several
loops, right?
And in this case, how accurate would it be? I mean the result could
change right after the sequence here is completed because the write side
got active again. How bad would it be if there were no locking and
RCU ensured that the chunks (and data) don't disappear while we look at
them?

>  	return ret;
>  }

Sebastian


* Re: [PATCH v3] trace/pid_list: optimize pid_list->lock contention
  2025-11-13  7:34 ` Sebastian Andrzej Siewior
@ 2025-11-13 11:13   ` Yongliang Gao
  2025-11-13 14:15     ` Sebastian Andrzej Siewior
  2025-11-13 15:35   ` Steven Rostedt
  1 sibling, 1 reply; 13+ messages in thread
From: Yongliang Gao @ 2025-11-13 11:13 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: rostedt, mhiramat, mathieu.desnoyers, linux-kernel,
	linux-trace-kernel, frankjpliu, Yongliang Gao, Huang Cun

Hi Sebastian,

Thank you for your review and the thoughtful questions.

1. Performance Data
We encountered this issue in a production environment with 288 cores
where enabling set_ftrace_pid caused system CPU usage (sys%) to
increase from 10% to over 90%. In our 92-core VM test environment:

Before patch (spinlock):
- Without filtering: cs=2395401/s, sys%=7%
- With filtering: cs=1828261/s, sys%=40%

After patch (seqlock):
- Without filtering: cs=2397032/s, sys%=6%
- With filtering: cs=2398922/s, sys%=6%

The seqlock approach eliminates the pid_list->lock contention that was
previously causing sys% to increase from 7% to 40%.

2. Reader Retry Behavior
Yes, if the write side is continuously busy, the reader might spin and
retry. However, in practice:
- Writes are infrequent (only when setting ftrace_pid filter or during
task fork/exit with function-fork enabled)
- For readers, trace_pid_list_is_set() is called on every task switch,
which can occur at a very high frequency.

3. Result Accuracy
You're correct that the result might change immediately after the
read. For trace_ignore_this_task(), we don't require absolute
accuracy. Slight race conditions (where a task might be traced or not
in borderline cases) are acceptable.

Best regards,
Yongliang

On Thu, Nov 13, 2025 at 3:34 PM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> On 2025-11-13 08:02:52 [+0800], Yongliang Gao wrote:
> > --- a/kernel/trace/pid_list.c
> > +++ b/kernel/trace/pid_list.c
> > @@ -138,14 +139,16 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
> >       if (pid_split(pid, &upper1, &upper2, &lower) < 0)
> >               return false;
> >
> > -     raw_spin_lock_irqsave(&pid_list->lock, flags);
> > -     upper_chunk = pid_list->upper[upper1];
> > -     if (upper_chunk) {
> > -             lower_chunk = upper_chunk->data[upper2];
> > -             if (lower_chunk)
> > -                     ret = test_bit(lower, lower_chunk->data);
> > -     }
> > -     raw_spin_unlock_irqrestore(&pid_list->lock, flags);
> > +     do {
> > +             seq = read_seqcount_begin(&pid_list->seqcount);
> > +             ret = false;
> > +             upper_chunk = pid_list->upper[upper1];
> > +             if (upper_chunk) {
> > +                     lower_chunk = upper_chunk->data[upper2];
> > +                     if (lower_chunk)
> > +                             ret = test_bit(lower, lower_chunk->data);
> > +             }
> > +     } while (read_seqcount_retry(&pid_list->seqcount, seq));
>
> How is this better? Any numbers?
> If the write side is busy and the lock is handed over from one CPU to
> another then it is possible that the reader spins here and does several
> loops, right?
> And in this case, how accurate would it be? I mean the result could
> change right after the sequence here is completed because the write side
> got active again. How bad would it be if there would be no locking and
> RCU ensures that the chunks (and data) don't disappear while looking at
> it?
>
> >       return ret;
> >  }
>
> Sebastian


* Re: [PATCH v3] trace/pid_list: optimize pid_list->lock contention
  2025-11-13 11:13   ` Yongliang Gao
@ 2025-11-13 14:15     ` Sebastian Andrzej Siewior
  2025-11-13 15:05       ` Steven Rostedt
  0 siblings, 1 reply; 13+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-11-13 14:15 UTC (permalink / raw)
  To: Yongliang Gao
  Cc: rostedt, mhiramat, mathieu.desnoyers, linux-kernel,
	linux-trace-kernel, frankjpliu, Yongliang Gao, Huang Cun

On 2025-11-13 19:13:23 [+0800], Yongliang Gao wrote:
> Hi Sebastian,
Hi Yongliang,

> Thank you for your review and the thoughtful questions.
> 
> 1. Performance Data
> We encountered this issue in a production environment with 288 cores
> where enabling set_ftrace_pid caused system CPU usage (sys%) to
> increase from 10% to over 90%. In our 92-core VM test environment:
> 
> Before patch (spinlock):
> - Without filtering: cs=2395401/s, sys%=7%
> - With filtering: cs=1828261/s, sys%=40%
> 
> After patch (seqlock):
> - Without filtering: cs=2397032/s, sys%=6%
> - With filtering: cs=2398922/s, sys%=6%
> 
> The seqlock approach eliminates the pid_list->lock contention that was
> previously causing sys% to increase from 7% to 40%.
> 
> 2. Reader Retry Behavior
> Yes, if the write side is continuously busy, the reader might spin and
> retry. However, in practice:
> - Writes are infrequent (only when setting ftrace_pid filter or during
> task fork/exit with function-fork enabled)
> - For readers, trace_pid_list_is_set() is called on every task switch,
> which can occur at a very high frequency.

See.

> 3. Result Accuracy
> You're correct that the result might change immediately after the
> read. For trace_ignore_this_task(), we don't require absolute
> accuracy. Slight race conditions (where a task might be traced or not
> in borderline cases) are acceptable.

I don't see why RCU shouldn't work here.
If a pid is removed then a chunk might get cleared/removed, so
upper_chunk/lower_chunk can become NULL. The buffer itself can be
reused and point to something else. That could lead to a false outcome
in test_bit(). This is handled by read_seqcount_retry().

You could assign upper1/upper2 to NULL or the buffer as now, and delay
the removal (in put_lower_chunk(), put_upper_chunk()) to RCU so it
happens after the grace period. That would allow you to iterate over it
in trace_pid_list_is_set() locklessly, since the buffer will not
disappear and will not be reused for another task until all readers
have left.

Additionally it would guarantee that the buffer is not released in
trace_pid_list_free(). I don't see how the seqcount ensures that the
buffer is not gone. I mean you could have a reader, and the retry would
force another loop, but before that happens you dereference the
upper_chunk pointer, which could have been reused.

So I *think* the RCU approach should be doable and cover this.
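
For illustration, a rough sketch of the kind of deferred reclaim I have
in mind, assuming an rcu_head plus a back pointer can be overlaid on
the start of an emptied chunk (the names below are made up, this is not
a real patch). The clear path would publish NULL and hand the chunk to
call_rcu() instead of calling put_lower_chunk() directly:

	struct chunk_rcu {
		struct rcu_head		rcu;
		struct trace_pid_list	*pid_list;
	};

	static void lower_chunk_rcu_put(struct rcu_head *head)
	{
		struct chunk_rcu *crcu = container_of(head, struct chunk_rcu, rcu);
		struct trace_pid_list *pid_list = crcu->pid_list;
		union lower_chunk *chunk = (union lower_chunk *)crcu;
		unsigned long flags;

		raw_spin_lock_irqsave(&pid_list->lock, flags);
		/* Re-zero the words that held the rcu_head before reuse. */
		memset(chunk, 0, sizeof(struct chunk_rcu));
		put_lower_chunk(pid_list, chunk);
		raw_spin_unlock_irqrestore(&pid_list->lock, flags);
	}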

> Best regards,
> Yongliang

Sebastian


* Re: [PATCH v3] trace/pid_list: optimize pid_list->lock contention
  2025-11-13 14:15     ` Sebastian Andrzej Siewior
@ 2025-11-13 15:05       ` Steven Rostedt
  2025-11-13 15:17         ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 13+ messages in thread
From: Steven Rostedt @ 2025-11-13 15:05 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Yongliang Gao, mhiramat, mathieu.desnoyers, linux-kernel,
	linux-trace-kernel, frankjpliu, Yongliang Gao, Huang Cun

On Thu, 13 Nov 2025 15:15:15 +0100
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> I don't see why RCU work shouldn't work here.
> If a pid is removed then it might result that a chunk cleared/ removed
> then upper_chunk/ lower_chunk can become NULL. The buffer itself can be
> reused and point to something else. It could lear to a false outcome in
> test_bit(). This is handled by read_seqcount_retry().
> 
> You could assign upper1, upper2, to NULL/ buffer as now and the removal
> (in put_lower_chunk(), put_upper_chunk()) delay to RCU so it happens
> after the grace period. That would allow you to iterate over it in
> trace_pid_list_is_set() lockless since the buffer will not disappear and
> will not be reused for another task until after all reader left.

The chunks are reused. When they are cleared, they go to a free-list, and
when a new chunk is needed, it is taken from the free-list so that we do
not need to allocate.

When all the bits in a chunk are cleared it moves to the free-list. This
can happen in exit, when a task exits and it clears its pid bit from the
pid_list.

Then a chunk can be allocated in fork, when a task whose pid is in the
pid_list adds its child to the list as well.

This means that the chunks are not being freed and we can't be doing
synchronize_rcu() in every exit.
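
To make the reuse concrete, the free list side is roughly the below
(simplified sketch, not a verbatim copy of kernel/trace/pid_list.c):
emptied chunks are chained onto a free list under pid_list->lock and
handed straight back out on the next set, with no kmalloc()/kfree() in
the hot path.

	/* Both helpers are called with pid_list->lock held. */
	static void put_lower_chunk(struct trace_pid_list *pid_list,
				    union lower_chunk *chunk)
	{
		chunk->next = pid_list->lower_list;
		pid_list->lower_list = chunk;
		pid_list->free_lower_chunks++;
	}

	static union lower_chunk *get_lower_chunk(struct trace_pid_list *pid_list)
	{
		union lower_chunk *chunk = pid_list->lower_list;

		if (!chunk)
			return NULL;

		pid_list->lower_list = chunk->next;
		pid_list->free_lower_chunks--;
		chunk->next = NULL;

		return chunk;
	}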

> 
> Additionally it would guarantee that the buffer is not released in
> trace_pid_list_free(). I don't see how the seqcount ensures that the
> buffer is not gone. I mean you could have a reader and the retry would
> force you to do another loop but before that happens you dereference the
> upper_chunk pointer which could be reused.

This protection has nothing to do with trace_pid_list_free(). In fact,
you'll notice that function doesn't even have any locking. That's because
the pid_list itself is removed from view and RCU synchronization happens
before that function is called.

The protection in trace_pid_list_is_set() is only to synchronize with the
adding and removing of the bits in the updates in exit and fork as well as
with the user manually writing into the set_*_pid files.

> 
> So I *think* the RCU approach should be doable and cover this.

Where would you put the synchronize_rcu()? In do_exit()?

Also understanding what this is used for helps in understanding the scope
of protection needed.

The pid_list is created when you add anything into one of the pid files in
tracefs. Let's use /sys/kernel/tracing/set_ftrace_pid:

  # cd /sys/kernel/tracing
  # echo $$ > set_ftrace_pid
  # echo 1 > options/function-fork
  # cat set_ftrace_pid
  2716
  2936
  # cat set_ftrace_pid
  2716
  2945

What the above did was to create a pid_list for the function tracer. I
added the bash process pid using $$ (2716). Then when I cat the file, it
showed the pid for the bash process as well as the pid for the cat process,
as the cat process is a child of the bash process. The function-fork option
means to add any child process to the set_ftrace_pid if the parent is
already in the list. It also means to remove the pid if a process in the
list exits.

When I enable function tracing, it will only trace the bash process and any
of its children:

 # echo 0 > tracing_on
 # echo function > current_tracer
 # cat set_ftrace_pid ; echo 0 > tracing_on
 2716
 2989
 # cat trace
[..]
            bash-2716    [003] ..... 36854.662833: rcu_read_lock_held <-mtree_range_walk
            bash-2716    [003] ..... 36854.662834: rcu_lockdep_current_cpu_online <-rcu_read_lock_held
            bash-2716    [003] ..... 36854.662834: rcu_read_lock_held <-vma_start_read
##### CPU 6 buffer started ####
             cat-2989    [006] d..2. 36854.662834: ret_from_fork <-ret_from_fork_asm
            bash-2716    [003] ..... 36854.662835: rcu_lockdep_current_cpu_online <-rcu_read_lock_held
             cat-2989    [006] d..2. 36854.662836: schedule_tail <-ret_from_fork
            bash-2716    [003] ..... 36854.662836: __rcu_read_unlock <-lock_vma_under_rcu
             cat-2989    [006] d..2. 36854.662836: finish_task_switch.isra.0 <-schedule_tail
            bash-2716    [003] ..... 36854.662836: handle_mm_fault <-do_user_addr_fault
[..]

It would be way too expensive to check the pid_list at *every* function
call. But luckily we don't have to. Instead, we set a per-cpu flag in the
instance trace_array on sched_switch if the next pid is in the pid_list and
clear it if it is not. (See ftrace_filter_pid_sched_switch_probe()).
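
In rough terms the scheme looks like this (a simplified sketch; the
per-CPU flag and helper names are illustrative, not the actual code in
ftrace.c):

	static DEFINE_PER_CPU(bool, ftrace_pid_ignore);

	static void pid_filter_sched_switch(void *data, bool preempt,
					    struct task_struct *prev,
					    struct task_struct *next)
	{
		struct trace_pid_list *pid_list = data;

		/* One pid_list lookup per context switch ... */
		__this_cpu_write(ftrace_pid_ignore,
				 !trace_pid_list_is_set(pid_list, next->pid));
	}

	/* ... and the per-function fast path only reads the cached flag: */
	static bool ftrace_pid_should_trace(void)
	{
		return !__this_cpu_read(ftrace_pid_ignore);
	}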

This means, the bit being checked in the pid_list is always for a task that
is about to run.

The bit being cleared, is always for that task that is exiting (except for
the case of manual updates).

What we are protecting against is when one chunk is freed, but then
allocated again for a different set of PIDs: the reader still holds the
chunk, it gets freed and re-allocated, and the bit that is about to be
checked no longer represents the pid it is checking for.

    CPU1					CPU2
    ----					----

 trace_pid_list_is_set() {
   lower_chunk = upper_chunk->data[upper2];

					do_exit() {
					  trace_pid_list_clear() {
					    lower_chunk = upper_chunk->data[upper2]; // same lower_chunk
					    clear_bit(lower, lower_chunk->data);
					    if (find_first_bit(lower_chunk->data, LOWER_MAX) >= LOWER_MAX)
					      put_lower_chunk(pid_list, lower_chunk);
					    [..]
					  }
					}

					fork() {
					  trace_pid_list_set() {
					    lower_chunk = get_lower_chunk(pid_list);
					    upper_chunk->data[upper2] = lower_chunk; // upper2 is a different value here
					    set_bit(lower, lower_chunk->data);
					  }
					}

   ret = test_bit(lower, lower_chunk->data);

And if the "lower" bit matches the set_bit from CPU2, we have a false
positive. Although this race is highly unlikely, we should still protect
against it (it could happen on a VM vCPU that was preempted in
trace_pid_list_is_set()).

-- Steve


* Re: [PATCH v3] trace/pid_list: optimize pid_list->lock contention
  2025-11-13 15:05       ` Steven Rostedt
@ 2025-11-13 15:17         ` Sebastian Andrzej Siewior
  2025-11-13 15:24           ` Steven Rostedt
  0 siblings, 1 reply; 13+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-11-13 15:17 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Yongliang Gao, mhiramat, mathieu.desnoyers, linux-kernel,
	linux-trace-kernel, frankjpliu, Yongliang Gao, Huang Cun

On 2025-11-13 10:05:24 [-0500], Steven Rostedt wrote:
> This means that the chunks are not being freed and we can't be doing
> synchronize_rcu() in every exit.

You don't have to, you can do call_rcu().

> > Additionally it would guarantee that the buffer is not released in
> > trace_pid_list_free(). I don't see how the seqcount ensures that the
> > buffer is not gone. I mean you could have a reader and the retry would
> > force you to do another loop but before that happens you dereference the
> > upper_chunk pointer which could be reused.
> 
> This protection has nothing to do with trace_pid_list_free(). In fact,
> you'll notice that function doesn't even have any locking. That's because
> the pid_list itself is removed from view and RCU synchronization happens
> before that function is called.
> 
> The protection in trace_pid_list_is_set() is only to synchronize with the
> adding and removing of the bits in the updates in exit and fork as well as
> with the user manually writing into the set_*_pid files.

So if the kfree() is not an issue, it is just the use of the block from
the freelist which must not point to a wrong item? And therefore the
seqcount?

> > So I *think* the RCU approach should be doable and cover this.
> 
> Where would you put the synchronize_rcu()? In do_exit()?

simply call_rcu() and let it move to the freelist.

> Also understanding what this is used for helps in understanding the scope
> of protection needed.
> 
> The pid_list is created when you add anything into one of the pid files in
> tracefs. Let's use /sys/kernel/tracing/set_ftrace_pid:
> 
>   # cd /sys/kernel/tracing
>   # echo $$ > set_ftrace_pid
>   # echo 1 > options/function-fork
>   # cat set_ftrace_pid
>   2716
>   2936
>   # cat set_ftrace_pid
>   2716
>   2945
> 
> What the above did was to create a pid_list for the function tracer. I
> added the bash process pid using $$ (2716). Then when I cat the file, it
> showed the pid for the bash process as well as the pid for the cat process,
> as the cat process is a child of the bash process. The function-fork option
> means to add any child process to the set_ftrace_pid if the parent is
> already in the list. It also means to remove the pid if a process in the
> list exits.

This adding/ add-on-fork, removing and remove-on-exit is the only write
side?

> When I enable function tracing, it will only trace the bash process and any
> of its children:
> 
>  # echo 0 > tracing_on
>  # echo function > current_tracer
>  # cat set_ftrace_pid ; echo 0 > tracing_on
>  2716
>  2989
>  # cat trace
> [..]


>             bash-2716    [003] ..... 36854.662833: rcu_read_lock_held <-mtree_range_walk
>             bash-2716    [003] ..... 36854.662834: rcu_lockdep_current_cpu_online <-rcu_read_lock_held
>             bash-2716    [003] ..... 36854.662834: rcu_read_lock_held <-vma_start_read
> ##### CPU 6 buffer started ####
>              cat-2989    [006] d..2. 36854.662834: ret_from_fork <-ret_from_fork_asm
>             bash-2716    [003] ..... 36854.662835: rcu_lockdep_current_cpu_online <-rcu_read_lock_held
>              cat-2989    [006] d..2. 36854.662836: schedule_tail <-ret_from_fork
>             bash-2716    [003] ..... 36854.662836: __rcu_read_unlock <-lock_vma_under_rcu
>              cat-2989    [006] d..2. 36854.662836: finish_task_switch.isra.0 <-schedule_tail
>             bash-2716    [003] ..... 36854.662836: handle_mm_fault <-do_user_addr_fault
> [..]
> 
> It would be way too expensive to check the pid_list at *every* function
> call. But luckily we don't have to. Instead, we set a per-cpu flag in the
> instance trace_array on sched_switch if the next pid is in the pid_list and
> clear it if it is not. (See ftrace_filter_pid_sched_switch_probe()).
> 
> This means, the bit being checked in the pid_list is always for a task that
> is about to run.
> 
> The bit being cleared, is always for that task that is exiting (except for
> the case of manual updates).
> 
> What we are protecting against is when one chunk is freed, but then
> allocated again for a different set of PIDs. Where the reader has the chunk,
> it was freed and re-allocated and the bit that is about to be checked
> doesn't represent the bit it is checking for.

This I assumed.
And the kfree() at the end can not happen while there is still a reader?

…
> And if the "lower" bit matches the set_bit from CPU2, we have a false
> positive. Although, this race is highly unlikely, we should still protect
> against it (it could happen on a VM vCPU that was preempted in
> trace_pid_list_is_set()).
> 
> -- Steve

Sebastian


* Re: [PATCH v3] trace/pid_list: optimize pid_list->lock contention
  2025-11-13 15:17         ` Sebastian Andrzej Siewior
@ 2025-11-13 15:24           ` Steven Rostedt
  2025-11-13 15:35             ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 13+ messages in thread
From: Steven Rostedt @ 2025-11-13 15:24 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Yongliang Gao, mhiramat, mathieu.desnoyers, linux-kernel,
	linux-trace-kernel, frankjpliu, Yongliang Gao, Huang Cun

On Thu, 13 Nov 2025 16:17:29 +0100
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> On 2025-11-13 10:05:24 [-0500], Steven Rostedt wrote:
> > This means that the chunks are not being freed and we can't be doing
> > synchronize_rcu() in every exit.  
> 
> You don't have to, you can do call_rcu().

But the chunk isn't being freed. They may be used right away.


> So if the kfree() is not an issue, it is just the use of the block from
> the freelist which must not point to a wrong item? And therefore the
> seqcount?

Correct.

> 
> > > So I *think* the RCU approach should be doable and cover this.  
> > 
> > Where would you put the synchronize_rcu()? In do_exit()?  
> 
> simply call_rcu() and let it move to the freelist.

A couple of issues. One, the chunks are fully used. There's no place to put
a "rcu_head" in them. Well, we may be able to make use of them.

Second, if there's a lot of tasks exiting and forking, we can easily run
out of chunks that are waiting to be "freed" via call_rcu().

> 
> > Also understanding what this is used for helps in understanding the scope
> > of protection needed.
> > 
> > The pid_list is created when you add anything into one of the pid files in
> > tracefs. Let's use /sys/kernel/tracing/set_ftrace_pid:
> > 
> >   # cd /sys/kernel/tracing
> >   # echo $$ > set_ftrace_pid
> >   # echo 1 > options/function-fork
> >   # cat set_ftrace_pid
> >   2716
> >   2936
> >   # cat set_ftrace_pid
> >   2716
> >   2945
> > 
> > What the above did was to create a pid_list for the function tracer. I
> > added the bash process pid using $$ (2716). Then when I cat the file, it
> > showed the pid for the bash process as well as the pid for the cat process,
> > as the cat process is a child of the bash process. The function-fork option
> > means to add any child process to the set_ftrace_pid if the parent is
> > already in the list. It also means to remove the pid if a process in the
> > list exits.  
> 
> This adding/ add-on-fork, removing and remove-on-exit is the only write
> side?

That and manual writes to the set_ftrace_pid file.

> > What we are protecting against is when one chunk is freed, but then
> > allocated again for a different set of PIDs. Where the reader has the chunk,
> > it was freed and re-allocated and the bit that is about to be checked
> > doesn't represent the bit it is checking for.  
> 
> This I assumed.
> And the kfree() at the end can not happen while there is still a reader?

Correct. That's done by the pid_list user:

In clear_ftrace_pids():

	/* Wait till all users are no longer using pid filtering */
	synchronize_rcu();

	if ((type & TRACE_PIDS) && pid_list)
		trace_pid_list_free(pid_list);

	if ((type & TRACE_NO_PIDS) && no_pid_list)
		trace_pid_list_free(no_pid_list);

-- Steve



* Re: [PATCH v3] trace/pid_list: optimize pid_list->lock contention
  2025-11-13 15:24           ` Steven Rostedt
@ 2025-11-13 15:35             ` Sebastian Andrzej Siewior
  2025-11-13 15:51               ` Steven Rostedt
  0 siblings, 1 reply; 13+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-11-13 15:35 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Yongliang Gao, mhiramat, mathieu.desnoyers, linux-kernel,
	linux-trace-kernel, frankjpliu, Yongliang Gao, Huang Cun

On 2025-11-13 10:24:45 [-0500], Steven Rostedt wrote:
> On Thu, 13 Nov 2025 16:17:29 +0100
> Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> 
> > On 2025-11-13 10:05:24 [-0500], Steven Rostedt wrote:
> > > This means that the chunks are not being freed and we can't be doing
> > > synchronize_rcu() in every exit.  
> > 
> > You don't have to, you can do call_rcu().
> 
But the chunks aren't being freed. They may be used right away.

Not if you avoid using it until after the rcu callback.

> > > > So I *think* the RCU approach should be doable and cover this.  
> > > 
> > > Where would you put the synchronize_rcu()? In do_exit()?  
> > 
> > simply call_rcu() and let it move to the freelist.
> 
> A couple of issues. One, the chunks are fully used. There's no place to put
> a "rcu_head" in them. Well, we may be able to make use of them.

This could be the first (16?) bytes of the memory chunk.

> Second, if there's a lot of tasks exiting and forking, we can easily run
> out of chunks that are waiting to be "freed" via call_rcu().

but this is a general RCU problem and not new here. The task_struct and
everything around it (including stack) is RCU freed.

> > 
> > > Also understanding what this is used for helps in understanding the scope
> > > of protection needed.
> > > 
> > > The pid_list is created when you add anything into one of the pid files in
> > > tracefs. Let's use /sys/kernel/tracing/set_ftrace_pid:
> > > 
> > >   # cd /sys/kernel/tracing
> > >   # echo $$ > set_ftrace_pid
> > >   # echo 1 > options/function-fork
> > >   # cat set_ftrace_pid
> > >   2716
> > >   2936
> > >   # cat set_ftrace_pid
> > >   2716
> > >   2945
> > > 
> > > What the above did was to create a pid_list for the function tracer. I
> > > added the bash process pid using $$ (2716). Then when I cat the file, it
> > > showed the pid for the bash process as well as the pid for the cat process,
> > > as the cat process is a child of the bash process. The function-fork option
> > > means to add any child process to the set_ftrace_pid if the parent is
> > > already in the list. It also means to remove the pid if a process in the
> > > list exits.  
> > 
> > This adding/ add-on-fork, removing and remove-on-exit is the only write
> > side?
> 
> That and manual writes to the set_ftrace_pid file.

This looks minimal. I misunderstood then that context switch can
also contribute to it.

> > > What we are protecting against is when one chunk is freed, but then
> > > allocated again for a different set of PIDs. Where the reader has the chunk,
> > > it was freed and re-allocated and the bit that is about to be checked
> > > doesn't represent the bit it is checking for.  
> > 
> > This I assumed.
> > And the kfree() at the end can not happen while there is still a reader?
> 
> Correct. That's done by the pid_list user:
> 
> In clear_ftrace_pids():
> 
> 	/* Wait till all users are no longer using pid filtering */
> 	synchronize_rcu();
> 
> 	if ((type & TRACE_PIDS) && pid_list)
> 		trace_pid_list_free(pid_list);
> 
> 	if ((type & TRACE_NO_PIDS) && no_pid_list)
> 		trace_pid_list_free(no_pid_list);

And the callers of trace_pid_list_is_set() are always in the RCU read
section then? I assume so, since it wouldn't make sense otherwise.

> -- Steve

Sebastian


* Re: [PATCH v3] trace/pid_list: optimize pid_list->lock contention
  2025-11-13  7:34 ` Sebastian Andrzej Siewior
  2025-11-13 11:13   ` Yongliang Gao
@ 2025-11-13 15:35   ` Steven Rostedt
  1 sibling, 0 replies; 13+ messages in thread
From: Steven Rostedt @ 2025-11-13 15:35 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Yongliang Gao, mhiramat, mathieu.desnoyers, linux-kernel,
	linux-trace-kernel, frankjpliu, Yongliang Gao, Huang Cun

On Thu, 13 Nov 2025 08:34:20 +0100
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> > +	do {
> > +		seq = read_seqcount_begin(&pid_list->seqcount);
> > +		ret = false;
> > +		upper_chunk = pid_list->upper[upper1];
> > +		if (upper_chunk) {
> > +			lower_chunk = upper_chunk->data[upper2];
> > +			if (lower_chunk)
> > +				ret = test_bit(lower, lower_chunk->data);
> > +		}
> > +	} while (read_seqcount_retry(&pid_list->seqcount, seq));  
> 
> How is this better? Any numbers?
> If the write side is busy and the lock is handed over from one CPU to
> another then it is possible that the reader spins here and does several
> loops, right?

I think the chances of that are very slim. The writes are at fork and exit
and manually writing to one of the set_*_pid files.

The readers are at every sched_switch. Currently we just use
raw_spin_locks. But that forces a serialization of every sched_switch!
Which on big machines could cause a huge latency.

This approach allows multiple sched_switches to happen at the same time.


> And in this case, how accurate would it be? I mean the result could
> change right after the sequence here is completed because the write side
> got active again. How bad would it be if there would be no locking and
> RCU ensures that the chunks (and data) don't disappear while looking at
> it?

As I mentioned when describing the use case for this, it is very accurate. That's because
the writers are updating the pid bits for themselves. If you are checking
for pid 123, that means task 123 is about to run. If bit 123 is being added
or removed, it would only be done by task 123 or its parent.

The exception to this rule is if a user manually adds or removes a pid from
the set_*_pid file. But that has other races that we don't really care
about. It's known that the update made there may take some milliseconds to
take effect.

-- Steve


* Re: [PATCH v3] trace/pid_list: optimize pid_list->lock contention
  2025-11-13 15:35             ` Sebastian Andrzej Siewior
@ 2025-11-13 15:51               ` Steven Rostedt
  2025-11-13 16:07                 ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 13+ messages in thread
From: Steven Rostedt @ 2025-11-13 15:51 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Yongliang Gao, mhiramat, mathieu.desnoyers, linux-kernel,
	linux-trace-kernel, frankjpliu, Yongliang Gao, Huang Cun

On Thu, 13 Nov 2025 16:35:08 +0100
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:


> > A couple of issues. One, the chunks are fully used. There's no place to put
> > a "rcu_head" in them. Well, we may be able to make use of them.  
> 
> This could be the first (16?) bytes of the memory chunk.

Yes, but this will make things more complex.

> 
> > Second, if there's a lot of tasks exiting and forking, we can easily run
> > out of chunks that are waiting to be "freed" via call_rcu().  
> 
> but this is a general RCU problem and not new here. The task_struct and
> everything around it (including stack) is RCU freed.

But this doesn't happen under memory pressure. Allocations are not
performed if a chunk is not available. If a chunk isn't available, then we
just stop tracing the filtered task. Hmm, it currently silently fails.
Perhaps I need to add a printk when this happens.


> > > This adding/ add-on-fork, removing and remove-on-exit is the only write
> > > side?  
> > 
> > That and manual writes to the set_ftrace_pid file.  
> 
> This looks like minimal. I miss understood then that context switch can
> also contribute to it.

sched_switch is only a reader, not a writer.

> And the callers of trace_pid_list_is_set() are always in the RCU read
> section then? I assume so, since it wouldn't make sense otherwise.

Yes, because they are only tested in sched_switch and fork and exit tracepoints.

Although, this was written when tracepoint callbacks were always called
under preempt disable. Perhaps we need to change that call to:

	tracepoint_synchronize_unregister()

Or add a preempt_disable() around the callers.

I'm very nervous about using RCU here. It will add a lot more corner cases
that needs to be accounted for. The complexity doesn't appear to be worth
it. I'd rather just keep the raw spin locks than to convert it to RCU.

The seqcount makes sense to me. It's simple and keeps the same paradigm as
what we have. What's wrong with using it?

-- Steve



* Re: [PATCH v3] trace/pid_list: optimize pid_list->lock contention
  2025-11-13 15:51               ` Steven Rostedt
@ 2025-11-13 16:07                 ` Sebastian Andrzej Siewior
  2025-11-13 16:14                   ` Steven Rostedt
  0 siblings, 1 reply; 13+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-11-13 16:07 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Yongliang Gao, mhiramat, mathieu.desnoyers, linux-kernel,
	linux-trace-kernel, frankjpliu, Yongliang Gao, Huang Cun

On 2025-11-13 10:51:06 [-0500], Steven Rostedt wrote:
> Yes, because they are only tested in sched_switch and fork and exit tracepoints.
> 
> Although, this was written when tracepoint callbacks were always called
> under preempt disable. Perhaps we need to change that call to:
> 
> 	tracepoint_synchronize_unregister()
> 
> Or add a preempt_disable() around the callers.

Please don't. Please do a regular rcu_read_lock() ;)
I tried to get rid of the preempt_disable() around tracepoints so that
the attached BPF callbacks are not invoked with disabled preemption. I
haven't followed up here in a while but I think Paul's SRCU work goes
in the right direction.

> I'm very nervous about using RCU here. It will add a lot more corner cases
> that needs to be accounted for. The complexity doesn't appear to be worth
> it. I'd rather just keep the raw spin locks than to convert it to RCU.
> 
> The seqcount makes sense to me. It's simple and keeps the same paradigm as
> what we have. What's wrong with using it?

I'm fine with it now that you have explained under what conditions a
retry can happen. Thank you.

> -- Steve

Sebastian


* Re: [PATCH v3] trace/pid_list: optimize pid_list->lock contention
  2025-11-13 16:07                 ` Sebastian Andrzej Siewior
@ 2025-11-13 16:14                   ` Steven Rostedt
  0 siblings, 0 replies; 13+ messages in thread
From: Steven Rostedt @ 2025-11-13 16:14 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Yongliang Gao, mhiramat, mathieu.desnoyers, linux-kernel,
	linux-trace-kernel, frankjpliu, Yongliang Gao, Huang Cun

On Thu, 13 Nov 2025 17:07:39 +0100
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> On 2025-11-13 10:51:06 [-0500], Steven Rostedt wrote:
> > Yes, because they are only tested in sched_switch and fork and exit tracepoints.
> > 
> > Although, this was written when tracepoint callbacks were always called
> > under preempt disable. Perhaps we need to change that call to:
> > 
> > 	tracepoint_synchronize_unregister()
> > 
> > Or add a preempt_disable() around the callers.  
> 
> Please don't. Please do a regular rcu_read_lock() ;)
> I tried to get rid of the preempt_disable() around tracepoints so that
> the attached BPF callbacks are not invoked with disabled preemption. I
> haven't followed up here in a while but I think Paul's SRCU work goes
> in the right direction.

I meant just the reading of the pid lists, which is usually done from
tracepoints in preempt-disabled locations.

Anyway, I can add rcu_read_lock() around the callers of it.
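
Something along these lines at the call sites (illustrative sketch only;
the helper name is made up and it assumes the existing RCU-managed
function_pids pointer in the trace_array):

	static bool pid_list_task_is_set(struct trace_array *tr,
					 struct task_struct *task)
	{
		struct trace_pid_list *pid_list;
		bool ret = false;

		rcu_read_lock();
		pid_list = rcu_dereference(tr->function_pids);
		if (pid_list)
			ret = trace_pid_list_is_set(pid_list, task->pid);
		rcu_read_unlock();

		return ret;
	}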

> 
> > I'm very nervous about using RCU here. It will add a lot more corner cases
> > that needs to be accounted for. The complexity doesn't appear to be worth
> > it. I'd rather just keep the raw spin locks than to convert it to RCU.
> > 
> > The seqcount makes sense to me. It's simple and keeps the same paradigm as
> > what we have. What's wrong with using it?  
> 
> I'm fine with it once you explained under what conditions retry can
> happen.  Thank you.

Thanks,

-- Steve

