* [PATCH v4] trace/pid_list: optimize pid_list->lock contention
@ 2025-11-13 0:45 Yongliang Gao
2025-11-13 1:02 ` Steven Rostedt
0 siblings, 1 reply; 2+ messages in thread
From: Yongliang Gao @ 2025-11-13 0:45 UTC (permalink / raw)
To: rostedt, mhiramat, mathieu.desnoyers, bigeasy
Cc: linux-kernel, linux-trace-kernel, frankjpliu, Yongliang Gao,
Huang Cun
From: Yongliang Gao <leonylgao@tencent.com>
When the system has many cores and task switching is frequent,
setting set_ftrace_pid can cause frequent pid_list->lock contention
and high system sys usage.
For example, in a 288-core VM environment, we observed 267 CPUs
experiencing contention on pid_list->lock, with stack traces showing:
#4 [ffffa6226fb4bc70] native_queued_spin_lock_slowpath at ffffffff99cd4b7e
#5 [ffffa6226fb4bc90] _raw_spin_lock_irqsave at ffffffff99cd3e36
#6 [ffffa6226fb4bca0] trace_pid_list_is_set at ffffffff99267554
#7 [ffffa6226fb4bcc0] trace_ignore_this_task at ffffffff9925c288
#8 [ffffa6226fb4bcd8] ftrace_filter_pid_sched_switch_probe at ffffffff99246efe
#9 [ffffa6226fb4bcf0] __schedule at ffffffff99ccd161
Replaces the existing spinlock with a seqlock to allow concurrent readers,
while maintaining write exclusivity.
Signed-off-by: Yongliang Gao <leonylgao@tencent.com>
Reviewed-by: Huang Cun <cunhuang@tencent.com>
---
Changes from v3:
- Fixed patch format as per feedback. [Steven Rostedt]
- Link to v3: https://lore.kernel.org/all/20251113000252.1058144-1-leonylgao@gmail.com
Changes from v2:
- Keep trace_pid_list_next() using raw_spin_lock for simplicity. [Steven Rostedt]
- Link to v2: https://lore.kernel.org/all/20251112181456.473864-1-leonylgao@gmail.com
Changes from v1:
- Fixed sleep-in-atomic issues under PREEMPT_RT. [Steven Rostedt]
- Link to v1: https://lore.kernel.org/all/20251015114952.4014352-1-leonylgao@gmail.com
---
kernel/trace/pid_list.c | 30 +++++++++++++++++++++---------
kernel/trace/pid_list.h | 1 +
2 files changed, 22 insertions(+), 9 deletions(-)
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 090bb5ea4a19..dbee72d69d0a 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -3,6 +3,7 @@
* Copyright (C) 2021 VMware Inc, Steven Rostedt <rostedt@goodmis.org>
*/
#include <linux/spinlock.h>
+#include <linux/seqlock.h>
#include <linux/irq_work.h>
#include <linux/slab.h>
#include "trace.h"
@@ -126,7 +127,7 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
{
union upper_chunk *upper_chunk;
union lower_chunk *lower_chunk;
- unsigned long flags;
+ unsigned int seq;
unsigned int upper1;
unsigned int upper2;
unsigned int lower;
@@ -138,14 +139,16 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
if (pid_split(pid, &upper1, &upper2, &lower) < 0)
return false;
- raw_spin_lock_irqsave(&pid_list->lock, flags);
- upper_chunk = pid_list->upper[upper1];
- if (upper_chunk) {
- lower_chunk = upper_chunk->data[upper2];
- if (lower_chunk)
- ret = test_bit(lower, lower_chunk->data);
- }
- raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ do {
+ seq = read_seqcount_begin(&pid_list->seqcount);
+ ret = false;
+ upper_chunk = pid_list->upper[upper1];
+ if (upper_chunk) {
+ lower_chunk = upper_chunk->data[upper2];
+ if (lower_chunk)
+ ret = test_bit(lower, lower_chunk->data);
+ }
+ } while (read_seqcount_retry(&pid_list->seqcount, seq));
return ret;
}
@@ -178,6 +181,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
return -EINVAL;
raw_spin_lock_irqsave(&pid_list->lock, flags);
+ write_seqcount_begin(&pid_list->seqcount);
upper_chunk = pid_list->upper[upper1];
if (!upper_chunk) {
upper_chunk = get_upper_chunk(pid_list);
@@ -199,6 +203,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
set_bit(lower, lower_chunk->data);
ret = 0;
out:
+ write_seqcount_end(&pid_list->seqcount);
raw_spin_unlock_irqrestore(&pid_list->lock, flags);
return ret;
}
@@ -230,6 +235,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
return -EINVAL;
raw_spin_lock_irqsave(&pid_list->lock, flags);
+ write_seqcount_begin(&pid_list->seqcount);
upper_chunk = pid_list->upper[upper1];
if (!upper_chunk)
goto out;
@@ -250,6 +256,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
}
}
out:
+ write_seqcount_end(&pid_list->seqcount);
raw_spin_unlock_irqrestore(&pid_list->lock, flags);
return 0;
}
@@ -340,8 +347,10 @@ static void pid_list_refill_irq(struct irq_work *iwork)
again:
raw_spin_lock(&pid_list->lock);
+ write_seqcount_begin(&pid_list->seqcount);
upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks;
lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks;
+ write_seqcount_end(&pid_list->seqcount);
raw_spin_unlock(&pid_list->lock);
if (upper_count <= 0 && lower_count <= 0)
@@ -370,6 +379,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
}
raw_spin_lock(&pid_list->lock);
+ write_seqcount_begin(&pid_list->seqcount);
if (upper) {
*upper_next = pid_list->upper_list;
pid_list->upper_list = upper;
@@ -380,6 +390,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
pid_list->lower_list = lower;
pid_list->free_lower_chunks += lcnt;
}
+ write_seqcount_end(&pid_list->seqcount);
raw_spin_unlock(&pid_list->lock);
/*
@@ -419,6 +430,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq);
raw_spin_lock_init(&pid_list->lock);
+ seqcount_raw_spinlock_init(&pid_list->seqcount, &pid_list->lock);
for (i = 0; i < CHUNK_ALLOC; i++) {
union upper_chunk *chunk;
diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h
index 62e73f1ac85f..0b45fb0eadb9 100644
--- a/kernel/trace/pid_list.h
+++ b/kernel/trace/pid_list.h
@@ -76,6 +76,7 @@ union upper_chunk {
};
struct trace_pid_list {
+ seqcount_raw_spinlock_t seqcount;
raw_spinlock_t lock;
struct irq_work refill_irqwork;
union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size
--
2.43.5
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH v4] trace/pid_list: optimize pid_list->lock contention
2025-11-13 0:45 [PATCH v4] trace/pid_list: optimize pid_list->lock contention Yongliang Gao
@ 2025-11-13 1:02 ` Steven Rostedt
0 siblings, 0 replies; 2+ messages in thread
From: Steven Rostedt @ 2025-11-13 1:02 UTC (permalink / raw)
To: Yongliang Gao
Cc: mhiramat, mathieu.desnoyers, bigeasy, linux-kernel,
linux-trace-kernel, frankjpliu, Yongliang Gao, Huang Cun
On Thu, 13 Nov 2025 08:45:26 +0800
Yongliang Gao <leonylgao@gmail.com> wrote:
> ---
> Changes from v3:
> - Fixed patch format as per feedback. [Steven Rostedt]
> - Link to v3: https://lore.kernel.org/all/20251113000252.1058144-1-leonylgao@gmail.com
I already pulled this in and started testing it. No need for this resend.
-- Steve
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2025-11-13 1:01 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-11-13 0:45 [PATCH v4] trace/pid_list: optimize pid_list->lock contention Yongliang Gao
2025-11-13 1:02 ` Steven Rostedt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).