From: Steven Rostedt <rostedt@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>,
Mark Rutland <mark.rutland@arm.com>,
Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
Andrew Morton <akpm@linux-foundation.org>,
Huang Cun <cunhuang@tencent.com>,
Yongliang Gao <leonylgao@tencent.com>
Subject: [for-next][PATCH 1/5] trace/pid_list: optimize pid_list->lock contention
Date: Sat, 15 Nov 2025 09:09:11 -0500 [thread overview]
Message-ID: <20251115140936.505263410@kernel.org> (raw)
In-Reply-To: 20251115140910.386662473@kernel.org
From: Yongliang Gao <leonylgao@tencent.com>
When the system has many cores and task switching is frequent,
setting set_ftrace_pid can cause frequent pid_list->lock contention
and high system sys usage.
For example, in a 288-core VM environment, we observed 267 CPUs
experiencing contention on pid_list->lock, with stack traces showing:
#4 [ffffa6226fb4bc70] native_queued_spin_lock_slowpath at ffffffff99cd4b7e
#5 [ffffa6226fb4bc90] _raw_spin_lock_irqsave at ffffffff99cd3e36
#6 [ffffa6226fb4bca0] trace_pid_list_is_set at ffffffff99267554
#7 [ffffa6226fb4bcc0] trace_ignore_this_task at ffffffff9925c288
#8 [ffffa6226fb4bcd8] ftrace_filter_pid_sched_switch_probe at ffffffff99246efe
#9 [ffffa6226fb4bcf0] __schedule at ffffffff99ccd161
Replaces the existing spinlock with a seqlock to allow concurrent readers,
while maintaining write exclusivity.
Link: https://patch.msgid.link/20251113000252.1058144-1-leonylgao@gmail.com
Reviewed-by: Huang Cun <cunhuang@tencent.com>
Signed-off-by: Yongliang Gao <leonylgao@tencent.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/trace/pid_list.c | 30 +++++++++++++++++++++---------
kernel/trace/pid_list.h | 1 +
2 files changed, 22 insertions(+), 9 deletions(-)
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 090bb5ea4a19..dbee72d69d0a 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -3,6 +3,7 @@
* Copyright (C) 2021 VMware Inc, Steven Rostedt <rostedt@goodmis.org>
*/
#include <linux/spinlock.h>
+#include <linux/seqlock.h>
#include <linux/irq_work.h>
#include <linux/slab.h>
#include "trace.h"
@@ -126,7 +127,7 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
{
union upper_chunk *upper_chunk;
union lower_chunk *lower_chunk;
- unsigned long flags;
+ unsigned int seq;
unsigned int upper1;
unsigned int upper2;
unsigned int lower;
@@ -138,14 +139,16 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
if (pid_split(pid, &upper1, &upper2, &lower) < 0)
return false;
- raw_spin_lock_irqsave(&pid_list->lock, flags);
- upper_chunk = pid_list->upper[upper1];
- if (upper_chunk) {
- lower_chunk = upper_chunk->data[upper2];
- if (lower_chunk)
- ret = test_bit(lower, lower_chunk->data);
- }
- raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ do {
+ seq = read_seqcount_begin(&pid_list->seqcount);
+ ret = false;
+ upper_chunk = pid_list->upper[upper1];
+ if (upper_chunk) {
+ lower_chunk = upper_chunk->data[upper2];
+ if (lower_chunk)
+ ret = test_bit(lower, lower_chunk->data);
+ }
+ } while (read_seqcount_retry(&pid_list->seqcount, seq));
return ret;
}
@@ -178,6 +181,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
return -EINVAL;
raw_spin_lock_irqsave(&pid_list->lock, flags);
+ write_seqcount_begin(&pid_list->seqcount);
upper_chunk = pid_list->upper[upper1];
if (!upper_chunk) {
upper_chunk = get_upper_chunk(pid_list);
@@ -199,6 +203,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
set_bit(lower, lower_chunk->data);
ret = 0;
out:
+ write_seqcount_end(&pid_list->seqcount);
raw_spin_unlock_irqrestore(&pid_list->lock, flags);
return ret;
}
@@ -230,6 +235,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
return -EINVAL;
raw_spin_lock_irqsave(&pid_list->lock, flags);
+ write_seqcount_begin(&pid_list->seqcount);
upper_chunk = pid_list->upper[upper1];
if (!upper_chunk)
goto out;
@@ -250,6 +256,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
}
}
out:
+ write_seqcount_end(&pid_list->seqcount);
raw_spin_unlock_irqrestore(&pid_list->lock, flags);
return 0;
}
@@ -340,8 +347,10 @@ static void pid_list_refill_irq(struct irq_work *iwork)
again:
raw_spin_lock(&pid_list->lock);
+ write_seqcount_begin(&pid_list->seqcount);
upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks;
lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks;
+ write_seqcount_end(&pid_list->seqcount);
raw_spin_unlock(&pid_list->lock);
if (upper_count <= 0 && lower_count <= 0)
@@ -370,6 +379,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
}
raw_spin_lock(&pid_list->lock);
+ write_seqcount_begin(&pid_list->seqcount);
if (upper) {
*upper_next = pid_list->upper_list;
pid_list->upper_list = upper;
@@ -380,6 +390,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
pid_list->lower_list = lower;
pid_list->free_lower_chunks += lcnt;
}
+ write_seqcount_end(&pid_list->seqcount);
raw_spin_unlock(&pid_list->lock);
/*
@@ -419,6 +430,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq);
raw_spin_lock_init(&pid_list->lock);
+ seqcount_raw_spinlock_init(&pid_list->seqcount, &pid_list->lock);
for (i = 0; i < CHUNK_ALLOC; i++) {
union upper_chunk *chunk;
diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h
index 62e73f1ac85f..0b45fb0eadb9 100644
--- a/kernel/trace/pid_list.h
+++ b/kernel/trace/pid_list.h
@@ -76,6 +76,7 @@ union upper_chunk {
};
struct trace_pid_list {
+ seqcount_raw_spinlock_t seqcount;
raw_spinlock_t lock;
struct irq_work refill_irqwork;
union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size
--
2.51.0
next prev parent reply other threads:[~2025-11-15 14:09 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-11-15 14:09 [for-next][PATCH 0/5] tracing: Updates for v6.19 Steven Rostedt
2025-11-15 14:09 ` Steven Rostedt [this message]
2025-11-15 14:09 ` [for-next][PATCH 2/5] tracing: Have function graph tracer option funcgraph-irqs be per instance Steven Rostedt
2025-11-15 14:09 ` [for-next][PATCH 3/5] tracing: Move graph-time out of function graph options Steven Rostedt
2025-11-15 14:09 ` [for-next][PATCH 4/5] tracing: Have function graph tracer option sleep-time be per instance Steven Rostedt
2025-11-15 14:09 ` [for-next][PATCH 5/5] tracing: Convert function graph set_flags() to use a switch() statement Steven Rostedt
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251115140936.505263410@kernel.org \
--to=rostedt@kernel.org \
--cc=akpm@linux-foundation.org \
--cc=cunhuang@tencent.com \
--cc=leonylgao@tencent.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mark.rutland@arm.com \
--cc=mathieu.desnoyers@efficios.com \
--cc=mhiramat@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.