From: Yongliang Gao <leonylgao@gmail.com>
To: rostedt@goodmis.org, mhiramat@kernel.org, mathieu.desnoyers@efficios.com
Cc: linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org,
Yongliang Gao <leonylgao@tencent.com>,
Huang Cun <cunhuang@tencent.com>
Subject: [PATCH] trace/pid_list: optimize pid_list->lock contention
Date: Wed, 15 Oct 2025 19:49:52 +0800 [thread overview]
Message-ID: <20251015114952.4014352-1-leonylgao@gmail.com> (raw)
From: Yongliang Gao <leonylgao@tencent.com>
When the system has many cores and task switching is frequent,
setting set_ftrace_pid can cause heavy pid_list->lock contention
and high sys CPU usage.
For example, in a vmcore from a 288-core system, we found 267
CPUs contending on pid_list->lock, with backtraces such as:
#4 [ffffa6226fb4bc70] native_queued_spin_lock_slowpath at ffffffff99cd4b7e
#5 [ffffa6226fb4bc90] _raw_spin_lock_irqsave at ffffffff99cd3e36
#6 [ffffa6226fb4bca0] trace_pid_list_is_set at ffffffff99267554
#7 [ffffa6226fb4bcc0] trace_ignore_this_task at ffffffff9925c288
#8 [ffffa6226fb4bcd8] ftrace_filter_pid_sched_switch_probe at ffffffff99246efe
#9 [ffffa6226fb4bcf0] __schedule at ffffffff99ccd161
Signed-off-by: Yongliang Gao <leonylgao@tencent.com>
Reviewed-by: Huang Cun <cunhuang@tencent.com>
---
kernel/trace/pid_list.c | 26 +++++++++++++-------------
kernel/trace/pid_list.h | 2 +-
2 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 090bb5ea4a19..62082a4f60db 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -138,14 +138,14 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
if (pid_split(pid, &upper1, &upper2, &lower) < 0)
return false;
- raw_spin_lock_irqsave(&pid_list->lock, flags);
+ read_lock_irqsave(&pid_list->lock, flags);
upper_chunk = pid_list->upper[upper1];
if (upper_chunk) {
lower_chunk = upper_chunk->data[upper2];
if (lower_chunk)
ret = test_bit(lower, lower_chunk->data);
}
- raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ read_unlock_irqrestore(&pid_list->lock, flags);
return ret;
}
@@ -177,7 +177,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
if (pid_split(pid, &upper1, &upper2, &lower) < 0)
return -EINVAL;
- raw_spin_lock_irqsave(&pid_list->lock, flags);
+ write_lock_irqsave(&pid_list->lock, flags);
upper_chunk = pid_list->upper[upper1];
if (!upper_chunk) {
upper_chunk = get_upper_chunk(pid_list);
@@ -199,7 +199,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
set_bit(lower, lower_chunk->data);
ret = 0;
out:
- raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ write_unlock_irqrestore(&pid_list->lock, flags);
return ret;
}
@@ -229,7 +229,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
if (pid_split(pid, &upper1, &upper2, &lower) < 0)
return -EINVAL;
- raw_spin_lock_irqsave(&pid_list->lock, flags);
+ write_lock_irqsave(&pid_list->lock, flags);
upper_chunk = pid_list->upper[upper1];
if (!upper_chunk)
goto out;
@@ -250,7 +250,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
}
}
out:
- raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ write_unlock_irqrestore(&pid_list->lock, flags);
return 0;
}
@@ -282,7 +282,7 @@ int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
if (pid_split(pid, &upper1, &upper2, &lower) < 0)
return -EINVAL;
- raw_spin_lock_irqsave(&pid_list->lock, flags);
+ read_lock_irqsave(&pid_list->lock, flags);
for (; upper1 <= UPPER_MASK; upper1++, upper2 = 0) {
upper_chunk = pid_list->upper[upper1];
@@ -302,7 +302,7 @@ int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
}
found:
- raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ read_unlock_irqrestore(&pid_list->lock, flags);
if (upper1 > UPPER_MASK)
return -1;
@@ -339,10 +339,10 @@ static void pid_list_refill_irq(struct irq_work *iwork)
int lcnt = 0;
again:
- raw_spin_lock(&pid_list->lock);
+ write_lock(&pid_list->lock);
upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks;
lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks;
- raw_spin_unlock(&pid_list->lock);
+ write_unlock(&pid_list->lock);
if (upper_count <= 0 && lower_count <= 0)
return;
@@ -369,7 +369,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
lcnt++;
}
- raw_spin_lock(&pid_list->lock);
+ write_lock(&pid_list->lock);
if (upper) {
*upper_next = pid_list->upper_list;
pid_list->upper_list = upper;
@@ -380,7 +380,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
pid_list->lower_list = lower;
pid_list->free_lower_chunks += lcnt;
}
- raw_spin_unlock(&pid_list->lock);
+ write_unlock(&pid_list->lock);
/*
* On success of allocating all the chunks, both counters
@@ -418,7 +418,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq);
- raw_spin_lock_init(&pid_list->lock);
+ rwlock_init(&pid_list->lock);
for (i = 0; i < CHUNK_ALLOC; i++) {
union upper_chunk *chunk;
diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h
index 62e73f1ac85f..da200834f4ad 100644
--- a/kernel/trace/pid_list.h
+++ b/kernel/trace/pid_list.h
@@ -76,7 +76,7 @@ union upper_chunk {
};
struct trace_pid_list {
- raw_spinlock_t lock;
+ rwlock_t lock;
struct irq_work refill_irqwork;
union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size
union upper_chunk *upper_list;
--
2.43.5
reply other threads:[~2025-10-15 11:49 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251015114952.4014352-1-leonylgao@gmail.com \
--to=leonylgao@gmail.com \
--cc=cunhuang@tencent.com \
--cc=leonylgao@tencent.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-trace-kernel@vger.kernel.org \
--cc=mathieu.desnoyers@efficios.com \
--cc=mhiramat@kernel.org \
--cc=rostedt@goodmis.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).