public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3] perf/core: Optimize LBR callstack handling
@ 2026-02-11 22:32 Namhyung Kim
  2026-02-11 22:32 ` [PATCH 1/3] perf/core: Pass GFP flags to attach_task_ctx_data() Namhyung Kim
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Namhyung Kim @ 2026-02-11 22:32 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar
  Cc: Mark Rutland, Alexander Shishkin, Arnaldo Carvalho de Melo, LKML,
	Guenter Roeck

Hello,

I found other problematic cases wrt LBR callstacks.  Basically an O(N^2)
loop over every thread is too costly on large machines.  We can use
faster memory allocation and free methods to reduce the overhead.

Actually this approach is suggested by AI (Gemini).

Thanks,
Namhyung


Namhyung Kim (3):
  perf/core: Pass GFP flags to attach_task_ctx_data()
  perf/core: Try to allocate task_ctx_data quickly
  perf/core: Simplify __detach_global_ctx_data()

 kernel/events/core.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

-- 
2.53.0.273.g2a3d683680-goog


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 1/3] perf/core: Pass GFP flags to attach_task_ctx_data()
  2026-02-11 22:32 [PATCH 0/3] perf/core: Optimize LBR callstack handling Namhyung Kim
@ 2026-02-11 22:32 ` Namhyung Kim
  2026-02-28 10:56   ` [tip: perf/core] " tip-bot2 for Namhyung Kim
  2026-02-11 22:32 ` [PATCH 2/3] perf/core: Try to allocate task_ctx_data quickly Namhyung Kim
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 9+ messages in thread
From: Namhyung Kim @ 2026-02-11 22:32 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar
  Cc: Mark Rutland, Alexander Shishkin, Arnaldo Carvalho de Melo, LKML,
	Guenter Roeck

This is a preparation for the next change to reduce the computational
complexity in the global context data handling for LBR callstacks.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 kernel/events/core.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index da013b9a595fcebe..b8498e9891e21c18 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5370,15 +5370,15 @@ static void unaccount_freq_event(void)
 
 
 static struct perf_ctx_data *
-alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global)
+alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global, gfp_t gfp_flags)
 {
 	struct perf_ctx_data *cd;
 
-	cd = kzalloc(sizeof(*cd), GFP_KERNEL);
+	cd = kzalloc(sizeof(*cd), gfp_flags);
 	if (!cd)
 		return NULL;
 
-	cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL);
+	cd->data = kmem_cache_zalloc(ctx_cache, gfp_flags);
 	if (!cd->data) {
 		kfree(cd);
 		return NULL;
@@ -5412,11 +5412,11 @@ static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd)
 
 static int
 attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
-		     bool global)
+		     bool global, gfp_t gfp_flags)
 {
 	struct perf_ctx_data *cd, *old = NULL;
 
-	cd = alloc_perf_ctx_data(ctx_cache, global);
+	cd = alloc_perf_ctx_data(ctx_cache, global, gfp_flags);
 	if (!cd)
 		return -ENOMEM;
 
@@ -5499,7 +5499,7 @@ attach_global_ctx_data(struct kmem_cache *ctx_cache)
 
 	return 0;
 alloc:
-	ret = attach_task_ctx_data(p, ctx_cache, true);
+	ret = attach_task_ctx_data(p, ctx_cache, true, GFP_KERNEL);
 	put_task_struct(p);
 	if (ret) {
 		__detach_global_ctx_data();
@@ -5519,7 +5519,7 @@ attach_perf_ctx_data(struct perf_event *event)
 		return -ENOMEM;
 
 	if (task)
-		return attach_task_ctx_data(task, ctx_cache, false);
+		return attach_task_ctx_data(task, ctx_cache, false, GFP_KERNEL);
 
 	ret = attach_global_ctx_data(ctx_cache);
 	if (ret)
@@ -9231,7 +9231,7 @@ perf_event_alloc_task_data(struct task_struct *child,
 
 	return;
 attach:
-	attach_task_ctx_data(child, ctx_cache, true);
+	attach_task_ctx_data(child, ctx_cache, true, GFP_KERNEL);
 }
 
 void perf_event_fork(struct task_struct *task)
-- 
2.53.0.273.g2a3d683680-goog


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/3] perf/core: Try to allocate task_ctx_data quickly
  2026-02-11 22:32 [PATCH 0/3] perf/core: Optimize LBR callstack handling Namhyung Kim
  2026-02-11 22:32 ` [PATCH 1/3] perf/core: Pass GFP flags to attach_task_ctx_data() Namhyung Kim
@ 2026-02-11 22:32 ` Namhyung Kim
  2026-02-28 10:56   ` [tip: perf/core] " tip-bot2 for Namhyung Kim
  2026-02-11 22:32 ` [PATCH 3/3] perf/core: Simplify __detach_global_ctx_data() Namhyung Kim
  2026-02-26 12:07 ` [PATCH 0/3] perf/core: Optimize LBR callstack handling Peter Zijlstra
  3 siblings, 1 reply; 9+ messages in thread
From: Namhyung Kim @ 2026-02-11 22:32 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar
  Cc: Mark Rutland, Alexander Shishkin, Arnaldo Carvalho de Melo, LKML,
	Guenter Roeck

The attach_global_ctx_data() has an O(N^2) algorithm to allocate the
context data for each thread.  This caused performance problems on large
systems with O(100k) threads.

Because kmalloc(GFP_KERNEL) can sleep, it cannot be called under the
RCU lock.  So let's try with GFP_NOWAIT first so that it can proceed in
normal cases.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 kernel/events/core.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index b8498e9891e21c18..5b05a71edeb47955 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5489,6 +5489,13 @@ attach_global_ctx_data(struct kmem_cache *ctx_cache)
 					cd = NULL;
 			}
 			if (!cd) {
+				/*
+				 * Try to allocate context quickly before
+				 * traversing the whole thread list again.
+				 */
+				if (!attach_task_ctx_data(p, ctx_cache, true,
+							  GFP_NOWAIT))
+					continue;
 				get_task_struct(p);
 				goto alloc;
 			}
-- 
2.53.0.273.g2a3d683680-goog


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 3/3] perf/core: Simplify __detach_global_ctx_data()
  2026-02-11 22:32 [PATCH 0/3] perf/core: Optimize LBR callstack handling Namhyung Kim
  2026-02-11 22:32 ` [PATCH 1/3] perf/core: Pass GFP flags to attach_task_ctx_data() Namhyung Kim
  2026-02-11 22:32 ` [PATCH 2/3] perf/core: Try to allocate task_ctx_data quickly Namhyung Kim
@ 2026-02-11 22:32 ` Namhyung Kim
  2026-02-28 10:56   ` [tip: perf/core] " tip-bot2 for Namhyung Kim
  2026-02-26 12:07 ` [PATCH 0/3] perf/core: Optimize LBR callstack handling Peter Zijlstra
  3 siblings, 1 reply; 9+ messages in thread
From: Namhyung Kim @ 2026-02-11 22:32 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar
  Cc: Mark Rutland, Alexander Shishkin, Arnaldo Carvalho de Melo, LKML,
	Guenter Roeck

Like in the attach_global_ctx_data() it has an O(N^2) loop to delete task
context data for each thread.  But perf_free_ctx_data_rcu() can be
called under RCU read lock, so just call it directly rather than
iterating the whole thread list again.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 kernel/events/core.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5b05a71edeb47955..e67e1baa99d1e5a9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5561,22 +5561,15 @@ static void __detach_global_ctx_data(void)
 	struct task_struct *g, *p;
 	struct perf_ctx_data *cd;
 
-again:
 	scoped_guard (rcu) {
 		for_each_process_thread(g, p) {
 			cd = rcu_dereference(p->perf_ctx_data);
-			if (!cd || !cd->global)
-				continue;
-			cd->global = 0;
-			get_task_struct(p);
-			goto detach;
+			if (cd && cd->global) {
+				cd->global = 0;
+				detach_task_ctx_data(p);
+			}
 		}
 	}
-	return;
-detach:
-	detach_task_ctx_data(p);
-	put_task_struct(p);
-	goto again;
 }
 
 static void detach_global_ctx_data(void)
-- 
2.53.0.273.g2a3d683680-goog


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 0/3] perf/core: Optimize LBR callstack handling
  2026-02-11 22:32 [PATCH 0/3] perf/core: Optimize LBR callstack handling Namhyung Kim
                   ` (2 preceding siblings ...)
  2026-02-11 22:32 ` [PATCH 3/3] perf/core: Simplify __detach_global_ctx_data() Namhyung Kim
@ 2026-02-26 12:07 ` Peter Zijlstra
  2026-02-26 18:26   ` Namhyung Kim
  3 siblings, 1 reply; 9+ messages in thread
From: Peter Zijlstra @ 2026-02-26 12:07 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ingo Molnar, Mark Rutland, Alexander Shishkin,
	Arnaldo Carvalho de Melo, LKML, Guenter Roeck

On Wed, Feb 11, 2026 at 02:32:18PM -0800, Namhyung Kim wrote:
> Namhyung Kim (3):
>   perf/core: Pass GFP flags to attach_task_ctx_data()
>   perf/core: Try to allocate task_ctx_data quickly
>   perf/core: Simplify __detach_global_ctx_data()

They seem to have crossed paths with kalloc_obj() stuff, but I stomped
on it and now they fit.

Patches seem fine, I'll throw them at the robots.

Thanks!

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 0/3] perf/core: Optimize LBR callstack handling
  2026-02-26 12:07 ` [PATCH 0/3] perf/core: Optimize LBR callstack handling Peter Zijlstra
@ 2026-02-26 18:26   ` Namhyung Kim
  0 siblings, 0 replies; 9+ messages in thread
From: Namhyung Kim @ 2026-02-26 18:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Mark Rutland, Alexander Shishkin,
	Arnaldo Carvalho de Melo, LKML, Guenter Roeck

On Thu, Feb 26, 2026 at 01:07:12PM +0100, Peter Zijlstra wrote:
> On Wed, Feb 11, 2026 at 02:32:18PM -0800, Namhyung Kim wrote:
> > Namhyung Kim (3):
> >   perf/core: Pass GFP flags to attach_task_ctx_data()
> >   perf/core: Try to allocate task_ctx_data quickly
> >   perf/core: Simplify __detach_global_ctx_data()
> 
> They seem to have crossed paths with kalloc_obj() stuff, but I stomped
> on it and now they fit.
> 
> Patches seem fine, I'll throw them at the robots.

Thanks for doing that!

Namhyung


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [tip: perf/core] perf/core: Simplify __detach_global_ctx_data()
  2026-02-11 22:32 ` [PATCH 3/3] perf/core: Simplify __detach_global_ctx_data() Namhyung Kim
@ 2026-02-28 10:56   ` tip-bot2 for Namhyung Kim
  0 siblings, 0 replies; 9+ messages in thread
From: tip-bot2 for Namhyung Kim @ 2026-02-28 10:56 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Namhyung Kim, Peter Zijlstra (Intel), x86, linux-kernel

The following commit has been merged into the perf/core branch of tip:

Commit-ID:     da45c8d5f051434a3c68397e66ae2d3b3c97cdec
Gitweb:        https://git.kernel.org/tip/da45c8d5f051434a3c68397e66ae2d3b3c97cdec
Author:        Namhyung Kim <namhyung@kernel.org>
AuthorDate:    Wed, 11 Feb 2026 14:32:21 -08:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Fri, 27 Feb 2026 16:40:22 +01:00

perf/core: Simplify __detach_global_ctx_data()

Like in the attach_global_ctx_data() it has an O(N^2) loop to delete task
context data for each thread.  But perf_free_ctx_data_rcu() can be
called under RCU read lock, so just call it directly rather than
iterating the whole thread list again.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260211223222.3119790-4-namhyung@kernel.org
---
 kernel/events/core.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d357714..5eeae86 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5560,22 +5560,15 @@ static void __detach_global_ctx_data(void)
 	struct task_struct *g, *p;
 	struct perf_ctx_data *cd;
 
-again:
 	scoped_guard (rcu) {
 		for_each_process_thread(g, p) {
 			cd = rcu_dereference(p->perf_ctx_data);
-			if (!cd || !cd->global)
-				continue;
-			cd->global = 0;
-			get_task_struct(p);
-			goto detach;
+			if (cd && cd->global) {
+				cd->global = 0;
+				detach_task_ctx_data(p);
+			}
 		}
 	}
-	return;
-detach:
-	detach_task_ctx_data(p);
-	put_task_struct(p);
-	goto again;
 }
 
 static void detach_global_ctx_data(void)

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [tip: perf/core] perf/core: Try to allocate task_ctx_data quickly
  2026-02-11 22:32 ` [PATCH 2/3] perf/core: Try to allocate task_ctx_data quickly Namhyung Kim
@ 2026-02-28 10:56   ` tip-bot2 for Namhyung Kim
  0 siblings, 0 replies; 9+ messages in thread
From: tip-bot2 for Namhyung Kim @ 2026-02-28 10:56 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Namhyung Kim, Peter Zijlstra (Intel), x86, linux-kernel

The following commit has been merged into the perf/core branch of tip:

Commit-ID:     bec2ee2390c95ed0c44494340464e69e79802e4a
Gitweb:        https://git.kernel.org/tip/bec2ee2390c95ed0c44494340464e69e79802e4a
Author:        Namhyung Kim <namhyung@kernel.org>
AuthorDate:    Wed, 11 Feb 2026 14:32:20 -08:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Fri, 27 Feb 2026 16:40:21 +01:00

perf/core: Try to allocate task_ctx_data quickly

The attach_global_ctx_data() has an O(N^2) algorithm to allocate the
context data for each thread.  This caused performance problems on large
systems with O(100k) threads.

Because kmalloc(GFP_KERNEL) can sleep, it cannot be called under the
RCU lock.  So let's try with GFP_NOWAIT first so that it can proceed in
normal cases.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260211223222.3119790-3-namhyung@kernel.org
---
 kernel/events/core.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 90b0c93..d357714 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5489,6 +5489,12 @@ again:
 					cd = NULL;
 			}
 			if (!cd) {
+				/*
+				 * Try to allocate context quickly before
+				 * traversing the whole thread list again.
+				 */
+				if (!attach_task_ctx_data(p, ctx_cache, true, GFP_NOWAIT))
+					continue;
 				get_task_struct(p);
 				goto alloc;
 			}

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [tip: perf/core] perf/core: Pass GFP flags to attach_task_ctx_data()
  2026-02-11 22:32 ` [PATCH 1/3] perf/core: Pass GFP flags to attach_task_ctx_data() Namhyung Kim
@ 2026-02-28 10:56   ` tip-bot2 for Namhyung Kim
  0 siblings, 0 replies; 9+ messages in thread
From: tip-bot2 for Namhyung Kim @ 2026-02-28 10:56 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Namhyung Kim, Peter Zijlstra (Intel), x86, linux-kernel

The following commit has been merged into the perf/core branch of tip:

Commit-ID:     28c75fbfec8f024db1278194918e5f6eda4c570f
Gitweb:        https://git.kernel.org/tip/28c75fbfec8f024db1278194918e5f6eda4c570f
Author:        Namhyung Kim <namhyung@kernel.org>
AuthorDate:    Wed, 11 Feb 2026 14:32:19 -08:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Fri, 27 Feb 2026 16:40:21 +01:00

perf/core: Pass GFP flags to attach_task_ctx_data()

This is a preparation for the next change to reduce the computational
complexity in the global context data handling for LBR callstacks.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260211223222.3119790-2-namhyung@kernel.org
---
 kernel/events/core.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index ac70d68..90b0c93 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5370,15 +5370,15 @@ static void unaccount_freq_event(void)
 
 
 static struct perf_ctx_data *
-alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global)
+alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global, gfp_t gfp_flags)
 {
 	struct perf_ctx_data *cd;
 
-	cd = kzalloc_obj(*cd);
+	cd = kzalloc_obj(*cd, gfp_flags);
 	if (!cd)
 		return NULL;
 
-	cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL);
+	cd->data = kmem_cache_zalloc(ctx_cache, gfp_flags);
 	if (!cd->data) {
 		kfree(cd);
 		return NULL;
@@ -5412,11 +5412,11 @@ static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd)
 
 static int
 attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
-		     bool global)
+		     bool global, gfp_t gfp_flags)
 {
 	struct perf_ctx_data *cd, *old = NULL;
 
-	cd = alloc_perf_ctx_data(ctx_cache, global);
+	cd = alloc_perf_ctx_data(ctx_cache, global, gfp_flags);
 	if (!cd)
 		return -ENOMEM;
 
@@ -5499,7 +5499,7 @@ again:
 
 	return 0;
 alloc:
-	ret = attach_task_ctx_data(p, ctx_cache, true);
+	ret = attach_task_ctx_data(p, ctx_cache, true, GFP_KERNEL);
 	put_task_struct(p);
 	if (ret) {
 		__detach_global_ctx_data();
@@ -5519,7 +5519,7 @@ attach_perf_ctx_data(struct perf_event *event)
 		return -ENOMEM;
 
 	if (task)
-		return attach_task_ctx_data(task, ctx_cache, false);
+		return attach_task_ctx_data(task, ctx_cache, false, GFP_KERNEL);
 
 	ret = attach_global_ctx_data(ctx_cache);
 	if (ret)
@@ -9240,7 +9240,7 @@ perf_event_alloc_task_data(struct task_struct *child,
 
 	return;
 attach:
-	attach_task_ctx_data(child, ctx_cache, true);
+	attach_task_ctx_data(child, ctx_cache, true, GFP_KERNEL);
 }
 
 void perf_event_fork(struct task_struct *task)

^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2026-02-28 10:56 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-02-11 22:32 [PATCH 0/3] perf/core: Optimize LBR callstack handling Namhyung Kim
2026-02-11 22:32 ` [PATCH 1/3] perf/core: Pass GFP flags to attach_task_ctx_data() Namhyung Kim
2026-02-28 10:56   ` [tip: perf/core] " tip-bot2 for Namhyung Kim
2026-02-11 22:32 ` [PATCH 2/3] perf/core: Try to allocate task_ctx_data quickly Namhyung Kim
2026-02-28 10:56   ` [tip: perf/core] " tip-bot2 for Namhyung Kim
2026-02-11 22:32 ` [PATCH 3/3] perf/core: Simplify __detach_global_ctx_data() Namhyung Kim
2026-02-28 10:56   ` [tip: perf/core] " tip-bot2 for Namhyung Kim
2026-02-26 12:07 ` [PATCH 0/3] perf/core: Optimize LBR callstack handling Peter Zijlstra
2026-02-26 18:26   ` Namhyung Kim

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox