* [PATCH v1] sched/numa: Add tracepoint to track NUMA migration cost
@ 2025-10-29 13:22 Jiayuan Chen
2025-11-24 20:33 ` Steven Rostedt
0 siblings, 1 reply; 2+ messages in thread
From: Jiayuan Chen @ 2025-10-29 13:22 UTC (permalink / raw)
To: linux-kernel
Cc: Jiayuan Chen, Jiayuan Chen, Steven Rostedt, Masami Hiramatsu,
Mathieu Desnoyers, Ingo Molnar, Peter Zijlstra, Juri Lelli,
Vincent Guittot, Dietmar Eggemann, Ben Segall, Mel Gorman,
Valentin Schneider, Andrii Nakryiko, Oleg Nesterov,
Gabriele Monaco, Libo Chen, linux-trace-kernel
From: Jiayuan Chen <jiayuan.chen@shopee.com>
In systems with multiple NUMA nodes, memory imbalance between nodes often
occurs. To address this, we typically tune parameters like scan_size_mb or
scan_period_{min,max}_ms to allow processes to migrate pages between NUMA
nodes.
Currently, the migration task task_numa_work() holds the mmap_lock during
the entire migration process, which can significantly impact process
performance, especially for memory operations. This patch introduces a new
tracepoint that records the migration duration, along with the number of
scanned pages and migrated pages. These metrics can be used to calculate
efficiency metrics similar to %vmeff in 'sar -B'.
These metrics help evaluate whether the adjusted NUMA balancing parameters
are properly tuned.
Here's an example bpftrace script:
```bash
bpftrace -e '
tracepoint:sched:sched_numa_balance_start
{
@start_time[cpu] = nsecs;
}
tracepoint:sched:sched_numa_balance_end {
if (@start_time[cpu] > 0) {
$cost = nsecs - @start_time[cpu];
printf("task '%s' migrate cost %lu, scanned %lu, migrated %lu\n",
args.comm, $cost, args.scanned, args.migrated);
}
}
'
```
Sample output:
Attaching 2 probes...
task 'rs:main Q:Reg' migrate cost 5584655, scanned 24516, migrated 22373
task 'systemd-journal' migrate cost 123191, scanned 6308, migrated 0
task 'wrk' migrate cost 894026, scanned 5842, migrated 5841
Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
---
include/trace/events/sched.h | 60 ++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 14 +++++++--
2 files changed, 72 insertions(+), 2 deletions(-)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 7b2645b50e78..e24bf700a614 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -804,6 +804,66 @@ TRACE_EVENT(sched_skip_cpuset_numa,
__entry->ngid,
MAX_NUMNODES, __entry->mem_allowed)
);
+
+TRACE_EVENT(sched_numa_balance_start,
+
+ TP_PROTO(struct task_struct *tsk),
+
+ TP_ARGS(tsk),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(pid_t, tgid)
+ __field(pid_t, ngid)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = task_pid_nr(tsk);
+ __entry->tgid = task_tgid_nr(tsk);
+ __entry->ngid = task_numa_group_id(tsk);
+ ),
+
+ TP_printk("comm=%s pid=%d tgid=%d ngid=%d",
+ __entry->comm,
+ __entry->pid,
+ __entry->tgid,
+ __entry->ngid)
+);
+
+TRACE_EVENT(sched_numa_balance_end,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long scanned, unsigned long migrated),
+
+ TP_ARGS(tsk, scanned, migrated),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(pid_t, tgid)
+ __field(pid_t, ngid)
+ __field(unsigned long, migrated)
+ __field(unsigned long, scanned)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = task_pid_nr(tsk);
+ __entry->tgid = task_tgid_nr(tsk);
+ __entry->ngid = task_numa_group_id(tsk);
+ __entry->migrated = migrated;
+ __entry->scanned = scanned;
+ ),
+
+ TP_printk("comm=%s pid=%d tgid=%d ngid=%d scanned=%lu migrated=%lu",
+ __entry->comm,
+ __entry->pid,
+ __entry->tgid,
+ __entry->ngid,
+ __entry->scanned,
+ __entry->migrated)
+);
#endif /* CONFIG_NUMA_BALANCING */
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 25970dbbb279..173c9c8397e2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3294,6 +3294,9 @@ static void task_numa_work(struct callback_head *work)
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
+ unsigned long nr_scanned = 0;
+ unsigned long total_migrated = 0;
+ unsigned long total_scanned = 0;
long pages, virtpages;
struct vma_iterator vmi;
bool vma_pids_skipped;
@@ -3359,6 +3362,7 @@ static void task_numa_work(struct callback_head *work)
if (!mmap_read_trylock(mm))
return;
+ trace_sched_numa_balance_start(p);
/*
* VMAs are skipped if the current PID has not trapped a fault within
* the VMA recently. Allow scanning to be forced if there is no
@@ -3477,6 +3481,10 @@ static void task_numa_work(struct callback_head *work)
end = min(end, vma->vm_end);
nr_pte_updates = change_prot_numa(vma, start, end);
+ nr_scanned = (end - start) >> PAGE_SHIFT;
+ total_migrated += nr_pte_updates;
+ total_scanned += nr_scanned;
+
/*
* Try to scan sysctl_numa_balancing_size worth of
* hpages that have at least one present PTE that
@@ -3486,8 +3494,8 @@ static void task_numa_work(struct callback_head *work)
* areas faster.
*/
if (nr_pte_updates)
- pages -= (end - start) >> PAGE_SHIFT;
- virtpages -= (end - start) >> PAGE_SHIFT;
+ pages -= nr_scanned;
+ virtpages -= nr_scanned;
start = end;
if (pages <= 0 || virtpages <= 0)
@@ -3528,6 +3536,8 @@ static void task_numa_work(struct callback_head *work)
mm->numa_scan_offset = start;
else
reset_ptenuma_scan(p);
+
+ trace_sched_numa_balance_end(p, total_scanned, total_migrated);
mmap_read_unlock(mm);
/*
--
2.43.0
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH v1] sched/numa: Add tracepoint to track NUMA migration cost
2025-10-29 13:22 [PATCH v1] sched/numa: Add tracepoint to track NUMA migration cost Jiayuan Chen
@ 2025-11-24 20:33 ` Steven Rostedt
0 siblings, 0 replies; 2+ messages in thread
From: Steven Rostedt @ 2025-11-24 20:33 UTC (permalink / raw)
To: Jiayuan Chen
Cc: linux-kernel, Jiayuan Chen, Masami Hiramatsu, Mathieu Desnoyers,
Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
Dietmar Eggemann, Ben Segall, Mel Gorman, Valentin Schneider,
Andrii Nakryiko, Oleg Nesterov, Gabriele Monaco, Libo Chen,
linux-trace-kernel
On Wed, 29 Oct 2025 21:22:55 +0800
Jiayuan Chen <jiayuan.chen@linux.dev> wrote:
> From: Jiayuan Chen <jiayuan.chen@shopee.com>
>
> In systems with multiple NUMA nodes, memory imbalance between nodes often
> occurs. To address this, we typically tune parameters like scan_size_mb or
> scan_period_{min,max}_ms to allow processes to migrate pages between NUMA
> nodes.
>
> Currently, the migration task task_numa_work() holds the mmap_lock during
> the entire migration process, which can significantly impact process
> performance, especially for memory operations. This patch introduces a new
> tracepoint that records the migration duration, along with the number of
> scanned pages and migrated pages. These metrics can be used to calculate
> efficiency metrics similar to %vmeff in 'sar -B'.
>
> These metrics help evaluate whether the adjusted NUMA balancing parameters
> are properly tuned.
>
> Here's an example bpftrace script:
> ```bash
>
> bpftrace -e '
> tracepoint:sched:sched_numa_balance_start
> {
> @start_time[cpu] = nsecs;
> }
>
> tracepoint:sched:sched_numa_balance_end {
> if (@start_time[cpu] > 0) {
> $cost = nsecs - @start_time[cpu];
> printf("task '%s' migrate cost %lu, scanned %lu, migrated %lu\n",
> args.comm, $cost, args.scanned, args.migrated);
> }
> }
> '
BTW, you don't need bpf for this either:
# trace-cmd sqlhist -e -n numa_balance SELECT end.comm, TIMESTAMP_DELTA_USECS as cost, \
end.scanned, end.migrated FROM sched_numa_balance_start AS start \
JOIN sched_numa_balance_end AS end ON start.common_pid = end.common_pid
# trace-cmd start -e numa_balance
[ I'd show the output, but my test boxes don't have NUMA ]
You could also make a histogram with it:
# trace-cmd sqlhist -e SELECT start.comm, 'CAST(start.cost AS BUCKETS=50)' FROM numa_balance AS start
And then cat /sys/kernel/tracing/events/synthetic/numa_balance/hist
Just to give you an idea.
> ```
> Sample output:
> Attaching 2 probes...
> task 'rs:main Q:Reg' migrate cost 5584655, scanned 24516, migrated 22373
> task 'systemd-journal' migrate cost 123191, scanned 6308, migrated 0
> task 'wrk' migrate cost 894026, scanned 5842, migrated 5841
>
> Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
> ---
> include/trace/events/sched.h | 60 ++++++++++++++++++++++++++++++++++++
> kernel/sched/fair.c | 14 +++++++--
> 2 files changed, 72 insertions(+), 2 deletions(-)
>
> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index 7b2645b50e78..e24bf700a614 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -804,6 +804,66 @@ TRACE_EVENT(sched_skip_cpuset_numa,
> __entry->ngid,
> MAX_NUMNODES, __entry->mem_allowed)
> );
> +
> +TRACE_EVENT(sched_numa_balance_start,
> +
> + TP_PROTO(struct task_struct *tsk),
> +
> + TP_ARGS(tsk),
> +
> + TP_STRUCT__entry(
> + __array(char, comm, TASK_COMM_LEN)
Please use __string() and not __array(). I'm trying to get rid of these for
task comm.
> + __field(pid_t, pid)
> + __field(pid_t, tgid)
> + __field(pid_t, ngid)
> + ),
> +
> + TP_fast_assign(
> + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
> + __entry->pid = task_pid_nr(tsk);
> + __entry->tgid = task_tgid_nr(tsk);
> + __entry->ngid = task_numa_group_id(tsk);
> + ),
> +
> + TP_printk("comm=%s pid=%d tgid=%d ngid=%d",
> + __entry->comm,
> + __entry->pid,
> + __entry->tgid,
> + __entry->ngid)
> +);
> +
> +TRACE_EVENT(sched_numa_balance_end,
> +
> + TP_PROTO(struct task_struct *tsk, unsigned long scanned, unsigned long migrated),
> +
> + TP_ARGS(tsk, scanned, migrated),
> +
> + TP_STRUCT__entry(
> + __array(char, comm, TASK_COMM_LEN)
> + __field(pid_t, pid)
> + __field(pid_t, tgid)
> + __field(pid_t, ngid)
> + __field(unsigned long, migrated)
> + __field(unsigned long, scanned)
> + ),
> +
> + TP_fast_assign(
> + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
> + __entry->pid = task_pid_nr(tsk);
> + __entry->tgid = task_tgid_nr(tsk);
> + __entry->ngid = task_numa_group_id(tsk);
> + __entry->migrated = migrated;
> + __entry->scanned = scanned;
> + ),
> +
> + TP_printk("comm=%s pid=%d tgid=%d ngid=%d scanned=%lu migrated=%lu",
> + __entry->comm,
> + __entry->pid,
> + __entry->tgid,
> + __entry->ngid,
> + __entry->scanned,
> + __entry->migrated)
> +);
> #endif /* CONFIG_NUMA_BALANCING */
>
> /*
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 25970dbbb279..173c9c8397e2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3294,6 +3294,9 @@ static void task_numa_work(struct callback_head *work)
> struct vm_area_struct *vma;
> unsigned long start, end;
> unsigned long nr_pte_updates = 0;
> + unsigned long nr_scanned = 0;
> + unsigned long total_migrated = 0;
> + unsigned long total_scanned = 0;
> long pages, virtpages;
> struct vma_iterator vmi;
> bool vma_pids_skipped;
> @@ -3359,6 +3362,7 @@ static void task_numa_work(struct callback_head *work)
> if (!mmap_read_trylock(mm))
> return;
>
> + trace_sched_numa_balance_start(p);
> /*
> * VMAs are skipped if the current PID has not trapped a fault within
> * the VMA recently. Allow scanning to be forced if there is no
> @@ -3477,6 +3481,10 @@ static void task_numa_work(struct callback_head *work)
> end = min(end, vma->vm_end);
> nr_pte_updates = change_prot_numa(vma, start, end);
>
> + nr_scanned = (end - start) >> PAGE_SHIFT;
> + total_migrated += nr_pte_updates;
> + total_scanned += nr_scanned;
> +
This will require the scheduler maintainers to agree on this for acceptance.
Will kprobes not do?
-- Steve
> /*
> * Try to scan sysctl_numa_balancing_size worth of
> * hpages that have at least one present PTE that
> @@ -3486,8 +3494,8 @@ static void task_numa_work(struct callback_head *work)
> * areas faster.
> */
> if (nr_pte_updates)
> - pages -= (end - start) >> PAGE_SHIFT;
> - virtpages -= (end - start) >> PAGE_SHIFT;
> + pages -= nr_scanned;
> + virtpages -= nr_scanned;
>
> start = end;
> if (pages <= 0 || virtpages <= 0)
> @@ -3528,6 +3536,8 @@ static void task_numa_work(struct callback_head *work)
> mm->numa_scan_offset = start;
> else
> reset_ptenuma_scan(p);
> +
> + trace_sched_numa_balance_end(p, total_scanned, total_migrated);
> mmap_read_unlock(mm);
>
> /*
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2025-11-24 20:32 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-10-29 13:22 [PATCH v1] sched/numa: Add tracepoint to track NUMA migration cost Jiayuan Chen
2025-11-24 20:33 ` Steven Rostedt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox