linux-trace-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v1] sched/numa: Add tracepoint to track NUMA migration cost
@ 2025-10-29 13:22 Jiayuan Chen
  0 siblings, 0 replies; only message in thread
From: Jiayuan Chen @ 2025-10-29 13:22 UTC (permalink / raw)
  To: linux-kernel
  Cc: Jiayuan Chen, Jiayuan Chen, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Ben Segall, Mel Gorman,
	Valentin Schneider, Andrii Nakryiko, Oleg Nesterov,
	Gabriele Monaco, Libo Chen, linux-trace-kernel

From: Jiayuan Chen <jiayuan.chen@shopee.com>

In systems with multiple NUMA nodes, memory imbalance between nodes often
occurs.  To address this, we typically tune parameters like scan_size_mb or
scan_period_{min,max}_ms to allow processes to migrate pages between NUMA
nodes.

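Currently, the migration work, task_numa_work(), holds the mmap_lock during
the entire migration process, which can significantly impact process
performance, especially for memory operations. This patch introduces a pair
of tracepoints, sched_numa_balance_start and sched_numa_balance_end, that
bracket the region where the lock is held so that its duration can be
measured. The end tracepoint also reports the number of scanned pages and
migrated pages (the latter is accumulated from the PTE-update count returned
by change_prot_numa()). These metrics can be used to calculate efficiency
metrics similar to %vmeff in 'sar -B'.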

These metrics help evaluate whether the adjusted NUMA balancing parameters
are properly tuned.

Here's an example bpftrace script:
```bash

bpftrace -e '
// Measure how long each NUMA balancing pass takes by pairing the start and
// end tracepoints. The timestamp is keyed by the pid field that both events
// record, not by CPU: the pass runs while holding mmap_lock, so the task may
// be preempted or moved to another CPU before the end event fires, and a
// per-CPU key would then lose or mis-attribute the measurement.
tracepoint:sched:sched_numa_balance_start
{
    @start_time[args.pid] = nsecs;
}

tracepoint:sched:sched_numa_balance_end
{
    $start = @start_time[args.pid];
    // A missing key reads as 0, which means the start event was not seen
    // (for example, the script was attached in the middle of a pass).
    if ($start > 0) {
        $cost = nsecs - $start;
        // The backslash-quote sequences around %s close the shell string,
        // emit a literal single quote, and reopen the string, so bpftrace
        // really sees quotes around the task name.
        printf("task '\''%s'\'' migrate cost %lu, scanned %lu, migrated %lu\n",
               args.comm, $cost, args.scanned, args.migrated);
        // Drop the entry so the map does not grow with every task seen.
        delete(@start_time[args.pid]);
    }
}
'
```
Sample output:
Attaching 2 probes...
task 'rs:main Q:Reg' migrate cost 5584655, scanned 24516, migrated 22373
task 'systemd-journal' migrate cost 123191, scanned 6308, migrated 0
task 'wrk' migrate cost 894026, scanned 5842, migrated 5841

Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
---
 include/trace/events/sched.h | 60 ++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          | 14 +++++++--
 2 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 7b2645b50e78..e24bf700a614 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -804,6 +804,66 @@ TRACE_EVENT(sched_skip_cpuset_numa,
 		  __entry->ngid,
 		  MAX_NUMNODES, __entry->mem_allowed)
 );
+
+TRACE_EVENT(sched_numa_balance_start,
+
+	TP_PROTO(struct task_struct *tsk),
+
+	TP_ARGS(tsk),
+
+	TP_STRUCT__entry(
+		__array(char,	comm, TASK_COMM_LEN)
+		__field(pid_t,	pid)
+		__field(pid_t,	tgid)
+		__field(pid_t,	ngid)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+		__entry->pid		 = task_pid_nr(tsk);
+		__entry->tgid		 = task_tgid_nr(tsk);
+		__entry->ngid		 = task_numa_group_id(tsk);
+	),
+
+	TP_printk("comm=%s pid=%d tgid=%d ngid=%d",
+		  __entry->comm,
+		  __entry->pid,
+		  __entry->tgid,
+		  __entry->ngid)
+);
+
+TRACE_EVENT(sched_numa_balance_end,
+
+	TP_PROTO(struct task_struct *tsk, unsigned long scanned, unsigned long migrated),
+
+	TP_ARGS(tsk, scanned, migrated),
+
+	TP_STRUCT__entry(
+		__array(char,		comm, TASK_COMM_LEN)
+		__field(pid_t,		pid)
+		__field(pid_t,		tgid)
+		__field(pid_t,		ngid)
+		__field(unsigned long,	migrated)
+		__field(unsigned long,	scanned)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+		__entry->pid		 = task_pid_nr(tsk);
+		__entry->tgid		 = task_tgid_nr(tsk);
+		__entry->ngid		 = task_numa_group_id(tsk);
+		__entry->migrated	 = migrated;
+		__entry->scanned	 = scanned;
+	),
+
+	TP_printk("comm=%s pid=%d tgid=%d ngid=%d scanned=%lu migrated=%lu",
+		  __entry->comm,
+		  __entry->pid,
+		  __entry->tgid,
+		  __entry->ngid,
+		  __entry->scanned,
+		  __entry->migrated)
+);
 #endif /* CONFIG_NUMA_BALANCING */
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 25970dbbb279..173c9c8397e2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3294,6 +3294,9 @@ static void task_numa_work(struct callback_head *work)
 	struct vm_area_struct *vma;
 	unsigned long start, end;
 	unsigned long nr_pte_updates = 0;
+	unsigned long nr_scanned = 0;
+	unsigned long total_migrated = 0;
+	unsigned long total_scanned = 0;
 	long pages, virtpages;
 	struct vma_iterator vmi;
 	bool vma_pids_skipped;
@@ -3359,6 +3362,7 @@ static void task_numa_work(struct callback_head *work)
 	if (!mmap_read_trylock(mm))
 		return;
 
+	trace_sched_numa_balance_start(p);
 	/*
 	 * VMAs are skipped if the current PID has not trapped a fault within
 	 * the VMA recently. Allow scanning to be forced if there is no
@@ -3477,6 +3481,10 @@ static void task_numa_work(struct callback_head *work)
 			end = min(end, vma->vm_end);
 			nr_pte_updates = change_prot_numa(vma, start, end);
 
+			nr_scanned = (end - start) >> PAGE_SHIFT;
+			total_migrated += nr_pte_updates;
+			total_scanned += nr_scanned;
+
 			/*
 			 * Try to scan sysctl_numa_balancing_size worth of
 			 * hpages that have at least one present PTE that
@@ -3486,8 +3494,8 @@ static void task_numa_work(struct callback_head *work)
 			 * areas faster.
 			 */
 			if (nr_pte_updates)
-				pages -= (end - start) >> PAGE_SHIFT;
-			virtpages -= (end - start) >> PAGE_SHIFT;
+				pages -= nr_scanned;
+			virtpages -= nr_scanned;
 
 			start = end;
 			if (pages <= 0 || virtpages <= 0)
@@ -3528,6 +3536,8 @@ static void task_numa_work(struct callback_head *work)
 		mm->numa_scan_offset = start;
 	else
 		reset_ptenuma_scan(p);
+
+	trace_sched_numa_balance_end(p, total_scanned, total_migrated);
 	mmap_read_unlock(mm);
 
 	/*
-- 
2.43.0


^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2025-10-29 13:23 UTC | newest]

Thread overview: (only message) (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2025-10-29 13:22 [PATCH v1] sched/numa: Add tracepoint to track NUMA migration cost Jiayuan Chen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).