From: Tim Chen <tim.c.chen@linux.intel.com>
To: Peter Zijlstra <peterz@infradead.org>,
Ingo Molnar <mingo@redhat.com>,
K Prateek Nayak <kprateek.nayak@amd.com>,
Vincent Guittot <vincent.guittot@linaro.org>
Cc: Chen Yu <yu.c.chen@intel.com>, Juri Lelli <juri.lelli@redhat.com>,
Dietmar Eggemann <dietmar.eggemann@arm.com>,
Steven Rostedt <rostedt@goodmis.org>,
Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
Valentin Schneider <vschneid@redhat.com>,
Madadi Vineeth Reddy <vineethr@linux.ibm.com>,
Hillf Danton <hdanton@sina.com>,
Shrikanth Hegde <sshegde@linux.ibm.com>,
Jianyong Wu <jianyong.wu@outlook.com>,
Yangyu Chen <cyy@cyyself.name>,
Tingyin Duan <tingyin.duan@gmail.com>,
Vern Hao <vernhao@tencent.com>, Vern Hao <haoxing990@gmail.com>,
Len Brown <len.brown@intel.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
Aubrey Li <aubrey.li@intel.com>, Zhao Liu <zhao1.liu@intel.com>,
Chen Yu <yu.chen.surf@gmail.com>,
Adam Li <adamli@os.amperecomputing.com>,
Aaron Lu <ziqianlu@bytedance.com>,
Tim Chen <tim.c.chen@intel.com>, Josh Don <joshdon@google.com>,
Gavin Guo <gavinguo@igalia.com>,
Qais Yousef <qyousef@layalina.io>,
Libo Chen <libchen@purestorage.com>,
Luo Gengkun <luogengkun2@huawei.com>,
linux-kernel@vger.kernel.org
Subject: [Patch v4 05/16] sched/cache: Avoid cache-aware scheduling for memory-heavy processes
Date: Wed, 13 May 2026 13:39:16 -0700
Message-ID: <95cf64a385bcc12f18dcebe9d59e8d3ba8bb318f.1778703694.git.tim.c.chen@linux.intel.com>
In-Reply-To: <cover.1778703694.git.tim.c.chen@linux.intel.com>
From: Chen Yu <yu.c.chen@intel.com>
Prateek and Tingyin reported that memory-intensive workloads (such as
STREAM) can saturate the memory bandwidth and caches of the preferred
LLC when sched_cache aggregates too many of a process's threads there.

To mitigate this, estimate a process's memory footprint from its NUMA
balancing fault statistics and compare it against the size of the LLC.
If the footprint exceeds the LLC size, skip cache-aware scheduling for
that process.

Note that this is only an approximation of the real memory footprint,
since the kernel lacks a suitable metric for estimating the actual
working set. A user-provided hint, should one become available in the
future, would be more accurate; a later patch will let users adjust
this threshold.
Tested-by: Tingyin Duan <tingyin.duan@gmail.com>
Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Vern Hao <vernhao@tencent.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
include/linux/sched.h | 1 +
kernel/exit.c | 29 ++++++++++++++++++++
kernel/sched/fair.c | 62 ++++++++++++++++++++++++++++++++++++++++---
3 files changed, 89 insertions(+), 3 deletions(-)
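
A simplified sketch of the check this patch adds (illustrative only;
the helper name is hypothetical, and the real check is
exceed_llc_capacity() in the fair.c hunk below). The footprint is a
count of pages derived from NUMA balancing faults, while the LLC size
is in bytes, so the footprint is scaled by PAGE_SIZE before comparing:

	/*
	 * Sketch: decide whether a process's estimated footprint
	 * still fits in the LLC. footprint_pages approximates the
	 * working set in pages; llc_bytes is the LLC size in bytes.
	 */
	static bool footprint_exceeds_llc(unsigned long footprint_pages,
					  unsigned long llc_bytes)
	{
		/* e.g. 16384 pages * 4 KiB/page = 64 MiB > a 32 MiB LLC */
		return llc_bytes < footprint_pages * PAGE_SIZE;
	}
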
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6701911eaaf7..95729670929c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2425,6 +2425,7 @@ struct sched_cache_stat {
unsigned long epoch;
u64 nr_running_avg;
unsigned long next_scan;
+ unsigned long footprint;
int cpu;
} ____cacheline_aligned_in_smp;
diff --git a/kernel/exit.c b/kernel/exit.c
index ede3117fa7d4..77275c26a2a1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -543,6 +543,32 @@ void mm_update_next_owner(struct mm_struct *mm)
}
#endif /* CONFIG_MEMCG */
+#if defined(CONFIG_SCHED_CACHE) && defined(CONFIG_NUMA_BALANCING)
+/*
+ * Subtract the exiting task's contribution from the mm's
+ * accumulated memory footprint.
+ */
+static void exit_mm_sched_cache(struct mm_struct *mm)
+{
+ unsigned long fp, sub;
+
+ if (!current->total_numa_faults)
+ return;
+ /*
+ * No locking, for performance. Clamp the decrement so the
+ * unsigned mm->sc_stat.footprint cannot wrap below zero.
+ */
+ fp = READ_ONCE(mm->sc_stat.footprint);
+ sub = min(fp, current->total_numa_faults);
+ WRITE_ONCE(mm->sc_stat.footprint, fp - sub);
+}
+#else
+static inline void exit_mm_sched_cache(struct mm_struct *mm)
+{
+}
+#endif /* CONFIG_SCHED_CACHE && CONFIG_NUMA_BALANCING */
+
/*
* Turn us into a lazy TLB process if we
* aren't already..
@@ -554,6 +580,9 @@ static void exit_mm(void)
exit_mm_release(current, mm);
if (!mm)
return;
+
+ exit_mm_sched_cache(mm);
+
mmap_read_lock(mm);
mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
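
The subtraction above is deliberately lockless and can race with the
accumulation in task_numa_placement(); the min() clamp is what keeps
the unsigned counter from wrapping. A standalone sketch of the pattern
(hypothetical helper, not part of the patch):

	/*
	 * Sketch: lockless decrement of an unsigned counter. A
	 * racing writer may have lowered the counter after it was
	 * read, so clamp the decrement to the value observed; a
	 * lost update is tolerable, but the stored value can never
	 * wrap below zero.
	 */
	static void footprint_sub(unsigned long *counter, unsigned long dec)
	{
		unsigned long cur = READ_ONCE(*counter);

		WRITE_ONCE(*counter, cur - min(cur, dec));
	}
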
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df21366ba1ca..a10116ffe0d1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1384,6 +1384,32 @@ static int llc_id(int cpu)
return per_cpu(sd_llc_id, cpu);
}
+static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
+{
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned long llc, footprint;
+ struct sched_domain *sd;
+
+ guard(rcu)();
+
+ sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd);
+ if (!sd)
+ return true;
+
+ if (static_branch_likely(&sched_numa_balancing)) {
+ /*
+ * TBD: LLC ways reserved as RDT-exclusive should be
+ * excluded from this size.
+ */
+ llc = sd->llc_bytes;
+ footprint = READ_ONCE(mm->sc_stat.footprint);
+
+ return (llc < (footprint * PAGE_SIZE));
+ }
+#endif
+ return false;
+}
+
static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p,
int cpu)
{
@@ -1463,6 +1489,7 @@ void mm_init_sched(struct mm_struct *mm,
mm->sc_stat.cpu = -1;
mm->sc_stat.next_scan = jiffies;
mm->sc_stat.nr_running_avg = 0;
+ mm->sc_stat.footprint = 0;
/*
* The update to mm->sc_stat should not be reordered
* before initialization to mm's other fields, in case
@@ -1585,7 +1612,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
* its preferred state.
*/
if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT ||
- invalid_llc_nr(mm, p, cpu_of(rq))) {
+ invalid_llc_nr(mm, p, cpu_of(rq)) ||
+ exceed_llc_capacity(mm, cpu_of(rq))) {
if (mm->sc_stat.cpu != -1)
mm->sc_stat.cpu = -1;
}
@@ -1716,7 +1744,8 @@ static void task_cache_work(struct callback_head *work)
return;
curr_cpu = task_cpu(p);
- if (invalid_llc_nr(mm, p, curr_cpu)) {
+ if (invalid_llc_nr(mm, p, curr_cpu) ||
+ exceed_llc_capacity(mm, curr_cpu)) {
if (mm->sc_stat.cpu != -1)
mm->sc_stat.cpu = -1;
@@ -3515,6 +3544,7 @@ static void task_numa_placement(struct task_struct *p)
unsigned long total_faults;
u64 runtime, period;
spinlock_t *group_lock = NULL;
+ long __maybe_unused new_fp;
struct numa_group *ng;
/*
@@ -3589,6 +3619,31 @@ static void task_numa_placement(struct task_struct *p)
ng->total_faults += diff;
group_faults += ng->faults[mem_idx];
}
+#ifdef CONFIG_SCHED_CACHE
+ /*
+ * Each task's p->numa_faults[mem_idx] converges, so the
+ * sum accumulated over all tasks converges too. Given the
+ * bounded number of threads, it cannot overflow an
+ * unsigned long.
+ * The update races with concurrent updates from other
+ * threads sharing this mm. That is acceptable: the
+ * footprint is a heuristic and occasional lost updates
+ * are tolerable.
+ *
+ * When a task exits, its contribution must be subtracted
+ * from mm->sc_stat.footprint, otherwise the sum would
+ * never converge: the exiting thread's share would stay
+ * in mm->sc_stat.footprint, undecayed. See exit_mm().
+ *
+ * Lost updates and the unsynchronized subtraction in
+ * exit_mm() can drive footprint + diff negative. Clamp
+ * at zero to keep the unsigned footprint from wrapping.
+ */
+ new_fp = (long)READ_ONCE(p->mm->sc_stat.footprint) + diff;
+ WRITE_ONCE(p->mm->sc_stat.footprint,
+ max(new_fp, 0L));
+#endif
}
if (!ng) {
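
The accumulation side mirrors the exit path: diff may be negative, so
the sum is computed in a signed temporary and clamped at zero before
being stored back. A standalone sketch (hypothetical helper, not part
of the patch):

	/*
	 * Sketch: apply a possibly-negative delta to the unsigned
	 * footprint. The signed intermediate plus the clamp at
	 * zero prevents wrap when lost updates or the
	 * unsynchronized subtraction in exit_mm() drive the sum
	 * negative.
	 */
	static void footprint_add(unsigned long *counter, long diff)
	{
		long v = (long)READ_ONCE(*counter) + diff;

		WRITE_ONCE(*counter, (unsigned long)max(v, 0L));
	}
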
@@ -10338,7 +10393,8 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
return mig_unrestricted;
/* skip cache aware load balance for too many threads */
- if (invalid_llc_nr(mm, p, dst_cpu)) {
+ if (invalid_llc_nr(mm, p, dst_cpu) ||
+ exceed_llc_capacity(mm, dst_cpu)) {
if (mm->sc_stat.cpu != -1)
mm->sc_stat.cpu = -1;
return mig_unrestricted;
--
2.32.0