From: Tim Chen <tim.c.chen@linux.intel.com>
To: Peter Zijlstra <peterz@infradead.org>,
Ingo Molnar <mingo@redhat.com>,
K Prateek Nayak <kprateek.nayak@amd.com>,
Vincent Guittot <vincent.guittot@linaro.org>
Cc: Chen Yu <yu.c.chen@intel.com>, Juri Lelli <juri.lelli@redhat.com>,
Dietmar Eggemann <dietmar.eggemann@arm.com>,
Steven Rostedt <rostedt@goodmis.org>,
Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
Valentin Schneider <vschneid@redhat.com>,
Madadi Vineeth Reddy <vineethr@linux.ibm.com>,
Hillf Danton <hdanton@sina.com>,
Shrikanth Hegde <sshegde@linux.ibm.com>,
Jianyong Wu <jianyong.wu@outlook.com>,
Yangyu Chen <cyy@cyyself.name>,
Tingyin Duan <tingyin.duan@gmail.com>,
Vern Hao <vernhao@tencent.com>, Vern Hao <haoxing990@gmail.com>,
Len Brown <len.brown@intel.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
Aubrey Li <aubrey.li@intel.com>, Zhao Liu <zhao1.liu@intel.com>,
Chen Yu <yu.chen.surf@gmail.com>,
Adam Li <adamli@os.amperecomputing.com>,
Aaron Lu <ziqianlu@bytedance.com>,
Tim Chen <tim.c.chen@intel.com>, Josh Don <joshdon@google.com>,
Gavin Guo <gavinguo@igalia.com>,
Qais Yousef <qyousef@layalina.io>,
Libo Chen <libchen@purestorage.com>,
Luo Gengkun <luogengkun2@huawei.com>,
linux-kernel@vger.kernel.org
Subject: [Patch v4 04/16] sched/cache: Calculate the LLC size and store it in sched_domain
Date: Wed, 13 May 2026 13:39:15 -0700 [thread overview]
Message-ID: <37afee09ff608034da0ce149e72d33b6f4698edf.1778703694.git.tim.c.chen@linux.intel.com> (raw)
In-Reply-To: <cover.1778703694.git.tim.c.chen@linux.intel.com>
From: Chen Yu <yu.c.chen@intel.com>
Cache aware scheduling needs to know the LLC size that a process
can use, so as to prevent memory-intensive tasks from being
over-aggregated on a single LLC.
As a preparation step, add get_effective_llc_bytes() to get the
LLC size that a CPU can use. The function can be further
enhanced by subtracting the LLC cache ways reserved by resctrl
(CAT in Intel RDT, etc.).
Tested-by: Tingyin Duan <tingyin.duan@gmail.com>
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
drivers/base/cacheinfo.c | 23 ++++++++
include/linux/cacheinfo.h | 1 +
include/linux/sched/topology.h | 7 +++
kernel/sched/topology.c | 98 ++++++++++++++++++++++++++++++++--
4 files changed, 126 insertions(+), 3 deletions(-)
diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index 391ac5e3d2f5..70701d3bc81c 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -17,6 +17,7 @@
#include <linux/init.h>
#include <linux/of.h>
#include <linux/sched.h>
+#include <linux/sched/topology.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/sysfs.h>
@@ -68,6 +69,24 @@ bool last_level_cache_is_valid(unsigned int cpu)
}
+/*
+ * Return @cpu's last-level cacheinfo, or NULL if it is not valid or not a
+ * data/unified cache. Derived from update_per_cpu_data_slice_size_cpu().
+ */
+struct cacheinfo *get_cpu_cacheinfo_llc(unsigned int cpu)
+{
+ struct cacheinfo *llc;
+
+ if (!last_level_cache_is_valid(cpu))
+ return NULL;
+
+ llc = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1);
+ if (llc->type != CACHE_TYPE_DATA && llc->type != CACHE_TYPE_UNIFIED)
+ return NULL;
+
+ return llc;
+}
+
bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y)
{
struct cacheinfo *llc_x, *llc_y;
@@ -1018,6 +1037,7 @@ static int cacheinfo_cpu_online(unsigned int cpu)
goto err;
if (cpu_map_shared_cache(true, cpu, &cpu_map))
update_per_cpu_data_slice_size(true, cpu, cpu_map);
+ sched_update_llc_bytes(cpu);
return 0;
err:
free_cache_attributes(cpu);
@@ -1036,6 +1056,9 @@ static int cacheinfo_cpu_pre_down(unsigned int cpu)
free_cache_attributes(cpu);
if (nr_shared > 1)
update_per_cpu_data_slice_size(false, cpu, cpu_map);
+
+ sched_update_llc_bytes(cpu);
+
return 0;
}
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index c8f4f0a0b874..fc879ac4cc4f 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -89,6 +89,7 @@ int populate_cache_leaves(unsigned int cpu);
int cache_setup_acpi(unsigned int cpu);
bool last_level_cache_is_valid(unsigned int cpu);
bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y);
+struct cacheinfo *get_cpu_cacheinfo_llc(unsigned int cpu);
int fetch_cache_info(unsigned int cpu);
int detect_cache_attributes(unsigned int cpu);
#ifndef CONFIG_ACPI_PPTT
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 0036d6b4bd67..fe09d3268bc9 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -106,6 +106,7 @@ struct sched_domain {
#ifdef CONFIG_SCHED_CACHE
unsigned int llc_max;
unsigned int *llc_counts __counted_by_ptr(llc_max);
+ unsigned long llc_bytes;
#endif
#ifdef CONFIG_SCHEDSTATS
@@ -265,4 +266,10 @@ static inline int task_node(const struct task_struct *p)
return cpu_to_node(task_cpu(p));
}
+#ifdef CONFIG_SCHED_CACHE
+extern void sched_update_llc_bytes(unsigned int cpu);
+#else
+static inline void sched_update_llc_bytes(unsigned int cpu) { }
+#endif
+
#endif /* _LINUX_SCHED_TOPOLOGY_H */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9fc99346ef4f..7248a7279abe 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -776,9 +776,11 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
/* move buffer to parent as child is being destroyed */
sd->llc_counts = tmp->llc_counts;
sd->llc_max = tmp->llc_max;
+ sd->llc_bytes = tmp->llc_bytes;
/* make sure destroy_sched_domain() does not free it */
tmp->llc_counts = NULL;
tmp->llc_max = 0;
+ tmp->llc_bytes = 0;
#endif
/*
* sched groups hold the flags of the child sched
@@ -831,10 +833,42 @@ DEFINE_STATIC_KEY_FALSE(sched_cache_active);
/* user wants cache aware scheduling [0 or 1] */
int sysctl_sched_cache_user = 1;
+/*
+ * Get the effective LLC size in bytes that @cpu's bottom sched_domain
+ * can use. A CPU within a cpuset partition can only use a proportion
+ * of the physical LLC, scaled by the ratio of the partition's span
+ * weight to the hardware LLC sharing weight. @sd should be the
+ * topmost domain with SD_SHARE_LLC.
+ *
+ * Returns 0 if cacheinfo is not yet populated or the LLC's
+ * shared_cpu_map is empty. The former happens during early boot
+ * when build_sched_domains() runs before the generic cacheinfo
+ * framework has been initialized (cacheinfo_cpu_online() is a
+ * device_initcall cpuhp callback). cacheinfo_cpu_online() will
+ * later call sched_update_llc_bytes() to fill in the bottom
+ * domain's llc_bytes once the cache attributes are available.
+ */
+static unsigned long get_effective_llc_bytes(int cpu,
+ struct sched_domain *sd)
+{
+ struct cacheinfo *ci;
+ unsigned int hw_weight;
+
+ ci = get_cpu_cacheinfo_llc(cpu);
+ if (!ci)
+ return 0;
+
+ hw_weight = cpumask_weight(&ci->shared_cpu_map);
+ if (!hw_weight)
+ return 0;
+
+ return div_u64((u64)ci->size * sd->span_weight, hw_weight);
+}
+
static bool alloc_sd_llc(const struct cpumask *cpu_map,
struct s_data *d)
{
- struct sched_domain *sd;
+ struct sched_domain *sd, *top_llc, *parent;
unsigned int *p;
int i;
@@ -848,8 +882,24 @@ static bool alloc_sd_llc(const struct cpumask *cpu_map,
if (!p)
goto err;
- sd->llc_max = max_lid + 1;
- sd->llc_counts = p;
+ top_llc = sd;
+ /*
+ * Find the topmost SD_SHARE_LLC domain.
+ * Not yet attached to the CPU, so per_cpu(sd_llc, i)
+ * can not be used.
+ */
+ while ((parent = rcu_dereference_protected(top_llc->parent, true)) &&
+ (parent->flags & SD_SHARE_LLC))
+ top_llc = parent;
+
+ if (top_llc->flags & SD_SHARE_LLC) {
+ sd->llc_max = max_lid + 1;
+ sd->llc_counts = p;
+ sd->llc_bytes = get_effective_llc_bytes(i, top_llc);
+ } else {
+ /* avoid memory leak */
+ kfree(p);
+ }
}
return true;
@@ -860,6 +910,7 @@ static bool alloc_sd_llc(const struct cpumask *cpu_map,
kfree(sd->llc_counts);
sd->llc_counts = NULL;
sd->llc_max = 0;
+ sd->llc_bytes = 0;
}
}
@@ -919,6 +970,47 @@ void sched_cache_active_set_unlocked(void)
{
return sched_cache_active_set(false);
}
+
+/*
+ * Update the bottom sched_domain's llc_bytes for @cpu and all its
+ * LLC siblings. Called from cacheinfo_cpu_online() or
+ * cacheinfo_cpu_pre_down() with cpu hotplug lock held.
+ *
+ * Note: get_effective_llc_bytes() returns 0 on PowerPC,
+ * thus cache aware scheduling is disabled on PowerPC for
+ * now. PowerPC does not use the generic cacheinfo framework --
+ * it has its own cacheinfo with a separate struct cache hierarchy
+ * and does not populate the per-CPU struct cpu_cacheinfo array
+ * that get_cpu_cacheinfo_llc() reads.
+ */
+void sched_update_llc_bytes(unsigned int cpu)
+{
+ struct sched_domain *sd, *sdp;
+ unsigned int i;
+
+ sched_domains_mutex_lock();
+
+ sdp = rcu_dereference_sched_domain(per_cpu(sd_llc, cpu));
+ if (!sdp)
+ goto unlock;
+
+ /*
+ * ci->shared_cpu_map is built incrementally as CPUs come
+ * online, so the first CPU in an LLC initially sees
+ * hw_weight == 1 and computes an inflated llc_bytes in
+ * get_effective_llc_bytes(). Re-evaluating every LLC
+ * sibling on each online event corrects this once the full
+ * shared_cpu_map is known.
+ */
+ for_each_cpu(i, sched_domain_span(sdp)) {
+ sd = rcu_dereference_sched_domain(cpu_rq(i)->sd);
+ if (sd)
+ sd->llc_bytes = get_effective_llc_bytes(i, sdp);
+ }
+
+unlock:
+ sched_domains_mutex_unlock();
+}
#else
static bool alloc_sd_llc(const struct cpumask *cpu_map,
struct s_data *d)
--
2.32.0
next prev parent reply other threads:[~2026-05-13 20:33 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-13 20:39 [Patch v4 00/16] Cache aware scheduling enhancements Tim Chen
2026-05-13 20:39 ` [Patch v4 01/16] sched/cache: Allow only 1 thread of the process to calculate the LLC occupancy Tim Chen
2026-05-13 20:39 ` [Patch v4 02/16] sched/cache: Disable cache aware scheduling for processes with high thread counts Tim Chen
2026-05-13 20:39 ` [Patch v4 03/16] sched/cache: Skip cache-aware scheduling for single-threaded processes Tim Chen
2026-05-13 20:39 ` Tim Chen [this message]
2026-05-13 20:39 ` [Patch v4 05/16] sched/cache: Avoid cache-aware scheduling for memory-heavy processes Tim Chen
2026-05-13 20:39 ` [Patch v4 06/16] sched/cache: Add user control to adjust the aggressiveness of cache-aware scheduling Tim Chen
2026-05-13 20:39 ` [Patch v4 07/16] sched/cache: Fix rcu warning when accessing sd_llc domain Tim Chen
2026-05-13 20:39 ` [Patch v4 08/16] sched/cache: Fix potential NULL mm pointer access Tim Chen
2026-05-13 20:39 ` [Patch v4 09/16] sched/cache: Annotate lockless accesses to mm->sc_stat.cpu Tim Chen
2026-05-13 20:39 ` [Patch v4 10/16] sched/cache: Fix unpaired account_llc_enqueue/dequeue Tim Chen
2026-05-13 20:39 ` [Patch v4 11/16] sched/cache: Fix checking active load balance by only considering the CFS task Tim Chen
2026-05-13 20:39 ` [Patch v4 12/16] sched/cache: Fix race condition during sched domain rebuild Tim Chen
2026-05-13 20:39 ` [Patch v4 13/16] sched/cache: Fix cache aware scheduling enabling for multi LLCs system Tim Chen
2026-05-13 20:39 ` [Patch v4 14/16] sched/cache: Fix has_multi_llcs iff at least one partition has multiple LLCs Tim Chen
2026-05-13 20:39 ` [Patch v4 15/16] sched/cache: Fix possible overflow when invalidating the preferred CPU Tim Chen
2026-05-13 20:39 ` [Patch v4 16/16] sched/cache: Fix stale preferred_llc for a new task Tim Chen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=37afee09ff608034da0ce149e72d33b6f4698edf.1778703694.git.tim.c.chen@linux.intel.com \
--to=tim.c.chen@linux.intel.com \
--cc=adamli@os.amperecomputing.com \
--cc=aubrey.li@intel.com \
--cc=bsegall@google.com \
--cc=cyy@cyyself.name \
--cc=dietmar.eggemann@arm.com \
--cc=gavinguo@igalia.com \
--cc=haoxing990@gmail.com \
--cc=hdanton@sina.com \
--cc=jianyong.wu@outlook.com \
--cc=joshdon@google.com \
--cc=juri.lelli@redhat.com \
--cc=kprateek.nayak@amd.com \
--cc=len.brown@intel.com \
--cc=libchen@purestorage.com \
--cc=linux-kernel@vger.kernel.org \
--cc=luogengkun2@huawei.com \
--cc=mgorman@suse.de \
--cc=mingo@redhat.com \
--cc=peterz@infradead.org \
--cc=qyousef@layalina.io \
--cc=rostedt@goodmis.org \
--cc=sshegde@linux.ibm.com \
--cc=tim.c.chen@intel.com \
--cc=tingyin.duan@gmail.com \
--cc=vernhao@tencent.com \
--cc=vincent.guittot@linaro.org \
--cc=vineethr@linux.ibm.com \
--cc=vschneid@redhat.com \
--cc=yu.c.chen@intel.com \
--cc=yu.chen.surf@gmail.com \
--cc=zhao1.liu@intel.com \
--cc=ziqianlu@bytedance.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox