* [PATCH 1/3] x86/sbm: Fix domain shift calculation and sbm_find_next_bit()
2026-05-10 15:59 ` [PATCH v2 1/4] sched/rt: Optimize cpupri_vec layout to mitigate cache line contention Chen Yu
@ 2026-05-10 15:59 ` Chen Yu
2026-05-10 15:59 ` [PATCH 2/3] lib/sbm: Use dynamically sized bitmap in sbm_leaf Chen Yu
2026-05-10 15:59 ` [PATCH 3/3] x86/sbm: Derive leaf granularity from LLC cacheinfo instead of topology domain Chen Yu
2 siblings, 0 replies; 6+ messages in thread
From: Chen Yu @ 2026-05-10 15:59 UTC (permalink / raw)
To: kprateek.nayak, tim.c.chen, peterz
Cc: pan.deng, mingo, linux-kernel, tianyou.li, Chen Yu
---
arch/x86/kernel/cpu/topology.c | 2 +-
include/linux/sbm.h | 2 +-
lib/sbm.c | 6 +++---
3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 6f3d18288600..751b7517f2d5 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -565,7 +565,7 @@ void __init topology_init_possible_cpus(void)
for_each_possible_cpu(cpu)
apicid = max(apicid, cpuid_to_apicid[cpu]);
- arch_sbm_shift = x86_topo_system.dom_shifts[TOPO_DIE_DOMAIN] - 1;
+ arch_sbm_shift = x86_topo_system.dom_shifts[TOPO_DIE_DOMAIN - 1];
arch_sbm_leafs = 1 + (apicid >> arch_sbm_shift);
arch_sbm_mask = (1 << arch_sbm_shift) - 1;
arch_sbm_bits = arch_sbm_shift;
diff --git a/include/linux/sbm.h b/include/linux/sbm.h
index 8beade6c0585..a25a96366694 100644
--- a/include/linux/sbm.h
+++ b/include/linux/sbm.h
@@ -36,7 +36,7 @@ struct sbm {
};
extern struct sbm *sbm_alloc(void);
-extern unsigned int sbm_find_next_bit(struct sbm *sbm, int start);
+extern int sbm_find_next_bit(struct sbm *sbm, int start);
#define __sbm_op(sbm, func) \
({ \
diff --git a/lib/sbm.c b/lib/sbm.c
index 167cf857cd32..8006f9b04b62 100644
--- a/lib/sbm.c
+++ b/lib/sbm.c
@@ -34,7 +34,7 @@ struct sbm *sbm_alloc(void)
return NULL;
}
-unsigned int sbm_find_next_bit(struct sbm *sbm, int start)
+int sbm_find_next_bit(struct sbm *sbm, int start)
{
struct sbm_leaf *leaf = (void *)sbm;
struct sbm_root *root = (void *)sbm;
@@ -45,8 +45,8 @@ unsigned int sbm_find_next_bit(struct sbm *sbm, int start)
for (; nr < arch_sbm_leafs; nr++, mask = ~0UL) {
leaf = root->leafs[nr];
tmp = leaf->bitmap & mask;
- if (!tmp)
- continue;
+ if (tmp)
+ break;
}
} else {
tmp = leaf->bitmap & mask;
--
2.25.1
^ permalink raw reply related [flat|nested] 6+ messages in thread

* [PATCH 2/3] lib/sbm: Use dynamically sized bitmap in sbm_leaf
2026-05-10 15:59 ` [PATCH v2 1/4] sched/rt: Optimize cpupri_vec layout to mitigate cache line contention Chen Yu
2026-05-10 15:59 ` [PATCH 1/3] x86/sbm: Fix domain shift calculation and sbm_find_next_bit() Chen Yu
@ 2026-05-10 15:59 ` Chen Yu
2026-05-10 15:59 ` [PATCH 3/3] x86/sbm: Derive leaf granularity from LLC cacheinfo instead of topology domain Chen Yu
2 siblings, 0 replies; 6+ messages in thread
From: Chen Yu @ 2026-05-10 15:59 UTC (permalink / raw)
To: kprateek.nayak, tim.c.chen, peterz
Cc: pan.deng, mingo, linux-kernel, tianyou.li, Chen Yu
The original sbm_leaf uses a single unsigned long (u64) as its bitmap,
which limits each leaf to representing at most 64 CPUs. When an LLC
domain contains more than 64 logical CPUs, the within-leaf bit
position (computed as apicid & arch_sbm_mask) can exceed 63.
Since set_bit(nr, addr) treats addr as an arbitrarily long bitmap
array, set_bit(65, &leaf->bitmap) would write to (&leaf->bitmap)[1],
memory beyond the single unsigned long field. While
____cacheline_aligned padding may prevent corrupting adjacent
leaves, the bits written into the padding are never read back by
sbm_find_next_bit(), silently making those CPUs invisible.
Fix this by converting the fixed u64 bitmap to a flexible array
member (unsigned long bitmap[]) whose size is determined at
allocation time from the number of CPUs in the TILE
domain (1 << arch_sbm_shift). A subsequent patch will switch
to using the number of CPUs sharing the LLC rather than the
TILE domain.
---
include/linux/sbm.h | 5 +++--
lib/sbm.c | 28 +++++++++++++++++-----------
2 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/include/linux/sbm.h b/include/linux/sbm.h
index a25a96366694..8d60f4bc7004 100644
--- a/include/linux/sbm.h
+++ b/include/linux/sbm.h
@@ -28,7 +28,8 @@ struct sbm_root {
struct sbm_leaf {
enum sbm_type type;
- unsigned long bitmap;
+ unsigned int nbits;
+ unsigned long bitmap[];
} ____cacheline_aligned;
struct sbm {
@@ -48,7 +49,7 @@ extern int sbm_find_next_bit(struct sbm *sbm, int start);
leaf = root->leafs[nr]; \
} \
int bit = idx & arch_sbm_mask; \
- func(bit, &leaf->bitmap); \
+ func(bit, leaf->bitmap); \
})
static inline void sbm_cpu_set(struct sbm *sbm, int cpu)
diff --git a/lib/sbm.c b/lib/sbm.c
index 8006f9b04b62..76670ce14291 100644
--- a/lib/sbm.c
+++ b/lib/sbm.c
@@ -4,6 +4,8 @@
struct sbm *sbm_alloc(void)
{
unsigned int nr = arch_sbm_leafs;
+ unsigned int nbits = 1U << arch_sbm_shift;
+ unsigned int nlongs = BITS_TO_LONGS(nbits);
struct sbm_root *root = kzalloc_flex(*root, leafs, nr);
struct sbm_leaf *leaf;
if (!root)
@@ -12,10 +14,12 @@ struct sbm *sbm_alloc(void)
root->type = st_root;
for (int i = 0; i < nr; i++) {
- leaf = kzalloc_obj(*leaf);
+ leaf = kzalloc(struct_size(leaf, bitmap, nlongs),
+ GFP_KERNEL);
if (!leaf)
goto fail;
leaf->type = st_leaf;
+ leaf->nbits = nbits;
root->leafs[i] = leaf;
}
@@ -40,18 +44,20 @@ int sbm_find_next_bit(struct sbm *sbm, int start)
struct sbm_root *root = (void *)sbm;
int nr = start >> arch_sbm_shift;
int bit = start & arch_sbm_mask;
- unsigned long tmp, mask = (~0UL) << bit;
+ unsigned int found;
+
if (sbm->type == st_root) {
- for (; nr < arch_sbm_leafs; nr++, mask = ~0UL) {
+ do {
leaf = root->leafs[nr];
- tmp = leaf->bitmap & mask;
- if (tmp)
- break;
- }
+ found = find_next_bit(leaf->bitmap, leaf->nbits, bit);
+ if (found < leaf->nbits)
+ return (nr << arch_sbm_shift) | found;
+ bit = 0;
+ } while (++nr < arch_sbm_leafs);
} else {
- tmp = leaf->bitmap & mask;
+ found = find_next_bit(leaf->bitmap, leaf->nbits, bit);
+ if (found < leaf->nbits)
+ return found;
}
- if (!tmp)
- return -1;
- return (nr << arch_sbm_shift) | __ffs(tmp);
+ return -1;
}
--
2.25.1
^ permalink raw reply related [flat|nested] 6+ messages in thread

* [PATCH 3/3] x86/sbm: Derive leaf granularity from LLC cacheinfo instead of topology domain
2026-05-10 15:59 ` [PATCH v2 1/4] sched/rt: Optimize cpupri_vec layout to mitigate cache line contention Chen Yu
2026-05-10 15:59 ` [PATCH 1/3] x86/sbm: Fix domain shift calculation and sbm_find_next_bit() Chen Yu
2026-05-10 15:59 ` [PATCH 2/3] lib/sbm: Use dynamically sized bitmap in sbm_leaf Chen Yu
@ 2026-05-10 15:59 ` Chen Yu
2026-05-11 7:48 ` K Prateek Nayak
2 siblings, 1 reply; 6+ messages in thread
From: Chen Yu @ 2026-05-10 15:59 UTC (permalink / raw)
To: kprateek.nayak, tim.c.chen, peterz
Cc: pan.deng, mingo, linux-kernel, tianyou.li
From: Tim Chen <tim.c.chen@linux.intel.com>
Instead of using topology domain shifts (TOPO_TILE_DOMAIN) which may not
align with the actual LLC boundary, derive SBM parameters from the
authoritative LLC cache identification that the kernel already performs
during boot CPU identification.
- intel_cacheinfo_0x4(): iterates CPUID 0x04 leaves, identifies L2/L3
- cacheinfo_amd_init_llc_id(): identifies LLC for AMD/Hygon CPUs
Besides, if sbm is not defined for an architecture, define a
default implementation for it similar to cpumask. Rename
arch_sbm_max_apicid to sbm_max_apicid and put it in an x86
file as it is x86-specific.
---
arch/x86/include/asm/apic.h | 1 +
arch/x86/include/asm/sbm.h | 13 ++++---------
arch/x86/kernel/cpu/cacheinfo.c | 20 +++++++++++++++++++-
arch/x86/kernel/cpu/common.c | 8 ++++++++
arch/x86/kernel/cpu/topology.c | 12 ++----------
include/linux/sbm.h | 9 +++++++--
kernel/sched/core.c | 2 ++
kernel/sched/fair.c | 6 ++++++
kernel/sched/sched.h | 1 +
kernel/sched/topology.c | 6 ++++++
lib/sbm.c | 28 ++++++++++++++++++++++++----
11 files changed, 80 insertions(+), 26 deletions(-)
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 24012a91ac1e..90406d8d6af1 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -55,6 +55,7 @@ static inline void x86_32_probe_apic(void) { }
extern u32 cpuid_to_apicid[];
extern u32 apicid_to_cpuid[];
+extern unsigned int sbm_max_apicid;
#define CPU_ACPIID_INVALID U32_MAX
diff --git a/arch/x86/include/asm/sbm.h b/arch/x86/include/asm/sbm.h
index 9a4d283347d1..f48d3a985972 100644
--- a/arch/x86/include/asm/sbm.h
+++ b/arch/x86/include/asm/sbm.h
@@ -1,12 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/apic.h>
-static __always_inline u32 arch_sbm_cpu_to_idx(unsigned int cpu)
-{
- return cpuid_to_apicid[cpu];
-}
-
-static __always_inline u32 arch_sbm_idx_to_cpu(unsigned int idx)
-{
- return apicid_to_cpuid[idx];
-}
+#define arch_sbm_cpu_to_idx(cpu) \
+ ((u32)(cpuid_to_apicid[(cpu)]))
+#define arch_sbm_idx_to_cpu(idx) \
+ ((u32)(apicid_to_cpuid[(idx)]))
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 51a95b07831f..fa59fa6828a6 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -21,6 +21,8 @@
#include <asm/smp.h>
#include <asm/tlbflush.h>
+#include <linux/sbm.h>
+
#include "cpu.h"
/* Shared last level cache maps */
@@ -317,12 +319,16 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id)
if (c->x86 < 0x17) {
/* Pre-Zen: LLC is at the node level */
c->topo.llc_id = die_id;
+ if (c == &boot_cpu_data)
+ arch_sbm_shift = topology_get_domain_shift(TOPO_DIE_DOMAIN);
} else if (c->x86 == 0x17 && c->x86_model <= 0x1F) {
/*
* Family 17h up to 1F models: LLC is at the core
* complex level. Core complex ID is ApicId[3].
*/
c->topo.llc_id = c->topo.apicid >> 3;
+ if (c == &boot_cpu_data)
+ arch_sbm_shift = 3;
} else {
/*
* Newer families: LLC ID is calculated from the number
@@ -331,8 +337,11 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id)
u32 llc_index = find_num_cache_leaves(c) - 1;
struct _cpuid4_info id4 = {};
- if (!amd_fill_cpuid4_info(llc_index, &id4))
+ if (!amd_fill_cpuid4_info(llc_index, &id4)) {
c->topo.llc_id = get_cache_id(c->topo.apicid, &id4);
+ if (c == &boot_cpu_data)
+ arch_sbm_shift = get_count_order(1 + id4.eax.split.num_threads_sharing);
+ }
}
}
@@ -346,6 +355,8 @@ void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c)
* at the core complex level. Core complex ID is ApicId[3].
*/
c->topo.llc_id = c->topo.apicid >> 3;
+ if (c == &boot_cpu_data)
+ arch_sbm_shift = 3;
}
void init_amd_cacheinfo(struct cpuinfo_x86 *c)
@@ -425,6 +436,7 @@ static bool intel_cacheinfo_0x4(struct cpuinfo_x86 *c)
struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
unsigned int l2_id = BAD_APICID, l3_id = BAD_APICID;
unsigned int l1d = 0, l1i = 0, l2 = 0, l3 = 0;
+ unsigned int llc_nthreads = 0;
if (c->cpuid_level < 4)
return false;
@@ -461,6 +473,7 @@ static bool intel_cacheinfo_0x4(struct cpuinfo_x86 *c)
case 3:
l3 = id4.size / 1024;
l3_id = calc_cache_topo_id(c, &id4);
+ llc_nthreads = 1 + id4.eax.split.num_threads_sharing;
break;
default:
break;
@@ -469,6 +482,11 @@ static bool intel_cacheinfo_0x4(struct cpuinfo_x86 *c)
c->topo.l2c_id = l2_id;
c->topo.llc_id = (l3_id == BAD_APICID) ? l2_id : l3_id;
+
+ /* Save LLC shift for SBM (boot CPU only) */
+ if (c == &boot_cpu_data && llc_nthreads)
+ arch_sbm_shift = get_count_order(llc_nthreads);
+
intel_cacheinfo_done(c, l3, l2, l1i, l1d);
return true;
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a8ff4376c286..5c590d8a3e78 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -28,6 +28,7 @@
#include <linux/stackprotector.h>
#include <linux/utsname.h>
#include <linux/efi.h>
+#include <linux/sbm.h>
#include <asm/alternative.h>
#include <asm/cmdline.h>
@@ -70,6 +71,7 @@
#include <asm/set_memory.h>
#include <asm/traps.h>
#include <asm/sev.h>
+#include <asm/apic.h>
#include <asm/tdx.h>
#include <asm/posted_intr.h>
#include <asm/runtime-const.h>
@@ -2561,6 +2563,12 @@ void __init arch_cpu_finalize_init(void)
identify_boot_cpu();
+ arch_sbm_leafs = 1 + (sbm_max_apicid >> arch_sbm_shift);
+ arch_sbm_mask = (1 << arch_sbm_shift) - 1;
+ arch_sbm_bits = arch_sbm_shift;
+ pr_info("SBM: shift(%d) leafs(%d) APIC(%x)\n",
+ arch_sbm_shift, arch_sbm_leafs, sbm_max_apicid);
+
select_idle_routine();
/*
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 751b7517f2d5..9245456791b0 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -50,10 +50,7 @@ DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly;
u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, };
u32 apicid_to_cpuid[MAX_LOCAL_APIC] = { 0 };
-u32 arch_sbm_leafs __ro_after_init;
-u32 arch_sbm_shift __ro_after_init;
-u32 arch_sbm_mask __ro_after_init;
-u32 arch_sbm_bits __ro_after_init;
+u32 sbm_max_apicid __ro_after_init;
/* Bitmaps to mark registered APICs at each topology domain */
static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init;
@@ -565,12 +562,7 @@ void __init topology_init_possible_cpus(void)
for_each_possible_cpu(cpu)
apicid = max(apicid, cpuid_to_apicid[cpu]);
- arch_sbm_shift = x86_topo_system.dom_shifts[TOPO_DIE_DOMAIN - 1];
- arch_sbm_leafs = 1 + (apicid >> arch_sbm_shift);
- arch_sbm_mask = (1 << arch_sbm_shift) - 1;
- arch_sbm_bits = arch_sbm_shift;
-
- pr_info("SBM: shift(%d) leafs(%d) APIC(%x)\n", arch_sbm_shift, arch_sbm_leafs, apicid);
+ sbm_max_apicid = apicid;
}
/*
diff --git a/include/linux/sbm.h b/include/linux/sbm.h
index 8d60f4bc7004..be940bcf1ae9 100644
--- a/include/linux/sbm.h
+++ b/include/linux/sbm.h
@@ -12,8 +12,13 @@ extern unsigned int arch_sbm_shift;
extern unsigned int arch_sbm_mask;
extern unsigned int arch_sbm_bits;
-extern unsigned int arch_sbm_cpu_to_idx(unsigned int cpu);
-extern unsigned int arch_sbm_idx_to_cpu(unsigned int idx);
+#ifndef arch_sbm_cpu_to_idx
+#define arch_sbm_cpu_to_idx(cpu) (cpu)
+#endif
+
+#ifndef arch_sbm_idx_to_cpu
+#define arch_sbm_idx_to_cpu(idx) (idx)
+#endif
enum sbm_type {
st_root = 0,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 496dff740dca..2be95fa3c002 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8567,6 +8567,8 @@ void __init sched_init_smp(void)
sched_init_dl_servers();
+ init_sched_fair_class_smp();
+
sched_smp_initialized = true;
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 474ee0362998..ae95610721b4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -14033,6 +14033,12 @@ __init void init_sched_fair_class(void)
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
nohz.next_blocked = jiffies;
+#endif
+}
+
+void __init init_sched_fair_class_smp(void)
+{
+#ifdef CONFIG_NO_HZ_COMMON
nohz.sbm = sbm_alloc();
#endif
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 43bbf0693cca..9e45396a1512 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2878,6 +2878,7 @@ extern void update_max_interval(void);
extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
+extern void init_sched_fair_class_smp(void);
extern void resched_curr(struct rq *rq);
extern void resched_curr_lazy(struct rq *rq);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 32dcddaead82..f62a10c869fa 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -5,6 +5,7 @@
#include <linux/sched/isolation.h>
#include <linux/bsearch.h>
+#include <linux/sbm.h>
#include "sched.h"
DEFINE_MUTEX(sched_domains_mutex);
@@ -21,6 +22,11 @@ void sched_domains_mutex_unlock(void)
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
+u32 arch_sbm_leafs __ro_after_init;
+u32 arch_sbm_shift __ro_after_init;
+u32 arch_sbm_mask __ro_after_init;
+u32 arch_sbm_bits __ro_after_init;
+
static int __init sched_debug_setup(char *str)
{
sched_debug_verbose = true;
diff --git a/lib/sbm.c b/lib/sbm.c
index 76670ce14291..45003e7b5621 100644
--- a/lib/sbm.c
+++ b/lib/sbm.c
@@ -1,13 +1,33 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/sbm.h>
+#include <linux/cpumask.h>
struct sbm *sbm_alloc(void)
{
- unsigned int nr = arch_sbm_leafs;
- unsigned int nbits = 1U << arch_sbm_shift;
- unsigned int nlongs = BITS_TO_LONGS(nbits);
- struct sbm_root *root = kzalloc_flex(*root, leafs, nr);
+ unsigned int nr;
+ unsigned int nbits;
+ unsigned int nlongs;
+ struct sbm_root *root;
struct sbm_leaf *leaf;
+
+ if (!arch_sbm_shift) {
+ unsigned int max_idx = num_possible_cpus();
+
+ /*
+ * unsigned long is the base unit for bitmap in sbm_leaf.
+ * Use that for default bitmap size for compact bitmap
+ * without unused bits.
+ */
+ arch_sbm_shift = BYTES_TO_BITS(sizeof(unsigned long));
+ arch_sbm_leafs = 1 + (max_idx >> arch_sbm_shift);
+ arch_sbm_mask = (1 << arch_sbm_shift) - 1;
+ arch_sbm_bits = arch_sbm_shift;
+ }
+
+ nr = arch_sbm_leafs;
+ nbits = 1U << arch_sbm_shift;
+ nlongs = BITS_TO_LONGS(nbits);
+ root = kzalloc_flex(*root, leafs, nr);
if (!root)
return NULL;
--
2.25.1
^ permalink raw reply related [flat|nested] 6+ messages in thread