* [PATCH v2 1/5] x86/numa: Store extra copy of numa_nodes_parsed
2026-03-03 10:55 [PATCH v2 0/5] x86/topo: SNC Divination Peter Zijlstra
@ 2026-03-03 10:55 ` Peter Zijlstra
2026-03-04 15:46 ` [tip: x86/urgent] " tip-bot2 for Peter Zijlstra
2026-03-03 10:55 ` [PATCH v2 2/5] x86/topo: Add topology_num_nodes_per_package() Peter Zijlstra
` (4 subsequent siblings)
5 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2026-03-03 10:55 UTC (permalink / raw)
To: x86, tglx
Cc: linux-kernel, peterz, tim.c.chen, yu.c.chen, kyle.meyer,
vinicius.gomes, brgerst, hpa, kprateek.nayak, patryk.wlazlyn,
rafael.j.wysocki, russ.anderson, zhao1.liu, tony.luck, Zhang Rui
The topology setup code needs to know the total number of physical
nodes enumerated in SRAT; however NUMA_EMU can cause the existing
numa_nodes_parsed bitmap to be fictitious. Therefore, keep a copy of
the bitmap specifically to retain the physical node count.
Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Kyle Meyer <kyle.meyer@hpe.com>
---
arch/x86/include/asm/numa.h | 6 ++++++
arch/x86/kernel/cpu/topology.c | 1 +
arch/x86/mm/numa.c | 8 ++++++++
arch/x86/mm/srat.c | 2 ++
4 files changed, 17 insertions(+)
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -22,6 +22,7 @@ extern int numa_off;
*/
extern s16 __apicid_to_node[MAX_LOCAL_APIC];
extern nodemask_t numa_nodes_parsed __initdata;
+extern nodemask_t numa_phys_nodes_parsed __initdata;
static inline void set_apicid_to_node(int apicid, s16 node)
{
@@ -48,6 +49,7 @@ extern void __init init_cpu_to_node(void
extern void numa_add_cpu(unsigned int cpu);
extern void numa_remove_cpu(unsigned int cpu);
extern void init_gi_nodes(void);
+extern int num_phys_nodes(void);
#else /* CONFIG_NUMA */
static inline void numa_set_node(int cpu, int node) { }
static inline void numa_clear_node(int cpu) { }
@@ -55,6 +57,10 @@ static inline void init_cpu_to_node(void
static inline void numa_add_cpu(unsigned int cpu) { }
static inline void numa_remove_cpu(unsigned int cpu) { }
static inline void init_gi_nodes(void) { }
+static inline int num_phys_nodes(void)
+{
+ return 1;
+}
#endif /* CONFIG_NUMA */
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -48,6 +48,8 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
+nodemask_t numa_phys_nodes_parsed __initdata;
+
int numa_cpu_node(int cpu)
{
u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
@@ -57,6 +59,11 @@ int numa_cpu_node(int cpu)
return NUMA_NO_NODE;
}
+int __init num_phys_nodes(void)
+{
+ return bitmap_weight(numa_phys_nodes_parsed.bits, MAX_NUMNODES);
+}
+
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);
@@ -210,6 +217,7 @@ static int __init dummy_numa_init(void)
0LLU, PFN_PHYS(max_pfn) - 1);
node_set(0, numa_nodes_parsed);
+ node_set(0, numa_phys_nodes_parsed);
numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
return 0;
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -57,6 +57,7 @@ acpi_numa_x2apic_affinity_init(struct ac
}
set_apicid_to_node(apic_id, node);
node_set(node, numa_nodes_parsed);
+ node_set(node, numa_phys_nodes_parsed);
pr_debug("SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node);
}
@@ -97,6 +98,7 @@ acpi_numa_processor_affinity_init(struct
set_apicid_to_node(apic_id, node);
node_set(node, numa_nodes_parsed);
+ node_set(node, numa_phys_nodes_parsed);
pr_debug("SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node);
}
^ permalink raw reply [flat|nested] 14+ messages in thread* [tip: x86/urgent] x86/numa: Store extra copy of numa_nodes_parsed
2026-03-03 10:55 ` [PATCH v2 1/5] x86/numa: Store extra copy of numa_nodes_parsed Peter Zijlstra
@ 2026-03-04 15:46 ` tip-bot2 for Peter Zijlstra
0 siblings, 0 replies; 14+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2026-03-04 15:46 UTC (permalink / raw)
To: linux-tip-commits
Cc: K Prateek Nayak, Peter Zijlstra (Intel), Ingo Molnar, Zhang Rui,
Chen Yu, Kyle Meyer, x86, linux-kernel
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 48084cc153a5b0fbf0aa98d47670d3be0b9f64d5
Gitweb: https://git.kernel.org/tip/48084cc153a5b0fbf0aa98d47670d3be0b9f64d5
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Tue, 03 Mar 2026 11:55:40 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 04 Mar 2026 16:35:08 +01:00
x86/numa: Store extra copy of numa_nodes_parsed
The topology setup code needs to know the total number of physical
nodes enumerated in SRAT; however NUMA_EMU can cause the existing
numa_nodes_parsed bitmap to be fictitious. Therefore, keep a copy of
the bitmap specifically to retain the physical node count.
Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Kyle Meyer <kyle.meyer@hpe.com>
Link: https://patch.msgid.link/20260303110059.889884023@infradead.org
---
arch/x86/include/asm/numa.h | 6 ++++++
arch/x86/mm/numa.c | 8 ++++++++
arch/x86/mm/srat.c | 2 ++
3 files changed, 16 insertions(+)
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 53ba39c..a9063f3 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -22,6 +22,7 @@ extern int numa_off;
*/
extern s16 __apicid_to_node[MAX_LOCAL_APIC];
extern nodemask_t numa_nodes_parsed __initdata;
+extern nodemask_t numa_phys_nodes_parsed __initdata;
static inline void set_apicid_to_node(int apicid, s16 node)
{
@@ -48,6 +49,7 @@ extern void __init init_cpu_to_node(void);
extern void numa_add_cpu(unsigned int cpu);
extern void numa_remove_cpu(unsigned int cpu);
extern void init_gi_nodes(void);
+extern int num_phys_nodes(void);
#else /* CONFIG_NUMA */
static inline void numa_set_node(int cpu, int node) { }
static inline void numa_clear_node(int cpu) { }
@@ -55,6 +57,10 @@ static inline void init_cpu_to_node(void) { }
static inline void numa_add_cpu(unsigned int cpu) { }
static inline void numa_remove_cpu(unsigned int cpu) { }
static inline void init_gi_nodes(void) { }
+static inline int num_phys_nodes(void)
+{
+ return 1;
+}
#endif /* CONFIG_NUMA */
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 7a97327..99d0a93 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -48,6 +48,8 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
+nodemask_t numa_phys_nodes_parsed __initdata;
+
int numa_cpu_node(int cpu)
{
u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
@@ -57,6 +59,11 @@ int numa_cpu_node(int cpu)
return NUMA_NO_NODE;
}
+int __init num_phys_nodes(void)
+{
+ return bitmap_weight(numa_phys_nodes_parsed.bits, MAX_NUMNODES);
+}
+
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);
@@ -210,6 +217,7 @@ static int __init dummy_numa_init(void)
0LLU, PFN_PHYS(max_pfn) - 1);
node_set(0, numa_nodes_parsed);
+ node_set(0, numa_phys_nodes_parsed);
numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
return 0;
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 6f8e0f2..44ca666 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -57,6 +57,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
}
set_apicid_to_node(apic_id, node);
node_set(node, numa_nodes_parsed);
+ node_set(node, numa_phys_nodes_parsed);
pr_debug("SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node);
}
@@ -97,6 +98,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
set_apicid_to_node(apic_id, node);
node_set(node, numa_nodes_parsed);
+ node_set(node, numa_phys_nodes_parsed);
pr_debug("SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node);
}
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v2 2/5] x86/topo: Add topology_num_nodes_per_package()
2026-03-03 10:55 [PATCH v2 0/5] x86/topo: SNC Divination Peter Zijlstra
2026-03-03 10:55 ` [PATCH v2 1/5] x86/numa: Store extra copy of numa_nodes_parsed Peter Zijlstra
@ 2026-03-03 10:55 ` Peter Zijlstra
2026-03-04 15:46 ` [tip: x86/urgent] " tip-bot2 for Peter Zijlstra
2026-03-03 10:55 ` [PATCH v2 3/5] x86/topo: Replace x86_has_numa_in_package Peter Zijlstra
` (3 subsequent siblings)
5 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2026-03-03 10:55 UTC (permalink / raw)
To: x86, tglx
Cc: linux-kernel, peterz, tim.c.chen, yu.c.chen, kyle.meyer,
vinicius.gomes, brgerst, hpa, kprateek.nayak, patryk.wlazlyn,
rafael.j.wysocki, russ.anderson, zhao1.liu, tony.luck, Zhang Rui
Use the MADT and SRAT table data to compute __num_nodes_per_package.
Specifically, SRAT has already been parsed in x86_numa_init(), which is called
before acpi_boot_init() which parses MADT. So both are available in
topology_init_possible_cpus().
This number is useful to divinate the various Intel CoD/SNC and AMD NPS modes,
since the platforms are failing to provide this otherwise.
Doing it this way is independent of the number of online CPUs and
other such shenanigans.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Tony Luck <tony.luck@intel.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Kyle Meyer <kyle.meyer@hpe.com>
---
arch/x86/include/asm/topology.h | 6 ++++++
arch/x86/kernel/cpu/common.c | 3 +++
arch/x86/kernel/cpu/topology.c | 13 +++++++++++--
3 files changed, 20 insertions(+), 2 deletions(-)
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -155,6 +155,7 @@ extern unsigned int __max_logical_packag
extern unsigned int __max_threads_per_core;
extern unsigned int __num_threads_per_package;
extern unsigned int __num_cores_per_package;
+extern unsigned int __num_nodes_per_package;
const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c);
enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c);
@@ -179,6 +180,11 @@ static inline unsigned int topology_num_
return __num_threads_per_package;
}
+static inline unsigned int topology_num_nodes_per_package(void)
+{
+ return __num_nodes_per_package;
+}
+
#ifdef CONFIG_X86_LOCAL_APIC
int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level);
#else
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -95,6 +95,9 @@ EXPORT_SYMBOL(__max_dies_per_package);
unsigned int __max_logical_packages __ro_after_init = 1;
EXPORT_SYMBOL(__max_logical_packages);
+unsigned int __num_nodes_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__num_nodes_per_package);
+
unsigned int __num_cores_per_package __ro_after_init = 1;
EXPORT_SYMBOL(__num_cores_per_package);
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -31,6 +31,7 @@
#include <asm/mpspec.h>
#include <asm/msr.h>
#include <asm/smp.h>
+#include <asm/numa.h>
#include "cpu.h"
@@ -492,11 +493,19 @@ void __init topology_init_possible_cpus(
set_nr_cpu_ids(allowed);
cnta = domain_weight(TOPO_PKG_DOMAIN);
- cntb = domain_weight(TOPO_DIE_DOMAIN);
__max_logical_packages = cnta;
+
+ pr_info("Max. logical packages: %3u\n", __max_logical_packages);
+
+ cntb = num_phys_nodes();
+ __num_nodes_per_package = DIV_ROUND_UP(cntb, cnta);
+
+ pr_info("Max. logical nodes: %3u\n", cntb);
+ pr_info("Num. nodes per package:%3u\n", __num_nodes_per_package);
+
+ cntb = domain_weight(TOPO_DIE_DOMAIN);
__max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta));
- pr_info("Max. logical packages: %3u\n", cnta);
pr_info("Max. logical dies: %3u\n", cntb);
pr_info("Max. dies per package: %3u\n", __max_dies_per_package);
^ permalink raw reply [flat|nested] 14+ messages in thread* [tip: x86/urgent] x86/topo: Add topology_num_nodes_per_package()
2026-03-03 10:55 ` [PATCH v2 2/5] x86/topo: Add topology_num_nodes_per_package() Peter Zijlstra
@ 2026-03-04 15:46 ` tip-bot2 for Peter Zijlstra
0 siblings, 0 replies; 14+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2026-03-04 15:46 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Ingo Molnar, Tony Luck, K Prateek Nayak,
Zhang Rui, Chen Yu, Kyle Meyer, x86, linux-kernel
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: ae6730ff42b3a13d94b405edeb5e40108b6d21b6
Gitweb: https://git.kernel.org/tip/ae6730ff42b3a13d94b405edeb5e40108b6d21b6
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Tue, 03 Mar 2026 11:55:41 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 04 Mar 2026 16:35:08 +01:00
x86/topo: Add topology_num_nodes_per_package()
Use the MADT and SRAT table data to compute __num_nodes_per_package.
Specifically, SRAT has already been parsed in x86_numa_init(), which is called
before acpi_boot_init() which parses MADT. So both are available in
topology_init_possible_cpus().
This number is useful to divinate the various Intel CoD/SNC and AMD NPS modes,
since the platforms are failing to provide this otherwise.
Doing it this way is independent of the number of online CPUs and
other such shenanigans.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Tony Luck <tony.luck@intel.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Kyle Meyer <kyle.meyer@hpe.com>
Link: https://patch.msgid.link/20260303110100.004091624@infradead.org
---
arch/x86/include/asm/topology.h | 6 ++++++
arch/x86/kernel/cpu/common.c | 3 +++
arch/x86/kernel/cpu/topology.c | 13 +++++++++++--
3 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 1fadf0c..0ba9bdb 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -155,6 +155,7 @@ extern unsigned int __max_logical_packages;
extern unsigned int __max_threads_per_core;
extern unsigned int __num_threads_per_package;
extern unsigned int __num_cores_per_package;
+extern unsigned int __num_nodes_per_package;
const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c);
enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c);
@@ -179,6 +180,11 @@ static inline unsigned int topology_num_threads_per_package(void)
return __num_threads_per_package;
}
+static inline unsigned int topology_num_nodes_per_package(void)
+{
+ return __num_nodes_per_package;
+}
+
#ifdef CONFIG_X86_LOCAL_APIC
int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level);
#else
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1c3261c..a8ff437 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -95,6 +95,9 @@ EXPORT_SYMBOL(__max_dies_per_package);
unsigned int __max_logical_packages __ro_after_init = 1;
EXPORT_SYMBOL(__max_logical_packages);
+unsigned int __num_nodes_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__num_nodes_per_package);
+
unsigned int __num_cores_per_package __ro_after_init = 1;
EXPORT_SYMBOL(__num_cores_per_package);
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 23190a7..eafcb1f 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -31,6 +31,7 @@
#include <asm/mpspec.h>
#include <asm/msr.h>
#include <asm/smp.h>
+#include <asm/numa.h>
#include "cpu.h"
@@ -492,11 +493,19 @@ void __init topology_init_possible_cpus(void)
set_nr_cpu_ids(allowed);
cnta = domain_weight(TOPO_PKG_DOMAIN);
- cntb = domain_weight(TOPO_DIE_DOMAIN);
__max_logical_packages = cnta;
+
+ pr_info("Max. logical packages: %3u\n", __max_logical_packages);
+
+ cntb = num_phys_nodes();
+ __num_nodes_per_package = DIV_ROUND_UP(cntb, cnta);
+
+ pr_info("Max. logical nodes: %3u\n", cntb);
+ pr_info("Num. nodes per package:%3u\n", __num_nodes_per_package);
+
+ cntb = domain_weight(TOPO_DIE_DOMAIN);
__max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta));
- pr_info("Max. logical packages: %3u\n", cnta);
pr_info("Max. logical dies: %3u\n", cntb);
pr_info("Max. dies per package: %3u\n", __max_dies_per_package);
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v2 3/5] x86/topo: Replace x86_has_numa_in_package
2026-03-03 10:55 [PATCH v2 0/5] x86/topo: SNC Divination Peter Zijlstra
2026-03-03 10:55 ` [PATCH v2 1/5] x86/numa: Store extra copy of numa_nodes_parsed Peter Zijlstra
2026-03-03 10:55 ` [PATCH v2 2/5] x86/topo: Add topology_num_nodes_per_package() Peter Zijlstra
@ 2026-03-03 10:55 ` Peter Zijlstra
2026-03-04 15:46 ` [tip: x86/urgent] " tip-bot2 for Peter Zijlstra
2026-03-03 10:55 ` [PATCH v2 4/5] x86/topo: Fix SNC topology mess Peter Zijlstra
` (2 subsequent siblings)
5 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2026-03-03 10:55 UTC (permalink / raw)
To: x86, tglx
Cc: linux-kernel, peterz, tim.c.chen, yu.c.chen, kyle.meyer,
vinicius.gomes, brgerst, hpa, kprateek.nayak, patryk.wlazlyn,
rafael.j.wysocki, russ.anderson, zhao1.liu, tony.luck, Zhang Rui
.. with the brand spanking new topology_num_nodes_per_package().
Having the topology setup determine this value during MADT/SRAT parsing before
SMP bringup avoids having to detect this situation when building the SMP
topology masks.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Tony Luck <tony.luck@intel.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Kyle Meyer <kyle.meyer@hpe.com>
---
arch/x86/kernel/smpboot.c | 13 +++----------
1 file changed, 3 insertions(+), 10 deletions(-)
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -468,13 +468,6 @@ static int x86_cluster_flags(void)
}
#endif
-/*
- * Set if a package/die has multiple NUMA nodes inside.
- * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
- * Sub-NUMA Clustering have this.
- */
-static bool x86_has_numa_in_package;
-
static struct sched_domain_topology_level x86_topology[] = {
SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
#ifdef CONFIG_SCHED_CLUSTER
@@ -496,7 +489,7 @@ static void __init build_sched_topology(
* PKG domain since the NUMA domains will auto-magically create the
* right spanning domains based on the SLIT.
*/
- if (x86_has_numa_in_package) {
+ if (topology_num_nodes_per_package() > 1) {
unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
@@ -550,7 +543,7 @@ int arch_sched_node_distance(int from, i
case INTEL_GRANITERAPIDS_X:
case INTEL_ATOM_DARKMONT_X:
- if (!x86_has_numa_in_package || topology_max_packages() == 1 ||
+ if (topology_max_packages() == 1 || topology_num_nodes_per_package() == 1 ||
d < REMOTE_DISTANCE)
return d;
@@ -606,7 +599,7 @@ void set_cpu_sibling_map(int cpu)
o = &cpu_data(i);
if (match_pkg(c, o) && !topology_same_node(c, o))
- x86_has_numa_in_package = true;
+ WARN_ON_ONCE(topology_num_nodes_per_package() == 1);
if ((i == cpu) || (has_smt && match_smt(c, o)))
link_mask(topology_sibling_cpumask, cpu, i);
^ permalink raw reply [flat|nested] 14+ messages in thread* [tip: x86/urgent] x86/topo: Replace x86_has_numa_in_package
2026-03-03 10:55 ` [PATCH v2 3/5] x86/topo: Replace x86_has_numa_in_package Peter Zijlstra
@ 2026-03-04 15:46 ` tip-bot2 for Peter Zijlstra
0 siblings, 0 replies; 14+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2026-03-04 15:46 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Ingo Molnar, Tony Luck, K Prateek Nayak,
Zhang Rui, Chen Yu, Kyle Meyer, x86, linux-kernel
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 717b64d58cff6fb97f97be07e382ed7641167a56
Gitweb: https://git.kernel.org/tip/717b64d58cff6fb97f97be07e382ed7641167a56
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Tue, 03 Mar 2026 11:55:42 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 04 Mar 2026 16:35:08 +01:00
x86/topo: Replace x86_has_numa_in_package
.. with the brand spanking new topology_num_nodes_per_package().
Having the topology setup determine this value during MADT/SRAT parsing before
SMP bringup avoids having to detect this situation when building the SMP
topology masks.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Tony Luck <tony.luck@intel.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Kyle Meyer <kyle.meyer@hpe.com>
Link: https://patch.msgid.link/20260303110100.123701837@infradead.org
---
arch/x86/kernel/smpboot.c | 13 +++----------
1 file changed, 3 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 5cd6950..db3e481 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -468,13 +468,6 @@ static int x86_cluster_flags(void)
}
#endif
-/*
- * Set if a package/die has multiple NUMA nodes inside.
- * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
- * Sub-NUMA Clustering have this.
- */
-static bool x86_has_numa_in_package;
-
static struct sched_domain_topology_level x86_topology[] = {
SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
#ifdef CONFIG_SCHED_CLUSTER
@@ -496,7 +489,7 @@ static void __init build_sched_topology(void)
* PKG domain since the NUMA domains will auto-magically create the
* right spanning domains based on the SLIT.
*/
- if (x86_has_numa_in_package) {
+ if (topology_num_nodes_per_package() > 1) {
unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
@@ -550,7 +543,7 @@ int arch_sched_node_distance(int from, int to)
case INTEL_GRANITERAPIDS_X:
case INTEL_ATOM_DARKMONT_X:
- if (!x86_has_numa_in_package || topology_max_packages() == 1 ||
+ if (topology_max_packages() == 1 || topology_num_nodes_per_package() == 1 ||
d < REMOTE_DISTANCE)
return d;
@@ -606,7 +599,7 @@ void set_cpu_sibling_map(int cpu)
o = &cpu_data(i);
if (match_pkg(c, o) && !topology_same_node(c, o))
- x86_has_numa_in_package = true;
+ WARN_ON_ONCE(topology_num_nodes_per_package() == 1);
if ((i == cpu) || (has_smt && match_smt(c, o)))
link_mask(topology_sibling_cpumask, cpu, i);
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v2 4/5] x86/topo: Fix SNC topology mess
2026-03-03 10:55 [PATCH v2 0/5] x86/topo: SNC Divination Peter Zijlstra
` (2 preceding siblings ...)
2026-03-03 10:55 ` [PATCH v2 3/5] x86/topo: Replace x86_has_numa_in_package Peter Zijlstra
@ 2026-03-03 10:55 ` Peter Zijlstra
2026-03-03 11:59 ` Ingo Molnar
2026-03-04 15:46 ` [tip: x86/urgent] " tip-bot2 for Peter Zijlstra
2026-03-03 10:55 ` [PATCH v2 5/5] x86/resctrl: Fix SNC detection Peter Zijlstra
2026-03-03 12:01 ` [PATCH v2 0/5] x86/topo: SNC Divination Ingo Molnar
5 siblings, 2 replies; 14+ messages in thread
From: Peter Zijlstra @ 2026-03-03 10:55 UTC (permalink / raw)
To: x86, tglx
Cc: linux-kernel, peterz, tim.c.chen, yu.c.chen, kyle.meyer,
vinicius.gomes, brgerst, hpa, kprateek.nayak, patryk.wlazlyn,
rafael.j.wysocki, russ.anderson, zhao1.liu, tony.luck, Zhang Rui
Per 4d6dd05d07d0 ("sched/topology: Fix sched domain build error for GNR, CWF in
SNC-3 mode"), the original crazy SNC-3 SLIT table was:
node distances:
node 0 1 2 3 4 5
0: 10 15 17 21 28 26
1: 15 10 15 23 26 23
2: 17 15 10 26 23 21
3: 21 28 26 10 15 17
4: 23 26 23 15 10 15
5: 26 23 21 17 15 10
And per:
https://lore.kernel.org/lkml/20250825075642.GQ3245006@noisy.programming.kicks-ass.net/
The suggestion was to average the off-trace clusters to restore sanity.
However, 4d6dd05d07d0 implements this under various assumptions:
- anything GNR/CWF with numa_in_package;
- there will never be more than 2 packages;
- the off-trace cluster will have distance >20
And then HPE shows up with a machine that matches the
Vendor-Family-Model checks but looks like this:
Here's an 8 socket (2 chassis) HPE system with SNC enabled:
node 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0: 10 12 16 16 16 16 18 18 40 40 40 40 40 40 40 40
1: 12 10 16 16 16 16 18 18 40 40 40 40 40 40 40 40
2: 16 16 10 12 18 18 16 16 40 40 40 40 40 40 40 40
3: 16 16 12 10 18 18 16 16 40 40 40 40 40 40 40 40
4: 16 16 18 18 10 12 16 16 40 40 40 40 40 40 40 40
5: 16 16 18 18 12 10 16 16 40 40 40 40 40 40 40 40
6: 18 18 16 16 16 16 10 12 40 40 40 40 40 40 40 40
7: 18 18 16 16 16 16 12 10 40 40 40 40 40 40 40 40
8: 40 40 40 40 40 40 40 40 10 12 16 16 16 16 18 18
9: 40 40 40 40 40 40 40 40 12 10 16 16 16 16 18 18
10: 40 40 40 40 40 40 40 40 16 16 10 12 18 18 16 16
11: 40 40 40 40 40 40 40 40 16 16 12 10 18 18 16 16
12: 40 40 40 40 40 40 40 40 16 16 18 18 10 12 16 16
13: 40 40 40 40 40 40 40 40 16 16 18 18 12 10 16 16
14: 40 40 40 40 40 40 40 40 18 18 16 16 16 16 10 12
15: 40 40 40 40 40 40 40 40 18 18 16 16 16 16 12 10
10 = Same chassis and socket
12 = Same chassis and socket (SNC)
16 = Same chassis and adjacent socket
18 = Same chassis and non-adjacent socket
40 = Different chassis
Turns out, the 'max 2 packages' thing is only relevant to the SNC-3 parts, the
smaller parts do 8 sockets (like usual). The above SLIT table is sane, but
violates the previous assumptions and trips a WARN.
Now that the topology code has a sensible measure of nodes-per-package, we can
use that to divinate the SNC mode at hand, and only fix up SNC-3 topologies.
There is a 'healthy' amount of paranoia code validating the assumptions on the
SLIT table, a simple pr_err(FW_BUG) print on failure and a fallback to using
the regular table. Let's see how long this lasts :-)
Fixes: 4d6dd05d07d0 ("sched/topology: Fix sched domain build error for GNR, CWF in SNC-3 mode")
Reported-by: Kyle Meyer <kyle.meyer@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Kyle Meyer <kyle.meyer@hpe.com>
---
arch/x86/kernel/smpboot.c | 185 ++++++++++++++++++++++++++++++++++------------
1 file changed, 140 insertions(+), 45 deletions(-)
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -506,33 +506,148 @@ static void __init build_sched_topology(
}
#ifdef CONFIG_NUMA
-static int sched_avg_remote_distance;
-static int avg_remote_numa_distance(void)
+/*
+ * Test if the on-trace cluster at (N,N) is symmetric.
+ * Uses upper triangle iteration to avoid obvious duplicates.
+ */
+static bool slit_cluster_symmetric(int N)
{
- int i, j;
- int distance, nr_remote, total_distance;
+ int u = topology_num_nodes_per_package();
- if (sched_avg_remote_distance > 0)
- return sched_avg_remote_distance;
+ for (int k = 0; k < u; k++) {
+ for (int l = k; l < u; l++) {
+ if (node_distance(N + k, N + l) !=
+ node_distance(N + l, N + k))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Return the package-id of the cluster, or ~0 if indeterminate.
+ * Each node in the on-trace cluster should have the same package-id.
+ */
+static u32 slit_cluster_package(int N)
+{
+ int u = topology_num_nodes_per_package();
+ u32 pkg_id = ~0;
+
+ for (int n = 0; n < u; n++) {
+ const struct cpumask *cpus = cpumask_of_node(N + n);
+ int cpu;
+
+ for_each_cpu(cpu, cpus) {
+ u32 id = topology_logical_package_id(cpu);
+ if (pkg_id == ~0)
+ pkg_id = id;
+ if (pkg_id != id)
+ return ~0;
+ }
+ }
+
+ return pkg_id;
+}
+
+/*
+ * Validate the SLIT table is of the form expected for SNC-3, specifically:
+ *
+ * - each on-trace cluster should be symmetric,
+ * - each on-trace cluster should have a unique package-id.
+ *
+ * If you NUMA_EMU on top of SNC, you get to keep the pieces.
+ */
+static bool slit_validate(void)
+{
+ int u = topology_num_nodes_per_package();
+ u32 pkg_id, prev_pkg_id = ~0;
- nr_remote = 0;
- total_distance = 0;
- for_each_node_state(i, N_CPU) {
- for_each_node_state(j, N_CPU) {
- distance = node_distance(i, j);
-
- if (distance >= REMOTE_DISTANCE) {
- nr_remote++;
- total_distance += distance;
- }
+ for (int pkg = 0; pkg < topology_max_packages(); pkg++) {
+ int n = pkg * u;
+
+ /*
+ * Ensure the on-trace cluster is symmetric and each cluster
+ * has a different package id.
+ */
+ if (!slit_cluster_symmetric(n))
+ return false;
+ pkg_id = slit_cluster_package(n);
+ if (pkg_id == ~0)
+ return false;
+ if (pkg && pkg_id == prev_pkg_id)
+ return false;
+
+ prev_pkg_id = pkg_id;
+ }
+
+ return true;
+}
+
+/*
+ * Compute a sanitized SLIT table for SNC; notably SNC-3 can end up with
+ * asymmetric off-trace clusters, reflecting physical asymmetries. However
+ * this leads to 'unfortunate' sched_domain configurations.
+ *
+ * For example dual socket GNR with SNC-3:
+ *
+ * node distances:
+ * node 0 1 2 3 4 5
+ * 0: 10 15 17 21 28 26
+ * 1: 15 10 15 23 26 23
+ * 2: 17 15 10 26 23 21
+ * 3: 21 28 26 10 15 17
+ * 4: 23 26 23 15 10 15
+ * 5: 26 23 21 17 15 10
+ *
+ * Fix things up by averaging out the off-trace clusters; resulting in:
+ *
+ * node 0 1 2 3 4 5
+ * 0: 10 15 17 24 24 24
+ * 1: 15 10 15 24 24 24
+ * 2: 17 15 10 24 24 24
+ * 3: 24 24 24 10 15 17
+ * 4: 24 24 24 15 10 15
+ * 5: 24 24 24 17 15 10
+ */
+static int slit_cluster_distance(int i, int j)
+{
+ static int slit_valid = -1;
+ int u = topology_num_nodes_per_package();
+ long d = 0;
+ int x, y;
+
+ if (slit_valid < 0) {
+ slit_valid = slit_validate();
+ if (!slit_valid)
+ pr_err(FW_BUG "SLIT table doesn't have the expected form for SNC -- fixup disabled!\n");
+ else
+ pr_info("Fixing up SNC SLIT table.\n");
+ }
+
+ /*
+ * Is this a unit cluster on the trace?
+ */
+ if ((i / u) == (j / u) || !slit_valid)
+ return node_distance(i, j);
+
+ /*
+ * Off-trace cluster.
+ *
+ * Notably average out the symmetric pair of off-trace clusters to
+ * ensure the resulting SLIT table is symmetric.
+ */
+ x = i - (i % u);
+ y = j - (j % u);
+
+ for (i = x; i < x + u; i++) {
+ for (j = y; j < y + u; j++) {
+ d += node_distance(i, j);
+ d += node_distance(j, i);
}
}
- if (nr_remote)
- sched_avg_remote_distance = total_distance / nr_remote;
- else
- sched_avg_remote_distance = REMOTE_DISTANCE;
- return sched_avg_remote_distance;
+ return d / (2*u*u);
}
int arch_sched_node_distance(int from, int to)
@@ -542,34 +657,14 @@ int arch_sched_node_distance(int from, i
switch (boot_cpu_data.x86_vfm) {
case INTEL_GRANITERAPIDS_X:
case INTEL_ATOM_DARKMONT_X:
-
- if (topology_max_packages() == 1 || topology_num_nodes_per_package() == 1 ||
- d < REMOTE_DISTANCE)
+ if (topology_max_packages() == 1 ||
+ topology_num_nodes_per_package() < 3)
return d;
/*
- * With SNC enabled, there could be too many levels of remote
- * NUMA node distances, creating NUMA domain levels
- * including local nodes and partial remote nodes.
- *
- * Trim finer distance tuning for NUMA nodes in remote package
- * for the purpose of building sched domains. Group NUMA nodes
- * in the remote package in the same sched group.
- * Simplify NUMA domains and avoid extra NUMA levels including
- * different remote NUMA nodes and local nodes.
- *
- * GNR and CWF don't expect systems with more than 2 packages
- * and more than 2 hops between packages. Single average remote
- * distance won't be appropriate if there are more than 2
- * packages as average distance to different remote packages
- * could be different.
+ * Handle SNC-3 asymmetries.
*/
- WARN_ONCE(topology_max_packages() > 2,
- "sched: Expect only up to 2 packages for GNR or CWF, "
- "but saw %d packages when building sched domains.",
- topology_max_packages());
-
- d = avg_remote_numa_distance();
+ return slit_cluster_distance(from, to);
}
return d;
}
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH v2 4/5] x86/topo: Fix SNC topology mess
2026-03-03 10:55 ` [PATCH v2 4/5] x86/topo: Fix SNC topology mess Peter Zijlstra
@ 2026-03-03 11:59 ` Ingo Molnar
2026-03-03 14:45 ` Peter Zijlstra
2026-03-04 15:46 ` [tip: x86/urgent] " tip-bot2 for Peter Zijlstra
1 sibling, 1 reply; 14+ messages in thread
From: Ingo Molnar @ 2026-03-03 11:59 UTC (permalink / raw)
To: Peter Zijlstra
Cc: x86, tglx, linux-kernel, tim.c.chen, yu.c.chen, kyle.meyer,
vinicius.gomes, brgerst, hpa, kprateek.nayak, patryk.wlazlyn,
rafael.j.wysocki, russ.anderson, zhao1.liu, tony.luck, Zhang Rui
* Peter Zijlstra <peterz@infradead.org> wrote:
> +static u32 slit_cluster_package(int N)
> +{
> + int u = topology_num_nodes_per_package();
> + u32 pkg_id = ~0;
> +
> + for (int n = 0; n < u; n++) {
> + const struct cpumask *cpus = cpumask_of_node(N + n);
> + int cpu;
> +
> + for_each_cpu(cpu, cpus) {
> + u32 id = topology_logical_package_id(cpu);
> + if (pkg_id == ~0)
> + pkg_id = id;
Nit: newline after the 'id' local variable definition.
> + /*
> + * Off-trace cluster.
> + *
> + * Notably average out the symmetric pair of off-trace clusters to
> + * ensure the resulting SLIT table is symmetric.
> + */
> + x = i - (i % u);
> + y = j - (j % u);
AFAICS that's an open-coded rounddown() from <linux/math.h>:
x = rounddown(i, u);
y = rounddown(j, u);
right?
With these addressed:
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Thanks,
Ingo
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH v2 4/5] x86/topo: Fix SNC topology mess
2026-03-03 11:59 ` Ingo Molnar
@ 2026-03-03 14:45 ` Peter Zijlstra
0 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2026-03-03 14:45 UTC (permalink / raw)
To: Ingo Molnar
Cc: x86, tglx, linux-kernel, tim.c.chen, yu.c.chen, kyle.meyer,
vinicius.gomes, brgerst, hpa, kprateek.nayak, patryk.wlazlyn,
rafael.j.wysocki, russ.anderson, zhao1.liu, tony.luck, Zhang Rui
On Tue, Mar 03, 2026 at 12:59:46PM +0100, Ingo Molnar wrote:
>
> * Peter Zijlstra <peterz@infradead.org> wrote:
>
> > +static u32 slit_cluster_package(int N)
> > +{
> > + int u = topology_num_nodes_per_package();
> > + u32 pkg_id = ~0;
> > +
> > + for (int n = 0; n < u; n++) {
> > + const struct cpumask *cpus = cpumask_of_node(N + n);
> > + int cpu;
> > +
> > + for_each_cpu(cpu, cpus) {
> > + u32 id = topology_logical_package_id(cpu);
> > + if (pkg_id == ~0)
> > + pkg_id = id;
>
> Nit: newline after the 'id' local variable definition.
>
> > + /*
> > + * Off-trace cluster.
> > + *
> > + * Notably average out the symmetric pair of off-trace clusters to
> > + * ensure the resulting SLIT table is symmetric.
> > + */
> > + x = i - (i % u);
> > + y = j - (j % u);
>
> AFAICS that's an open-coded rounddown() from <linux/math.h>:
>
> x = rounddown(i, u);
> y = rounddown(j, u);
>
> right?
Yeah, but I can never remember those 'helpers'. That is, if I have to
spend time looking for them, they're a net negative to me.
Same reason I'm most likely to write: (x + d - 1) / d, rather than,
uhmmm, /me goes find.. DIV_ROUND_UP(). And while I did actually use
that in this series, that's only because it was already in use at the
exact spot I made a change, so I didn't have to go looking.
So sure, I can do the rounddown() thing, but really, it's longer than the
thing they replace and less clear, so where's the win?
^ permalink raw reply [flat|nested] 14+ messages in thread
* [tip: x86/urgent] x86/topo: Fix SNC topology mess
2026-03-03 10:55 ` [PATCH v2 4/5] x86/topo: Fix SNC topology mess Peter Zijlstra
2026-03-03 11:59 ` Ingo Molnar
@ 2026-03-04 15:46 ` tip-bot2 for Peter Zijlstra
1 sibling, 0 replies; 14+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2026-03-04 15:46 UTC (permalink / raw)
To: linux-tip-commits
Cc: Kyle Meyer, Peter Zijlstra (Intel), Ingo Molnar, K Prateek Nayak,
Zhang Rui, Chen Yu, x86, linux-kernel
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 528d89a4707e5bfd86e30823c45dbb66877df900
Gitweb: https://git.kernel.org/tip/528d89a4707e5bfd86e30823c45dbb66877df900
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Tue, 03 Mar 2026 11:55:43 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 04 Mar 2026 16:35:09 +01:00
x86/topo: Fix SNC topology mess
Per 4d6dd05d07d0 ("sched/topology: Fix sched domain build error for GNR, CWF in
SNC-3 mode"), the original crazy SNC-3 SLIT table was:
node distances:
node 0 1 2 3 4 5
0: 10 15 17 21 28 26
1: 15 10 15 23 26 23
2: 17 15 10 26 23 21
3: 21 28 26 10 15 17
4: 23 26 23 15 10 15
5: 26 23 21 17 15 10
And per:
https://lore.kernel.org/lkml/20250825075642.GQ3245006@noisy.programming.kicks-ass.net/
The suggestion was to average the off-trace clusters to restore sanity.
However, 4d6dd05d07d0 implements this under various assumptions:
- anything GNR/CWF with numa_in_package;
- there will never be more than 2 packages;
- the off-trace cluster will have distance >20
And then HPE shows up with a machine that matches the
Vendor-Family-Model checks but looks like this:
Here's an 8 socket (2 chassis) HPE system with SNC enabled:
node 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0: 10 12 16 16 16 16 18 18 40 40 40 40 40 40 40 40
1: 12 10 16 16 16 16 18 18 40 40 40 40 40 40 40 40
2: 16 16 10 12 18 18 16 16 40 40 40 40 40 40 40 40
3: 16 16 12 10 18 18 16 16 40 40 40 40 40 40 40 40
4: 16 16 18 18 10 12 16 16 40 40 40 40 40 40 40 40
5: 16 16 18 18 12 10 16 16 40 40 40 40 40 40 40 40
6: 18 18 16 16 16 16 10 12 40 40 40 40 40 40 40 40
7: 18 18 16 16 16 16 12 10 40 40 40 40 40 40 40 40
8: 40 40 40 40 40 40 40 40 10 12 16 16 16 16 18 18
9: 40 40 40 40 40 40 40 40 12 10 16 16 16 16 18 18
10: 40 40 40 40 40 40 40 40 16 16 10 12 18 18 16 16
11: 40 40 40 40 40 40 40 40 16 16 12 10 18 18 16 16
12: 40 40 40 40 40 40 40 40 16 16 18 18 10 12 16 16
13: 40 40 40 40 40 40 40 40 16 16 18 18 12 10 16 16
14: 40 40 40 40 40 40 40 40 18 18 16 16 16 16 10 12
15: 40 40 40 40 40 40 40 40 18 18 16 16 16 16 12 10
10 = Same chassis and socket
12 = Same chassis and socket (SNC)
16 = Same chassis and adjacent socket
18 = Same chassis and non-adjacent socket
40 = Different chassis
Turns out, the 'max 2 packages' thing is only relevant to the SNC-3 parts, the
smaller parts do 8 sockets (like usual). The above SLIT table is sane, but
violates the previous assumptions and trips a WARN.
Now that the topology code has a sensible measure of nodes-per-package, we can
use that to divinate the SNC mode at hand, and only fix up SNC-3 topologies.
There is a 'healthy' amount of paranoia code validating the assumptions on the
SLIT table, a simple pr_err(FW_BUG) print on failure and a fallback to using
the regular table. Let's see how long this lasts :-)
Fixes: 4d6dd05d07d0 ("sched/topology: Fix sched domain build error for GNR, CWF in SNC-3 mode")
Reported-by: Kyle Meyer <kyle.meyer@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Kyle Meyer <kyle.meyer@hpe.com>
Link: https://patch.msgid.link/20260303110100.238361290@infradead.org
---
arch/x86/kernel/smpboot.c | 190 +++++++++++++++++++++++++++----------
1 file changed, 143 insertions(+), 47 deletions(-)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index db3e481..294a8ea 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -506,33 +506,149 @@ static void __init build_sched_topology(void)
}
#ifdef CONFIG_NUMA
-static int sched_avg_remote_distance;
-static int avg_remote_numa_distance(void)
+/*
+ * Test if the on-trace cluster at (N,N) is symmetric.
+ * Uses upper triangle iteration to avoid obvious duplicates.
+ */
+static bool slit_cluster_symmetric(int N)
{
- int i, j;
- int distance, nr_remote, total_distance;
-
- if (sched_avg_remote_distance > 0)
- return sched_avg_remote_distance;
-
- nr_remote = 0;
- total_distance = 0;
- for_each_node_state(i, N_CPU) {
- for_each_node_state(j, N_CPU) {
- distance = node_distance(i, j);
-
- if (distance >= REMOTE_DISTANCE) {
- nr_remote++;
- total_distance += distance;
- }
+ int u = topology_num_nodes_per_package();
+
+ for (int k = 0; k < u; k++) {
+ for (int l = k; l < u; l++) {
+ if (node_distance(N + k, N + l) !=
+ node_distance(N + l, N + k))
+ return false;
}
}
- if (nr_remote)
- sched_avg_remote_distance = total_distance / nr_remote;
- else
- sched_avg_remote_distance = REMOTE_DISTANCE;
- return sched_avg_remote_distance;
+ return true;
+}
+
+/*
+ * Return the package-id of the cluster, or ~0 if indeterminate.
+ * Each node in the on-trace cluster should have the same package-id.
+ */
+static u32 slit_cluster_package(int N)
+{
+ int u = topology_num_nodes_per_package();
+ u32 pkg_id = ~0;
+
+ for (int n = 0; n < u; n++) {
+ const struct cpumask *cpus = cpumask_of_node(N + n);
+ int cpu;
+
+ for_each_cpu(cpu, cpus) {
+ u32 id = topology_logical_package_id(cpu);
+
+ if (pkg_id == ~0)
+ pkg_id = id;
+ if (pkg_id != id)
+ return ~0;
+ }
+ }
+
+ return pkg_id;
+}
+
+/*
+ * Validate the SLIT table is of the form expected for SNC, specifically:
+ *
+ * - each on-trace cluster should be symmetric,
+ * - each on-trace cluster should have a unique package-id.
+ *
+ * If you NUMA_EMU on top of SNC, you get to keep the pieces.
+ */
+static bool slit_validate(void)
+{
+ int u = topology_num_nodes_per_package();
+ u32 pkg_id, prev_pkg_id = ~0;
+
+ for (int pkg = 0; pkg < topology_max_packages(); pkg++) {
+ int n = pkg * u;
+
+ /*
+ * Ensure the on-trace cluster is symmetric and each cluster
+ * has a different package id.
+ */
+ if (!slit_cluster_symmetric(n))
+ return false;
+ pkg_id = slit_cluster_package(n);
+ if (pkg_id == ~0)
+ return false;
+ if (pkg && pkg_id == prev_pkg_id)
+ return false;
+
+ prev_pkg_id = pkg_id;
+ }
+
+ return true;
+}
+
+/*
+ * Compute a sanitized SLIT table for SNC; notably SNC-3 can end up with
+ * asymmetric off-trace clusters, reflecting physical asymmetries. However
+ * this leads to 'unfortunate' sched_domain configurations.
+ *
+ * For example dual socket GNR with SNC-3:
+ *
+ * node distances:
+ * node 0 1 2 3 4 5
+ * 0: 10 15 17 21 28 26
+ * 1: 15 10 15 23 26 23
+ * 2: 17 15 10 26 23 21
+ * 3: 21 28 26 10 15 17
+ * 4: 23 26 23 15 10 15
+ * 5: 26 23 21 17 15 10
+ *
+ * Fix things up by averaging out the off-trace clusters; resulting in:
+ *
+ * node 0 1 2 3 4 5
+ * 0: 10 15 17 24 24 24
+ * 1: 15 10 15 24 24 24
+ * 2: 17 15 10 24 24 24
+ * 3: 24 24 24 10 15 17
+ * 4: 24 24 24 15 10 15
+ * 5: 24 24 24 17 15 10
+ */
+static int slit_cluster_distance(int i, int j)
+{
+ static int slit_valid = -1;
+ int u = topology_num_nodes_per_package();
+ long d = 0;
+ int x, y;
+
+ if (slit_valid < 0) {
+ slit_valid = slit_validate();
+ if (!slit_valid)
+ pr_err(FW_BUG "SLIT table doesn't have the expected form for SNC -- fixup disabled!\n");
+ else
+ pr_info("Fixing up SNC SLIT table.\n");
+ }
+
+ /*
+ * Is this a unit cluster on the trace?
+ */
+ if ((i / u) == (j / u) || !slit_valid)
+ return node_distance(i, j);
+
+ /*
+ * Off-trace cluster.
+ *
+ * Notably average out the symmetric pair of off-trace clusters to
+ * ensure the resulting SLIT table is symmetric.
+ */
+ x = i - (i % u);
+ y = j - (j % u);
+
+ for (i = x; i < x + u; i++) {
+ for (j = y; j < y + u; j++) {
+ d += node_distance(i, j);
+ d += node_distance(j, i);
+ }
+ }
+
+ return d / (2*u*u);
}
int arch_sched_node_distance(int from, int to)
@@ -542,34 +658,14 @@ int arch_sched_node_distance(int from, int to)
switch (boot_cpu_data.x86_vfm) {
case INTEL_GRANITERAPIDS_X:
case INTEL_ATOM_DARKMONT_X:
-
- if (topology_max_packages() == 1 || topology_num_nodes_per_package() == 1 ||
- d < REMOTE_DISTANCE)
+ if (topology_max_packages() == 1 ||
+ topology_num_nodes_per_package() < 3)
return d;
/*
- * With SNC enabled, there could be too many levels of remote
- * NUMA node distances, creating NUMA domain levels
- * including local nodes and partial remote nodes.
- *
- * Trim finer distance tuning for NUMA nodes in remote package
- * for the purpose of building sched domains. Group NUMA nodes
- * in the remote package in the same sched group.
- * Simplify NUMA domains and avoid extra NUMA levels including
- * different remote NUMA nodes and local nodes.
- *
- * GNR and CWF don't expect systems with more than 2 packages
- * and more than 2 hops between packages. Single average remote
- * distance won't be appropriate if there are more than 2
- * packages as average distance to different remote packages
- * could be different.
+ * Handle SNC-3 asymmetries.
*/
- WARN_ONCE(topology_max_packages() > 2,
- "sched: Expect only up to 2 packages for GNR or CWF, "
- "but saw %d packages when building sched domains.",
- topology_max_packages());
-
- d = avg_remote_numa_distance();
+ return slit_cluster_distance(from, to);
}
return d;
}
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v2 5/5] x86/resctrl: Fix SNC detection
2026-03-03 10:55 [PATCH v2 0/5] x86/topo: SNC Divination Peter Zijlstra
` (3 preceding siblings ...)
2026-03-03 10:55 ` [PATCH v2 4/5] x86/topo: Fix SNC topology mess Peter Zijlstra
@ 2026-03-03 10:55 ` Peter Zijlstra
2026-03-04 15:46 ` [tip: x86/urgent] " tip-bot2 for Tony Luck
2026-03-03 12:01 ` [PATCH v2 0/5] x86/topo: SNC Divination Ingo Molnar
5 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2026-03-03 10:55 UTC (permalink / raw)
To: x86, tglx
Cc: linux-kernel, peterz, tim.c.chen, yu.c.chen, kyle.meyer,
vinicius.gomes, brgerst, hpa, kprateek.nayak, patryk.wlazlyn,
rafael.j.wysocki, russ.anderson, zhao1.liu, tony.luck, Zhang Rui
From: Tony Luck <tony.luck@intel.com>
Now that the x86 topology code has a sensible nodes-per-package
measure, that does not depend on the online status of CPUs, use this
to divinate the SNC mode.
Note that when Cluster on Die (CoD) is configured on older systems this
will also show multiple NUMA nodes per package. Intel Resource Director
Technology is incomaptible with CoD. Print a warning and do not use the
fixup MSR_RMID_SNC_CONFIG.
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Link: https://patch.msgid.link/aaCxbbgjL6OZ6VMd@agluck-desk3
---
arch/x86/kernel/cpu/resctrl/monitor.c | 36 ++++------------------------------
1 file changed, 5 insertions(+), 31 deletions(-)
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -364,7 +364,7 @@ void arch_mon_domain_online(struct rdt_r
msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
}
-/* CPU models that support MSR_RMID_SNC_CONFIG */
+/* CPU models that support SNC and MSR_RMID_SNC_CONFIG */
static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
@@ -375,40 +375,14 @@ static const struct x86_cpu_id snc_cpu_i
{}
};
-/*
- * There isn't a simple hardware bit that indicates whether a CPU is running
- * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
- * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
- * the same NUMA node as CPU0.
- * It is not possible to accurately determine SNC state if the system is
- * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
- * to L3 caches. It will be OK if system is booted with hyperthreading
- * disabled (since this doesn't affect the ratio).
- */
static __init int snc_get_config(void)
{
- struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
- const cpumask_t *node0_cpumask;
- int cpus_per_node, cpus_per_l3;
- int ret;
+ int ret = topology_num_nodes_per_package();
- if (!x86_match_cpu(snc_cpu_ids) || !ci)
+ if (ret > 1 && !x86_match_cpu(snc_cpu_ids)) {
+ pr_warn("CoD enabled system? Resctrl not supported\n");
return 1;
-
- cpus_read_lock();
- if (num_online_cpus() != num_present_cpus())
- pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
- cpus_read_unlock();
-
- node0_cpumask = cpumask_of_node(cpu_to_node(0));
-
- cpus_per_node = cpumask_weight(node0_cpumask);
- cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);
-
- if (!cpus_per_node || !cpus_per_l3)
- return 1;
-
- ret = cpus_per_l3 / cpus_per_node;
+ }
/* sanity check: Only valid results are 1, 2, 3, 4, 6 */
switch (ret) {
^ permalink raw reply [flat|nested] 14+ messages in thread* [tip: x86/urgent] x86/resctrl: Fix SNC detection
2026-03-03 10:55 ` [PATCH v2 5/5] x86/resctrl: Fix SNC detection Peter Zijlstra
@ 2026-03-04 15:46 ` tip-bot2 for Tony Luck
0 siblings, 0 replies; 14+ messages in thread
From: tip-bot2 for Tony Luck @ 2026-03-04 15:46 UTC (permalink / raw)
To: linux-tip-commits
Cc: Tony Luck, Peter Zijlstra (Intel), Ingo Molnar, Zhang Rui,
Chen Yu, x86, linux-kernel
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 59674fc9d0bfd96ce8a776680ee1cf22c28c9ac7
Gitweb: https://git.kernel.org/tip/59674fc9d0bfd96ce8a776680ee1cf22c28c9ac7
Author: Tony Luck <tony.luck@intel.com>
AuthorDate: Tue, 03 Mar 2026 11:55:44 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 04 Mar 2026 16:35:09 +01:00
x86/resctrl: Fix SNC detection
Now that the x86 topology code has a sensible nodes-per-package
measure, that does not depend on the online status of CPUs, use this
to divinate the SNC mode.
Note that when Cluster on Die (CoD) is configured on older systems this
will also show multiple NUMA nodes per package. Intel Resource Director
Technology is incompatible with CoD. Print a warning and do not use the
fixup MSR_RMID_SNC_CONFIG.
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Link: https://patch.msgid.link/aaCxbbgjL6OZ6VMd@agluck-desk3
Link: https://patch.msgid.link/20260303110100.367976706@infradead.org
---
arch/x86/kernel/cpu/resctrl/monitor.c | 36 +++-----------------------
1 file changed, 5 insertions(+), 31 deletions(-)
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index e6a1542..9bd87ba 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -364,7 +364,7 @@ void arch_mon_domain_online(struct rdt_resource *r, struct rdt_l3_mon_domain *d)
msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
}
-/* CPU models that support MSR_RMID_SNC_CONFIG */
+/* CPU models that support SNC and MSR_RMID_SNC_CONFIG */
static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
@@ -375,40 +375,14 @@ static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
{}
};
-/*
- * There isn't a simple hardware bit that indicates whether a CPU is running
- * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
- * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
- * the same NUMA node as CPU0.
- * It is not possible to accurately determine SNC state if the system is
- * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
- * to L3 caches. It will be OK if system is booted with hyperthreading
- * disabled (since this doesn't affect the ratio).
- */
static __init int snc_get_config(void)
{
- struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
- const cpumask_t *node0_cpumask;
- int cpus_per_node, cpus_per_l3;
- int ret;
-
- if (!x86_match_cpu(snc_cpu_ids) || !ci)
- return 1;
+ int ret = topology_num_nodes_per_package();
- cpus_read_lock();
- if (num_online_cpus() != num_present_cpus())
- pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
- cpus_read_unlock();
-
- node0_cpumask = cpumask_of_node(cpu_to_node(0));
-
- cpus_per_node = cpumask_weight(node0_cpumask);
- cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);
-
- if (!cpus_per_node || !cpus_per_l3)
+ if (ret > 1 && !x86_match_cpu(snc_cpu_ids)) {
+ pr_warn("CoD enabled system? Resctrl not supported\n");
return 1;
-
- ret = cpus_per_l3 / cpus_per_node;
+ }
/* sanity check: Only valid results are 1, 2, 3, 4, 6 */
switch (ret) {
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v2 0/5] x86/topo: SNC Divination
2026-03-03 10:55 [PATCH v2 0/5] x86/topo: SNC Divination Peter Zijlstra
` (4 preceding siblings ...)
2026-03-03 10:55 ` [PATCH v2 5/5] x86/resctrl: Fix SNC detection Peter Zijlstra
@ 2026-03-03 12:01 ` Ingo Molnar
5 siblings, 0 replies; 14+ messages in thread
From: Ingo Molnar @ 2026-03-03 12:01 UTC (permalink / raw)
To: Peter Zijlstra
Cc: x86, tglx, linux-kernel, tim.c.chen, yu.c.chen, kyle.meyer,
vinicius.gomes, brgerst, hpa, kprateek.nayak, patryk.wlazlyn,
rafael.j.wysocki, russ.anderson, zhao1.liu, tony.luck
* Peter Zijlstra <peterz@infradead.org> wrote:
> Hi!
>
> Extend Thomas' MADT time topology code to include SRAT based node counts to
> provide a boot time based: topology_num_nodes_per_package().
>
> This provides a best effort estimate for things like Intel CoD/SNC and AMD NPS
> modes that are not otherwise enumerated.
>
> Use this measure to address various SNC snafus.
>
> v1: https://lkml.kernel.org/r/20260226104909.675623579@infradead.org
>
> ---
> arch/x86/include/asm/numa.h | 6 ++
> arch/x86/include/asm/topology.h | 6 ++
> arch/x86/kernel/cpu/common.c | 3 +
> arch/x86/kernel/cpu/resctrl/monitor.c | 36 +------
> arch/x86/kernel/cpu/topology.c | 13 ++-
> arch/x86/kernel/smpboot.c | 198 ++++++++++++++++++++++++----------
> arch/x86/mm/numa.c | 8 ++
> arch/x86/mm/srat.c | 2 +
> 8 files changed, 184 insertions(+), 88 deletions(-)
Modulo some minor comments for patch #4, this series LGTM:
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Thanks,
Ingo
^ permalink raw reply [flat|nested] 14+ messages in thread