LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: "Christophe Leroy (CS GROUP)" <chleroy@kernel.org>
To: Srikar Dronamraju <srikar@linux.ibm.com>,
	linuxppc-dev <linuxppc-dev@lists.ozlabs.org>,
	Madhavan Srinivasan <maddy@linux.ibm.com>,
	Michael Ellerman <mpe@ellerman.id.au>,
	Nicholas Piggin <npiggin@gmail.com>,
	Naveen N Rao <naveen@kernel.org>
Cc: skiboot@lists.ozlags.org, arbab@linux.ibm.com,
	mahesh@linux.ibm.com, linux-kernel@vger.kernel.org
Subject: Re: [PATCH] powerpc/topology: Support coregroup in PowerNV
Date: Fri, 29 May 2026 09:58:25 +0200	[thread overview]
Message-ID: <6387fb12-76f1-4503-8ac2-3ca777c0186e@kernel.org> (raw)
In-Reply-To: <20260524010017.140408-1-srikar@linux.ibm.com>



Le 24/05/2026 à 03:00, Srikar Dronamraju a écrit :
> Coregroup support was only available on PowerPC on PowerVM LPARs.
> However if firmware were to expose coregroup-id to the kernel, then
> coregroup can even be supported on PowerNV too.
> 
> PowerPC Linux kernel will detect support for coregroup by looking at the
> primary_domain_index. Till now on PowerNV systems, primary_domain_index
> has been the penultimate domain in cpunode ibm,associativity device-tree
> property. This would be taken as hint that coregroup support is not
> available in the firmware.
> 
> If on PowerNV systems, primary_domain_index is not the penultimate
> domain in cpunode ibm,associativity device-tree property, then it would
> be taken as a hint that coregroup support is available in the firmware.
> 
> This logic makes it compatible with PowerVM Systems, where
> primary_domain_index is not the penultimate domain in cpunode
> ibm,associativity device-tree property.
> 
> $ lscpu
> Architecture:        ppc64le
> Byte Order:          Little Endian
> CPU(s):              480
> On-line CPU(s) list: 0-479
> Thread(s) per core:  8
> Core(s) per socket:  15
> Socket(s):           4
> NUMA node(s):        4
> Model:               2.0 (pvr 0080 0200)
> Model name:          POWER10, altivec supported
> CPU max MHz:         3249.0000
> CPU min MHz:         3249.0000
> L1d cache:           32K
> L1i cache:           48K
> L2 cache:            1024K
> L3 cache:            4096K
> NUMA node0 CPU(s):   0-119
> NUMA node1 CPU(s):   120-239
> NUMA node2 CPU(s):   240-359
> NUMA node3 CPU(s):   360-479
> 
> with-out patched firmware and/or Linux-kernel
> ---------------------------------------------
> $ grep -h -r . /sys/devices/system/cpu/*/topology/die_id |sort | uniq -c | sort -n -r
>      120 27
>      120 18
>      120 9
>      120 0
> 
> $ grep -h -r . /sys/devices/system/cpu/*/topology/die_cpus_list |sort | uniq -c | sort -n -r
>      120 360-479
>      120 240-359
>      120 120-239
>      120 0-119
> 
> with patched firmware and Linux-kernel
> --------------------------------------
> grep -h -r . /sys/devices/system/cpu/*/topology/die_id |sort | uniq -c | sort -n -r
>       64 6
>       64 4
>       64 2
>       64 0
>       56 7
>       56 5
>       56 3
>       56 1
> grep -h -r . /sys/devices/system/cpu/*/topology/die_cpus_list |sort | uniq -c | sort -n -r
>       64 360-375,392-407,424-439,456-471
>       64 240-255,272-287,296-311,328-343
>       64 120-135,152-167,184-199,208-223
>       64 0-15,32-47,64-79,88-103
>       56 376-391,408-423,440-455,472-479
>       56 256-271,288-295,312-327,344-359
>       56 16-31,48-63,80-87,104-119
>       56 136-151,168-183,200-207,224-239
> 
> Observation:
> Without the patched kernel and/or skiboot, die-id and numa were same.
> With patched kernel and/or skiboot, we see 2 die-id per node.
> Signed-off-by: Srikar Dronamraju <srikar@linux.ibm.com>
> ---
>   arch/powerpc/include/asm/topology.h | 15 +++--
>   arch/powerpc/mm/numa.c              | 87 ++++++++++++++++++++++-------
>   2 files changed, 73 insertions(+), 29 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
> index 66ed5fe1b718..568e6bc55726 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -71,6 +71,7 @@ extern void map_cpu_to_node(int cpu, int node);
>   extern void unmap_cpu_from_node(unsigned long cpu);
>   #endif /* CONFIG_HOTPLUG_CPU */
>   
> +extern int cpu_to_coregroup_id(int cpu);

No new 'extern' for function prototypes (allthough it is only a move), 
that's pointless. checkpatch.pl --strict will likely complain about it.

>   #else
>   
>   static inline int early_cpu_to_node(int cpu) { return 0; }
> @@ -107,14 +108,6 @@ static inline void map_cpu_to_node(int cpu, int node) {}
>   static inline void unmap_cpu_from_node(unsigned long cpu) {}
>   #endif /* CONFIG_HOTPLUG_CPU */
>   #endif /* CONFIG_SMP */
> -
> -#endif /* CONFIG_NUMA */
> -
> -#if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
> -void find_and_update_cpu_nid(int cpu);
> -extern int cpu_to_coregroup_id(int cpu);
> -#else
> -static inline void find_and_update_cpu_nid(int cpu) {}
>   static inline int cpu_to_coregroup_id(int cpu)
>   {
>   #ifdef CONFIG_SMP
> @@ -124,6 +117,12 @@ static inline int cpu_to_coregroup_id(int cpu)
>   #endif
>   }
>   
> +#endif /* CONFIG_NUMA */
> +
> +#if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
> +void find_and_update_cpu_nid(int cpu);
> +#else
> +static inline void find_and_update_cpu_nid(int cpu) {}
>   #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
>   
>   #include <asm-generic/topology.h>
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index f4cf3ae036de..9b45cc9e1f27 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -432,7 +432,7 @@ static void __init initialize_form2_numa_distance_lookup_table(void)
>   
>   static int __init find_primary_domain_index(void)
>   {
> -	int index;
> +	int index = -1;
>   	struct device_node *root;
>   
>   	/*
> @@ -502,12 +502,9 @@ static int __init find_primary_domain_index(void)
>   		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
>   	}
>   
> -	of_node_put(root);
> -	return index;
> -
>   err:
>   	of_node_put(root);
> -	return -1;
> +	return index;
>   }
>   
>   static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
> @@ -892,12 +889,32 @@ static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
>   	return 0;
>   }
>   
> +/*
> + * If hierarchy extends beyond primary_domain_index + 1, then next
> + * level corresponds to coregroup.
> + */
> +static int detect_and_enable_coregroup(const __be32 *associativity, int index)
> +{
> +	if (!associativity)
> +		return -1;

Even if 'index' is not 0 ?

> +
> +	if (!index) {
> +		index = of_read_number(associativity, 1);
> +
> +		if (index > primary_domain_index + 1)
> +			coregroup_enabled = 1;
> +		else
> +			index = -1;
> +	}
> +	return index;
> +}

I'd prefer something more flat:

	if (index)
		return index;

	index =  of_read_number(associativity, 1);

	if (index <= primary_domain_index + 1)
		return -1;

	coregroup_enabled = 1;
	return index;

> +
>   static int __init parse_numa_properties(void)
>   {
>   	struct device_node *memory, *pci;
> -	int default_nid = 0;
> -	unsigned long i;
> +	int default_nid = 0, index = 0;
>   	const __be32 *associativity;
> +	unsigned long i;
>   
>   	if (numa_enabled == 0) {
>   		pr_warn("disabled by user\n");
> @@ -930,7 +947,6 @@ static int __init parse_numa_properties(void)
>   	 */
>   	for_each_present_cpu(i) {
>   		__be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
> -		struct device_node *cpu;
>   		int nid = NUMA_NO_NODE;
>   
>   		memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));
> @@ -938,7 +954,10 @@ static int __init parse_numa_properties(void)
>   		if (__vphn_get_associativity(i, vphn_assoc) == 0) {
>   			nid = associativity_to_nid(vphn_assoc);
>   			initialize_form1_numa_distance(vphn_assoc);
> +			if (!index)
> +				index = detect_and_enable_coregroup(vphn_assoc, index);
>   		} else {
> +			struct device_node *cpu;
>   
>   			/*
>   			 * Don't fall back to default_nid yet -- we will plug
> @@ -951,6 +970,8 @@ static int __init parse_numa_properties(void)
>   			associativity = of_get_associativity(cpu);
>   			if (associativity) {
>   				nid = associativity_to_nid(associativity);
> +				if (!index)
> +					index = detect_and_enable_coregroup(associativity, index);
>   				initialize_form1_numa_distance(associativity);
>   			}
>   			of_node_put(cpu);
> @@ -1431,9 +1452,26 @@ void find_and_update_cpu_nid(int cpu)
>   	pr_debug("%s:%d cpu %d nid %d\n", __func__, __LINE__, cpu, new_nid);
>   }
>   
> +static int topology_update_init(void)
> +{
> +	topology_inited = 1;
> +	return 0;
> +}
> +device_initcall(topology_update_init);
> +
> +#else
> +static long vphn_get_associativity(unsigned long cpu,
> +					__be32 *associativity)
> +{
> +	return -1;
> +}
> +#endif /* CONFIG_PPC_SPLPAR */
> +
>   int cpu_to_coregroup_id(int cpu)
>   {
> -	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
> +	int coregroup_id = cpu_to_core_id(cpu);
> +	struct device_node *cpunode = NULL;
> +	const __be32 *associativity;
>   	int index;
>   
>   	if (cpu < 0 || cpu > nr_cpu_ids)
> @@ -1442,24 +1480,31 @@ int cpu_to_coregroup_id(int cpu)
>   	if (!coregroup_enabled)
>   		goto out;
>   
> -	if (!firmware_has_feature(FW_FEATURE_VPHN))
> -		goto out;
> +	if (firmware_has_feature(FW_FEATURE_VPHN)) {
> +		__be32 tmp[VPHN_ASSOC_BUFSIZE] = {0};
>   
> -	if (vphn_get_associativity(cpu, associativity))
> +		if (vphn_get_associativity(cpu, tmp))
> +			goto out;
> +
> +		associativity = tmp;
> +
> +	} else {
> +		cpunode = of_get_cpu_node(cpu, NULL);
> +		if (!cpunode)
> +			goto out;
> +
> +		associativity = of_get_associativity(cpunode);
> +	}
> +	if (!associativity)
>   		goto out;
>   
>   	index = of_read_number(associativity, 1);
>   	if (index > primary_domain_index + 1)
> -		return of_read_number(&associativity[index - 1], 1);
> +		coregroup_id = of_read_number(&associativity[index - 1], 1);
>   
>   out:
> -	return cpu_to_core_id(cpu);
> -}
> +	if (cpunode)
> +		of_node_put(cpunode);
>   
> -static int topology_update_init(void)
> -{
> -	topology_inited = 1;
> -	return 0;
> +	return coregroup_id;
>   }
> -device_initcall(topology_update_init);
> -#endif /* CONFIG_PPC_SPLPAR */



  reply	other threads:[~2026-05-29  7:58 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-24  1:00 [PATCH] powerpc/topology: Support coregroup in PowerNV Srikar Dronamraju
2026-05-29  7:58 ` Christophe Leroy (CS GROUP) [this message]
2026-05-29 10:31   ` Srikar Dronamraju

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=6387fb12-76f1-4503-8ac2-3ca777c0186e@kernel.org \
    --to=chleroy@kernel.org \
    --cc=arbab@linux.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=maddy@linux.ibm.com \
    --cc=mahesh@linux.ibm.com \
    --cc=mpe@ellerman.id.au \
    --cc=naveen@kernel.org \
    --cc=npiggin@gmail.com \
    --cc=skiboot@lists.ozlags.org \
    --cc=srikar@linux.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox