linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
To: David Gibson <david@gibson.dropbear.id.au>
Cc: Nathan Lynch <nathanl@linux.ibm.com>,
	Daniel Henrique Barboza <danielhb413@gmail.com>,
	linuxppc-dev@lists.ozlabs.org
Subject: Re: [PATCH v5 4/6] powerpc/pseries: Consolidate different NUMA distance update code paths
Date: Thu, 22 Jul 2021 12:37:46 +0530	[thread overview]
Message-ID: <87zgueu8ql.fsf@linux.ibm.com> (raw)
In-Reply-To: <YPjMkQ5W1fSQdzNe@yekko>

David Gibson <david@gibson.dropbear.id.au> writes:

> On Mon, Jun 28, 2021 at 08:41:15PM +0530, Aneesh Kumar K.V wrote:
>> The associativity details of the newly added resources are collected from
>> the hypervisor via "ibm,configure-connector" rtas call. Update the numa
>> distance details of the newly added numa node after the above call.
>> 
>> Instead of updating NUMA distance every time we lookup a node id
>> from the associativity property, add helpers that can be used
>> during boot which do this only once. Also remove the distance
>> update from node id lookup helpers.
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>  arch/powerpc/mm/numa.c                        | 173 +++++++++++++-----
>>  arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
>>  .../platforms/pseries/hotplug-memory.c        |   2 +
>>  arch/powerpc/platforms/pseries/pseries.h      |   1 +
>>  4 files changed, 132 insertions(+), 46 deletions(-)
>> 
>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> index 0ec16999beef..7b142f79d600 100644
>> --- a/arch/powerpc/mm/numa.c
>> +++ b/arch/powerpc/mm/numa.c
>> @@ -208,22 +208,6 @@ int __node_distance(int a, int b)
>>  }
>>  EXPORT_SYMBOL(__node_distance);
>>  
>> -static void initialize_distance_lookup_table(int nid,
>> -		const __be32 *associativity)
>> -{
>> -	int i;
>> -
>> -	if (affinity_form != FORM1_AFFINITY)
>> -		return;
>> -
>> -	for (i = 0; i < max_associativity_domain_index; i++) {
>> -		const __be32 *entry;
>> -
>> -		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
>> -		distance_lookup_table[nid][i] = of_read_number(entry, 1);
>> -	}
>> -}
>> -
>>  /*
>>   * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
>>   * info is found.
>> @@ -241,15 +225,6 @@ static int associativity_to_nid(const __be32 *associativity)
>>  	/* POWER4 LPAR uses 0xffff as invalid node */
>>  	if (nid == 0xffff || nid >= nr_node_ids)
>>  		nid = NUMA_NO_NODE;
>> -
>> -	if (nid > 0 &&
>> -		of_read_number(associativity, 1) >= max_associativity_domain_index) {
>> -		/*
>> -		 * Skip the length field and send start of associativity array
>> -		 */
>> -		initialize_distance_lookup_table(nid, associativity + 1);
>> -	}
>> -
>>  out:
>>  	return nid;
>>  }
>> @@ -287,6 +262,49 @@ int of_node_to_nid(struct device_node *device)
>>  }
>>  EXPORT_SYMBOL(of_node_to_nid);
>>  
>> +static void __initialize_form1_numa_distance(const __be32 *associativity)
>> +{
>> +	int i, nid;
>> +
>> +	if (affinity_form != FORM1_AFFINITY)
>
> Since this shouldn't be called on a !form1 system, this could be a WARN_ON().

With the way we currently call these functions, instead of doing

if (affinity_form == FORM1_AFFINITY)
    __initialize_form1_numa_distance()

at each call site, we avoid repeating the if check in multiple places. For example
parse_numa_properties will fetch the associativity array to find the
details of online node and set it online. We use the same code path to
initialize distance.

		if (__vphn_get_associativity(i, vphn_assoc) == 0) {
			nid = associativity_to_nid(vphn_assoc);
			__initialize_form1_numa_distance(vphn_assoc);
		} else {

			cpu = of_get_cpu_node(i, NULL);
			BUG_ON(!cpu);

			associativity = of_get_associativity(cpu);
			if (associativity) {
				nid = associativity_to_nid(associativity);
				__initialize_form1_numa_distance(associativity);
			}

We avoid the if (affinity_form == FORM1_AFFINITY) check there by
moving the check inside __initialize_form1_numa_distance().


>
>> +		return;
>> +
>> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
>> +		nid = of_read_number(&associativity[primary_domain_index], 1);
>
> This computes the nid from the assoc array independently of
> associativity_to_nid, which doesn't seem like a good idea.  Wouldn't
> it be better to call associativity_to_nid(), then make the next bit
> conditional on nid != NUMA_NO_NODE?

@@ -302,9 +302,8 @@ static void __initialize_form1_numa_distance(const __be32 *associativity)
 	if (affinity_form != FORM1_AFFINITY)
 		return;
 
-	if (of_read_number(associativity, 1) >= primary_domain_index) {
-		nid = of_read_number(&associativity[primary_domain_index], 1);
-
+	nid = associativity_to_nid(associativity);
+	if (nid != NUMA_NO_NODE) {
 		for (i = 0; i < distance_ref_points_depth; i++) {
 			const __be32 *entry;
 

>
>> +
>> +		for (i = 0; i < max_associativity_domain_index; i++) {
>> +			const __be32 *entry;
>> +
>> +			entry = &associativity[be32_to_cpu(distance_ref_points[i])];
>> +			distance_lookup_table[nid][i] = of_read_number(entry, 1);
>> +		}
>> +	}
>> +}
>> +
>> +static void initialize_form1_numa_distance(struct device_node *node)
>> +{
>> +	const __be32 *associativity;
>> +
>> +	associativity = of_get_associativity(node);
>> +	if (!associativity)
>> +		return;
>> +
>> +	__initialize_form1_numa_distance(associativity);
>> +}
>> +
>> +/*
>> + * Used to update distance information w.r.t newly added node.
>> + */
>> +void update_numa_distance(struct device_node *node)
>> +{
>> +	if (affinity_form == FORM0_AFFINITY)
>> +		return;
>> +	else if (affinity_form == FORM1_AFFINITY) {
>> +		initialize_form1_numa_distance(node);
>> +		return;
>> +	}
>> +}
>> +
>>  static int __init find_primary_domain_index(void)
>>  {
>>  	int index;
>> @@ -433,6 +451,48 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
>>  	return 0;
>>  }
>>  
>> +static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
>> +{
>> +	struct assoc_arrays aa = { .arrays = NULL };
>> +	int default_nid = NUMA_NO_NODE;
>> +	int nid = default_nid;
>> +	int rc, index;
>> +
>> +	if ((primary_domain_index < 0) || !numa_enabled)
>
> Under what circumstances could you get primary_domain_index < 0?

IIUC that is to handle failure to parse the device tree; see commit
ea9f5b702fe0215188fba2eda117419e4ae90a67.

>
>> +		return default_nid;
>> +
>> +	rc = of_get_assoc_arrays(&aa);
>> +	if (rc)
>> +		return default_nid;
>> +
>> +	if (primary_domain_index <= aa.array_sz &&
>> +	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
>> +		index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
>
> Does anywhere verify that primary_domain_index <= aa.array_sz?

That is the first part of the check?

>
>> +		nid = of_read_number(&aa.arrays[index], 1);
>> +
>> +		if (nid == 0xffff || nid >= nr_node_ids)
>> +			nid = default_nid;
>> +		if (nid > 0 && affinity_form == FORM1_AFFINITY) {
>> +			int i;
>> +			const __be32 *associativity;
>> +
>> +			index = lmb->aa_index * aa.array_sz;
>> +			associativity = &aa.arrays[index];
>> +			/*
>> +			 * lookup array associativity entries have different format
>> +			 * There is no length of the array as the first element.
>
> The difference is very small, and this is not a hot path.  Couldn't
> you reduce a chunk of code by prepending aa.array_sz, then re-using
> __initialize_form1_numa_distance.  Or even making
> __initialize_form1_numa_distance() take the length as a parameter.

The changes are small but confusing w.r.t how we look at the
associativity-lookup-arrays. The way we interpret associativity array
and associativity lookup array using primary_domain_index is different.
Hence the '-1' in the node lookup here. 

	index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
	nid = of_read_number(&aa.arrays[index], 1);


>
>> +			 */
>> +			for (i = 0; i < max_associativity_domain_index; i++) {
>> +				const __be32 *entry;
>> +
>> +				entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
>
> Does anywhere verify that distance_ref_points[i] <= aa.array_sz for
> every i?

We do check for 

	if (primary_domain_index <= aa.array_sz &&

>
>> +				distance_lookup_table[nid][i] = of_read_number(entry, 1);
>> +			}
>> +		}
>> +	}
>> +	return nid;
>> +}
>> +
>>  /*
>>   * This is like of_node_to_nid_single() for memory represented in the
>>   * ibm,dynamic-reconfiguration-memory node.
>> @@ -458,21 +518,14 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>>  
>>  		if (nid == 0xffff || nid >= nr_node_ids)
>>  			nid = default_nid;
>> -
>> -		if (nid > 0) {
>> -			index = lmb->aa_index * aa.array_sz;
>> -			initialize_distance_lookup_table(nid,
>> -							&aa.arrays[index]);
>> -		}
>>  	}
>> -
>>  	return nid;
>>  }
>>  
>>  #ifdef CONFIG_PPC_SPLPAR
>> -static int vphn_get_nid(long lcpu)
>> +
>> +static int __vphn_get_associativity(long lcpu, __be32 *associativity)
>>  {
>> -	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
>>  	long rc, hwid;
>>  
>>  	/*
>> @@ -492,10 +545,22 @@ static int vphn_get_nid(long lcpu)
>>  
>>  		rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
>>  		if (rc == H_SUCCESS)
>> -			return associativity_to_nid(associativity);
>> +			return 0;
>>  	}
>>  
>> +	return -1;
>> +}
>> +
>> +static int vphn_get_nid(long lcpu)
>> +{
>> +	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
>> +
>> +
>> +	if (!__vphn_get_associativity(lcpu, associativity))
>> +		return associativity_to_nid(associativity);
>> +
>>  	return NUMA_NO_NODE;
>> +
>>  }
>>  #else
>>  static int vphn_get_nid(long unused)
>> @@ -692,7 +757,7 @@ static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
>>  			size = read_n_cells(n_mem_size_cells, usm);
>>  		}
>>  
>> -		nid = of_drconf_to_nid_single(lmb);
>> +		nid = get_nid_and_numa_distance(lmb);
>>  		fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
>>  					  &nid);
>>  		node_set_online(nid);
>> @@ -709,6 +774,7 @@ static int __init parse_numa_properties(void)
>>  	struct device_node *memory;
>>  	int default_nid = 0;
>>  	unsigned long i;
>> +	const __be32 *associativity;
>>  
>>  	if (numa_enabled == 0) {
>>  		printk(KERN_WARNING "NUMA disabled by user\n");
>> @@ -734,18 +800,30 @@ static int __init parse_numa_properties(void)
>>  	 * each node to be onlined must have NODE_DATA etc backing it.
>>  	 */
>>  	for_each_present_cpu(i) {
>> +		__be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
>>  		struct device_node *cpu;
>> -		int nid = vphn_get_nid(i);
>> +		int nid = NUMA_NO_NODE;
>>  
>> -		/*
>> -		 * Don't fall back to default_nid yet -- we will plug
>> -		 * cpus into nodes once the memory scan has discovered
>> -		 * the topology.
>> -		 */
>> -		if (nid == NUMA_NO_NODE) {
>> +		memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));
>
> What's the memset() for?  AFAICT you only look at vphn_assoc in the
> branch where __vphn_get_associativity() succeeds.

That was done to match the existing code. We do use a zero-filled array
when making that hcall in this code path. I don't see us doing that
everywhere, but I didn't want to change that behaviour in this patch.

-static int vphn_get_nid(long lcpu)
+
+static int __vphn_get_associativity(long lcpu, __be32 *associativity)
 {
 -	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
  	long rc, hwid;

>
>> +
>> +		if (__vphn_get_associativity(i, vphn_assoc) == 0) {
>> +			nid = associativity_to_nid(vphn_assoc);
>> +			__initialize_form1_numa_distance(vphn_assoc);
>> +		} else {
>> +
>> +			/*
>> +			 * Don't fall back to default_nid yet -- we will plug
>> +			 * cpus into nodes once the memory scan has discovered
>> +			 * the topology.
>> +			 */
>>  			cpu = of_get_cpu_node(i, NULL);
>>  			BUG_ON(!cpu);
>> -			nid = of_node_to_nid_single(cpu);
>> +
>> +			associativity = of_get_associativity(cpu);
>> +			if (associativity) {
>> +				nid = associativity_to_nid(associativity);
>> +				__initialize_form1_numa_distance(associativity);
>> +			}
>>  			of_node_put(cpu);
>>  		}
>>  
>> @@ -781,8 +859,11 @@ static int __init parse_numa_properties(void)
>>  		 * have associativity properties.  If none, then
>>  		 * everything goes to default_nid.
>>  		 */
>> -		nid = of_node_to_nid_single(memory);
>> -		if (nid < 0)
>> +		associativity = of_get_associativity(memory);
>> +		if (associativity) {
>> +			nid = associativity_to_nid(associativity);
>> +			__initialize_form1_numa_distance(associativity);
>> +		} else
>>  			nid = default_nid;
>>  
>>  		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
>> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> index 7e970f81d8ff..778b6ab35f0d 100644
>> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> @@ -498,6 +498,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
>>  		return saved_rc;
>>  	}
>>  
>> +	update_numa_distance(dn);
>> +
>>  	rc = dlpar_online_cpu(dn);
>>  	if (rc) {
>>  		saved_rc = rc;
>> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
>> index 36f66556a7c6..40d350f31a34 100644
>> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
>> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
>> @@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb)
>>  		return -ENODEV;
>>  	}
>>  
>> +	update_numa_distance(lmb_node);
>> +
>>  	dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
>>  	if (!dr_node) {
>>  		dlpar_free_cc_nodes(lmb_node);
>> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
>> index 1f051a786fb3..663a0859cf13 100644
>> --- a/arch/powerpc/platforms/pseries/pseries.h
>> +++ b/arch/powerpc/platforms/pseries/pseries.h
>> @@ -113,4 +113,5 @@ extern u32 pseries_security_flavor;
>>  void pseries_setup_security_mitigations(void);
>>  void pseries_lpar_read_hblkrm_characteristics(void);
>>  
>> +void update_numa_distance(struct device_node *node);
>>  #endif /* _PSERIES_PSERIES_H */
>
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson

  reply	other threads:[~2021-07-22  7:08 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-06-28 15:11 [PATCH v5 0/6] Add support for FORM2 associativity Aneesh Kumar K.V
2021-06-28 15:11 ` [PATCH v5 1/6] powerpc/pseries: rename min_common_depth to primary_domain_index Aneesh Kumar K.V
2021-07-22  1:59   ` David Gibson
2021-07-22  2:36     ` David Gibson
2021-07-22  5:17       ` Aneesh Kumar K.V
2021-07-26  2:28         ` David Gibson
2021-06-28 15:11 ` [PATCH v5 2/6] powerpc/pseries: rename distance_ref_points_depth to max_associativity_domain_index Aneesh Kumar K.V
2021-07-22  0:59   ` David Gibson
2021-07-22  1:19     ` David Gibson
2021-06-28 15:11 ` [PATCH v5 3/6] powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY Aneesh Kumar K.V
2021-06-28 15:11 ` [PATCH v5 4/6] powerpc/pseries: Consolidate different NUMA distance update code paths Aneesh Kumar K.V
2021-06-28 20:21   ` kernel test robot
2021-06-28 20:40   ` kernel test robot
2021-07-22  1:40   ` David Gibson
2021-07-22  7:07     ` Aneesh Kumar K.V [this message]
2021-07-26  2:37       ` David Gibson
2021-07-27  3:32         ` Aneesh Kumar K.V
2021-07-27  5:59           ` David Gibson
2021-06-28 15:11 ` [PATCH v5 5/6] powerpc/pseries: Add a helper for form1 cpu distance Aneesh Kumar K.V
2021-07-22  1:42   ` David Gibson
2021-07-22  7:09     ` Aneesh Kumar K.V
2021-07-26  2:38       ` David Gibson
2021-06-28 15:11 ` [PATCH v5 6/6] powerpc/pseries: Add support for FORM2 associativity Aneesh Kumar K.V
2021-07-22  2:28   ` David Gibson
2021-07-22  7:34     ` Aneesh Kumar K.V
2021-07-26  2:41       ` David Gibson
2021-07-13 14:27 ` [PATCH v5 0/6] " Daniel Henrique Barboza
2021-07-13 14:30   ` Aneesh Kumar K.V

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87zgueu8ql.fsf@linux.ibm.com \
    --to=aneesh.kumar@linux.ibm.com \
    --cc=danielhb413@gmail.com \
    --cc=david@gibson.dropbear.id.au \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=nathanl@linux.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).