LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [RFC v4 2/4] hotplug/drcinfo: Provide parser with callback
From: Michael Bringmann @ 2018-05-23  0:42 UTC (permalink / raw)
  To: Nathan Fontenot, linuxppc-dev; +Cc: John Allen, Thomas Falcon, Tyrel Datwyler
In-Reply-To: <516edea6-8554-cdc6-db6d-9f6661e65ec4@linux.vnet.ibm.com>

See below.

On 05/22/2018 04:02 PM, Nathan Fontenot wrote:
> On 05/22/2018 11:37 AM, Michael Bringmann wrote:
>> This patch provides a common parse function for the ibm,drc-info
>> property that can be modified by a callback function.  The caller
>> provides a pointer to the function and a pointer to their unique
>> data, and the parser provides the current lmb set from the struct.
>> The callback function may return codes indicating that the parsing
>> is complete, or should continue, along with an error code that may
>> be returned to the caller.
>>
>> Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
>> Fixes: 3f38000eda48 ("powerpc/firmware: Add definitions for new drc-info firmwar
>> e feature" -- end of patch series applied to powerpc next)
>> ---
>> Changes in V4:
>>   -- Update code to account for latest kernel checkins.
>>   -- Rebased to 4.17-rc5 kernel
>>   -- Some patch cleanup including file combination
>> ---
>>  arch/powerpc/include/asm/prom.h             |    7 +++++
>>  arch/powerpc/platforms/pseries/of_helpers.c |   37 +++++++++++++++++++++++++++
>>  2 files changed, 44 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
>> index b04c5ce..2e947b3 100644
>> --- a/arch/powerpc/include/asm/prom.h
>> +++ b/arch/powerpc/include/asm/prom.h
>> @@ -94,6 +94,13 @@ struct of_drc_info {
>>  extern int of_read_drc_info_cell(struct property **prop,
>>  			const __be32 **curval, struct of_drc_info *data);
>>
>> +extern int drc_info_parser(struct device_node *dn,
>> +			int (*usercb)(struct of_drc_info *drc,
>> +					void *data,
>> +					void *optional_data,
> 
> The optional_data parameter to the callback routine doesn't seem to be used.
> 
> -Nathan

There was one odd case when parsing the 'ibm,drc-info' code where
I remember needing extra data.  I think that I can compress it all
into the other pointer though.

Will remove the extra argument.

Michael
> 
>> +					int *ret_code),
>> +			char *opt_drc_type,
>> +			void *data);
>>
>>  /*
>>   * There are two methods for telling firmware what our capabilities are.
>> diff --git a/arch/powerpc/platforms/pseries/of_helpers.c b/arch/powerpc/platforms/pseries/of_helpers.c
>> index 11b2ef1..a588ee6 100644
>> --- a/arch/powerpc/platforms/pseries/of_helpers.c
>> +++ b/arch/powerpc/platforms/pseries/of_helpers.c
>> @@ -6,6 +6,9 @@
>>  #include <asm/prom.h>
>>
>>  #include "of_helpers.h"
>> +#include "pseries.h"
>> +
>> +#define	MAX_DRC_NAME_LEN 64
>>
>>  /**
>>   * pseries_of_derive_parent - basically like dirname(1)
>> @@ -87,3 +90,37 @@ int of_read_drc_info_cell(struct property **prop, const __be32 **curval,
>>  	return 0;
>>  }
>>  EXPORT_SYMBOL(of_read_drc_info_cell);
>> +
>> +int drc_info_parser(struct device_node *dn,
>> +		int (*usercb)(struct of_drc_info *drc,
>> +				void *data,
>> +				void *optional_data,
>> +				int *ret_code),
>> +		char *opt_drc_type,
>> +		void *data)
>> +{
>> +	struct property *info;
>> +	unsigned int entries;
>> +	struct of_drc_info drc;
>> +	const __be32 *value;
>> +	int j, done = 0, ret_code = -EINVAL;
>> +
>> +	info = of_find_property(dn, "ibm,drc-info", NULL);
>> +	if (info == NULL)
>> +		return -EINVAL;
>> +
>> +	value = info->value;
>> +	entries = of_read_number(value++, 1);
>> +
>> +	for (j = 0, done = 0; (j < entries) && (!done); j++) {
>> +		of_read_drc_info_cell(&info, &value, &drc);
>> +
>> +		if (opt_drc_type && strcmp(opt_drc_type, drc.drc_type))
>> +			continue;
>> +
>> +		done = usercb(&drc, data, NULL, &ret_code);
>> +	}
>> +
>> +	return ret_code;
>> +}
>> +EXPORT_SYMBOL(drc_info_parser);
>>
> 
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:       (512) 466-0650
mwb@linux.vnet.ibm.com

^ permalink raw reply

* Re: [RFC v4 2/4] hotplug/drcinfo: Provide parser with callback
From: Michael Bringmann @ 2018-05-23  0:59 UTC (permalink / raw)
  To: Nathan Fontenot, linuxppc-dev; +Cc: John Allen, Thomas Falcon, Tyrel Datwyler
In-Reply-To: <c9f225f4-591b-37b6-3d70-bf1ae7395a0c@linux.vnet.ibm.com>

See below.

On 05/22/2018 04:23 PM, Nathan Fontenot wrote:
> On 05/22/2018 11:37 AM, Michael Bringmann wrote:
>> This patch provides a common parse function for the ibm,drc-info
>> property that can be modified by a callback function.  The caller
>> provides a pointer to the function and a pointer to their unique
>> data, and the parser provides the current lmb set from the struct.
>> The callback function may return codes indicating that the parsing
>> is complete, or should continue, along with an error code that may
>> be returned to the caller.
>>
>> Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
>> Fixes: 3f38000eda48 ("powerpc/firmware: Add definitions for new drc-info firmwar
>> e feature" -- end of patch series applied to powerpc next)
>> ---
>> Changes in V4:
>>   -- Update code to account for latest kernel checkins.
>>   -- Rebased to 4.17-rc5 kernel
>>   -- Some patch cleanup including file combination
>> ---
>>  arch/powerpc/include/asm/prom.h             |    7 +++++
>>  arch/powerpc/platforms/pseries/of_helpers.c |   37 +++++++++++++++++++++++++++
>>  2 files changed, 44 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
>> index b04c5ce..2e947b3 100644
>> --- a/arch/powerpc/include/asm/prom.h
>> +++ b/arch/powerpc/include/asm/prom.h
>> @@ -94,6 +94,13 @@ struct of_drc_info {
>>  extern int of_read_drc_info_cell(struct property **prop,
>>  			const __be32 **curval, struct of_drc_info *data);
>>
>> +extern int drc_info_parser(struct device_node *dn,
>> +			int (*usercb)(struct of_drc_info *drc,
>> +					void *data,
>> +					void *optional_data,
>> +					int *ret_code),
>> +			char *opt_drc_type,
>> +			void *data);
> 
> After looking at the patch 3 in this series, I think a couple of comments and
> a small change may help. It was not clear at first what the call back function
> was supposed to return. After reading users of this routine it appears that the
> callback function is returning a bool value indicating whether or not the parser
> should continue. I documenting this and having the callback routine return a bool
> may make this clearer.

Okay.
> 
> Also, I see other places in the kernel name these types of routines as walk_*,
> perhaps a slight name change to walk_drc_info_entries() may also make it clearer
> what the code is doing.

Okay.
> 
> -Nathan

Michael
> 
>>
>>  /*
>>   * There are two methods for telling firmware what our capabilities are.
>> diff --git a/arch/powerpc/platforms/pseries/of_helpers.c b/arch/powerpc/platforms/pseries/of_helpers.c
>> index 11b2ef1..a588ee6 100644
>> --- a/arch/powerpc/platforms/pseries/of_helpers.c
>> +++ b/arch/powerpc/platforms/pseries/of_helpers.c
>> @@ -6,6 +6,9 @@
>>  #include <asm/prom.h>
>>
>>  #include "of_helpers.h"
>> +#include "pseries.h"
>> +
>> +#define	MAX_DRC_NAME_LEN 64
>>
>>  /**
>>   * pseries_of_derive_parent - basically like dirname(1)
>> @@ -87,3 +90,37 @@ int of_read_drc_info_cell(struct property **prop, const __be32 **curval,
>>  	return 0;
>>  }
>>  EXPORT_SYMBOL(of_read_drc_info_cell);
>> +
>> +int drc_info_parser(struct device_node *dn,
>> +		int (*usercb)(struct of_drc_info *drc,
>> +				void *data,
>> +				void *optional_data,
>> +				int *ret_code),
>> +		char *opt_drc_type,
>> +		void *data)
>> +{
>> +	struct property *info;
>> +	unsigned int entries;
>> +	struct of_drc_info drc;
>> +	const __be32 *value;
>> +	int j, done = 0, ret_code = -EINVAL;
>> +
>> +	info = of_find_property(dn, "ibm,drc-info", NULL);
>> +	if (info == NULL)
>> +		return -EINVAL;
>> +
>> +	value = info->value;
>> +	entries = of_read_number(value++, 1);
>> +
>> +	for (j = 0, done = 0; (j < entries) && (!done); j++) {
>> +		of_read_drc_info_cell(&info, &value, &drc);
>> +
>> +		if (opt_drc_type && strcmp(opt_drc_type, drc.drc_type))
>> +			continue;
>> +
>> +		done = usercb(&drc, data, NULL, &ret_code);
>> +	}
>> +
>> +	return ret_code;
>> +}
>> +EXPORT_SYMBOL(drc_info_parser);
>>
> 
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:       (512) 466-0650
mwb@linux.vnet.ibm.com

^ permalink raw reply

* Re: [RFC v4 3/4] hotplug/drcinfo: Fix hot-add CPU issues
From: Michael Bringmann @ 2018-05-23  1:14 UTC (permalink / raw)
  To: Nathan Fontenot, linuxppc-dev; +Cc: John Allen, Thomas Falcon, Tyrel Datwyler
In-Reply-To: <15ece8c2-1f09-9e76-ff82-de8135486cd2@linux.vnet.ibm.com>

See below.

On 05/22/2018 04:31 PM, Nathan Fontenot wrote:
> On 05/22/2018 11:37 AM, Michael Bringmann wrote:
>> This patch applies a common parse function for the ibm,drc-info
>> property that can be modified by a callback function to the
>> hot-add CPU code.  Candidate code is replaced by a call to the
>> parser including a pointer to a local context-specific functions,
>> and local data.
>>
>> In addition, a bug in the release of the previous patch set may
>> break things in some of the CPU DLPAR operations.  For instance,
>> when attempting to hot-add a new CPU or set of CPUs, the original
>> patch failed to always properly calculate the available resources,
>> and aborted the operation.
>>
>> Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
>> Fixes: 3f38000eda48 ("powerpc/firmware: Add definitions for new drc-info firmwar
>> e feature" -- end of patch series applied to powerpc next)
>> ---
>> Changes in V4:
>>   -- Update code to account for latest kernel checkins.
>>   -- Rebased to 4.17-rc5 kernel
>>   -- Compress some more code
>> ---
>>  arch/powerpc/platforms/pseries/hotplug-cpu.c    |  118 +++++++++++++++++------
>>  arch/powerpc/platforms/pseries/pseries_energy.c |  107 +++++++++++----------
>>  2 files changed, 141 insertions(+), 84 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> index 6ef77ca..ceacad9 100644
>> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> @@ -411,27 +411,63 @@ static bool dlpar_cpu_exists(struct device_node *parent, u32 drc_index)
>>  	return found;
>>  }
>>
>> -static bool valid_cpu_drc_index(struct device_node *parent, u32 drc_index)
>> +static bool check_cpu_drc_index(struct device_node *parent,
>> +				int (*cb)(struct of_drc_info *drc,
>> +					void *data, void *not_used,
>> +					int *ret_code),
>> +				void *cdata)
>>  {
>>  	bool found = false;
>> -	int rc, index;
>>
>> -	index = 0;
>> -	while (!found) {
>> -		u32 drc;
>> +	if (firmware_has_feature(FW_FEATURE_DRC_INFO)) {
>> +		if (drc_info_parser(parent, cb, "CPU", cdata))
>> +			found = true;
>> +	} else {
>> +		int index = 0;
>>
>> -		rc = of_property_read_u32_index(parent, "ibm,drc-indexes",
>> -						index++, &drc);
>> -		if (rc)
>> -			break;
>> +		while (!found) {
>> +			u32 drc;
>>
>> -		if (drc == drc_index)
>> -			found = true;
>> +			if (of_property_read_u32_index(parent,
>> +						"ibm,drc-indexes",
>> +						index++, &drc))
>> +				break;
>> +			if (cb(NULL, cdata, &drc, NULL))
>> +				found = true;
>> +		}
>>  	}
>>
>>  	return found;
>>  }
>>
>> +struct valid_drc_index_struct {
>> +	u32 targ_drc_index;
>> +};
> 
> Can you help me understand the need to encapsulate the drc_index as a struct.

At this point, it is consistency of use across all of the instances
of using 'walk_drc_info'.  I believe that there were more values in
the structure earlier on.

> 
>> +
>> +static int valid_drc_index_cb(struct of_drc_info *drc, void *idata,
>> +				void *drc_index, int *ret_code)
>> +{
>> +	struct valid_drc_index_struct *cdata = idata;
>> +
>> +	if (drc) {
>> +		if (!((drc->drc_index_start <= cdata->targ_drc_index) &&
>> +			(cdata->targ_drc_index <= drc->last_drc_index)))
>> +			return 0;
>> +	} else {
>> +		if (*((u32 *)drc_index) != cdata->targ_drc_index)
>> +			return 0;
>> +	}
>> +	(*ret_code) = 1;
>> +	return 1;
>> +}
>> +
>> +static bool valid_cpu_drc_index(struct device_node *parent, u32 drc_index)
>> +{
>> +	struct valid_drc_index_struct cdata = { drc_index };
>> +
>> +	return check_cpu_drc_index(parent, valid_drc_index_cb, &cdata);
>> +}
>> +
>>  static ssize_t dlpar_cpu_add(u32 drc_index)
>>  {
>>  	struct device_node *dn, *parent;
>> @@ -721,11 +757,43 @@ static int dlpar_cpu_remove_by_count(u32 cpus_to_remove)
>>  	return rc;
>>  }
>>
>> +struct cpus_to_add_struct {
>> +	struct device_node *parent;
>> +	u32 *cpu_drcs;
>> +	u32 cpus_to_add;
>> +	u32 cpus_found;
>> +};
>> +
>> +static int cpus_to_add_cb(struct of_drc_info *drc, void *idata,
>> +			void *drc_index, int *ret_code)
>> +{
>> +	struct cpus_to_add_struct *cdata = idata;
>> +
>> +	if (drc) {
>> +		int k;
>> +
>> +		for (k = 0; (k < drc->num_sequential_elems) &&
>> +			(cdata->cpus_found < cdata->cpus_to_add); k++) {
>> +			u32 idrc = drc->drc_index_start +
>> +				(k * drc->sequential_inc);
>> +
>> +			if (dlpar_cpu_exists(cdata->parent, idrc))
>> +				continue;
>> +			cdata->cpu_drcs[cdata->cpus_found++] = idrc;
>> +		}
>> +	} else {
>> +		if (!dlpar_cpu_exists(cdata->parent, *((u32 *)drc_index)))
>> +			cdata->cpu_drcs[cdata->cpus_found++] =
>> +				*((u32 *)drc_index);
>> +	}
>> +	return 0;
>> +}
>> +
>>  static int find_dlpar_cpus_to_add(u32 *cpu_drcs, u32 cpus_to_add)
>>  {
>>  	struct device_node *parent;
>> -	int cpus_found = 0;
>> -	int index, rc;
>> +	struct cpus_to_add_struct cdata = {
>> +		NULL, cpu_drcs, cpus_to_add, 0 };
>>
>>  	parent = of_find_node_by_path("/cpus");
>>  	if (!parent) {
>> @@ -734,28 +802,14 @@ static int find_dlpar_cpus_to_add(u32 *cpu_drcs, u32 cpus_to_add)
>>  		return -1;
>>  	}
>>
>> -	/* Search the ibm,drc-indexes array for possible CPU drcs to
>> -	 * add. Note that the format of the ibm,drc-indexes array is
>> -	 * the number of entries in the array followed by the array
>> -	 * of drc values so we start looking at index = 1.
>> +	/* Search the appropriate property for possible CPU drcs to
>> +	 * add.
>>  	 */
>> -	index = 1;
>> -	while (cpus_found < cpus_to_add) {
>> -		u32 drc;
>> -
>> -		rc = of_property_read_u32_index(parent, "ibm,drc-indexes",
>> -						index++, &drc);
>> -		if (rc)
>> -			break;
>> -
>> -		if (dlpar_cpu_exists(parent, drc))
>> -			continue;
>> -
>> -		cpu_drcs[cpus_found++] = drc;
>> -	}
>> +	cdata.parent = parent;
>> +	check_cpu_drc_index(parent, cpus_to_add_cb, &cdata);
>>
>>  	of_node_put(parent);
>> -	return cpus_found;
>> +	return cdata.cpus_found;
>>  }
>>
>>  static int dlpar_cpu_add_by_count(u32 cpus_to_add)
>> diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c
>> index 5261975..d8d7750 100644
>> --- a/arch/powerpc/platforms/pseries/pseries_energy.c
>> +++ b/arch/powerpc/platforms/pseries/pseries_energy.c
>> @@ -36,6 +36,26 @@
>>
>>  /* Helper Routines to convert between drc_index to cpu numbers */
>>
>> +struct cpu_to_drc_index_struct {
>> +	u32	thread_index;
>> +	u32	ret;
>> +};
>> +
>> +static int cpu_to_drc_index_cb(struct of_drc_info *drc, void *idata,
>> +				void *not_used, int *ret_code)
>> +{
>> +	struct cpu_to_drc_index_struct *cdata = idata;
>> +	int ret = 0;
>> +
>> +	if (cdata->thread_index < drc->last_drc_index) {
> 
> Is this correct? You're comparing a thread index to a drc index.

I believe so.  I will check when I retest this week, hopefully.

> 
>> +		cdata->ret = drc->drc_index_start +
>> +			(cdata->thread_index * drc->sequential_inc);
>> +		ret = 1;
>> +	}
>> +	(*ret_code) = ret;
>> +	return ret;
>> +}
>> +
>>  static u32 cpu_to_drc_index(int cpu)
>>  {
>>  	struct device_node *dn = NULL;
>> @@ -51,30 +71,14 @@ static u32 cpu_to_drc_index(int cpu)
>>  	thread_index = cpu_core_index_of_thread(cpu);
>>
>>  	if (firmware_has_feature(FW_FEATURE_DRC_INFO)) {
>> -		struct property *info = NULL;
>> -		struct of_drc_info drc;
>> -		int j;
>> -		u32 num_set_entries;
>> -		const __be32 *value;
>> -
>> -		info = of_find_property(dn, "ibm,drc-info", NULL);
>> -		if (info == NULL)
>> -			goto err_of_node_put;
>> +		struct cpu_to_drc_index_struct cdata = {
>> +			thread_index, 0 };
>>
>> -		value = info->value;
>> -		num_set_entries = of_read_number(value++, 1);
>> -
>> -		for (j = 0; j < num_set_entries; j++) {
>> -
>> -			of_read_drc_info_cell(&info, &value, &drc);
>> -			if (strncmp(drc.drc_type, "CPU", 3))
>> -				goto err;
>> -
>> -			if (thread_index < drc.last_drc_index)
>> -				break;
>> -		}
>> -
>> -		ret = drc.drc_index_start + (thread_index * drc.sequential_inc);
>> +		rc = drc_info_parser(dn, &cpu_to_drc_index_cb,
>> +					"CPU", &cdata);
>> +		if (rc < 0)
>> +			goto err_of_node_put;
>> +		ret = cdata.ret;
>>  	} else {
>>  		const __be32 *indexes;
>>
>> @@ -100,11 +104,33 @@ static u32 cpu_to_drc_index(int cpu)
>>  	return ret;
>>  }
>>
>> +struct drc_index_to_cpu_struct {
>> +	u32	drc_index;
>> +	u32	thread_index;
>> +	u32	cpu;
>> +};
>> +
>> +static int drc_index_to_cpu_cb(struct of_drc_info *drc,
>> +				void *idata, void *not_used, int *ret_code)
>> +{
>> +	struct drc_index_to_cpu_struct *cdata = idata;
>> +
>> +	if (cdata->drc_index > drc->last_drc_index) {
>> +		cdata->cpu += drc->num_sequential_elems;
>> +	} else {
>> +		cdata->cpu += ((cdata->drc_index - drc->drc_index_start) /
>> +				drc->sequential_inc);
>> +		cdata->thread_index = cpu_first_thread_of_core(cdata->cpu);
> 
> Should this return 1 here to avoid continuing to walk the drc_info entries?

Yes.

> 
> -Nathan

Michael

> 
>> +	}
>> +	(*ret_code) = 0;
>> +	return 0;
>> +}
>> +
>>  static int drc_index_to_cpu(u32 drc_index)
>>  {
>>  	struct device_node *dn = NULL;
>>  	const int *indexes;
>> -	int thread_index = 0, cpu = 0;
>> +	int thread_index = 0;
>>  	int rc = 1;
>>
>>  	dn = of_find_node_by_path("/cpus");
>> @@ -112,36 +138,13 @@ static int drc_index_to_cpu(u32 drc_index)
>>  		goto err;
>>
>>  	if (firmware_has_feature(FW_FEATURE_DRC_INFO)) {
>> -		struct property *info = NULL;
>> -		struct of_drc_info drc;
>> -		int j;
>> -		u32 num_set_entries;
>> -		const __be32 *value;
>> -
>> -		info = of_find_property(dn, "ibm,drc-info", NULL);
>> -		if (info == NULL)
>> -			goto err_of_node_put;
>> -
>> -		value = info->value;
>> -		num_set_entries = of_read_number(value++, 1);
>> -
>> -		for (j = 0; j < num_set_entries; j++) {
>> +		struct drc_index_to_cpu_struct cdata = {
>> +			drc_index, 0, 0 };
>>
>> -			of_read_drc_info_cell(&info, &value, &drc);
>> -			if (strncmp(drc.drc_type, "CPU", 3))
>> -				goto err;
>> +		rc = drc_info_parser(dn, &drc_index_to_cpu_cb,
>> +					"CPU", &cdata);
>> +		thread_index = cdata.thread_index;
>>
>> -			if (drc_index > drc.last_drc_index) {
>> -				cpu += drc.num_sequential_elems;
>> -				continue;
>> -			}
>> -			cpu += ((drc_index - drc.drc_index_start) /
>> -				drc.sequential_inc);
>> -
>> -			thread_index = cpu_first_thread_of_core(cpu);
>> -			rc = 0;
>> -			break;
>> -		}
>>  	} else {
>>  		unsigned long int i;
>>
> 
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:       (512) 466-0650
mwb@linux.vnet.ibm.com

^ permalink raw reply

* Re: [RFC v4 4/4] hotplug/drcinfo: Code cleanup for devices
From: Michael Bringmann @ 2018-05-23  1:29 UTC (permalink / raw)
  To: Nathan Fontenot, linuxppc-dev; +Cc: John Allen, Thomas Falcon, Tyrel Datwyler
In-Reply-To: <048197e9-f24f-b51d-0b9f-40d66b243f19@linux.vnet.ibm.com>

See below.

On 05/22/2018 04:39 PM, Nathan Fontenot wrote:
> On 05/22/2018 11:37 AM, Michael Bringmann wrote:
>> This patch extends the use of a common parse function for the
>> ibm,drc-info property that can be modified by a callback function
>> to the hotplug device processing.  Candidate code is replaced by
>> a call to the parser including a pointer to a local context-specific
>> functions, and local data.
>>
>> In addition, several more opportunities to compress and reuse
>> common code between the old and new property parsers were applied.
>>
>> Finally, a bug with the registration of slots was observed on some
>> systems, and the code was rewritten to prevent its reoccurrence.
>>
>> Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
>> Fixes: 3f38000eda48 ("powerpc/firmware: Add definitions for new drc-info firmwar
>> e feature" -- end of patch series applied to powerpc next)
>> ---
>> Changes in V4:
>>   -- Update code to account for latest kernel checkins.
>>   -- Fix bug searching for virtual device slots.
>>   -- Rebased to 4.17-rc5 kernel
>>   -- Patch cleanup
>> ---
>>  drivers/pci/hotplug/rpaphp_core.c |  181 ++++++++++++++++++++++++++-----------
>>  1 file changed, 126 insertions(+), 55 deletions(-)
>>
>> diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c
>> index 435c1a0..dc4ec68 100644
>> --- a/drivers/pci/hotplug/rpaphp_core.c
>> +++ b/drivers/pci/hotplug/rpaphp_core.c
>> @@ -222,47 +222,51 @@ static int rpaphp_check_drc_props_v1(struct device_node *dn, char *drc_name,
>>  	return -EINVAL;
>>  }
>>
>> -static int rpaphp_check_drc_props_v2(struct device_node *dn, char *drc_name,
>> -				char *drc_type, unsigned int my_index)
>> +struct check_drc_props_v2_struct {
>> +	char *drc_name;
>> +	char *drc_type;
>> +	unsigned int my_index;
>> +};
>> +
>> +static int check_drc_props_v2_cb(struct of_drc_info *drc, void *idata,
>> +				void *not_used, int *ret_code)
>>  {
>> -	struct property *info;
>> -	unsigned int entries;
>> -	struct of_drc_info drc;
>> -	const __be32 *value;
>> +	struct check_drc_props_v2_struct *cdata = idata;
>>  	char cell_drc_name[MAX_DRC_NAME_LEN];
>> -	int j, fndit;
>>
>> -	info = of_find_property(dn->parent, "ibm,drc-info", NULL);
>> -	if (info == NULL)
>> -		return -EINVAL;
>> +	(*ret_code) = -EINVAL;
>>
>> -	value = info->value;
>> -	entries = of_read_number(value++, 1);
>> -
>> -	for (j = 0; j < entries; j++) {
>> -		of_read_drc_info_cell(&info, &value, &drc);
>> -
>> -		/* Should now know end of current entry */
>> -
>> -		if (my_index > drc.last_drc_index)
>> -			continue;
>> +	if (cdata->my_index > drc->last_drc_index)
>> +		return 0;
>>
>> -		fndit = 1;
>> -		break;
>> +	/* Found drc_index.  Now match the rest. */
>> +	sprintf(cell_drc_name, "%s%d", drc->drc_name_prefix,
>> +		cdata->my_index - drc->drc_index_start +
>> +		drc->drc_name_suffix_start);
>> +
>> +	if (((cdata->drc_name == NULL) ||
>> +	     (cdata->drc_name && !strcmp(cdata->drc_name, cell_drc_name))) &&
>> +	    ((cdata->drc_type == NULL) ||
>> +	     (cdata->drc_type && !strcmp(cdata->drc_type, drc->drc_type)))) {
>> +		(*ret_code) = 0;
>> +		return 1;
>>  	}
>> -	/* Found it */
>>
>> -	if (fndit)
>> -		sprintf(cell_drc_name, "%s%d", drc.drc_name_prefix, 
>> -			my_index);
>> +	return 0;
>> +}
>>
>> -	if (((drc_name == NULL) ||
>> -	     (drc_name && !strcmp(drc_name, cell_drc_name))) &&
>> -	    ((drc_type == NULL) ||
>> -	     (drc_type && !strcmp(drc_type, drc.drc_type))))
>> -		return 0;
>> +static int rpaphp_check_drc_props_v2(struct device_node *dn, char *drc_name,
>> +				char *drc_type, unsigned int my_index)
>> +{
>> +	struct device_node *root = dn;
>> +	struct check_drc_props_v2_struct cdata = {
>> +		drc_name, drc_type, be32_to_cpu(my_index) };
>>
>> -	return -EINVAL;
>> +	if (!drc_type || (drc_type && strcmp(drc_type, "SLOT")))
>> +		root = dn->parent;
>> +
>> +	return drc_info_parser(root, check_drc_props_v2_cb,
>> +				drc_type, &cdata);
>>  }
>>
>>  int rpaphp_check_drc_props(struct device_node *dn, char *drc_name,
>> @@ -285,7 +289,6 @@ int rpaphp_check_drc_props(struct device_node *dn, char *drc_name,
>>  }
>>  EXPORT_SYMBOL_GPL(rpaphp_check_drc_props);
>>
>> -
>>  static int is_php_type(char *drc_type)
>>  {
>>  	unsigned long value;
>> @@ -345,17 +348,41 @@ static int is_php_dn(struct device_node *dn, const int **indexes,
>>   *
>>   * To remove a slot, it suffices to call rpaphp_deregister_slot().
>>   */
>> -int rpaphp_add_slot(struct device_node *dn)
>> +
>> +static int rpaphp_add_slot_common(struct device_node *dn,
>> +			u32 drc_index, char *drc_name, char *drc_type,
>> +			u32 drc_power_domain)
>>  {
>>  	struct slot *slot;
>>  	int retval = 0;
>> -	int i;
>> +
>> +	slot = alloc_slot_struct(dn, drc_index, drc_name, drc_power_domain);
>> +	if (!slot)
>> +		return -ENOMEM;
>> +
>> +	slot->type = simple_strtoul(drc_type, NULL, 10);
>> +	if (retval)
>> +		return -EINVAL;
>> +
>> +	dbg("Found drc-index:0x%x drc-name:%s drc-type:%s\n",
>> +		drc_index, drc_name, drc_type);
>> +
>> +	retval = rpaphp_enable_slot(slot);
>> +	if (!retval)
>> +		retval = rpaphp_register_slot(slot);
>> +
>> +	if (retval)
>> +		dealloc_slot_struct(slot);
>> +
>> +	return retval;
>> +}
>> +
>> +static int rpaphp_add_slot_v1(struct device_node *dn)
>> +{
>> +	int i, retval = 0;
>>  	const int *indexes, *names, *types, *power_domains;
>>  	char *name, *type;
>>
>> -	if (!dn->name || strcmp(dn->name, "pci"))
>> -		return 0;
>> -
>>  	/* If this is not a hotplug slot, return without doing anything. */
>>  	if (!is_php_dn(dn, &indexes, &names, &types, &power_domains))
>>  		return 0;
>> @@ -366,25 +393,12 @@ int rpaphp_add_slot(struct device_node *dn)
>>  	name = (char *) &names[1];
>>  	type = (char *) &types[1];
>>  	for (i = 0; i < be32_to_cpu(indexes[0]); i++) {
>> -		int index;
>> -
>> -		index = be32_to_cpu(indexes[i + 1]);
>> -		slot = alloc_slot_struct(dn, index, name,
>> -					 be32_to_cpu(power_domains[i + 1]));
>> -		if (!slot)
>> -			return -ENOMEM;
>> -
>> -		slot->type = simple_strtoul(type, NULL, 10);
>>
>> -		dbg("Found drc-index:0x%x drc-name:%s drc-type:%s\n",
>> -				index, name, type);
>> -
>> -		retval = rpaphp_enable_slot(slot);
>> +		retval = rpaphp_add_slot_common(dn,
>> +				be32_to_cpu(indexes[i + 1]), name, type,
>> +				be32_to_cpu(power_domains[i + 1]));
>>  		if (!retval)
>> -			retval = rpaphp_register_slot(slot);
>> -
>> -		if (retval)
>> -			dealloc_slot_struct(slot);
>> +			return retval;
>>
>>  		name += strlen(name) + 1;
>>  		type += strlen(type) + 1;
>> @@ -394,6 +408,63 @@ int rpaphp_add_slot(struct device_node *dn)
>>  	/* XXX FIXME: reports a failure only if last entry in loop failed */
>>  	return retval;
>>  }
>> +
>> +struct rpaphp_add_slot_v2_struct {
>> +	struct device_node *dn;
>> +};
>> +
>> +static int rpaphp_add_slot_v2_cb(struct of_drc_info *drc, void *idata,
>> +				void *not_used, int *ret_code)
>> +{
>> +	struct rpaphp_add_slot_v2_struct *cdata = idata;
>> +	u32 drc_index;
>> +	char drc_name[MAX_DRC_NAME_LEN];
>> +	int i, retval;
>> +
>> +	(*ret_code) = -EINVAL;
>> +
>> +	if (!is_php_type((char *) drc->drc_type)) {
>> +		(*ret_code) = 0;
>> +		return 1;
>> +	}
>> +
>> +	for (i = 0, drc_index = drc->drc_index_start;
>> +		i < drc->num_sequential_elems; i++, drc_index++) {
>> +
>> +		sprintf(drc_name, "%s%d", drc->drc_name_prefix,
>> +			drc_index - drc->drc_index_start +
>> +			drc->drc_name_suffix_start);
>> +
>> +		retval = rpaphp_add_slot_common(cdata->dn,
>> +				drc_index, drc_name, drc->drc_type,
>> +				drc->drc_power_domain);
>> +		if (!retval) {
>> +			(*ret_code) = retval;
>> +			return 1;
>> +		}
>> +	}
> 
> I may be reading this loop wrong, but it appears that it will add all drc indexes
> for all drc_info entries to the specified device node, minus drc-types that don't match.

It is supposed to exit the function after the first successful call
to rpaphp_add_slot_common().  That function is supposed to try to
allocate/enable a slot, and return 0 when it succeeds.

Will reconfirm this week.

> 
> -Nathan

Michael

> 
>> +
>> +	(*ret_code) = retval;
>> +	return 0;
>> +}
>> +
>> +static int rpaphp_add_slot_v2(struct device_node *dn)
>> +{
>> +	struct rpaphp_add_slot_v2_struct cdata = { dn };
>> +
>> +	return drc_info_parser(dn, rpaphp_add_slot_v2_cb, NULL, &cdata);
>> +}
>> +
>> +int rpaphp_add_slot(struct device_node *dn)
>> +{
>> +	if (!dn->name || strcmp(dn->name, "pci"))
>> +		return 0;
>> +
>> +	if (firmware_has_feature(FW_FEATURE_DRC_INFO))
>> +		return rpaphp_add_slot_v2(dn);
>> +	else
>> +		return rpaphp_add_slot_v1(dn);
>> +}
>>  EXPORT_SYMBOL_GPL(rpaphp_add_slot);
>>
>>  static void __exit cleanup_slots(void)
>>
> 
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:       (512) 466-0650
mwb@linux.vnet.ibm.com

^ permalink raw reply

* Re: [PATCH v3 5/7] KVM: PPC: reimplements LOAD_VSX/STORE_VSX instruction mmio emulation with analyse_intr() input
From: Simon Guo @ 2018-05-23  3:06 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: kvm-ppc, kvm, linuxppc-dev
In-Reply-To: <20180522094151.GA9871@fergus.ozlabs.ibm.com>

Hi Paul,
On Tue, May 22, 2018 at 07:41:51PM +1000, Paul Mackerras wrote:
> On Mon, May 21, 2018 at 01:24:24PM +0800, wei.guo.simon@gmail.com wrote:
> > From: Simon Guo <wei.guo.simon@gmail.com>
> > 
> > This patch reimplements LOAD_VSX/STORE_VSX instruction MMIO emulation with
> > analyse_intr() input. It utilizes VSX_FPCONV/VSX_SPLAT/SIGNEXT exported
> > by analyse_instr() and handle accordingly.
> > 
> > When emulating VSX store, the VSX reg will need to be flushed so that
> > the right reg val can be retrieved before writing to IO MEM.
> 
> When I tested this patch set with the MMIO emulation test program I
> have, I got a host crash on the first test that used a VSX instruction
> with a register number >= 32, that is, a VMX register.  The crash was
> that it hit the BUG() at line 1193 of arch/powerpc/kvm/powerpc.c.
> 
> The reason it hit the BUG() is that vcpu->arch.io_gpr was 0xa3.
> What's happening here is that analyse_instr gives a register numbers
> in the range 32 - 63 for VSX instructions which access VMX registers.
> When 35 is ORed with 0x80 (KVM_MMIO_REG_VSX) we get 0xa3.
> 
> The old code didn't pass the high bit of the register number to
> kvmppc_handle_vsx_load/store, but instead passed it via the
> vcpu->arch.mmio_vsx_tx_sx_enabled field.  With your patch set we still
> set and use that field, so the patch below on top of your patches is
> the quick fix.  Ideally we would get rid of that field and just use
> the high (0x20) bit of the register number instead, but that can be
> cleaned up later.
> 
> If you like, I will fold the patch below into this patch and push the
> series to my kvm-ppc-next branch.
> 
> Paul.
Sorry my test missed this kind of cases. Please go ahead to fold the patch
as you suggested.  Thanks for point it out.

If you like, I can do the clean up work. If I understand correctly,
we need to expand io_gpr to u16 from u8 so that reg number can use 
6 bits and leave room for other reg flag bits.

BR,
- Simon

> ---
> diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c
> index 0165fcd..afde788 100644
> --- a/arch/powerpc/kvm/emulate_loadstore.c
> +++ b/arch/powerpc/kvm/emulate_loadstore.c
> @@ -242,8 +242,8 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
>  			}
>  
>  			emulated = kvmppc_handle_vsx_load(run, vcpu,
> -					KVM_MMIO_REG_VSX|op.reg, io_size_each,
> -					1, op.type & SIGNEXT);
> +					KVM_MMIO_REG_VSX | (op.reg & 0x1f),
> +					io_size_each, 1, op.type & SIGNEXT);
>  			break;
>  		}
>  #endif
> @@ -363,7 +363,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
>  			}
>  
>  			emulated = kvmppc_handle_vsx_store(run, vcpu,
> -					op.reg, io_size_each, 1);
> +					op.reg & 0x1f, io_size_each, 1);
>  			break;
>  		}
>  #endif

^ permalink raw reply

* Re: [PATCH v7 2/3] powerpc/mm: Only read faulting instruction when necessary in do_page_fault()
From: Nicholas Piggin @ 2018-05-23  6:29 UTC (permalink / raw)
  To: Christophe LEROY
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linux-kernel, linuxppc-dev
In-Reply-To: <cb0eaf6d-80ab-b503-d797-cadcd60d4c17@c-s.fr>

On Tue, 22 May 2018 16:50:55 +0200
Christophe LEROY <christophe.leroy@c-s.fr> wrote:

> Le 22/05/2018 =C3=A0 16:38, Nicholas Piggin a =C3=A9crit=C2=A0:
> > On Tue, 22 May 2018 16:02:56 +0200 (CEST)
> > Christophe Leroy <christophe.leroy@c-s.fr> wrote:
> >  =20
> >> Commit a7a9dcd882a67 ("powerpc: Avoid taking a data miss on every
> >> userspace instruction miss") has shown that limiting the read of
> >> faulting instruction to likely cases improves performance.
> >>
> >> This patch goes further into this direction by limiting the read
> >> of the faulting instruction to the only cases where it is likely
> >> needed.
> >>
> >> On an MPC885, with the same benchmark app as in the commit referred
> >> above, we see a reduction of about 3900 dTLB misses (approx 3%):
> >>
> >> Before the patch:
> >>   Performance counter stats for './fault 500' (10 runs):
> >>
> >>           683033312      cpu-cycles                                   =
                 ( +-  0.03% )
> >>              134538      dTLB-load-misses                             =
                 ( +-  0.03% )
> >>               46099      iTLB-load-misses                             =
                 ( +-  0.02% )
> >>               19681      faults                                       =
                 ( +-  0.02% )
> >>
> >>         5.389747878 seconds time elapsed                              =
            ( +-  0.06% )
> >>
> >> With the patch:
> >>
> >>   Performance counter stats for './fault 500' (10 runs):
> >>
> >>           682112862      cpu-cycles                                   =
                 ( +-  0.03% )
> >>              130619      dTLB-load-misses                             =
                 ( +-  0.03% )
> >>               46073      iTLB-load-misses                             =
                 ( +-  0.05% )
> >>               19681      faults                                       =
                 ( +-  0.01% )
> >>
> >>         5.381342641 seconds time elapsed                              =
            ( +-  0.07% )
> >>
> >> The proper work of the huge stack expansion was tested with the
> >> following app:
> >>
> >> int main(int argc, char **argv)
> >> {
> >> 	char buf[1024 * 1025];
> >>
> >> 	sprintf(buf, "Hello world !\n");
> >> 	printf(buf);
> >>
> >> 	exit(0);
> >> }
> >>
> >> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> >> ---
> >>   v7: Following comment from Nicholas on v6 on possibility of the page=
 getting removed from the pagetables
> >>       between the fault and the read, I have reworked the patch in ord=
er to do the get_user() in
> >>       __do_page_fault() directly in order to reduce complexity compare=
d to version v5 =20
> >=20
> > This is looking better, thanks.
> >  =20
> >> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> >> index fcbb34431da2..dc64b8e06477 100644
> >> --- a/arch/powerpc/mm/fault.c
> >> +++ b/arch/powerpc/mm/fault.c
> >> @@ -450,9 +450,6 @@ static int __do_page_fault(struct pt_regs *regs, u=
nsigned long address,
> >>   	 * can result in fault, which will cause a deadlock when called with
> >>   	 * mmap_sem held
> >>   	 */
> >> -	if (is_write && is_user)
> >> -		get_user(inst, (unsigned int __user *)regs->nip);
> >> -
> >>   	if (is_user)
> >>   		flags |=3D FAULT_FLAG_USER;
> >>   	if (is_write)
> >> @@ -498,6 +495,26 @@ static int __do_page_fault(struct pt_regs *regs, =
unsigned long address,
> >>   	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
> >>   		return bad_area(regs, address);
> >>  =20
> >> +	if (unlikely(is_write && is_user && address + 0x100000 < vma->vm_end=
 &&
> >> +		     !inst)) {
> >> +		unsigned int __user *nip =3D (unsigned int __user *)regs->nip;
> >> +
> >> +		if (likely(access_ok(VERIFY_READ, nip, sizeof(inst)))) {
> >> +			int res;
> >> +
> >> +			pagefault_disable();
> >> +			res =3D __get_user_inatomic(inst, nip);
> >> +			pagefault_enable();
> >> +			if (unlikely(res)) {
> >> +				up_read(&mm->mmap_sem);
> >> +				res =3D __get_user(inst, nip);
> >> +				if (!res && inst)
> >> +					goto retry; =20
> >=20
> > You're handling error here but the previous code did not? =20
>=20
> The previous code did in store_updates_sp()
>=20
> When I moved get_user() out of that function in preceeding patch, I did=20
> consider that if get_user() fails, inst will remain 0, which means that=20
> store_updates_sp() will return false if ever called.

Well it handles it just by saying no the store does not update SP.
Yours now segfaults it, doesn't it?

I don't think that's a bad idea, I think it should go in a patch by
itself though. In theory we can have execute but not read, I guess
that's not really going to work here either way and I don't know if
Linux exposes it ever.

>=20
> Now, as the semaphore has been released, we really need to do something,=
=20
> because if we goto retry inconditionally, we may end up in an infinite=20
> loop, and we can't let it continue either as the semaphore is not held=20
> anymore.
>
> >  =20
> >> +				return bad_area_nosemaphore(regs, address);
> >> +			}
> >> +		}
> >> +	} =20
> >=20
> > Would it be nicer to move all this up into bad_stack_expansion().
> > It would need a way to handle the retry and insn, but I think it
> > would still look better. =20
>=20
> That's what I did in v5 indeed, but it looked too complex to me at the=20
> end. Can you have a look at it=20
> (https://patchwork.ozlabs.org/patch/799053/) and tell me if you feel it=20
> better than v7, or if you have any suggestion to improve based on v5=20
> and/or v7 ?

Yeah I'm kind of liking that direction a bit more. I took your code
and hacked on it a bit more... Completely untested but I wonder what
you think?

We can put almost all the checking logic out of the main fault
path, and the retry stuff can fit into the unlikely failure
path. Also we only get_user at the last minute.

It does use fault_in_pages_readable which in theory means you might
repeat the loop if the page gets faulted out between retry, but that
same pattern exists in places in the filesystem code. Basically it
would be edge case trashing and if it persists then the system is
already finished. So I think it's okay. Just makes the retry loop a
bit simpler.

Any thoughts?

Thanks,
Nick


diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index c01d627e687a..f0d36ec949b3 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -69,12 +69,8 @@ static inline bool notify_page_fault(struct pt_regs *reg=
s)
  * Check whether the instruction at regs->nip is a store using
  * an update addressing form which will update r1.
  */
-static bool store_updates_sp(struct pt_regs *regs)
+static bool store_updates_sp(unsigned int inst)
 {
-	unsigned int inst;
-
-	if (get_user(inst, (unsigned int __user *)regs->nip))
-		return false;
 	/* check for 1 in the rA field */
 	if (((inst >> 16) & 0x1f) !=3D 1)
 		return false;
@@ -233,10 +229,23 @@ static bool bad_kernel_fault(bool is_exec, unsigned l=
ong error_code,
 	return is_exec || (address >=3D TASK_SIZE);
 }
=20
-static bool bad_stack_expansion(struct pt_regs *regs, unsigned long addres=
s,
+static bool bad_stack_expand(struct pt_regs *regs, unsigned long address,
 				struct vm_area_struct *vma,
-				bool store_update_sp)
+				bool *retry)
 {
+	unsigned int __user *nip =3D (unsigned int __user *)regs->nip;
+	struct pt_regs *uregs =3D current->thread.regs;
+	unsigned int inst;
+	int res;
+
+	/*
+	 * We want to do this outside mmap_sem, because reading code around nip
+	 * can result in fault, which will cause a deadlock when called with
+	 * mmap_sem held
+	 */
+	if (is_write && is_user)
+		store_update_sp =3D store_updates_sp(regs);
+
 	/*
 	 * N.B. The POWER/Open ABI allows programs to access up to
 	 * 288 bytes below the stack pointer.
@@ -246,26 +255,46 @@ static bool bad_stack_expansion(struct pt_regs *regs,=
 unsigned long address,
 	 * before setting the user r1.  Thus we allow the stack to
 	 * expand to 1MB without further checks.
 	 */
-	if (address + 0x100000 < vma->vm_end) {
-		/* get user regs even if this fault is in kernel mode */
-		struct pt_regs *uregs =3D current->thread.regs;
-		if (uregs =3D=3D NULL)
-			return true;
+	if (address + 0x100000 >=3D vma->vm_end)
+		return false;
=20
-		/*
-		 * A user-mode access to an address a long way below
-		 * the stack pointer is only valid if the instruction
-		 * is one which would update the stack pointer to the
-		 * address accessed if the instruction completed,
-		 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
-		 * (or the byte, halfword, float or double forms).
-		 *
-		 * If we don't check this then any write to the area
-		 * between the last mapped region and the stack will
-		 * expand the stack rather than segfaulting.
-		 */
-		if (address + 2048 < uregs->gpr[1] && !store_update_sp)
-			return true;
+	/* get user regs even if this fault is in kernel mode */
+	if (unlikely(uregs =3D=3D NULL)) {
+		*must_retry =3D false;
+		return true;
+	}
+
+	/*
+	 * A user-mode access to an address a long way below
+	 * the stack pointer is only valid if the instruction
+	 * is one which would update the stack pointer to the
+	 * address accessed if the instruction completed,
+	 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
+	 * (or the byte, halfword, float or double forms).
+	 *
+	 * If we don't check this then any write to the area
+	 * between the last mapped region and the stack will
+	 * expand the stack rather than segfaulting.
+	 */
+	if (address + 2048 >=3D uregs->gpr[1])
+		return false;
+
+	if (unlikely(!access_ok(VERIFY_READ, nip, sizeof(inst)))) {
+		*must_retry =3D true;
+		return true;
+	}
+
+	pagefault_disable();
+	res =3D __get_user_inatomic(inst, nip);
+	pagefault_enable();
+	if (unlikely(res)) {
+		*must_retry =3D true;
+		return true;
+	}
+
+	if (!store_updates_sp(inst)) {
+		*must_retry =3D true;
+		return true;
 	}
 	return false;
 }
@@ -403,7 +432,7 @@ static int __do_page_fault(struct pt_regs *regs, unsign=
ed long address,
 	int is_user =3D user_mode(regs);
 	int is_write =3D page_fault_is_write(error_code);
 	int fault, major =3D 0;
-	bool store_update_sp =3D false;
+	bool must_retry;
=20
 	if (notify_page_fault(regs))
 		return 0;
@@ -449,14 +478,6 @@ static int __do_page_fault(struct pt_regs *regs, unsig=
ned long address,
 		return bad_key_fault_exception(regs, address,
 					       get_mm_addr_key(mm, address));
=20
-	/*
-	 * We want to do this outside mmap_sem, because reading code around nip
-	 * can result in fault, which will cause a deadlock when called with
-	 * mmap_sem held
-	 */
-	if (is_write && is_user)
-		store_update_sp =3D store_updates_sp(regs);
-
 	if (is_user)
 		flags |=3D FAULT_FLAG_USER;
 	if (is_write)
@@ -503,8 +524,17 @@ static int __do_page_fault(struct pt_regs *regs, unsig=
ned long address,
 		return bad_area(regs, address);
=20
 	/* The stack is being expanded, check if it's valid */
-	if (unlikely(bad_stack_expansion(regs, address, vma, store_update_sp)))
+	if (unlikely(bad_stack_expand(regs, address, vma, &must_retry))) {
+		if (must_retry) {
+			up_read(&mm->mmap_sem);
+			if (fault_in_pages_readable(address, sizeof(unsigned int)))
+				return bad_area_nosemaphore(regs, address);
+			goto retry;
+		}
+
 		return bad_area(regs, address);
+	}
+
=20
 	/* Try to expand it */
 	if (unlikely(expand_stack(vma, address)))

^ permalink raw reply related

* Re: [PATCH v7 2/3] powerpc/mm: Only read faulting instruction when necessary in do_page_fault()
From: Christophe LEROY @ 2018-05-23  7:00 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linux-kernel, linuxppc-dev
In-Reply-To: <20180523162901.791deea9@roar.ozlabs.ibm.com>



Le 23/05/2018 à 08:29, Nicholas Piggin a écrit :
> On Tue, 22 May 2018 16:50:55 +0200
> Christophe LEROY <christophe.leroy@c-s.fr> wrote:
> 
>> Le 22/05/2018 à 16:38, Nicholas Piggin a écrit :
>>> On Tue, 22 May 2018 16:02:56 +0200 (CEST)
>>> Christophe Leroy <christophe.leroy@c-s.fr> wrote:
>>>    
>>>> Commit a7a9dcd882a67 ("powerpc: Avoid taking a data miss on every
>>>> userspace instruction miss") has shown that limiting the read of
>>>> faulting instruction to likely cases improves performance.
>>>>
>>>> This patch goes further into this direction by limiting the read
>>>> of the faulting instruction to the only cases where it is likely
>>>> needed.
>>>>
>>>> On an MPC885, with the same benchmark app as in the commit referred
>>>> above, we see a reduction of about 3900 dTLB misses (approx 3%):
>>>>
>>>> Before the patch:
>>>>    Performance counter stats for './fault 500' (10 runs):
>>>>
>>>>            683033312      cpu-cycles                                                    ( +-  0.03% )
>>>>               134538      dTLB-load-misses                                              ( +-  0.03% )
>>>>                46099      iTLB-load-misses                                              ( +-  0.02% )
>>>>                19681      faults                                                        ( +-  0.02% )
>>>>
>>>>          5.389747878 seconds time elapsed                                          ( +-  0.06% )
>>>>
>>>> With the patch:
>>>>
>>>>    Performance counter stats for './fault 500' (10 runs):
>>>>
>>>>            682112862      cpu-cycles                                                    ( +-  0.03% )
>>>>               130619      dTLB-load-misses                                              ( +-  0.03% )
>>>>                46073      iTLB-load-misses                                              ( +-  0.05% )
>>>>                19681      faults                                                        ( +-  0.01% )
>>>>
>>>>          5.381342641 seconds time elapsed                                          ( +-  0.07% )
>>>>
>>>> The proper work of the huge stack expansion was tested with the
>>>> following app:
>>>>
>>>> int main(int argc, char **argv)
>>>> {
>>>> 	char buf[1024 * 1025];
>>>>
>>>> 	sprintf(buf, "Hello world !\n");
>>>> 	printf(buf);
>>>>
>>>> 	exit(0);
>>>> }
>>>>
>>>> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
>>>> ---
>>>>    v7: Following comment from Nicholas on v6 on possibility of the page getting removed from the pagetables
>>>>        between the fault and the read, I have reworked the patch in order to do the get_user() in
>>>>        __do_page_fault() directly in order to reduce complexity compared to version v5
>>>
>>> This is looking better, thanks.
>>>    
>>>> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
>>>> index fcbb34431da2..dc64b8e06477 100644
>>>> --- a/arch/powerpc/mm/fault.c
>>>> +++ b/arch/powerpc/mm/fault.c
>>>> @@ -450,9 +450,6 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
>>>>    	 * can result in fault, which will cause a deadlock when called with
>>>>    	 * mmap_sem held
>>>>    	 */
>>>> -	if (is_write && is_user)
>>>> -		get_user(inst, (unsigned int __user *)regs->nip);
>>>> -
>>>>    	if (is_user)
>>>>    		flags |= FAULT_FLAG_USER;
>>>>    	if (is_write)
>>>> @@ -498,6 +495,26 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
>>>>    	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
>>>>    		return bad_area(regs, address);
>>>>    
>>>> +	if (unlikely(is_write && is_user && address + 0x100000 < vma->vm_end &&
>>>> +		     !inst)) {
>>>> +		unsigned int __user *nip = (unsigned int __user *)regs->nip;
>>>> +
>>>> +		if (likely(access_ok(VERIFY_READ, nip, sizeof(inst)))) {
>>>> +			int res;
>>>> +
>>>> +			pagefault_disable();
>>>> +			res = __get_user_inatomic(inst, nip);
>>>> +			pagefault_enable();
>>>> +			if (unlikely(res)) {
>>>> +				up_read(&mm->mmap_sem);
>>>> +				res = __get_user(inst, nip);
>>>> +				if (!res && inst)
>>>> +					goto retry;
>>>
>>> You're handling error here but the previous code did not?
>>
>> The previous code did in store_updates_sp()
>>
>> When I moved get_user() out of that function in preceeding patch, I did
>> consider that if get_user() fails, inst will remain 0, which means that
>> store_updates_sp() will return false if ever called.
> 
> Well it handles it just by saying no the store does not update SP.
> Yours now segfaults it, doesn't it?

Yes it segfaults the same way as before, as it tell the expansion is bad.

> 
> I don't think that's a bad idea, I think it should go in a patch by
> itself though. In theory we can have execute but not read, I guess
> that's not really going to work here either way and I don't know if
> Linux exposes it ever.

I don't understand what you mean, that's not different from before, is it ?

> 
>>
>> Now, as the semaphore has been released, we really need to do something,
>> because if we goto retry inconditionally, we may end up in an infinite
>> loop, and we can't let it continue either as the semaphore is not held
>> anymore.
>>
>>>    
>>>> +				return bad_area_nosemaphore(regs, address);
>>>> +			}
>>>> +		}
>>>> +	}
>>>
>>> Would it be nicer to move all this up into bad_stack_expansion().
>>> It would need a way to handle the retry and insn, but I think it
>>> would still look better.
>>
>> That's what I did in v5 indeed, but it looked too complex to me at the
>> end. Can you have a look at it
>> (https://patchwork.ozlabs.org/patch/799053/) and tell me if you feel it
>> better than v7, or if you have any suggestion to improve based on v5
>> and/or v7 ?
> 
> Yeah I'm kind of liking that direction a bit more. I took your code
> and hacked on it a bit more... Completely untested but I wonder what
> you think?
> 
> We can put almost all the checking logic out of the main fault
> path, and the retry stuff can fit into the unlikely failure
> path. Also we only get_user at the last minute.
> 
> It does use fault_in_pages_readable which in theory means you might
> repeat the loop if the page gets faulted out between retry, but that
> same pattern exists in places in the filesystem code. Basically it
> would be edge case trashing and if it persists then the system is
> already finished. So I think it's okay. Just makes the retry loop a
> bit simpler.
> 
> Any thoughts?

Indeed, after writing you I looked at it once more and I think I ended 
up with something rather similar as what you are proposing here.
The complexity in v5 was because I left the get_user() in 
store_updates_sp(). By moving it up into bad_stack_expansion(), it looks 
better.
The main difference I see between your proposal and my v8 is that I do 
the up_read() in bad_stack_expansion(). Maybe that's not a good idea.

I'll release it in a few minutes, let me know what you think about it.

Thanks,
Christophe

> 
> Thanks,
> Nick
> 
> 
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index c01d627e687a..f0d36ec949b3 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -69,12 +69,8 @@ static inline bool notify_page_fault(struct pt_regs *regs)
>    * Check whether the instruction at regs->nip is a store using
>    * an update addressing form which will update r1.
>    */
> -static bool store_updates_sp(struct pt_regs *regs)
> +static bool store_updates_sp(unsigned int inst)
>   {
> -	unsigned int inst;
> -
> -	if (get_user(inst, (unsigned int __user *)regs->nip))
> -		return false;
>   	/* check for 1 in the rA field */
>   	if (((inst >> 16) & 0x1f) != 1)
>   		return false;
> @@ -233,10 +229,23 @@ static bool bad_kernel_fault(bool is_exec, unsigned long error_code,
>   	return is_exec || (address >= TASK_SIZE);
>   }
>   
> -static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
> +static bool bad_stack_expand(struct pt_regs *regs, unsigned long address,
>   				struct vm_area_struct *vma,
> -				bool store_update_sp)
> +				bool *retry)
>   {
> +	unsigned int __user *nip = (unsigned int __user *)regs->nip;
> +	struct pt_regs *uregs = current->thread.regs;
> +	unsigned int inst;
> +	int res;
> +
> +	/*
> +	 * We want to do this outside mmap_sem, because reading code around nip
> +	 * can result in fault, which will cause a deadlock when called with
> +	 * mmap_sem held
> +	 */
> +	if (is_write && is_user)
> +		store_update_sp = store_updates_sp(regs);
> +
>   	/*
>   	 * N.B. The POWER/Open ABI allows programs to access up to
>   	 * 288 bytes below the stack pointer.
> @@ -246,26 +255,46 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
>   	 * before setting the user r1.  Thus we allow the stack to
>   	 * expand to 1MB without further checks.
>   	 */
> -	if (address + 0x100000 < vma->vm_end) {
> -		/* get user regs even if this fault is in kernel mode */
> -		struct pt_regs *uregs = current->thread.regs;
> -		if (uregs == NULL)
> -			return true;
> +	if (address + 0x100000 >= vma->vm_end)
> +		return false;
>   
> -		/*
> -		 * A user-mode access to an address a long way below
> -		 * the stack pointer is only valid if the instruction
> -		 * is one which would update the stack pointer to the
> -		 * address accessed if the instruction completed,
> -		 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
> -		 * (or the byte, halfword, float or double forms).
> -		 *
> -		 * If we don't check this then any write to the area
> -		 * between the last mapped region and the stack will
> -		 * expand the stack rather than segfaulting.
> -		 */
> -		if (address + 2048 < uregs->gpr[1] && !store_update_sp)
> -			return true;
> +	/* get user regs even if this fault is in kernel mode */
> +	if (unlikely(uregs == NULL)) {
> +		*must_retry = false;
> +		return true;
> +	}
> +
> +	/*
> +	 * A user-mode access to an address a long way below
> +	 * the stack pointer is only valid if the instruction
> +	 * is one which would update the stack pointer to the
> +	 * address accessed if the instruction completed,
> +	 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
> +	 * (or the byte, halfword, float or double forms).
> +	 *
> +	 * If we don't check this then any write to the area
> +	 * between the last mapped region and the stack will
> +	 * expand the stack rather than segfaulting.
> +	 */
> +	if (address + 2048 >= uregs->gpr[1])
> +		return false;
> +
> +	if (unlikely(!access_ok(VERIFY_READ, nip, sizeof(inst)))) {
> +		*must_retry = true;
> +		return true;
> +	}
> +
> +	pagefault_disable();
> +	res = __get_user_inatomic(inst, nip);
> +	pagefault_enable();
> +	if (unlikely(res)) {
> +		*must_retry = true;
> +		return true;
> +	}
> +
> +	if (!store_updates_sp(inst)) {
> +		*must_retry = true;
> +		return true;
>   	}
>   	return false;
>   }
> @@ -403,7 +432,7 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
>   	int is_user = user_mode(regs);
>   	int is_write = page_fault_is_write(error_code);
>   	int fault, major = 0;
> -	bool store_update_sp = false;
> +	bool must_retry;
>   
>   	if (notify_page_fault(regs))
>   		return 0;
> @@ -449,14 +478,6 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
>   		return bad_key_fault_exception(regs, address,
>   					       get_mm_addr_key(mm, address));
>   
> -	/*
> -	 * We want to do this outside mmap_sem, because reading code around nip
> -	 * can result in fault, which will cause a deadlock when called with
> -	 * mmap_sem held
> -	 */
> -	if (is_write && is_user)
> -		store_update_sp = store_updates_sp(regs);
> -
>   	if (is_user)
>   		flags |= FAULT_FLAG_USER;
>   	if (is_write)
> @@ -503,8 +524,17 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
>   		return bad_area(regs, address);
>   
>   	/* The stack is being expanded, check if it's valid */
> -	if (unlikely(bad_stack_expansion(regs, address, vma, store_update_sp)))
> +	if (unlikely(bad_stack_expand(regs, address, vma, &must_retry))) {
> +		if (must_retry) {
> +			up_read(&mm->mmap_sem);
> +			if (fault_in_pages_readable(address, sizeof(unsigned int)))
> +				return bad_area_nosemaphore(regs, address);
> +			goto retry;
> +		}
> +
>   		return bad_area(regs, address);
> +	}
> +
>   
>   	/* Try to expand it */
>   	if (unlikely(expand_stack(vma, address)))
> 

^ permalink raw reply

* [PATCH v8] powerpc/mm: Only read faulting instruction when necessary in do_page_fault()
From: Christophe Leroy @ 2018-05-23  7:01 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
  Cc: linux-kernel, linuxppc-dev

Commit a7a9dcd882a67 ("powerpc: Avoid taking a data miss on every
userspace instruction miss") has shown that limiting the read of
faulting instruction to likely cases improves performance.

This patch goes further into this direction by limiting the read
of the faulting instruction to the only cases where it is likely
needed.

On an MPC885, with the same benchmark app as in the commit referred
above, we see a reduction of about 3900 dTLB misses (approx 3%):

Before the patch:
 Performance counter stats for './fault 500' (10 runs):

         683033312      cpu-cycles                                                    ( +-  0.03% )
            134538      dTLB-load-misses                                              ( +-  0.03% )
             46099      iTLB-load-misses                                              ( +-  0.02% )
             19681      faults                                                        ( +-  0.02% )

       5.389747878 seconds time elapsed                                          ( +-  0.06% )

With the patch:

 Performance counter stats for './fault 500' (10 runs):

         682112862      cpu-cycles                                                    ( +-  0.03% )
            130619      dTLB-load-misses                                              ( +-  0.03% )
             46073      iTLB-load-misses                                              ( +-  0.05% )
             19681      faults                                                        ( +-  0.01% )

       5.381342641 seconds time elapsed                                          ( +-  0.07% )

The proper work of the huge stack expansion was tested with the
following app:

int main(int argc, char **argv)
{
	char buf[1024 * 1025];

	sprintf(buf, "Hello world !\n");
	printf(buf);

	exit(0);
}

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 v8: Back to a single patch as it now makes no sense to split the first part in two. The third patch has no
     dependencies with the ones before, so it will be resend independantly. As suggested by Nicholas, the
     patch now does the get_user() stuff inside bad_stack_expansion(), that's a mid way between v5 and v7.

 v7: Following comment from Nicholas on v6 on possibility of the page getting removed from the pagetables
     between the fault and the read, I have reworked the patch in order to do the get_user() in
     __do_page_fault() directly in order to reduce complexity compared to version v5

 v6: Rebased on latest powerpc/merge branch ; Using __get_user_inatomic() instead of get_user() in order
     to move it inside the semaphored area. That removes all the complexity of the patch.

 v5: Reworked to fit after Benh do_fault improvement and rebased on top of powerpc/merge (65152902e43fef)

 v4: Rebased on top of powerpc/next (f718d426d7e42e) and doing access_ok() verification before __get_user_xxx()

 v3: Do a first try with pagefault disabled before releasing the semaphore

 v2: Changes 'if (cond1) if (cond2)' by 'if (cond1 && cond2)'

 arch/powerpc/mm/fault.c | 63 +++++++++++++++++++++++++++++++++++--------------
 1 file changed, 45 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 0c99f9b45e8f..7f9363879f4a 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -66,15 +66,11 @@ static inline bool notify_page_fault(struct pt_regs *regs)
 }
 
 /*
- * Check whether the instruction at regs->nip is a store using
+ * Check whether the instruction inst is a store using
  * an update addressing form which will update r1.
  */
-static bool store_updates_sp(struct pt_regs *regs)
+static bool store_updates_sp(unsigned int inst)
 {
-	unsigned int inst;
-
-	if (get_user(inst, (unsigned int __user *)regs->nip))
-		return false;
 	/* check for 1 in the rA field */
 	if (((inst >> 16) & 0x1f) != 1)
 		return false;
@@ -233,9 +229,10 @@ static bool bad_kernel_fault(bool is_exec, unsigned long error_code,
 	return is_exec || (address >= TASK_SIZE);
 }
 
-static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
-				struct vm_area_struct *vma,
-				bool store_update_sp)
+/* Return value is true if bad (sem. released), false if good, -1 for retry */
+static int bad_stack_expansion(struct pt_regs *regs, unsigned long address,
+				struct vm_area_struct *vma, unsigned int flags,
+				bool is_retry)
 {
 	/*
 	 * N.B. The POWER/Open ABI allows programs to access up to
@@ -247,10 +244,15 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
 	 * expand to 1MB without further checks.
 	 */
 	if (address + 0x100000 < vma->vm_end) {
+		struct mm_struct *mm = current->mm;
+		unsigned int __user *nip = (unsigned int __user *)regs->nip;
+		unsigned int inst;
 		/* get user regs even if this fault is in kernel mode */
 		struct pt_regs *uregs = current->thread.regs;
-		if (uregs == NULL)
+		if (uregs == NULL) {
+			up_read(&mm->mmap_sem);
 			return true;
+		}
 
 		/*
 		 * A user-mode access to an address a long way below
@@ -264,8 +266,30 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
 		 * between the last mapped region and the stack will
 		 * expand the stack rather than segfaulting.
 		 */
-		if (address + 2048 < uregs->gpr[1] && !store_update_sp)
-			return true;
+		if (address + 2048 >= uregs->gpr[1])
+			return false;
+		if (is_retry)
+			return false;
+
+		if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
+		    access_ok(VERIFY_READ, nip, sizeof(inst))) {
+			int res;
+
+			pagefault_disable();
+			res = __get_user_inatomic(inst, nip);
+			pagefault_enable();
+			if (res) {
+				up_read(&mm->mmap_sem);
+				res = __get_user(inst, nip);
+				if (!res && store_updates_sp(inst))
+					return -1;
+				return true;
+			}
+			if (store_updates_sp(inst))
+				return false;
+		}
+		up_read(&mm->mmap_sem);
+		return true;
 	}
 	return false;
 }
@@ -403,7 +427,8 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
 	int is_user = user_mode(regs);
 	int is_write = page_fault_is_write(error_code);
 	int fault, major = 0;
-	bool store_update_sp = false;
+	bool is_retry = false;
+	int is_bad;
 
 	if (notify_page_fault(regs))
 		return 0;
@@ -454,9 +479,6 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
 	 * can result in fault, which will cause a deadlock when called with
 	 * mmap_sem held
 	 */
-	if (is_write && is_user)
-		store_update_sp = store_updates_sp(regs);
-
 	if (is_user)
 		flags |= FAULT_FLAG_USER;
 	if (is_write)
@@ -503,8 +525,13 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
 		return bad_area(regs, address);
 
 	/* The stack is being expanded, check if it's valid */
-	if (unlikely(bad_stack_expansion(regs, address, vma, store_update_sp)))
-		return bad_area(regs, address);
+	is_bad = bad_stack_expansion(regs, address, vma, flags, is_retry);
+	if (unlikely(is_bad == -1)) {
+		is_retry = true;
+		goto retry;
+	}
+	if (unlikely(is_bad))
+		return bad_area_nosemaphore(regs, address);
 
 	/* Try to expand it */
 	if (unlikely(expand_stack(vma, address)))
-- 
2.13.3

^ permalink raw reply related

* [PATCH] powerpc/mm: Use instruction symbolic names in store_updates_sp()
From: Christophe Leroy @ 2018-05-23  7:04 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
  Cc: linux-kernel, linuxppc-dev

Use symbolic names defined in asm/ppc-opcode.h
instead of hardcoded values.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 Resending as inpependant of the do_page_fault() stuff

 arch/powerpc/include/asm/ppc-opcode.h |  1 +
 arch/powerpc/mm/fault.c               | 26 +++++++++++++-------------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index 18883b8a6dac..4436887bc415 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -162,6 +162,7 @@
 /* VMX Vector Store Instructions */
 #define OP_31_XOP_STVX          231
 
+#define OP_31   31
 #define OP_LWZ  32
 #define OP_STFS 52
 #define OP_STFSU 53
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index c01d627e687a..0c99f9b45e8f 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -80,23 +80,23 @@ static bool store_updates_sp(struct pt_regs *regs)
 		return false;
 	/* check major opcode */
 	switch (inst >> 26) {
-	case 37:	/* stwu */
-	case 39:	/* stbu */
-	case 45:	/* sthu */
-	case 53:	/* stfsu */
-	case 55:	/* stfdu */
+	case OP_STWU:
+	case OP_STBU:
+	case OP_STHU:
+	case OP_STFSU:
+	case OP_STFDU:
 		return true;
-	case 62:	/* std or stdu */
+	case OP_STD:	/* std or stdu */
 		return (inst & 3) == 1;
-	case 31:
+	case OP_31:
 		/* check minor opcode */
 		switch ((inst >> 1) & 0x3ff) {
-		case 181:	/* stdux */
-		case 183:	/* stwux */
-		case 247:	/* stbux */
-		case 439:	/* sthux */
-		case 695:	/* stfsux */
-		case 759:	/* stfdux */
+		case OP_31_XOP_STDUX:
+		case OP_31_XOP_STWUX:
+		case OP_31_XOP_STBUX:
+		case OP_31_XOP_STHUX:
+		case OP_31_XOP_STFSUX:
+		case OP_31_XOP_STFDUX:
 			return true;
 		}
 	}
-- 
2.13.3

^ permalink raw reply related

* Re: [PATCH v8] powerpc/mm: Only read faulting instruction when necessary in do_page_fault()
From: Nicholas Piggin @ 2018-05-23  7:17 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linux-kernel, linuxppc-dev
In-Reply-To: <61ef5847cedae91eb5415f40fb4276585950cb43.1527058032.git.christophe.leroy@c-s.fr>

On Wed, 23 May 2018 09:01:19 +0200 (CEST)
Christophe Leroy <christophe.leroy@c-s.fr> wrote:

> Commit a7a9dcd882a67 ("powerpc: Avoid taking a data miss on every
> userspace instruction miss") has shown that limiting the read of
> faulting instruction to likely cases improves performance.
> 
> This patch goes further into this direction by limiting the read
> of the faulting instruction to the only cases where it is likely
> needed.
> 
> On an MPC885, with the same benchmark app as in the commit referred
> above, we see a reduction of about 3900 dTLB misses (approx 3%):
> 
> Before the patch:
>  Performance counter stats for './fault 500' (10 runs):
> 
>          683033312      cpu-cycles                                                    ( +-  0.03% )
>             134538      dTLB-load-misses                                              ( +-  0.03% )
>              46099      iTLB-load-misses                                              ( +-  0.02% )
>              19681      faults                                                        ( +-  0.02% )
> 
>        5.389747878 seconds time elapsed                                          ( +-  0.06% )
> 
> With the patch:
> 
>  Performance counter stats for './fault 500' (10 runs):
> 
>          682112862      cpu-cycles                                                    ( +-  0.03% )
>             130619      dTLB-load-misses                                              ( +-  0.03% )
>              46073      iTLB-load-misses                                              ( +-  0.05% )
>              19681      faults                                                        ( +-  0.01% )
> 
>        5.381342641 seconds time elapsed                                          ( +-  0.07% )
> 
> The proper work of the huge stack expansion was tested with the
> following app:
> 
> int main(int argc, char **argv)
> {
> 	char buf[1024 * 1025];
> 
> 	sprintf(buf, "Hello world !\n");
> 	printf(buf);
> 
> 	exit(0);
> }
> 
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> ---
>  v8: Back to a single patch as it now makes no sense to split the first part in two. The third patch has no
>      dependencies with the ones before, so it will be resend independantly. As suggested by Nicholas, the
>      patch now does the get_user() stuff inside bad_stack_expansion(), that's a mid way between v5 and v7.
> 
>  v7: Following comment from Nicholas on v6 on possibility of the page getting removed from the pagetables
>      between the fault and the read, I have reworked the patch in order to do the get_user() in
>      __do_page_fault() directly in order to reduce complexity compared to version v5
> 
>  v6: Rebased on latest powerpc/merge branch ; Using __get_user_inatomic() instead of get_user() in order
>      to move it inside the semaphored area. That removes all the complexity of the patch.
> 
>  v5: Reworked to fit after Benh do_fault improvement and rebased on top of powerpc/merge (65152902e43fef)
> 
>  v4: Rebased on top of powerpc/next (f718d426d7e42e) and doing access_ok() verification before __get_user_xxx()
> 
>  v3: Do a first try with pagefault disabled before releasing the semaphore
> 
>  v2: Changes 'if (cond1) if (cond2)' by 'if (cond1 && cond2)'
> 
>  arch/powerpc/mm/fault.c | 63 +++++++++++++++++++++++++++++++++++--------------
>  1 file changed, 45 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index 0c99f9b45e8f..7f9363879f4a 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -66,15 +66,11 @@ static inline bool notify_page_fault(struct pt_regs *regs)
>  }
>  
>  /*
> - * Check whether the instruction at regs->nip is a store using
> + * Check whether the instruction inst is a store using
>   * an update addressing form which will update r1.
>   */
> -static bool store_updates_sp(struct pt_regs *regs)
> +static bool store_updates_sp(unsigned int inst)
>  {
> -	unsigned int inst;
> -
> -	if (get_user(inst, (unsigned int __user *)regs->nip))
> -		return false;
>  	/* check for 1 in the rA field */
>  	if (((inst >> 16) & 0x1f) != 1)
>  		return false;
> @@ -233,9 +229,10 @@ static bool bad_kernel_fault(bool is_exec, unsigned long error_code,
>  	return is_exec || (address >= TASK_SIZE);
>  }
>  
> -static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
> -				struct vm_area_struct *vma,
> -				bool store_update_sp)
> +/* Return value is true if bad (sem. released), false if good, -1 for retry */
> +static int bad_stack_expansion(struct pt_regs *regs, unsigned long address,
> +				struct vm_area_struct *vma, unsigned int flags,
> +				bool is_retry)
>  {
>  	/*
>  	 * N.B. The POWER/Open ABI allows programs to access up to
> @@ -247,10 +244,15 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
>  	 * expand to 1MB without further checks.
>  	 */
>  	if (address + 0x100000 < vma->vm_end) {
> +		struct mm_struct *mm = current->mm;
> +		unsigned int __user *nip = (unsigned int __user *)regs->nip;
> +		unsigned int inst;
>  		/* get user regs even if this fault is in kernel mode */
>  		struct pt_regs *uregs = current->thread.regs;
> -		if (uregs == NULL)
> +		if (uregs == NULL) {
> +			up_read(&mm->mmap_sem);
>  			return true;
> +		}
>  
>  		/*
>  		 * A user-mode access to an address a long way below
> @@ -264,8 +266,30 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
>  		 * between the last mapped region and the stack will
>  		 * expand the stack rather than segfaulting.
>  		 */
> -		if (address + 2048 < uregs->gpr[1] && !store_update_sp)
> -			return true;
> +		if (address + 2048 >= uregs->gpr[1])
> +			return false;
> +		if (is_retry)
> +			return false;
> +
> +		if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
> +		    access_ok(VERIFY_READ, nip, sizeof(inst))) {
> +			int res;
> +
> +			pagefault_disable();
> +			res = __get_user_inatomic(inst, nip);
> +			pagefault_enable();
> +			if (res) {
> +				up_read(&mm->mmap_sem);
> +				res = __get_user(inst, nip);
> +				if (!res && store_updates_sp(inst))
> +					return -1;
> +				return true;
> +			}
> +			if (store_updates_sp(inst))
> +				return false;
> +		}
> +		up_read(&mm->mmap_sem);

Starting to look pretty good... I think probably I prefer the mmap_sem
drop going into the caller so we don't don't drop in the child function.
I thought the retry logic was a little bit complex too, what do you
think of using fault_in_pages_readable and just doing a full retry to
avoid some of this complexity?

> +		return true;
>  	}
>  	return false;
>  }
> @@ -403,7 +427,8 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
>  	int is_user = user_mode(regs);
>  	int is_write = page_fault_is_write(error_code);
>  	int fault, major = 0;
> -	bool store_update_sp = false;
> +	bool is_retry = false;
> +	int is_bad;
>  
>  	if (notify_page_fault(regs))
>  		return 0;
> @@ -454,9 +479,6 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
>  	 * can result in fault, which will cause a deadlock when called with
>  	 * mmap_sem held
>  	 */
> -	if (is_write && is_user)
> -		store_update_sp = store_updates_sp(regs);
> -
>  	if (is_user)
>  		flags |= FAULT_FLAG_USER;
>  	if (is_write)
> @@ -503,8 +525,13 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
>  		return bad_area(regs, address);
>  
>  	/* The stack is being expanded, check if it's valid */
> -	if (unlikely(bad_stack_expansion(regs, address, vma, store_update_sp)))
> -		return bad_area(regs, address);
> +	is_bad = bad_stack_expansion(regs, address, vma, flags, is_retry);
> +	if (unlikely(is_bad == -1)) {
> +		is_retry = true;
> +		goto retry;
> +	}
> +	if (unlikely(is_bad))
> +		return bad_area_nosemaphore(regs, address);

Suggest making the return so that you can do a single unlikely test for
the retry or bad case, and then distinguish the retry in there. Code
generation should be better.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH v8] powerpc/mm: Only read faulting instruction when necessary in do_page_fault()
From: Christophe LEROY @ 2018-05-23  7:31 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linux-kernel, linuxppc-dev
In-Reply-To: <20180523171710.00792240@roar.ozlabs.ibm.com>



Le 23/05/2018 à 09:17, Nicholas Piggin a écrit :
> On Wed, 23 May 2018 09:01:19 +0200 (CEST)
> Christophe Leroy <christophe.leroy@c-s.fr> wrote:
> 
>> Commit a7a9dcd882a67 ("powerpc: Avoid taking a data miss on every
>> userspace instruction miss") has shown that limiting the read of
>> faulting instruction to likely cases improves performance.
>>
>> This patch goes further into this direction by limiting the read
>> of the faulting instruction to the only cases where it is likely
>> needed.
>>
>> On an MPC885, with the same benchmark app as in the commit referred
>> above, we see a reduction of about 3900 dTLB misses (approx 3%):
>>
>> Before the patch:
>>   Performance counter stats for './fault 500' (10 runs):
>>
>>           683033312      cpu-cycles                                                    ( +-  0.03% )
>>              134538      dTLB-load-misses                                              ( +-  0.03% )
>>               46099      iTLB-load-misses                                              ( +-  0.02% )
>>               19681      faults                                                        ( +-  0.02% )
>>
>>         5.389747878 seconds time elapsed                                          ( +-  0.06% )
>>
>> With the patch:
>>
>>   Performance counter stats for './fault 500' (10 runs):
>>
>>           682112862      cpu-cycles                                                    ( +-  0.03% )
>>              130619      dTLB-load-misses                                              ( +-  0.03% )
>>               46073      iTLB-load-misses                                              ( +-  0.05% )
>>               19681      faults                                                        ( +-  0.01% )
>>
>>         5.381342641 seconds time elapsed                                          ( +-  0.07% )
>>
>> The proper work of the huge stack expansion was tested with the
>> following app:
>>
>> int main(int argc, char **argv)
>> {
>> 	char buf[1024 * 1025];
>>
>> 	sprintf(buf, "Hello world !\n");
>> 	printf(buf);
>>
>> 	exit(0);
>> }
>>
>> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
>> ---
>>   v8: Back to a single patch as it now makes no sense to split the first part in two. The third patch has no
>>       dependencies with the ones before, so it will be resend independantly. As suggested by Nicholas, the
>>       patch now does the get_user() stuff inside bad_stack_expansion(), that's a mid way between v5 and v7.
>>
>>   v7: Following comment from Nicholas on v6 on possibility of the page getting removed from the pagetables
>>       between the fault and the read, I have reworked the patch in order to do the get_user() in
>>       __do_page_fault() directly in order to reduce complexity compared to version v5
>>
>>   v6: Rebased on latest powerpc/merge branch ; Using __get_user_inatomic() instead of get_user() in order
>>       to move it inside the semaphored area. That removes all the complexity of the patch.
>>
>>   v5: Reworked to fit after Benh do_fault improvement and rebased on top of powerpc/merge (65152902e43fef)
>>
>>   v4: Rebased on top of powerpc/next (f718d426d7e42e) and doing access_ok() verification before __get_user_xxx()
>>
>>   v3: Do a first try with pagefault disabled before releasing the semaphore
>>
>>   v2: Changes 'if (cond1) if (cond2)' by 'if (cond1 && cond2)'
>>
>>   arch/powerpc/mm/fault.c | 63 +++++++++++++++++++++++++++++++++++--------------
>>   1 file changed, 45 insertions(+), 18 deletions(-)
>>
>> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
>> index 0c99f9b45e8f..7f9363879f4a 100644
>> --- a/arch/powerpc/mm/fault.c
>> +++ b/arch/powerpc/mm/fault.c
>> @@ -66,15 +66,11 @@ static inline bool notify_page_fault(struct pt_regs *regs)
>>   }
>>   
>>   /*
>> - * Check whether the instruction at regs->nip is a store using
>> + * Check whether the instruction inst is a store using
>>    * an update addressing form which will update r1.
>>    */
>> -static bool store_updates_sp(struct pt_regs *regs)
>> +static bool store_updates_sp(unsigned int inst)
>>   {
>> -	unsigned int inst;
>> -
>> -	if (get_user(inst, (unsigned int __user *)regs->nip))
>> -		return false;
>>   	/* check for 1 in the rA field */
>>   	if (((inst >> 16) & 0x1f) != 1)
>>   		return false;
>> @@ -233,9 +229,10 @@ static bool bad_kernel_fault(bool is_exec, unsigned long error_code,
>>   	return is_exec || (address >= TASK_SIZE);
>>   }
>>   
>> -static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
>> -				struct vm_area_struct *vma,
>> -				bool store_update_sp)
>> +/* Return value is true if bad (sem. released), false if good, -1 for retry */
>> +static int bad_stack_expansion(struct pt_regs *regs, unsigned long address,
>> +				struct vm_area_struct *vma, unsigned int flags,
>> +				bool is_retry)
>>   {
>>   	/*
>>   	 * N.B. The POWER/Open ABI allows programs to access up to
>> @@ -247,10 +244,15 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
>>   	 * expand to 1MB without further checks.
>>   	 */
>>   	if (address + 0x100000 < vma->vm_end) {
>> +		struct mm_struct *mm = current->mm;
>> +		unsigned int __user *nip = (unsigned int __user *)regs->nip;
>> +		unsigned int inst;
>>   		/* get user regs even if this fault is in kernel mode */
>>   		struct pt_regs *uregs = current->thread.regs;
>> -		if (uregs == NULL)
>> +		if (uregs == NULL) {
>> +			up_read(&mm->mmap_sem);
>>   			return true;
>> +		}
>>   
>>   		/*
>>   		 * A user-mode access to an address a long way below
>> @@ -264,8 +266,30 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
>>   		 * between the last mapped region and the stack will
>>   		 * expand the stack rather than segfaulting.
>>   		 */
>> -		if (address + 2048 < uregs->gpr[1] && !store_update_sp)
>> -			return true;
>> +		if (address + 2048 >= uregs->gpr[1])
>> +			return false;
>> +		if (is_retry)
>> +			return false;
>> +
>> +		if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
>> +		    access_ok(VERIFY_READ, nip, sizeof(inst))) {
>> +			int res;
>> +
>> +			pagefault_disable();
>> +			res = __get_user_inatomic(inst, nip);
>> +			pagefault_enable();
>> +			if (res) {
>> +				up_read(&mm->mmap_sem);
>> +				res = __get_user(inst, nip);
>> +				if (!res && store_updates_sp(inst))
>> +					return -1;
>> +				return true;
>> +			}
>> +			if (store_updates_sp(inst))
>> +				return false;
>> +		}
>> +		up_read(&mm->mmap_sem);
> 
> Starting to look pretty good... I think probably I prefer the mmap_sem
> drop going into the caller so we don't don't drop in the child function.

Yes I can do that. I though it was ok as the drop is already done in 
children functions like bad_area(), bad_access(), ...

> I thought the retry logic was a little bit complex too, what do you
> think of using fault_in_pages_readable and just doing a full retry to
> avoid some of this complexity?

Yes lets try that way, allthough fault_in_pages_readable() is nothing 
else than a get_user().
Should we take any precaution to avoid retrying forever or is it just 
not worth it ?

> 
>> +		return true;
>>   	}
>>   	return false;
>>   }
>> @@ -403,7 +427,8 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
>>   	int is_user = user_mode(regs);
>>   	int is_write = page_fault_is_write(error_code);
>>   	int fault, major = 0;
>> -	bool store_update_sp = false;
>> +	bool is_retry = false;
>> +	int is_bad;
>>   
>>   	if (notify_page_fault(regs))
>>   		return 0;
>> @@ -454,9 +479,6 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
>>   	 * can result in fault, which will cause a deadlock when called with
>>   	 * mmap_sem held
>>   	 */
>> -	if (is_write && is_user)
>> -		store_update_sp = store_updates_sp(regs);
>> -
>>   	if (is_user)
>>   		flags |= FAULT_FLAG_USER;
>>   	if (is_write)
>> @@ -503,8 +525,13 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
>>   		return bad_area(regs, address);
>>   
>>   	/* The stack is being expanded, check if it's valid */
>> -	if (unlikely(bad_stack_expansion(regs, address, vma, store_update_sp)))
>> -		return bad_area(regs, address);
>> +	is_bad = bad_stack_expansion(regs, address, vma, flags, is_retry);
>> +	if (unlikely(is_bad == -1)) {
>> +		is_retry = true;
>> +		goto retry;
>> +	}
>> +	if (unlikely(is_bad))
>> +		return bad_area_nosemaphore(regs, address);
> 
> Suggest making the return so that you can do a single unlikely test for
> the retry or bad case, and then distinguish the retry in there. Code
> generation should be better.

Ok. I'll try and come with v9 during this morning.

Thanks,
Christophe

> 
> Thanks,
> Nick
> 

^ permalink raw reply

* [PATCH v5 0/4] powerpc/64: memcmp() optimization
From: wei.guo.simon @ 2018-05-23  6:47 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Paul Mackerras, Michael Ellerman, Naveen N.  Rao, Cyril Bur,
	Simon Guo

From: Simon Guo <wei.guo.simon@gmail.com>

There is some room to optimize memcmp() in powerpc 64 bits version for
following 2 cases:
(1) Even src/dst addresses are not aligned with 8 bytes at the beginning,
memcmp() can align them and go with .Llong comparision mode without
fallback to .Lshort comparision mode do compare buffer byte by byte.
(2) VMX instructions can be used to speed up for large size comparision,
currently the threshold is set for 4K bytes. Notes the VMX instructions
will lead to VMX regs save/load penalty. This patch set includes a
patch to add a 32 bytes pre-checking to minimize the penalty.

It did the similar with glibc commit dec4a7105e (powerpc: Improve memcmp 
performance for POWER8). Thanks Cyril Bur's information.
This patch set also updates memcmp selftest case to make it compiled and
incorporate large size comparison case.

v4 -> v5:
- Expand 32 bytes prechk to src/dst different offset case, and remove
KSM specific label/comment.

v3 -> v4:
- Add 32 bytes pre-checking before using VMX instructions.

v2 -> v3:
- add optimization for src/dst with different offset against 8 bytes
boundary.
- renamed some label names.
- reworked some comments from Cyril Bur, such as fill the pipeline, 
and use VMX when size == 4K.
- fix a bug of enter/exit_vmx_ops pairness issue. And revised test 
case to test whether enter/exit_vmx_ops are paired.

v1 -> v2:
- update 8bytes unaligned bytes comparison method.
- fix a VMX comparision bug.
- enhanced the original memcmp() selftest.
- add powerpc/64 to subject/commit message.

Simon Guo (4):
  powerpc/64: Align bytes before fall back to .Lshort in powerpc64
    memcmp()
  powerpc/64: enhance memcmp() with VMX instruction for long bytes
    comparision
  powerpc/64: add 32 bytes prechecking before using VMX optimization on
    memcmp()
  powerpc:selftest update memcmp_64 selftest for VMX implementation

 arch/powerpc/include/asm/asm-prototypes.h          |   4 +-
 arch/powerpc/lib/copypage_power7.S                 |   4 +-
 arch/powerpc/lib/memcmp_64.S                       | 408 ++++++++++++++++++++-
 arch/powerpc/lib/memcpy_power7.S                   |   6 +-
 arch/powerpc/lib/vmx-helper.c                      |   4 +-
 .../selftests/powerpc/copyloops/asm/ppc_asm.h      |   4 +-
 .../selftests/powerpc/stringloops/asm/ppc_asm.h    |  22 ++
 .../testing/selftests/powerpc/stringloops/memcmp.c |  98 +++--
 8 files changed, 510 insertions(+), 40 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* [PATCH v5 1/4] powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp()
From: wei.guo.simon @ 2018-05-23  6:48 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Paul Mackerras, Michael Ellerman, Naveen N.  Rao, Cyril Bur,
	Simon Guo
In-Reply-To: <1527058083-6998-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

Currently memcmp() 64bytes version in powerpc will fall back to .Lshort
(compare per byte mode) if either src or dst address is not 8 bytes aligned.
It can be opmitized in 2 situations:

1) if both addresses are with the same offset with 8 bytes boundary:
memcmp() can compare the unaligned bytes within 8 bytes boundary firstly
and then compare the rest 8-bytes-aligned content with .Llong mode.

2)  If src/dst addrs are not with the same offset of 8 bytes boundary:
memcmp() can align src addr with 8 bytes, increment dst addr accordingly,
 then load src with aligned mode and load dst with unaligned mode.

This patch optmizes memcmp() behavior in the above 2 situations.

Tested with both little/big endian. Performance result below is based on
little endian.

Following is the test result with src/dst having the same offset case:
(a similar result was observed when src/dst having different offset):
(1) 256 bytes
Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
- without patch
	29.773018302 seconds time elapsed                                          ( +- 0.09% )
- with patch
	16.485568173 seconds time elapsed                                          ( +-  0.02% )
		-> There is ~+80% percent improvement

(2) 32 bytes
To observe performance impact on < 32 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
-------
 #include <string.h>
 #include "utils.h"

-#define SIZE 256
+#define SIZE 32
 #define ITERATIONS 10000

 int test_memcmp(const void *s1, const void *s2, size_t n);
--------

- Without patch
	0.244746482 seconds time elapsed                                          ( +-  0.36%)
- with patch
	0.215069477 seconds time elapsed                                          ( +-  0.51%)
		-> There is ～+13% improvement

(3) 0~8 bytes
To observe <8 bytes performance impact, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
-------
 #include <string.h>
 #include "utils.h"

-#define SIZE 256
-#define ITERATIONS 10000
+#define SIZE 8
+#define ITERATIONS 1000000

 int test_memcmp(const void *s1, const void *s2, size_t n);
-------
- Without patch
       1.845642503 seconds time elapsed                                          ( +- 0.12% )
- With patch
       1.849767135 seconds time elapsed                                          ( +- 0.26% )
		-> They are nearly the same. (-0.2%)

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/lib/memcmp_64.S | 143 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 136 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b..f20e883 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -24,28 +24,41 @@
 #define rH	r31
 
 #ifdef __LITTLE_ENDIAN__
+#define LH	lhbrx
+#define LW	lwbrx
 #define LD	ldbrx
 #else
+#define LH	lhzx
+#define LW	lwzx
 #define LD	ldx
 #endif
 
+/*
+ * There are 2 categories for memcmp:
+ * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
+ * are named like .Lsameoffset_xxxx
+ * 2) src/dst has different offset to the 8 bytes boundary. The handlers
+ * are named like .Ldiffoffset_xxxx
+ */
 _GLOBAL(memcmp)
 	cmpdi	cr1,r5,0
 
-	/* Use the short loop if both strings are not 8B aligned */
-	or	r6,r3,r4
+	/* Use the short loop if the src/dst addresses are not
+	 * with the same offset of 8 bytes align boundary.
+	 */
+	xor	r6,r3,r4
 	andi.	r6,r6,7
 
-	/* Use the short loop if length is less than 32B */
-	cmpdi	cr6,r5,31
+	/* Fall back to short loop if compare at aligned addrs
+	 * with less than 8 bytes.
+	 */
+	cmpdi   cr6,r5,7
 
 	beq	cr1,.Lzero
-	bne	.Lshort
-	bgt	cr6,.Llong
+	bgt	cr6,.Lno_short
 
 .Lshort:
 	mtctr	r5
-
 1:	lbz	rA,0(r3)
 	lbz	rB,0(r4)
 	subf.	rC,rB,rA
@@ -78,11 +91,90 @@ _GLOBAL(memcmp)
 	li	r3,0
 	blr
 
+.Lno_short:
+	dcbt	0,r3
+	dcbt	0,r4
+	bne	.Ldiffoffset_8bytes_make_align_start
+
+
+.Lsameoffset_8bytes_make_align_start:
+	/* attempt to compare bytes not aligned with 8 bytes so that
+	 * rest comparison can run based on 8 bytes alignment.
+	 */
+	andi.   r6,r3,7
+
+	/* Try to compare the first double word which is not 8 bytes aligned:
+	 * load the first double word at (src & ~7UL) and shift left appropriate
+	 * bits before comparision.
+	 */
+	clrlwi  r6,r3,29
+	rlwinm  r6,r6,3,0,28
+	beq     .Lsameoffset_8bytes_aligned
+	clrrdi	r3,r3,3
+	clrrdi	r4,r4,3
+	LD	rA,0,r3
+	LD	rB,0,r4
+	sld	rA,rA,r6
+	sld	rB,rB,r6
+	cmpld	cr0,rA,rB
+	srwi	r6,r6,3
+	bne	cr0,.LcmpAB_lightweight
+	subfic  r6,r6,8
+	subfc.	r5,r6,r5
+	addi	r3,r3,8
+	addi	r4,r4,8
+	beq	.Lzero
+
+.Lsameoffset_8bytes_aligned:
+	/* now we are aligned with 8 bytes.
+	 * Use .Llong loop if left cmp bytes are equal or greater than 32B.
+	 */
+	cmpdi   cr6,r5,31
+	bgt	cr6,.Llong
+
+.Lcmp_lt32bytes:
+	/* compare 1 ~ 32 bytes, at least r3 addr is 8 bytes aligned now */
+	cmpdi   cr5,r5,7
+	srdi    r0,r5,3
+	ble	cr5,.Lcmp_rest_lt8bytes
+
+	/* handle 8 ~ 31 bytes */
+	clrldi  r5,r5,61
+	mtctr   r0
+2:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne	cr0,.LcmpAB_lightweight
+	bdnz	2b
+
+	cmpwi   r5,0
+	beq	.Lzero
+
+.Lcmp_rest_lt8bytes:
+	/* Here we have only less than 8 bytes to compare with. at least s1
+	 * Address is aligned with 8 bytes.
+	 * The next double words are load and shift right with appropriate
+	 * bits.
+	 */
+	subfic  r6,r5,8
+	rlwinm  r6,r6,3,0,28
+	LD	rA,0,r3
+	LD	rB,0,r4
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+
 .Lnon_zero:
 	mr	r3,rC
 	blr
 
 .Llong:
+	/* At least s1 addr is aligned with 8 bytes */
 	li	off8,8
 	li	off16,16
 	li	off24,24
@@ -232,4 +324,41 @@ _GLOBAL(memcmp)
 	ld	r28,-32(r1)
 	ld	r27,-40(r1)
 	blr
+
+.LcmpAB_lightweight:   /* skip NV GPRS restore */
+	li	r3,1
+	bgt	cr0,8f
+	li	r3,-1
+8:
+	blr
+
+.Ldiffoffset_8bytes_make_align_start:
+	/* now try to align s1 with 8 bytes */
+	andi.   r6,r3,0x7
+	rlwinm  r6,r6,3,0,28
+	beq     .Ldiffoffset_align_s1_8bytes
+
+	clrrdi	r3,r3,3
+	LD	rA,0,r3
+	LD	rB,0,r4  /* unaligned load */
+	sld	rA,rA,r6
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	srwi	r6,r6,3
+	bne	cr0,.LcmpAB_lightweight
+
+	subfic  r6,r6,8
+	subfc.	r5,r6,r5
+	addi	r3,r3,8
+	add	r4,r4,r6
+
+	beq	.Lzero
+
+.Ldiffoffset_align_s1_8bytes:
+	/* now s1 is aligned with 8 bytes. */
+	cmpdi   cr5,r5,31
+	ble	cr5,.Lcmp_lt32bytes
+	b	.Llong
+
 EXPORT_SYMBOL(memcmp)
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v5 2/4] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparision
From: wei.guo.simon @ 2018-05-23  6:48 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Paul Mackerras, Michael Ellerman, Naveen N.  Rao, Cyril Bur,
	Simon Guo
In-Reply-To: <1527058083-6998-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

This patch add VMX primitives to do memcmp() in case the compare size
exceeds 4K bytes. KSM feature can benefit from this.

Test result with following test program(replace the "^>" with ""):
------
># cat tools/testing/selftests/powerpc/stringloops/memcmp.c
>#include <malloc.h>
>#include <stdlib.h>
>#include <string.h>
>#include <time.h>
>#include "utils.h"
>#define SIZE (1024 * 1024 * 900)
>#define ITERATIONS 40

int test_memcmp(const void *s1, const void *s2, size_t n);

static int testcase(void)
{
        char *s1;
        char *s2;
        unsigned long i;

        s1 = memalign(128, SIZE);
        if (!s1) {
                perror("memalign");
                exit(1);
        }

        s2 = memalign(128, SIZE);
        if (!s2) {
                perror("memalign");
                exit(1);
        }

        for (i = 0; i < SIZE; i++)  {
                s1[i] = i & 0xff;
                s2[i] = i & 0xff;
        }
        for (i = 0; i < ITERATIONS; i++) {
		int ret = test_memcmp(s1, s2, SIZE);

		if (ret) {
			printf("return %d at[%ld]! should have returned zero\n", ret, i);
			abort();
		}
	}

        return 0;
}

int main(void)
{
        return test_harness(testcase, "memcmp");
}
------
Without this patch (but with the first patch "powerpc/64: Align bytes
before fall back to .Lshort in powerpc64 memcmp()." in the series):
	4.726728762 seconds time elapsed                                          ( +-  3.54%)
With VMX patch:
	4.234335473 seconds time elapsed                                          ( +-  2.63%)
		There is ~+10% improvement.

Testing with unaligned and different offset version (make s1 and s2 shift
random offset within 16 bytes) can archieve higher improvement than 10%..

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/include/asm/asm-prototypes.h |   4 +-
 arch/powerpc/lib/copypage_power7.S        |   4 +-
 arch/powerpc/lib/memcmp_64.S              | 231 ++++++++++++++++++++++++++++++
 arch/powerpc/lib/memcpy_power7.S          |   6 +-
 arch/powerpc/lib/vmx-helper.c             |   4 +-
 5 files changed, 240 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index d9713ad..31fdcee 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -49,8 +49,8 @@ void __trace_hcall_exit(long opcode, unsigned long retval,
 /* VMX copying */
 int enter_vmx_usercopy(void);
 int exit_vmx_usercopy(void);
-int enter_vmx_copy(void);
-void * exit_vmx_copy(void *dest);
+int enter_vmx_ops(void);
+void *exit_vmx_ops(void *dest);
 
 /* Traps */
 long machine_check_early(struct pt_regs *regs);
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
index 8fa73b7..e38f956 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@ _GLOBAL(copypage_power7)
 	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@ _GLOBAL(copypage_power7)
 	addi	r3,r3,128
 	bdnz	1b
 
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 
 #else
 	li	r0,(PAGE_SIZE/128)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index f20e883..6303bbf 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -27,12 +27,73 @@
 #define LH	lhbrx
 #define LW	lwbrx
 #define LD	ldbrx
+#define LVS	lvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRB,_VRA,_VRC
 #else
 #define LH	lhzx
 #define LW	lwzx
 #define LD	ldx
+#define LVS	lvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRA,_VRB,_VRC
 #endif
 
+#define VMX_OPS_THRES 4096
+#define ENTER_VMX_OPS	\
+	mflr    r0;	\
+	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std     r0,16(r1); \
+	stdu    r1,-STACKFRAMESIZE(r1); \
+	bl      enter_vmx_ops; \
+	cmpwi   cr1,r3,0; \
+	ld      r0,STACKFRAMESIZE+16(r1); \
+	ld      r3,STK_REG(R31)(r1); \
+	ld      r4,STK_REG(R30)(r1); \
+	ld      r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr    r0
+
+#define EXIT_VMX_OPS \
+	mflr    r0; \
+	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std     r0,16(r1); \
+	stdu    r1,-STACKFRAMESIZE(r1); \
+	bl      exit_vmx_ops; \
+	ld      r0,STACKFRAMESIZE+16(r1); \
+	ld      r3,STK_REG(R31)(r1); \
+	ld      r4,STK_REG(R30)(r1); \
+	ld      r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr    r0
+
+/*
+ * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
+ * 16 bytes boundary and permute the result with the 1st 16 bytes.
+
+ *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
+ *    ^                                  ^                                 ^
+ * 0xbbbb10                          0xbbbb20                          0xbbb30
+ *                                 ^
+ *                                _vaddr
+ *
+ *
+ * _vmask is the mask generated by LVS
+ * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
+ *   for example: 0xyyyyyyyyyyyyy012 for big endian
+ * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
+ *   for example: 0x3456789abcdefzzz for big endian
+ * The permute result is saved in _v_res.
+ *   for example: 0x0123456789abcdef for big endian.
+ */
+#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
+        lvx     _v2nd_qw,_vaddr,off16; \
+        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
+
 /*
  * There are 2 categories for memcmp:
  * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
@@ -174,6 +235,13 @@ _GLOBAL(memcmp)
 	blr
 
 .Llong:
+#ifdef CONFIG_ALTIVEC
+	/* Try to use vmx loop if length is larger than 4K */
+	cmpldi  cr6,r5,VMX_OPS_THRES
+	bge	cr6,.Lsameoffset_vmx_cmp
+
+.Llong_novmx_cmp:
+#endif
 	/* At least s1 addr is aligned with 8 bytes */
 	li	off8,8
 	li	off16,16
@@ -332,7 +400,94 @@ _GLOBAL(memcmp)
 8:
 	blr
 
+#ifdef CONFIG_ALTIVEC
+.Lsameoffset_vmx_cmp:
+	/* Enter with src/dst addrs has the same offset with 8 bytes
+	 * align boundary
+	 */
+	ENTER_VMX_OPS
+	beq     cr1,.Llong_novmx_cmp
+
+3:
+	/* need to check whether r4 has the same offset with r3
+	 * for 16 bytes boundary.
+	 */
+	xor	r0,r3,r4
+	andi.	r0,r0,0xf
+	bne	.Ldiffoffset_vmx_cmp_start
+
+	/* len is no less than 4KB. Need to align with 16 bytes further.
+	 */
+	andi.	rA,r3,8
+	LD	rA,0,r3
+	beq	4f
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	addi	r5,r5,-8
+
+	beq	cr0,4f
+	/* save and restore cr0 */
+	mfocrf  r5,64
+	EXIT_VMX_OPS
+	mtocrf	64,r5
+	b	.LcmpAB_lightweight
+
+4:
+	/* compare 32 bytes for each loop */
+	srdi	r0,r5,5
+	mtctr	r0
+	clrldi  r5,r5,59
+	li	off16,16
+
+.balign 16
+5:
+	lvx 	v0,0,r3
+	lvx 	v1,0,r4
+	vcmpequd. v0,v0,v1
+	bf	24,7f
+	lvx 	v0,off16,r3
+	lvx 	v1,off16,r4
+	vcmpequd. v0,v0,v1
+	bf	24,6f
+	addi	r3,r3,32
+	addi	r4,r4,32
+	bdnz	5b
+
+	EXIT_VMX_OPS
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lcmp_lt32bytes
+
+6:
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+7:
+	/* diff the last 16 bytes */
+	EXIT_VMX_OPS
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	li	off8,8
+	bne	cr0,.LcmpAB_lightweight
+
+	LD	rA,off8,r3
+	LD	rB,off8,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+#endif
+
 .Ldiffoffset_8bytes_make_align_start:
+#ifdef CONFIG_ALTIVEC
+	/* only do vmx ops when the size exceeds 4K bytes */
+	cmpdi	cr5,r5,VMX_OPS_THRES
+	bge	cr5,.Ldiffoffset_vmx_cmp
+.Ldiffoffset_novmx_cmp:
+#endif
+
 	/* now try to align s1 with 8 bytes */
 	andi.   r6,r3,0x7
 	rlwinm  r6,r6,3,0,28
@@ -359,6 +514,82 @@ _GLOBAL(memcmp)
 	/* now s1 is aligned with 8 bytes. */
 	cmpdi   cr5,r5,31
 	ble	cr5,.Lcmp_lt32bytes
+
+#ifdef CONFIG_ALTIVEC
+	b	.Llong_novmx_cmp
+#else
 	b	.Llong
+#endif
+
+#ifdef CONFIG_ALTIVEC
+.Ldiffoffset_vmx_cmp:
+	ENTER_VMX_OPS
+	beq     cr1,.Ldiffoffset_novmx_cmp
+
+.Ldiffoffset_vmx_cmp_start:
+	/* Firstly try to align r3 with 16 bytes */
+	andi.   r6,r3,0xf
+	li	off16,16
+	beq     .Ldiffoffset_vmx_s1_16bytes_align
 
+	LVS	v3,0,r3
+	LVS	v4,0,r4
+
+	lvx     v5,0,r3
+	lvx     v6,0,r4
+	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+
+	vcmpequb.  v7,v9,v10
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	subfic  r6,r6,16
+	subf    r5,r6,r5
+	add     r3,r3,r6
+	add     r4,r4,r6
+
+.Ldiffoffset_vmx_s1_16bytes_align:
+	/* now s1 is aligned with 16 bytes */
+	lvx     v6,0,r4
+	LVS	v4,0,r4
+	srdi	r6,r5,5  /* loop for 32 bytes each */
+	clrldi  r5,r5,59
+	mtctr	r6
+
+.balign	16
+.Ldiffoffset_vmx_32bytesloop:
+	/* the first qw of r4 was saved in v6 */
+	lvx	v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	vcmpequb.	v7,v9,v10
+	vor	v6,v8,v8
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	lvx	v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	vcmpequb.	v7,v9,v10
+	vor	v6,v8,v8
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	bdnz	.Ldiffoffset_vmx_32bytesloop
+
+	EXIT_VMX_OPS
+
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lcmp_lt32bytes
+
+.Ldiffoffset_vmx_diff_found:
+	EXIT_VMX_OPS
+	/* anyway, the diff will appear in next 16 bytes */
+	li	r5,16
+	b	.Lcmp_lt32bytes
+
+#endif
 EXPORT_SYMBOL(memcmp)
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
index df7de9d..070cdf6 100644
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -230,7 +230,7 @@ _GLOBAL(memcpy_power7)
 	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	cr1,r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
@@ -445,7 +445,7 @@ _GLOBAL(memcpy_power7)
 
 15:	addi	r1,r1,STACKFRAMESIZE
 	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 
 .Lvmx_unaligned_copy:
 	/* Get the destination 16B aligned */
@@ -649,5 +649,5 @@ _GLOBAL(memcpy_power7)
 
 15:	addi	r1,r1,STACKFRAMESIZE
 	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 #endif /* CONFIG_ALTIVEC */
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index bf925cd..9f34049 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -53,7 +53,7 @@ int exit_vmx_usercopy(void)
 	return 0;
 }
 
-int enter_vmx_copy(void)
+int enter_vmx_ops(void)
 {
 	if (in_interrupt())
 		return 0;
@@ -70,7 +70,7 @@ int enter_vmx_copy(void)
  * passed a pointer to the destination which we return as required by a
  * memcpy implementation.
  */
-void *exit_vmx_copy(void *dest)
+void *exit_vmx_ops(void *dest)
 {
 	disable_kernel_altivec();
 	preempt_enable();
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v5 3/4] powerpc/64: add 32 bytes prechecking before using VMX optimization on memcmp()
From: wei.guo.simon @ 2018-05-23  6:48 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Paul Mackerras, Michael Ellerman, Naveen N.  Rao, Cyril Bur,
	Simon Guo
In-Reply-To: <1527058083-6998-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

This patch is based on the previous VMX patch on memcmp().

To optimize ppc64 memcmp() with VMX instruction, we need to think about
the VMX penalty brought with: If kernel uses VMX instruction, it needs
to save/restore current thread's VMX registers. There are 32 x 128 bits
VMX registers in PPC, which means 32 x 16 = 512 bytes for load and store.

The major concern regarding the memcmp() performance in kernel is KSM,
who will use memcmp() frequently to merge identical pages. So it will
make sense to take some measures/enhancement on KSM to see whether any
improvement can be done here.  Cyril Bur indicates that the memcmp() for
KSM has a higher possibility to fail (unmatch) early in previous bytes
in following mail.
	https://patchwork.ozlabs.org/patch/817322/#1773629
And I am taking a follow-up on this with this patch.

Per some testing, it shows KSM memcmp() will fail early at previous 32
bytes.  More specifically:
    - 76% cases will fail/unmatch before 16 bytes;
    - 83% cases will fail/unmatch before 32 bytes;
    - 84% cases will fail/unmatch before 64 bytes;
So 32 bytes looks a better choice than other bytes for pre-checking.

The early failure is also true for memcmp() for non-KSM case. With a
non-typical call load, it shows ~73% cases fail before first 32 bytes.

This patch adds a 32 bytes pre-checking firstly before jumping into VMX
operations, to avoid the unnecessary VMX penalty. It is not limited to
KSM case. And the testing shows ~20% improvement on memcmp() average
execution time with this patch.

And note the 32B pre-checking is only performed when the compare size
is long enough (>=4K currently) to allow VMX operation.

The detail data and analysis is at:
https://github.com/justdoitqd/publicFiles/blob/master/memcmp/README.md

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/lib/memcmp_64.S | 50 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index 6303bbf..ee45348 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -403,8 +403,27 @@ _GLOBAL(memcmp)
 #ifdef CONFIG_ALTIVEC
 .Lsameoffset_vmx_cmp:
 	/* Enter with src/dst addrs has the same offset with 8 bytes
-	 * align boundary
+	 * align boundary.
+	 *
+	 * There is an optimization based on following fact: memcmp()
+	 * prones to fail early at the first 32 bytes.
+	 * Before applying VMX instructions which will lead to 32x128bits
+	 * VMX regs load/restore penalty, we compare the first 32 bytes
+	 * so that we can catch the ~80% fail cases.
 	 */
+
+	li	r0,4
+	mtctr	r0
+.Lsameoffset_prechk_32B_loop:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne     cr0,.LcmpAB_lightweight
+	addi	r5,r5,-8
+	bdnz	.Lsameoffset_prechk_32B_loop
+
 	ENTER_VMX_OPS
 	beq     cr1,.Llong_novmx_cmp
 
@@ -481,13 +500,6 @@ _GLOBAL(memcmp)
 #endif
 
 .Ldiffoffset_8bytes_make_align_start:
-#ifdef CONFIG_ALTIVEC
-	/* only do vmx ops when the size exceeds 4K bytes */
-	cmpdi	cr5,r5,VMX_OPS_THRES
-	bge	cr5,.Ldiffoffset_vmx_cmp
-.Ldiffoffset_novmx_cmp:
-#endif
-
 	/* now try to align s1 with 8 bytes */
 	andi.   r6,r3,0x7
 	rlwinm  r6,r6,3,0,28
@@ -512,6 +524,13 @@ _GLOBAL(memcmp)
 
 .Ldiffoffset_align_s1_8bytes:
 	/* now s1 is aligned with 8 bytes. */
+#ifdef CONFIG_ALTIVEC
+	/* only do vmx ops when the size exceeds 4K bytes */
+	cmpdi	cr5,r5,VMX_OPS_THRES
+	bge	cr5,.Ldiffoffset_vmx_cmp
+.Ldiffoffset_novmx_cmp:
+#endif
+
 	cmpdi   cr5,r5,31
 	ble	cr5,.Lcmp_lt32bytes
 
@@ -523,6 +542,21 @@ _GLOBAL(memcmp)
 
 #ifdef CONFIG_ALTIVEC
 .Ldiffoffset_vmx_cmp:
+	/* perform a 32 bytes pre-checking before
+	 * enable VMX operations.
+	 */
+	li	r0,4
+	mtctr	r0
+.Ldiffoffset_prechk_32B_loop:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne     cr0,.LcmpAB_lightweight
+	addi	r5,r5,-8
+	bdnz	.Ldiffoffset_prechk_32B_loop
+
 	ENTER_VMX_OPS
 	beq     cr1,.Ldiffoffset_novmx_cmp
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v5 4/4] powerpc:selftest update memcmp_64 selftest for VMX implementation
From: wei.guo.simon @ 2018-05-23  6:48 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Paul Mackerras, Michael Ellerman, Naveen N.  Rao, Cyril Bur,
	Simon Guo
In-Reply-To: <1527058083-6998-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

This patch reworked selftest memcmp_64 so that memcmp selftest can
cover more test cases.

It adds testcases for:
- memcmp over 4K bytes size.
- s1/s2 with different/random offset on 16 bytes boundary.
- enter/exit_vmx_ops pairness.

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 .../selftests/powerpc/copyloops/asm/ppc_asm.h      |  4 +-
 .../selftests/powerpc/stringloops/asm/ppc_asm.h    | 22 +++++
 .../testing/selftests/powerpc/stringloops/memcmp.c | 98 +++++++++++++++++-----
 3 files changed, 100 insertions(+), 24 deletions(-)

diff --git a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
index 5ffe04d..dfce161 100644
--- a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
@@ -36,11 +36,11 @@
 	li	r3,0
 	blr
 
-FUNC_START(enter_vmx_copy)
+FUNC_START(enter_vmx_ops)
 	li	r3,1
 	blr
 
-FUNC_START(exit_vmx_copy)
+FUNC_START(exit_vmx_ops)
 	blr
 
 FUNC_START(memcpy_power7)
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
index 136242e..185d257 100644
--- a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PPC_ASM_H
+#define __PPC_ASM_H
 #include <ppc-asm.h>
 
 #ifndef r1
@@ -6,3 +8,23 @@
 #endif
 
 #define _GLOBAL(A) FUNC_START(test_ ## A)
+
+#define CONFIG_ALTIVEC
+
+#define R14 r14
+#define R15 r15
+#define R16 r16
+#define R17 r17
+#define R18 r18
+#define R19 r19
+#define R20 r20
+#define R21 r21
+#define R22 r22
+#define R29 r29
+#define R30 r30
+#define R31 r31
+
+#define STACKFRAMESIZE	256
+#define STK_REG(i)	(112 + ((i)-14)*8)
+
+#endif
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp.c b/tools/testing/selftests/powerpc/stringloops/memcmp.c
index 8250db2..b5cf717 100644
--- a/tools/testing/selftests/powerpc/stringloops/memcmp.c
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp.c
@@ -2,20 +2,40 @@
 #include <malloc.h>
 #include <stdlib.h>
 #include <string.h>
+#include <time.h>
 #include "utils.h"
 
 #define SIZE 256
 #define ITERATIONS 10000
 
+#define LARGE_SIZE (5 * 1024)
+#define LARGE_ITERATIONS 1000
+#define LARGE_MAX_OFFSET 32
+#define LARGE_SIZE_START 4096
+
+#define MAX_OFFSET_DIFF_S1_S2 48
+
+int vmx_count;
+int enter_vmx_ops(void)
+{
+	vmx_count++;
+	return 1;
+}
+
+void exit_vmx_ops(void)
+{
+	vmx_count--;
+}
 int test_memcmp(const void *s1, const void *s2, size_t n);
 
 /* test all offsets and lengths */
-static void test_one(char *s1, char *s2)
+static void test_one(char *s1, char *s2, unsigned long max_offset,
+		unsigned long size_start, unsigned long max_size)
 {
 	unsigned long offset, size;
 
-	for (offset = 0; offset < SIZE; offset++) {
-		for (size = 0; size < (SIZE-offset); size++) {
+	for (offset = 0; offset < max_offset; offset++) {
+		for (size = size_start; size < (max_size - offset); size++) {
 			int x, y;
 			unsigned long i;
 
@@ -35,70 +55,104 @@ static void test_one(char *s1, char *s2)
 				printf("\n");
 				abort();
 			}
+
+			if (vmx_count != 0) {
+				printf("vmx enter/exit not paired.(offset:%ld size:%ld s1:%p s2:%p vc:%d\n",
+					offset, size, s1, s2, vmx_count);
+				printf("\n");
+				abort();
+			}
 		}
 	}
 }
 
-static int testcase(void)
+static int testcase(bool islarge)
 {
 	char *s1;
 	char *s2;
 	unsigned long i;
 
-	s1 = memalign(128, SIZE);
+	unsigned long comp_size = (islarge ? LARGE_SIZE : SIZE);
+	unsigned long alloc_size = comp_size + MAX_OFFSET_DIFF_S1_S2;
+	int iterations = islarge ? LARGE_ITERATIONS : ITERATIONS;
+
+	s1 = memalign(128, alloc_size);
 	if (!s1) {
 		perror("memalign");
 		exit(1);
 	}
 
-	s2 = memalign(128, SIZE);
+	s2 = memalign(128, alloc_size);
 	if (!s2) {
 		perror("memalign");
 		exit(1);
 	}
 
-	srandom(1);
+	srandom(time(0));
 
-	for (i = 0; i < ITERATIONS; i++) {
+	for (i = 0; i < iterations; i++) {
 		unsigned long j;
 		unsigned long change;
+		char *rand_s1 = s1;
+		char *rand_s2 = s2;
 
-		for (j = 0; j < SIZE; j++)
+		for (j = 0; j < alloc_size; j++)
 			s1[j] = random();
 
-		memcpy(s2, s1, SIZE);
+		rand_s1 += random() % MAX_OFFSET_DIFF_S1_S2;
+		rand_s2 += random() % MAX_OFFSET_DIFF_S1_S2;
+		memcpy(rand_s2, rand_s1, comp_size);
 
 		/* change one byte */
-		change = random() % SIZE;
-		s2[change] = random() & 0xff;
-
-		test_one(s1, s2);
+		change = random() % comp_size;
+		rand_s2[change] = random() & 0xff;
+
+		if (islarge)
+			test_one(rand_s1, rand_s2, LARGE_MAX_OFFSET,
+					LARGE_SIZE_START, comp_size);
+		else
+			test_one(rand_s1, rand_s2, SIZE, 0, comp_size);
 	}
 
-	srandom(1);
+	srandom(time(0));
 
-	for (i = 0; i < ITERATIONS; i++) {
+	for (i = 0; i < iterations; i++) {
 		unsigned long j;
 		unsigned long change;
+		char *rand_s1 = s1;
+		char *rand_s2 = s2;
 
-		for (j = 0; j < SIZE; j++)
+		for (j = 0; j < alloc_size; j++)
 			s1[j] = random();
 
-		memcpy(s2, s1, SIZE);
+		rand_s1 += random() % MAX_OFFSET_DIFF_S1_S2;
+		rand_s2 += random() % MAX_OFFSET_DIFF_S1_S2;
+		memcpy(rand_s2, rand_s1, comp_size);
 
 		/* change multiple bytes, 1/8 of total */
-		for (j = 0; j < SIZE / 8; j++) {
-			change = random() % SIZE;
+		for (j = 0; j < comp_size / 8; j++) {
+			change = random() % comp_size;
 			s2[change] = random() & 0xff;
 		}
 
-		test_one(s1, s2);
+		if (islarge)
+			test_one(rand_s1, rand_s2, LARGE_MAX_OFFSET,
+					LARGE_SIZE_START, comp_size);
+		else
+			test_one(rand_s1, rand_s2, SIZE, 0, comp_size);
 	}
 
 	return 0;
 }
 
+static int testcases(void)
+{
+	testcase(0);
+	testcase(1);
+	return 0;
+}
+
 int main(void)
 {
-	return test_harness(testcase, "memcmp");
+	return test_harness(testcases, "memcmp");
 }
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH v3 24/29] KVM: PPC: Book3S PR: Support TAR handling for PR KVM HTM.
From: Simon Guo @ 2018-05-23  7:01 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, kvm, kvm-ppc
In-Reply-To: <20180522114447.GB9871@fergus.ozlabs.ibm.com>

Hi Paul,
On Tue, May 22, 2018 at 09:44:47PM +1000, Paul Mackerras wrote:
> On Mon, May 21, 2018 at 12:09:41PM +0800, wei.guo.simon@gmail.com wrote:
> > From: Simon Guo <wei.guo.simon@gmail.com>
> > 
> > Currently guest kernel doesn't handle TAR fac unavailable and it always
> > runs with TAR bit on. PR KVM will lazily enable TAR. TAR is not a
> > frequent-use reg and it is not included in SVCPU struct.
> > 
> > Due to the above, the checkpointed TAR val might be a bogus TAR val.
> > To solve this issue, we will make vcpu->arch.fscr tar bit consistent
> > with shadow_fscr when TM enabled.
> > 
> > At the end of emulating treclaim., the correct TAR val need to be loaded
> > into reg if FSCR_TAR bit is on.
> > At the beginning of emulating trechkpt., TAR needs to be flushed so that
> > the right tar val can be copy into tar_tm.
> > 
> > Tested with:
> > tools/testing/selftests/powerpc/tm/tm-tar
> > tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar (remove DSCR/PPR
> > related testing).
> 
> With this patch, a 32-bit powermac compile with PR KVM enabled gives
> this error:
> 
> arch/powerpc/kvm/book3s_pr.c:58:12: error: ‘kvmppc_handle_fac’ declared ‘static’ but never defined [-Werror=unused-function]
>  static int kvmppc_handle_fac(struct kvm_vcpu *vcpu, ulong fac);
>             ^
> cc1: all warnings being treated as errors
> scripts/Makefile.build:312: recipe for target 'arch/powerpc/kvm/book3s_pr.o' failed
> 
> The easy fix is a #ifdef CONFIG_PPC_BOOK3S_64 around the forward
> static definition.
Thx for find that. I will fix that and send V4 includes that simple fix.

BR,
- Simon

^ permalink raw reply

* [PATCH v4 00/29] KVM: PPC: Book3S PR: Transaction memory support on PR KVM
From: wei.guo.simon @ 2018-05-23  7:01 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Paul Mackerras, kvm, kvm-ppc, Simon Guo

From: Simon Guo <wei.guo.simon@gmail.com>

In current days, many OS distributions have utilized transaction
memory functionality. In PowerPC, HV KVM supports TM. But PR KVM
does not.

The drive for the transaction memory support of PR KVM is the
openstack Continuous Integration testing - They runs a HV(hypervisor)
KVM(as level 1) and then run PR KVM(as level 2) on top of that.

This patch set add transaction memory support on PR KVM.

v3 -> v4 changes:
- fix a powermac 32-bit compile failure.

v2 -> v3 changes:
1) rebase onto Paul's kvm-ppc-next branch, which includes rework 
KVM_CHECK_EXTENSION ioctl (patch #25) a little bit. 
2) allow mtspr TFHAR in TM suspend state
3) remove patch: 
  "KVM: PPC: add KVM_SET_ONE_REG/KVM_GET_ONE_REG to async ioctl"
4) some minor rework per comments

v1 -> v2 changes:
1. Correct a bug in trechkpt emulation: the tm sprs need to be 
flushed to vcpu before trechkpt.
2. add PR kvm ioctl functionalities for TM.
3. removed save_msr_tm and use kvmppc_get_msr() to determine 
whether a transaction state need to be restored.
4. Remove "KVM: PPC: Book3S PR: set MSR HV bit accordingly 
for PPC970 and others." patch.
It will prevent PR KVM to start as L1 hypervisor. Since if 
we set HV bit to 0 when rfid to guest (who is supposed to 
run at HV=1 && PR=1), the guest will not be able to access 
its original memory.
The original code always set HV bits for shadow_msr, it is 
benign since:
HV bits can only be altered by sc instruction; it can only 
be set to 0 by rfid/hrfid instruction.  
We return to guest with rfid. So:
* if KVM are running as L1 hypervisor, guest physical MSR 
expects HV=1.
* if KVM are running as L2 hypervisor, rfid cannot update 
HV =1 so the HV is still 0.
5. add XER register implementation to 
kvmppc_copyto_vcpu_tm/kvmppc_copyfrom_vcpu_tm()
6. remove unnecessary stack frame in _kvmppc_save/restore_tm().
7. move MSR bits sync into kvmppc_copy_from_svcpu() so that 
we always see inconsistent shadow_msr/kvmppc_get_msr() 
even when preemption.
8. doing failure recording in treclaim emulation when TEXASR_FS
is 0.

Simon Guo (29):
  powerpc: export symbol msr_check_and_set().
  powerpc: add TEXASR related macros
  powerpc: export tm_enable()/tm_disable/tm_abort() APIs
  KVM: PPC: Book3S PR: Move kvmppc_save_tm/kvmppc_restore_tm to separate
    file
  KVM: PPC: Book3S PR: add new parameter (guest MSR) for
    kvmppc_save_tm()/kvmppc_restore_tm()
  KVM: PPC: Book3S PR: turn on FP/VSX/VMX MSR bits in kvmppc_save_tm()
  KVM: PPC: Book3S PR: add C function wrapper for
    _kvmppc_save/restore_tm()
  KVM: PPC: Book3S PR: In PR KVM suspends Transactional state when
    inject an interrupt.
  KVM: PPC: Book3S PR: PR KVM pass through MSR TM/TS bits to shadow_msr.
  KVM: PPC: Book3S PR: Sync TM bits to shadow msr for problem state
    guest
  KVM: PPC: Book3S PR: implement RFID TM behavior to suppress change
    from S0 to N0
  KVM: PPC: Book3S PR: prevent TS bits change in kvmppc_interrupt_pr()
  KVM: PPC: Book3S PR: adds new
    kvmppc_copyto_vcpu_tm/kvmppc_copyfrom_vcpu_tm API for PR KVM.
  KVM: PPC: Book3S PR: add kvmppc_save/restore_tm_sprs() APIs
  KVM: PPC: Book3S PR: add transaction memory save/restore skeleton for
    PR KVM
  KVM: PPC: Book3S PR: add math support for PR KVM HTM
  KVM: PPC: Book3S PR: make mtspr/mfspr emulation behavior based on
    active TM SPRs
  KVM: PPC: Book3S PR: always fail transaction in guest privilege state
  KVM: PPC: Book3S PR: enable NV reg restore for reading TM SPR at guest
    privilege state
  KVM: PPC: Book3S PR: adds emulation for treclaim.
  KVM: PPC: Book3S PR: add emulation for trechkpt in PR KVM.
  KVM: PPC: Book3S PR: add emulation for tabort. for privilege guest
  KVM: PPC: Book3S PR: add guard code to prevent returning to guest with
    PR=0 and Transactional state
  KVM: PPC: Book3S PR: Support TAR handling for PR KVM HTM.
  KVM: PPC: Book3S PR: enable HTM for PR KVM for KVM_CHECK_EXTENSION
    ioctl
  KVM: PPC: move vcpu_load/vcpu_put down to each ioctl case in
    kvm_arch_vcpu_ioctl
  KVM: PPC: remove load/put vcpu for KVM_GET/SET_ONE_REG ioctl
  KVM: PPC: remove load/put vcpu for KVM_GET_REGS/KVM_SET_REGS
  KVM: PPC: Book3S PR: enable kvmppc_get/set_one_reg_pr() for HTM
    registers

 arch/powerpc/include/asm/asm-prototypes.h   |   9 +
 arch/powerpc/include/asm/kvm_book3s.h       |  16 +
 arch/powerpc/include/asm/kvm_host.h         |   1 -
 arch/powerpc/include/asm/reg.h              |  32 +-
 arch/powerpc/include/asm/tm.h               |   2 -
 arch/powerpc/include/uapi/asm/tm.h          |   2 +-
 arch/powerpc/kernel/process.c               |   1 +
 arch/powerpc/kernel/tm.S                    |  12 +
 arch/powerpc/kvm/Makefile                   |   3 +
 arch/powerpc/kvm/book3s.c                   |   6 -
 arch/powerpc/kvm/book3s.h                   |   6 +
 arch/powerpc/kvm/book3s_64_mmu.c            |  11 +-
 arch/powerpc/kvm/book3s_emulate.c           | 369 +++++++++++++++++++++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S     | 343 +-------------------
 arch/powerpc/kvm/book3s_pr.c                | 341 ++++++++++++++++++--
 arch/powerpc/kvm/book3s_segment.S           |  13 +
 arch/powerpc/kvm/powerpc.c                  |  12 +-
 arch/powerpc/kvm/tm.S                       | 467 ++++++++++++++++++++++++++++
 arch/powerpc/mm/hash_utils_64.c             |   1 +
 arch/powerpc/platforms/powernv/copy-paste.h |   3 +-
 20 files changed, 1262 insertions(+), 388 deletions(-)
 create mode 100644 arch/powerpc/kvm/tm.S

-- 
1.8.3.1

^ permalink raw reply

* [PATCH v4 01/29] powerpc: export symbol msr_check_and_set().
From: wei.guo.simon @ 2018-05-23  7:01 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Paul Mackerras, kvm, kvm-ppc, Simon Guo
In-Reply-To: <1527058932-7434-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

PR KVM will need to reuse msr_check_and_set().
This patch exports this API for reuse.

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
Reviewed-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kernel/process.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 1237f13..25db000 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -154,6 +154,7 @@ unsigned long msr_check_and_set(unsigned long bits)

 	return newmsr;
 }
+EXPORT_SYMBOL_GPL(msr_check_and_set);

 void __msr_check_and_clear(unsigned long bits)
 {
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 02/29] powerpc: add TEXASR related macros
From: wei.guo.simon @ 2018-05-23  7:01 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Paul Mackerras, kvm, kvm-ppc, Simon Guo
In-Reply-To: <1527058932-7434-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

This patches add some macros for CR0/TEXASR bits so that PR KVM TM
logic(tbegin./treclaim./tabort.) can make use of them later.

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
Reviewed-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/reg.h              | 32 +++++++++++++++++++++++------
 arch/powerpc/platforms/powernv/copy-paste.h |  3 +--
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 44b2be4..5625684 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -146,6 +146,12 @@
 #define MSR_64BIT	0
 #endif
 
+/* Condition Register related */
+#define CR0_SHIFT	28
+#define CR0_MASK	0xF
+#define CR0_TBEGIN_FAILURE	(0x2 << 28) /* 0b0010 */
+
+
 /* Power Management - Processor Stop Status and Control Register Fields */
 #define PSSCR_RL_MASK		0x0000000F /* Requested Level */
 #define PSSCR_MTL_MASK		0x000000F0 /* Maximum Transition Level */
@@ -239,13 +245,27 @@
 #define SPRN_TFIAR	0x81	/* Transaction Failure Inst Addr   */
 #define SPRN_TEXASR	0x82	/* Transaction EXception & Summary */
 #define SPRN_TEXASRU	0x83	/* ''	   ''	   ''	 Upper 32  */
-#define   TEXASR_ABORT	__MASK(63-31) /* terminated by tabort or treclaim */
-#define   TEXASR_SUSP	__MASK(63-32) /* tx failed in suspended state */
-#define   TEXASR_HV	__MASK(63-34) /* MSR[HV] when failure occurred */
-#define   TEXASR_PR	__MASK(63-35) /* MSR[PR] when failure occurred */
-#define   TEXASR_FS	__MASK(63-36) /* TEXASR Failure Summary */
-#define   TEXASR_EXACT	__MASK(63-37) /* TFIAR value is exact */
+
+#define TEXASR_FC_LG	(63 - 7)	/* Failure Code */
+#define TEXASR_AB_LG	(63 - 31)	/* Abort */
+#define TEXASR_SU_LG	(63 - 32)	/* Suspend */
+#define TEXASR_HV_LG	(63 - 34)	/* Hypervisor state*/
+#define TEXASR_PR_LG	(63 - 35)	/* Privilege level */
+#define TEXASR_FS_LG	(63 - 36)	/* failure summary */
+#define TEXASR_EX_LG	(63 - 37)	/* TFIAR exact bit */
+#define TEXASR_ROT_LG	(63 - 38)	/* ROT bit */
+
+#define   TEXASR_ABORT	__MASK(TEXASR_AB_LG) /* terminated by tabort or treclaim */
+#define   TEXASR_SUSP	__MASK(TEXASR_SU_LG) /* tx failed in suspended state */
+#define   TEXASR_HV	__MASK(TEXASR_HV_LG) /* MSR[HV] when failure occurred */
+#define   TEXASR_PR	__MASK(TEXASR_PR_LG) /* MSR[PR] when failure occurred */
+#define   TEXASR_FS	__MASK(TEXASR_FS_LG) /* TEXASR Failure Summary */
+#define   TEXASR_EXACT	__MASK(TEXASR_EX_LG) /* TFIAR value is exact */
+#define   TEXASR_ROT	__MASK(TEXASR_ROT_LG)
+#define   TEXASR_FC	(ASM_CONST(0xFF) << TEXASR_FC_LG)
+
 #define SPRN_TFHAR	0x80	/* Transaction Failure Handler Addr */
+
 #define SPRN_TIDR	144	/* Thread ID register */
 #define SPRN_CTRLF	0x088
 #define SPRN_CTRLT	0x098
diff --git a/arch/powerpc/platforms/powernv/copy-paste.h b/arch/powerpc/platforms/powernv/copy-paste.h
index c9a5036..3fa62de 100644
--- a/arch/powerpc/platforms/powernv/copy-paste.h
+++ b/arch/powerpc/platforms/powernv/copy-paste.h
@@ -7,9 +7,8 @@
  * 2 of the License, or (at your option) any later version.
  */
 #include <asm/ppc-opcode.h>
+#include <asm/reg.h>
 
-#define CR0_SHIFT	28
-#define CR0_MASK	0xF
 /*
  * Copy/paste instructions:
  *
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 03/29] powerpc: export tm_enable()/tm_disable/tm_abort() APIs
From: wei.guo.simon @ 2018-05-23  7:01 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Paul Mackerras, kvm, kvm-ppc, Simon Guo
In-Reply-To: <1527058932-7434-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

This patch exports tm_enable()/tm_disable/tm_abort() APIs, which
will be used for PR KVM transaction memory logic.

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
Reviewed-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/asm-prototypes.h |  3 +++
 arch/powerpc/include/asm/tm.h             |  2 --
 arch/powerpc/kernel/tm.S                  | 12 ++++++++++++
 arch/powerpc/mm/hash_utils_64.c           |  1 +
 4 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index d9713ad..dfdcb23 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -141,4 +141,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4,
 void pnv_power9_force_smt4_catch(void);
 void pnv_power9_force_smt4_release(void);
 
+void tm_enable(void);
+void tm_disable(void);
+void tm_abort(uint8_t cause);
 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h
index b1658c9..e94f6db 100644
--- a/arch/powerpc/include/asm/tm.h
+++ b/arch/powerpc/include/asm/tm.h
@@ -10,12 +10,10 @@
 
 #ifndef __ASSEMBLY__
 
-extern void tm_enable(void);
 extern void tm_reclaim(struct thread_struct *thread,
 		       uint8_t cause);
 extern void tm_reclaim_current(uint8_t cause);
 extern void tm_recheckpoint(struct thread_struct *thread);
-extern void tm_abort(uint8_t cause);
 extern void tm_save_sprs(struct thread_struct *thread);
 extern void tm_restore_sprs(struct thread_struct *thread);
 
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index b92ac8e..ff12f47 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -12,6 +12,7 @@
 #include <asm/ptrace.h>
 #include <asm/reg.h>
 #include <asm/bug.h>
+#include <asm/export.h>
 
 #ifdef CONFIG_VSX
 /* See fpu.S, this is borrowed from there */
@@ -55,6 +56,16 @@ _GLOBAL(tm_enable)
 	or	r4, r4, r3
 	mtmsrd	r4
 1:	blr
+EXPORT_SYMBOL_GPL(tm_enable);
+
+_GLOBAL(tm_disable)
+	mfmsr	r4
+	li	r3, MSR_TM >> 32
+	sldi	r3, r3, 32
+	andc	r4, r4, r3
+	mtmsrd	r4
+	blr
+EXPORT_SYMBOL_GPL(tm_disable);
 
 _GLOBAL(tm_save_sprs)
 	mfspr	r0, SPRN_TFHAR
@@ -78,6 +89,7 @@ _GLOBAL(tm_restore_sprs)
 _GLOBAL(tm_abort)
 	TABORT(R3)
 	blr
+EXPORT_SYMBOL_GPL(tm_abort);
 
 /* void tm_reclaim(struct thread_struct *thread,
  *		   uint8_t cause)
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 0bd3790..1bd8b4c1 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -64,6 +64,7 @@
 #include <asm/trace.h>
 #include <asm/ps3.h>
 #include <asm/pte-walk.h>
+#include <asm/asm-prototypes.h>
 
 #ifdef DEBUG
 #define DBG(fmt...) udbg_printf(fmt)
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 04/29] KVM: PPC: Book3S PR: Move kvmppc_save_tm/kvmppc_restore_tm to separate file
From: wei.guo.simon @ 2018-05-23  7:01 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Paul Mackerras, kvm, kvm-ppc, Simon Guo
In-Reply-To: <1527058932-7434-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

It is a simple patch just for moving kvmppc_save_tm/kvmppc_restore_tm()
functionalities to tm.S. There is no logic change. The reconstruct of
those APIs will be done in later patches to improve readability.

It is for preparation of reusing those APIs on both HV/PR PPC KVM.

Some slight change during move the functions includes:
- surrounds some HV KVM specific code with CONFIG_KVM_BOOK3S_HV_POSSIBLE
for compilation.
- use _GLOBAL() to define kvmppc_save_tm/kvmppc_restore_tm()

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/kvm/Makefile               |   3 +
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 322 ----------------------------
 arch/powerpc/kvm/tm.S                   | 363 ++++++++++++++++++++++++++++++++
 3 files changed, 366 insertions(+), 322 deletions(-)
 create mode 100644 arch/powerpc/kvm/tm.S

diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 4b19da8..f872c04 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -63,6 +63,9 @@ kvm-pr-y := \
 	book3s_64_mmu.o \
 	book3s_32_mmu.o
 
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
+	tm.o
+
 ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
 	book3s_rmhandlers.o
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 5e6e493..4db2b10 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -39,8 +39,6 @@ BEGIN_FTR_SECTION;				\
 	extsw	reg, reg;			\
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 
-#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
-
 /* Values in HSTATE_NAPPING(r13) */
 #define NAPPING_CEDE	1
 #define NAPPING_NOVCPU	2
@@ -3119,326 +3117,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	mr	r4,r31
 	blr
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-/*
- * Save transactional state and TM-related registers.
- * Called with r9 pointing to the vcpu struct.
- * This can modify all checkpointed registers, but
- * restores r1, r2 and r9 (vcpu pointer) before exit.
- */
-kvmppc_save_tm:
-	mflr	r0
-	std	r0, PPC_LR_STKOFF(r1)
-	stdu	r1, -PPC_MIN_STKFRM(r1)
-
-	/* Turn on TM. */
-	mfmsr	r8
-	li	r0, 1
-	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG
-	mtmsrd	r8
-
-	ld	r5, VCPU_MSR(r9)
-	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
-	beq	1f	/* TM not active in guest. */
-
-	std	r1, HSTATE_HOST_R1(r13)
-	li	r3, TM_CAUSE_KVM_RESCHED
-
-BEGIN_FTR_SECTION
-	lbz	r0, HSTATE_FAKE_SUSPEND(r13) /* Were we fake suspended? */
-	cmpwi	r0, 0
-	beq	3f
-	rldicl. r8, r8, 64 - MSR_TS_S_LG, 62 /* Did we actually hrfid? */
-	beq	4f
-BEGIN_FTR_SECTION_NESTED(96)
-	bl	pnv_power9_force_smt4_catch
-END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
-	nop
-	b	6f
-3:
-	/* Emulation of the treclaim instruction needs TEXASR before treclaim */
-	mfspr	r6, SPRN_TEXASR
-	std	r6, VCPU_ORIG_TEXASR(r9)
-6:
-END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
-
-	/* Clear the MSR RI since r1, r13 are all going to be foobar. */
-	li	r5, 0
-	mtmsrd	r5, 1
-
-	/* All GPRs are volatile at this point. */
-	TRECLAIM(R3)
-
-	/* Temporarily store r13 and r9 so we have some regs to play with */
-	SET_SCRATCH0(r13)
-	GET_PACA(r13)
-	std	r9, PACATMSCRATCH(r13)
-
-	/* If doing TM emulation on POWER9 DD2.2, check for fake suspend mode */
-BEGIN_FTR_SECTION
-	lbz	r9, HSTATE_FAKE_SUSPEND(r13)
-	cmpwi	r9, 0
-	beq	2f
-	/*
-	 * We were in fake suspend, so we are not going to save the
-	 * register state as the guest checkpointed state (since
-	 * we already have it), therefore we can now use any volatile GPR.
-	 */
-	/* Reload stack pointer and TOC. */
-	ld	r1, HSTATE_HOST_R1(r13)
-	ld	r2, PACATOC(r13)
-	/* Set MSR RI now we have r1 and r13 back. */
-	li	r5, MSR_RI
-	mtmsrd	r5, 1
-	HMT_MEDIUM
-	ld	r6, HSTATE_DSCR(r13)
-	mtspr	SPRN_DSCR, r6
-BEGIN_FTR_SECTION_NESTED(96)
-	bl	pnv_power9_force_smt4_release
-END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
-	nop
-
-4:
-	mfspr	r3, SPRN_PSSCR
-	/* PSSCR_FAKE_SUSPEND is a write-only bit, but clear it anyway */
-	li	r0, PSSCR_FAKE_SUSPEND
-	andc	r3, r3, r0
-	mtspr	SPRN_PSSCR, r3
-	ld	r9, HSTATE_KVM_VCPU(r13)
-	/* Don't save TEXASR, use value from last exit in real suspend state */
-	b	11f
-2:
-END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
-
-	ld	r9, HSTATE_KVM_VCPU(r13)
-
-	/* Get a few more GPRs free. */
-	std	r29, VCPU_GPRS_TM(29)(r9)
-	std	r30, VCPU_GPRS_TM(30)(r9)
-	std	r31, VCPU_GPRS_TM(31)(r9)
-
-	/* Save away PPR and DSCR soon so don't run with user values. */
-	mfspr	r31, SPRN_PPR
-	HMT_MEDIUM
-	mfspr	r30, SPRN_DSCR
-	ld	r29, HSTATE_DSCR(r13)
-	mtspr	SPRN_DSCR, r29
-
-	/* Save all but r9, r13 & r29-r31 */
-	reg = 0
-	.rept	29
-	.if (reg != 9) && (reg != 13)
-	std	reg, VCPU_GPRS_TM(reg)(r9)
-	.endif
-	reg = reg + 1
-	.endr
-	/* ... now save r13 */
-	GET_SCRATCH0(r4)
-	std	r4, VCPU_GPRS_TM(13)(r9)
-	/* ... and save r9 */
-	ld	r4, PACATMSCRATCH(r13)
-	std	r4, VCPU_GPRS_TM(9)(r9)
-
-	/* Reload stack pointer and TOC. */
-	ld	r1, HSTATE_HOST_R1(r13)
-	ld	r2, PACATOC(r13)
-
-	/* Set MSR RI now we have r1 and r13 back. */
-	li	r5, MSR_RI
-	mtmsrd	r5, 1
-
-	/* Save away checkpinted SPRs. */
-	std	r31, VCPU_PPR_TM(r9)
-	std	r30, VCPU_DSCR_TM(r9)
-	mflr	r5
-	mfcr	r6
-	mfctr	r7
-	mfspr	r8, SPRN_AMR
-	mfspr	r10, SPRN_TAR
-	mfxer	r11
-	std	r5, VCPU_LR_TM(r9)
-	stw	r6, VCPU_CR_TM(r9)
-	std	r7, VCPU_CTR_TM(r9)
-	std	r8, VCPU_AMR_TM(r9)
-	std	r10, VCPU_TAR_TM(r9)
-	std	r11, VCPU_XER_TM(r9)
-
-	/* Restore r12 as trap number. */
-	lwz	r12, VCPU_TRAP(r9)
-
-	/* Save FP/VSX. */
-	addi	r3, r9, VCPU_FPRS_TM
-	bl	store_fp_state
-	addi	r3, r9, VCPU_VRS_TM
-	bl	store_vr_state
-	mfspr	r6, SPRN_VRSAVE
-	stw	r6, VCPU_VRSAVE_TM(r9)
-1:
-	/*
-	 * We need to save these SPRs after the treclaim so that the software
-	 * error code is recorded correctly in the TEXASR.  Also the user may
-	 * change these outside of a transaction, so they must always be
-	 * context switched.
-	 */
-	mfspr	r7, SPRN_TEXASR
-	std	r7, VCPU_TEXASR(r9)
-11:
-	mfspr	r5, SPRN_TFHAR
-	mfspr	r6, SPRN_TFIAR
-	std	r5, VCPU_TFHAR(r9)
-	std	r6, VCPU_TFIAR(r9)
-
-	addi	r1, r1, PPC_MIN_STKFRM
-	ld	r0, PPC_LR_STKOFF(r1)
-	mtlr	r0
-	blr
-
-/*
- * Restore transactional state and TM-related registers.
- * Called with r4 pointing to the vcpu struct.
- * This potentially modifies all checkpointed registers.
- * It restores r1, r2, r4 from the PACA.
- */
-kvmppc_restore_tm:
-	mflr	r0
-	std	r0, PPC_LR_STKOFF(r1)
-
-	/* Turn on TM/FP/VSX/VMX so we can restore them. */
-	mfmsr	r5
-	li	r6, MSR_TM >> 32
-	sldi	r6, r6, 32
-	or	r5, r5, r6
-	ori	r5, r5, MSR_FP
-	oris	r5, r5, (MSR_VEC | MSR_VSX)@h
-	mtmsrd	r5
-
-	/*
-	 * The user may change these outside of a transaction, so they must
-	 * always be context switched.
-	 */
-	ld	r5, VCPU_TFHAR(r4)
-	ld	r6, VCPU_TFIAR(r4)
-	ld	r7, VCPU_TEXASR(r4)
-	mtspr	SPRN_TFHAR, r5
-	mtspr	SPRN_TFIAR, r6
-	mtspr	SPRN_TEXASR, r7
-
-	li	r0, 0
-	stb	r0, HSTATE_FAKE_SUSPEND(r13)
-	ld	r5, VCPU_MSR(r4)
-	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
-	beqlr		/* TM not active in guest */
-	std	r1, HSTATE_HOST_R1(r13)
-
-	/* Make sure the failure summary is set, otherwise we'll program check
-	 * when we trechkpt.  It's possible that this might have been not set
-	 * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
-	 * host.
-	 */
-	oris	r7, r7, (TEXASR_FS)@h
-	mtspr	SPRN_TEXASR, r7
-
-	/*
-	 * If we are doing TM emulation for the guest on a POWER9 DD2,
-	 * then we don't actually do a trechkpt -- we either set up
-	 * fake-suspend mode, or emulate a TM rollback.
-	 */
-BEGIN_FTR_SECTION
-	b	.Ldo_tm_fake_load
-END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
-
-	/*
-	 * We need to load up the checkpointed state for the guest.
-	 * We need to do this early as it will blow away any GPRs, VSRs and
-	 * some SPRs.
-	 */
-
-	mr	r31, r4
-	addi	r3, r31, VCPU_FPRS_TM
-	bl	load_fp_state
-	addi	r3, r31, VCPU_VRS_TM
-	bl	load_vr_state
-	mr	r4, r31
-	lwz	r7, VCPU_VRSAVE_TM(r4)
-	mtspr	SPRN_VRSAVE, r7
-
-	ld	r5, VCPU_LR_TM(r4)
-	lwz	r6, VCPU_CR_TM(r4)
-	ld	r7, VCPU_CTR_TM(r4)
-	ld	r8, VCPU_AMR_TM(r4)
-	ld	r9, VCPU_TAR_TM(r4)
-	ld	r10, VCPU_XER_TM(r4)
-	mtlr	r5
-	mtcr	r6
-	mtctr	r7
-	mtspr	SPRN_AMR, r8
-	mtspr	SPRN_TAR, r9
-	mtxer	r10
-
-	/*
-	 * Load up PPR and DSCR values but don't put them in the actual SPRs
-	 * till the last moment to avoid running with userspace PPR and DSCR for
-	 * too long.
-	 */
-	ld	r29, VCPU_DSCR_TM(r4)
-	ld	r30, VCPU_PPR_TM(r4)
-
-	std	r2, PACATMSCRATCH(r13) /* Save TOC */
-
-	/* Clear the MSR RI since r1, r13 are all going to be foobar. */
-	li	r5, 0
-	mtmsrd	r5, 1
-
-	/* Load GPRs r0-r28 */
-	reg = 0
-	.rept	29
-	ld	reg, VCPU_GPRS_TM(reg)(r31)
-	reg = reg + 1
-	.endr
-
-	mtspr	SPRN_DSCR, r29
-	mtspr	SPRN_PPR, r30
-
-	/* Load final GPRs */
-	ld	29, VCPU_GPRS_TM(29)(r31)
-	ld	30, VCPU_GPRS_TM(30)(r31)
-	ld	31, VCPU_GPRS_TM(31)(r31)
-
-	/* TM checkpointed state is now setup.  All GPRs are now volatile. */
-	TRECHKPT
-
-	/* Now let's get back the state we need. */
-	HMT_MEDIUM
-	GET_PACA(r13)
-	ld	r29, HSTATE_DSCR(r13)
-	mtspr	SPRN_DSCR, r29
-	ld	r4, HSTATE_KVM_VCPU(r13)
-	ld	r1, HSTATE_HOST_R1(r13)
-	ld	r2, PACATMSCRATCH(r13)
-
-	/* Set the MSR RI since we have our registers back. */
-	li	r5, MSR_RI
-	mtmsrd	r5, 1
-9:
-	ld	r0, PPC_LR_STKOFF(r1)
-	mtlr	r0
-	blr
-
-.Ldo_tm_fake_load:
-	cmpwi	r5, 1		/* check for suspended state */
-	bgt	10f
-	stb	r5, HSTATE_FAKE_SUSPEND(r13)
-	b	9b		/* and return */
-10:	stdu	r1, -PPC_MIN_STKFRM(r1)
-	/* guest is in transactional state, so simulate rollback */
-	mr	r3, r4
-	bl	kvmhv_emulate_tm_rollback
-	nop
-	ld      r4, HSTATE_KVM_VCPU(r13) /* our vcpu pointer has been trashed */
-	addi	r1, r1, PPC_MIN_STKFRM
-	b	9b
-#endif
-
 /*
  * We come here if we get any exception or interrupt while we are
  * executing host real mode code while in guest MMU context.
diff --git a/arch/powerpc/kvm/tm.S b/arch/powerpc/kvm/tm.S
new file mode 100644
index 0000000..e79b373
--- /dev/null
+++ b/arch/powerpc/kvm/tm.S
@@ -0,0 +1,363 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Derived from book3s_hv_rmhandlers.S,  which are:
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ */
+
+#include <asm/reg.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/export.h>
+#include <asm/tm.h>
+#include <asm/cputable.h>
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
+
+/*
+ * Save transactional state and TM-related registers.
+ * Called with r9 pointing to the vcpu struct.
+ * This can modify all checkpointed registers, but
+ * restores r1, r2 and r9 (vcpu pointer) before exit.
+ */
+_GLOBAL(kvmppc_save_tm)
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+	stdu	r1, -PPC_MIN_STKFRM(r1)
+
+	/* Turn on TM. */
+	mfmsr	r8
+	li	r0, 1
+	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+	mtmsrd	r8
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	ld	r5, VCPU_MSR(r9)
+	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+	beq	1f	/* TM not active in guest. */
+#endif
+
+	std	r1, HSTATE_HOST_R1(r13)
+	li	r3, TM_CAUSE_KVM_RESCHED
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+BEGIN_FTR_SECTION
+	lbz	r0, HSTATE_FAKE_SUSPEND(r13) /* Were we fake suspended? */
+	cmpwi	r0, 0
+	beq	3f
+	rldicl. r8, r8, 64 - MSR_TS_S_LG, 62 /* Did we actually hrfid? */
+	beq	4f
+BEGIN_FTR_SECTION_NESTED(96)
+	bl	pnv_power9_force_smt4_catch
+END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
+	nop
+	b	6f
+3:
+	/* Emulation of the treclaim instruction needs TEXASR before treclaim */
+	mfspr	r6, SPRN_TEXASR
+	std	r6, VCPU_ORIG_TEXASR(r9)
+6:
+END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
+#endif
+
+	/* Clear the MSR RI since r1, r13 are all going to be foobar. */
+	li	r5, 0
+	mtmsrd	r5, 1
+
+	/* All GPRs are volatile at this point. */
+	TRECLAIM(R3)
+
+	/* Temporarily store r13 and r9 so we have some regs to play with */
+	SET_SCRATCH0(r13)
+	GET_PACA(r13)
+	std	r9, PACATMSCRATCH(r13)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	/* If doing TM emulation on POWER9 DD2.2, check for fake suspend mode */
+BEGIN_FTR_SECTION
+	lbz	r9, HSTATE_FAKE_SUSPEND(r13)
+	cmpwi	r9, 0
+	beq	2f
+	/*
+	 * We were in fake suspend, so we are not going to save the
+	 * register state as the guest checkpointed state (since
+	 * we already have it), therefore we can now use any volatile GPR.
+	 */
+	/* Reload stack pointer and TOC. */
+	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r2, PACATOC(r13)
+	/* Set MSR RI now we have r1 and r13 back. */
+	li	r5, MSR_RI
+	mtmsrd	r5, 1
+	HMT_MEDIUM
+	ld	r6, HSTATE_DSCR(r13)
+	mtspr	SPRN_DSCR, r6
+BEGIN_FTR_SECTION_NESTED(96)
+	bl	pnv_power9_force_smt4_release
+END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
+	nop
+
+4:
+	mfspr	r3, SPRN_PSSCR
+	/* PSSCR_FAKE_SUSPEND is a write-only bit, but clear it anyway */
+	li	r0, PSSCR_FAKE_SUSPEND
+	andc	r3, r3, r0
+	mtspr	SPRN_PSSCR, r3
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	/* Don't save TEXASR, use value from last exit in real suspend state */
+	b	11f
+2:
+END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
+
+	ld	r9, HSTATE_KVM_VCPU(r13)
+#endif
+
+
+	/* Get a few more GPRs free. */
+	std	r29, VCPU_GPRS_TM(29)(r9)
+	std	r30, VCPU_GPRS_TM(30)(r9)
+	std	r31, VCPU_GPRS_TM(31)(r9)
+
+	/* Save away PPR and DSCR soon so don't run with user values. */
+	mfspr	r31, SPRN_PPR
+	HMT_MEDIUM
+	mfspr	r30, SPRN_DSCR
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	ld	r29, HSTATE_DSCR(r13)
+	mtspr	SPRN_DSCR, r29
+#endif
+
+	/* Save all but r9, r13 & r29-r31 */
+	reg = 0
+	.rept	29
+	.if (reg != 9) && (reg != 13)
+	std	reg, VCPU_GPRS_TM(reg)(r9)
+	.endif
+	reg = reg + 1
+	.endr
+	/* ... now save r13 */
+	GET_SCRATCH0(r4)
+	std	r4, VCPU_GPRS_TM(13)(r9)
+	/* ... and save r9 */
+	ld	r4, PACATMSCRATCH(r13)
+	std	r4, VCPU_GPRS_TM(9)(r9)
+
+	/* Reload stack pointer and TOC. */
+	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r2, PACATOC(r13)
+
+	/* Set MSR RI now we have r1 and r13 back. */
+	li	r5, MSR_RI
+	mtmsrd	r5, 1
+
+	/* Save away checkpinted SPRs. */
+	std	r31, VCPU_PPR_TM(r9)
+	std	r30, VCPU_DSCR_TM(r9)
+	mflr	r5
+	mfcr	r6
+	mfctr	r7
+	mfspr	r8, SPRN_AMR
+	mfspr	r10, SPRN_TAR
+	mfxer	r11
+	std	r5, VCPU_LR_TM(r9)
+	stw	r6, VCPU_CR_TM(r9)
+	std	r7, VCPU_CTR_TM(r9)
+	std	r8, VCPU_AMR_TM(r9)
+	std	r10, VCPU_TAR_TM(r9)
+	std	r11, VCPU_XER_TM(r9)
+
+	/* Restore r12 as trap number. */
+	lwz	r12, VCPU_TRAP(r9)
+
+	/* Save FP/VSX. */
+	addi	r3, r9, VCPU_FPRS_TM
+	bl	store_fp_state
+	addi	r3, r9, VCPU_VRS_TM
+	bl	store_vr_state
+	mfspr	r6, SPRN_VRSAVE
+	stw	r6, VCPU_VRSAVE_TM(r9)
+1:
+	/*
+	 * We need to save these SPRs after the treclaim so that the software
+	 * error code is recorded correctly in the TEXASR.  Also the user may
+	 * change these outside of a transaction, so they must always be
+	 * context switched.
+	 */
+	mfspr	r7, SPRN_TEXASR
+	std	r7, VCPU_TEXASR(r9)
+11:
+	mfspr	r5, SPRN_TFHAR
+	mfspr	r6, SPRN_TFIAR
+	std	r5, VCPU_TFHAR(r9)
+	std	r6, VCPU_TFIAR(r9)
+
+	addi	r1, r1, PPC_MIN_STKFRM
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
+	blr
+
+/*
+ * Restore transactional state and TM-related registers.
+ * Called with r4 pointing to the vcpu struct.
+ * This potentially modifies all checkpointed registers.
+ * It restores r1, r2, r4 from the PACA.
+ */
+_GLOBAL(kvmppc_restore_tm)
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+
+	/* Turn on TM/FP/VSX/VMX so we can restore them. */
+	mfmsr	r5
+	li	r6, MSR_TM >> 32
+	sldi	r6, r6, 32
+	or	r5, r5, r6
+	ori	r5, r5, MSR_FP
+	oris	r5, r5, (MSR_VEC | MSR_VSX)@h
+	mtmsrd	r5
+
+	/*
+	 * The user may change these outside of a transaction, so they must
+	 * always be context switched.
+	 */
+	ld	r5, VCPU_TFHAR(r4)
+	ld	r6, VCPU_TFIAR(r4)
+	ld	r7, VCPU_TEXASR(r4)
+	mtspr	SPRN_TFHAR, r5
+	mtspr	SPRN_TFIAR, r6
+	mtspr	SPRN_TEXASR, r7
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	li	r0, 0
+	stb	r0, HSTATE_FAKE_SUSPEND(r13)
+#endif
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	ld	r5, VCPU_MSR(r4)
+	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+	beqlr		/* TM not active in guest */
+#endif
+	std	r1, HSTATE_HOST_R1(r13)
+
+	/* Make sure the failure summary is set, otherwise we'll program check
+	 * when we trechkpt.  It's possible that this might have been not set
+	 * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
+	 * host.
+	 */
+	oris	r7, r7, (TEXASR_FS)@h
+	mtspr	SPRN_TEXASR, r7
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	/*
+	 * If we are doing TM emulation for the guest on a POWER9 DD2,
+	 * then we don't actually do a trechkpt -- we either set up
+	 * fake-suspend mode, or emulate a TM rollback.
+	 */
+BEGIN_FTR_SECTION
+	b	.Ldo_tm_fake_load
+END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
+#endif
+
+	/*
+	 * We need to load up the checkpointed state for the guest.
+	 * We need to do this early as it will blow away any GPRs, VSRs and
+	 * some SPRs.
+	 */
+
+	mr	r31, r4
+	addi	r3, r31, VCPU_FPRS_TM
+	bl	load_fp_state
+	addi	r3, r31, VCPU_VRS_TM
+	bl	load_vr_state
+	mr	r4, r31
+	lwz	r7, VCPU_VRSAVE_TM(r4)
+	mtspr	SPRN_VRSAVE, r7
+
+	ld	r5, VCPU_LR_TM(r4)
+	lwz	r6, VCPU_CR_TM(r4)
+	ld	r7, VCPU_CTR_TM(r4)
+	ld	r8, VCPU_AMR_TM(r4)
+	ld	r9, VCPU_TAR_TM(r4)
+	ld	r10, VCPU_XER_TM(r4)
+	mtlr	r5
+	mtcr	r6
+	mtctr	r7
+	mtspr	SPRN_AMR, r8
+	mtspr	SPRN_TAR, r9
+	mtxer	r10
+
+	/*
+	 * Load up PPR and DSCR values but don't put them in the actual SPRs
+	 * till the last moment to avoid running with userspace PPR and DSCR for
+	 * too long.
+	 */
+	ld	r29, VCPU_DSCR_TM(r4)
+	ld	r30, VCPU_PPR_TM(r4)
+
+	std	r2, PACATMSCRATCH(r13) /* Save TOC */
+
+	/* Clear the MSR RI since r1, r13 are all going to be foobar. */
+	li	r5, 0
+	mtmsrd	r5, 1
+
+	/* Load GPRs r0-r28 */
+	reg = 0
+	.rept	29
+	ld	reg, VCPU_GPRS_TM(reg)(r31)
+	reg = reg + 1
+	.endr
+
+	mtspr	SPRN_DSCR, r29
+	mtspr	SPRN_PPR, r30
+
+	/* Load final GPRs */
+	ld	29, VCPU_GPRS_TM(29)(r31)
+	ld	30, VCPU_GPRS_TM(30)(r31)
+	ld	31, VCPU_GPRS_TM(31)(r31)
+
+	/* TM checkpointed state is now setup.  All GPRs are now volatile. */
+	TRECHKPT
+
+	/* Now let's get back the state we need. */
+	HMT_MEDIUM
+	GET_PACA(r13)
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	ld	r29, HSTATE_DSCR(r13)
+	mtspr	SPRN_DSCR, r29
+	ld	r4, HSTATE_KVM_VCPU(r13)
+#endif
+	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r2, PACATMSCRATCH(r13)
+
+	/* Set the MSR RI since we have our registers back. */
+	li	r5, MSR_RI
+	mtmsrd	r5, 1
+9:
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
+	blr
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+.Ldo_tm_fake_load:
+	cmpwi	r5, 1		/* check for suspended state */
+	bgt	10f
+	stb	r5, HSTATE_FAKE_SUSPEND(r13)
+	b	9b		/* and return */
+10:	stdu	r1, -PPC_MIN_STKFRM(r1)
+	/* guest is in transactional state, so simulate rollback */
+	mr	r3, r4
+	bl	kvmhv_emulate_tm_rollback
+	nop
+	ld      r4, HSTATE_KVM_VCPU(r13) /* our vcpu pointer has been trashed */
+	addi	r1, r1, PPC_MIN_STKFRM
+	b	9b
+#endif
+#endif
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 05/29] KVM: PPC: Book3S PR: add new parameter (guest MSR) for kvmppc_save_tm()/kvmppc_restore_tm()
From: wei.guo.simon @ 2018-05-23  7:01 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Paul Mackerras, kvm, kvm-ppc, Simon Guo
In-Reply-To: <1527058932-7434-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

HV KVM and PR KVM need different MSR source to indicate whether
treclaim. or trecheckpoint. is necessary.

This patch add new parameter (guest MSR) for these kvmppc_save_tm/
kvmppc_restore_tm() APIs:
- For HV KVM, it is VCPU_MSR
- For PR KVM, it is current host MSR or VCPU_SHADOW_SRR1

This enhancement enables these 2 APIs to be reused by PR KVM later.
And the patch keeps HV KVM logic unchanged.

This patch also reworks kvmppc_save_tm()/kvmppc_restore_tm() to
have a clean ABI: r3 for vcpu and r4 for guest_msr.

During kvmppc_save_tm/kvmppc_restore_tm(), the R1 need to be saved
or restored. Currently the R1 is saved into HSTATE_HOST_R1. In PR
KVM, we are going to add a C function wrapper for
kvmppc_save_tm/kvmppc_restore_tm() where the R1 will be incremented
with added stackframe and save into HSTATE_HOST_R1. There are several
places in HV KVM to load HSTATE_HOST_R1 as R1, and we don't want to
bring risk or confusion by TM code.

This patch will use HSTATE_SCRATCH2 to save/restore R1 in
kvmppc_save_tm/kvmppc_restore_tm() to avoid future confusion, since
the r1 is actually a temporary/scratch value to be saved/stored.

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 13 +++++-
 arch/powerpc/kvm/tm.S                   | 74 ++++++++++++++++-----------------
 2 files changed, 49 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 4db2b10..6445d29 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -793,8 +793,12 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 	/*
 	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
 	 */
+	mr      r3, r4
+	ld      r4, VCPU_MSR(r3)
 	bl	kvmppc_restore_tm
+	ld	r4, HSTATE_KVM_VCPU(r13)
 91:
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
 
 	/* Load guest PMU registers */
@@ -1777,7 +1781,10 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 	/*
 	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
 	 */
+	mr      r3, r9
+	ld      r4, VCPU_MSR(r3)
 	bl	kvmppc_save_tm
+	ld	r9, HSTATE_KVM_VCPU(r13)
 91:
 #endif
 
@@ -2680,7 +2687,8 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 	/*
 	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
 	 */
-	ld	r9, HSTATE_KVM_VCPU(r13)
+	ld      r3, HSTATE_KVM_VCPU(r13)
+	ld      r4, VCPU_MSR(r3)
 	bl	kvmppc_save_tm
 91:
 #endif
@@ -2799,7 +2807,10 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 	/*
 	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
 	 */
+	mr      r3, r4
+	ld      r4, VCPU_MSR(r3)
 	bl	kvmppc_restore_tm
+	ld	r4, HSTATE_KVM_VCPU(r13)
 91:
 #endif
 
diff --git a/arch/powerpc/kvm/tm.S b/arch/powerpc/kvm/tm.S
index e79b373..cbe608a 100644
--- a/arch/powerpc/kvm/tm.S
+++ b/arch/powerpc/kvm/tm.S
@@ -26,9 +26,12 @@
 
 /*
  * Save transactional state and TM-related registers.
- * Called with r9 pointing to the vcpu struct.
+ * Called with:
+ * - r3 pointing to the vcpu struct
+ * - r4 points to the MSR with current TS bits:
+ * 	(For HV KVM, it is VCPU_MSR ; For PR KVM, it is host MSR).
  * This can modify all checkpointed registers, but
- * restores r1, r2 and r9 (vcpu pointer) before exit.
+ * restores r1, r2 before exit.
  */
 _GLOBAL(kvmppc_save_tm)
 	mflr	r0
@@ -41,14 +44,11 @@ _GLOBAL(kvmppc_save_tm)
 	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG
 	mtmsrd	r8
 
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-	ld	r5, VCPU_MSR(r9)
-	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+	rldicl. r4, r4, 64 - MSR_TS_S_LG, 62
 	beq	1f	/* TM not active in guest. */
-#endif
 
-	std	r1, HSTATE_HOST_R1(r13)
-	li	r3, TM_CAUSE_KVM_RESCHED
+	std	r1, HSTATE_SCRATCH2(r13)
+	std	r3, HSTATE_SCRATCH1(r13)
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 BEGIN_FTR_SECTION
@@ -65,7 +65,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
 3:
 	/* Emulation of the treclaim instruction needs TEXASR before treclaim */
 	mfspr	r6, SPRN_TEXASR
-	std	r6, VCPU_ORIG_TEXASR(r9)
+	std	r6, VCPU_ORIG_TEXASR(r3)
 6:
 END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 #endif
@@ -74,6 +74,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 	li	r5, 0
 	mtmsrd	r5, 1
 
+	li	r3, TM_CAUSE_KVM_RESCHED
+
 	/* All GPRs are volatile at this point. */
 	TRECLAIM(R3)
 
@@ -94,7 +96,7 @@ BEGIN_FTR_SECTION
 	 * we already have it), therefore we can now use any volatile GPR.
 	 */
 	/* Reload stack pointer and TOC. */
-	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r1, HSTATE_SCRATCH2(r13)
 	ld	r2, PACATOC(r13)
 	/* Set MSR RI now we have r1 and r13 back. */
 	li	r5, MSR_RI
@@ -118,10 +120,9 @@ END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
 	b	11f
 2:
 END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
-
-	ld	r9, HSTATE_KVM_VCPU(r13)
 #endif
 
+	ld	r9, HSTATE_SCRATCH1(r13)
 
 	/* Get a few more GPRs free. */
 	std	r29, VCPU_GPRS_TM(29)(r9)
@@ -153,7 +154,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 	std	r4, VCPU_GPRS_TM(9)(r9)
 
 	/* Reload stack pointer and TOC. */
-	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r1, HSTATE_SCRATCH2(r13)
 	ld	r2, PACATOC(r13)
 
 	/* Set MSR RI now we have r1 and r13 back. */
@@ -208,9 +209,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 
 /*
  * Restore transactional state and TM-related registers.
- * Called with r4 pointing to the vcpu struct.
+ * Called with:
+ *  - r3 pointing to the vcpu struct.
+ *  - r4 is the guest MSR with desired TS bits:
+ * 	For HV KVM, it is VCPU_MSR
+ * 	For PR KVM, it is provided by caller
  * This potentially modifies all checkpointed registers.
- * It restores r1, r2, r4 from the PACA.
+ * It restores r1, r2 from the PACA.
  */
 _GLOBAL(kvmppc_restore_tm)
 	mflr	r0
@@ -229,9 +234,9 @@ _GLOBAL(kvmppc_restore_tm)
 	 * The user may change these outside of a transaction, so they must
 	 * always be context switched.
 	 */
-	ld	r5, VCPU_TFHAR(r4)
-	ld	r6, VCPU_TFIAR(r4)
-	ld	r7, VCPU_TEXASR(r4)
+	ld	r5, VCPU_TFHAR(r3)
+	ld	r6, VCPU_TFIAR(r3)
+	ld	r7, VCPU_TEXASR(r3)
 	mtspr	SPRN_TFHAR, r5
 	mtspr	SPRN_TFIAR, r6
 	mtspr	SPRN_TEXASR, r7
@@ -240,12 +245,10 @@ _GLOBAL(kvmppc_restore_tm)
 	li	r0, 0
 	stb	r0, HSTATE_FAKE_SUSPEND(r13)
 #endif
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-	ld	r5, VCPU_MSR(r4)
+	mr	r5, r4
 	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
 	beqlr		/* TM not active in guest */
-#endif
-	std	r1, HSTATE_HOST_R1(r13)
+	std	r1, HSTATE_SCRATCH2(r13)
 
 	/* Make sure the failure summary is set, otherwise we'll program check
 	 * when we trechkpt.  It's possible that this might have been not set
@@ -272,21 +275,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 	 * some SPRs.
 	 */
 
-	mr	r31, r4
+	mr	r31, r3
 	addi	r3, r31, VCPU_FPRS_TM
 	bl	load_fp_state
 	addi	r3, r31, VCPU_VRS_TM
 	bl	load_vr_state
-	mr	r4, r31
-	lwz	r7, VCPU_VRSAVE_TM(r4)
+	mr	r3, r31
+	lwz	r7, VCPU_VRSAVE_TM(r3)
 	mtspr	SPRN_VRSAVE, r7
 
-	ld	r5, VCPU_LR_TM(r4)
-	lwz	r6, VCPU_CR_TM(r4)
-	ld	r7, VCPU_CTR_TM(r4)
-	ld	r8, VCPU_AMR_TM(r4)
-	ld	r9, VCPU_TAR_TM(r4)
-	ld	r10, VCPU_XER_TM(r4)
+	ld	r5, VCPU_LR_TM(r3)
+	lwz	r6, VCPU_CR_TM(r3)
+	ld	r7, VCPU_CTR_TM(r3)
+	ld	r8, VCPU_AMR_TM(r3)
+	ld	r9, VCPU_TAR_TM(r3)
+	ld	r10, VCPU_XER_TM(r3)
 	mtlr	r5
 	mtcr	r6
 	mtctr	r7
@@ -299,8 +302,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 	 * till the last moment to avoid running with userspace PPR and DSCR for
 	 * too long.
 	 */
-	ld	r29, VCPU_DSCR_TM(r4)
-	ld	r30, VCPU_PPR_TM(r4)
+	ld	r29, VCPU_DSCR_TM(r3)
+	ld	r30, VCPU_PPR_TM(r3)
 
 	std	r2, PACATMSCRATCH(r13) /* Save TOC */
 
@@ -332,9 +335,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	ld	r29, HSTATE_DSCR(r13)
 	mtspr	SPRN_DSCR, r29
-	ld	r4, HSTATE_KVM_VCPU(r13)
 #endif
-	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r1, HSTATE_SCRATCH2(r13)
 	ld	r2, PACATMSCRATCH(r13)
 
 	/* Set the MSR RI since we have our registers back. */
@@ -353,10 +355,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 	b	9b		/* and return */
 10:	stdu	r1, -PPC_MIN_STKFRM(r1)
 	/* guest is in transactional state, so simulate rollback */
-	mr	r3, r4
 	bl	kvmhv_emulate_tm_rollback
 	nop
-	ld      r4, HSTATE_KVM_VCPU(r13) /* our vcpu pointer has been trashed */
 	addi	r1, r1, PPC_MIN_STKFRM
 	b	9b
 #endif
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 06/29] KVM: PPC: Book3S PR: turn on FP/VSX/VMX MSR bits in kvmppc_save_tm()
From: wei.guo.simon @ 2018-05-23  7:01 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Paul Mackerras, kvm, kvm-ppc, Simon Guo
In-Reply-To: <1527058932-7434-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

kvmppc_save_tm() invokes  store_fp_state/store_vr_state(). So it is
mandatory to turn on FP/VSX/VMX MSR bits for its execution, just
like what kvmppc_restore_tm() did.

Previsouly HV KVM has turned the bits on outside of function
kvmppc_save_tm().  Now we include this bit change in kvmppc_save_tm()
so that the logic is more clean. And PR KVM can reuse it later.

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
Reviewed-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/tm.S | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kvm/tm.S b/arch/powerpc/kvm/tm.S
index cbe608a..b7057d5 100644
--- a/arch/powerpc/kvm/tm.S
+++ b/arch/powerpc/kvm/tm.S
@@ -42,6 +42,8 @@ _GLOBAL(kvmppc_save_tm)
 	mfmsr	r8
 	li	r0, 1
 	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+	ori     r8, r8, MSR_FP
+	oris    r8, r8, (MSR_VEC | MSR_VSX)@h
 	mtmsrd	r8
 
 	rldicl. r4, r4, 64 - MSR_TS_S_LG, 62
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 07/29] KVM: PPC: Book3S PR: add C function wrapper for _kvmppc_save/restore_tm()
From: wei.guo.simon @ 2018-05-23  7:01 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Paul Mackerras, kvm, kvm-ppc, Simon Guo
In-Reply-To: <1527058932-7434-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

Currently _kvmppc_save/restore_tm() APIs can only be invoked from
assembly function. This patch adds C function wrappers for them so
that they can be safely called from C function.

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/include/asm/asm-prototypes.h |  6 ++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  8 +--
 arch/powerpc/kvm/tm.S                     | 94 ++++++++++++++++++++++++++++++-
 3 files changed, 102 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index dfdcb23..5da683b 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -141,7 +141,13 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4,
 void pnv_power9_force_smt4_catch(void);
 void pnv_power9_force_smt4_release(void);
 
+/* Transaction memory related */
 void tm_enable(void);
 void tm_disable(void);
 void tm_abort(uint8_t cause);
+
+struct kvm_vcpu;
+void _kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu, u64 guest_msr);
+void _kvmppc_save_tm_pr(struct kvm_vcpu *vcpu, u64 guest_msr);
+
 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 6445d29..980df5f 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -795,7 +795,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 	 */
 	mr      r3, r4
 	ld      r4, VCPU_MSR(r3)
-	bl	kvmppc_restore_tm
+	bl	__kvmppc_restore_tm
 	ld	r4, HSTATE_KVM_VCPU(r13)
 91:
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
@@ -1783,7 +1783,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 	 */
 	mr      r3, r9
 	ld      r4, VCPU_MSR(r3)
-	bl	kvmppc_save_tm
+	bl	__kvmppc_save_tm
 	ld	r9, HSTATE_KVM_VCPU(r13)
 91:
 #endif
@@ -2689,7 +2689,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 	 */
 	ld      r3, HSTATE_KVM_VCPU(r13)
 	ld      r4, VCPU_MSR(r3)
-	bl	kvmppc_save_tm
+	bl	__kvmppc_save_tm
 91:
 #endif
 
@@ -2809,7 +2809,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 	 */
 	mr      r3, r4
 	ld      r4, VCPU_MSR(r3)
-	bl	kvmppc_restore_tm
+	bl	__kvmppc_restore_tm
 	ld	r4, HSTATE_KVM_VCPU(r13)
 91:
 #endif
diff --git a/arch/powerpc/kvm/tm.S b/arch/powerpc/kvm/tm.S
index b7057d5..42a7cd8 100644
--- a/arch/powerpc/kvm/tm.S
+++ b/arch/powerpc/kvm/tm.S
@@ -33,7 +33,7 @@
  * This can modify all checkpointed registers, but
  * restores r1, r2 before exit.
  */
-_GLOBAL(kvmppc_save_tm)
+_GLOBAL(__kvmppc_save_tm)
 	mflr	r0
 	std	r0, PPC_LR_STKOFF(r1)
 	stdu	r1, -PPC_MIN_STKFRM(r1)
@@ -210,6 +210,52 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 	blr
 
 /*
+ * _kvmppc_save_tm_pr() is a wrapper around __kvmppc_save_tm(), so that it can
+ * be invoked from C function by PR KVM only.
+ */
+_GLOBAL(_kvmppc_save_tm_pr)
+	mflr	r5
+	std	r5, PPC_LR_STKOFF(r1)
+	stdu    r1, -SWITCH_FRAME_SIZE(r1)
+	SAVE_NVGPRS(r1)
+
+	/* save MSR since TM/math bits might be impacted
+	 * by __kvmppc_save_tm().
+	 */
+	mfmsr	r5
+	SAVE_GPR(5, r1)
+
+	/* also save DSCR/CR so that it can be recovered later */
+	mfspr   r6, SPRN_DSCR
+	SAVE_GPR(6, r1)
+
+	mfcr    r7
+	stw     r7, _CCR(r1)
+
+	bl	__kvmppc_save_tm
+
+	ld      r7, _CCR(r1)
+	mtcr	r7
+
+	REST_GPR(6, r1)
+	mtspr   SPRN_DSCR, r6
+
+	/* need preserve current MSR's MSR_TS bits */
+	REST_GPR(5, r1)
+	mfmsr   r6
+	rldicl  r6, r6, 64 - MSR_TS_S_LG, 62
+	rldimi  r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+	mtmsrd  r5
+
+	REST_NVGPRS(r1)
+	addi    r1, r1, SWITCH_FRAME_SIZE
+	ld	r5, PPC_LR_STKOFF(r1)
+	mtlr	r5
+	blr
+
+EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
+
+/*
  * Restore transactional state and TM-related registers.
  * Called with:
  *  - r3 pointing to the vcpu struct.
@@ -219,7 +265,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
  * This potentially modifies all checkpointed registers.
  * It restores r1, r2 from the PACA.
  */
-_GLOBAL(kvmppc_restore_tm)
+_GLOBAL(__kvmppc_restore_tm)
 	mflr	r0
 	std	r0, PPC_LR_STKOFF(r1)
 
@@ -362,4 +408,48 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 	addi	r1, r1, PPC_MIN_STKFRM
 	b	9b
 #endif
+
+/*
+ * _kvmppc_restore_tm_pr() is a wrapper around __kvmppc_restore_tm(), so that it
+ * can be invoked from C function by PR KVM only.
+ */
+_GLOBAL(_kvmppc_restore_tm_pr)
+	mflr	r5
+	std	r5, PPC_LR_STKOFF(r1)
+	stdu    r1, -SWITCH_FRAME_SIZE(r1)
+	SAVE_NVGPRS(r1)
+
+	/* save MSR to avoid TM/math bits change */
+	mfmsr	r5
+	SAVE_GPR(5, r1)
+
+	/* also save DSCR/CR so that it can be recovered later */
+	mfspr   r6, SPRN_DSCR
+	SAVE_GPR(6, r1)
+
+	mfcr    r7
+	stw     r7, _CCR(r1)
+
+	bl	__kvmppc_restore_tm
+
+	ld      r7, _CCR(r1)
+	mtcr	r7
+
+	REST_GPR(6, r1)
+	mtspr   SPRN_DSCR, r6
+
+	/* need preserve current MSR's MSR_TS bits */
+	REST_GPR(5, r1)
+	mfmsr   r6
+	rldicl  r6, r6, 64 - MSR_TS_S_LG, 62
+	rldimi  r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+	mtmsrd  r5
+
+	REST_NVGPRS(r1)
+	addi    r1, r1, SWITCH_FRAME_SIZE
+	ld	r5, PPC_LR_STKOFF(r1)
+	mtlr	r5
+	blr
+
+EXPORT_SYMBOL_GPL(_kvmppc_restore_tm_pr);
 #endif
-- 
1.8.3.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox