LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH 2/5] soc/fsl/qe: qe.c: reduce static memory footprint by 1.7K
From: Christophe Leroy @ 2019-04-30 17:12 UTC (permalink / raw)
  To: Rasmus Villemoes, Qiang Zhao, Li Yang
  Cc: Valentin Longchamp, linux-kernel@vger.kernel.org, Scott Wood,
	Rasmus Villemoes, Rob Herring, linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org
In-Reply-To: <20190430133615.25721-3-rasmus.villemoes@prevas.dk>



Le 30/04/2019 à 15:36, Rasmus Villemoes a écrit :
> The current array of struct qe_snum use 256*4 bytes for just keeping
> track of the free/used state of each index, and the struct layout
> means there's another 768 bytes of padding. If we just unzip that
> structure, the array of snum values just use 256 bytes, while the
> free/inuse state can be tracked in a 32 byte bitmap.
> 
> So this reduces the .data footprint by 1760 bytes. It also serves as
> preparation for introducing another DT binding for specifying the snum
> values.
> 
> Signed-off-by: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
> ---
>   drivers/soc/fsl/qe/qe.c | 37 ++++++++++++-------------------------
>   1 file changed, 12 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/soc/fsl/qe/qe.c b/drivers/soc/fsl/qe/qe.c
> index 855373deb746..d0393f83145c 100644
> --- a/drivers/soc/fsl/qe/qe.c
> +++ b/drivers/soc/fsl/qe/qe.c
> @@ -14,6 +14,7 @@
>    * Free Software Foundation;  either version 2 of the  License, or (at your
>    * option) any later version.
>    */
> +#include <linux/bitmap.h>
>   #include <linux/errno.h>
>   #include <linux/sched.h>
>   #include <linux/kernel.h>
> @@ -43,25 +44,14 @@ static DEFINE_SPINLOCK(qe_lock);
>   DEFINE_SPINLOCK(cmxgcr_lock);
>   EXPORT_SYMBOL(cmxgcr_lock);
>   
> -/* QE snum state */
> -enum qe_snum_state {
> -	QE_SNUM_STATE_USED,
> -	QE_SNUM_STATE_FREE
> -};
> -
> -/* QE snum */
> -struct qe_snum {
> -	u8 num;
> -	enum qe_snum_state state;
> -};
> -
>   /* We allocate this here because it is used almost exclusively for
>    * the communication processor devices.
>    */
>   struct qe_immap __iomem *qe_immr;
>   EXPORT_SYMBOL(qe_immr);
>   
> -static struct qe_snum snums[QE_NUM_OF_SNUM];	/* Dynamically allocated SNUMs */
> +static u8 snums[QE_NUM_OF_SNUM];	/* Dynamically allocated SNUMs */
> +static DECLARE_BITMAP(snum_state, QE_NUM_OF_SNUM);
>   static unsigned int qe_num_of_snum;
>   
>   static phys_addr_t qebase = -1;
> @@ -308,6 +298,7 @@ static void qe_snums_init(void)
>   	};
>   	const u8 *snum_init;
>   
> +	bitmap_zero(snum_state, QE_NUM_OF_SNUM);

Doesn't make much importance, but wouldn't it be more logical to add 
this line where the setting of .state = QE_SNUM_STATE_FREE was done 
previously, ie around the for() loop below ?

>   	qe_num_of_snum = qe_get_num_of_snums();
>   
>   	if (qe_num_of_snum == 76)
> @@ -315,10 +306,8 @@ static void qe_snums_init(void)
>   	else
>   		snum_init = snum_init_46;
>   
> -	for (i = 0; i < qe_num_of_snum; i++) {
> -		snums[i].num = snum_init[i];
> -		snums[i].state = QE_SNUM_STATE_FREE;
> -	}
> +	for (i = 0; i < qe_num_of_snum; i++)
> +		snums[i] = snum_init[i];

Could use memcpy() instead ?

>   }
>   
>   int qe_get_snum(void)
> @@ -328,12 +317,10 @@ int qe_get_snum(void)
>   	int i;
>   
>   	spin_lock_irqsave(&qe_lock, flags);
> -	for (i = 0; i < qe_num_of_snum; i++) {
> -		if (snums[i].state == QE_SNUM_STATE_FREE) {
> -			snums[i].state = QE_SNUM_STATE_USED;
> -			snum = snums[i].num;
> -			break;
> -		}
> +	i = find_first_zero_bit(snum_state, qe_num_of_snum);
> +	if (i < qe_num_of_snum) {
> +		set_bit(i, snum_state);
> +		snum = snums[i];
>   	}
>   	spin_unlock_irqrestore(&qe_lock, flags);
>   
> @@ -346,8 +333,8 @@ void qe_put_snum(u8 snum)
>   	int i;
>   
>   	for (i = 0; i < qe_num_of_snum; i++) {
> -		if (snums[i].num == snum) {
> -			snums[i].state = QE_SNUM_STATE_FREE;
> +		if (snums[i] == snum) {
> +			clear_bit(i, snum_state);
>   			break;
>   		}
>   	}

Can we replace this loop by memchr() ?

Christophe


^ permalink raw reply

* Re: [PATCH 3/5] soc/fsl/qe: qe.c: introduce qe_get_device_node helper
From: Christophe Leroy @ 2019-04-30 17:14 UTC (permalink / raw)
  To: Rasmus Villemoes, Qiang Zhao, Li Yang
  Cc: Valentin Longchamp, linux-kernel@vger.kernel.org, Scott Wood,
	Rasmus Villemoes, Rob Herring, linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org
In-Reply-To: <20190430133615.25721-4-rasmus.villemoes@prevas.dk>



Le 30/04/2019 à 15:36, Rasmus Villemoes a écrit :
> The 'try of_find_compatible_node(NULL, NULL, "fsl,qe"), fall back to
> of_find_node_by_type(NULL, "qe")' pattern is repeated five
> times. Factor it into a common helper.
> 
> Signed-off-by: Rasmus Villemoes <rasmus.villemoes@prevas.dk>

Reviewed-by: Christophe Leroy <christophe.leroy@c-s.fr>


> ---
>   drivers/soc/fsl/qe/qe.c | 71 +++++++++++++++++------------------------
>   1 file changed, 29 insertions(+), 42 deletions(-)
> 
> diff --git a/drivers/soc/fsl/qe/qe.c b/drivers/soc/fsl/qe/qe.c
> index d0393f83145c..aff9d1373529 100644
> --- a/drivers/soc/fsl/qe/qe.c
> +++ b/drivers/soc/fsl/qe/qe.c
> @@ -56,6 +56,20 @@ static unsigned int qe_num_of_snum;
>   
>   static phys_addr_t qebase = -1;
>   
> +static struct device_node *qe_get_device_node(void)
> +{
> +	struct device_node *qe;
> +
> +	/*
> +	 * Newer device trees have an "fsl,qe" compatible property for the QE
> +	 * node, but we still need to support older device trees.
> +	 */
> +	qe = of_find_compatible_node(NULL, NULL, "fsl,qe");
> +	if (qe)
> +		return qe;
> +	return of_find_node_by_type(NULL, "qe");
> +}
> +
>   static phys_addr_t get_qe_base(void)
>   {
>   	struct device_node *qe;
> @@ -65,12 +79,9 @@ static phys_addr_t get_qe_base(void)
>   	if (qebase != -1)
>   		return qebase;
>   
> -	qe = of_find_compatible_node(NULL, NULL, "fsl,qe");
> -	if (!qe) {
> -		qe = of_find_node_by_type(NULL, "qe");
> -		if (!qe)
> -			return qebase;
> -	}
> +	qe = qe_get_device_node();
> +	if (!qe)
> +		return qebase;
>   
>   	ret = of_address_to_resource(qe, 0, &res);
>   	if (!ret)
> @@ -164,12 +175,9 @@ unsigned int qe_get_brg_clk(void)
>   	if (brg_clk)
>   		return brg_clk;
>   
> -	qe = of_find_compatible_node(NULL, NULL, "fsl,qe");
> -	if (!qe) {
> -		qe = of_find_node_by_type(NULL, "qe");
> -		if (!qe)
> -			return brg_clk;
> -	}
> +	qe = qe_get_device_node();
> +	if (!qe)
> +		return brg_clk;
>   
>   	prop = of_get_property(qe, "brg-frequency", &size);
>   	if (prop && size == sizeof(*prop))
> @@ -563,16 +571,9 @@ struct qe_firmware_info *qe_get_firmware_info(void)
>   
>   	initialized = 1;
>   
> -	/*
> -	 * Newer device trees have an "fsl,qe" compatible property for the QE
> -	 * node, but we still need to support older device trees.
> -	*/
> -	qe = of_find_compatible_node(NULL, NULL, "fsl,qe");
> -	if (!qe) {
> -		qe = of_find_node_by_type(NULL, "qe");
> -		if (!qe)
> -			return NULL;
> -	}
> +	qe = qe_get_device_node();
> +	if (!qe)
> +		return NULL;
>   
>   	/* Find the 'firmware' child node */
>   	fw = of_get_child_by_name(qe, "firmware");
> @@ -618,16 +619,9 @@ unsigned int qe_get_num_of_risc(void)
>   	unsigned int num_of_risc = 0;
>   	const u32 *prop;
>   
> -	qe = of_find_compatible_node(NULL, NULL, "fsl,qe");
> -	if (!qe) {
> -		/* Older devices trees did not have an "fsl,qe"
> -		 * compatible property, so we need to look for
> -		 * the QE node by name.
> -		 */
> -		qe = of_find_node_by_type(NULL, "qe");
> -		if (!qe)
> -			return num_of_risc;
> -	}
> +	qe = qe_get_device_node();
> +	if (!qe)
> +		return num_of_risc;
>   
>   	prop = of_get_property(qe, "fsl,qe-num-riscs", &size);
>   	if (prop && size == sizeof(*prop))
> @@ -647,16 +641,9 @@ unsigned int qe_get_num_of_snums(void)
>   	const u32 *prop;
>   
>   	num_of_snums = 28; /* The default number of snum for threads is 28 */
> -	qe = of_find_compatible_node(NULL, NULL, "fsl,qe");
> -	if (!qe) {
> -		/* Older devices trees did not have an "fsl,qe"
> -		 * compatible property, so we need to look for
> -		 * the QE node by name.
> -		 */
> -		qe = of_find_node_by_type(NULL, "qe");
> -		if (!qe)
> -			return num_of_snums;
> -	}
> +	qe = qe_get_device_node();
> +	if (!qe)
> +		return num_of_snums;
>   
>   	prop = of_get_property(qe, "fsl,qe-num-snums", &size);
>   	if (prop && size == sizeof(*prop)) {
> 

^ permalink raw reply

* Re: [PATCH 4/5] soc/fsl/qe: qe.c: support fsl,qe-snums property
From: Christophe Leroy @ 2019-04-30 17:19 UTC (permalink / raw)
  To: Rasmus Villemoes, Qiang Zhao, Li Yang
  Cc: Valentin Longchamp, linux-kernel@vger.kernel.org, Scott Wood,
	Rasmus Villemoes, Rob Herring, linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org
In-Reply-To: <20190430133615.25721-5-rasmus.villemoes@prevas.dk>



Le 30/04/2019 à 15:36, Rasmus Villemoes a écrit :
> The current code assumes that the set of snum _values_ to populate the
> snums[] array with is a function of the _number_ of snums
> alone. However, reading table 4-30, and its footnotes, of the QUICC
> Engine Block Reference Manual shows that that is a bit too naive.
> 
> As an alternative, this introduces a new binding fsl,qe-snums, which
> automatically encodes both the number of snums and the actual values to
> use. Conveniently, of_property_read_variable_u8_array does exactly
> what we need.
> 
> For example, for the MPC8309, one would specify the property as
> 
>                 fsl,qe-snums = /bits/ 8 <
>                         0x88 0x89 0x98 0x99 0xa8 0xa9 0xb8 0xb9
>                         0xc8 0xc9 0xd8 0xd9 0xe8 0xe9>;
> 
> Signed-off-by: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
> ---
>   .../devicetree/bindings/soc/fsl/cpm_qe/qe.txt      |  8 +++++++-
>   drivers/soc/fsl/qe/qe.c                            | 14 +++++++++++++-
>   2 files changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/devicetree/bindings/soc/fsl/cpm_qe/qe.txt b/Documentation/devicetree/bindings/soc/fsl/cpm_qe/qe.txt
> index d7afaff5faff..05f5f485562a 100644
> --- a/Documentation/devicetree/bindings/soc/fsl/cpm_qe/qe.txt
> +++ b/Documentation/devicetree/bindings/soc/fsl/cpm_qe/qe.txt
> @@ -18,7 +18,8 @@ Required properties:
>   - reg : offset and length of the device registers.
>   - bus-frequency : the clock frequency for QUICC Engine.
>   - fsl,qe-num-riscs: define how many RISC engines the QE has.
> -- fsl,qe-num-snums: define how many serial number(SNUM) the QE can use for the
> +- fsl,qe-snums: This property has to be specified as '/bits/ 8' value,
> +  defining the array of serial number (SNUM) values for the virtual
>     threads.
>   
>   Optional properties:
> @@ -34,6 +35,11 @@ Recommended properties
>   - brg-frequency : the internal clock source frequency for baud-rate
>     generators in Hz.
>   
> +Deprecated properties
> +- fsl,qe-num-snums: define how many serial number(SNUM) the QE can use
> +  for the threads. Use fsl,qe-snums instead to not only specify the
> +  number of snums, but also their values.
> +
>   Example:
>        qe@e0100000 {
>   	#address-cells = <1>;
> diff --git a/drivers/soc/fsl/qe/qe.c b/drivers/soc/fsl/qe/qe.c
> index aff9d1373529..af3c2b2b268f 100644
> --- a/drivers/soc/fsl/qe/qe.c
> +++ b/drivers/soc/fsl/qe/qe.c
> @@ -283,7 +283,6 @@ EXPORT_SYMBOL(qe_clock_source);
>    */
>   static void qe_snums_init(void)
>   {
> -	int i;

Why do you move this one ?

>   	static const u8 snum_init_76[] = {
>   		0x04, 0x05, 0x0C, 0x0D, 0x14, 0x15, 0x1C, 0x1D,
>   		0x24, 0x25, 0x2C, 0x2D, 0x34, 0x35, 0x88, 0x89,
> @@ -304,9 +303,22 @@ static void qe_snums_init(void)
>   		0x28, 0x29, 0x38, 0x39, 0x48, 0x49, 0x58, 0x59,
>   		0x68, 0x69, 0x78, 0x79, 0x80, 0x81,
>   	};
> +	struct device_node *qe;
>   	const u8 *snum_init;
> +	int i;
>   
>   	bitmap_zero(snum_state, QE_NUM_OF_SNUM);
> +	qe = qe_get_device_node();
> +	if (qe) {
> +		i = of_property_read_variable_u8_array(qe, "fsl,qe-snums",
> +						       snums, 1, QE_NUM_OF_SNUM);
> +		of_node_put(qe);
> +		if (i > 0) {
> +			qe_num_of_snum = i;
> +			return;

In that case you skip the rest of the init ? Can you explain ?

Christophe

> +		}
> +	}
> +
>   	qe_num_of_snum = qe_get_num_of_snums();
>   
>   	if (qe_num_of_snum == 76)
> 

^ permalink raw reply

* Re: [PATCH 5/5] soc/fsl/qe: qe.c: fold qe_get_num_of_snums into qe_snums_init
From: Christophe Leroy @ 2019-04-30 17:27 UTC (permalink / raw)
  To: Rasmus Villemoes, Qiang Zhao, Li Yang
  Cc: Valentin Longchamp, linux-kernel@vger.kernel.org, Scott Wood,
	Rasmus Villemoes, Rob Herring, linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org
In-Reply-To: <20190430133615.25721-6-rasmus.villemoes@prevas.dk>



Le 30/04/2019 à 15:36, Rasmus Villemoes a écrit :
> The comment "No QE ever has fewer than 28 SNUMs" is false; e.g. the
> MPC8309 has 14. The code path returning -EINVAL is also a recipe for
> instant disaster, since the caller (qe_snums_init) uncritically
> assigns the return value to the unsigned qe_num_of_snum, and would
> thus proceed to attempt to copy 4GB from snum_init_46[] to the snum[]
> array.
> 
> So fold the handling of the legacy fsl,qe-num-snums into
> qe_snums_init, and make sure we do not end up using the snum_init_46
> array in cases other than the two where we know it makes sense.
> 
> Signed-off-by: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
> ---
>   drivers/net/ethernet/freescale/ucc_geth.c |  2 +-
>   drivers/soc/fsl/qe/qe.c                   | 54 +++++++----------------
>   include/soc/fsl/qe/qe.h                   |  2 +-
>   3 files changed, 19 insertions(+), 39 deletions(-)
> 
> diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c
> index eb3e65e8868f..5748eb8464d0 100644
> --- a/drivers/net/ethernet/freescale/ucc_geth.c
> +++ b/drivers/net/ethernet/freescale/ucc_geth.c
> @@ -3837,7 +3837,7 @@ static int ucc_geth_probe(struct platform_device* ofdev)
>   		}
>   
>   	if (max_speed == SPEED_1000) {
> -		unsigned int snums = qe_get_num_of_snums();
> +		unsigned int snums = qe_num_of_snum;
>   
>   		/* configure muram FIFOs for gigabit operation */
>   		ug_info->uf_info.urfs = UCC_GETH_URFS_GIGA_INIT;
> diff --git a/drivers/soc/fsl/qe/qe.c b/drivers/soc/fsl/qe/qe.c
> index af3c2b2b268f..8c3b3c62d81b 100644
> --- a/drivers/soc/fsl/qe/qe.c
> +++ b/drivers/soc/fsl/qe/qe.c
> @@ -52,7 +52,8 @@ EXPORT_SYMBOL(qe_immr);
>   
>   static u8 snums[QE_NUM_OF_SNUM];	/* Dynamically allocated SNUMs */
>   static DECLARE_BITMAP(snum_state, QE_NUM_OF_SNUM);
> -static unsigned int qe_num_of_snum;
> +unsigned int qe_num_of_snum;
> +EXPORT_SYMBOL(qe_num_of_snum);

By exporting the object you allow other drivers to modify it. Is that 
really what we want ?

Why not keep qe_get_num_of_snums() as a helper that simply returns 
qe_num_of_snum ?

>   
>   static phys_addr_t qebase = -1;
>   
> @@ -308,26 +309,34 @@ static void qe_snums_init(void)
>   	int i;
>   
>   	bitmap_zero(snum_state, QE_NUM_OF_SNUM);
> +	qe_num_of_snum = 28; /* The default number of snum for threads is 28 */
>   	qe = qe_get_device_node();
>   	if (qe) {
>   		i = of_property_read_variable_u8_array(qe, "fsl,qe-snums",
>   						       snums, 1, QE_NUM_OF_SNUM);
> -		of_node_put(qe);
>   		if (i > 0) {
> +			of_node_put(qe);
>   			qe_num_of_snum = i;
>   			return;
>   		}
> +		/*
> +		 * Fall back to legacy binding of using the value of
> +		 * fsl,qe-num-snums to choose one of the static arrays
> +		 * above.
> +		 */
> +		of_property_read_u32(qe, "fsl,qe-num-snums", &qe_num_of_snum);
> +		of_node_put(qe);
>   	}
>   
> -	qe_num_of_snum = qe_get_num_of_snums();
> -
>   	if (qe_num_of_snum == 76)
>   		snum_init = snum_init_76;
> -	else
> +	else if (qe_num_of_snum == 28 || qe_num_of_snum == 46)
>   		snum_init = snum_init_46;
> -
> -	for (i = 0; i < qe_num_of_snum; i++)
> -		snums[i] = snum_init[i];
> +	else {
> +		pr_err("QE: unsupported value of fsl,qe-num-snums: %u\n", qe_num_of_snum);
> +		return;
> +	}

The first leg of the if/else must have {} too when the second leg has them.

> +	memcpy(snums, snum_init, qe_num_of_snum);
>   }
>   
>   int qe_get_snum(void)
> @@ -645,35 +654,6 @@ unsigned int qe_get_num_of_risc(void)
>   }
>   EXPORT_SYMBOL(qe_get_num_of_risc);
>   
> -unsigned int qe_get_num_of_snums(void)

I think this function should remain and just return num_of_snums, see my 
other comment above.

Christophe


> -{
> -	struct device_node *qe;
> -	int size;
> -	unsigned int num_of_snums;
> -	const u32 *prop;
> -
> -	num_of_snums = 28; /* The default number of snum for threads is 28 */
> -	qe = qe_get_device_node();
> -	if (!qe)
> -		return num_of_snums;
> -
> -	prop = of_get_property(qe, "fsl,qe-num-snums", &size);
> -	if (prop && size == sizeof(*prop)) {
> -		num_of_snums = *prop;
> -		if ((num_of_snums < 28) || (num_of_snums > QE_NUM_OF_SNUM)) {
> -			/* No QE ever has fewer than 28 SNUMs */
> -			pr_err("QE: number of snum is invalid\n");
> -			of_node_put(qe);
> -			return -EINVAL;
> -		}
> -	}
> -
> -	of_node_put(qe);
> -
> -	return num_of_snums;
> -}
> -EXPORT_SYMBOL(qe_get_num_of_snums);
> -
>   static int __init qe_init(void)
>   {
>   	struct device_node *np;
> diff --git a/include/soc/fsl/qe/qe.h b/include/soc/fsl/qe/qe.h
> index b3d1aff5e8ad..af5739850bf4 100644
> --- a/include/soc/fsl/qe/qe.h
> +++ b/include/soc/fsl/qe/qe.h
> @@ -212,7 +212,7 @@ int qe_setbrg(enum qe_clock brg, unsigned int rate, unsigned int multiplier);
>   int qe_get_snum(void);
>   void qe_put_snum(u8 snum);
>   unsigned int qe_get_num_of_risc(void);
> -unsigned int qe_get_num_of_snums(void);
> +extern unsigned int qe_num_of_snum;
>   
>   static inline int qe_alive_during_sleep(void)
>   {
> 

^ permalink raw reply

* [PATCH v2] powerpc: remove the __kernel_io_end export
From: Christoph Hellwig @ 2019-04-30 18:27 UTC (permalink / raw)
  To: mpe, aneesh.kumar; +Cc: linuxppc-dev, linux-kernel

This export was added in this merge window, but without any actual
user, or justification for a modular user.

Fixes: a35a3c6f6065 ("powerpc/mm/hash64: Add a variable to track the end of IO mapping")
Signed-off-by: Christoph Hellwig <hch@lst.de>
---

Chanes since v1:

 - actually compiles now..

 arch/powerpc/mm/pgtable_64.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 72f58c076e26..dd610dab98e0 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -97,7 +97,6 @@ EXPORT_SYMBOL(__vmalloc_end);
 unsigned long __kernel_io_start;
 EXPORT_SYMBOL(__kernel_io_start);
 unsigned long __kernel_io_end;
-EXPORT_SYMBOL(__kernel_io_end);
 struct page *vmemmap;
 EXPORT_SYMBOL(vmemmap);
 unsigned long __pte_frag_nr;
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH v4] powerpc/pseries: Remove limit in wait for dying CPU
From: Thiago Jung Bauermann @ 2019-04-30 19:59 UTC (permalink / raw)
  To: Nathan Lynch
  Cc: Gautham R Shenoy, linux-kernel, Nicholas Piggin,
	Michael Bringmann, Tyrel Datwyler, Vaidyanathan Srinivasan,
	linuxppc-dev
In-Reply-To: <877ebbsb8u.fsf@linux.ibm.com>


Hello Nathan,

Thanks for reviewing the patch!

Nathan Lynch <nathanl@linux.ibm.com> writes:

> Thiago Jung Bauermann <bauerman@linux.ibm.com> writes:
>> This can be a problem because if the busy loop finishes too early, then the
>> kernel may offline another CPU before the previous one finished dying,
>> which would lead to two concurrent calls to rtas-stop-self, which is
>> prohibited by the PAPR.
>>
>> Since the hotplug machinery already assumes that cpu_die() is going to
>> work, we can simply loop until the CPU stops.
>>
>> Also change the loop to wait 100 µs between each call to
>> smp_query_cpu_stopped() to avoid querying RTAS too often.
>
> [...]
>
>> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> index 97feb6e79f1a..d75cee60644c 100644
>> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> @@ -214,13 +214,17 @@ static void pseries_cpu_die(unsigned int cpu)
>>  			msleep(1);
>>  		}
>>  	} else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) {
>> -
>> -		for (tries = 0; tries < 25; tries++) {
>> +		/*
>> +		 * rtas_stop_self() panics if the CPU fails to stop and our
>> +		 * callers already assume that we are going to succeed, so we
>> +		 * can just loop until the CPU stops.
>> +		 */
>> +		while (true) {
>>  			cpu_status = smp_query_cpu_stopped(pcpu);
>>  			if (cpu_status == QCSS_STOPPED ||
>>  			    cpu_status == QCSS_HARDWARE_ERROR)
>>  				break;
>> -			cpu_relax();
>> +			udelay(100);
>>  		}
>>  	}
>
> I agree with looping indefinitely but doesn't it need a cond_resched()
> or similar check?

If there's no kernel or hypervisor bug, it shouldn't take more than a
few tens of ms for this loop to complete (Gautham measured a maximum of
10 ms on a POWER9 with an earlier version of this patch).

In case of bugs related to CPU hotplug (either in the kernel or the
hypervisor), I was hoping that the resulting lockup warnings would be a
good indicator that something is wrong. :-)

Though perhaps adding a cond_resched() every 10 ms or so, with a
WARN_ON() if it loops for more than 50 ms would be better.

I'll send an alternative patch.

--
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [PATCH] powerpc: vdso: drop unnecessary cc-ldoption
From: Nick Desaulniers @ 2019-04-30 20:25 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Joel Stanley
  Cc: Masahiro Yamada, LKML, Nicholas Piggin, clang-built-linux,
	Andrew Donnellan, linuxppc-dev, Dmitry Vyukov
In-Reply-To: <20190423211116.261111-1-ndesaulniers@google.com>

On Tue, Apr 23, 2019 at 2:11 PM Nick Desaulniers
<ndesaulniers@google.com> wrote:
>
> Towards the goal of removing cc-ldoption, it seems that --hash-style=
> was added to binutils 2.17.50.0.2 in 2006. The minimal required version
> of binutils for the kernel according to
> Documentation/process/changes.rst is 2.20.
>
> Link: https://gcc.gnu.org/ml/gcc/2007-01/msg01141.html
> Cc: clang-built-linux@googlegroups.com
> Suggested-by: Masahiro Yamada <yamada.masahiro@socionext.com>
> Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
> ---
>  arch/powerpc/kernel/vdso32/Makefile | 5 ++---
>  arch/powerpc/kernel/vdso64/Makefile | 5 ++---
>  2 files changed, 4 insertions(+), 6 deletions(-)
>
> diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile
> index ce199f6e4256..06f54d947057 100644
> --- a/arch/powerpc/kernel/vdso32/Makefile
> +++ b/arch/powerpc/kernel/vdso32/Makefile
> @@ -26,9 +26,8 @@ GCOV_PROFILE := n
>  KCOV_INSTRUMENT := n
>  UBSAN_SANITIZE := n
>
> -ccflags-y := -shared -fno-common -fno-builtin
> -ccflags-y += -nostdlib -Wl,-soname=linux-vdso32.so.1 \
> -               $(call cc-ldoption, -Wl$(comma)--hash-style=both)
> +ccflags-y := -shared -fno-common -fno-builtin -nostdlib \
> +       -Wl,-soname=linux-vdso32.so.1 -Wl,--hash-style=both
>  asflags-y := -D__VDSO32__ -s
>
>  obj-y += vdso32_wrapper.o
> diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile
> index 28e7d112aa2f..32ebb3522ea1 100644
> --- a/arch/powerpc/kernel/vdso64/Makefile
> +++ b/arch/powerpc/kernel/vdso64/Makefile
> @@ -12,9 +12,8 @@ GCOV_PROFILE := n
>  KCOV_INSTRUMENT := n
>  UBSAN_SANITIZE := n
>
> -ccflags-y := -shared -fno-common -fno-builtin
> -ccflags-y += -nostdlib -Wl,-soname=linux-vdso64.so.1 \
> -               $(call cc-ldoption, -Wl$(comma)--hash-style=both)
> +ccflags-y := -shared -fno-common -fno-builtin -nostdlib \
> +       -Wl,-soname=linux-vdso64.so.1 -Wl,--hash-style=both
>  asflags-y := -D__VDSO64__ -s
>
>  obj-y += vdso64_wrapper.o
> --
> 2.21.0.593.g511ec345e18-goog
>

bumping for review
-- 
Thanks,
~Nick Desaulniers

^ permalink raw reply

* Re: [PATCH] powerpc: Fix kobject memleak
From: Tyrel Datwyler @ 2019-04-30 20:37 UTC (permalink / raw)
  To: Tobin C. Harding, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman
  Cc: Greg Kroah-Hartman, linuxppc-dev, linux-kernel
In-Reply-To: <20190430010923.17092-1-tobin@kernel.org>

On 04/29/2019 06:09 PM, Tobin C. Harding wrote:
> Currently error return from kobject_init_and_add() is not followed by a
> call to kobject_put().  This means there is a memory leak.
> 
> Add call to kobject_put() in error path of kobject_init_and_add().
> 
> Signed-off-by: Tobin C. Harding <tobin@kernel.org>
> ---

Reviewed-by: Tyrel Datwyler <tyreld@linux.ibm.com>


^ permalink raw reply

* Re: [PATCH 41/41] drivers: tty: serial: lpc32xx_hs: fill mapsize and use it
From: Vladimir Zapolskiy @ 2019-04-30 20:52 UTC (permalink / raw)
  To: Enrico Weigelt, metux IT consult, linux-kernel
  Cc: sudeep.holla, linux-ia64, linux-serial, andrew, gregkh,
	slemieux.tyco, liviu.dudau, linux-mips, linux, sparclinux,
	lorenzo.pieralisi, macro, khilman, matthias.bgg, jacmet,
	linux-amlogic, andriy.shevchenko, linuxppc-dev, davem
In-Reply-To: <1556369542-13247-42-git-send-email-info@metux.net>

Hi Enrico,

On 04/27/2019 03:52 PM, Enrico Weigelt, metux IT consult wrote:
> Fill the struct uart_port->mapsize field and use it, insteaf of

typo, s/insteaf/instead/

> hardcoded values in many places. This makes the code layout a bit
> more consistent and easily allows using generic helpers for the
> io memory handling.
> 
> Candidates for such helpers could be eg. the request+ioremap and
> iounmap+release combinations.
> 
> Signed-off-by: Enrico Weigelt <info@metux.net>

Acked-by: Vladimir Zapolskiy <vz@mleia.com>

--
Best wishes,
Vladimir

^ permalink raw reply

* Re: [PATCH] powerpc: vdso: drop unnecessary cc-ldoption
From: Nicholas Piggin @ 2019-04-30 22:52 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Joel Stanley, Michael Ellerman,
	Nick Desaulniers, Paul Mackerras
  Cc: clang-built-linux, LKML, Masahiro Yamada, Andrew Donnellan,
	linuxppc-dev, Dmitry Vyukov
In-Reply-To: <CAKwvOd=dBLXQUzv8R3-JqF=pUTH0-5O3v+_ceekT3W23VxtDbg@mail.gmail.com>

Nick Desaulniers's on May 1, 2019 6:25 am:
> On Tue, Apr 23, 2019 at 2:11 PM Nick Desaulniers
> <ndesaulniers@google.com> wrote:
>>
>> Towards the goal of removing cc-ldoption, it seems that --hash-style=
>> was added to binutils 2.17.50.0.2 in 2006. The minimal required version
>> of binutils for the kernel according to
>> Documentation/process/changes.rst is 2.20.
>>
>> Link: https://gcc.gnu.org/ml/gcc/2007-01/msg01141.html
>> Cc: clang-built-linux@googlegroups.com
>> Suggested-by: Masahiro Yamada <yamada.masahiro@socionext.com>
>> Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
>> ---
>>  arch/powerpc/kernel/vdso32/Makefile | 5 ++---
>>  arch/powerpc/kernel/vdso64/Makefile | 5 ++---
>>  2 files changed, 4 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile
>> index ce199f6e4256..06f54d947057 100644
>> --- a/arch/powerpc/kernel/vdso32/Makefile
>> +++ b/arch/powerpc/kernel/vdso32/Makefile
>> @@ -26,9 +26,8 @@ GCOV_PROFILE := n
>>  KCOV_INSTRUMENT := n
>>  UBSAN_SANITIZE := n
>>
>> -ccflags-y := -shared -fno-common -fno-builtin
>> -ccflags-y += -nostdlib -Wl,-soname=linux-vdso32.so.1 \
>> -               $(call cc-ldoption, -Wl$(comma)--hash-style=both)
>> +ccflags-y := -shared -fno-common -fno-builtin -nostdlib \
>> +       -Wl,-soname=linux-vdso32.so.1 -Wl,--hash-style=both
>>  asflags-y := -D__VDSO32__ -s
>>
>>  obj-y += vdso32_wrapper.o
>> diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile
>> index 28e7d112aa2f..32ebb3522ea1 100644
>> --- a/arch/powerpc/kernel/vdso64/Makefile
>> +++ b/arch/powerpc/kernel/vdso64/Makefile
>> @@ -12,9 +12,8 @@ GCOV_PROFILE := n
>>  KCOV_INSTRUMENT := n
>>  UBSAN_SANITIZE := n
>>
>> -ccflags-y := -shared -fno-common -fno-builtin
>> -ccflags-y += -nostdlib -Wl,-soname=linux-vdso64.so.1 \
>> -               $(call cc-ldoption, -Wl$(comma)--hash-style=both)
>> +ccflags-y := -shared -fno-common -fno-builtin -nostdlib \
>> +       -Wl,-soname=linux-vdso64.so.1 -Wl,--hash-style=both
>>  asflags-y := -D__VDSO64__ -s
>>
>>  obj-y += vdso64_wrapper.o
>> --
>> 2.21.0.593.g511ec345e18-goog
>>
> 
> bumping for review

This looks like a good cleanup.

Reviewed-by: Nicholas Piggin <npiggin@gmail.com>


^ permalink raw reply

* Re: [PATCH v2] powerpc/32s: fix BATs setting with CONFIG_STRICT_KERNEL_RWX
From: Michael Ellerman @ 2019-05-01  0:55 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras,
	Serge Belyshev, Segher Boessenkool
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <09733bd9d90f2ab9dfee9838442e0bea01df194d.1556640535.git.christophe.leroy@c-s.fr>

Christophe Leroy <christophe.leroy@c-s.fr> writes:
> Serge reported some crashes with CONFIG_STRICT_KERNEL_RWX enabled
> on a book3s32 machine.
>
> Analysis shows two issues:
> - BATs addresses and sizes are not properly aligned.
> - There is a gap between the last address covered by BATs and the
> first address covered by pages.
>
> Memory mapped with DBATs:
> 0: 0xc0000000-0xc07fffff 0x00000000 Kernel RO coherent
> 1: 0xc0800000-0xc0bfffff 0x00800000 Kernel RO coherent
> 2: 0xc0c00000-0xc13fffff 0x00c00000 Kernel RW coherent
> 3: 0xc1400000-0xc23fffff 0x01400000 Kernel RW coherent
> 4: 0xc2400000-0xc43fffff 0x02400000 Kernel RW coherent
> 5: 0xc4400000-0xc83fffff 0x04400000 Kernel RW coherent
> 6: 0xc8400000-0xd03fffff 0x08400000 Kernel RW coherent
> 7: 0xd0400000-0xe03fffff 0x10400000 Kernel RW coherent
>
> Memory mapped with pages:
> 0xe1000000-0xefffffff  0x21000000       240M        rw       present           dirty  accessed
>
> This patch fixes both issues. With the patch, we get the following
> which is as expected:
>
> Memory mapped with DBATs:
> 0: 0xc0000000-0xc07fffff 0x00000000 Kernel RO coherent
> 1: 0xc0800000-0xc0bfffff 0x00800000 Kernel RO coherent
> 2: 0xc0c00000-0xc0ffffff 0x00c00000 Kernel RW coherent
> 3: 0xc1000000-0xc1ffffff 0x01000000 Kernel RW coherent
> 4: 0xc2000000-0xc3ffffff 0x02000000 Kernel RW coherent
> 5: 0xc4000000-0xc7ffffff 0x04000000 Kernel RW coherent
> 6: 0xc8000000-0xcfffffff 0x08000000 Kernel RW coherent
> 7: 0xd0000000-0xdfffffff 0x10000000 Kernel RW coherent
>
> Memory mapped with pages:
> 0xe0000000-0xefffffff  0x20000000       256M        rw       present           dirty  accessed
>
> Reported-by: Serge Belyshev <belyshev@depni.sinp.msu.ru>
> Fixes: 63b2bc619565 ("powerpc/mm/32s: Use BATs for STRICT_KERNEL_RWX")
> Cc: stable@vger.kernel.org

I could probably still get this into v5.1 if you're confident it's a
good fix.

cheers

^ permalink raw reply

* Re: [PATCH] powerpc/mm/radix: Fix kernel crash when running subpage protect test
From: Michael Ellerman @ 2019-05-01  1:44 UTC (permalink / raw)
  To: Aneesh Kumar K.V, npiggin, paulus
  Cc: Sachin Sant, Aneesh Kumar K.V, linuxppc-dev
In-Reply-To: <20190430075907.7701-1-aneesh.kumar@linux.ibm.com>

"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

> This patch fixes the below crash by making sure we touch the subpage protection
> related structures only if we know they are allocated on the platform. With
> radix translation we don't allocate hash context at all and trying to access
> subpage_prot_table results in
>
>  Faulting instruction address: 0xc00000000008bdb4
>  Oops: Kernel access of bad area, sig: 11 [#1]
>  LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
>  ....
>  NIP [c00000000008bdb4] sys_subpage_prot+0x74/0x590
>  LR [c00000000000b688] system_call+0x5c/0x70
>  Call Trace:
>  [c00020002c6b7d30] [c00020002c6b7d90] 0xc00020002c6b7d90 (unreliable)
>  [c00020002c6b7e20] [c00000000000b688] system_call+0x5c/0x70
>  Instruction dump:
>  fb61ffd8 fb81ffe0 fba1ffe8 fbc1fff0 fbe1fff8 f821ff11 e92d1178 f9210068
>  39200000 e92d0968 ebe90630 e93f03e8 <eb891038> 60000000 3860fffe e9410068
>
> We also move the subpage_prot_table with mmp_sem held to avoid racec
> between two parallel subpage_prot syscall.
>
> Reported-by: Sachin Sant <sachinp@linux.vnet.ibm.com>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

Presumably it was:

701101865f5d ("powerpc/mm: Reduce memory usage for mm_context_t for radix")

That caused the breakage?

cheers

^ permalink raw reply

* Re: [PATCH 01/41] drivers: tty: serial: dz: use dev_err() instead of printk()
From: Maciej W. Rozycki @ 2019-05-01  2:20 UTC (permalink / raw)
  To: Greg KH
  Cc: linux-ia64, lorenzo.pieralisi, linux-mips, sparclinux, andrew,
	khilman, sudeep.holla, liviu.dudau, linux-kernel, vz, linux,
	linuxppc-dev, Enrico Weigelt, metux IT consult, linux-serial,
	slemieux.tyco, matthias.bgg, jacmet, linux-amlogic,
	andriy.shevchenko, Enrico Weigelt, metux IT consult,
	David S. Miller
In-Reply-To: <20190429131224.GA27385@kroah.com>

On Mon, 29 Apr 2019, Greg KH wrote:

> > >>  drivers/tty/serial/dz.c | 8 ++++----
> > > 
> > > Do you have this hardware to test any of these changes with?
> > 
> > Unfortunately not :(
> 
> Then I can take the "basic" types of patches for the driver (like this
> one), but not any others, sorry.

 I can verify changes to dz.c, sb1250-duart.c and zs.c with real hardware, 
but regrettably not right away: the hardware is in a remote location and 
while I have it wired for remote operation unfortunately its connectivity 
has been cut off by an unfriendly ISP.

 I'm not sure if all the changes make sense though: if there is a compiler 
warning or a usability issue, then a patch is surely welcome, otherwise: 
"If it ain't broke, don't fix it".

  Maciej

^ permalink raw reply

* Re: [PATCH 06/41] drivers: tty: serial: sb1250-duart: use dev_err() instead of printk()
From: Maciej W. Rozycki @ 2019-05-01  2:29 UTC (permalink / raw)
  To: Enrico Weigelt, metux IT consult
  Cc: linux-ia64, lorenzo.pieralisi, linux-mips, andrew,
	Greg Kroah-Hartman, sudeep.holla, liviu.dudau, linux-kernel, vz,
	linux, sparclinux, khilman, linux-serial, slemieux.tyco,
	matthias.bgg, jacmet, linux-amlogic, andriy.shevchenko,
	linuxppc-dev, David S. Miller
In-Reply-To: <1556369542-13247-7-git-send-email-info@metux.net>

On Sat, 27 Apr 2019, Enrico Weigelt, metux IT consult wrote:

> diff --git a/drivers/tty/serial/sb1250-duart.c b/drivers/tty/serial/sb1250-duart.c
> index 329aced..655961c 100644
> --- a/drivers/tty/serial/sb1250-duart.c
> +++ b/drivers/tty/serial/sb1250-duart.c
> @@ -663,7 +663,6 @@ static void sbd_release_port(struct uart_port *uport)
>  
>  static int sbd_map_port(struct uart_port *uport)
>  {
> -	const char *err = KERN_ERR "sbd: Cannot map MMIO\n";
>  	struct sbd_port *sport = to_sport(uport);
>  	struct sbd_duart *duart = sport->duart;
>  
> @@ -671,7 +670,7 @@ static int sbd_map_port(struct uart_port *uport)
>  		uport->membase = ioremap_nocache(uport->mapbase,
>  						 DUART_CHANREG_SPACING);
>  	if (!uport->membase) {
> -		printk(err);
> +		dev_err(uport->dev, "Cannot map MMIO (base)\n");
>  		return -ENOMEM;
>  	}
>  
> @@ -679,7 +678,7 @@ static int sbd_map_port(struct uart_port *uport)
>  		sport->memctrl = ioremap_nocache(duart->mapctrl,
>  						 DUART_CHANREG_SPACING);
>  	if (!sport->memctrl) {
> -		printk(err);
> +		dev_err(uport->dev, "Cannot map MMIO (ctrl)\n");
>  		iounmap(uport->membase);
>  		uport->membase = NULL;
>  		return -ENOMEM;

 Hmm, what's the point to have separate messages, which consume extra 
memory, for a hardly if at all possible error condition?

  Maciej

^ permalink raw reply

* [PATCH kernel] prom_init: Fetch flatten device tree from the system firmware
From: Alexey Kardashevskiy @ 2019-05-01  3:42 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Alexey Kardashevskiy, Suraj Jitindar Singh, David Gibson

At the moment, on 256CPU + 256 PCI devices guest, it takes the guest
about 8.5sec to fetch the entire device tree via the client interface
as the DT is traversed twice - for strings blob and for struct blob.
Also, "getprop" is quite slow too as SLOF stores properties in a linked
list.

However, since [1] SLOF builds flattened device tree (FDT) for another
purpose. [2] adds a new "fdt-fetch" client interface for the OS to fetch
the FDT.

This tries the new method; if not supported, this falls back to
the old method.

There is a change in the FDT layout - the old method produced
(reserved map, strings, structs), the new one receives only strings and
structs from the firmware and adds the final reserved map to the end,
so it is (fw reserved map, strings, structs, reserved map).
This still produces the same unflattened device tree.

This merges the reserved map from the firmware into the kernel's reserved
map. At the moment SLOF generates an empty reserved map so this does not
change the existing behaviour in regard of reservations.

This supports only v17 onward as only that version provides dt_struct_size
which works as "fdt-fetch" only produces v17 blobs.

If "fdt-fetch" is not available, the old method of fetching the DT is used.

[1] https://git.qemu.org/?p=SLOF.git;a=commitdiff;h=e6fc84652c9c00
[2] https://git.qemu.org/?p=SLOF.git;a=commit;h=ecda95906930b80

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/kernel/prom_init.c | 43 +++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index f33ff4163a51..72e7a602b68e 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -2457,6 +2457,48 @@ static void __init flatten_device_tree(void)
 		prom_panic("Can't allocate initial device-tree chunk\n");
 	mem_end = mem_start + room;

+	hdr = (void *) mem_start;
+	if (!call_prom_ret("fdt-fetch", 2, 1, NULL, mem_start,
+				room - sizeof(mem_reserve_map)) &&
+			hdr->version >= 17) {
+		u32 size;
+		struct mem_map_entry *fwrmap;
+
+		/* Fixup the boot cpuid */
+		hdr->boot_cpuid_phys = cpu_to_be32(prom.cpu);
+
+		/*
+		 * Store the struct and strings addresses, mostly
+		 * for consistency, only dt_header_start actually matters later.
+		 */
+		dt_header_start = mem_start;
+		dt_string_start = mem_start + be32_to_cpu(hdr->off_dt_strings);
+		dt_string_end = dt_string_start +
+			be32_to_cpu(hdr->dt_strings_size);
+		dt_struct_start = mem_start + be32_to_cpu(hdr->off_dt_struct);
+		dt_struct_end = dt_struct_start +
+			be32_to_cpu(hdr->dt_struct_size);
+
+		/*
+		 * Calculate the reserved map location (which we put
+		 * at the blob end) and update total size.
+		 */
+		fwrmap = (void *)(mem_start + be32_to_cpu(hdr->off_mem_rsvmap));
+		hdr->off_mem_rsvmap = hdr->totalsize;
+		size = be32_to_cpu(hdr->totalsize);
+		hdr->totalsize = cpu_to_be32(size + sizeof(mem_reserve_map));
+
+		/* Merge reserved map from firmware to ours */
+		for ( ; fwrmap->size; ++fwrmap)
+			reserve_mem(be64_to_cpu(fwrmap->base),
+					be64_to_cpu(fwrmap->size));
+
+		rsvmap = (u64 *)(mem_start + size);
+
+		prom_debug("Fetched DTB: %d bytes to @%lx\n", size, mem_start);
+		goto finalize_exit;
+	}
+
 	/* Get root of tree */
 	root = call_prom("peer", 1, 1, (phandle)0);
 	if (root == (phandle)0)
@@ -2504,6 +2546,7 @@ static void __init flatten_device_tree(void)
 	/* Version 16 is not backward compatible */
 	hdr->last_comp_version = cpu_to_be32(0x10);

+finalize_exit:
 	/* Copy the reserve map in */
 	memcpy(rsvmap, mem_reserve_map, sizeof(mem_reserve_map));

-- 
2.17.1

^ permalink raw reply related

* [PATCH kernel v2 0/2] powerpc/ioda2: Another attempt to allow DMA masks between 32 and 59
From: Alexey Kardashevskiy @ 2019-05-01  5:28 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Alexey Kardashevskiy, Alistair Popple, Oliver O'Halloran,
	David Gibson

This is an attempt to allow DMA masks between 32..59 which are not large
enough to use either a PHB3 bypass mode or a sketchy bypass. Depending
on the max order, up to 40 is usually available.


This is based on sha1
37624b58542f Linus Torvalds "Linux 5.1-rc7".

Please comment. Thanks.



Alexey Kardashevskiy (2):
  powerpc/powernv/ioda2: Allocate TCE table levels on demand for default
    DMA window
  powerpc/powernv/ioda2: Create bigger default window with 64k IOMMU
    pages

 arch/powerpc/include/asm/iommu.h              |  8 ++-
 arch/powerpc/platforms/powernv/pci.h          |  2 +-
 arch/powerpc/kernel/iommu.c                   | 58 +++++++++++++------
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 20 +++----
 arch/powerpc/platforms/powernv/pci-ioda.c     | 40 +++++++++++--
 5 files changed, 90 insertions(+), 38 deletions(-)

-- 
2.17.1


^ permalink raw reply

* [PATCH kernel v2 1/2] powerpc/powernv/ioda2: Allocate TCE table levels on demand for default DMA window
From: Alexey Kardashevskiy @ 2019-05-01  5:28 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Alexey Kardashevskiy, Alistair Popple, Oliver O'Halloran,
	David Gibson
In-Reply-To: <20190501052822.64667-1-aik@ozlabs.ru>

We allocate only the first level of multilevel TCE tables for KVM
already (alloc_userspace_copy==true), and the rest is allocated on demand.
This is not enabled though for baremetal.

This removes the KVM limitation (implicit, via the alloc_userspace_copy
parameter) and always allocates just the first level. The on-demand
allocation of missing levels is already implemented.

As from now on DMA map might happen with disabled interrupts, this
allocates TCEs with GFP_ATOMIC.

To save time when creating a new clean table, this skips non-allocated
indirect TCE entries in pnv_tce_free just like we already do in
the VFIO IOMMU TCE driver.

This changes the default level number from 1 to 2 to reduce the amount
of memory required for the default 32bit DMA window at the boot time.
The default window size is up to 2GB which requires 4MB of TCEs which is
unlikely to be used entirely or at all as most devices these days are
64bit capable so by switching to 2 levels by default we save 4032KB of
RAM per a device.

While at this, add __GFP_NOWARN to alloc_pages_node() as the userspace
can trigger this path via VFIO, see the failure and try creating a table
again with different parameters which might succeed.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v2:
* added __GFP_NOWARN to alloc_pages_node
---
 arch/powerpc/platforms/powernv/pci.h          |  2 +-
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 20 +++++++++----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 8e36da379252..f44987b90ac2 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -223,7 +223,7 @@ extern struct iommu_table_group *pnv_npu_compound_attach(
 		struct pnv_ioda_pe *pe);
 
 /* pci-ioda-tce.c */
-#define POWERNV_IOMMU_DEFAULT_LEVELS	1
+#define POWERNV_IOMMU_DEFAULT_LEVELS	2
 #define POWERNV_IOMMU_MAX_LEVELS	5
 
 extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index e28f03e1eb5e..c75ec37bf0cd 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -36,7 +36,8 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
 	struct page *tce_mem = NULL;
 	__be64 *addr;
 
-	tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT);
+	tce_mem = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN,
+			shift - PAGE_SHIFT);
 	if (!tce_mem) {
 		pr_err("Failed to allocate a TCE memory, level shift=%d\n",
 				shift);
@@ -161,6 +162,9 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 
 		if (ptce)
 			*ptce = cpu_to_be64(0);
+		else
+			/* Skip the rest of the level */
+			i |= tbl->it_level_size - 1;
 	}
 }
 
@@ -260,7 +264,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	unsigned int table_shift = max_t(unsigned int, entries_shift + 3,
 			PAGE_SHIFT);
 	const unsigned long tce_table_size = 1UL << table_shift;
-	unsigned int tmplevels = levels;
 
 	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
 		return -EINVAL;
@@ -268,9 +271,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	if (!is_power_of_2(window_size))
 		return -EINVAL;
 
-	if (alloc_userspace_copy && (window_size > (1ULL << 32)))
-		tmplevels = 1;
-
 	/* Adjust direct table size from window_size and levels */
 	entries_shift = (entries_shift + levels - 1) / levels;
 	level_shift = entries_shift + 3;
@@ -281,7 +281,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 
 	/* Allocate TCE table */
 	addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
-			tmplevels, tce_table_size, &offset, &total_allocated);
+			1, tce_table_size, &offset, &total_allocated);
 
 	/* addr==NULL means that the first level allocation failed */
 	if (!addr)
@@ -292,18 +292,18 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	 * we did not allocate as much as we wanted,
 	 * release partially allocated table.
 	 */
-	if (tmplevels == levels && offset < tce_table_size)
+	if (levels == 1 && offset < tce_table_size)
 		goto free_tces_exit;
 
 	/* Allocate userspace view of the TCE table */
 	if (alloc_userspace_copy) {
 		offset = 0;
 		uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
-				tmplevels, tce_table_size, &offset,
+				1, tce_table_size, &offset,
 				&total_allocated_uas);
 		if (!uas)
 			goto free_tces_exit;
-		if (tmplevels == levels && (offset < tce_table_size ||
+		if (levels == 1 && (offset < tce_table_size ||
 				total_allocated_uas != total_allocated))
 			goto free_uas_exit;
 	}
@@ -318,7 +318,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 
 	pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n",
 			window_size, tce_table_size, bus_offset, tbl->it_base,
-			tbl->it_userspace, tmplevels, levels);
+			tbl->it_userspace, 1, levels);
 
 	return 0;
 
-- 
2.17.1


^ permalink raw reply related

* [PATCH kernel v2 2/2] powerpc/powernv/ioda2: Create bigger default window with 64k IOMMU pages
From: Alexey Kardashevskiy @ 2019-05-01  5:28 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Alexey Kardashevskiy, Alistair Popple, Oliver O'Halloran,
	David Gibson
In-Reply-To: <20190501052822.64667-1-aik@ozlabs.ru>

At the moment we create a small window only for 32bit devices, the window
maps 0..2GB of the PCI space only. For other devices we either use
a sketchy bypass or hardware bypass but the former can only work if
the amount of RAM is no bigger than the device's DMA mask and the latter
requires devices to support at least 59bit DMA.

This extends the default DMA window to the maximum size possible to allow
a wider DMA mask than just 32bit. The default window size is now limited
by the the iommu_table::it_map allocation bitmap which is a contiguous
array, 1 bit per an IOMMU page.

This increases the default IOMMU page size from hard coded 4K to
the system page size to allow wider DMA masks.

This increases the level number to not exceed the max order allocation
limit per TCE level. By the same time, this keeps minimal levels number
as 2 in order to save memory.

As the extended window now overlaps the 32bit MMIO region, this adds
an area reservation to iommu_init_table().

After this change the default window size is 0x80000000000==1<<43 so
devices limited to DMA mask smaller than the amount of system RAM can
still use more than just 2GB of memory for DMA.

With the on-demand allocation of indirect TCE table levels enabled and
2 levels, the first TCE level size is just
1<<ceil((log2(0x7ffffffffff+1)-16)/2)=16384 TCEs or 2 system pages.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v2:
* adjusted level number to the max order
---
 arch/powerpc/include/asm/iommu.h          |  8 +++-
 arch/powerpc/kernel/iommu.c               | 58 +++++++++++++++--------
 arch/powerpc/platforms/powernv/pci-ioda.c | 40 +++++++++++++---
 3 files changed, 79 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 0ac52392ed99..5ea782e04803 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -124,6 +124,8 @@ struct iommu_table {
 	struct iommu_table_ops *it_ops;
 	struct kref    it_kref;
 	int it_nid;
+	unsigned long it_reserved_start; /* Start of not-DMA-able (MMIO) area */
+	unsigned long it_reserved_end;
 };
 
 #define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \
@@ -162,8 +164,10 @@ extern int iommu_tce_table_put(struct iommu_table *tbl);
 /* Initializes an iommu_table based in values set in the passed-in
  * structure
  */
-extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
-					    int nid);
+extern struct iommu_table *iommu_init_table_res(struct iommu_table *tbl,
+		int nid, unsigned long res_start, unsigned long res_end);
+#define iommu_init_table(tbl, nid) iommu_init_table_res((tbl), (nid), 0, 0)
+
 #define IOMMU_TABLE_GROUP_MAX_TABLES	2
 
 struct iommu_table_group;
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 33bbd59cff79..209306ce7f4b 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -646,11 +646,43 @@ static void iommu_table_clear(struct iommu_table *tbl)
 #endif
 }
 
+static void iommu_table_reserve_pages(struct iommu_table *tbl)
+{
+	int i;
+
+	/*
+	 * Reserve page 0 so it will not be used for any mappings.
+	 * This avoids buggy drivers that consider page 0 to be invalid
+	 * to crash the machine or even lose data.
+	 */
+	if (tbl->it_offset == 0)
+		set_bit(0, tbl->it_map);
+
+	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
+		set_bit(i, tbl->it_map);
+}
+
+static void iommu_table_release_pages(struct iommu_table *tbl)
+{
+	int i;
+
+	/*
+	 * In case we have reserved the first bit, we should not emit
+	 * the warning below.
+	 */
+	if (tbl->it_offset == 0)
+		clear_bit(0, tbl->it_map);
+
+	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
+		clear_bit(i, tbl->it_map);
+}
+
 /*
  * Build a iommu_table structure.  This contains a bit map which
  * is used to manage allocation of the tce space.
  */
-struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
+struct iommu_table *iommu_init_table_res(struct iommu_table *tbl, int nid,
+		unsigned long res_start, unsigned long res_end)
 {
 	unsigned long sz;
 	static int welcomed = 0;
@@ -669,13 +701,9 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	tbl->it_map = page_address(page);
 	memset(tbl->it_map, 0, sz);
 
-	/*
-	 * Reserve page 0 so it will not be used for any mappings.
-	 * This avoids buggy drivers that consider page 0 to be invalid
-	 * to crash the machine or even lose data.
-	 */
-	if (tbl->it_offset == 0)
-		set_bit(0, tbl->it_map);
+	tbl->it_reserved_start = res_start;
+	tbl->it_reserved_end = res_end;
+	iommu_table_reserve_pages(tbl);
 
 	/* We only split the IOMMU table if we have 1GB or more of space */
 	if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024))
@@ -727,12 +755,7 @@ static void iommu_table_free(struct kref *kref)
 		return;
 	}
 
-	/*
-	 * In case we have reserved the first bit, we should not emit
-	 * the warning below.
-	 */
-	if (tbl->it_offset == 0)
-		clear_bit(0, tbl->it_map);
+	iommu_table_release_pages(tbl);
 
 	/* verify that table contains no entries */
 	if (!bitmap_empty(tbl->it_map, tbl->it_size))
@@ -1037,8 +1060,7 @@ int iommu_take_ownership(struct iommu_table *tbl)
 	for (i = 0; i < tbl->nr_pools; i++)
 		spin_lock(&tbl->pools[i].lock);
 
-	if (tbl->it_offset == 0)
-		clear_bit(0, tbl->it_map);
+	iommu_table_reserve_pages(tbl);
 
 	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
 		pr_err("iommu_tce: it_map is not empty");
@@ -1068,9 +1090,7 @@ void iommu_release_ownership(struct iommu_table *tbl)
 
 	memset(tbl->it_map, 0, sz);
 
-	/* Restore bit#0 set by iommu_init_table() */
-	if (tbl->it_offset == 0)
-		set_bit(0, tbl->it_map);
+	iommu_table_release_pages(tbl);
 
 	for (i = 0; i < tbl->nr_pools; i++)
 		spin_unlock(&tbl->pools[i].lock);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3ead4c237ed0..d4fd23fc7b86 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2421,6 +2421,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 {
 	struct iommu_table *tbl = NULL;
 	long rc;
+	unsigned long res_start, res_end;
 
 	/*
 	 * crashkernel= specifies the kdump kernel's maximum memory at
@@ -2434,19 +2435,46 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 	 * DMA window can be larger than available memory, which will
 	 * cause errors later.
 	 */
-	const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory);
+	const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1);
 
-	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
-			IOMMU_PAGE_SHIFT_4K,
-			window_size,
-			POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl);
+	/*
+	 * We create the default window as big as we can. The constraint is
+	 * the max order of allocation possible. The TCE tableis likely to
+	 * end up being multilevel and with on-demand allocation in place,
+	 * the initial use is not going to be huge as the default window aims
+	 * to support cripplied devices (i.e. not fully 64bit DMAble) only.
+	 */
+	/* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */
+	const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory);
+	/* Each TCE level cannot exceed maxblock so go multilevel if needed */
+	unsigned long tces = window_size >> PAGE_SHIFT;
+	unsigned long tces_level = maxblock >> 3;
+	unsigned int levels = tces / tces_level;
+
+	if (tces % tces_level)
+		levels += 1;
+	/*
+	 * We try to stick to default levels (which is >1 at the moment) in
+	 * order to save memory by relying on on-demain TCE level allocation.
+	 */
+	levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS);
+
+	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT,
+			window_size, levels, false, &tbl);
 	if (rc) {
 		pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
 				rc);
 		return rc;
 	}
 
-	iommu_init_table(tbl, pe->phb->hose->node);
+	/* We use top part of 32bit space for MMIO so exclude it from DMA */
+	res_start = 0;
+	res_end = 0;
+	if (window_size > pe->phb->ioda.m32_pci_base) {
+		res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
+		res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
+	}
+	iommu_init_table_res(tbl, pe->phb->hose->node, res_start, res_end);
 
 	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
 	if (rc) {
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH] crypto: caam/jr - Remove extra memory barrier during job ring dequeue
From: Michael Ellerman @ 2019-05-01  5:49 UTC (permalink / raw)
  To: vakul.garg
  Cc: aymen.sghaier, herbert, horia.geanta, linux-crypto, linuxppc-dev,
	davem

Vakul Garg wrote:
> In function caam_jr_dequeue(), a full memory barrier is used before
> writing response job ring's register to signal removal of the completed
> job. Therefore for writing the register, we do not need another write
> memory barrier. Hence it is removed by replacing the call to wr_reg32()
> with a newly defined function wr_reg32_relaxed().
> 
> Signed-off-by: Vakul Garg <vakul.garg@nxp.com>
> ---
>  drivers/crypto/caam/jr.c   | 2 +-
>  drivers/crypto/caam/regs.h | 8 ++++++++
>  2 files changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/crypto/caam/jr.c b/drivers/crypto/caam/jr.c
> index 4e9b3fca5627..2ce6d7d2ad72 100644
> --- a/drivers/crypto/caam/jr.c
> +++ b/drivers/crypto/caam/jr.c
> @@ -266,7 +266,7 @@ static void caam_jr_dequeue(unsigned long devarg)
>  		mb();
>  
>  		/* set done */
> -		wr_reg32(&jrp->rregs->outring_rmvd, 1);
> +		wr_reg32_relaxed(&jrp->rregs->outring_rmvd, 1);
>  
>  		jrp->out_ring_read_index = (jrp->out_ring_read_index + 1) &
>  					   (JOBR_DEPTH - 1);
> diff --git a/drivers/crypto/caam/regs.h b/drivers/crypto/caam/regs.h
> index 3cd0822ea819..9e912c722e33 100644
> --- a/drivers/crypto/caam/regs.h
> +++ b/drivers/crypto/caam/regs.h
> @@ -96,6 +96,14 @@ cpu_to_caam(16)
>  cpu_to_caam(32)
>  cpu_to_caam(64)
>  
> +static inline void wr_reg32_relaxed(void __iomem *reg, u32 data)
> +{
> +	if (caam_little_end)
> +		writel_relaxed(data, reg);
> +	else
> +		writel_relaxed(cpu_to_be32(data), reg);
> +}
> +
>  static inline void wr_reg32(void __iomem *reg, u32 data)
>  {
>  	if (caam_little_end)

This crashes on my p5020ds. Did you test on powerpc?

# first bad commit: [bbfcac5ff5f26aafa51935a62eb86b6eacfe8a49] crypto: caam/jr - Remove extra memory barrier during job ring dequeue

Log:

  ------------[ cut here ]------------
  kernel BUG at drivers/crypto/caam/jr.c:191!
  Oops: Exception in kernel mode, sig: 5 [#1]
  BE PAGE_SIZE=4K SMP NR_CPUS=24 CoreNet Generic
  Modules linked in:
  CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.1.0-rc1-gcc-8.2.0-00060-gbbfcac5ff5f2 #31
  NIP:  c00000000079d704 LR: c00000000079d498 CTR: c000000000086914
  REGS: c0000000fffc7970 TRAP: 0700   Not tainted  (5.1.0-rc1-gcc-8.2.0-00060-gbbfcac5ff5f2)
  MSR:  0000000080029000 <CE,EE,ME>  CR: 28008484  XER: 00000000
  IRQMASK: 0
  GPR00: c00000000079d6b0 c0000000fffc7c00 c000000000fbc800 0000000000000001
  GPR04: 000000007e080080 000000000000ffc0 0000000000000001 00000000000067d7
  GPR08: 00000000880401a9 0000000000000000 0000000000000001 00000000fa83b2da
  GPR12: 0000000028008224 c00000003ffff800 c000000000fc20b0 0000000000000100
  GPR16: 8920f09520bea117 c000000000def480 0000000000000000 0000000000000001
  GPR20: c000000000fc3940 c0000000f3537e18 0000000000000001 c000000001026cc5
  GPR24: 0000000000000001 c0000000f3328000 0000000000000001 c0000000f3451010
  GPR28: 0000000000000000 0000000000000001 0000000000000000 0000000000000000
  NIP [c00000000079d704] .caam_jr_dequeue+0x2f0/0x410
  LR [c00000000079d498] .caam_jr_dequeue+0x84/0x410
  Call Trace:
  [c0000000fffc7c00] [c00000000079d6b0] .caam_jr_dequeue+0x29c/0x410 (unreliable)
  [c0000000fffc7cd0] [c00000000004fef0] .tasklet_action_common.isra.3+0xac/0x180
  [c0000000fffc7d80] [c000000000a2f99c] .__do_softirq+0x174/0x3f8
  [c0000000fffc7e90] [c00000000004fb94] .irq_exit+0xc4/0xdc
  [c0000000fffc7f00] [c000000000007348] .__do_irq+0x8c/0x1b0
  [c0000000fffc7f90] [c0000000000150c4] .call_do_irq+0x14/0x24
  [c0000000f3137930] [c0000000000074e4] .do_IRQ+0x78/0xd4
  [c0000000f31379c0] [c000000000019998] exc_0x500_common+0xfc/0x100
  --- interrupt: 501 at .book3e_idle+0x24/0x5c
      LR = .book3e_idle+0x24/0x5c
  [c0000000f3137cc0] [c00000000000a6a4] .arch_cpu_idle+0x34/0xa0 (unreliable)
  [c0000000f3137d30] [c000000000a2f2e8] .default_idle_call+0x5c/0x70
  [c0000000f3137da0] [c000000000084210] .do_idle+0x1b0/0x1f4
  [c0000000f3137e40] [c000000000084434] .cpu_startup_entry+0x28/0x30
  [c0000000f3137eb0] [c000000000021538] .start_secondary+0x59c/0x5b0
  [c0000000f3137f90] [c00000000000045c] start_secondary_prolog+0x10/0x14
  Instruction dump:
  7d284a14 e9290018 2fa90000 40de001c 3bbd0001 57bd05fe 7d3db050 712901ff
  7fbd07b4 40e2ffcc 93b500dc 4bffff94 <0fe00000> 78890022 79270020 41d600ec
  ---[ end trace 7bedbdf37a95ab35 ]---

That's hitting:

		/* we should never fail to find a matching descriptor */
		BUG_ON(CIRC_CNT(head, tail + i, JOBR_DEPTH) <= 0);

cheers

^ permalink raw reply

* Re: [PATCH 2/5] soc/fsl/qe: qe.c: reduce static memory footprint by 1.7K
From: Rasmus Villemoes @ 2019-05-01  5:51 UTC (permalink / raw)
  To: Christophe Leroy, Qiang Zhao, Li Yang
  Cc: Rasmus Villemoes, linux-kernel@vger.kernel.org, Scott Wood,
	Rob Herring, linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org
In-Reply-To: <51b00425-58a7-089a-67a9-61cd315d5097@c-s.fr>

On 30/04/2019 19.12, Christophe Leroy wrote:
>  
> Le 30/04/2019 à 15:36, Rasmus Villemoes a écrit :
>> The current array of struct qe_snum use 256*4 bytes for just keeping
>> track of the free/used state of each index, and the struct layout
>> means there's another 768 bytes of padding. If we just unzip that
>> structure, the array of snum values just use 256 bytes, while the
>> free/inuse state can be tracked in a 32 byte bitmap.
>>
>> So this reduces the .data footprint by 1760 bytes. It also serves as
>> preparation for introducing another DT binding for specifying the snum
>> values.
>>
>> Signed-off-by: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
>> ---
>> -
>>   /* We allocate this here because it is used almost exclusively for
>>    * the communication processor devices.
>>    */
>>   struct qe_immap __iomem *qe_immr;
>>   EXPORT_SYMBOL(qe_immr);
>>   -static struct qe_snum snums[QE_NUM_OF_SNUM];    /* Dynamically
>> allocated SNUMs */
>> +static u8 snums[QE_NUM_OF_SNUM];    /* Dynamically allocated SNUMs */
>> +static DECLARE_BITMAP(snum_state, QE_NUM_OF_SNUM);
>>   static unsigned int qe_num_of_snum;
>>     static phys_addr_t qebase = -1;
>> @@ -308,6 +298,7 @@ static void qe_snums_init(void)
>>       };
>>       const u8 *snum_init;
>>   +    bitmap_zero(snum_state, QE_NUM_OF_SNUM);
> 
> Doesn't make much importance, but wouldn't it be more logical to add
> this line where the setting of .state = QE_SNUM_STATE_FREE was done
> previously, ie around the for() loop below ?

This was on purpose, to avoid having to move it up in patch 4, where we
don't necessarily reach the for loop.

>>       qe_num_of_snum = qe_get_num_of_snums();
>>         if (qe_num_of_snum == 76)
>> @@ -315,10 +306,8 @@ static void qe_snums_init(void)
>>       else
>>           snum_init = snum_init_46;
>>   -    for (i = 0; i < qe_num_of_snum; i++) {
>> -        snums[i].num = snum_init[i];
>> -        snums[i].state = QE_SNUM_STATE_FREE;
>> -    }
>> +    for (i = 0; i < qe_num_of_snum; i++)
>> +        snums[i] = snum_init[i];
> 
> Could use memcpy() instead ?

Yes, I switch to that in 5/5. Sure, I could do it here already, but I
did it this way to keep close to the current style. I don't care either
way, so if you prefer introducing memcpy here, fine by me.


>>       spin_unlock_irqrestore(&qe_lock, flags);
>>   @@ -346,8 +333,8 @@ void qe_put_snum(u8 snum)
>>       int i;
>>         for (i = 0; i < qe_num_of_snum; i++) {
>> -        if (snums[i].num == snum) {
>> -            snums[i].state = QE_SNUM_STATE_FREE;
>> +        if (snums[i] == snum) {
>> +            clear_bit(i, snum_state);
>>               break;
>>           }
>>       }
> 
> Can we replace this loop by memchr() ?

Hm, yes. So that would be

  const u8 *p = memchr(snums, snum, qe_num_of_snum)
  if (p)
    clear_bit(p - snums, snum_state);

I guess. Let me fold that in and see how it looks.

Thanks,
Rasmus

^ permalink raw reply

* Re: [PATCH 4/5] soc/fsl/qe: qe.c: support fsl,qe-snums property
From: Rasmus Villemoes @ 2019-05-01  6:00 UTC (permalink / raw)
  To: Christophe Leroy, Qiang Zhao, Li Yang
  Cc: Rasmus Villemoes, linux-kernel@vger.kernel.org, Scott Wood,
	Rob Herring, linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org
In-Reply-To: <4c1c4fe8-9412-2543-e9bc-83b7e5d7c202@c-s.fr>

On 30/04/2019 19.19, Christophe Leroy wrote:
> 
> 
> Le 30/04/2019 à 15:36, Rasmus Villemoes a écrit :
>> The current code assumes that the set of snum _values_ to populate the
>> snums[] array with is a function of the _number_ of snums
>> alone. However, reading table 4-30, and its footnotes, of the QUICC
>> Engine Block Reference Manual shows that that is a bit too naive.
>>
>> As an alternative, this introduces a new binding fsl,qe-snums, which
>> automatically encodes both the number of snums and the actual values to
>> use. Conveniently, of_property_read_variable_u8_array does exactly
>> what we need.
>>
>> For example, for the MPC8309, one would specify the property as
>>
>>                 fsl,qe-snums = /bits/ 8 <
>>                         0x88 0x89 0x98 0x99 0xa8 0xa9 0xb8 0xb9
>>                         0xc8 0xc9 0xd8 0xd9 0xe8 0xe9>;
>>
>> Signed-off-by: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
>> ---
>>   .../devicetree/bindings/soc/fsl/cpm_qe/qe.txt      |  8 +++++++-
>>   drivers/soc/fsl/qe/qe.c                            | 14 +++++++++++++-
>>   2 files changed, 20 insertions(+), 2 deletions(-)
>>
>> diff --git a/Documentation/devicetree/bindings/soc/fsl/cpm_qe/qe.txt
>> b/Documentation/devicetree/bindings/soc/fsl/cpm_qe/qe.txt
>> index d7afaff5faff..05f5f485562a 100644
>> --- a/Documentation/devicetree/bindings/soc/fsl/cpm_qe/qe.txt
>> +++ b/Documentation/devicetree/bindings/soc/fsl/cpm_qe/qe.txt
>> @@ -18,7 +18,8 @@ Required properties:
>>   - reg : offset and length of the device registers.
>>   - bus-frequency : the clock frequency for QUICC Engine.
>>   - fsl,qe-num-riscs: define how many RISC engines the QE has.
>> -- fsl,qe-num-snums: define how many serial number(SNUM) the QE can
>> use for the
>> +- fsl,qe-snums: This property has to be specified as '/bits/ 8' value,
>> +  defining the array of serial number (SNUM) values for the virtual
>>     threads.
>>     Optional properties:
>> @@ -34,6 +35,11 @@ Recommended properties
>>   - brg-frequency : the internal clock source frequency for baud-rate
>>     generators in Hz.
>>   +Deprecated properties
>> +- fsl,qe-num-snums: define how many serial number(SNUM) the QE can use
>> +  for the threads. Use fsl,qe-snums instead to not only specify the
>> +  number of snums, but also their values.
>> +
>>   Example:
>>        qe@e0100000 {
>>       #address-cells = <1>;
>> diff --git a/drivers/soc/fsl/qe/qe.c b/drivers/soc/fsl/qe/qe.c
>> index aff9d1373529..af3c2b2b268f 100644
>> --- a/drivers/soc/fsl/qe/qe.c
>> +++ b/drivers/soc/fsl/qe/qe.c
>> @@ -283,7 +283,6 @@ EXPORT_SYMBOL(qe_clock_source);
>>    */
>>   static void qe_snums_init(void)
>>   {
>> -    int i;
> 
> Why do you move this one ?

To keep the declarations of the auto variables together. When reading
the code and needing to know the type of i, it's much harder to find its
declaration if one has to skip back over the two tables, and it's
unnatural to have it separate from the others.

>>       static const u8 snum_init_76[] = {
>>           0x04, 0x05, 0x0C, 0x0D, 0x14, 0x15, 0x1C, 0x1D,
>>           0x24, 0x25, 0x2C, 0x2D, 0x34, 0x35, 0x88, 0x89,
>> @@ -304,9 +303,22 @@ static void qe_snums_init(void)
>>           0x28, 0x29, 0x38, 0x39, 0x48, 0x49, 0x58, 0x59,
>>           0x68, 0x69, 0x78, 0x79, 0x80, 0x81,
>>       };
>> +    struct device_node *qe;
>>       const u8 *snum_init;
>> +    int i;
>>         bitmap_zero(snum_state, QE_NUM_OF_SNUM);
>> +    qe = qe_get_device_node();
>> +    if (qe) {
>> +        i = of_property_read_variable_u8_array(qe, "fsl,qe-snums",
>> +                               snums, 1, QE_NUM_OF_SNUM);
>> +        of_node_put(qe);
>> +        if (i > 0) {
>> +            qe_num_of_snum = i;
>> +            return;
> 
> In that case you skip the rest of the init ? Can you explain ?

If of_property_read_variable_u8_array is succesful, it has already
stored the values into the snums array, so there's no copying left to
do, and the return value is the length of the array (which we save for
later in qe_num_of_snum). So there's really nothing more to do.

This was what I tried to hint at with "Conveniently,
of_property_read_variable_u8_array does exactly
what we need.", but I can see that that might need elaborating a little.

Rasmus

^ permalink raw reply

* Re: [PATCH kernel v2 0/2] powerpc/ioda2: Another attempt to allow DMA masks between 32 and 59
From: Alistair Popple @ 2019-05-01  6:09 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: Oliver O'Halloran, linuxppc-dev, David Gibson
In-Reply-To: <20190501052822.64667-1-aik@ozlabs.ru>

Hi Alexey,

Do we need a seperate patch to allow this to be used? Last time I tried 
calling dma_set_mask(52) on powernv it returned an error and there doesn't 
seem to be anything obvious to me in this series to change that behaviour, but 
perhaps I missed something.

- Alistair

On Wednesday, 1 May 2019 3:28:20 PM AEST Alexey Kardashevskiy wrote:
> This is an attempt to allow DMA masks between 32..59 which are not large
> enough to use either a PHB3 bypass mode or a sketchy bypass. Depending
> on the max order, up to 40 is usually available.
> 
> 
> This is based on sha1
> 37624b58542f Linus Torvalds "Linux 5.1-rc7".
> 
> Please comment. Thanks.
> 
> 
> 
> Alexey Kardashevskiy (2):
>   powerpc/powernv/ioda2: Allocate TCE table levels on demand for default
>     DMA window
>   powerpc/powernv/ioda2: Create bigger default window with 64k IOMMU
>     pages
> 
>  arch/powerpc/include/asm/iommu.h              |  8 ++-
>  arch/powerpc/platforms/powernv/pci.h          |  2 +-
>  arch/powerpc/kernel/iommu.c                   | 58 +++++++++++++------
>  arch/powerpc/platforms/powernv/pci-ioda-tce.c | 20 +++----
>  arch/powerpc/platforms/powernv/pci-ioda.c     | 40 +++++++++++--
>  5 files changed, 90 insertions(+), 38 deletions(-)



^ permalink raw reply

* Re: [PATCH] cxl: Add new kernel traces
From: Michael Ellerman @ 2019-05-01  6:44 UTC (permalink / raw)
  To: Christophe Lombard, linuxppc-dev, fbarrat, vaibhav,
	andrew.donnellan
In-Reply-To: <1520933440-24652-1-git-send-email-clombard@linux.vnet.ibm.com>

Christophe Lombard <clombard@linux.vnet.ibm.com> writes:

> This patch adds new kernel traces in the current in-kernel 'library'
> which can be called by other drivers to help interacting with an
> IBM XSL on a POWER9 system.
>
> If some kernel traces exist in the 'normal path' to handle a page or a
> segment fault, some others are missing when a page fault is handle
> through cxllib.
>
> Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
> ---
>  drivers/misc/cxl/cxllib.c |   3 ++
>  drivers/misc/cxl/fault.c  |   2 +
>  drivers/misc/cxl/irq.c    |   2 +-
>  drivers/misc/cxl/trace.h  | 115 ++++++++++++++++++++++++++--------------------
>  4 files changed, 72 insertions(+), 50 deletions(-)

Sorry this no longer builds:

drivers/misc/cxl/cxllib.c:215:35: note: each undeclared identifier is reported only once for each function it appears in
drivers/misc/cxl/cxllib.c:215:41: error: 'flags' undeclared (first use in this function); did you mean 'class'?
  trace_cxl_lib_handle_fault(addr, size, flags);
                                         ^~~~~
                                         class

cheers

^ permalink raw reply

* Re: [PATCH 2/2] powerpc/mm: Warn if W+X pages found on boot
From: Russell Currey @ 2019-05-01  7:04 UTC (permalink / raw)
  To: Christophe Leroy, linuxppc-dev; +Cc: Julia.Lawall, rashmica.g
In-Reply-To: <8e659a8f-af3f-e889-3f7a-560178c1f7b1@c-s.fr>

On Wed, 2019-04-24 at 09:14 +0200, Christophe Leroy wrote:
> 
> Le 24/04/2019 à 08:39, Russell Currey a écrit :
> > Implement code to walk all pages and warn if any are found to be
> > both
> > writable and executable.  Depends on STRICT_KERNEL_RWX enabled, and
> > is
> > behind the DEBUG_WX config option.
> > 
> > This only runs on boot and has no runtime performance implications.
> > 
> > Very heavily influenced (and in some cases copied verbatim) from
> > the
> > ARM64 code written by Laura Abbott (thanks!), since our ptdump
> > infrastructure is similar.
> > 
> > Signed-off-by: Russell Currey <ruscur@russell.cc>
> > ---
> >   arch/powerpc/Kconfig.debug         | 19 +++++++++++++++
> >   arch/powerpc/include/asm/pgtable.h |  5 ++++
> >   arch/powerpc/mm/pgtable_32.c       |  5 ++++
> >   arch/powerpc/mm/pgtable_64.c       |  5 ++++
> >   arch/powerpc/mm/ptdump/ptdump.c    | 38
> > ++++++++++++++++++++++++++++++
> >   5 files changed, 72 insertions(+)
> > 
> > diff --git a/arch/powerpc/Kconfig.debug
> > b/arch/powerpc/Kconfig.debug
> > index 4e00cb0a5464..a4160ff02ed4 100644
> > --- a/arch/powerpc/Kconfig.debug
> > +++ b/arch/powerpc/Kconfig.debug
> > @@ -361,6 +361,25 @@ config PPC_PTDUMP
> >   
> >   	  If you are unsure, say N.
> >   
> > +config DEBUG_WX
> 
> I would call it PPC_DEBUG_WX to avoid confusion.

It's the same functionality as on other architectures and is an arch-
local thing, I personally think it should be left as-is but given we
already put the PPC prefix on PTDUMP, I'll add it so it's consistent

<snip>

> > +	if (radix_enabled())
> > +		st.start_address = PAGE_OFFSET;
> > +	else
> +		st.start_address = KERN_VIRT_START;
>
> KERN_VIRT_START doesn't exist on PPC32.
> 
> Christophe
> 
Thanks a lot for the review!  Applied all your suggestions.  What
should I use on PPC32 instead?

- Russell


^ permalink raw reply

* Re: [PATCH] powerpc/tm: Avoid machine crash on rt_sigreturn
From: Michael Neuling @ 2019-05-01  7:17 UTC (permalink / raw)
  To: Breno Leitao, linuxppc-dev; +Cc: gromero
In-Reply-To: <1547657264-28761-1-git-send-email-leitao@debian.org>

On Wed, 2019-01-16 at 14:47 -0200, Breno Leitao wrote:
> There is a kernel crash that happens if rt_sigreturn is called inside a
> transactional block.
> 
> This crash happens if the kernel hits an in-kernel page fault when
> accessing userspace memory, usually through copy_ckvsx_to_user(). A major
> page fault calls might_sleep() function, which can cause a task reschedule.
> A task reschedule (switch_to()) reclaim and recheckpoint the TM states,
> but, in the signal return path, the checkpointed memory was already
> reclaimed, thus the exception stack has MSR that points to MSR[TS]=0.
> 
> When the code returns from might_sleep() and a task reschedule happened,
> then this task is returned with the memory recheckpointed, and
> CPU MSR[TS] = suspended.
> 
> This means that there is a side effect at might_sleep() if it is called
> with CPU MSR[TS] = 0 and the task has regs->msr[TS] != 0.
> 
> This side effect can cause a TM bad thing, since at the exception entrance,
> the stack saves MSR[TS]=0, and this is what will be used at RFID, but,
> the processor has MSR[TS] = Suspended, and this transition will be invalid
> and a TM Bad thing will be raised, causing the following crash:
> 
> 	Unexpected TM Bad Thing exception at c00000000000e9ec (msr
> 0x8000000302a03031) tm_scratch=800000010280b033
> 	cpu 0xc: Vector: 700 (Program Check) at [c00000003ff1fd70]
> 	    pc: c00000000000e9ec: fast_exception_return+0x100/0x1bc
> 	    lr: c000000000032948: handle_rt_signal64+0xb8/0xaf0
> 	    sp: c0000004263ebc40
> 	   msr: 8000000302a03031
> 	  current = 0xc000000415050300
> 	  paca    = 0xc00000003ffc4080	 irqmask: 0x03	 irq_happened: 0x01
> 	    pid   = 25006, comm = sigfuz
> 	Linux version 5.0.0-rc1-00001-g3bd6e94bec12 (breno@debian) (gcc version
> 8.2.0 (Debian 8.2.0-3)) #899 SMP Mon Jan 7 11:30:07 EST 2019
> 	WARNING: exception is not recoverable, can't continue
> 	enter ? for help
> 	[c0000004263ebc40] c000000000032948 handle_rt_signal64+0xb8/0xaf0
> (unreliable)
> 	[c0000004263ebd30] c000000000022780 do_notify_resume+0x2f0/0x430
> 	[c0000004263ebe20] c00000000000e844 ret_from_except_lite+0x70/0x74
> 	--- Exception: c00 (System Call) at 00007fffbaac400c
> 	SP (7fffeca90f40) is in userspace
> 
> The solution for this problem is running the sigreturn code with
> regs->msr[TS] disabled, thus, avoiding hitting the side effect above. This
> does not seem to be a problem since regs->msr will be replaced by the
> ucontext value, so, it is being flushed already. In this case, it is
> flushed earlier.
> 
> Signed-off-by: Breno Leitao <leitao@debian.org>

Acked-by: Michael Neuling <mikey@neuling.org>

This still applies on powerpc/next so just acking rather than reposting

> ---
>  arch/powerpc/kernel/signal_64.c | 27 ++++++++++++++++++++++++++-
>  1 file changed, 26 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
> index 6794466f6420..06c299ef6132 100644
> --- a/arch/powerpc/kernel/signal_64.c
> +++ b/arch/powerpc/kernel/signal_64.c
> @@ -565,7 +565,7 @@ static long restore_tm_sigcontexts(struct task_struct
> *tsk,
>  	preempt_disable();
>  
>  	/* pull in MSR TS bits from user context */
> -	regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK);
> +	regs->msr |= msr & MSR_TS_MASK;
>  
>  	/*
>  	 * Ensure that TM is enabled in regs->msr before we leave the signal
> @@ -745,6 +745,31 @@ SYSCALL_DEFINE0(rt_sigreturn)
>  	if (MSR_TM_SUSPENDED(mfmsr()))
>  		tm_reclaim_current(0);
>  
> +	/*
> +	 * Disable MSR[TS] bit also, so, if there is an exception in the
> +	 * code below (as a page fault in copy_ckvsx_to_user()), it does
> +	 * not recheckpoint this task if there was a context switch inside
> +	 * the exception.
> +	 *
> +	 * A major page fault can indirectly call schedule(). A reschedule
> +	 * process in the middle of an exception can have a side effect
> +	 * (Changing the CPU MSR[TS] state), since schedule() is called
> +	 * with the CPU MSR[TS] disable and returns with MSR[TS]=Suspended
> +	 * (switch_to() calls tm_recheckpoint() for the 'new' process). In
> +	 * this case, the process continues to be the same in the CPU, but
> +	 * the CPU state just changed.
> +	 *
> +	 * This can cause a TM Bad Thing, since the MSR in the stack will
> +	 * have the MSR[TS]=0, and this is what will be used to RFID.
> +	 *
> +	 * Clearing MSR[TS] state here will avoid a recheckpoint if there
> +	 * is any process reschedule in kernel space. The MSR[TS] state
> +	 * does not need to be saved also, since it will be replaced with
> +	 * the MSR[TS] that came from user context later, at
> +	 * restore_tm_sigcontexts.
> +	 */
> +	regs->msr &= ~MSR_TS_MASK;
> +
>  	if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR]))
>  		goto badframe;
>  	if (MSR_TM_ACTIVE(msr)) {


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox